TBCI Numerical high perf. C++ Library 2.8.0
unroll_prefetch_simd_def.h
#ifndef H_UNROLL_PREFETCH_SIMD_DEF_H
#define H_UNROLL_PREFETCH_SIMD_DEF_H

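/* Kernel-template macros for the lina vector operations.
 *
 * Apparent naming scheme, as read off the definitions below (an editor's
 * summary, not official documentation):
 *   3V/2V/1V  number of array arguments (res [, v1 [, v2]])
 *   _NP       4-way unrolled SIMD kernel without prefetching
 *   _PLAIN    SIMD kernel, neither unrolled nor prefetching
 *   _SISD     scalar loop, used for the alignment head and the tail
 *   _C, _CC   one resp. two extra scalar factors (_f2; _f1,_f2)
 *   _T        accumulates into a reference target _f2 instead of writing res
 *   _UA       like the base variant, but unaligned-access warnings disabled
 */
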
#define UNROLL4_PREF_KERNEL5_SIMD(MDOP,ADV,T,SUF,UNA1,UNA2) \
  if (EL_PER_CL(T) <= 1) { \
    MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
    i -= 4*ADV; \
    MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
    v1 += 4*ADV; \
    MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
    v2 += 4*ADV; \
    MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
    res += 4*ADV; \
  } else if (EL_PER_CL(T) <= 2) { \
    MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
    i -= 4*ADV; \
    MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
    v1 += 4*ADV; \
    MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
    v2 += 4*ADV; \
    MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
    res += 4*ADV; \
  } else { \
    MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
    i -= 4*ADV; \
    MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
    v1 += 4*ADV; \
    MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
    v2 += 4*ADV; \
    MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
    res += 4*ADV; \
  }

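/* The three EL_PER_CL(T) branches above are structurally identical here;
 * in the prefetching variants the cache-line geometry would determine how
 * far ahead to prefetch. EL_PER_CL itself is defined elsewhere in the
 * library; a minimal sketch of such a macro, assuming 64-byte cache lines
 * (an assumption for illustration, not the library's actual definition):
 *
 *   #define EL_PER_CL(T) (64 / sizeof(T))   // 8 for double, 16 for float
 */
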
#define UNROLL4_KERNEL5_SIMD(MDOP,ADV,SUF,UNA1,UNA2) \
  MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
  i -= 4*ADV; \
  MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
  v1 += 4*ADV; \
  MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
  v2 += 4*ADV; \
  MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
  res += 4*ADV

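/* For clarity, a hand expansion of one pass of this kernel with ADV == 2
 * (illustrative; MDOP left abstract). The pointer bumps are interleaved
 * between the four MDOPs, so the later negative offsets compensate for
 * bumps that have already happened:
 *
 *   MDOP(res,   v1,   v2,   f1, f2, SUF, , );  // elements 0..1
 *   i  -= 8;
 *   MDOP(res+2, v1+2, v2+2, f1, f2, SUF, , );  // elements 2..3
 *   v1 += 8;                                   // v1 now points past the block
 *   MDOP(res+4, v1-4, v2+4, f1, f2, SUF, , );  // elements 4..5 (v1-4 == old v1+4)
 *   v2 += 8;
 *   MDOP(res+6, v1-2, v2-2, f1, f2, SUF, , );  // elements 6..7
 *   res += 8;
 */
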
/* Without prefetching */
#define VKERN_TEMPL_3V_NP_SIMD(MDOP,ADV,STP,SUF,UNA1,UNA2) \
  if (LIKELY(i >= 4*ADV)) { \
    STP TMP, LD UNUSED; \
    do { \
      UNROLL4_KERNEL5_SIMD(MDOP,ADV,SUF,UNA1,UNA2); \
    } while (i >= 4*ADV); \
  }

/* Without unrolling */
#define VKERN_TEMPL_3V_PLAIN_SIMD(MDOP,ADV,STP,SUF,UNA1,UNA2) \
  while (i >= ADV) { \
    STP TMP, LD UNUSED; \
    MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
    i -= ADV; res += ADV; v1 += ADV; v2 += ADV; \
  }

/* Without SIMD */
#define VKERN_TEMPL_3V_SISD(SDOP,COND,STP,SUF) \
  while (COND && i) { \
    STP TMP, LD UNUSED; \
    SDOP(res,v1,v2,f1,f2,SUF,,); \
    --i; ++res; ++v1; ++v2; \
  }

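/* The COND parameter lets this one scalar loop serve both ends of the
 * kernels below: with COND = MISALIGNMENT_CHECK(res) it peels elements
 * until res is aligned, and with COND = true it drains the i < ADV
 * remainder after the SIMD body. */
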
/*************************************************************/

#define UNROLL4_PREF_KERNEL4_SIMD(MDOP,ADV,T,SUF,UNA) \
  if (EL_PER_CL(T) <= 1) { \
    MDOP(res,v1,f1,f2,SUF,UNA); \
    i -= 4*ADV; \
    MDOP(res+ADV,v1+ADV,f1,f2,SUF,UNA); \
    MDOP(res+2*ADV,v1+2*ADV,f1,f2,SUF,UNA); \
    v1 += 4*ADV; \
    MDOP(res+3*ADV,v1-ADV,f1,f2,SUF,UNA); \
    res += 4*ADV; \
  } else if (EL_PER_CL(T) <= 2) { \
    MDOP(res,v1,f1,f2,SUF,UNA); \
    i -= 4*ADV; \
    MDOP(res+ADV,v1+ADV,f1,f2,SUF,UNA); \
    MDOP(res+2*ADV,v1+2*ADV,f1,f2,SUF,UNA); \
    v1 += 4*ADV; \
    MDOP(res+3*ADV,v1-ADV,f1,f2,SUF,UNA); \
    res += 4*ADV; \
  } else { \
    MDOP(res,v1,f1,f2,SUF,UNA); \
    i -= 4*ADV; \
    MDOP(res+ADV,v1+ADV,f1,f2,SUF,UNA); \
    MDOP(res+2*ADV,v1+2*ADV,f1,f2,SUF,UNA); \
    v1 += 4*ADV; \
    MDOP(res+3*ADV,v1-ADV,f1,f2,SUF,UNA); \
    res += 4*ADV; \
  }

#define UNROLL4_KERNEL4_SIMD(MDOP,ADV,SUF,UNA) \
  MDOP(res, v1, f1, f2, SUF, UNA); \
  MDOP(res+ADV, v1+ADV, f1, f2, SUF, UNA); \
  i -= 4*ADV; \
  MDOP(res+2*ADV, v1+2*ADV, f1, f2, SUF, UNA); \
  MDOP(res+3*ADV, v1+3*ADV, f1, f2, SUF, UNA); \
  v1 += 4*ADV; res += 4*ADV

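/* Unlike the three-vector kernel above, all four offsets here are taken
 * before v1 and res advance, so no compensating negative offsets are
 * needed. */
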
/* Without prefetching */
#define VKERN_TEMPL_2V_NP_SIMD(MDOP,ADV,STP,SUF,UNA) \
  if (LIKELY(i >= 4*ADV)) { \
    STP TMP, LD UNUSED, t UNUSED; \
    do { \
      UNROLL4_KERNEL4_SIMD(MDOP,ADV,SUF,UNA); \
    } while (i >= 4*ADV); \
  }

/* Without unrolling */
#define VKERN_TEMPL_2V_PLAIN_SIMD(MDOP,ADV,STP,SUF,UNA) \
  while (i >= ADV) { \
    STP TMP, LD UNUSED, t UNUSED; \
    MDOP(res, v1, f1, f2, SUF, UNA); \
    i -= ADV; v1 += ADV; res += ADV; \
  }

/* Without SIMD */
#define VKERN_TEMPL_2V_SISD(SDOP,COND,STP,SUF) \
  while (COND && i) { \
    STP TMP, LD UNUSED, t UNUSED; \
    SDOP(res,v1,f1,f2,SUF,u); \
    --i; ++v1; ++res; \
  }

/*************************************************************/

#define UNROLL4_PREF_KERNEL3_SIMD(MDOP,ADV,T,SUF) \
  if (EL_PER_CL(T) <= 1) { \
    MDOP(res, f1, f2, SUF); \
    MDOP(res+ADV, f1, f2, SUF); \
    i -= 4*ADV; \
    MDOP(res+2*ADV, f1, f2, SUF); \
    MDOP(res+3*ADV, f1, f2, SUF); \
    res += 4*ADV; \
  } else if (EL_PER_CL(T) <= 2) { \
    MDOP(res, f1, f2, SUF); \
    MDOP(res+ADV, f1, f2, SUF); \
    i -= 4*ADV; \
    MDOP(res+2*ADV, f1, f2, SUF); \
    MDOP(res+3*ADV, f1, f2, SUF); \
    res += 4*ADV; \
  } else { \
    MDOP(res, f1, f2, SUF); \
    MDOP(res+ADV, f1, f2, SUF); \
    i -= 4*ADV; \
    MDOP(res+2*ADV, f1, f2, SUF); \
    MDOP(res+3*ADV, f1, f2, SUF); \
    res += 4*ADV; \
  }

#define UNROLL4_KERNEL3_SIMD(MDOP,ADV,SUF) \
  MDOP(res, f1, f2, SUF); \
  MDOP(res+ADV, f1, f2, SUF); \
  i -= 4*ADV; \
  MDOP(res+2*ADV, f1, f2, SUF); \
  MDOP(res+3*ADV, f1, f2, SUF); \
  res += 4*ADV

/* Without prefetching */
#define VKERN_TEMPL_1V_NP_SIMD(MDOP,ADV,STP,SUF) \
  if (LIKELY(i >= 4*ADV)) { \
    STP TMP UNUSED; \
    STP y UNUSED; \
    STP t UNUSED; \
    do { \
      UNROLL4_KERNEL3_SIMD(MDOP,ADV,SUF); \
    } while (i >= 4*ADV); \
  }

/* Without unrolling */
#define VKERN_TEMPL_1V_PLAIN_SIMD(MDOP,ADV,STP,SUF) \
  while (i >= ADV) { \
    STP TMP UNUSED; \
    STP y UNUSED; \
    STP t UNUSED; \
    MDOP(res,f1,f2,SUF); \
    i -= ADV; res += ADV; \
  }

/* Without SIMD */
#define VKERN_TEMPL_1V_SISD(SDOP,COND,STP,SUF) \
  while (COND && i) { \
    STP TMP UNUSED; \
    STP y UNUSED; \
    STP t UNUSED; \
    SDOP(res,f1,f2,SUF); \
    --i; ++res; \
  }

224
225
/* -------------------------------------------------------------- */
229
#ifndef TBCI_SIMD_UNROLL
230
# define NO_TBCI_SIMD_UNROLL
231
#endif
#ifdef TBCI_SIMD_UNROLL
# define VKERN_TEMPL_3V_K_SIMD(m,a,s,f,u1,u2) VKERN_TEMPL_3V_NP_SIMD(m,a,s,f,u1,u2)
# define VKERN_TEMPL_2V_K_SIMD(m,a,s,f,u)     VKERN_TEMPL_2V_NP_SIMD(m,a,s,f,u)
# define VKERN_TEMPL_1V_K_SIMD(m,a,s,f)       VKERN_TEMPL_1V_NP_SIMD(m,a,s,f)
#else
# define VKERN_TEMPL_3V_K_SIMD(m,a,s,f,u1,u2) VKERN_TEMPL_3V_PLAIN_SIMD(m,a,s,f,u1,u2)
# define VKERN_TEMPL_2V_K_SIMD(m,a,s,f,u)     VKERN_TEMPL_2V_PLAIN_SIMD(m,a,s,f,u)
# define VKERN_TEMPL_1V_K_SIMD(m,a,s,f)       VKERN_TEMPL_1V_PLAIN_SIMD(m,a,s,f)
#endif

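/* So the *_K_SIMD names used by the function templates below resolve to
 * the unrolled kernels only when TBCI_SIMD_UNROLL is defined at build
 * time, e.g. (assuming a conventional make-driven build):
 *
 *   CXXFLAGS += -DTBCI_SIMD_UNROLL
 */
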
/* -------------------------------------------------------------- */

/* On i386, we have no reasonable alignment guarantees, thus we may
 * need to use the slower, unaligned variants of the load and store
 * instructions :-(
 * If malloc_cache is used (and we have memalign), we're fine, though ...
 */

#ifdef AVX512
# define ALIGN_REQ 0x3f
#elif defined(__AVX__)
# define ALIGN_REQ 0x1f
#else
# define ALIGN_REQ 0x0f
#endif

#if !defined(__x86_64__) && (!defined(MALLOC_CACHE) || defined(SSE_VARS_MAY_BE_UNALIGNED))
//# define _mm_load_pd  _mm_loadu_pd
//# define _mm_load_ps  _mm_loadu_ps
//# define _mm_store_pd _mm_storeu_pd
//# define _mm_store_ps _mm_storeu_ps
//# define MISALIGNMENT_CHECK(x) (false)
# define MISALIGNMENT_CHECK(x) ((unsigned long)(x) & ALIGN_REQ)
# if defined(__GNUC__) || defined(__INTEL_COMPILER)
#  warning May have to use slow unaligned SSE insns
# endif
#else
# define MISALIGNMENT_CHECK(x) (UNLIKELY((unsigned long)(x) & ALIGN_REQ))
#endif

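/* Worked example: under __AVX__, ALIGN_REQ == 0x1f, so a pointer counts as
 * aligned iff its five low address bits are zero:
 *
 *   0x7f0040 & 0x1f == 0x00  ->  aligned, fast load/store variants
 *   0x7f0048 & 0x1f == 0x08  ->  misaligned, take the 'u' (unaligned) path
 */
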
#ifdef WARN_UNALIGNED
# define WARN_UNALIGN(v) \
    STD__ cerr << "TBCI WARN: Unaligned access to " #v " at " << v << " from " << __FUNCTION__ << "\n";
#else
# define WARN_UNALIGN(v) do {} while (0)
#endif

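/* When built with -DWARN_UNALIGNED, every unaligned operand produces a
 * diagnostic of the form (address and function name illustrative):
 *
 *   TBCI WARN: Unaligned access to v1 at 0x7ffc5e08 from vec_add
 */
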
/* -------------------------------------------------------------- */

#define VKERN_TEMPL_3V_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                const TYPE* RESTRICT const _v2)) \
{ \
  REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
  REGISTER TYPE *res = _res; \
  PREP; \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
    WARN_UNALIGN(v1); WARN_UNALIGN(v2); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
  } else if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
  } else if (MISALIGNMENT_CHECK(v2)) { \
    WARN_UNALIGN(v2); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
  } else { \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
  } \
  SFIN; \
  VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
  FIN; \
}

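/* Every function template in this family has the same three-phase shape:
 * a scalar head until res is aligned, the SIMD body (the 'u' arguments
 * select unaligned loads per source operand), and a scalar tail for the
 * remaining i < ADV elements. A rough sketch of an instantiation, with a
 * hypothetical operand macro purely to show the calling shape -- the
 * library's real OP/PREP/SFIN/FIN arguments live in sibling headers:
 *
 *   #define ADD_OP(r,v1,v2,f1,f2,SUF,U1,U2)  ADD_##SUF(r,v1,v2,U1,U2)
 *   #define ADD_sisd(r,v1,v2,U1,U2)  (*(r) = *(v1) + *(v2))
 *   #define ADD_simd(r,v1,v2,U1,U2) \
 *     _mm_store_pd((r), _mm_add_pd(_mm_load##U1##_pd(v1), _mm_load##U2##_pd(v2)))
 *
 *   VKERN_TEMPL_3V_SIMD(vec_add, ADD_OP, sisd, simd, ;, ;, ;, 2, double, __m128d)
 *
 * The VKERN_TEMPL_3V_SIMD_UA variant below is identical except that its
 * WARN_UNALIGN calls are commented out.
 */
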
#define VKERN_TEMPL_3V_SIMD_UA(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                const TYPE* RESTRICT const _v2)) \
{ \
  REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
  REGISTER TYPE *res = _res; \
  PREP; \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
    /* WARN_UNALIGN(v1); WARN_UNALIGN(v2); */ \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
  } else if (MISALIGNMENT_CHECK(v1)) { \
    /* WARN_UNALIGN(v1); */ \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
  } else if (MISALIGNMENT_CHECK(v2)) { \
    /* WARN_UNALIGN(v2); */ \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
  } else { \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
  } \
  SFIN; \
  VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
  FIN; \
}

#define VKERN_TEMPL_3V_C_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                const TYPE* RESTRICT const _v2, \
                                LCTYPED(TYPE) _f2)) \
{ \
  REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
  REGISTER TYPE *res = _res; \
  PREP(_f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
    WARN_UNALIGN(v1); WARN_UNALIGN(v2); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
  } else if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
  } else if (MISALIGNMENT_CHECK(v2)) { \
    WARN_UNALIGN(v2); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
  } else { \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
  } \
  SFIN; \
  VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
  FIN(_f2); \
}

#define VKERN_TEMPL_3V_CC_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                const TYPE* RESTRICT const _v2, \
                                LCTYPED(TYPE) _f1, \
                                LCTYPED(TYPE) _f2)) \
{ \
  REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
  REGISTER TYPE *res = _res; \
  PREP(_f1, _f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
    WARN_UNALIGN(v1); WARN_UNALIGN(v2); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
  } else if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
  } else if (MISALIGNMENT_CHECK(v2)) { \
    WARN_UNALIGN(v2); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
  } else { \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
  } \
  SFIN; \
  VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
  FIN(_f1, _f2); \
}

#define VKERN_TEMPL_2V_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1)) \
{ \
  REGISTER TYPE *res = _res; \
  REGISTER const TYPE *v1 = _v1; \
  PREP; \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u); \
  } else { \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
  } \
  SFIN; \
  VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
  FIN; \
}

#define VKERN_TEMPL_2V_C_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                LCTYPED(TYPE) _f2)) \
{ \
  REGISTER const TYPE *v1 = _v1; \
  REGISTER TYPE *res = _res; \
  PREP(_f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u); \
  } else { \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
  } \
  SFIN; \
  VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
  FIN(_f2); \
}

#define VKERN_TEMPL_2V_CC_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                LCTYPED(TYPE) _f1, \
                                LCTYPED(TYPE) _f2)) \
{ \
  REGISTER const TYPE *v1 = _v1; \
  REGISTER TYPE *res = _res; \
  PREP(_f1, _f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u); \
  } else { \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
  } \
  SFIN; \
  VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
  FIN(_f1, _f2); \
}

#define VKERN_TEMPL_2V_T_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                const TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                TYPE &_f2)) \
{ \
  REGISTER const TYPE *res = _res, *v1 = _v1; \
  /* PREP(0.0,_f2); */ \
  PREP(_f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u); \
  } else { \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
  } \
  SFIN; \
  VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
  /* FIN(_f1,_f2); */ \
  FIN(_f2); \
}

#define VKERN_TEMPL_2V_T_SIMD_VL(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                const TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                volatile long &_f2)) \
{ \
  REGISTER const TYPE *res = _res, *v1 = _v1; \
  /* PREP(0.0,_f2); */ \
  PREP(_f2); \
  REGISTER long i = sz; \
  REGISTER int rg = 0; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u); \
  } else { \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
  } \
  SFIN; \
  VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
  /* FIN(_f1,_f2); */ \
_fin: \
  FIN(_f2); \
}

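/* Note on the _VL variant: relative to VKERN_TEMPL_2V_T_SIMD it takes a
 * volatile long & target, declares a counter rg visible to the kernel, and
 * puts a _fin: label in front of FIN -- presumably so an OP such as a
 * search/first-match kernel can goto _fin to exit early. That reading is
 * inferred from the definition; the jump itself would live in the OP macro.
 */
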
#define VKERN_TEMPL_1V_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res)) \
{ \
  REGISTER TYPE *res = _res; \
  PREP; \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
  SFIN; \
  VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
  FIN; \
}

#define VKERN_TEMPL_1V_C_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                LCTYPED(TYPE) _f2)) \
{ \
  REGISTER TYPE *res = _res; \
  PREP(_f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
  SFIN; \
  VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
  FIN(_f2); \
}

#define VKERN_TEMPL_1V_CC_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                LCTYPED(TYPE) _f1, \
                                LCTYPED(TYPE) _f2)) \
{ \
  REGISTER TYPE *res = _res; \
  PREP(_f1, _f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
  SFIN; \
  VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
  FIN(_f1, _f2); \
}

#define VKERN_TEMPL_1V_T_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                const TYPE* const _res, \
                                TYPE &_f2)) \
{ \
  REGISTER const TYPE *res = _res; \
  /* PREP(0.0,_f2); */ \
  PREP(_f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
  SFIN; \
  VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
  /* FIN(_f1,_f2); */ \
  FIN(_f2); \
}

#endif // H_UNROLL_PREFETCH_SIMD_DEF_H