TBCI Numerical high perf. C++ Library 2.8.0
unroll_prefetch_simd_def.h
#ifndef H_UNROLL_PREFETCH_SIMD_DEF_H
#define H_UNROLL_PREFETCH_SIMD_DEF_H

#define UNROLL4_PREF_KERNEL5_SIMD(MDOP,ADV,T,SUF,UNA1,UNA2) \
    if (EL_PER_CL(T) <= 1) { \
        MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
        i -= 4*ADV; \
        MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
        v1 += 4*ADV; \
        MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
        v2 += 4*ADV; \
        MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
        res += 4*ADV; \
    } else if (EL_PER_CL(T) <= 2) { \
        MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
        i -= 4*ADV; \
        MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
        v1 += 4*ADV; \
        MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
        v2 += 4*ADV; \
        MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
        res += 4*ADV; \
    } else { \
        MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
        i -= 4*ADV; \
        MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
        v1 += 4*ADV; \
        MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
        v2 += 4*ADV; \
        MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
        res += 4*ADV; \
    }

#define UNROLL4_KERNEL5_SIMD(MDOP,ADV,SUF,UNA1,UNA2) \
    MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
    i -= 4*ADV; \
    MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
    v1 += 4*ADV; \
    MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
    v2 += 4*ADV; \
    MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
    res += 4*ADV


/* Without prefetching */
#define VKERN_TEMPL_3V_NP_SIMD(MDOP,ADV,STP,SUF,UNA1,UNA2) \
    if (LIKELY(i >= 4*ADV)) { \
        STP TMP, LD UNUSED; \
        do { \
            UNROLL4_KERNEL5_SIMD(MDOP,ADV,SUF,UNA1,UNA2); \
        } while (i >= 4*ADV); \
    }

/* Without unrolling */
#define VKERN_TEMPL_3V_PLAIN_SIMD(MDOP,ADV,STP,SUF,UNA1,UNA2) \
    while (i >= ADV) { \
        STP TMP, LD UNUSED; \
        MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
        i -= ADV; res += ADV; v1 += ADV; v2 += ADV; \
    }

/* Without SIMD */
#define VKERN_TEMPL_3V_SISD(SDOP,COND,STP,SUF) \
    while (COND && i) { \
        STP TMP, LD UNUSED; \
        SDOP(res,v1,v2,f1,f2,SUF,,); \
        --i; ++res; ++v1; ++v2; \
    }

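/* Illustrative sketch (not part of the library): assuming MDOP applies one
 * packed operation to ADV consecutive elements of each operand, the 4x
 * unrolled kernel above is equivalent to the plain loop below; the
 * interleaved pointer updates in UNROLL4_KERNEL5_SIMD presumably just help
 * the compiler overlap address arithmetic with the vector operations.
 */
#if 0
while (i >= 4*ADV) {
    MDOP(res,       v1,       v2,       f1, f2, SUF, UNA1, UNA2);
    MDOP(res+ADV,   v1+ADV,   v2+ADV,   f1, f2, SUF, UNA1, UNA2);
    MDOP(res+2*ADV, v1+2*ADV, v2+2*ADV, f1, f2, SUF, UNA1, UNA2);
    MDOP(res+3*ADV, v1+3*ADV, v2+3*ADV, f1, f2, SUF, UNA1, UNA2);
    i -= 4*ADV; res += 4*ADV; v1 += 4*ADV; v2 += 4*ADV;
}
#endif
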
/*************************************************************/

#define UNROLL4_PREF_KERNEL4_SIMD(MDOP,ADV,T,SUF,UNA) \
    if (EL_PER_CL(T) <= 1) { \
        MDOP(res,v1,f1,f2,SUF,UNA); \
        i -= 4*ADV; \
        MDOP(res+ADV,v1+ADV,f1,f2,SUF,UNA); \
        MDOP(res+2*ADV,v1+2*ADV,f1,f2,SUF,UNA); \
        v1 += 4*ADV; \
        MDOP(res+3*ADV,v1-ADV,f1,f2,SUF,UNA); \
        res += 4*ADV; \
    } else if (EL_PER_CL(T) <= 2) { \
        MDOP(res,v1,f1,f2,SUF,UNA); \
        i -= 4*ADV; \
        MDOP(res+ADV,v1+ADV,f1,f2,SUF,UNA); \
        MDOP(res+2*ADV,v1+2*ADV,f1,f2,SUF,UNA); \
        v1 += 4*ADV; \
        MDOP(res+3*ADV,v1-ADV,f1,f2,SUF,UNA); \
        res += 4*ADV; \
    } else { \
        MDOP(res,v1,f1,f2,SUF,UNA); \
        i -= 4*ADV; \
        MDOP(res+ADV,v1+ADV,f1,f2,SUF,UNA); \
        MDOP(res+2*ADV,v1+2*ADV,f1,f2,SUF,UNA); \
        v1 += 4*ADV; \
        MDOP(res+3*ADV,v1-ADV,f1,f2,SUF,UNA); \
        res += 4*ADV; \
    }


#define UNROLL4_KERNEL4_SIMD(MDOP,ADV,SUF,UNA) \
    MDOP(res, v1, f1, f2, SUF, UNA); \
    MDOP(res+ADV, v1+ADV, f1, f2, SUF, UNA); \
    i -= 4*ADV; \
    MDOP(res+2*ADV, v1+2*ADV, f1, f2, SUF, UNA); \
    MDOP(res+3*ADV, v1+3*ADV, f1, f2, SUF, UNA); \
    v1 += 4*ADV; res += 4*ADV


/* Without prefetching */
#define VKERN_TEMPL_2V_NP_SIMD(MDOP,ADV,STP,SUF,UNA) \
    if (LIKELY(i >= 4*ADV)) { \
        STP TMP, LD UNUSED, t UNUSED; \
        do { \
            UNROLL4_KERNEL4_SIMD(MDOP,ADV,SUF,UNA); \
        } while (i >= 4*ADV); \
    }

/* Without unrolling */
#define VKERN_TEMPL_2V_PLAIN_SIMD(MDOP,ADV,STP,SUF,UNA) \
    while (i >= ADV) { \
        STP TMP, LD UNUSED, t UNUSED; \
        MDOP(res, v1, f1, f2, SUF, UNA); \
        i -= ADV; v1 += ADV; res += ADV; \
    }

/* Without SIMD */
#define VKERN_TEMPL_2V_SISD(SDOP,COND,STP,SUF) \
    while (COND && i) { \
        STP TMP, LD UNUSED, t UNUSED; \
        SDOP(res,v1,f1,f2,SUF,u); \
        --i; ++v1; ++res; \
    }


/*************************************************************/

#define UNROLL4_PREF_KERNEL3_SIMD(MDOP,ADV,T,SUF) \
    if (EL_PER_CL(T) <= 1) { \
        MDOP(res, f1, f2, SUF); \
        MDOP(res+ADV, f1, f2, SUF); \
        i -= 4*ADV; \
        MDOP(res+2*ADV, f1, f2, SUF); \
        MDOP(res+3*ADV, f1, f2, SUF); \
        res += 4*ADV; \
    } else if (EL_PER_CL(T) <= 2) { \
        MDOP(res, f1, f2, SUF); \
        MDOP(res+ADV, f1, f2, SUF); \
        i -= 4*ADV; \
        MDOP(res+2*ADV, f1, f2, SUF); \
        MDOP(res+3*ADV, f1, f2, SUF); \
        res += 4*ADV; \
    } else { \
        MDOP(res, f1, f2, SUF); \
        MDOP(res+ADV, f1, f2, SUF); \
        i -= 4*ADV; \
        MDOP(res+2*ADV, f1, f2, SUF); \
        MDOP(res+3*ADV, f1, f2, SUF); \
        res += 4*ADV; \
    }


#define UNROLL4_KERNEL3_SIMD(MDOP,ADV,SUF) \
    MDOP(res, f1, f2, SUF); \
    MDOP(res+ADV, f1, f2, SUF); \
    i -= 4*ADV; \
    MDOP(res+2*ADV, f1, f2, SUF); \
    MDOP(res+3*ADV, f1, f2, SUF); \
    res += 4*ADV

/* Without prefetching */
#define VKERN_TEMPL_1V_NP_SIMD(MDOP,ADV,STP,SUF) \
    if (LIKELY(i >= 4*ADV)) { \
        STP TMP UNUSED; \
        STP y UNUSED; \
        STP t UNUSED; \
        do { \
            UNROLL4_KERNEL3_SIMD(MDOP,ADV,SUF); \
        } while (i >= 4*ADV); \
    }

/* Without unrolling */
#define VKERN_TEMPL_1V_PLAIN_SIMD(MDOP,ADV,STP,SUF) \
    while (i >= ADV) { \
        STP TMP UNUSED; \
        STP y UNUSED; \
        STP t UNUSED; \
        MDOP(res,f1,f2,SUF); \
        i -= ADV; res += ADV; \
    }

/* Without SIMD */
#define VKERN_TEMPL_1V_SISD(SDOP,COND,STP,SUF) \
    while (COND && i) { \
        STP TMP UNUSED; \
        STP y UNUSED; \
        STP t UNUSED; \
        SDOP(res,f1,f2,SUF); \
        --i; ++res; \
    }

/* -------------------------------------------------------------- */
#ifndef TBCI_SIMD_UNROLL
# define NO_TBCI_SIMD_UNROLL
#endif


#ifdef TBCI_SIMD_UNROLL
# define VKERN_TEMPL_3V_K_SIMD(m,a,s,f,u1,u2) VKERN_TEMPL_3V_NP_SIMD(m,a,s,f,u1,u2)
# define VKERN_TEMPL_2V_K_SIMD(m,a,s,f,u) VKERN_TEMPL_2V_NP_SIMD(m,a,s,f,u)
# define VKERN_TEMPL_1V_K_SIMD(m,a,s,f) VKERN_TEMPL_1V_NP_SIMD(m,a,s,f)
#else
# define VKERN_TEMPL_3V_K_SIMD(m,a,s,f,u1,u2) VKERN_TEMPL_3V_PLAIN_SIMD(m,a,s,f,u1,u2)
# define VKERN_TEMPL_2V_K_SIMD(m,a,s,f,u) VKERN_TEMPL_2V_PLAIN_SIMD(m,a,s,f,u)
# define VKERN_TEMPL_1V_K_SIMD(m,a,s,f) VKERN_TEMPL_1V_PLAIN_SIMD(m,a,s,f)
#endif
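
/* Usage note (illustrative): TBCI_SIMD_UNROLL is assumed to come from the
 * build configuration, e.g.
 *
 *   g++ -DTBCI_SIMD_UNROLL ...
 *
 * With it defined, the *_K_SIMD main loops expand to the 4x unrolled
 * variants above; without it, they expand to the plain SIMD loops.
 */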

/* -------------------------------------------------------------- */

/* On i386, we have no reasonable alignment guarantees, thus we may
 * need to use the slower, unaligned variants for the load and store
 * instructions :-(
 * If malloc_cache is used (and we have memalign), we're fine, though ...
 */

#ifdef AVX512
# define ALIGN_REQ 0x3f
#elif defined(__AVX__)
# define ALIGN_REQ 0x1f
#else
# define ALIGN_REQ 0x0f
#endif

#if !defined(__x86_64__) && (!defined(MALLOC_CACHE) || defined(SSE_VARS_MAY_BE_UNALIGNED))
//# define _mm_load_pd _mm_loadu_pd
//# define _mm_load_ps _mm_loadu_ps
//# define _mm_store_pd _mm_storeu_pd
//# define _mm_store_ps _mm_storeu_ps
//# define MISALIGNMENT_CHECK(x) (false)
# define MISALIGNMENT_CHECK(x) ((unsigned long)(x) & ALIGN_REQ)
# if defined(__GNUC__) || defined(__INTEL_COMPILER)
# warning May have to use slow unaligned SSE insns
# endif
#else
# define MISALIGNMENT_CHECK(x) (UNLIKELY((unsigned long)(x) & ALIGN_REQ))
#endif
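
/* Example (illustrative only): MISALIGNMENT_CHECK(p) is nonzero whenever the
 * low address bits selected by ALIGN_REQ are set, i.e. whenever p is not
 * aligned to the vector width (16 bytes for SSE, 32 for AVX, 64 for AVX-512).
 */
#if 0
alignas(64) double buf[16];
double *p = buf + 1;     /* 8 bytes past a 64-byte boundary: misaligned for all three cases */
if (MISALIGNMENT_CHECK(p)) {
    /* p is misaligned: the kernel templates below then pass the 'u' argument  */
    /* so that the unaligned load/store variants are used for this operand.    */
}
#endif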


#ifdef WARN_UNALIGNED
# define WARN_UNALIGN(v) \
    STD__ cerr << "TBCI WARN: Unaligned access to " #v " at " << v << " from " << __FUNCTION__ << "\n";
#else
# define WARN_UNALIGN(v) do {} while (0)
#endif
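
/* Example (illustrative): building with WARN_UNALIGNED defined, e.g.
 *
 *   g++ -DWARN_UNALIGNED ...
 *
 * makes every misaligned operand report itself at run time, which helps
 * track down allocations that force the slower unaligned kernels.
 */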


/* -------------------------------------------------------------- */

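/* Illustrative outline (not generated code): every VKERN_TEMPL_*_SIMD
 * specialization below follows the same peel / main / tail structure:
 *   1. a scalar (SISD, SSUF) peel loop that runs until res reaches an
 *      ALIGN_REQ boundary or the data is exhausted,
 *   2. the SIMD main loop (MSUF), with the unaligned variant selected per
 *      source operand via MISALIGNMENT_CHECK(),
 *   3. SFIN, a scalar tail loop for the remaining elements, and FIN.
 */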
#define VKERN_TEMPL_3V_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                const TYPE* RESTRICT const _v2)) \
{ \
    REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
    REGISTER TYPE *res = _res; \
    PREP; \
    REGISTER long i = sz; \
    /* Make sure we have proper alignment */ \
    VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
        WARN_UNALIGN(v1); WARN_UNALIGN(v2); \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
    } else if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
    } else if (MISALIGNMENT_CHECK(v2)) { \
        WARN_UNALIGN(v2); \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
    } else { \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
    } \
    SFIN; \
    VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
    FIN; \
}

#define VKERN_TEMPL_3V_SIMD_UA(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                const TYPE* RESTRICT const _v2)) \
{ \
    REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
    REGISTER TYPE *res = _res; \
    PREP; \
    REGISTER long i = sz; \
    /* Make sure we have proper alignment */ \
    VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
        /*WARN_UNALIGN(v1); WARN_UNALIGN(v2);*/ \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
    } else if (MISALIGNMENT_CHECK(v1)) { \
        /*WARN_UNALIGN(v1);*/ \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
    } else if (MISALIGNMENT_CHECK(v2)) { \
        /*WARN_UNALIGN(v2);*/ \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
    } else { \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
    } \
    SFIN; \
    VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
    FIN; \
}

#define VKERN_TEMPL_3V_C_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                const TYPE* RESTRICT const _v2, \
                                LCTYPED(TYPE) _f2)) \
{ \
    REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
    REGISTER TYPE *res = _res; \
    PREP(_f2); \
    REGISTER long i = sz; \
    /* Make sure we have proper alignment */ \
    VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
        WARN_UNALIGN(v1); WARN_UNALIGN(v2); \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
    } else if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
    } else if (MISALIGNMENT_CHECK(v2)) { \
        WARN_UNALIGN(v2); \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
    } else { \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
    } \
    SFIN; \
    VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
    FIN(_f2); \
}

#define VKERN_TEMPL_3V_CC_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                const TYPE* RESTRICT const _v2, \
                                LCTYPED(TYPE) _f1, \
                                LCTYPED(TYPE) _f2)) \
{ \
    REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
    REGISTER TYPE *res = _res; \
    PREP(_f1, _f2); \
    REGISTER long i = sz; \
    /* Make sure we have proper alignment */ \
    VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
        WARN_UNALIGN(v1); WARN_UNALIGN(v2); \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
    } else if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
    } else if (MISALIGNMENT_CHECK(v2)) { \
        WARN_UNALIGN(v2); \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
    } else { \
        VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
    } \
    SFIN; \
    VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
    FIN(_f1, _f2); \
}


#define VKERN_TEMPL_2V_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1)) \
{ \
    REGISTER TYPE *res = _res; \
    REGISTER const TYPE *v1 = _v1; \
    PREP; \
    REGISTER long i = sz; \
    /* Make sure we have proper alignment */ \
    VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u) \
    } else { \
        VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
    } \
    SFIN; \
    VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
    FIN; \
}

#define VKERN_TEMPL_2V_C_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                LCTYPED(TYPE) _f2)) \
{ \
    REGISTER const TYPE *v1 = _v1; \
    REGISTER TYPE *res = _res; \
    PREP(_f2); \
    REGISTER long i = sz; \
    /* Make sure we have proper alignment */ \
    VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u) \
    } else { \
        VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
    } \
    SFIN; \
    VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
    FIN(_f2); \
}

#define VKERN_TEMPL_2V_CC_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                LCTYPED(TYPE) _f1, \
                                LCTYPED(TYPE) _f2)) \
{ \
    REGISTER const TYPE *v1 = _v1; \
    REGISTER TYPE *res = _res; \
    PREP(_f1, _f2); \
    REGISTER long i = sz; \
    /* Make sure we have proper alignment */ \
    VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u) \
    } else { \
        VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
    } \
    SFIN; \
    VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
    FIN(_f1, _f2); \
}

#define VKERN_TEMPL_2V_T_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                const TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                TYPE &_f2)) \
{ \
    REGISTER const TYPE *res = _res, *v1 = _v1; \
    /* PREP(0.0,_f2); */ \
    PREP(_f2); \
    REGISTER long i = sz; \
    /* Make sure we have proper alignment */ \
    VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u) \
    } else { \
        VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
    } \
    SFIN; \
    VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
    /* FIN(_f1,_f2); */ \
    FIN(_f2); \
}

#define VKERN_TEMPL_2V_T_SIMD_VL(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                const TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                volatile long &_f2)) \
{ \
    REGISTER const TYPE *res = _res, *v1 = _v1; \
    /* PREP(0.0,_f2); */ \
    PREP(_f2); \
    REGISTER long i = sz; \
    REGISTER int rg = 0; \
    /* Make sure we have proper alignment */ \
    VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    if (MISALIGNMENT_CHECK(v1)) { \
        WARN_UNALIGN(v1); \
        VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u) \
    } else { \
        VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
    } \
    SFIN; \
    VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
    /* FIN(_f1,_f2); */ \
_fin: \
    FIN(_f2); \
}

#define VKERN_TEMPL_1V_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res)) \
{ \
    REGISTER TYPE *res = _res; \
    PREP; \
    REGISTER long i = sz; \
    /* Make sure we have proper alignment */ \
    VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
    SFIN; \
    VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
    FIN; \
}

#define VKERN_TEMPL_1V_C_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                LCTYPED(TYPE) _f2)) \
{ \
    REGISTER TYPE *res = _res; \
    PREP(_f2); \
    REGISTER long i = sz; \
    /* Make sure we have proper alignment */ \
    VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
    SFIN; \
    VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
    FIN(_f2); \
}

#define VKERN_TEMPL_1V_CC_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                LCTYPED(TYPE) _f1, \
                                LCTYPED(TYPE) _f2)) \
{ \
    REGISTER TYPE *res = _res; \
    PREP(_f1, _f2); \
    REGISTER long i = sz; \
    /* Make sure we have proper alignment */ \
    VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
    SFIN; \
    VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
    FIN(_f1, _f2); \
}

#define VKERN_TEMPL_1V_T_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                const TYPE* const _res, \
                                TYPE &_f2)) \
{ \
    REGISTER const TYPE *res = _res; \
    /* PREP(0.0,_f2); */ \
    PREP(_f2); \
    REGISTER long i = sz; \
    /* Make sure we have proper alignment */ \
    VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
    VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
    SFIN; \
    VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
    /* FIN(_f1,_f2); */ \
    FIN(_f2); \
}

#endif // H_UNROLL_PREFETCH_SIMD_DEF_H