#ifndef H_UNROLL_PREFETCH_SIMD_DEF_H
#define H_UNROLL_PREFETCH_SIMD_DEF_H

#define UNROLL4_PREF_KERNEL5_SIMD(MDOP,ADV,T,SUF,UNA1,UNA2) \
  if (EL_PER_CL(T) <= 1) { \
    MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
    i -= 4*ADV; \
    MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
    v1 += 4*ADV; \
    MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
    v2 += 4*ADV; \
    MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
    res += 4*ADV; \
  } else if (EL_PER_CL(T) <= 2) { \
    MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
    i -= 4*ADV; \
    MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
    v1 += 4*ADV; \
    MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
    v2 += 4*ADV; \
    MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
    res += 4*ADV; \
  } else { \
    MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
    i -= 4*ADV; \
    MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
    v1 += 4*ADV; \
    MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
    v2 += 4*ADV; \
    MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
    res += 4*ADV; \
  }
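/* NOTE: the three EL_PER_CL(T) branches above are identical in this
 * version; the split by elements-per-cache-line presumably exists so
 * that prefetch instructions can be scheduled at different unroll
 * positions depending on how many elements share a cache line. */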


#define UNROLL4_KERNEL5_SIMD(MDOP,ADV,SUF,UNA1,UNA2) \
  MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
  i -= 4*ADV; \
  MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
  v1 += 4*ADV; \
  MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
  v2 += 4*ADV; \
  MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
  res += 4*ADV

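/* Illustration only, not library code: a hand-expanded instance of one
 * UNROLL4_KERNEL5_SIMD step for double addition with SSE2, assuming
 * ADV == 2 (one __m128d holds two doubles).  The counter and pointer
 * updates are interleaved between the vector ops, which is why the
 * later loads use negative offsets (v1-2*ADV, v1-ADV): v1 and v2 have
 * already been advanced by 4*ADV at that point. */
#if 0
#include <emmintrin.h>
static inline void add4_unrolled_step(double*& res, const double*& v1,
                                      const double*& v2, long& i)
{
    _mm_store_pd(res,   _mm_add_pd(_mm_load_pd(v1),   _mm_load_pd(v2)));
    i -= 8;                                /* 4*ADV elements consumed   */
    _mm_store_pd(res+2, _mm_add_pd(_mm_load_pd(v1+2), _mm_load_pd(v2+2)));
    v1 += 8;                               /* v1 advanced early, so the */
    _mm_store_pd(res+4, _mm_add_pd(_mm_load_pd(v1-4), _mm_load_pd(v2+4)));
    v2 += 8;                               /* remaining loads use       */
    _mm_store_pd(res+6, _mm_add_pd(_mm_load_pd(v1-2), _mm_load_pd(v2-2)));
    res += 8;                              /* negative offsets          */
}
#endif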

/* Without prefetching */
#define VKERN_TEMPL_3V_NP_SIMD(MDOP,ADV,STP,SUF,UNA1,UNA2) \
  if (LIKELY(i >= 4*ADV)) { \
    STP TMP, LD UNUSED; \
    do { \
      UNROLL4_KERNEL5_SIMD(MDOP,ADV,SUF,UNA1,UNA2); \
    } while (i >= 4*ADV); \
  }

/* Without unrolling */
#define VKERN_TEMPL_3V_PLAIN_SIMD(MDOP,ADV,STP,SUF,UNA1,UNA2) \
  while (i >= ADV) { \
    STP TMP, LD UNUSED; \
    MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
    i -= ADV; res += ADV; v1 += ADV; v2 += ADV; \
  }

/* Without SIMD */
#define VKERN_TEMPL_3V_SISD(SDOP,COND,STP,SUF) \
  while (COND && i) { \
    STP TMP, LD UNUSED; \
    SDOP(res,v1,v2,f1,f2,SUF,,); \
    --i; ++res; ++v1; ++v2; \
  }

/*************************************************************/

#define UNROLL4_PREF_KERNEL4_SIMD(MDOP,ADV,T,SUF,UNA) \
  if (EL_PER_CL(T) <= 1) { \
    MDOP(res,v1,f1,f2,SUF,UNA); \
    i -= 4*ADV; \
    MDOP(res+ADV,v1+ADV,f1,f2,SUF,UNA); \
    MDOP(res+2*ADV,v1+2*ADV,f1,f2,SUF,UNA); \
    v1 += 4*ADV; \
    MDOP(res+3*ADV,v1-ADV,f1,f2,SUF,UNA); \
    res += 4*ADV; \
  } else if (EL_PER_CL(T) <= 2) { \
    MDOP(res,v1,f1,f2,SUF,UNA); \
    i -= 4*ADV; \
    MDOP(res+ADV,v1+ADV,f1,f2,SUF,UNA); \
    MDOP(res+2*ADV,v1+2*ADV,f1,f2,SUF,UNA); \
    v1 += 4*ADV; \
    MDOP(res+3*ADV,v1-ADV,f1,f2,SUF,UNA); \
    res += 4*ADV; \
  } else { \
    MDOP(res,v1,f1,f2,SUF,UNA); \
    i -= 4*ADV; \
    MDOP(res+ADV,v1+ADV,f1,f2,SUF,UNA); \
    MDOP(res+2*ADV,v1+2*ADV,f1,f2,SUF,UNA); \
    v1 += 4*ADV; \
    MDOP(res+3*ADV,v1-ADV,f1,f2,SUF,UNA); \
    res += 4*ADV; \
  }


#define UNROLL4_KERNEL4_SIMD(MDOP,ADV,SUF,UNA) \
  MDOP(res, v1, f1, f2, SUF, UNA); \
  MDOP(res+ADV, v1+ADV, f1, f2, SUF, UNA); \
  i -= 4*ADV; \
  MDOP(res+2*ADV, v1+2*ADV, f1, f2, SUF, UNA); \
  MDOP(res+3*ADV, v1+3*ADV, f1, f2, SUF, UNA); \
  v1 += 4*ADV; res += 4*ADV


/* Without prefetching */
#define VKERN_TEMPL_2V_NP_SIMD(MDOP,ADV,STP,SUF,UNA) \
  if (LIKELY(i >= 4*ADV)) { \
    STP TMP, LD UNUSED, t UNUSED; \
    do { \
      UNROLL4_KERNEL4_SIMD(MDOP,ADV,SUF,UNA); \
    } while (i >= 4*ADV); \
  }

/* Without unrolling */
#define VKERN_TEMPL_2V_PLAIN_SIMD(MDOP,ADV,STP,SUF,UNA) \
  while (i >= ADV) { \
    STP TMP, LD UNUSED, t UNUSED; \
    MDOP(res, v1, f1, f2, SUF, UNA); \
    i -= ADV; v1 += ADV; res += ADV; \
  }

/* Without SIMD */
#define VKERN_TEMPL_2V_SISD(SDOP,COND,STP,SUF) \
  while (COND && i) { \
    STP TMP, LD UNUSED, t UNUSED; \
    SDOP(res,v1,f1,f2,SUF,u); \
    --i; ++v1; ++res; \
  }


/*************************************************************/

#define UNROLL4_PREF_KERNEL3_SIMD(MDOP,ADV,T,SUF) \
  if (EL_PER_CL(T) <= 1) { \
    MDOP(res, f1, f2, SUF); \
    MDOP(res+ADV, f1, f2, SUF); \
    i -= 4*ADV; \
    MDOP(res+2*ADV, f1, f2, SUF); \
    MDOP(res+3*ADV, f1, f2, SUF); \
    res += 4*ADV; \
  } else if (EL_PER_CL(T) <= 2) { \
    MDOP(res, f1, f2, SUF); \
    MDOP(res+ADV, f1, f2, SUF); \
    i -= 4*ADV; \
    MDOP(res+2*ADV, f1, f2, SUF); \
    MDOP(res+3*ADV, f1, f2, SUF); \
    res += 4*ADV; \
  } else { \
    MDOP(res, f1, f2, SUF); \
    MDOP(res+ADV, f1, f2, SUF); \
    i -= 4*ADV; \
    MDOP(res+2*ADV, f1, f2, SUF); \
    MDOP(res+3*ADV, f1, f2, SUF); \
    res += 4*ADV; \
  }


#define UNROLL4_KERNEL3_SIMD(MDOP,ADV,SUF) \
  MDOP(res, f1, f2, SUF); \
  MDOP(res+ADV, f1, f2, SUF); \
  i -= 4*ADV; \
  MDOP(res+2*ADV, f1, f2, SUF); \
  MDOP(res+3*ADV, f1, f2, SUF); \
  res += 4*ADV

/* Without prefetching */
#define VKERN_TEMPL_1V_NP_SIMD(MDOP,ADV,STP,SUF) \
  if (LIKELY(i >= 4*ADV)) { \
    STP TMP UNUSED; \
    STP y UNUSED; \
    STP t UNUSED; \
    do { \
      UNROLL4_KERNEL3_SIMD(MDOP,ADV,SUF); \
    } while (i >= 4*ADV); \
  }

/* Without unrolling */
#define VKERN_TEMPL_1V_PLAIN_SIMD(MDOP,ADV,STP,SUF) \
  while (i >= ADV) { \
    STP TMP UNUSED; \
    STP y UNUSED; \
    STP t UNUSED; \
    MDOP(res,f1,f2,SUF); \
    i -= ADV; res += ADV; \
  }

/* Without SIMD */
#define VKERN_TEMPL_1V_SISD(SDOP,COND,STP,SUF) \
  while (COND && i) { \
    STP TMP UNUSED; \
    STP y UNUSED; \
    STP t UNUSED; \
    SDOP(res,f1,f2,SUF); \
    --i; ++res; \
  }

/* -------------------------------------------------------------- */
#ifndef TBCI_SIMD_UNROLL
# define NO_TBCI_SIMD_UNROLL
#endif


#ifdef TBCI_SIMD_UNROLL
# define VKERN_TEMPL_3V_K_SIMD(m,a,s,f,u1,u2) VKERN_TEMPL_3V_NP_SIMD(m,a,s,f,u1,u2)
# define VKERN_TEMPL_2V_K_SIMD(m,a,s,f,u) VKERN_TEMPL_2V_NP_SIMD(m,a,s,f,u)
# define VKERN_TEMPL_1V_K_SIMD(m,a,s,f) VKERN_TEMPL_1V_NP_SIMD(m,a,s,f)
#else
# define VKERN_TEMPL_3V_K_SIMD(m,a,s,f,u1,u2) VKERN_TEMPL_3V_PLAIN_SIMD(m,a,s,f,u1,u2)
# define VKERN_TEMPL_2V_K_SIMD(m,a,s,f,u) VKERN_TEMPL_2V_PLAIN_SIMD(m,a,s,f,u)
# define VKERN_TEMPL_1V_K_SIMD(m,a,s,f) VKERN_TEMPL_1V_PLAIN_SIMD(m,a,s,f)
#endif
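/* Build note: defining TBCI_SIMD_UNROLL (e.g. compiling with
 * -DTBCI_SIMD_UNROLL) routes the VKERN_TEMPL_*_K_SIMD dispatchers to
 * the 4x-unrolled kernels above; without it, the plain
 * one-step-per-iteration loops are selected and NO_TBCI_SIMD_UNROLL is
 * defined as a marker. */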

/* -------------------------------------------------------------- */

/* On i386, we have no reasonable alignment guarantees, thus we may
 * need to use the slower, unaligned variants of the load and store
 * instructions :-(
 * If malloc_cache is used (and we have memalign), we're fine, though ...
 */

#ifdef AVX512
# define ALIGN_REQ 0x3f
#elif defined(__AVX__)
# define ALIGN_REQ 0x1f
#else
# define ALIGN_REQ 0x0f
#endif

#if !defined(__x86_64__) && (!defined(MALLOC_CACHE) || defined(SSE_VARS_MAY_BE_UNALIGNED))
//# define _mm_load_pd _mm_loadu_pd
//# define _mm_load_ps _mm_loadu_ps
//# define _mm_store_pd _mm_storeu_pd
//# define _mm_store_ps _mm_storeu_ps
//# define MISALIGNMENT_CHECK(x) (false)
# define MISALIGNMENT_CHECK(x) ((unsigned long)(x) & ALIGN_REQ)
# if defined(__GNUC__) || defined(__INTEL_COMPILER)
# warning May have to use slow unaligned SSE insns
# endif
#else
# define MISALIGNMENT_CHECK(x) (UNLIKELY((unsigned long)(x) & ALIGN_REQ))
#endif
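/* Illustration only, not library code: MISALIGNMENT_CHECK masks the low
 * address bits against the vector alignment requirement, so it is
 * non-zero exactly when the pointer is NOT suitably aligned.  A minimal
 * sketch for the SSE case (ALIGN_REQ == 0x0f, 16-byte vectors): */
#if 0
#include <cstdio>
int main()
{
    double buf[8];
    const unsigned long align_req = 0x0f;   /* 16-byte alignment mask */
    for (int k = 0; k < 4; ++k) {
        const double* p = buf + k;
        std::printf("buf+%d: %s\n", k,
                    ((unsigned long)p & align_req) ? "unaligned" : "aligned");
    }
    return 0;
}
#endif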

#ifdef WARN_UNALIGNED
# define WARN_UNALIGN(v) \
  STD__ cerr << "TBCI WARN: Unaligned access to " #v " at " << v << " from " << __FUNCTION__ << "\n";
#else
# define WARN_UNALIGN(v) do {} while (0)
#endif

/* -------------------------------------------------------------- */

#define VKERN_TEMPL_3V_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                const TYPE* RESTRICT const _v2)) \
{ \
  REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
  REGISTER TYPE *res = _res; \
  PREP; \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
    WARN_UNALIGN(v1); WARN_UNALIGN(v2); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
  } else if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
  } else if (MISALIGNMENT_CHECK(v2)) { \
    WARN_UNALIGN(v2); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
  } else { \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
  } \
  SFIN; \
  VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
  FIN; \
}
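/* Illustration only, not generated library code: the overall shape of a
 * specialization produced by VKERN_TEMPL_3V_SIMD, written out by hand
 * for res[k] = v1[k] + v2[k] on double with SSE2 (ADV == 2).  The
 * structure is: a scalar peel loop until res is 16-byte aligned, a SIMD
 * main loop chosen by the source alignment tests, then a scalar tail.
 * The function name and the merged two-way alignment dispatch are
 * assumptions for brevity; the real macro distinguishes all four v1/v2
 * alignment combinations and may unroll the SIMD loop. */
#if 0
#include <emmintrin.h>
static void vec_add_double(const unsigned long sz, double* const res_,
                           const double* const v1_, const double* const v2_)
{
    const double *v1 = v1_, *v2 = v2_;
    double *res = res_;
    long i = sz;
    /* 1. Scalar peel until the destination is 16-byte aligned. */
    while (((unsigned long)res & 0x0f) && i) {
        *res = *v1 + *v2; --i; ++res; ++v1; ++v2;
    }
    /* 2. SIMD main loop; unaligned loads if either source failed the
     *    alignment test. */
    if (((unsigned long)v1 & 0x0f) || ((unsigned long)v2 & 0x0f)) {
        while (i >= 2) {
            _mm_store_pd(res, _mm_add_pd(_mm_loadu_pd(v1), _mm_loadu_pd(v2)));
            i -= 2; res += 2; v1 += 2; v2 += 2;
        }
    } else {
        while (i >= 2) {
            _mm_store_pd(res, _mm_add_pd(_mm_load_pd(v1), _mm_load_pd(v2)));
            i -= 2; res += 2; v1 += 2; v2 += 2;
        }
    }
    /* 3. Scalar tail for the leftover element, if any. */
    while (i) { *res = *v1 + *v2; --i; ++res; ++v1; ++v2; }
}
#endif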


#define VKERN_TEMPL_3V_SIMD_UA(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                const TYPE* RESTRICT const _v2)) \
{ \
  REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
  REGISTER TYPE *res = _res; \
  PREP; \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
    /*WARN_UNALIGN(v1); WARN_UNALIGN(v2);*/ \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
  } else if (MISALIGNMENT_CHECK(v1)) { \
    /*WARN_UNALIGN(v1);*/ \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
  } else if (MISALIGNMENT_CHECK(v2)) { \
    /*WARN_UNALIGN(v2);*/ \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
  } else { \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
  } \
  SFIN; \
  VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
  FIN; \
}

#define VKERN_TEMPL_3V_C_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                const TYPE* RESTRICT const _v2, \
                                LCTYPED(TYPE) _f2)) \
{ \
  REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
  REGISTER TYPE *res = _res; \
  PREP(_f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
    WARN_UNALIGN(v1); WARN_UNALIGN(v2); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
  } else if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
  } else if (MISALIGNMENT_CHECK(v2)) { \
    WARN_UNALIGN(v2); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
  } else { \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
  } \
  SFIN; \
  VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
  FIN(_f2); \
}

#define VKERN_TEMPL_3V_CC_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                const TYPE* RESTRICT const _v2, \
                                LCTYPED(TYPE) _f1, \
                                LCTYPED(TYPE) _f2)) \
{ \
  REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
  REGISTER TYPE *res = _res; \
  PREP(_f1, _f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_3V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1) && MISALIGNMENT_CHECK(v2)) { \
    WARN_UNALIGN(v1); WARN_UNALIGN(v2); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
  } else if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
  } else if (MISALIGNMENT_CHECK(v2)) { \
    WARN_UNALIGN(v2); \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
  } else { \
    VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
  } \
  SFIN; \
  VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
  FIN(_f1, _f2); \
}

#define VKERN_TEMPL_2V_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1)) \
{ \
  REGISTER TYPE *res = _res; \
  REGISTER const TYPE *v1 = _v1; \
  PREP; \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u); \
  } else { \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
  } \
  SFIN; \
  VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
  FIN; \
}

#define VKERN_TEMPL_2V_C_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                LCTYPED(TYPE) _f2)) \
{ \
  REGISTER const TYPE *v1 = _v1; \
  REGISTER TYPE *res = _res; \
  PREP(_f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u); \
  } else { \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
  } \
  SFIN; \
  VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
  FIN(_f2); \
}

#define VKERN_TEMPL_2V_CC_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                LCTYPED(TYPE) _f1, \
                                LCTYPED(TYPE) _f2)) \
{ \
  REGISTER const TYPE *v1 = _v1; \
  REGISTER TYPE *res = _res; \
  PREP(_f1, _f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u); \
  } else { \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
  } \
  SFIN; \
  VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
  FIN(_f1, _f2); \
}

#define VKERN_TEMPL_2V_T_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                const TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                TYPE &_f2)) \
{ \
  REGISTER const TYPE *res = _res, *v1 = _v1; \
  /* PREP(0.0,_f2); */ \
  PREP(_f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u); \
  } else { \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
  } \
  SFIN; \
  VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
  /* FIN(_f1,_f2); */ \
  FIN(_f2); \
}

#define VKERN_TEMPL_2V_T_SIMD_VL(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                const TYPE* RESTRICT const _res, \
                                const TYPE* RESTRICT const _v1, \
                                volatile long &_f2)) \
{ \
  REGISTER const TYPE *res = _res, *v1 = _v1; \
  /* PREP(0.0,_f2); */ \
  PREP(_f2); \
  REGISTER long i = sz; \
  REGISTER int rg = 0; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_2V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  if (MISALIGNMENT_CHECK(v1)) { \
    WARN_UNALIGN(v1); \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u); \
  } else { \
    VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
  } \
  SFIN; \
  VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
  /* FIN(_f1,_f2); */ \
_fin: \
  FIN(_f2); \
}
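/* Note: in the _T variants above, _res is read-only and the result
 * comes back through the _f2 reference, so these presumably generate
 * reduction- or test-style kernels (sums, extrema, comparisons); the
 * exact semantics are supplied by the caller's OP/PREP/SFIN/FIN
 * macros. */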

#define VKERN_TEMPL_1V_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res)) \
{ \
  REGISTER TYPE *res = _res; \
  PREP; \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
  SFIN; \
  VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
  FIN; \
}

#define VKERN_TEMPL_1V_C_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                LCTYPED(TYPE) _f2)) \
{ \
  REGISTER TYPE *res = _res; \
  PREP(_f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
  SFIN; \
  VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
  FIN(_f2); \
}

#define VKERN_TEMPL_1V_CC_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                TYPE* RESTRICT const _res, \
                                LCTYPED(TYPE) _f1, \
                                LCTYPED(TYPE) _f2)) \
{ \
  REGISTER TYPE *res = _res; \
  PREP(_f1, _f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
  SFIN; \
  VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
  FIN(_f1, _f2); \
}

#define VKERN_TEMPL_1V_T_SIMD(FNAME,OP,SSUF,MSUF,PREP,SFIN,FIN,ADV,TYPE,STP) \
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
                                const TYPE* const _res, \
                                TYPE &_f2)) \
{ \
  REGISTER const TYPE *res = _res; \
  /* PREP(0.0,_f2); */ \
  PREP(_f2); \
  REGISTER long i = sz; \
  /* Make sure we have proper alignment */ \
  VKERN_TEMPL_1V_SISD(OP,MISALIGNMENT_CHECK(res),STP,SSUF); \
  VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
  SFIN; \
  VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
  /* FIN(_f1,_f2); */ \
  FIN(_f2); \
}

#endif // H_UNROLL_PREFETCH_SIMD_DEF_H