/* TBCI Numerical high perf. C++ Library 2.8.0 -- vec_kern_special2.h */

#ifndef H_VEC_KERN_SPECIAL2_H
#define H_VEC_KERN_SPECIAL2_H

#if defined(__AVX__) && defined(HAVE_IMMINTRIN_H) && defined(HAVE_WEAK_ATTR) && \
    ( defined(__x86_64__) || defined(__i386__) )

#include <immintrin.h>

#include "tbci/unroll_prefetch_simd_def.h"

/* TODO: Add define, controlling instantiation */

#if 0 //defined(TBCI_SELECTIVE_INST) && !defined(TBCI_INSTANTIATE) && !defined(AUTO_DECL)
# include "vec_kern_special2_gd.h"
#else

#ifdef WARN_SSE
# define WARN_SIMD
#endif


#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SIMD)
# warning Info: Using unrolled AVX vector kernels
#endif

// TODO: Are integers useful as well?
// Unfortunately, the integer intrinsics follow a slightly different
// naming scheme, so some manual work would be required.
// Maybe later ...

#ifdef AVX512
# define __MAVXD __m512d
# define __MAVXS __m512
# define _MMAVX(x) _mm512_##x
# define AVXMD 8
# define AVXMS 16
#else
# define __MAVXD __m256d
# define __MAVXS __m256
# define _MMAVX(x) _mm256_##x
# define AVXMD 4
# define AVXMS 8
#endif
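
/* Everything below is written once against these macros. An expansion
 * example (illustration only):
 *   _MMAVX(set1_pd(x))  ->  _mm256_set1_pd(x)   (default: 4 doubles/vector)
 *   _MMAVX(set1_pd(x))  ->  _mm512_set1_pd(x)   (AVX512: 8 doubles/vector)
 * AVXMD/AVXMS are the matching element counts (4/8 doubles, 8/16 floats). */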


#define SIMD_EMPTY0 do {} while (0)
#define SIMD_EMPTY1(x) do {} while (0)
#define SIMD_EMPTY2(x,y) do {} while (0)

// Note: the constant operands are broadcast into the f1/f2 vector registers once, up front.
#define SIMD_CONST_DOUBLE_PREP(x) REGISTER __MAVXD f2 = _MMAVX(set1_pd(x))
#define SIMD_2CONST_DOUBLE_PREP(x,y) REGISTER __MAVXD f1 = _MMAVX(set1_pd(x)), f2 = _MMAVX(set1_pd(y))

#define SIMD_CONST_FLOAT_PREP(x) REGISTER __MAVXS f2 = _MMAVX(set1_ps(x))
#define SIMD_2CONST_FLOAT_PREP(x,y) REGISTER __MAVXS f1 = _MMAVX(set1_ps(x)), f2 = _MMAVX(set1_ps(y))


/* _mm256_load_sd, _ss do not exist */
#define _mm256_set_sd(d)   (__extension__ (__m256d  ){ d, 0.0, 0.0, 0.0 })
#define _mm256_set_ss(f)   (__extension__ (__m256   ){ f, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 })
#define _mm256_load_sd(m)  (__extension__ (__m256d  ){ *m, 0.0, 0.0, 0.0 })
#define _mm256_loadu_sd(m) (__extension__ (__m256d_u){ *m, 0.0, 0.0, 0.0 })
#define _mm256_load_ss(m)  (__extension__ (__m256   ){ *m, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 })
#define _mm256_loadu_ss(m) (__extension__ (__m256_u ){ *m, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 })
#define _mm256_store_sd(m, r) (*(m) = ((__v4df)(r))[0])
#define _mm256_store_ss(m, r) (*(m) = ((__v8sf)(r))[0])

/* First the stuff from basics.h */

/* Work around a compiler bug in gcc 4.0.0 (PR 21239) */
#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 0 && \
    (! defined(__GNUC_PATCHLEVEL__) || __GNUC_PATCHLEVEL__ == 0)
# define _MM_STORE(mem, reg, SUF, UNA) \
    asm(""::"x"(reg)); \
    _MMAVX(store##UNA##_##SUF(mem, reg))
#else
# define _MM_STORE(mem, reg, SUF, UNA) \
    _MMAVX(store##UNA##_##SUF(mem, reg))
#endif

/* General policy: We'll always ensure alignment of the result
 * vector, so no need to use unaligned insns for it. */
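/* Concretely: the OP macros below take an UNA argument ("" for aligned,
 * "u" for unaligned) only for their input vectors, while the result is
 * always written through _MM_STORE with the aligned store variant. */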

#ifndef C_MEMALLOC
/* Template decl should be in basics.h */
//# define COPY2(res,v1,f1,f2) res = v1
#define COPY2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_SIMD(_tbci_copy, COPY2_SIMD, sd, pd,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_SIMD(_tbci_copy, COPY2_SIMD, ss, ps,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS)
/* We could use more xmm registers here ... but CPU-internal
 * register renaming should hide the latencies even without that. */

/* TODO: Benchmark copy and fill ... */
/* TODO: Avoid gcc unrolling this even more ... */
/* TODO: Here integers and pointers certainly make sense */

//# define FILL1(res,f1,f2) res = f2
#define FILL1_SIMD(r,f1,f2,SUF) \
    _MM_STORE(r, f2, SUF,)
VKERN_TEMPL_1V_C_SIMD(_tbci_fill, FILL1_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_1V_C_SIMD(_tbci_fill, FILL1_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)
/* TODO: Here integers and pointers certainly make sense */
#endif

// TODO: Should we rather use _mm_add_sd, _mm_sub_sd
#define _mm256_add_sd _mm256_add_pd
#define _mm256_add_ss _mm256_add_ps
#define _mm256_sub_sd _mm256_sub_pd
#define _mm256_sub_ss _mm256_sub_ps

#define _mm256_mul_sd _mm256_mul_pd
#define _mm256_mul_ss _mm256_mul_ps
#define _mm256_div_sd _mm256_div_pd
#define _mm256_div_ss _mm256_div_ps

#define _mm512_add_sd _mm512_add_pd
#define _mm512_add_ss _mm512_add_ps
#define _mm512_sub_sd _mm512_sub_pd
#define _mm512_sub_ss _mm512_sub_ps

#define _mm512_mul_sd _mm512_mul_pd
#define _mm512_mul_ss _mm512_mul_ps
#define _mm512_div_sd _mm512_div_pd
#define _mm512_div_ss _mm512_div_ps
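
/* These scalar-suffix aliases let the SISD (sd, ss) loop tails reuse the
 * packed operations: only lane 0 is ever written back (see
 * _mm256_store_sd/_ss above), so whatever the upper lanes compute --
 * including spurious NaNs from div -- is simply discarded. */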


/* 3Vec operations */
//#define ADD3(r,v1,v2,f1,f2) r = v1 + v2
#define ADD3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(add_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_3V_SIMD(do_vec_vec_add, ADD3_SIMD, sd, pd,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_3V_SIMD(do_vec_vec_add, ADD3_SIMD, ss, ps,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS)

//#define SUB3(r,v1,v2,f1,f2) r = v1 - v2
#define SUB3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(sub_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_3V_SIMD(do_vec_vec_sub, SUB3_SIMD, sd, pd,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_3V_SIMD(do_vec_vec_sub, SUB3_SIMD, ss, ps,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS)

//#define MUL3(r,v1,v2,f1,f2) r = v1 * v2
#define MUL3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(mul_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_3V_SIMD(do_vec_vec_mul, MUL3_SIMD, sd, pd,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_3V_SIMD(do_vec_vec_mul, MUL3_SIMD, ss, ps,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS)

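/* For the real element types handled here, conj() is the identity, so the
 * conjugate-product/quotient specializations below simply forward to the
 * plain versions. */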
template <> inline void do_vec_vec_cmul<double>(const unsigned long sz,
    double* RESTRICT const res, const double* RESTRICT const v1,
    const double* RESTRICT const v2)
{
    do_vec_vec_mul<double>(sz, res, v1, v2);
}
template <> inline void do_vec_vec_cmul<float>(const unsigned long sz,
    float* RESTRICT const res, const float* RESTRICT const v1,
    const float* RESTRICT const v2)
{
    do_vec_vec_mul<float>(sz, res, v1, v2);
}

//#define DIV3(r,v1,v2,f1,f2) r = v1 / v2
#define DIV3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(div_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_3V_SIMD(do_vec_vec_div, DIV3_SIMD, sd, pd,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_3V_SIMD(do_vec_vec_div, DIV3_SIMD, ss, ps,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS)

template <> inline void do_vec_vec_cdiv<double>(const unsigned long sz,
    double* RESTRICT const res, const double* RESTRICT const v1,
    const double* RESTRICT const v2)
{
    do_vec_vec_div<double>(sz, res, v1, v2);
}
template <> inline void do_vec_vec_cdiv<float>(const unsigned long sz,
    float* RESTRICT const res, const float* RESTRICT const v1,
    const float* RESTRICT const v2)
{
    do_vec_vec_div<float>(sz, res, v1, v2);
}


//#define ADD2(r,v1,f1,f2) r += v1
#define ADD2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(add_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_SIMD(do_vec_add_vec, ADD2_SIMD, sd, pd,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_SIMD(do_vec_add_vec, ADD2_SIMD, ss, ps,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS)

//#define SUB2(r,v1,f1,f2) r -= v1
#define SUB2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(sub_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_SIMD(do_vec_sub_vec, SUB2_SIMD, sd, pd,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_SIMD(do_vec_sub_vec, SUB2_SIMD, ss, ps,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS)

//#define SUB2I(r,v1,f1,f2) r = v1 - r
#define SUB2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(sub_##SUF(LD, TMP)); \
    _MM_STORE(r, LD, SUF,)
VKERN_TEMPL_2V_SIMD(do_vec_sub_vec_inv, SUB2I_SIMD, sd, pd,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_SIMD(do_vec_sub_vec_inv, SUB2I_SIMD, ss, ps,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS)

//#define MUL2(r,v1,f1,f2) r *= v1
#define MUL2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(mul_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_SIMD(do_vec_mul_vec, MUL2_SIMD, sd, pd,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_SIMD(do_vec_mul_vec, MUL2_SIMD, ss, ps,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS)

//#define CMUL2(r,v1,f1,f2) r = CPLX__ conj(r) * v1
template <> inline void do_vec_cmul_vec<double>(const unsigned long sz,
    double* RESTRICT const res, const double* RESTRICT const v1)
{
    do_vec_mul_vec<double>(sz, res, v1);
}
template <> inline void do_vec_cmul_vec<float>(const unsigned long sz,
    float* RESTRICT const res, const float* RESTRICT const v1)
{
    do_vec_mul_vec<float>(sz, res, v1);
}
//#define CMUL2I(r,v1,f1,f2) r *= CPLX__ conj(v1)
template <> inline void do_vec_cmul_vec_inv<double>(const unsigned long sz,
    double* RESTRICT const res, const double* RESTRICT const v1)
{
    do_vec_mul_vec<double>(sz, res, v1);
}
template <> inline void do_vec_cmul_vec_inv<float>(const unsigned long sz,
    float* RESTRICT const res, const float* RESTRICT const v1)
{
    do_vec_mul_vec<float>(sz, res, v1);
}

//#define DIV2(r,v1,f1,f2) r /= v1
#define DIV2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(div_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_SIMD(do_vec_div_vec, DIV2_SIMD, sd, pd,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_SIMD(do_vec_div_vec, DIV2_SIMD, ss, ps,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS)

//#define DIV2I(r,v1,f1,f2) r = v1 / r
#define DIV2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(div_##SUF(LD, TMP)); \
    _MM_STORE(r, LD, SUF,)
VKERN_TEMPL_2V_SIMD(do_vec_div_vec_inv, DIV2I_SIMD, sd, pd,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_SIMD(do_vec_div_vec_inv, DIV2I_SIMD, ss, ps,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS)

//#define CDIV2(r,v1,f1,f2) r = CPLX__ conj(r) / v1
template <> inline void do_vec_cdiv_vec<double>(const unsigned long sz,
    double* RESTRICT const res, const double* RESTRICT const v1)
{
    do_vec_div_vec<double>(sz, res, v1);
}
template <> inline void do_vec_cdiv_vec<float>(const unsigned long sz,
    float* RESTRICT const res, const float* RESTRICT const v1)
{
    do_vec_div_vec<float>(sz, res, v1);
}


//#define CDIV2I(r,v1,f1,f2) r = CPLX__ conj(v1) / r
template <> inline void do_vec_cdiv_vec_inv<double>(const unsigned long sz,
    double* RESTRICT const res, const double* RESTRICT const v1)
{
    do_vec_div_vec_inv<double>(sz, res, v1);
}
template <> inline void do_vec_cdiv_vec_inv<float>(const unsigned long sz,
    float* RESTRICT const res, const float* RESTRICT const v1)
{
    do_vec_div_vec_inv<float>(sz, res, v1);
}

//#define ADD2NV(r,v1,f1,f2) r = v1 + f2
#define ADD2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(add_##SUF(TMP, f2)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_C_SIMD(do_vec_val_add, ADD2NV_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_C_SIMD(do_vec_val_add, ADD2NV_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define SUB2NV(r,v1,f1,f2) r = v1 - f2
#define SUB2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(sub_##SUF(TMP, f2)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_C_SIMD(do_vec_val_sub, SUB2NV_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_C_SIMD(do_vec_val_sub, SUB2NV_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define MUL2NV(r,v1,f1,f2) r = v1 * f2
//VKERN_TEMPL_2V_C(do_vec_val_mul, MUL2NV);
#define MUL2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(mul_##SUF(TMP, f2)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_C_SIMD(do_vec_val_mul, MUL2NV_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_C_SIMD(do_vec_val_mul, MUL2NV_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)


//#define ADD2RV(r,v1,f1,f2) r = f2 + v1
template <> inline void do_val_vec_add<double>(const unsigned long sz,
    double* RESTRICT const res, const double* RESTRICT const v1,
    LCTYPED(double) _f2)
{
    do_vec_val_add<double>(sz, res, v1, _f2);
}
template <> inline void do_val_vec_add<float>(const unsigned long sz,
    float* RESTRICT const res, const float* RESTRICT const v1,
    LCTYPED(float) _f2)
{
    do_vec_val_add<float>(sz, res, v1, _f2);
}

//#define SUB2RV(r,v1,f1,f2) r = f2 - v1
#define SUB2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(sub_##SUF(f2, TMP)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_C_SIMD(do_val_vec_sub, SUB2RV_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_C_SIMD(do_val_vec_sub, SUB2RV_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define MUL2RV(r,v1,f1,f2) r = f2 * v1
template <> inline void do_val_vec_mul<double>(const unsigned long sz,
    double* RESTRICT const res, const double* RESTRICT const v1,
    LCTYPED(double) _f2)
{
    do_vec_val_mul<double>(sz, res, v1, _f2);
}
template <> inline void do_val_vec_mul<float>(const unsigned long sz,
    float* RESTRICT const res, const float* RESTRICT const v1,
    LCTYPED(float) _f2)
{
    do_vec_val_mul<float>(sz, res, v1, _f2);
}

//#define DIV2RV(r,v1,f1,f2) r = f2 / v1
#define DIV2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(div_##SUF(f2, TMP)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_C_SIMD(do_val_vec_div, DIV2RV_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_C_SIMD(do_val_vec_div, DIV2RV_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

/* The one-vec operations don't need unaligned versions, as the loop
 * preamble will always be able to lead us to an aligned situation */
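/* A minimal sketch of that preamble logic (illustration only; the actual
 * loop setup lives in the VKERN_TEMPL_* macros): peel off scalar
 * iterations until r reaches vector alignment, e.g. for 32-byte vectors
 *
 *   size_t peel = (32 - ((uintptr_t)r & 31)) & 31; // bytes to alignment
 *   peel /= sizeof(*r);                            // ... in elements
 *   // handle min(peel, sz) elements with the sd/ss tail code,
 *   // then the aligned load_/store_ body is safe for r
 */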

//#define ADD1NV(r,f1,f2) r += f2
#define ADD1NV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(add_##SUF(TMP, f2)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_1V_C_SIMD(do_vec_add_val, ADD1NV_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_1V_C_SIMD(do_vec_add_val, ADD1NV_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define SUB1NV(r,f1,f2) r -= f2
#define SUB1NV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(sub_##SUF(TMP, f2)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_1V_C_SIMD(do_vec_sub_val, SUB1NV_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_1V_C_SIMD(do_vec_sub_val, SUB1NV_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define SUB1RV(r,f1,f2) r = f2 - r
#define SUB1RV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(sub_##SUF(f2, TMP)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_1V_C_SIMD(do_val_sub_vec, SUB1RV_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_1V_C_SIMD(do_val_sub_vec, SUB1RV_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define MUL1NV(r,f1,f2) r *= f2
#define MUL1NV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(mul_##SUF(TMP, f2)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_1V_C_SIMD(do_vec_mul_val, MUL1NV_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_1V_C_SIMD(do_vec_mul_val, MUL1NV_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define DIV1NV(r,f1,f2) r /= f2
#define DIV1NV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(div_##SUF(TMP, f2)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_1V_C_SIMD(do_vec_div_val, DIV1NV_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_1V_C_SIMD(do_vec_div_val, DIV1NV_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define DIV1RV(r,f1,f2) r = f2 / r
#define DIV1RV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(div_##SUF(f2, TMP)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_1V_C_SIMD(do_val_div_vec, DIV1RV_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_1V_C_SIMD(do_val_div_vec, DIV1RV_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define ADD1RV(r,f1,f2) r = f2 + r
template <> inline void do_val_add_vec<double>(const unsigned long sz,
    double* RESTRICT const res, LCTYPED(double) _f2)
{
    do_vec_add_val<double>(sz, res, _f2);
}
template <> inline void do_val_add_vec<float>(const unsigned long sz,
    float* RESTRICT const res, LCTYPED(float) _f2)
{
    do_vec_add_val<float>(sz, res, _f2);
}

// unused ...
//#define MUL1RV(r,f1,f2) r = f2 * r;



/* TSVector stuff */

//#define ADD2NS(r,v1,f1,f2) r += f2*v1
#if defined(__FMA__) && !defined(NO_FMA)
#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SIMD)
#warning Using FMA
#endif
#define _mm256_fmadd_sd _mm256_fmadd_pd
#define _mm256_fmadd_ss _mm256_fmadd_ps
#define _mm256_fmsub_sd _mm256_fmsub_pd
#define _mm256_fmsub_ss _mm256_fmsub_ps

#define _mm256_FMA(SUF,a,b,c,d) \
    d = _mm256_fmadd_##SUF(a,b,c)
#define _mm256_FMS(SUF,a,b,c,d) \
    d = _mm256_fmsub_##SUF(a,b,c)
#else
#define _mm256_FMA(SUF,a,b,c,d) \
    b = _mm256_mul_##SUF(a, b); \
    d = _mm256_add_##SUF(b, c)
#define _mm256_FMS(SUF,a,b,c,d) \
    b = _mm256_mul_##SUF(a, b); \
    d = _mm256_sub_##SUF(b, c)
#endif
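
/* Rounding note: a true FMA computes fl(a*b + c) with a single rounding,
 * whereas the mul+add fallback above rounds twice, fl(fl(a*b) + c), so the
 * two paths may differ in the last ulp. */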

#define ADD2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(load_##SUF(r)); \
    _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_C_SIMD(do_vec_add_svc, ADD2NS_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_C_SIMD(do_vec_add_svc, ADD2NS_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define SUB2NS(r,v1,f1,f2) r -= f2*v1
#define SUB2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(mul_##SUF(f2,LD)); \
    TMP = _MMAVX(sub_##SUF(TMP,LD)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc, SUB2NS_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc, SUB2NS_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define SUB2RS(r,v1,f1,f2) r = f2*v1 - r
#define SUB2RS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(load_##SUF(r)); \
    _MMAVX(FMS(SUF,f2,LD,TMP,LD)); \
    _MM_STORE(r, LD, SUF,)
VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc_inv, SUB2RS_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc_inv, SUB2RS_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define ADD3NS(r,v1,v2,f1,f2) r = v1 + f2*v2
#define ADD3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    LD = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_3V_C_SIMD(do_vec_svc_add, ADD3NS_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_3V_C_SIMD(do_vec_svc_add, ADD3NS_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define SUB3NS(r,v1,v2,f1,f2) r = v1 - f2*v2
#define SUB3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    LD = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(mul_##SUF(LD, f2)); \
    TMP = _MMAVX(sub_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_3V_C_SIMD(do_vec_svc_sub, SUB3NS_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_3V_C_SIMD(do_vec_svc_sub, SUB3NS_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)


//#define ADD3SN(r,v1,v2,f1,f2) r = f2*v1 + v2
#define ADD3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
    _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_3V_C_SIMD(do_svc_vec_add, ADD3SN_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_3V_C_SIMD(do_svc_vec_add, ADD3SN_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define SUB3SN(r,v1,v2,f1,f2) r = f2*v1 - v2
#define SUB3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
    _MMAVX(FMS(SUF,f2,LD,TMP,LD)); \
    _MM_STORE(r, LD, SUF,)
VKERN_TEMPL_3V_C_SIMD(do_svc_vec_sub, SUB3SN_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_3V_C_SIMD(do_svc_vec_sub, SUB3SN_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)


//#define ADD3SS(r,v1,v2,f1,f2) r = f1*v1 + f2*v2
#define ADD3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(mul_##SUF(f2,TMP)); \
    _MMAVX(FMA(SUF,f1,LD,TMP,LD)); \
    _MM_STORE(r, LD, SUF,)
VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_add, ADD3SS_SIMD, sd, pd,
    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_add, ADD3SS_SIMD, ss, ps,
    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS, float, __MAVXS)

//#define SUB3SS(r,v1,v2,f1,f2) r = f1*v1 - f2*v2
#define SUB3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(mul_##SUF(TMP, f2)); \
    _MMAVX(FMS(SUF,f1,LD,TMP,LD)); \
    _MM_STORE(r, LD, SUF,)
VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_sub, SUB3SS_SIMD, sd, pd,
    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_sub, SUB3SS_SIMD, ss, ps,
    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS, float, __MAVXS)


//#define ADD2SN(r,v1,f1,f2) r = f2*r + v1
#define ADD2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_C_SIMD(do_svc_add_vec, ADD2SN_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_C_SIMD(do_svc_add_vec, ADD2SN_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define SUB2SN(r,v1,f1,f2) r = f2*r - v1
#define SUB2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    _MMAVX(FMS(SUF,f2,LD,TMP,LD)); \
    _MM_STORE(r, LD, SUF,)
VKERN_TEMPL_2V_C_SIMD(do_svc_sub_vec, SUB2SN_SIMD, sd, pd,
    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_C_SIMD(do_svc_sub_vec, SUB2SN_SIMD, ss, ps,
    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS, float, __MAVXS)

//#define ADD2SS(r,v1,f1,f2) r = f1*r + f2*v1
#define ADD2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(mul_##SUF(TMP, f2)); \
    _MMAVX(FMA(SUF,f1,LD,TMP,LD)); \
    _MM_STORE(r, LD, SUF,)
VKERN_TEMPL_2V_CC_SIMD(do_svc_add_svc, ADD2SS_SIMD, sd, pd,
    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_CC_SIMD(do_svc_add_svc, ADD2SS_SIMD, ss, ps,
    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS, float, __MAVXS)

//#define SUB2SS(r,v1,f1,f2) r = f1*r - f2*v1
#define SUB2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(mul_##SUF(TMP, f2)); \
    _MMAVX(FMS(SUF,f1,LD,TMP,LD)); \
    _MM_STORE(r, LD, SUF,)
VKERN_TEMPL_2V_CC_SIMD(do_svc_sub_svc, SUB2SS_SIMD, sd, pd,
    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_CC_SIMD(do_svc_sub_svc, SUB2SS_SIMD, ss, ps,
    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS, float, __MAVXS)


//#define ADD2SV(r,v1,f1,f2) r = f1*v1 + f2
#define ADD2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    _MMAVX(FMA(SUF,f1,TMP,f2,TMP)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_CC_SIMD(do_svc_val_add, ADD2SV_SIMD, sd, pd,
    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_CC_SIMD(do_svc_val_add, ADD2SV_SIMD, ss, ps,
    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS, float, __MAVXS)

//#define SUB2SV(r,v1,f1,f2) r = f1*v1 - f2
#define SUB2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    _MMAVX(FMS(SUF,f1,TMP,f2,TMP)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_CC_SIMD(do_svc_val_sub, SUB2SV_SIMD, sd, pd,
    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_CC_SIMD(do_svc_val_sub, SUB2SV_SIMD, ss, ps,
    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS, float, __MAVXS)


//#define ADD1SV(r,f1,f2) r = f1*r + f2
#define ADD1SV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    _MMAVX(FMA(SUF,f1,TMP,f2,TMP)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_1V_CC_SIMD(do_svc_add_val, ADD1SV_SIMD, sd, pd,
    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_1V_CC_SIMD(do_svc_add_val, ADD1SV_SIMD, ss, ps,
    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS, float, __MAVXS)

//#define SUB1SV(r,f1,f2) r = f1*r - f2
#define SUB1SV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    _MMAVX(FMS(SUF,f1,TMP,f2,TMP)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_1V_CC_SIMD(do_svc_sub_val, SUB1SV_SIMD, sd, pd,
    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_1V_CC_SIMD(do_svc_sub_val, SUB1SV_SIMD, ss, ps,
    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS, float, __MAVXS)


//#define ADD2VS(r,v1,f1,f2) r = f1 + f2*v1
template <> inline void do_val_svc_add<double>(const unsigned long sz,
    double* RESTRICT const res, const double* RESTRICT const v1,
    LCTYPED(double) f1, LCTYPED(double) f2)
{
    do_svc_val_add<double>(sz, res, v1, f2, f1); // note the reverse order!
}
template <> inline void do_val_svc_add<float>(const unsigned long sz,
    float* RESTRICT const res, const float* RESTRICT const v1,
    LCTYPED(float) f1, LCTYPED(float) f2)
{
    do_svc_val_add<float>(sz, res, v1, f2, f1); // note the reverse order!
}

//#define SUB2VS(r,v1,f1,f2) r = f1 - f2*v1
#define SUB2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(mul_##SUF(TMP, f2)); \
    TMP = _MMAVX(sub_##SUF(f1, TMP)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_CC_SIMD(do_val_svc_sub, SUB2VS_SIMD, sd, pd,
    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_CC_SIMD(do_val_svc_sub, SUB2VS_SIMD, ss, ps,
    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS, float, __MAVXS)

//#define DIV2VS(r,v1,f1,f2) r = f1 / (f2*v1)
#define DIV2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(mul_##SUF(TMP, f2)); \
    TMP = _MMAVX(div_##SUF(f1, TMP)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_CC_SIMD(do_val_svc_div, DIV2VS_SIMD, sd, pd,
    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_CC_SIMD(do_val_svc_div, DIV2VS_SIMD, ss, ps,
    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS, float, __MAVXS)


/* ... */


/* For negation, use our knowledge of the position of the sign bits */
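/* Scalar illustration of the trick (not used by the kernels): flipping the
 * IEEE-754 sign bit negates a value, including 0.0, infinities and NaNs:
 *
 *   double neg(double x) {                // hypothetical helper
 *       uint64_t u; memcpy(&u, &x, sizeof u);
 *       u ^= 0x8000000000000000ULL;       // sign bit of a double
 *       memcpy(&x, &u, sizeof u);
 *       return x;
 *   }
 *
 * The kernels below do the same with one xor_pd/xor_ps per vector. */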
#ifdef AVX512
#ifdef HAVE_LONG_LONG
#define NEG_DOUBLE_PREP \
    static union _negmask { \
        unsigned LONG_LONG lng[8]; \
        double dbl[8]; \
        __m512d m512d; \
    } ALIGN(64) negmask = { {0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL}, }; \
    __m512d neg = _MMAVX(load_pd(negmask.dbl))
#else
#define NEG_DOUBLE_PREP \
    static union _negmask { \
        unsigned int lng[16]; \
        double dbl[8]; \
        __m512d m512d; \
    } ALIGN(64) negmask = { {0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U}, }; \
    __m512d neg = _MMAVX(load_pd(negmask.dbl))
#endif
#define NEG_FLOAT_PREP \
    static union _negmask { \
        unsigned int itg[16]; \
        float flt[16]; \
        __m512 m512s; \
    } ALIGN(64) negmask = { {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}, }; \
    __m512 neg = _MMAVX(load_ps(negmask.flt))
#else
#ifdef HAVE_LONG_LONG
#define NEG_DOUBLE_PREP \
    static union _negmask { \
        unsigned LONG_LONG lng[4]; \
        double dbl[4]; \
        __m256d m256d; \
    } ALIGN(32) negmask = { {0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL}, }; \
    __m256d neg = _MMAVX(load_pd(negmask.dbl))
#else
#define NEG_DOUBLE_PREP \
    static union _negmask { \
        unsigned int lng[8]; \
        double dbl[4]; \
        __m256d m256d; \
    } ALIGN(32) negmask = { {0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U}, }; \
    __m256d neg = _MMAVX(load_pd(negmask.dbl))
#endif
#define NEG_FLOAT_PREP \
    static union _negmask { \
        unsigned int itg[8]; \
        float flt[8]; \
        __m256 m256s; \
    } ALIGN(32) negmask = { {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}, }; \
    __m256 neg = _MMAVX(load_ps(negmask.flt))

#endif
/* Single-value xor operations don't exist, but we don't care ... */
#define _mm256_xor_sd _mm256_xor_pd
#define _mm256_xor_ss _mm256_xor_ps
#define _mm512_xor_sd _mm512_xor_pd
#define _mm512_xor_ss _mm512_xor_ps

//#define NEG2(r,v1,f1,f2) r = -v1
#define NEG2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(xor_##SUF(TMP, neg)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_2V_SIMD(do_vec_neg_vec, NEG2_SIMD, sd, pd,
    NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_SIMD(do_vec_neg_vec, NEG2_SIMD, ss, ps,
    NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS)

//#define NEG1(r,f1,f2) r = -r
#define NEG1_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(xor_##SUF(TMP, neg)); \
    _MM_STORE(r, TMP, SUF,)
VKERN_TEMPL_1V_SIMD(do_vec_neg, NEG1_SIMD, sd, pd,
    NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_1V_SIMD(do_vec_neg, NEG1_SIMD, ss, ps,
    NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS)


//#define COMP2(r,v1,f1,f2) if (r != v1) { ++f2; break; }
//VKERN_TEMPL_2V_T(do_vv_comp, COMP2, volatile long);
#define VL_PREP(x) long f2 = (x)
#define VL_FIN(x) x = f2
#define _mm256_movemask_sd(x) \
    _mm256_movemask_pd(x); rg &= 0x1
#define _mm256_movemask_ss(x) \
    _mm256_movemask_ps(x); rg &= 0x1
#define _mm512_movemask_sd(x) \
    _mm512_movemask_pd(x); rg &= 0x1
#define _mm512_movemask_ss(x) \
    _mm512_movemask_ps(x); rg &= 0x1
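
/* Caveat (our reading, untested): AVX-512 itself has no movemask
 * intrinsics -- its compares return __mmask8/__mmask16 values directly --
 * so the _mm512_movemask_* mappings above only work if such wrappers are
 * provided elsewhere; the comparison kernel below is effectively
 * AVX-only for now. */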


#define _mm256_cmpneq_pd(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_UQ)
#define _mm256_cmpneq_ps(a,b) _mm256_cmp_ps(a,b,_CMP_NEQ_UQ)
#define _mm256_cmpneq_sd _mm256_cmpneq_pd
#define _mm256_cmpneq_ss _mm256_cmpneq_ps
//#define _mm256_cmp_pd(a,b) _mm256_cmp_pd(a,b,_CMP_EQ_OQ)
//#define _mm256_cmp_ps(a,b) _mm256_cmp_ps(a,b,_CMP_EQ_OQ)
#define _mm256_cmp_sd _mm256_cmp_pd
#define _mm256_cmp_ss _mm256_cmp_ps


#define COMP2_SIMD(r,v1,f1,f2,SUF,UNA) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA##_##SUF(v1)); \
    TMP = _MMAVX(cmpneq_##SUF(TMP, LD)); \
    /* And now? movmskpd and bt? */ \
    rg = _MMAVX(movemask_##SUF(TMP)); \
    if (rg) { ++f2; /*fprintf(stderr, "DIFF @ %li: %i\n", sz-i, rg);*/ goto _fin; }
VKERN_TEMPL_2V_T_SIMD_VL(do_vv_comp, COMP2_SIMD, sd, pd,
    VL_PREP, SIMD_EMPTY0, VL_FIN,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_T_SIMD_VL(do_vv_comp, COMP2_SIMD, ss, ps,
    VL_PREP, SIMD_EMPTY0, VL_FIN,
    AVXMS, float, __MAVXS)


// Used in do_bdmat_vec_mult
#define DECL_DOUBLE __MAVXD TM2
#define DECL_FLOAT __MAVXS TM2
// Unaligned accesses for single loads are harmless ...
#define _mm_loadu_sd _mm_load_sd
#define _mm_loadu_ss _mm_load_ss

//#define SUMMULT3(r,v1,v2,f1,f2) r += v1*v2
#define SUMMULT3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(load##UNA2##_##SUF(v2)); \
    TM2 = _MMAVX(load_##SUF(r)); \
    _MMAVX(FMA(SUF,TMP,LD,TM2,TM2)); \
    _MM_STORE(r, TM2, SUF,)
#if 1 /* These are used in bdmat_vec_mul -- unaligned accesses are unavoidable */
VKERN_TEMPL_3V_SIMD_UA(do_add_vec_vec_mul, SUMMULT3_SIMD, sd, pd,
    DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD);
VKERN_TEMPL_3V_SIMD_UA(do_add_vec_vec_mul, SUMMULT3_SIMD, ss, ps,
    DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS);
#else
VKERN_TEMPL_3V_SIMD(do_add_vec_vec_mul, SUMMULT3_SIMD, sd, pd,
    DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_3V_SIMD(do_add_vec_vec_mul, SUMMULT3_SIMD, ss, ps,
    DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS, float, __MAVXS)
#endif

//#define SUMCMULT3(r,v1,v2,f1,f2) r += CPLX__ conj(v1)*v2
template <> inline void do_add_vec_vec_cmul<double>(const unsigned long sz,
    double* RESTRICT const r, const double* RESTRICT const v1,
    const double* RESTRICT const v2)
{
    do_add_vec_vec_mul<double>(sz, r, v1, v2);
}
template <> inline void do_add_vec_vec_cmul<float>(const unsigned long sz,
    float* RESTRICT const r, const float* RESTRICT const v1,
    const float* RESTRICT const v2)
{
    do_add_vec_vec_mul<float>(sz, r, v1, v2);
}


#ifndef TBCI_NO_SIMD_SUM

#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
# warning Info: Using unrolled AVX vector kernels for sums (reductions)
#endif

#define SUM_DOUBLE_PREP(x) REGISTER __MAVXD f2 = _MMAVX(set_sd(x))
#define SUM_FLOAT_PREP(x) REGISTER __MAVXS f2 = _MMAVX(set_ss(x))

#define XSUM_DOUBLE_PREP(x) \
    REGISTER __MAVXD f1 = _MMAVX(setzero_pd()); \
    REGISTER __MAVXD f2 = _MMAVX(set_sd(x))
#define XSUM_FLOAT_PREP(x) \
    REGISTER __MAVXS f1 = _MMAVX(setzero_ps()); \
    REGISTER __MAVXS f2 = _MMAVX(set_ss(x))

// FIXME: Do we need more horizontal summing for AVX512
#if 1 //def __SSE3__
#ifndef AVX512
# define SUM_DOUBLE_SIMD_FINX(f) \
    f = _mm256_hadd_pd(f, _mm256_permute2f128_pd(f, f, 0x33)); \
    f = _mm256_hadd_pd(f, f)

# define SUM_FLOAT_SIMD_FINX(f) \
    f = _mm256_hadd_ps(f, _mm256_permute2f128_ps(f, f, 0x33)); \
    f = _mm256_hadd_ps(f, f); \
    f = _mm256_hadd_ps(f, f)
#else
# warning horizontal sum AVX512 not yet implemented
#endif
#else // __SSE3__
# define SUM_DOUBLE_SIMD_FINX(f) \
    __MAVXD TM##f = f; \
    TM##f = _MMAVX(unpackhi_pd(TM##f, f)); \
    f = _MMAVX(add_sd(f, TM##f))
# define SUM_FLOAT_SIMD_FINX(f) \
    __MAVXS TM##f = f; \
    TM##f = _MMAVX(shuffle_ps(TM##f, f, 0xb1)); \
    f = _MMAVX(add_ps(f, TM##f)); \
    TM##f = f; \
    TM##f = _MMAVX(shuffle_ps(TM##f, f, 0x1b)); \
    f = _MMAVX(add_ss(f, TM##f))
# if defined(__GNUC__) && defined(WARN_SSE)
#  warning Not using SSE3 -- consider passing -msse3
# endif
#endif // __SSE3__
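
/* Lane trace of the 256-bit double reduction above (illustration):
 *   f                         = {a,   b,   c,   d  }
 *   permute2f128_pd(f,f,0x33) = {c,   d,   c,   d  }
 *   hadd_pd(f, perm)          = {a+b, c+d, c+d, c+d}
 *   hadd_pd(g, g)             = {a+b+c+d, ...}
 * so element 0 holds the full sum, which SUM_DOUBLE_FINAL then stores with
 * store_sd. */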

#define SUM_DOUBLE_SIMD_FIN SUM_DOUBLE_SIMD_FINX(f2)
#define SUM_FLOAT_SIMD_FIN SUM_FLOAT_SIMD_FINX(f2)

#define SUM_DOUBLE_FINAL(x) \
    _MMAVX(store_sd(&x, f2))
#define SUM_FLOAT_FINAL(x) \
    _MMAVX(store_ss(&x, f2))


/* Define missing intrinsics for full register copies */
#define _mm256_move_ps(f, x) x
#define _mm256_move_pd(f, x) x
#define _mm256_move_ss(x, f) (__extension__ (__m256 ) { ((__v8sf)f)[0],((__v8sf)x)[1],((__v8sf)x)[2],((__v8sf)x)[3], \
    ((__v8sf)x)[4],((__v8sf)x)[5],((__v8sf)x)[6],((__v8sf)x)[7] })
#define _mm256_move_sd(x, f) (__extension__ (__m256d) { ((__v4df)f)[0],((__v4df)x)[1],((__v4df)x)[2],((__v4df)x)[3] })


/* We don't need to save the upper values of f2 any more,
 * as the SISD (sd, ss) loop tails now preserve them */
#define XSUM_DOUBLE_SIMD_FIN_STORE \
    /*double hif1, hif2;*/ \
    /*_MMAVX(storeh_pd(&hif1, f1));*/ \
    /*_MMAVX(storeh_pd(&hif2, f2))*/ \
    do {} while(0)
#define XSUM_FLOAT_SIMD_FIN_STORE \
    /*float hif1[4], hif2[4];*/ \
    /*_MMAVX(store_ps(hif1, f1));*/ \
    /*_MMAVX(store_ps(hif2, f2))*/ \
    do {} while(0)

/* Do the horizontal sums and the final application of the correction */
/* FIXME: These may need adjustment for AVX/AVX-512 */
#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE(x) \
    /*f2 = _MMAVX(loadh_pd(f2, &hif2));*/ \
    /*f1 = _MMAVX(loadh_pd(f1, &hif1));*/ \
    SUM_DOUBLE_SIMD_FINX(f2); \
    SUM_DOUBLE_SIMD_FINX(f1); \
    f2 = _MMAVX(sub_sd(f2, f1)); \
    _MMAVX(store_sd(&x, f2))
#define XSUM_FLOAT_SIMD_FINAL_COMPLETE(x) \
    /*_MMAVX(store_ss(hif2, f2));*/ \
    /*_MMAVX(store_ss(hif1, f1));*/ \
    /*f2 = _MMAVX(load_ps(hif2));*/ \
    /*f1 = _MMAVX(load_ps(hif1));*/ \
    SUM_FLOAT_SIMD_FINX(f2); \
    SUM_FLOAT_SIMD_FINX(f1); \
    f2 = _MMAVX(sub_ss(f2, f1)); \
    _MMAVX(store_ss(&x, f2))

/* Variant with compensation for bits lost in hadd_pd */
// FIXME: This is not elegant ...
// ... and maybe not worth the trouble
#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X(x) \
    /*f2 = _MMAVX(loadh_pd(f2, &hif2));*/ \
    /*f1 = _MMAVX(loadh_pd(f1, &hif1));*/ \
    __MAVXD TMP = f2; \
    SUM_DOUBLE_SIMD_FINX(f2); \
    __MAVXD COR = f2; \
    COR = _MMAVX(sub_sd(COR, TMP)); \
    TMP = _MMAVX(permute2f128_pd(TMP, TMP, 0x10)); \
    COR = _MMAVX(sub_sd(COR, TMP)); \
    TMP = _MMAVX(unpackhi_pd(TMP, TMP)); \
    COR = _MMAVX(sub_sd(COR, TMP)); \
    TMP = _MMAVX(permute2f128_pd(TMP, TMP, 0x10)); \
    COR = _MMAVX(sub_sd(COR, TMP)); \
    COR = _MMAVX(set_sd(((__v4df)COR)[0])); \
    f1 = _MMAVX(add_sd(f1, COR)); \
    SUM_DOUBLE_SIMD_FINX(f1); \
    f2 = _MMAVX(sub_sd(f2, f1)); \
    _MMAVX(store_sd(&x, f2))

/* TODO: Variant for floats with compensation for bits lost in hadd_ps */

// #define MULT2(r,v1,f1,f2) f2 += r * v1
#define MULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    _MMAVX(FMA(SUF,TMP,LD,f2,f2))
VKERN_TEMPL_2V_T_SIMD(do_vec_mult_quick, MULT2_SIMD, sd, pd,
    SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_T_SIMD(do_vec_mult_quick, MULT2_SIMD, ss, ps,
    SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
    AVXMS, float, __MAVXS)

//#define XMULT2(r,v1,f1,f2) { T y = r * v1; T t = f2+y; f1 += (t-f2)-y; f2 = t; }
    /* TMP = r*v1;
     * LD = TMP;      // y = r*v1   // lower bits = v1 ?
     * TMP = TMP+f2;  // y+f2
     * t = TMP;       // t = y+f2
     * TMP = TMP-f2;  // t-f2
     * TMP = TMP-LD;  // (t-f2)-y
     * f1 = f1+TMP;   // f1 += (t-f2)-y
     * f2 = t;        // f2 += t;   // lower bits = unchanged ?
     */
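/* The same update, written out in scalar form (a TwoSum-style compensated
 * accumulation; f2 is the running sum, f1 collects the rounding errors):
 *
 *   double y = r * v1;       // product term
 *   double t = f2 + y;       // naively updated sum
 *   f1 += (t - f2) - y;      // the part of y lost in that addition
 *   f2  = t;
 *
 * The result is taken as f2 - f1 in XSUM_*_SIMD_FINAL_COMPLETE above. */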
//do_vec_mult_exact
#define XMULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(mul_##SUF(TMP, LD)); \
    LD = _MMAVX(move_##SUF(LD, TMP)); \
    /* LD = TMP; */ \
    TMP = _MMAVX(add_##SUF(TMP, f2)); \
    t = TMP; \
    TMP = _MMAVX(sub_##SUF(TMP, f2)); \
    TMP = _MMAVX(sub_##SUF(TMP, LD)); \
    f1 = _MMAVX(add_##SUF(f1, TMP)); \
    /* f2 = t */ \
    f2 = _MMAVX(move_##SUF(f2, t))
VKERN_TEMPL_2V_T_SIMD(do_vec_mult_exact, XMULT2_SIMD, sd, pd,
    XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
    XSUM_DOUBLE_SIMD_FINAL_COMPLETE/*_X*/,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_2V_T_SIMD(do_vec_mult_exact, XMULT2_SIMD, ss, ps,
    XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
    XSUM_FLOAT_SIMD_FINAL_COMPLETE,
    AVXMS, float, __MAVXS)


template <> inline void do_vec_dot_exact<double>(const unsigned long sz,
    const double * RESTRICT const _v1, const double * RESTRICT const _v2,
    double& _f2)
{
    do_vec_mult_exact<double>(sz, _v1, _v2, _f2);
}

template <> inline void do_vec_dot_quick<double>(const unsigned long sz,
    const double * RESTRICT const _v1, const double * RESTRICT const _v2,
    double& _f2)
{
    do_vec_mult_quick<double>(sz, _v1, _v2, _f2);
}

template <> inline void do_vec_dot_exact<float>(const unsigned long sz,
    const float * RESTRICT const _v1, const float * RESTRICT const _v2,
    float& _f2)
{
    do_vec_mult_exact<float>(sz, _v1, _v2, _f2);
}

template <> inline void do_vec_dot_quick<float>(const unsigned long sz,
    const float * RESTRICT const _v1, const float * RESTRICT const _v2,
    float& _f2)
{
    do_vec_mult_quick<float>(sz, _v1, _v2, _f2);
}

VKERN_TEMPL_2V_T(do_vec_mult_unaligned_exact, XMULT2, T)
VKERN_TEMPL_2V_T(do_vec_mult_unaligned_quick, MULT2, T)

// TODO: Implement do_vec_sumsqr_exact

// #define SQR1(r,f1,f2) f2 += r*r
#define SQR1_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    _MMAVX(FMA(SUF,TMP,TMP,f2,f2))

VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_quick, SQR1_SIMD, sd, pd,
    SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_quick, SQR1_SIMD, ss, ps,
    SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
    AVXMS, float, __MAVXS)

//do_vec_sumsqr_exact
#define XSQR1_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(mul_##SUF(TMP, TMP)); \
    y = TMP; \
    TMP = _MMAVX(add_##SUF(TMP, f2)); \
    t = TMP; \
    TMP = _MMAVX(sub_##SUF(TMP, f2)); \
    TMP = _MMAVX(sub_##SUF(TMP, y)); \
    f1 = _MMAVX(add_##SUF(f1, TMP)); \
    /* f2 = t */ \
    f2 = _MMAVX(move_##SUF(f2, t))
VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_exact, XSQR1_SIMD, sd, pd,
    XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
    XSUM_DOUBLE_SIMD_FINAL_COMPLETE/*_X*/,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_exact, XSQR1_SIMD, ss, ps,
    XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
    XSUM_FLOAT_SIMD_FINAL_COMPLETE,
    AVXMS, float, __MAVXS)


#ifndef TBCI_NO_SIMD_FABSSQR
template <> inline void do_vec_fabssqr_quick<double>(const unsigned long sz,
    const double * const _v1, double& _f2)
{
    double F2 = _f2;
    do_vec_sumsqr_quick<double>(sz, _v1, F2);
    _f2 = F2;
}
template <> inline void do_vec_fabssqr_exact<double>(const unsigned long sz,
    const double * const _v1, double& _f2)
{
    double F2 = _f2;
    do_vec_sumsqr_exact<double>(sz, _v1, F2);
    _f2 = F2;
}
#endif // TBCI_NO_SIMD_FABSSQR
#ifdef TBCI_SIMD_FABSSQR_FLOAT // The loss of precision with float is unbearable
template <> inline void do_vec_fabssqr_quick<float>(const unsigned long sz,
    const float * const _v1, double& _f2)
{
    float F2 = _f2;
    do_vec_sumsqr_quick<float>(sz, _v1, F2);
    _f2 = F2;
}
template <> inline void do_vec_fabssqr_exact<float>(const unsigned long sz,
    const float * const _v1, double& _f2)
{
    float F2 = _f2;
    do_vec_sumsqr_exact<float>(sz, _v1, F2);
    _f2 = F2;
}
#endif // TBCI_SIMD_FABSSQR_FLOAT

//#define SUM1(r,f1,f2) f2 += r
#define SUM1_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    f2 = _MMAVX(add_##SUF(f2, TMP))
VKERN_TEMPL_1V_T_SIMD(do_vec_sum_quick, SUM1_SIMD, sd, pd,
    SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_1V_T_SIMD(do_vec_sum_quick, SUM1_SIMD, ss, ps,
    SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
    AVXMS, float, __MAVXS)

//#define XSUM1(r,f1,f2) { T t = f2+r; f1 += (t-f2)-r; f2 = t; }
#define XSUM1_SIMD(r,f1,f2,SUF) \
    y = _MMAVX(load_##SUF(r)); \
    t = _MMAVX(add_##SUF(f2, y)); \
    TMP = _MMAVX(sub_##SUF(t, f2)); \
    TMP = _MMAVX(sub_##SUF(TMP, y)); \
    f1 = _MMAVX(add_##SUF(f1, TMP)); \
    /*f2 = _MMAVX(move_##SUF(f2, t))*/ \
    f2 = t
VKERN_TEMPL_1V_T_SIMD(do_vec_sum_exact, XSUM1_SIMD, sd, pd,
    XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
    XSUM_DOUBLE_SIMD_FINAL_COMPLETE/*_X*/,
    AVXMD, double, __MAVXD)
VKERN_TEMPL_1V_T_SIMD(do_vec_sum_exact, XSUM1_SIMD, ss, ps,
    XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
    XSUM_FLOAT_SIMD_FINAL_COMPLETE,
    AVXMS, float, __MAVXS)
#endif // TBCI_NO_SIMD_SUM


#endif // TBCI_SELECTIVE_INST

#endif // __AVX__

#endif // H_VEC_KERN_SPECIAL2_H