#ifndef H_VEC_KERN_SPECIAL2_H
#define H_VEC_KERN_SPECIAL2_H

#if defined(__AVX__) && defined(HAVE_IMMINTRIN_H) && defined(HAVE_WEAK_ATTR) && \
    ( defined(__x86_64__) || defined(__i386__) )

#include <immintrin.h>

#include "tbci/unroll_prefetch_simd_def.h"

#if 0 //defined(TBCI_SELECTIVE_INST) && !defined(TBCI_INSTANTIATE) && !defined(AUTO_DECL)
# include "vec_kern_special2_gd.h"

#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SIMD)
# warning Info: Using unrolled AVX vector kernels
# define __MAVXD __m512d
# define __MAVXS __m512
# define _MMAVX(x) _mm512_##x

# define __MAVXD __m256d
# define __MAVXS __m256
# define _MMAVX(x) _mm256_##x
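/*
 * Reading note (illustration only, not part of the kernels): __MAVXD/__MAVXS
 * name the widest available register type (__m512d/__m512 when the 512-bit
 * variants above are selected, __m256d/__m256 otherwise), and _MMAVX() pastes
 * the matching intrinsic prefix onto its argument.  With the 256-bit
 * definitions, for example,
 *
 *     _MMAVX(add_pd(a, b))   // expands to _mm256_add_pd(a, b)
 *     _MMAVX(set1_ps(x))     // expands to _mm256_set1_ps(x)
 *
 * so the kernel bodies below are written once and work for either width.
 */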
#define SIMD_EMPTY0 do {} while (0)
#define SIMD_EMPTY1(x) do {} while (0)
#define SIMD_EMPTY2(x,y) do {} while (0)

#define SIMD_CONST_DOUBLE_PREP(x) REGISTER __MAVXD f2 = _MMAVX(set1_pd(x))
#define SIMD_2CONST_DOUBLE_PREP(x,y) REGISTER __MAVXD f1 = _MMAVX(set1_pd(x)), f2 = _MMAVX(set1_pd(y))

#define SIMD_CONST_FLOAT_PREP(x) REGISTER __MAVXS f2 = _MMAVX(set1_ps(x))
#define SIMD_2CONST_FLOAT_PREP(x,y) REGISTER __MAVXS f1 = _MMAVX(set1_ps(x)), f2 = _MMAVX(set1_ps(y))
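/*
 * The *_PREP macros run once before the unrolled loop of a kernel:
 * set1_pd/set1_ps broadcast the scalar operand(s) into every lane of f2
 * (and f1 for the two-constant variants), so the "vector OP scalar" kernels
 * below can reuse the plain packed operations inside the loop.
 */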
#define _mm256_set_sd(d) (__extension__ (__m256d){ d, 0.0, 0.0, 0.0 })
#define _mm256_set_ss(f) (__extension__ (__m256 ){ f, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 })
#define _mm256_load_sd(m) (__extension__ (__m256d){ *m, 0.0, 0.0, 0.0 })
#define _mm256_loadu_sd(m) (__extension__ (__m256d_u){ *m, 0.0, 0.0, 0.0 })
#define _mm256_load_ss(m) (__extension__ (__m256 ){ *m, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 })
#define _mm256_loadu_ss(m) (__extension__ (__m256_u ){ *m, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 })
#define _mm256_store_sd(m, r) *m = ((__v4df)r)[0]
#define _mm256_store_ss(m, r) *m = ((__v8sf)r)[0]
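/*
 * AVX provides no 256-bit "scalar" (_sd/_ss) intrinsics, so the wrappers
 * above emulate them: the loads place the scalar in lane 0 and zero the
 * remaining lanes, and the stores write lane 0 back to memory.
 * Illustration:
 *
 *     double x = 3.0, y;
 *     __m256d v = _mm256_load_sd(&x);   // { 3.0, 0.0, 0.0, 0.0 }
 *     _mm256_store_sd(&y, v);           // y == 3.0
 */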
#if defined(__GNUC__) && __GNUC__ == 4 && \
    __GNUC_MINOR__ == 0 && \
    (! defined(__GNUC_PATCHLEVEL__) || __GNUC_PATCHLEVEL__ == 0)
# define _MM_STORE(mem, reg, SUF, UNA) \
    _MMAVX(store##UNA##_##SUF(mem, reg))

# define _MM_STORE(mem, reg, SUF, UNA) \
    _MMAVX(store##UNA##_##SUF(mem, reg))
#define COPY2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    _MM_STORE(r, TMP, SUF,)
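/*
 * Kernel-body convention (a reading guide, inferred from the uses below):
 * r is the destination pointer, v1/v2 are source pointers, f1/f2 carry the
 * scalar constants or accumulators set up by the *_PREP macros, SUF is the
 * intrinsic suffix (pd or ps), and UNA1/UNA2 select the unaligned load
 * variant ("u") or the aligned one (empty).  TMP and LD are vector
 * temporaries expected to be declared by the surrounding VKERN_TEMPL_*
 * instantiation templates.
 */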
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS)

#define FILL1_SIMD(r,f1,f2,SUF) \
    _MM_STORE(r, f2, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)
#define _mm256_add_sd _mm256_add_pd
#define _mm256_add_ss _mm256_add_ps
#define _mm256_sub_sd _mm256_sub_pd
#define _mm256_sub_ss _mm256_sub_ps

#define _mm256_mul_sd _mm256_mul_pd
#define _mm256_mul_ss _mm256_mul_ps
#define _mm256_div_sd _mm256_div_pd
#define _mm256_div_ss _mm256_div_ps

#define _mm512_add_sd _mm512_add_pd
#define _mm512_add_ss _mm512_add_ps
#define _mm512_sub_sd _mm512_sub_pd
#define _mm512_sub_ss _mm512_sub_ps

#define _mm512_mul_sd _mm512_mul_pd
#define _mm512_mul_ss _mm512_mul_ps
#define _mm512_div_sd _mm512_div_pd
#define _mm512_div_ss _mm512_div_ps
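/*
 * As with the loads and stores above, there are no 256/512-bit scalar
 * arithmetic intrinsics, so the _sd/_ss names are simply aliased to the
 * packed operations.  Because the scalar loads zero the upper lanes, doing
 * the arithmetic on all lanes is fine: only lane 0 of the result is stored
 * back.
 */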
#define ADD3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(add_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS)

#define SUB3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(sub_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS)

#define MUL3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(mul_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS)
template <> inline
void do_vec_vec_cmul<double>(const unsigned long sz, double *const res,
                             const double *const v1, const double *const v2)
{
    do_vec_vec_mul<double>(sz, res, v1, v2);
}

template <> inline
void do_vec_vec_cmul<float>(const unsigned long sz, float *const res,
                            const float *const v1, const float *const v2)
{
    do_vec_vec_mul<float>(sz, res, v1, v2);
}
#define DIV3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(div_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS)
template <> inline
void do_vec_vec_cdiv<double>(const unsigned long sz, double *const res,
                             const double *const v1, const double *const v2)
{
    do_vec_vec_div<double>(sz, res, v1, v2);
}

template <> inline
void do_vec_vec_cdiv<float>(const unsigned long sz, float *const res,
                            const float *const v1, const float *const v2)
{
    do_vec_vec_div<float>(sz, res, v1, v2);
}
#define ADD2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(add_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS)

#define SUB2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(sub_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS)

#define SUB2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(sub_##SUF(LD, TMP)); \
    _MM_STORE(r, LD, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS)

#define MUL2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(mul_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS)
template <> inline
void do_vec_cmul_vec<double>(const unsigned long sz, double *const res,
                             const double *const v1)
{
    do_vec_mul_vec<double>(sz, res, v1);
}

template <> inline
void do_vec_cmul_vec<float>(const unsigned long sz, float *const res,
                            const float *const v1)
{
    do_vec_mul_vec<float>(sz, res, v1);
}

template <> inline
void do_vec_cmul_vec_inv<double>(const unsigned long sz, double *const res,
                                 const double *const v1)
{
    do_vec_mul_vec<double>(sz, res, v1);
}

template <> inline
void do_vec_cmul_vec_inv<float>(const unsigned long sz, float *const res,
                                const float *const v1)
{
    do_vec_mul_vec<float>(sz, res, v1);
}
#define DIV2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(div_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS)

#define DIV2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(div_##SUF(LD, TMP)); \
    _MM_STORE(r, LD, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS)
template <> inline
void do_vec_cdiv_vec<double>(const unsigned long sz, double *const res,
                             const double *const v1)
{
    do_vec_div_vec<double>(sz, res, v1);
}

template <> inline
void do_vec_cdiv_vec<float>(const unsigned long sz, float *const res,
                            const float *const v1)
{
    do_vec_div_vec<float>(sz, res, v1);
}

template <> inline
void do_vec_cdiv_vec_inv<double>(const unsigned long sz, double *const res,
                                 const double *const v1)
{
    do_vec_div_vec_inv<double>(sz, res, v1);
}

template <> inline
void do_vec_cdiv_vec_inv<float>(const unsigned long sz, float *const res,
                                const float *const v1)
{
    do_vec_div_vec_inv<float>(sz, res, v1);
}
#define ADD2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(add_##SUF(TMP, f2)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define SUB2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(sub_##SUF(TMP, f2)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define MUL2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(mul_##SUF(TMP, f2)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)
template <> inline
void do_val_vec_add<double>(const unsigned long sz, double *const res,
                            const double *const v1,
                            tbci_traits<double>::loop_const_refval_type _f2)
{
    do_vec_val_add<double>(sz, res, v1, _f2);
}

template <> inline
void do_val_vec_add<float>(const unsigned long sz, float *const res,
                           const float *const v1,
                           tbci_traits<float>::loop_const_refval_type _f2)
{
    do_vec_val_add<float>(sz, res, v1, _f2);
}
#define SUB2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(sub_##SUF(f2, TMP)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)
template <> inline
void do_val_vec_mul<double>(const unsigned long sz, double *const res,
                            const double *const v1,
                            tbci_traits<double>::loop_const_refval_type _f2)
{
    do_vec_val_mul<double>(sz, res, v1, _f2);
}

template <> inline
void do_val_vec_mul<float>(const unsigned long sz, float *const res,
                           const float *const v1,
                           tbci_traits<float>::loop_const_refval_type _f2)
{
    do_vec_val_mul<float>(sz, res, v1, _f2);
}
#define DIV2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(div_##SUF(f2, TMP)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define ADD1NV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(add_##SUF(TMP, f2)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define SUB1NV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(sub_##SUF(TMP, f2)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define SUB1RV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(sub_##SUF(f2, TMP)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define MUL1NV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(mul_##SUF(TMP, f2)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define DIV1NV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(div_##SUF(TMP, f2)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define DIV1RV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(div_##SUF(f2, TMP)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)
template <> inline
void do_val_add_vec<double>(const unsigned long sz, double *const res,
                            tbci_traits<double>::loop_const_refval_type _f2)
{
    do_vec_add_val<double>(sz, res, _f2);
}

template <> inline
void do_val_add_vec<float>(const unsigned long sz, float *const res,
                           tbci_traits<float>::loop_const_refval_type _f2)
{
    do_vec_add_val<float>(sz, res, _f2);
}
#if defined( __FMA__) && !defined(NO_FMA)
#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SIMD)

#define _mm256_fmadd_sd _mm256_fmadd_pd
#define _mm256_fmadd_ss _mm256_fmadd_ps
#define _mm256_fmsub_sd _mm256_fmsub_pd
#define _mm256_fmsub_ss _mm256_fmsub_ps

#define _mm256_FMA(SUF,a,b,c,d) \
    d = _mm256_fmadd_##SUF(a,b,c)
#define _mm256_FMS(SUF,a,b,c,d) \
    d = _mm256_fmsub_##SUF(a,b,c)

#define _mm256_FMA(SUF,a,b,c,d) \
    b = _mm256_mul_##SUF(a, b); \
    d = _mm256_add_##SUF(b, c)
#define _mm256_FMS(SUF,a,b,c,d) \
    b = _mm256_mul_##SUF(a, b); \
    d = _mm256_sub_##SUF(b, c)
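/*
 * _mm256_FMA(SUF,a,b,c,d) computes d = a*b + c and _mm256_FMS computes
 * d = a*b - c.  With real FMA instructions the product is not rounded before
 * the addition; the fallback above emulates the operation with a separate
 * multiply and add/subtract (two roundings) and clobbers b in the process.
 * Illustrative expansion of the fallback:
 *
 *     _mm256_FMA(pd, x, y, acc, acc);
 *     // y   = _mm256_mul_pd(x, y);
 *     // acc = _mm256_add_pd(y, acc);
 */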
#define ADD2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(load_##SUF(r)); \
    _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define SUB2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(mul_##SUF(f2,LD)); \
    TMP = _MMAVX(sub_##SUF(TMP,LD)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define SUB2RS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(load_##SUF(r)); \
    _MMAVX(FMS(SUF,f2,LD,TMP,LD)); \
    _MM_STORE(r, LD, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)
#define ADD3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
    LD = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define SUB3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
    LD = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(mul_##SUF(LD, f2)); \
    TMP = _MMAVX(sub_##SUF(TMP, LD)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define ADD3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
    _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define SUB3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
    _MMAVX(FMS(SUF,f2,LD,TMP,LD)); \
    _MM_STORE(r, LD, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define ADD3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(mul_##SUF(f2,TMP)); \
    _MMAVX(FMA(SUF,f1,LD,TMP,LD)); \
    _MM_STORE(r, LD, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD,
    double, __MAVXD)

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS,
    float, __MAVXS)

#define SUB3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
    TMP = _MMAVX(mul_##SUF(TMP, f2)); \
    _MMAVX(FMS(SUF,f1,LD,TMP,LD)); \
    _MM_STORE(r, LD, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD,
    double, __MAVXD)

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS,
    float, __MAVXS)
#define ADD2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define SUB2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    _MMAVX(FMS(SUF,f2,LD,TMP,LD)); \
    _MM_STORE(r, LD, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMD,
    double, __MAVXD)

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
    AVXMS,
    float, __MAVXS)

#define ADD2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(mul_##SUF(TMP, f2)); \
    _MMAVX(FMA(SUF,f1,LD,TMP,LD)); \
    _MM_STORE(r, LD, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD,
    double, __MAVXD)

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS,
    float, __MAVXS)

#define SUB2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(mul_##SUF(TMP, f2)); \
    _MMAVX(FMS(SUF,f1,LD,TMP,LD)); \
    _MM_STORE(r, LD, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD,
    double, __MAVXD)

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS,
    float, __MAVXS)

#define ADD2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    _MMAVX(FMA(SUF,f1,TMP,f2,TMP)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD,
    double, __MAVXD)

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS,
    float, __MAVXS)

#define SUB2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    _MMAVX(FMS(SUF,f1,TMP,f2,TMP)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD,
    double, __MAVXD)

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS,
    float, __MAVXS)

#define ADD1SV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    _MMAVX(FMA(SUF,f1,TMP,f2,TMP)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD,
    double, __MAVXD)

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS,
    float, __MAVXS)

#define SUB1SV_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    _MMAVX(FMS(SUF,f1,TMP,f2,TMP)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD,
    double, __MAVXD)

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS,
    float, __MAVXS)
template <> inline
void do_val_svc_add<double>(const unsigned long sz, double *const res,
                            const double *const v1,
                            tbci_traits<double>::loop_const_refval_type f1,
                            tbci_traits<double>::loop_const_refval_type f2)
{
    do_svc_val_add<double>(sz, res, v1, f2, f1);
}

template <> inline
void do_val_svc_add<float>(const unsigned long sz, float *const res,
                           const float *const v1,
                           tbci_traits<float>::loop_const_refval_type f1,
                           tbci_traits<float>::loop_const_refval_type f2)
{
    do_svc_val_add<float>(sz, res, v1, f2, f1);
}
#define SUB2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(mul_##SUF(TMP, f2)); \
    TMP = _MMAVX(sub_##SUF(f1, TMP)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD,
    double, __MAVXD)

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS,
    float, __MAVXS)

#define DIV2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(mul_##SUF(TMP, f2)); \
    TMP = _MMAVX(div_##SUF(f1, TMP)); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMD,
    double, __MAVXD)

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
    AVXMS,
    float, __MAVXS)
#ifdef HAVE_LONG_LONG
#define NEG_DOUBLE_PREP \
    static union _negmask { \
        unsigned LONG_LONG lng[8]; \
        double dbl[8]; \
    } ALIGN(64) negmask = { {0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL}, }; \
    __m512d neg = _MMAVX(load_pd(negmask.dbl))

#define NEG_DOUBLE_PREP \
    static union _negmask { \
        unsigned int lng[16]; \
        double dbl[8]; \
    } ALIGN(64) negmask = { {0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U}, }; \
    __m512d neg = _MMAVX(load_pd(negmask.dbl))

#define NEG_FLOAT_PREP \
    static union _negmask { \
        unsigned int itg[16]; \
        float flt[16]; \
    } ALIGN(64) negmask = { {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}, }; \
    __m512 neg = _MMAVX(load_ps(negmask.flt))

#ifdef HAVE_LONG_LONG
#define NEG_DOUBLE_PREP \
    static union _negmask { \
        unsigned LONG_LONG lng[4]; \
        double dbl[4]; \
    } ALIGN(32) negmask = { {0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL}, }; \
    __m256d neg = _MMAVX(load_pd(negmask.dbl))

#define NEG_DOUBLE_PREP \
    static union _negmask { \
        unsigned int lng[8]; \
        double dbl[4]; \
    } ALIGN(32) negmask = { {0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U}, }; \
    __m256d neg = _MMAVX(load_pd(negmask.dbl))

#define NEG_FLOAT_PREP \
    static union _negmask { \
        unsigned int itg[8]; \
        float flt[8]; \
    } ALIGN(32) negmask = { {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}, }; \
    __m256 neg = _MMAVX(load_ps(negmask.flt))
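/*
 * NEG_*_PREP loads a constant whose only set bit in every lane is the
 * IEEE-754 sign bit (0x8000000000000000 for double, 0x80000000 for float).
 * XOR-ing a value with this mask flips the sign of every lane, which is how
 * the NEG1/NEG2 kernels below negate a vector without an arithmetic
 * subtraction, e.g. xor_pd({1.5, -2.0, ...}, neg) == {-1.5, 2.0, ...}.
 */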
#define _mm256_xor_sd _mm256_xor_pd
#define _mm256_xor_ss _mm256_xor_ps
#define _mm512_xor_sd _mm512_xor_pd
#define _mm512_xor_ss _mm512_xor_ps

#define NEG2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(xor_##SUF(TMP, neg)); \
    _MM_STORE(r, TMP, SUF,)

    NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD)

    NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS)

#define NEG1_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(xor_##SUF(TMP, neg)); \
    _MM_STORE(r, TMP, SUF,)

    NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD)

    NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS)
#define VL_PREP(x) long f2 = (x)
#define VL_FIN(x) x = f2
#define _mm256_movemask_sd(x) \
    _mm256_movemask_pd(x); rg &= 0x1
#define _mm256_movemask_ss(x) \
    _mm256_movemask_ps(x); rg &= 0x1
#define _mm512_movemask_sd(x) \
    _mm512_movemask_pd(x); rg &= 0x1
#define _mm512_movemask_ss(x) \
    _mm512_movemask_ps(x); rg &= 0x1

#define _mm256_cmpneq_pd(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_UQ)
#define _mm256_cmpneq_ps(a,b) _mm256_cmp_ps(a,b,_CMP_NEQ_UQ)
#define _mm256_cmpneq_sd _mm256_cmpneq_pd
#define _mm256_cmpneq_ss _mm256_cmpneq_ps

#define _mm256_cmp_sd _mm256_cmp_pd
#define _mm256_cmp_ss _mm256_cmp_ps

#define COMP2_SIMD(r,v1,f1,f2,SUF,UNA) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA##_##SUF(v1)); \
    TMP = _MMAVX(cmpneq_##SUF(TMP, LD)); \
    rg = _MMAVX(movemask_##SUF(TMP)); \
    if (rg) { ++f2; goto _fin; }
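/*
 * COMP2_SIMD compares one vector-width chunk of r and v1: cmpneq sets a lane
 * to all-one bits where the elements differ, and movemask collects the lane
 * sign bits into an integer, so rg != 0 means "at least one element differs".
 * f2 (set up by VL_PREP and written back by VL_FIN) is bumped and the kernel
 * jumps to the _fin label provided by the surrounding comparison routine.
 * The _sd/_ss movemask wrappers above additionally mask everything but lane 0
 * for the scalar loop tail.
 */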
    VL_PREP, SIMD_EMPTY0, VL_FIN,
    AVXMD,
    double, __MAVXD)

    VL_PREP, SIMD_EMPTY0, VL_FIN,
    AVXMS,
    float, __MAVXS)

#define DECL_DOUBLE __MAVXD TM2
#define DECL_FLOAT __MAVXS TM2

#define _mm_loadu_sd _mm_load_sd
#define _mm_loadu_ss _mm_load_ss

#define SUMMULT3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
    LD = _MMAVX(load##UNA2##_##SUF(v2)); \
    TM2 = _MMAVX(load_##SUF(r)); \
    _MMAVX(FMA(SUF,TMP,LD,TM2,TM2)); \
    _MM_STORE(r, TM2, SUF,)

    DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD);

    DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS);

    DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMD,
    double, __MAVXD)

    DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
    AVXMS,
    float, __MAVXS)
template <> inline
void do_add_vec_vec_cmul<double>(const unsigned long sz, double *const r,
                                 const double *const v1, const double *const v2)
{
    do_add_vec_vec_mul<double>(sz, r, v1, v2);
}

template <> inline
void do_add_vec_vec_cmul<float>(const unsigned long sz, float *const r,
                                const float *const v1, const float *const v2)
{
    do_add_vec_vec_mul<float>(sz, r, v1, v2);
}
#ifndef TBCI_NO_SIMD_SUM

#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
# warning Info: Using unrolled AVX vector kernels for sums (reductions)

#define SUM_DOUBLE_PREP(x) REGISTER __MAVXD f2 = _MMAVX(set_sd(x))
#define SUM_FLOAT_PREP(x) REGISTER __MAVXS f2 = _MMAVX(set_ss(x))

#define XSUM_DOUBLE_PREP(x) \
    REGISTER __MAVXD f1 = _MMAVX(setzero_pd()); \
    REGISTER __MAVXD f2 = _MMAVX(set_sd(x))
#define XSUM_FLOAT_PREP(x) \
    REGISTER __MAVXS f1 = _MMAVX(setzero_ps()); \
    REGISTER __MAVXS f2 = _MMAVX(set_ss(x))
#if 1 //def __SSE3__

# define SUM_DOUBLE_SIMD_FINX(f) \
    f = _mm256_hadd_pd(f, _mm256_permute2f128_pd(f, f, 0x33)); \
    f = _mm256_hadd_pd(f, f)

# define SUM_FLOAT_SIMD_FINX(f) \
    f = _mm256_hadd_ps(f, _mm256_permute2f128_ps(f, f, 0x33)); \
    f = _mm256_hadd_ps(f, f); \
    f = _mm256_hadd_ps(f, f)
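/*
 * Worked example for the double variant: with f = {a0, a1, a2, a3},
 * _mm256_permute2f128_pd(f, f, 0x33) copies the upper 128-bit half into both
 * halves, giving {a2, a3, a2, a3}.  The first hadd then yields
 * {a0+a1, a2+a3, ...} and the second hadd leaves a0+a1+a2+a3 in lane 0, which
 * SUM_DOUBLE_FINAL below stores with the emulated _mm256_store_sd.  The float
 * variant needs one extra hadd for its eight lanes.
 */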
# warning horizontal sum AVX512 not yet implemented

# define SUM_DOUBLE_SIMD_FINX(f) \
    __MAVXD TM##f = f; \
    TM##f = _MMAVX(unpackhi_pd(TM##f, f)); \
    f = _MMAVX(add_sd(f, TM##f))
# define SUM_FLOAT_SIMD_FINX(f) \
    __MAVXS TM##f = f; \
    TM##f = _MMAVX(shuffle_ps(TM##f, f, 0xb1)); \
    f = _MMAVX(add_ps(f, TM##f)); \
    TM##f = _MMAVX(shuffle_ps(TM##f, f, 0x1b)); \
    f = _MMAVX(add_ss(f, TM##f))
# if defined(__GNUC__) && defined(WARN_SSE)
# warning Not using SSE3 -- consider passing -msse3

#define SUM_DOUBLE_SIMD_FIN SUM_DOUBLE_SIMD_FINX(f2)
#define SUM_FLOAT_SIMD_FIN SUM_FLOAT_SIMD_FINX(f2)

#define SUM_DOUBLE_FINAL(x) \
    _MMAVX(store_sd(&x, f2))
#define SUM_FLOAT_FINAL(x) \
    _MMAVX(store_ss(&x, f2))
#define _mm256_move_ps(f, x) x
#define _mm256_move_pd(f, x) x
#define _mm256_move_ss(x, f) (__extension__ (__m256 ) { ((__v8sf)f)[0],((__v8sf)x)[1],((__v8sf)x)[2],((__v8sf)x)[3], \
                                                        ((__v8sf)x)[4],((__v8sf)x)[5],((__v8sf)x)[6],((__v8sf)x)[7] })
#define _mm256_move_sd(x, f) (__extension__ (__m256d) { ((__v4df)f)[0],((__v4df)x)[1],((__v4df)x)[2],((__v4df)x)[3] })
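/*
 * _mm256_move_sd/_ss mimic the SSE move semantics: lane 0 is taken from the
 * second operand, the remaining lanes from the first.  The packed
 * _mm256_move_pd/_ps used by the full-width kernels simply return their
 * second argument, so "f2 = _MMAVX(move_##SUF(f2, t))" in the compensated
 * kernels below is effectively f2 = t, while the scalar loop tail only
 * updates lane 0.
 */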
#define XSUM_DOUBLE_SIMD_FIN_STORE \

#define XSUM_FLOAT_SIMD_FIN_STORE \

#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE(x) \
    SUM_DOUBLE_SIMD_FINX(f2); \
    SUM_DOUBLE_SIMD_FINX(f1); \
    f2 = _MMAVX(sub_sd(f2, f1)); \
    _MMAVX(store_sd(&x, f2))
#define XSUM_FLOAT_SIMD_FINAL_COMPLETE(x) \
    SUM_FLOAT_SIMD_FINX(f2); \
    SUM_FLOAT_SIMD_FINX(f1); \
    f2 = _MMAVX(sub_ss(f2, f1)); \
    _MMAVX(store_ss(&x, f2))
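/*
 * The X* ("exact") reductions keep two accumulators: f2 holds the running
 * sums and f1 collects the rounding error of every addition; the final
 * result is f2 - f1 after both have been reduced horizontally.  A scalar
 * sketch of the same compensation scheme (illustration only, hypothetical
 * names):
 *
 *     double sum = 0.0, err = 0.0;
 *     for (unsigned long i = 0; i < sz; ++i) {
 *         const double y = v[i];
 *         const double t = sum + y;
 *         err += (t - sum) - y;   // rounding error introduced by this addition
 *         sum = t;
 *     }
 *     const double result = sum - err;
 */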
#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X(x) \
    SUM_DOUBLE_SIMD_FINX(f2); \
    COR = _MMAVX(sub_sd(COR, TMP)); \
    TMP = _MMAVX(permute2f128_pd(TMP, TMP, 0x10)); \
    COR = _MMAVX(sub_sd(COR, TMP)); \
    TMP = _MMAVX(unpackhi_pd(TMP, TMP)); \
    COR = _MMAVX(sub_sd(COR, TMP)); \
    TMP = _MMAVX(permute2f128_pd(TMP, TMP, 0x10)); \
    COR = _MMAVX(sub_sd(COR, TMP)); \
    COR = _MMAVX(set_sd(((__v4df)COR)[0])); \
    f1 = _MMAVX(add_sd(f1, COR)); \
    SUM_DOUBLE_SIMD_FINX(f1); \
    f2 = _MMAVX(sub_sd(f2, f1)); \
    _MMAVX(store_sd(&x, f2))
#define MULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    _MMAVX(FMA(SUF,TMP,LD,f2,f2))

    SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
    AVXMD,
    double, __MAVXD)

    SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
    AVXMS,
    float, __MAVXS)
#define XMULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _MMAVX(load_##SUF(r)); \
    LD = _MMAVX(load##UNA1##_##SUF(v1)); \
    TMP = _MMAVX(mul_##SUF(TMP, LD)); \
    LD = _MMAVX(move_##SUF(LD, TMP)); \
    TMP = _MMAVX(add_##SUF(TMP, f2)); \
    t = _MMAVX(move_##SUF(t, TMP)); \
    TMP = _MMAVX(sub_##SUF(TMP, f2)); \
    TMP = _MMAVX(sub_##SUF(TMP, LD)); \
    f1 = _MMAVX(add_##SUF(f1, TMP)); \
    f2 = _MMAVX(move_##SUF(f2, t))
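/*
 * XMULT2_SIMD is the compensated form of MULT2_SIMD: the product r[i]*v1[i]
 * is added to the running sums in f2 while the rounding error of that
 * addition, (new_sum - old_sum) - product, is accumulated in f1, as in the
 * scalar sketch above.  The t temporary is presumably declared by the
 * instantiating template, like TMP and LD.
 */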
    XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
    XSUM_DOUBLE_SIMD_FINAL_COMPLETE,
    AVXMD,
    double, __MAVXD)

    XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
    XSUM_FLOAT_SIMD_FINAL_COMPLETE,
    AVXMS,
    float, __MAVXS)
template <> inline
void do_vec_dot_exact<double>(const unsigned long sz, const double *const _v1,
                              const double *const _v2, double& _f2)
{
    do_vec_mult_exact<double>(sz, _v1, _v2, _f2);
}

template <> inline
void do_vec_dot_quick<double>(const unsigned long sz, const double *const _v1,
                              const double *const _v2, double& _f2)
{
    do_vec_mult_quick<double>(sz, _v1, _v2, _f2);
}

template <> inline
void do_vec_dot_exact<float>(const unsigned long sz, const float *const _v1,
                             const float *const _v2, float& _f2)
{
    do_vec_mult_exact<float>(sz, _v1, _v2, _f2);
}

template <> inline
void do_vec_dot_quick<float>(const unsigned long sz, const float *const _v1,
                             const float *const _v2, float& _f2)
{
    do_vec_mult_quick<float>(sz, _v1, _v2, _f2);
}
#define SQR1_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    _MMAVX(FMA(SUF,TMP,TMP,f2,f2))

    SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
    AVXMD,
    double, __MAVXD)

    SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
    AVXMS,
    float, __MAVXS)
#define XSQR1_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    TMP = _MMAVX(mul_##SUF(TMP, TMP)); \
    y = _MMAVX(move_##SUF(y, TMP)); \
    TMP = _MMAVX(add_##SUF(TMP, f2)); \
    t = _MMAVX(move_##SUF(t, TMP)); \
    TMP = _MMAVX(sub_##SUF(TMP, f2)); \
    TMP = _MMAVX(sub_##SUF(TMP, y)); \
    f1 = _MMAVX(add_##SUF(f1, TMP)); \
    f2 = _MMAVX(move_##SUF(f2, t))

    XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
    XSUM_DOUBLE_SIMD_FINAL_COMPLETE,
    AVXMD,
    double, __MAVXD)

    XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
    XSUM_FLOAT_SIMD_FINAL_COMPLETE,
    AVXMS,
    float, __MAVXS)
#ifndef TBCI_NO_SIMD_FABSSQR
template <> inline
void do_vec_fabssqr_quick<double>(const unsigned long sz,
                                  const double *const _v1, double& _f2)
    do_vec_sumsqr_quick<double>(sz, _v1, F2);

template <> inline
void do_vec_fabssqr_exact<double>(const unsigned long sz,
                                  const double *const _v1, double& _f2)
    do_vec_sumsqr_exact<double>(sz, _v1, F2);

#endif // TBCI_NO_SIMD_FABSSQR
#ifdef TBCI_SIMD_FABSSQR_FLOAT // The loss of precision with float is unbearable
template <> inline
void do_vec_fabssqr_quick<float>(const unsigned long sz,
                                 const float *const _v1, double& _f2)
    do_vec_sumsqr_quick<float>(sz, _v1, F2);

template <> inline
void do_vec_fabssqr_exact<float>(const unsigned long sz,
                                 const float *const _v1, double& _f2)
    do_vec_sumsqr_exact<float>(sz, _v1, F2);

#endif // TBCI_SIMD_FABSSQR_FLOAT
#define SUM1_SIMD(r,f1,f2,SUF) \
    TMP = _MMAVX(load_##SUF(r)); \
    f2 = _MMAVX(add_##SUF(f2, TMP))

    SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
    AVXMD,
    double, __MAVXD)

    SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
    AVXMS,
    float, __MAVXS)
#define XSUM1_SIMD(r,f1,f2,SUF) \
    y = _MMAVX(load_##SUF(r)); \
    t = _MMAVX(add_##SUF(f2, y)); \
    TMP = _MMAVX(sub_##SUF(t, f2)); \
    TMP = _MMAVX(sub_##SUF(TMP, y)); \
    f1 = _MMAVX(add_##SUF(f1, TMP)); \
    f2 = _MMAVX(move_##SUF(f2, t))

    XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
    XSUM_DOUBLE_SIMD_FINAL_COMPLETE,
    AVXMD,
    double, __MAVXD)

    XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
    XSUM_FLOAT_SIMD_FINAL_COMPLETE,
    AVXMS,
    float, __MAVXS)
#endif // TBCI_NO_SIMD_SUM

#endif // TBCI_SELECTIVE_INST

#endif // H_VEC_KERN_SPECIAL2_H