#ifndef H_VEC_KERN_SPECIAL2_H
#define H_VEC_KERN_SPECIAL2_H

#if defined(__AVX__) && defined(HAVE_IMMINTRIN_H) && defined(HAVE_WEAK_ATTR) && \
    ( defined(__x86_64__) || defined(__i386__) )

#include <immintrin.h>

#include "tbci/unroll_prefetch_simd_def.h"

#if 0 //defined(TBCI_SELECTIVE_INST) && !defined(TBCI_INSTANTIATE) && !defined(AUTO_DECL)
# include "vec_kern_special2_gd.h"
#else

#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SIMD)
# warning Info: Using unrolled AVX vector kernels
#endif

#if defined(__AVX512F__) /* assumed guard; the original condition was lost */
# define __MAVXD __m512d
# define __MAVXS __m512
# define _MMAVX(x) _mm512_##x
#else
# define __MAVXD __m256d
# define __MAVXS __m256
# define _MMAVX(x) _mm256_##x
#endif

#define SIMD_EMPTY0 do {} while (0)
#define SIMD_EMPTY1(x) do {} while (0)
#define SIMD_EMPTY2(x,y) do {} while (0)

#define SIMD_CONST_DOUBLE_PREP(x) REGISTER __MAVXD f2 = _MMAVX(set1_pd(x))
#define SIMD_2CONST_DOUBLE_PREP(x,y) REGISTER __MAVXD f1 = _MMAVX(set1_pd(x)), f2 = _MMAVX(set1_pd(y))
#define SIMD_CONST_FLOAT_PREP(x) REGISTER __MAVXS f2 = _MMAVX(set1_ps(x))
#define SIMD_2CONST_FLOAT_PREP(x,y) REGISTER __MAVXS f1 = _MMAVX(set1_ps(x)), f2 = _MMAVX(set1_ps(y))

#define _mm256_set_sd(d) (__extension__ (__m256d){ d, 0.0, 0.0, 0.0 })
#define _mm256_set_ss(f) (__extension__ (__m256 ){ f, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 })
#define _mm256_load_sd(m) (__extension__ (__m256d){ *m, 0.0, 0.0, 0.0 })
#define _mm256_loadu_sd(m) (__extension__ (__m256d_u){ *m, 0.0, 0.0, 0.0 })
#define _mm256_load_ss(m) (__extension__ (__m256 ){ *m, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 })
#define _mm256_loadu_ss(m) (__extension__ (__m256_u ){ *m, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 })
#define _mm256_store_sd(m, r) *m = ((__v4df)r)[0]
#define _mm256_store_ss(m, r) *m = ((__v8sf)r)[0]

#if defined(__GNUC__) && __GNUC__ == 4 && \
    __GNUC_MINOR__ == 0 && \
    (! defined(__GNUC_PATCHLEVEL__) || __GNUC_PATCHLEVEL__ == 0)
# define _MM_STORE(mem, reg, SUF, UNA) \
        _MMAVX(store##UNA##_##SUF(mem, reg))
#else
# define _MM_STORE(mem, reg, SUF, UNA) \
        _MMAVX(store##UNA##_##SUF(mem, reg))
#endif

#define COPY2_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD)
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS)
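/* A minimal sketch of what the unroll templates generate from COPY2_SIMD for
   double, assuming 4-wide __m256d, unaligned loads and a scalar tail; the
   real kernels add unrolling and prefetching.  The name is illustrative. */
#if 0
#include <immintrin.h>

static void copy_kernel_sketch(unsigned long sz, double *res, const double *v1)
{
    unsigned long i = 0;
    for (; i + 4 <= sz; i += 4) {              /* COPY2_SIMD per vector step */
        __m256d TMP = _mm256_loadu_pd(v1 + i); /* TMP = load(v1) */
        _mm256_storeu_pd(res + i, TMP);        /* _MM_STORE(r, TMP) */
    }
    for (; i < sz; ++i)                        /* scalar loop tail */
        res[i] = v1[i];
}
#endif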
#define FILL1_SIMD(r,f1,f2,SUF) \
        _MM_STORE(r, f2, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
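/* FILL1_SIMD relies on the PREP hook: SIMD_CONST_DOUBLE_PREP broadcasts the
   fill value once, outside the loop.  A sketch under the same assumptions as
   above (illustrative name): */
#if 0
#include <immintrin.h>

static void fill_kernel_sketch(unsigned long sz, double *res, double x)
{
    const __m256d f2 = _mm256_set1_pd(x);      /* SIMD_CONST_DOUBLE_PREP(x) */
    unsigned long i = 0;
    for (; i + 4 <= sz; i += 4)
        _mm256_storeu_pd(res + i, f2);         /* FILL1_SIMD per vector step */
    for (; i < sz; ++i)
        res[i] = x;
}
#endif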
#define _mm256_add_sd _mm256_add_pd
#define _mm256_add_ss _mm256_add_ps
#define _mm256_sub_sd _mm256_sub_pd
#define _mm256_sub_ss _mm256_sub_ps
#define _mm256_mul_sd _mm256_mul_pd
#define _mm256_mul_ss _mm256_mul_ps
#define _mm256_div_sd _mm256_div_pd
#define _mm256_div_ss _mm256_div_ps

#define _mm512_add_sd _mm512_add_pd
#define _mm512_add_ss _mm512_add_ps
#define _mm512_sub_sd _mm512_sub_pd
#define _mm512_sub_ss _mm512_sub_ps
#define _mm512_mul_sd _mm512_mul_pd
#define _mm512_mul_ss _mm512_mul_ps
#define _mm512_div_sd _mm512_div_pd
#define _mm512_div_ss _mm512_div_ps

#define ADD3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        LD = _MMAVX(load##UNA2##_##SUF(v2)); \
        TMP = _MMAVX(add_##SUF(TMP, LD)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD)
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS)
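/* The three-operand kernels all follow the load/load/op/store shape of
   ADD3_SIMD; SUB3, MUL3 and DIV3 below differ only in the arithmetic
   intrinsic.  Sketch of the generated double kernel (illustrative name): */
#if 0
#include <immintrin.h>

static void add3_kernel_sketch(unsigned long sz, double *res,
                               const double *v1, const double *v2)
{
    unsigned long i = 0;
    for (; i + 4 <= sz; i += 4) {
        __m256d TMP = _mm256_loadu_pd(v1 + i);
        __m256d LD  = _mm256_loadu_pd(v2 + i);
        TMP = _mm256_add_pd(TMP, LD);          /* res = v1 + v2 */
        _mm256_storeu_pd(res + i, TMP);
    }
    for (; i < sz; ++i)
        res[i] = v1[i] + v2[i];
}
#endif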
#define SUB3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        LD = _MMAVX(load##UNA2##_##SUF(v2)); \
        TMP = _MMAVX(sub_##SUF(TMP, LD)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD)
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS)
#define MUL3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        LD = _MMAVX(load##UNA2##_##SUF(v2)); \
        TMP = _MMAVX(mul_##SUF(TMP, LD)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD)
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS)
/* NB: the parameter lists of the specializations below are reconstructed
   from the forwarded calls. */
template <>
inline void do_vec_vec_cmul<double>(const unsigned long sz,
        double *const res, const double *const v1, const double *const v2)
{
    do_vec_vec_mul<double>(sz, res, v1, v2);
}

template <>
inline void do_vec_vec_cmul<float>(const unsigned long sz,
        float *const res, const float *const v1, const float *const v2)
{
    do_vec_vec_mul<float>(sz, res, v1, v2);
}
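/* Pattern behind these specializations: conjugation is the identity on real
   types, so the conjugated entry points simply forward to the plain kernels.
   A minimal self-contained model (names illustrative, not the TBCI API): */
#if 0
template <typename T>
inline void vec_mul(unsigned long sz, T *res, const T *v1, const T *v2)
{
    for (unsigned long i = 0; i < sz; ++i)
        res[i] = v1[i] * v2[i];
}

template <typename T>
inline void vec_cmul(unsigned long sz, T *res, const T *v1, const T *v2)
{
    vec_mul<T>(sz, res, v1, v2);               /* conj(x) == x for real T */
}
#endif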
#define DIV3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        LD = _MMAVX(load##UNA2##_##SUF(v2)); \
        TMP = _MMAVX(div_##SUF(TMP, LD)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD)
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS)
template <>
inline void do_vec_vec_cdiv<double>(const unsigned long sz,
        double *const res, const double *const v1, const double *const v2)
{
    do_vec_vec_div<double>(sz, res, v1, v2);
}

template <>
inline void do_vec_vec_cdiv<float>(const unsigned long sz,
        float *const res, const float *const v1, const float *const v2)
{
    do_vec_vec_div<float>(sz, res, v1, v2);
}
#define ADD2_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load_##SUF(r)); \
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(add_##SUF(TMP, LD)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD)
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS)
#define SUB2_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load_##SUF(r)); \
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(sub_##SUF(TMP, LD)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD)
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS)
#define SUB2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load_##SUF(r)); \
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        LD = _MMAVX(sub_##SUF(LD, TMP)); \
        _MM_STORE(r, LD, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD)
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS)
#define MUL2_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load_##SUF(r)); \
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(mul_##SUF(TMP, LD)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD)
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS)
template <>
inline void do_vec_cmul_vec<double>(const unsigned long sz,
        double *const res, const double *const v1)
{
    do_vec_mul_vec<double>(sz, res, v1);
}

template <>
inline void do_vec_cmul_vec<float>(const unsigned long sz,
        float *const res, const float *const v1)
{
    do_vec_mul_vec<float>(sz, res, v1);
}
template <>
inline void do_vec_cmul_vec_inv<double>(const unsigned long sz,
        double *const res, const double *const v1)
{
    do_vec_mul_vec<double>(sz, res, v1);
}

template <>
inline void do_vec_cmul_vec_inv<float>(const unsigned long sz,
        float *const res, const float *const v1)
{
    do_vec_mul_vec<float>(sz, res, v1);
}
#define DIV2_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load_##SUF(r)); \
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(div_##SUF(TMP, LD)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD)
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS)
#define DIV2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load_##SUF(r)); \
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        LD = _MMAVX(div_##SUF(LD, TMP)); \
        _MM_STORE(r, LD, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD)
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS)
template <>
inline void do_vec_cdiv_vec<double>(const unsigned long sz,
        double *const res, const double *const v1)
{
    do_vec_div_vec<double>(sz, res, v1);
}

template <>
inline void do_vec_cdiv_vec<float>(const unsigned long sz,
        float *const res, const float *const v1)
{
    do_vec_div_vec<float>(sz, res, v1);
}
template <>
inline void do_vec_cdiv_vec_inv<double>(const unsigned long sz,
        double *const res, const double *const v1)
{
    do_vec_div_vec_inv<double>(sz, res, v1);
}

template <>
inline void do_vec_cdiv_vec_inv<float>(const unsigned long sz,
        float *const res, const float *const v1)
{
    do_vec_div_vec_inv<float>(sz, res, v1);
}
#define ADD2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(add_##SUF(TMP, f2)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define SUB2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(sub_##SUF(TMP, f2)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define MUL2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(mul_##SUF(TMP, f2)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
template <>
inline void do_val_vec_add<double>(const unsigned long sz, double *const res,
        const double _f2, const double *const v1)
{
    do_vec_val_add<double>(sz, res, v1, _f2);
}

template <>
inline void do_val_vec_add<float>(const unsigned long sz, float *const res,
        const float _f2, const float *const v1)
{
    do_vec_val_add<float>(sz, res, v1, _f2);
}
#define SUB2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(sub_##SUF(f2, TMP)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
template <>
inline void do_val_vec_mul<double>(const unsigned long sz, double *const res,
        const double _f2, const double *const v1)
{
    do_vec_val_mul<double>(sz, res, v1, _f2);
}

template <>
inline void do_val_vec_mul<float>(const unsigned long sz, float *const res,
        const float _f2, const float *const v1)
{
    do_vec_val_mul<float>(sz, res, v1, _f2);
}
#define DIV2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(div_##SUF(f2, TMP)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define ADD1NV_SIMD(r,f1,f2,SUF) \
        TMP = _MMAVX(load_##SUF(r)); \
        TMP = _MMAVX(add_##SUF(TMP, f2)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define SUB1NV_SIMD(r,f1,f2,SUF) \
        TMP = _MMAVX(load_##SUF(r)); \
        TMP = _MMAVX(sub_##SUF(TMP, f2)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define SUB1RV_SIMD(r,f1,f2,SUF) \
        TMP = _MMAVX(load_##SUF(r)); \
        TMP = _MMAVX(sub_##SUF(f2, TMP)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define MUL1NV_SIMD(r,f1,f2,SUF) \
        TMP = _MMAVX(load_##SUF(r)); \
        TMP = _MMAVX(mul_##SUF(TMP, f2)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define DIV1NV_SIMD(r,f1,f2,SUF) \
        TMP = _MMAVX(load_##SUF(r)); \
        TMP = _MMAVX(div_##SUF(TMP, f2)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define DIV1RV_SIMD(r,f1,f2,SUF) \
        TMP = _MMAVX(load_##SUF(r)); \
        TMP = _MMAVX(div_##SUF(f2, TMP)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
template <>
inline void do_val_add_vec<double>(const unsigned long sz,
        const double _f2, double *const res)
{
    do_vec_add_val<double>(sz, res, _f2);
}

template <>
inline void do_val_add_vec<float>(const unsigned long sz,
        const float _f2, float *const res)
{
    do_vec_add_val<float>(sz, res, _f2);
}
#if defined( __FMA__) && !defined(NO_FMA)
#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SIMD)
# warning Info: Using FMA instructions
#endif

#define _mm256_fmadd_sd _mm256_fmadd_pd
#define _mm256_fmadd_ss _mm256_fmadd_ps
#define _mm256_fmsub_sd _mm256_fmsub_pd
#define _mm256_fmsub_ss _mm256_fmsub_ps

#define _mm256_FMA(SUF,a,b,c,d) \
        d = _mm256_fmadd_##SUF(a,b,c)
#define _mm256_FMS(SUF,a,b,c,d) \
        d = _mm256_fmsub_##SUF(a,b,c)
#else
#define _mm256_FMA(SUF,a,b,c,d) \
        b = _mm256_mul_##SUF(a, b); \
        d = _mm256_add_##SUF(b, c)
#define _mm256_FMS(SUF,a,b,c,d) \
        b = _mm256_mul_##SUF(a, b); \
        d = _mm256_sub_##SUF(b, c)
#endif

#define ADD2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(load_##SUF(r)); \
        _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
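/* _mm256_FMA computes d = a*b + c either way, but not bitwise identically:
   the fused instruction rounds once, the mul/add fallback rounds twice.
   A self-contained sketch of the two paths (illustrative name): */
#if 0
#include <immintrin.h>

static __m256d fma_or_muladd(__m256d a, __m256d b, __m256d c)
{
#if defined(__FMA__)
    return _mm256_fmadd_pd(a, b, c);               /* one rounding */
#else
    return _mm256_add_pd(_mm256_mul_pd(a, b), c);  /* two roundings */
#endif
}
#endif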
#define SUB2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(load_##SUF(r)); \
        LD = _MMAVX(mul_##SUF(f2,LD)); \
        TMP = _MMAVX(sub_##SUF(TMP,LD)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define SUB2RS_SIMD(r,v1,f1,f2,SUF,UNA1) \
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(load_##SUF(r)); \
        _MMAVX(FMS(SUF,f2,LD,TMP,LD)); \
        _MM_STORE(r, LD, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define ADD3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
        LD = _MMAVX(load##UNA2##_##SUF(v2)); \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define SUB3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
        LD = _MMAVX(load##UNA2##_##SUF(v2)); \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        LD = _MMAVX(mul_##SUF(LD, f2)); \
        TMP = _MMAVX(sub_##SUF(TMP, LD)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define ADD3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
        _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define SUB3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
        _MMAVX(FMS(SUF,f2,LD,TMP,LD)); \
        _MM_STORE(r, LD, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define ADD3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
        TMP = _MMAVX(mul_##SUF(f2,TMP)); \
        _MMAVX(FMA(SUF,f1,LD,TMP,LD)); \
        _MM_STORE(r, LD, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMD, double, __MAVXD)
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMS, float, __MAVXS)
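/* The *SS kernels form axpby-style updates from two broadcast constants:
   ADD3SS computes r = f1*v1 + f2*v2 with one multiply and one FMA.  One
   vector step, assuming FMA hardware (illustrative name): */
#if 0
#include <immintrin.h>

static void add3ss_step_sketch(double *r, const double *v1, const double *v2,
                               __m256d f1, __m256d f2)
{
    __m256d LD  = _mm256_loadu_pd(v1);
    __m256d TMP = _mm256_loadu_pd(v2);
    TMP = _mm256_mul_pd(f2, TMP);              /* f2*v2 */
    LD  = _mm256_fmadd_pd(f1, LD, TMP);        /* f1*v1 + f2*v2 */
    _mm256_storeu_pd(r, LD);
}
#endif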
#define SUB3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
        TMP = _MMAVX(mul_##SUF(TMP, f2)); \
        _MMAVX(FMS(SUF,f1,LD,TMP,LD)); \
        _MM_STORE(r, LD, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMD, double, __MAVXD)
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMS, float, __MAVXS)
#define ADD2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
        LD = _MMAVX(load_##SUF(r)); \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define SUB2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
        LD = _MMAVX(load_##SUF(r)); \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        _MMAVX(FMS(SUF,f2,LD,TMP,LD)); \
        _MM_STORE(r, LD, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMD, double, __MAVXD)
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1, AVXMS, float, __MAVXS)
#define ADD2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
        LD = _MMAVX(load_##SUF(r)); \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(mul_##SUF(TMP, f2)); \
        _MMAVX(FMA(SUF,f1,LD,TMP,LD)); \
        _MM_STORE(r, LD, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMD, double, __MAVXD)
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMS, float, __MAVXS)
#define SUB2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
        LD = _MMAVX(load_##SUF(r)); \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(mul_##SUF(TMP, f2)); \
        _MMAVX(FMS(SUF,f1,LD,TMP,LD)); \
        _MM_STORE(r, LD, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMD, double, __MAVXD)
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMS, float, __MAVXS)
#define ADD2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        _MMAVX(FMA(SUF,f1,TMP,f2,TMP)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMD, double, __MAVXD)
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMS, float, __MAVXS)
#define SUB2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        _MMAVX(FMS(SUF,f1,TMP,f2,TMP)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMD, double, __MAVXD)
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMS, float, __MAVXS)
#define ADD1SV_SIMD(r,f1,f2,SUF) \
        TMP = _MMAVX(load_##SUF(r)); \
        _MMAVX(FMA(SUF,f1,TMP,f2,TMP)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMD, double, __MAVXD)
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMS, float, __MAVXS)
#define SUB1SV_SIMD(r,f1,f2,SUF) \
        TMP = _MMAVX(load_##SUF(r)); \
        _MMAVX(FMS(SUF,f1,TMP,f2,TMP)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMD, double, __MAVXD)
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMS, float, __MAVXS)
template <>
inline void do_val_svc_add<double>(const unsigned long sz, double *const res,
        const double f1, const double *const v1, const double f2)
{
    do_svc_val_add<double>(sz, res, v1, f2, f1);
}

template <>
inline void do_val_svc_add<float>(const unsigned long sz, float *const res,
        const float f1, const float *const v1, const float f2)
{
    do_svc_val_add<float>(sz, res, v1, f2, f1);
}
#define SUB2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(mul_##SUF(TMP, f2)); \
        TMP = _MMAVX(sub_##SUF(f1, TMP)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMD, double, __MAVXD)
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMS, float, __MAVXS)
#define DIV2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(mul_##SUF(TMP, f2)); \
        TMP = _MMAVX(div_##SUF(f1, TMP)); \
        _MM_STORE(r, TMP, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMD, double, __MAVXD)
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2, AVXMS, float, __MAVXS)
#if defined(__AVX512F__) /* assumed guard; the original condition was lost */
#ifdef HAVE_LONG_LONG
#define NEG_DOUBLE_PREP \
        static union _negmask { \
            unsigned LONG_LONG lng[8]; \
            double dbl[8]; \
        } ALIGN(64) negmask = { {0x8000000000000000ULL, 0x8000000000000000ULL, \
            0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, \
            0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL}, }; \
        __m512d neg = _MMAVX(load_pd(negmask.dbl))
#else
#define NEG_DOUBLE_PREP \
        static union _negmask { \
            unsigned int lng[16]; \
            double dbl[8]; \
        } ALIGN(64) negmask = { {0x0U, 0x80000000U, 0x0U, 0x80000000U, \
            0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, \
            0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U}, }; \
        __m512d neg = _MMAVX(load_pd(negmask.dbl))
#endif
#define NEG_FLOAT_PREP \
        static union _negmask { \
            unsigned int itg[16]; \
            float flt[16]; \
        } ALIGN(64) negmask = { {0x80000000U, 0x80000000U, 0x80000000U, \
            0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, \
            0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, \
            0x80000000U, 0x80000000U, 0x80000000U}, }; \
        __m512 neg = _MMAVX(load_ps(negmask.flt))
#else
#ifdef HAVE_LONG_LONG
#define NEG_DOUBLE_PREP \
        static union _negmask { \
            unsigned LONG_LONG lng[4]; \
            double dbl[4]; \
        } ALIGN(32) negmask = { {0x8000000000000000ULL, 0x8000000000000000ULL, \
            0x8000000000000000ULL, 0x8000000000000000ULL}, }; \
        __m256d neg = _MMAVX(load_pd(negmask.dbl))
#else
#define NEG_DOUBLE_PREP \
        static union _negmask { \
            unsigned int lng[8]; \
            double dbl[4]; \
        } ALIGN(32) negmask = { {0x0U, 0x80000000U, 0x0U, 0x80000000U, \
            0x0U, 0x80000000U, 0x0U, 0x80000000U}, }; \
        __m256d neg = _MMAVX(load_pd(negmask.dbl))
#endif
#define NEG_FLOAT_PREP \
        static union _negmask { \
            unsigned int itg[8]; \
            float flt[8]; \
        } ALIGN(32) negmask = { {0x80000000U, 0x80000000U, 0x80000000U, \
            0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}, }; \
        __m256 neg = _MMAVX(load_ps(negmask.flt))
#endif

#define _mm256_xor_sd _mm256_xor_pd
#define _mm256_xor_ss _mm256_xor_ps
#define _mm512_xor_sd _mm512_xor_pd
#define _mm512_xor_ss _mm512_xor_ps

#define NEG2_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(xor_##SUF(TMP, neg)); \
        _MM_STORE(r, TMP, SUF,)

        NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD)
        NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS)
#define NEG1_SIMD(r,f1,f2,SUF) \
        TMP = _MMAVX(load_##SUF(r)); \
        TMP = _MMAVX(xor_##SUF(TMP, neg)); \
        _MM_STORE(r, TMP, SUF,)

        NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD)
        NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS)
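/* Negation never needs arithmetic: XORing the IEEE-754 sign bit flips the
   sign of every lane, which is what the negmask unions above encode.  A
   sketch of one vector step (illustrative name; -0.0 carries only the sign
   bit, so set1_pd(-0.0) builds the same mask): */
#if 0
#include <immintrin.h>

static void neg_step_sketch(double *r, const double *v1)
{
    const __m256d neg = _mm256_set1_pd(-0.0);  /* 0x8000000000000000 per lane */
    __m256d TMP = _mm256_loadu_pd(v1);
    TMP = _mm256_xor_pd(TMP, neg);             /* flip each sign bit */
    _mm256_storeu_pd(r, TMP);
}
#endif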
#define VL_PREP(x) long f2 = (x)
#define VL_FIN(x) x = f2

#define _mm256_movemask_sd(x) \
        _mm256_movemask_pd(x); rg &= 0x1
#define _mm256_movemask_ss(x) \
        _mm256_movemask_ps(x); rg &= 0x1
#define _mm512_movemask_sd(x) \
        _mm512_movemask_pd(x); rg &= 0x1
#define _mm512_movemask_ss(x) \
        _mm512_movemask_ps(x); rg &= 0x1

#define _mm256_cmpneq_pd(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_UQ)
#define _mm256_cmpneq_ps(a,b) _mm256_cmp_ps(a,b,_CMP_NEQ_UQ)
#define _mm256_cmpneq_sd _mm256_cmpneq_pd
#define _mm256_cmpneq_ss _mm256_cmpneq_ps

#define _mm256_cmp_sd _mm256_cmp_pd
#define _mm256_cmp_ss _mm256_cmp_ps

#define COMP2_SIMD(r,v1,f1,f2,SUF,UNA) \
        TMP = _MMAVX(load_##SUF(r)); \
        LD = _MMAVX(load##UNA##_##SUF(v1)); \
        TMP = _MMAVX(cmpneq_##SUF(TMP, LD)); \
        rg = _MMAVX(movemask_##SUF(TMP)); \
        if (rg) { ++f2; goto _fin; }

        VL_PREP, SIMD_EMPTY0, VL_FIN, AVXMD, double, __MAVXD)
        VL_PREP, SIMD_EMPTY0, VL_FIN, AVXMS, float, __MAVXS)
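/* COMP2_SIMD counts differences a whole vector at a time: cmpneq yields an
   all-ones lane wherever the elements differ, and movemask packs the lane
   sign bits into an int, so any nonzero mask means "difference found".
   Sketch of the test (illustrative name): */
#if 0
#include <immintrin.h>

static int lanes_differ_sketch(const double *a, const double *b)
{
    __m256d x   = _mm256_loadu_pd(a);
    __m256d y   = _mm256_loadu_pd(b);
    __m256d neq = _mm256_cmp_pd(x, y, _CMP_NEQ_UQ);
    return _mm256_movemask_pd(neq);            /* one bit per differing lane */
}
#endif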
#define DECL_DOUBLE __MAVXD TM2
#define DECL_FLOAT __MAVXS TM2

#define _mm_loadu_sd _mm_load_sd
#define _mm_loadu_ss _mm_load_ss

#define SUMMULT3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
        TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
        LD = _MMAVX(load##UNA2##_##SUF(v2)); \
        TM2 = _MMAVX(load_##SUF(r)); \
        _MMAVX(FMA(SUF,TMP,LD,TM2,TM2)); \
        _MM_STORE(r, TM2, SUF,)

        DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD);
        DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS);
        DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0, AVXMD, double, __MAVXD)
        DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0, AVXMS, float, __MAVXS)
template <>
inline void do_add_vec_vec_cmul<double>(const unsigned long sz,
        double *const r, const double *const v1, const double *const v2)
{
    do_add_vec_vec_mul<double>(sz, r, v1, v2);
}

template <>
inline void do_add_vec_vec_cmul<float>(const unsigned long sz,
        float *const r, const float *const v1, const float *const v2)
{
    do_add_vec_vec_mul<float>(sz, r, v1, v2);
}
#ifndef TBCI_NO_SIMD_SUM

#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
# warning Info: Using unrolled AVX vector kernels for sums (reductions)
#endif

#define SUM_DOUBLE_PREP(x) REGISTER __MAVXD f2 = _MMAVX(set_sd(x))
#define SUM_FLOAT_PREP(x) REGISTER __MAVXS f2 = _MMAVX(set_ss(x))

#define XSUM_DOUBLE_PREP(x) \
        REGISTER __MAVXD f1 = _MMAVX(setzero_pd()); \
        REGISTER __MAVXD f2 = _MMAVX(set_sd(x))
#define XSUM_FLOAT_PREP(x) \
        REGISTER __MAVXS f1 = _MMAVX(setzero_ps()); \
        REGISTER __MAVXS f2 = _MMAVX(set_ss(x))

#if 1 //def __SSE3__
#if !defined(__AVX512F__) /* assumed guard; the original condition was lost */
# define SUM_DOUBLE_SIMD_FINX(f) \
        f = _mm256_hadd_pd(f, _mm256_permute2f128_pd(f, f, 0x33)); \
        f = _mm256_hadd_pd(f, f)
# define SUM_FLOAT_SIMD_FINX(f) \
        f = _mm256_hadd_ps(f, _mm256_permute2f128_ps(f, f, 0x33)); \
        f = _mm256_hadd_ps(f, f); \
        f = _mm256_hadd_ps(f, f)
#else
# warning horizontal sum AVX512 not yet implemented
#endif
#else
# define SUM_DOUBLE_SIMD_FINX(f) \
        __MAVXD TM##f = f; \
        TM##f = _MMAVX(unpackhi_pd(TM##f, f)); \
        f = _MMAVX(add_sd(f, TM##f))
# define SUM_FLOAT_SIMD_FINX(f) \
        __MAVXS TM##f = f; \
        TM##f = _MMAVX(shuffle_ps(TM##f, f, 0xb1)); \
        f = _MMAVX(add_ps(f, TM##f)); \
        TM##f = _MMAVX(shuffle_ps(TM##f, f, 0x1b)); \
        f = _MMAVX(add_ss(f, TM##f))
# if defined(__GNUC__) && defined(WARN_SSE)
#  warning Not using SSE3 -- consider passing -msse3
# endif
#endif

#define SUM_DOUBLE_SIMD_FIN SUM_DOUBLE_SIMD_FINX(f2)
#define SUM_FLOAT_SIMD_FIN SUM_FLOAT_SIMD_FINX(f2)

#define SUM_DOUBLE_FINAL(x) \
        _MMAVX(store_sd(&x, f2))
#define SUM_FLOAT_FINAL(x) \
        _MMAVX(store_ss(&x, f2))

#define _mm256_move_ps(f, x) x
#define _mm256_move_pd(f, x) x
#define _mm256_move_ss(x, f) (__extension__ (__m256 ) { ((__v8sf)f)[0],((__v8sf)x)[1],((__v8sf)x)[2],((__v8sf)x)[3], \
        ((__v8sf)x)[4],((__v8sf)x)[5],((__v8sf)x)[6],((__v8sf)x)[7] })
#define _mm256_move_sd(x, f) (__extension__ (__m256d) { ((__v4df)f)[0],((__v4df)x)[1],((__v4df)x)[2],((__v4df)x)[3] })

#define XSUM_DOUBLE_SIMD_FIN_STORE \
        /* ... */
#define XSUM_FLOAT_SIMD_FIN_STORE \
        /* ... */

#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE(x) \
        /* ... */ \
        SUM_DOUBLE_SIMD_FINX(f2); \
        SUM_DOUBLE_SIMD_FINX(f1); \
        f2 = _MMAVX(sub_sd(f2, f1)); \
        _MMAVX(store_sd(&x, f2))
#define XSUM_FLOAT_SIMD_FINAL_COMPLETE(x) \
        /* ... */ \
        SUM_FLOAT_SIMD_FINX(f2); \
        SUM_FLOAT_SIMD_FINX(f1); \
        f2 = _MMAVX(sub_ss(f2, f1)); \
        _MMAVX(store_ss(&x, f2))

#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X(x) \
        /* ... */ \
        SUM_DOUBLE_SIMD_FINX(f2); \
        /* ... */ \
        COR = _MMAVX(sub_sd(COR, TMP)); \
        TMP = _MMAVX(permute2f128_pd(TMP, TMP, 0x10)); \
        COR = _MMAVX(sub_sd(COR, TMP)); \
        TMP = _MMAVX(unpackhi_pd(TMP, TMP)); \
        COR = _MMAVX(sub_sd(COR, TMP)); \
        TMP = _MMAVX(permute2f128_pd(TMP, TMP, 0x10)); \
        COR = _MMAVX(sub_sd(COR, TMP)); \
        COR = _MMAVX(set_sd(((__v4df)COR)[0])); \
        f1 = _MMAVX(add_sd(f1, COR)); \
        SUM_DOUBLE_SIMD_FINX(f1); \
        f2 = _MMAVX(sub_sd(f2, f1)); \
        _MMAVX(store_sd(&x, f2))

#define MULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load_##SUF(r)); \
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        _MMAVX(FMA(SUF,TMP,LD,f2,f2))

        SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL, AVXMD, double, __MAVXD)
        SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL, AVXMS, float, __MAVXS)
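/* SUM_DOUBLE_SIMD_FINX folds the four partial sums of a __m256d into lane 0:
   permute2f128 copies the upper half down, then two hadd_pd passes finish
   the reduction.  Equivalent standalone form (illustrative name): */
#if 0
#include <immintrin.h>

static double hsum256_sketch(__m256d f)        /* f[0]+f[1]+f[2]+f[3] */
{
    __m256d hi = _mm256_permute2f128_pd(f, f, 0x33); /* [f2,f3,f2,f3] */
    f = _mm256_hadd_pd(f, hi);                 /* [f0+f1, f2+f3, ...] */
    f = _mm256_hadd_pd(f, f);                  /* lane 0 holds the total */
    return _mm256_cvtsd_f64(f);
}
#endif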
#define XMULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
        TMP = _MMAVX(load_##SUF(r)); \
        LD = _MMAVX(load##UNA1##_##SUF(v1)); \
        TMP = _MMAVX(mul_##SUF(TMP, LD)); \
        LD = _MMAVX(move_##SUF(LD, TMP)); \
        /* ... */ \
        TMP = _MMAVX(add_##SUF(TMP, f2)); \
        /* ... */ \
        TMP = _MMAVX(sub_##SUF(TMP, f2)); \
        TMP = _MMAVX(sub_##SUF(TMP, LD)); \
        f1 = _MMAVX(add_##SUF(f1, TMP)); \
        /* ... */ \
        f2 = _MMAVX(move_##SUF(f2, t))

        XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
        XSUM_DOUBLE_SIMD_FINAL_COMPLETE,
        AVXMD, double, __MAVXD)
        XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
        XSUM_FLOAT_SIMD_FINAL_COMPLETE,
        AVXMS, float, __MAVXS)
template <>
inline void do_vec_dot_exact<double>(const unsigned long sz,
        const double *const _v1, const double *const _v2, double& _f2)
{
    do_vec_mult_exact<double>(sz, _v1, _v2, _f2);
}

template <>
inline void do_vec_dot_quick<double>(const unsigned long sz,
        const double *const _v1, const double *const _v2, double& _f2)
{
    do_vec_mult_quick<double>(sz, _v1, _v2, _f2);
}

template <>
inline void do_vec_dot_exact<float>(const unsigned long sz,
        const float *const _v1, const float *const _v2, float& _f2)
{
    do_vec_mult_exact<float>(sz, _v1, _v2, _f2);
}

template <>
inline void do_vec_dot_quick<float>(const unsigned long sz,
        const float *const _v1, const float *const _v2, float& _f2)
{
    do_vec_mult_quick<float>(sz, _v1, _v2, _f2);
}
#define SQR1_SIMD(r,f1,f2,SUF) \
        TMP = _MMAVX(load_##SUF(r)); \
        _MMAVX(FMA(SUF,TMP,TMP,f2,f2))

        SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL, AVXMD, double, __MAVXD)
        SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL, AVXMS, float, __MAVXS)
#define XSQR1_SIMD(r,f1,f2,SUF) \
        TMP = _MMAVX(load_##SUF(r)); \
        TMP = _MMAVX(mul_##SUF(TMP, TMP)); \
        /* ... */ \
        TMP = _MMAVX(add_##SUF(TMP, f2)); \
        /* ... */ \
        TMP = _MMAVX(sub_##SUF(TMP, f2)); \
        TMP = _MMAVX(sub_##SUF(TMP, y)); \
        f1 = _MMAVX(add_##SUF(f1, TMP)); \
        /* ... */ \
        f2 = _MMAVX(move_##SUF(f2, t))

        XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
        XSUM_DOUBLE_SIMD_FINAL_COMPLETE,
        AVXMD, double, __MAVXD)
        XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
        XSUM_FLOAT_SIMD_FINAL_COMPLETE,
        AVXMS, float, __MAVXS)
#ifndef TBCI_NO_SIMD_FABSSQR
template <>
inline void do_vec_fabssqr_quick<double>(const unsigned long sz,
        const double *const _v1, double& _f2)
{
    double F2 = 0.;  /* assumed temporary; the original declaration was lost */
    do_vec_sumsqr_quick<double>(sz, _v1, F2);
    _f2 = F2;
}
template <>
inline void do_vec_fabssqr_exact<double>(const unsigned long sz,
        const double *const _v1, double& _f2)
{
    double F2 = 0.;  /* assumed temporary; the original declaration was lost */
    do_vec_sumsqr_exact<double>(sz, _v1, F2);
    _f2 = F2;
}
#endif // TBCI_NO_SIMD_FABSSQR

#ifdef TBCI_SIMD_FABSSQR_FLOAT // The loss of precision with float is unbearable
template <>
inline void do_vec_fabssqr_quick<float>(const unsigned long sz,
        const float *const _v1, double& _f2)
{
    float F2 = 0.f;  /* assumed temporary; the original declaration was lost */
    do_vec_sumsqr_quick<float>(sz, _v1, F2);
    _f2 = F2;
}
template <>
inline void do_vec_fabssqr_exact<float>(const unsigned long sz,
        const float *const _v1, double& _f2)
{
    float F2 = 0.f;  /* assumed temporary; the original declaration was lost */
    do_vec_sumsqr_exact<float>(sz, _v1, F2);
    _f2 = F2;
}
#endif // TBCI_SIMD_FABSSQR_FLOAT

#define SUM1_SIMD(r,f1,f2,SUF) \
        TMP = _MMAVX(load_##SUF(r)); \
        f2 = _MMAVX(add_##SUF(f2, TMP))

        SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL, AVXMD, double, __MAVXD)
        SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL, AVXMS, float, __MAVXS)
#define XSUM1_SIMD(r,f1,f2,SUF) \
        y = _MMAVX(load_##SUF(r)); \
        t = _MMAVX(add_##SUF(f2, y)); \
        TMP = _MMAVX(sub_##SUF(t, f2)); \
        TMP = _MMAVX(sub_##SUF(TMP, y)); \
        f1 = _MMAVX(add_##SUF(f1, TMP)); \
        f2 = _MMAVX(move_##SUF(f2, t))

        XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
        XSUM_DOUBLE_SIMD_FINAL_COMPLETE,
        AVXMD, double, __MAVXD)
        XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
        XSUM_FLOAT_SIMD_FINAL_COMPLETE,
        AVXMS, float, __MAVXS)
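/* XSUM1_SIMD is Kahan (compensated) summation on whole vectors: f2 carries
   the running sums, f1 the rounding error recovered at each step, and the
   FINAL_COMPLETE macros subtract the two after the horizontal reduction.
   The scalar algorithm being vectorized (illustrative name; requires strict
   IEEE evaluation, i.e. no -ffast-math): */
#if 0
static double kahan_sum_sketch(unsigned long sz, const double *v)
{
    double f2 = 0.0;                           /* running sum */
    double f1 = 0.0;                           /* accumulated error */
    for (unsigned long i = 0; i < sz; ++i) {
        double y   = v[i];
        double t   = f2 + y;                   /* XSUM1: t = add(f2, y) */
        double err = (t - f2) - y;             /* what the add rounded away */
        f1 += err;
        f2  = t;
    }
    return f2 - f1;                            /* FINAL_COMPLETE: f2 - f1 */
}
#endif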
#endif // TBCI_NO_SIMD_SUM

#endif // TBCI_SELECTIVE_INST

#endif /* __AVX__ etc. (assumed; the closing line was lost) */

#endif // H_VEC_KERN_SPECIAL2_H