#ifndef H_VEC_KERN_SPECIAL_H
#define H_VEC_KERN_SPECIAL_H

#if defined(__SSE2__) && defined(HAVE_EMMINTRIN_H) && defined(HAVE_WEAK_ATTR) && \
    ( defined(__x86_64__) || defined(__i386__) )

#include <emmintrin.h>

#if defined(HAVE_PMMINTRIN_H) && defined(__SSE3__)
# include <pmmintrin.h>
#endif

#include "tbci/unroll_prefetch_simd_def.h"

#if 0 //defined(TBCI_SELECTIVE_INST) && !defined(TBCI_INSTANTIATE) && !defined(AUTO_DECL)
# include "vec_kern_special_gd.h"
#else

#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
# warning Info: Using unrolled SSE2 vector kernels
#endif

/* Empty placeholders for the PREP/FIN/FINAL slots of the kernel templates. */
#define SIMD_EMPTY0      do {} while (0)
#define SIMD_EMPTY1(x)   do {} while (0)
#define SIMD_EMPTY2(x,y) do {} while (0)

/* Broadcast one or two scalar constants into SSE registers. */
#define SIMD_CONST_DOUBLE_PREP(x)    REGISTER __m128d f2 = _mm_set1_pd(x)
#define SIMD_2CONST_DOUBLE_PREP(x,y) REGISTER __m128d f1 = _mm_set1_pd(x), f2 = _mm_set1_pd(y)
#define SIMD_CONST_FLOAT_PREP(x)     REGISTER __m128 f2 = _mm_set1_ps(x)
#define SIMD_2CONST_FLOAT_PREP(x,y)  REGISTER __m128 f1 = _mm_set1_ps(x), f2 = _mm_set1_ps(y)

/* Store wrapper; the first branch singles out GCC 4.0.0. */
#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MAJOR__ == 0 && \
    __GNUC_MINOR__ == 0 && \
    (! defined(__GNUC_PATCHLEVEL__) || __GNUC_PATCHLEVEL__ == 0)
# define _MM_STORE(mem, reg, SUF, UNA) \
    _mm_store##UNA##_##SUF(mem, reg)
#else
# define _MM_STORE(mem, reg, SUF, UNA) \
    _mm_store##UNA##_##SUF(mem, reg)
#endif

#define _mm_loadu_sd _mm_load_sd
#define _mm_loadu_ss _mm_load_ss

#define COPY2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
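/*
 * Illustration only (not part of the kernels): COPY2_SIMD is the per-step body
 * that the VKERN_TEMPL_* wrapper macros (presumably provided by
 * unroll_prefetch_simd_def.h, included above) expand inside their unrolled,
 * prefetching loops.  With placeholder names res, v1 and sz, the copy kernel
 * is equivalent to the scalar loop
 *
 *     for (unsigned long i = 0; i < sz; ++i)
 *         res[i] = v1[i];
 *
 * with the SSE2 path moving two doubles (pd) or four floats (ps) per step and
 * selecting the unaligned load variant via the UNA1 suffix when the source is
 * not 16-byte aligned.
 */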
#define FILL1_SIMD(r,f1,f2,SUF) \
    _MM_STORE(r, f2, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
#define ADD3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_load##UNA2##_##SUF(v2); \
    TMP = _mm_add_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

#define SUB3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_load##UNA2##_##SUF(v2); \
    TMP = _mm_sub_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

#define MUL3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_load##UNA2##_##SUF(v2); \
    TMP = _mm_mul_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
/* For real types a conjugate multiplication is just a plain multiplication. */
template <>
inline void do_vec_vec_cmul<double>(const unsigned long sz, double *const res,
                                    const double *const v1, const double *const v2)
{
    do_vec_vec_mul<double>(sz, res, v1, v2);
}

template <>
inline void do_vec_vec_cmul<float>(const unsigned long sz, float *const res,
                                   const float *const v1, const float *const v2)
{
    do_vec_vec_mul<float>(sz, res, v1, v2);
}
#define DIV3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_load##UNA2##_##SUF(v2); \
    TMP = _mm_div_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
/* For real types a conjugate division is just a plain division. */
template <>
inline void do_vec_vec_cdiv<double>(const unsigned long sz, double *const res,
                                    const double *const v1, const double *const v2)
{
    do_vec_vec_div<double>(sz, res, v1, v2);
}

template <>
inline void do_vec_vec_cdiv<float>(const unsigned long sz, float *const res,
                                   const float *const v1, const float *const v2)
{
    do_vec_vec_div<float>(sz, res, v1, v2);
}
#define ADD2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_add_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

#define SUB2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_sub_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

#define SUB2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_sub_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

#define MUL2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
template <>
inline void do_vec_cmul_vec<double>(const unsigned long sz, double *const res,
                                    const double *const v1)
{
    do_vec_mul_vec<double>(sz, res, v1);
}

template <>
inline void do_vec_cmul_vec<float>(const unsigned long sz, float *const res,
                                   const float *const v1)
{
    do_vec_mul_vec<float>(sz, res, v1);
}
/* Multiplication commutes, so the reversed-operand variant can use the same kernel. */
template <>
inline void do_vec_cmul_vec_inv<double>(const unsigned long sz, double *const res,
                                        const double *const v1)
{
    do_vec_mul_vec<double>(sz, res, v1);
}

template <>
inline void do_vec_cmul_vec_inv<float>(const unsigned long sz, float *const res,
                                       const float *const v1)
{
    do_vec_mul_vec<float>(sz, res, v1);
}
#define DIV2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_div_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

#define DIV2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_div_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

    SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
template <>
inline void do_vec_cdiv_vec<double>(const unsigned long sz, double *const res,
                                    const double *const v1)
{
    do_vec_div_vec<double>(sz, res, v1);
}

template <>
inline void do_vec_cdiv_vec<float>(const unsigned long sz, float *const res,
                                   const float *const v1)
{
    do_vec_div_vec<float>(sz, res, v1);
}
template <>
inline void do_vec_cdiv_vec_inv<double>(const unsigned long sz, double *const res,
                                        const double *const v1)
{
    do_vec_div_vec_inv<double>(sz, res, v1);
}

template <>
inline void do_vec_cdiv_vec_inv<float>(const unsigned long sz, float *const res,
                                       const float *const v1)
{
    do_vec_div_vec_inv<float>(sz, res, v1);
}
#define ADD2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_add_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
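/*
 * Illustration only: the *NV kernels combine a vector with a scalar that the
 * SIMD_CONST_*_PREP slot has broadcast into f2 via _mm_set1_pd/_mm_set1_ps.
 * With placeholder names res, v1 and x, ADD2NV corresponds to
 *
 *     for (unsigned long i = 0; i < sz; ++i)
 *         res[i] = v1[i] + x;
 *
 * SUB2NV and MUL2NV below only swap the arithmetic operation, while the *RV
 * variants put the broadcast scalar on the left-hand side of the operation.
 */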
#define SUB2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_sub_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define MUL2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
/* Addition commutes: the VAL + VEC form forwards to the VEC + VAL kernel. */
template <>
inline void do_val_vec_add<double>(const unsigned long sz, double *const res,
                                   const double *const v1, const double _f2)
{
    do_vec_val_add<double>(sz, res, v1, _f2);
}

template <>
inline void do_val_vec_add<float>(const unsigned long sz, float *const res,
                                  const float *const v1, const float _f2)
{
    do_vec_val_add<float>(sz, res, v1, _f2);
}
#define SUB2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_sub_##SUF(f2, TMP); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
/* Multiplication commutes: the VAL * VEC form forwards to the VEC * VAL kernel. */
template <>
inline void do_val_vec_mul<double>(const unsigned long sz, double *const res,
                                   const double *const v1, const double _f2)
{
    do_vec_val_mul<double>(sz, res, v1, _f2);
}

template <>
inline void do_val_vec_mul<float>(const unsigned long sz, float *const res,
                                  const float *const v1, const float _f2)
{
    do_vec_val_mul<float>(sz, res, v1, _f2);
}
#define DIV2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_div_##SUF(f2, TMP); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define ADD1NV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_add_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define SUB1NV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_sub_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define SUB1RV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_sub_##SUF(f2, TMP); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define MUL1NV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define DIV1NV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_div_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define DIV1RV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_div_##SUF(f2, TMP); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
/* In-place VAL + VEC forwards to the in-place VEC + VAL kernel. */
template <>
inline void do_val_add_vec<double>(const unsigned long sz, double *const res,
                                   const double _f2)
{
    do_vec_add_val<double>(sz, res, _f2);
}

template <>
inline void do_val_add_vec<float>(const unsigned long sz, float *const res,
                                  const float _f2)
{
    do_vec_add_val<float>(sz, res, _f2);
}
#define ADD2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_mul_##SUF(LD, f2); \
    TMP = _mm_add_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
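/*
 * Illustration only: ADD2NS is the classic axpy update.  With the broadcast
 * scalar called a (placeholder name), it is equivalent to
 *
 *     for (unsigned long i = 0; i < sz; ++i)
 *         res[i] += a * v1[i];
 *
 * SUB2NS below subtracts the scaled vector instead, and SUB2RS stores the
 * reversed difference a*v1[i] - res[i].
 */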
#define SUB2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_mul_##SUF(LD, f2); \
    TMP = _mm_sub_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define SUB2RS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_mul_##SUF(LD, f2); \
    LD = _mm_sub_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define ADD3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    LD = _mm_load##UNA2##_##SUF(v2); \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_mul_##SUF(LD, f2); \
    TMP = _mm_add_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define SUB3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    LD = _mm_load##UNA2##_##SUF(v2); \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_mul_##SUF(LD, f2); \
    TMP = _mm_sub_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define ADD3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_load##UNA2##_##SUF(v2); \
    LD = _mm_mul_##SUF(LD, f2); \
    TMP = _mm_add_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define SUB3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_load##UNA2##_##SUF(v2); \
    LD = _mm_mul_##SUF(LD, f2); \
    LD = _mm_sub_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
#define ADD3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_load##UNA2##_##SUF(v2); \
    LD = _mm_mul_##SUF(LD, f1); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    LD = _mm_add_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
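/*
 * Illustration only: the *SS kernels form a linear combination of two operands
 * with two broadcast scalars (prepared by SIMD_2CONST_*_PREP as f1 and f2).
 * With placeholder names a and b, ADD3SS corresponds to
 *
 *     for (unsigned long i = 0; i < sz; ++i)
 *         res[i] = a * v1[i] + b * v2[i];
 *
 * and SUB3SS below uses a minus instead of the plus.
 */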
#define SUB3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_load##UNA2##_##SUF(v2); \
    LD = _mm_mul_##SUF(LD, f1); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    LD = _mm_sub_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

#define ADD2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _mm_load_##SUF(r); \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_mul_##SUF(LD, f2); \
    TMP = _mm_add_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define SUB2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _mm_load_##SUF(r); \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_mul_##SUF(LD, f2); \
    LD = _mm_sub_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

    SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

    SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define ADD2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _mm_load_##SUF(r); \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_mul_##SUF(LD, f1); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    LD = _mm_add_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

#define SUB2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _mm_load_##SUF(r); \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_mul_##SUF(LD, f1); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    LD = _mm_sub_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

#define ADD2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, f1); \
    TMP = _mm_add_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

#define SUB2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, f1); \
    TMP = _mm_sub_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

#define ADD1SV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_mul_##SUF(TMP, f1); \
    TMP = _mm_add_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

#define SUB1SV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_mul_##SUF(TMP, f1); \
    TMP = _mm_sub_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
template <>
inline void do_val_svc_add<double>(const unsigned long sz, double *const res,
                                   const double *const v1,
                                   const double f1, const double f2)
{
    do_svc_val_add<double>(sz, res, v1, f2, f1);
}

template <>
inline void do_val_svc_add<float>(const unsigned long sz, float *const res,
                                  const float *const v1,
                                  const float f1, const float f2)
{
    do_svc_val_add<float>(sz, res, v1, f2, f1);
}
#define SUB2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    TMP = _mm_sub_##SUF(f1, TMP); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

#define DIV2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    TMP = _mm_div_##SUF(f1, TMP); \
    _MM_STORE(r, TMP, SUF,)

    SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

    SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
/* Build a sign-bit mask so negation can be done with one XOR per vector. */
#ifdef HAVE_LONG_LONG
#define NEG_DOUBLE_PREP \
    static union _negmask { \
        unsigned LONG_LONG lng[2]; \
        double dbl[2]; \
    } ALIGN(16) negmask = { {0x8000000000000000ULL, 0x8000000000000000ULL}, }; \
    __m128d neg = _mm_load_pd(negmask.dbl)
#else
#define NEG_DOUBLE_PREP \
    static union _negmask { \
        unsigned int lng[4]; \
        double dbl[2]; \
    } ALIGN(16) negmask = { {0x0U, 0x80000000U, 0x0U, 0x80000000U}, }; \
    __m128d neg = _mm_load_pd(negmask.dbl)
#endif

#define NEG_FLOAT_PREP \
    static union _negmask { \
        unsigned int itg[4]; \
        float flt[4]; \
    } ALIGN(16) negmask = { {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}, }; \
    __m128 neg = _mm_load_ps(negmask.flt)

#define _mm_xor_sd _mm_xor_pd
#define _mm_xor_ss _mm_xor_ps

#define NEG2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_xor_##SUF(TMP, neg); \
    _MM_STORE(r, TMP, SUF,)

    NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,

    NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,

#define NEG1_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_xor_##SUF(TMP, neg); \
    _MM_STORE(r, TMP, SUF,)

    NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,

    NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
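/*
 * Illustration only: IEEE-754 negation is a sign-bit flip, so XORing each
 * element with a mask whose only set bit is the sign bit negates a whole
 * vector at once.  Scalar equivalent, with placeholder names:
 *
 *     for (unsigned long i = 0; i < sz; ++i)
 *         res[i] = -v1[i];
 *
 * Note that the little-endian int initializer {0x0U, 0x80000000U, ...} places
 * 0x8000000000000000 in each 64-bit lane, matching the long long variant.
 */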
#define VL_PREP(x) long f2 = (x)
#define VL_FIN(x)  x = f2

#define _mm_movemask_sd(x) \
    _mm_movemask_pd(x); rg &= 0x1
#define _mm_movemask_ss(x) \
    _mm_movemask_ps(x); rg &= 0x1

#define COMP2_SIMD(r,v1,f1,f2,SUF,UNA) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load_##SUF(v1); \
    TMP = _mm_cmpneq_##SUF(TMP, LD); \
    rg = _mm_movemask_##SUF(TMP); \
    if (rg) { ++f2; goto _fin; }

    VL_PREP, SIMD_EMPTY0, VL_FIN,

    VL_PREP, SIMD_EMPTY0, VL_FIN,
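/*
 * Illustration only: _mm_cmpneq_pd/_mm_cmpneq_ps set every bit of a lane in
 * which the two operands differ, and _mm_movemask_pd/_mm_movemask_ps collapse
 * the lane sign bits into an integer.  A nonzero mask therefore means "at
 * least one element in this vector differs"; the kernel then bumps the
 * difference flag f2 and leaves early through the _fin label supplied by the
 * surrounding wrapper macro.
 */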
#define DECL_DOUBLE __m128d TM2
#define DECL_FLOAT  __m128 TM2

#define SUMMULT3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_load##UNA2##_##SUF(v2); \
    TM2 = _mm_load_##SUF(r); \
    TMP = _mm_mul_##SUF(TMP, LD); \
    TM2 = _mm_add_##SUF(TM2, TMP); \
    _MM_STORE(r, TM2, SUF,)

    DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,

    DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,

    DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,

    DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
/* For real types the conjugate variant of r += v1*v2 is the plain one. */
template <>
inline void do_add_vec_vec_cmul<double>(const unsigned long sz, double *const r,
                                        const double *const v1, const double *const v2)
{
    do_add_vec_vec_mul<double>(sz, r, v1, v2);
}

template <>
inline void do_add_vec_vec_cmul<float>(const unsigned long sz, float *const r,
                                       const float *const v1, const float *const v2)
{
    do_add_vec_vec_mul<float>(sz, r, v1, v2);
}
#ifndef TBCI_NO_SIMD_SUM

#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
# warning Info: Using unrolled SSE2 vector kernels for sums (reductions)
#endif

/* Plain accumulators: the running sum lives in f2. */
#define SUM_DOUBLE_PREP(x) REGISTER __m128d f2 = _mm_set_sd(x)
#define SUM_FLOAT_PREP(x)  REGISTER __m128 f2 = _mm_set_ss(x)

/* Compensated accumulators: f1 carries the accumulated rounding error. */
#define XSUM_DOUBLE_PREP(x) \
    REGISTER __m128d f1 = _mm_setzero_pd(); \
    REGISTER __m128d f2 = _mm_set_sd(x)
#define XSUM_FLOAT_PREP(x) \
    REGISTER __m128 f1 = _mm_setzero_ps(); \
    REGISTER __m128 f2 = _mm_set_ss(x)

/* Horizontal reduction of the partial sums into lane 0. */
#if defined(HAVE_PMMINTRIN_H) && defined(__SSE3__)
# define SUM_DOUBLE_SIMD_FINX(f) \
    f = _mm_hadd_pd(f, f)
# define SUM_FLOAT_SIMD_FINX(f) \
    f = _mm_hadd_ps(f, f); \
    f = _mm_hadd_ps(f, f)
#else
# define SUM_DOUBLE_SIMD_FINX(f) \
    __m128d TM##f = f; \
    TM##f = _mm_unpackhi_pd(TM##f, f); \
    f = _mm_add_sd(f, TM##f)
# define SUM_FLOAT_SIMD_FINX(f) \
    __m128 TM##f = f; \
    TM##f = _mm_shuffle_ps(TM##f, f, 0xb1); \
    f = _mm_add_ps(f, TM##f); \
    TM##f = f; \
    TM##f = _mm_shuffle_ps(TM##f, f, 0x1b); \
    f = _mm_add_ss(f, TM##f)
# if defined(__GNUC__) && defined(WARN_SSE)
#  warning Not using SSE3 -- consider passing -msse3
# endif
#endif

#define SUM_DOUBLE_SIMD_FIN SUM_DOUBLE_SIMD_FINX(f2)
#define SUM_FLOAT_SIMD_FIN  SUM_FLOAT_SIMD_FINX(f2)

#define SUM_DOUBLE_FINAL(x) \
    _mm_store_sd(&x, f2)
#define SUM_FLOAT_FINAL(x) \
    _mm_store_ss(&x, f2)

#define _mm_move_ps(f, x) x
#define _mm_move_pd(f, x) x

#define XSUM_DOUBLE_SIMD_FIN_STORE \

#define XSUM_FLOAT_SIMD_FIN_STORE \

#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE(x) \
    SUM_DOUBLE_SIMD_FINX(f2); \
    SUM_DOUBLE_SIMD_FINX(f1); \
    f2 = _mm_sub_sd(f2, f1); \
    _mm_store_sd(&x, f2)
#define XSUM_FLOAT_SIMD_FINAL_COMPLETE(x) \
    SUM_FLOAT_SIMD_FINX(f2); \
    SUM_FLOAT_SIMD_FINX(f1); \
    f2 = _mm_sub_ss(f2, f1); \
    _mm_store_ss(&x, f2)

#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X(x) \
    SUM_DOUBLE_SIMD_FINX(f2); \
    COR = _mm_sub_sd(COR, TMP); \
    TMP = _mm_unpackhi_pd(TMP, TMP); \
    COR = _mm_sub_sd(COR, TMP); \
    f1 = _mm_add_sd(f1, COR); \
    SUM_DOUBLE_SIMD_FINX(f1); \
    f2 = _mm_sub_sd(f2, f1); \
    _mm_store_sd(&x, f2)

#define MULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, LD); \
    f2 = _mm_add_##SUF(f2, TMP)

    SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,

    SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
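/*
 * Illustration only: MULT2_SIMD is the per-step body of a dot product.  The
 * wrapper keeps two (pd) or four (ps) partial sums in f2, i.e. roughly
 * (placeholder names start, r_i, v1_i, result):
 *
 *     __m128d f2 = _mm_set_sd(start);                    // partial sums
 *     for (... two doubles per step ...)
 *         f2 = _mm_add_pd(f2, _mm_mul_pd(r_i, v1_i));
 *     SUM_DOUBLE_SIMD_FINX(f2);                          // collapse lanes into lane 0
 *     _mm_store_sd(&result, f2);
 *
 * The FINX step uses _mm_hadd_* when SSE3 is available and an unpack/shuffle
 * plus add sequence otherwise.
 */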
#define XMULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, LD); \
    LD = _mm_move_##SUF(LD, TMP); \
    TMP = _mm_add_##SUF(TMP, f2); \
    TMP = _mm_sub_##SUF(TMP, f2); \
    TMP = _mm_sub_##SUF(TMP, LD); \
    f1 = _mm_add_##SUF(f1, TMP); \
    f2 = _mm_move_##SUF(f2, t)

    XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
    XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,

    XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
    XSUM_FLOAT_SIMD_FINAL_COMPLETE,
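/*
 * Illustration only: the XSUM/XMULT kernels use Kahan-style compensated
 * accumulation.  Per element (scalar sketch, placeholder names):
 *
 *     double t = sum + y;       // y = next element or product
 *     err += (t - sum) - y;     // rounding error of this addition
 *     sum  = t;
 *     ...
 *     result = sum - err;       // applied by XSUM_*_SIMD_FINAL_COMPLETE*
 *
 * The compensated kernels keep the correction term in f1 alongside the running
 * sum f2, whereas the plain SUM/MULT2 kernels above skip the extra work.
 */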
template <>
inline void do_vec_dot_exact<double>(const unsigned long sz, const double *const _v1,
                                     const double *const _v2, double& _f2)
{
    do_vec_mult_exact<double>(sz, _v1, _v2, _f2);
}

template <>
inline void do_vec_dot_quick<double>(const unsigned long sz, const double *const _v1,
                                     const double *const _v2, double& _f2)
{
    do_vec_mult_quick<double>(sz, _v1, _v2, _f2);
}

template <>
inline void do_vec_dot_exact<float>(const unsigned long sz, const float *const _v1,
                                    const float *const _v2, float& _f2)
{
    do_vec_mult_exact<float>(sz, _v1, _v2, _f2);
}

template <>
inline void do_vec_dot_quick<float>(const unsigned long sz, const float *const _v1,
                                    const float *const _v2, float& _f2)
{
    do_vec_mult_quick<float>(sz, _v1, _v2, _f2);
}
#define SQR1_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_mul_##SUF(TMP, TMP); \
    f2 = _mm_add_##SUF(f2, TMP)

    SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,

    SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,

#define XSQR1_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_mul_##SUF(TMP, TMP); \
    TMP = _mm_add_##SUF(TMP, f2); \
    TMP = _mm_sub_##SUF(TMP, f2); \
    TMP = _mm_sub_##SUF(TMP, y); \
    f1 = _mm_add_##SUF(f1, TMP); \
    f2 = _mm_move_##SUF(f2, t)

    XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
    XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,

    XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
    XSUM_FLOAT_SIMD_FINAL_COMPLETE,
#ifndef TBCI_NO_SIMD_FABSSQR
/* For real types |x|^2 == x*x, so these forward to the sum-of-squares kernels. */
template <>
inline void do_vec_fabssqr_quick<double>(const unsigned long sz,
                                         const double *const _v1,
                                         double& _f2)
{
    do_vec_sumsqr_quick<double>(sz, _v1, F2);
}

template <>
inline void do_vec_fabssqr_exact<double>(const unsigned long sz,
                                         const double *const _v1,
                                         double& _f2)
{
    do_vec_sumsqr_exact<double>(sz, _v1, F2);
}
#endif // TBCI_NO_SIMD_FABSSQR

#ifdef TBCI_SIMD_FABSSQR_FLOAT // The loss of precision with float is unbearable
template <>
inline void do_vec_fabssqr_quick<float>(const unsigned long sz,
                                        const float *const _v1,
                                        double& _f2)
{
    do_vec_sumsqr_quick<float>(sz, _v1, F2);
}

template <>
inline void do_vec_fabssqr_exact<float>(const unsigned long sz,
                                        const float *const _v1,
                                        double& _f2)
{
    do_vec_sumsqr_exact<float>(sz, _v1, F2);
}
#endif // TBCI_SIMD_FABSSQR_FLOAT

#define SUM1_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    f2 = _mm_add_##SUF(f2, TMP)

    SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,

    SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,

#define XSUM1_SIMD(r,f1,f2,SUF) \
    y = _mm_load_##SUF(r); \
    t = _mm_add_##SUF(f2, y); \
    TMP = _mm_sub_##SUF(t, f2); \
    TMP = _mm_sub_##SUF(TMP, y); \
    f1 = _mm_add_##SUF(f1, TMP); \
    f2 = _mm_move_##SUF(f2, t)

    XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
    XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,

    XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
    XSUM_FLOAT_SIMD_FINAL_COMPLETE,
#endif // TBCI_SIMD_SUM

#endif // TBCI_SELECTIVE_INST

#endif // H_VEC_KERN_SPECIAL_H

Macros
    #define VKERN_TEMPL_3V_CC_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
    #define VKERN_TEMPL_1V_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
    #define VKERN_TEMPL_1V_T_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
    #define VKERN_TEMPL_3V_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
        Todo: check whether enabling the non-unrolled fixup (loop tail) is beneficial.
    #define VKERN_TEMPL_2V_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
    #define VKERN_TEMPL_1V_CC_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
    #define VKERN_TEMPL_2V_T(FNAME, OP2, TYPE)
        Operations of type TYPE = VEC OP VEC.
    #define VKERN_TEMPL_2V_C_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
    #define XMULT2(r, v1, f1, f2)
    #define VKERN_TEMPL_3V_SIMD_UA(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
        Without the unaligned warning.
    #define VKERN_TEMPL_2V_T_SIMD_VL(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
    #define VKERN_TEMPL_3V_C_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
    #define VKERN_TEMPL_2V_CC_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
    #define VKERN_TEMPL_2V_T_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
    #define MULT2(r, v1, f1, f2)
    #define VKERN_TEMPL_1V_C_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)

Functions
    void _tbci_fill(const unsigned long sz, T *const res, register typename tbci_traits< T >::loop_const_refval_type f2)
    void do_vv_comp(const unsigned long sz, const T *const v1, const T *const v2, volatile long &_f2)
        f2 = number of differences vec, vec.
    void _tbci_copy(const unsigned long sz, T *const res, const T *const v1)