#ifndef H_VEC_KERN_SPECIAL_H
#define H_VEC_KERN_SPECIAL_H
#if defined(__SSE2__) && defined(HAVE_EMMINTRIN_H) && defined(HAVE_WEAK_ATTR) && \
    ( defined(__x86_64__) || defined(__i386__) )

#include <emmintrin.h>

#if defined(HAVE_PMMINTRIN_H) && defined(__SSE3__)
# include <pmmintrin.h>

#include "tbci/unroll_prefetch_simd_def.h"

#if 0 //defined(TBCI_SELECTIVE_INST) && !defined(TBCI_INSTANTIATE) && !defined(AUTO_DECL)
# include "vec_kern_special_gd.h"

#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
# warning Info: Using unrolled SSE2 vector kernels
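/*
 * SSE2 kernel bodies for the unrolled vector loops.  Each *_SIMD macro below
 * defines only the per-iteration work (loads, one or two arithmetic
 * intrinsics, a store); the loop itself, the unrolling, the prefetching and
 * the scalar loop tail come from the VKERN_TEMPL_* generator macros,
 * presumably provided by tbci/unroll_prefetch_simd_def.h, which are invoked
 * with these bodies further below.
 */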
#define SIMD_EMPTY0      do {} while (0)
#define SIMD_EMPTY1(x)   do {} while (0)
#define SIMD_EMPTY2(x,y) do {} while (0)
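/*
 * PREP macros run once before the unrolled loop.  They broadcast the scalar
 * operand(s) into the SSE registers f2 (and f1 for the two-constant
 * variants); REGISTER presumably expands to "register" or to nothing,
 * depending on the build configuration.
 */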
#define SIMD_CONST_DOUBLE_PREP(x)    REGISTER __m128d f2 = _mm_set1_pd(x)
#define SIMD_2CONST_DOUBLE_PREP(x,y) REGISTER __m128d f1 = _mm_set1_pd(x), f2 = _mm_set1_pd(y)

#define SIMD_CONST_FLOAT_PREP(x)     REGISTER __m128 f2 = _mm_set1_ps(x)
#define SIMD_2CONST_FLOAT_PREP(x,y)  REGISTER __m128 f1 = _mm_set1_ps(x), f2 = _mm_set1_ps(y)
#if defined(__GNUC__) && __GNUC__ == 4 && \
    __GNUC_MINOR__ == 0 && \
    (! defined(__GNUC_PATCHLEVEL__) || __GNUC_PATCHLEVEL__ == 0)
# define _MM_STORE(mem, reg, SUF, UNA) \
    _mm_store##UNA##_##SUF(mem, reg)
#else
# define _MM_STORE(mem, reg, SUF, UNA) \
    _mm_store##UNA##_##SUF(mem, reg)
#endif
#define _mm_loadu_sd _mm_load_sd
#define _mm_loadu_ss _mm_load_ss
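/*
 * Calling convention of the kernel bodies: r points into the result vector,
 * v1/v2 into the source vectors, f1/f2 hold the broadcast scalars, SUF is the
 * intrinsic suffix (pd/ps for the packed loop, sd/ss for the scalar tail),
 * and UNA/UNA1/UNA2 are either empty (aligned access) or "u" (unaligned
 * access).  TMP and LD are scratch registers expected to be declared by the
 * surrounding generator macro.  The aliases above exist because SSE2 has no
 * _mm_loadu_sd/_mm_loadu_ss; the plain scalar loads have no alignment
 * requirement.
 */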
#define COPY2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    _MM_STORE(r, TMP, SUF,)
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

#define FILL1_SIMD(r,f1,f2,SUF) \
    _MM_STORE(r, f2, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
#define ADD3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_load##UNA2##_##SUF(v2); \
    TMP = _mm_add_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

#define SUB3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_load##UNA2##_##SUF(v2); \
    TMP = _mm_sub_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

#define MUL3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_load##UNA2##_##SUF(v2); \
    TMP = _mm_mul_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
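/*
 * Specializations of the "c" kernels (presumably the conjugating variants)
 * for real element types: conjugation is a no-op for double and float, so
 * they forward to the plain kernels, and where the operation is commutative
 * the *_inv variants forward to the very same kernel.
 */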
template <> inline
void do_vec_vec_cmul<double>(const unsigned long sz, double *const res,
                             const double *const v1, const double *const v2)
{
    do_vec_vec_mul<double>(sz, res, v1, v2);
}

template <>
inline void do_vec_vec_cmul<float>(const unsigned long sz, float *const res,
                                   const float *const v1, const float *const v2)
{
    do_vec_vec_mul<float>(sz, res, v1, v2);
}
#define DIV3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_load##UNA2##_##SUF(v2); \
    TMP = _mm_div_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
template <> inline
void do_vec_vec_cdiv<double>(const unsigned long sz, double *const res,
                             const double *const v1, const double *const v2)
{
    do_vec_vec_div<double>(sz, res, v1, v2);
}

template <>
inline void do_vec_vec_cdiv<float>(const unsigned long sz, float *const res,
                                   const float *const v1, const float *const v2)
{
    do_vec_vec_div<float>(sz, res, v1, v2);
}
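/*
 * Two-operand in-place kernels: the destination r is also the first source,
 * i.e. r OP= v1.  The ...2I variants apply the operands in reversed order
 * (r = v1 OP r), which only matters for the non-commutative operations.
 */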
#define ADD2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_add_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

#define SUB2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_sub_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

#define SUB2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_sub_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

#define MUL2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
template <> inline
void do_vec_cmul_vec<double>(const unsigned long sz, double *const res,
                             const double *const v1)
{
    do_vec_mul_vec<double>(sz, res, v1);
}

template <>
inline void do_vec_cmul_vec<float>(const unsigned long sz, float *const res,
                                   const float *const v1)
{
    do_vec_mul_vec<float>(sz, res, v1);
}

template <>
inline void do_vec_cmul_vec_inv<double>(const unsigned long sz, double *const res,
                                        const double *const v1)
{
    do_vec_mul_vec<double>(sz, res, v1);
}

template <>
inline void do_vec_cmul_vec_inv<float>(const unsigned long sz, float *const res,
                                       const float *const v1)
{
    do_vec_mul_vec<float>(sz, res, v1);
}
#define DIV2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_div_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,

#define DIV2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_div_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
        SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
template <> inline
void do_vec_cdiv_vec<double>(const unsigned long sz, double *const res,
                             const double *const v1)
{
    do_vec_div_vec<double>(sz, res, v1);
}

template <>
inline void do_vec_cdiv_vec<float>(const unsigned long sz, float *const res,
                                   const float *const v1)
{
    do_vec_div_vec<float>(sz, res, v1);
}

template <>
inline void do_vec_cdiv_vec_inv<double>(const unsigned long sz, double *const res,
                                        const double *const v1)
{
    do_vec_div_vec_inv<double>(sz, res, v1);
}

template <>
inline void do_vec_cdiv_vec_inv<float>(const unsigned long sz, float *const res,
                                       const float *const v1)
{
    do_vec_div_vec_inv<float>(sz, res, v1);
}
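/*
 * Vector-scalar kernels.  The NV ("numeric value") variants combine every
 * element of v1 with the broadcast scalar in f2 (r = v1 OP f2); the RV
 * variants reverse the operand order (r = f2 OP v1).
 */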
#define ADD2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_add_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define SUB2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_sub_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define MUL2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
template <> inline
void do_val_vec_add<double>(const unsigned long sz, double *const res,
                            const double _f2, const double *const v1)
{
    do_vec_val_add<double>(sz, res, v1, _f2);
}

template <>
inline void do_val_vec_add<float>(const unsigned long sz, float *const res,
                                  const float _f2, const float *const v1)
{
    do_vec_val_add<float>(sz, res, v1, _f2);
}
#define SUB2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_sub_##SUF(f2, TMP); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
template <> inline
void do_val_vec_mul<double>(const unsigned long sz, double *const res,
                            const double _f2, const double *const v1)
{
    do_vec_val_mul<double>(sz, res, v1, _f2);
}

template <>
inline void do_val_vec_mul<float>(const unsigned long sz, float *const res,
                                  const float _f2, const float *const v1)
{
    do_vec_val_mul<float>(sz, res, v1, _f2);
}
#define DIV2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_div_##SUF(f2, TMP); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
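/*
 * One-operand scalar kernels: every element of r is updated in place with
 * the broadcast scalar in f2 (r OP= f2, or r = f2 OP r for the RV variants).
 */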
#define ADD1NV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_add_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define SUB1NV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_sub_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define SUB1RV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_sub_##SUF(f2, TMP); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define MUL1NV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define DIV1NV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_div_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define DIV1RV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_div_##SUF(f2, TMP); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
template <> inline
void do_val_add_vec<double>(const unsigned long sz, double *const res,
                            const double _f2)
{
    do_vec_add_val<double>(sz, res, _f2);
}

template <>
inline void do_val_add_vec<float>(const unsigned long sz, float *const res,
                                  const float _f2)
{
    do_vec_add_val<float>(sz, res, _f2);
}
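/*
 * Scaled kernels ("S" apparently marks a scaled operand, "N" an unscaled
 * one).  They fuse the multiplication by the broadcast scalar(s) with the
 * addition or subtraction, e.g. ADD2NS computes r += f2 * v1 (an axpy-style
 * update), ADD3SN computes r = v2 + f2 * v1, and ADD3SS computes
 * r = f1 * v1 + f2 * v2.
 */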
#define ADD2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_mul_##SUF(LD, f2); \
    TMP = _mm_add_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define SUB2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_mul_##SUF(LD, f2); \
    TMP = _mm_sub_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define SUB2RS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_mul_##SUF(LD, f2); \
    LD = _mm_sub_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define ADD3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
    LD = _mm_load##UNA2##_##SUF(v2); \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_mul_##SUF(LD, f2); \
    TMP = _mm_add_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define SUB3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
    LD = _mm_load##UNA2##_##SUF(v2); \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_mul_##SUF(LD, f2); \
    TMP = _mm_sub_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define ADD3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_load##UNA2##_##SUF(v2); \
    LD = _mm_mul_##SUF(LD, f2); \
    TMP = _mm_add_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define SUB3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_load##UNA2##_##SUF(v2); \
    LD = _mm_mul_##SUF(LD, f2); \
    LD = _mm_sub_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define ADD3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_load##UNA2##_##SUF(v2); \
    LD = _mm_mul_##SUF(LD, f1); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    LD = _mm_add_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

#define SUB3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_load##UNA2##_##SUF(v2); \
    LD = _mm_mul_##SUF(LD, f1); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    LD = _mm_sub_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

#define ADD2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _mm_load_##SUF(r); \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_mul_##SUF(LD, f2); \
    TMP = _mm_add_##SUF(TMP, LD); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define SUB2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _mm_load_##SUF(r); \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_mul_##SUF(LD, f2); \
    LD = _mm_sub_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

        SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
        SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,

#define ADD2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _mm_load_##SUF(r); \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_mul_##SUF(LD, f1); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    LD = _mm_add_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

#define SUB2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    LD = _mm_load_##SUF(r); \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_mul_##SUF(LD, f1); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    LD = _mm_sub_##SUF(LD, TMP); \
    _MM_STORE(r, LD, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
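/*
 * Scale-and-shift kernels: the SV variants apply an affine update, e.g.
 * ADD2SV computes r = f1 * v1 + f2 and ADD1SV computes r = f1 * r + f2.
 * The VS variants scale first and combine from the left, e.g. SUB2VS
 * computes r = f1 - f2 * v1 and DIV2VS computes r = f1 / (f2 * v1).
 */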
#define ADD2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, f1); \
    TMP = _mm_add_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

#define SUB2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, f1); \
    TMP = _mm_sub_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

#define ADD1SV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_mul_##SUF(TMP, f1); \
    TMP = _mm_add_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

#define SUB1SV_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_mul_##SUF(TMP, f1); \
    TMP = _mm_sub_##SUF(TMP, f2); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
template <> inline
void do_val_svc_add<double>(const unsigned long sz, double *const res,
                            const double f1, const double *const v1,
                            const double f2)
{
    do_svc_val_add<double>(sz, res, v1, f2, f1);
}

template <>
inline void do_val_svc_add<float>(const unsigned long sz, float *const res,
                                  const float f1, const float *const v1,
                                  const float f2)
{
    do_svc_val_add<float>(sz, res, v1, f2, f1);
}
#define SUB2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    TMP = _mm_sub_##SUF(f1, TMP); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,

#define DIV2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, f2); \
    TMP = _mm_div_##SUF(f1, TMP); \
    _MM_STORE(r, TMP, SUF,)

        SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
        SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
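/*
 * Negation kernels.  negmask holds the IEEE sign bit of every lane of an SSE
 * register; XORing a value with it flips the signs of all lanes.  The union
 * is 16-byte aligned so it can be fetched with the aligned _mm_load_pd /
 * _mm_load_ps.
 */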
#ifdef HAVE_LONG_LONG
#define NEG_DOUBLE_PREP \
    static union _negmask { \
        unsigned LONG_LONG lng[2]; \
        double dbl[2]; \
    } ALIGN(16) negmask = { {0x8000000000000000ULL, 0x8000000000000000ULL}, }; \
    __m128d neg = _mm_load_pd(negmask.dbl)
#else
#define NEG_DOUBLE_PREP \
    static union _negmask { \
        unsigned int lng[4]; \
        double dbl[2]; \
    } ALIGN(16) negmask = { {0x0U, 0x80000000U, 0x0U, 0x80000000U}, }; \
    __m128d neg = _mm_load_pd(negmask.dbl)
#endif

#define NEG_FLOAT_PREP \
    static union _negmask { \
        unsigned int itg[4]; \
        float flt[4]; \
    } ALIGN(16) negmask = { {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}, }; \
    __m128 neg = _mm_load_ps(negmask.flt)

#define _mm_xor_sd _mm_xor_pd
#define _mm_xor_ss _mm_xor_ps
#define NEG2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_xor_##SUF(TMP, neg); \
    _MM_STORE(r, TMP, SUF,)

        NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
        NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,

#define NEG1_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_xor_##SUF(TMP, neg); \
    _MM_STORE(r, TMP, SUF,)

        NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
        NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
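/*
 * Element comparison kernel.  Each step compares one register of both
 * vectors; _mm_movemask_* collapses the per-lane compare mask into an
 * integer, any set bit increments the difference counter f2 and jumps to the
 * _fin label (presumably the non-unrolled fixup path).  The sd/ss movemask
 * wrappers below mask off the upper lanes for the scalar loop tail.
 */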
#define VL_PREP(x) long f2 = (x)
#define VL_FIN(x) x = f2
#define _mm_movemask_sd(x) \
    _mm_movemask_pd(x); rg &= 0x1
#define _mm_movemask_ss(x) \
    _mm_movemask_ps(x); rg &= 0x1
#define COMP2_SIMD(r,v1,f1,f2,SUF,UNA) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load_##SUF(v1); \
    TMP = _mm_cmpneq_##SUF(TMP, LD); \
    rg = _mm_movemask_##SUF(TMP); \
    if (rg) { ++f2; goto _fin; }

        VL_PREP, SIMD_EMPTY0, VL_FIN,
        VL_PREP, SIMD_EMPTY0, VL_FIN,
#define DECL_DOUBLE __m128d TM2
#define DECL_FLOAT __m128 TM2

#define SUMMULT3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
    TMP = _mm_load##UNA1##_##SUF(v1); \
    LD = _mm_load##UNA2##_##SUF(v2); \
    TM2 = _mm_load_##SUF(r); \
    TMP = _mm_mul_##SUF(TMP, LD); \
    TM2 = _mm_add_##SUF(TM2, TMP); \
    _MM_STORE(r, TM2, SUF,)

        DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
        DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
        DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
        DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
template <>
inline void do_add_vec_vec_cmul<double>(const unsigned long sz, double *const r,
                                        const double *const v1, const double *const v2)
{
    do_add_vec_vec_mul<double>(sz, r, v1, v2);
}

template <>
inline void do_add_vec_vec_cmul<float>(const unsigned long sz, float *const r,
                                       const float *const v1, const float *const v2)
{
    do_add_vec_vec_mul<float>(sz, r, v1, v2);
}
#ifndef TBCI_NO_SIMD_SUM

#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
# warning Info: Using unrolled SSE2 vector kernels for sums (reductions)

#define SUM_DOUBLE_PREP(x) REGISTER __m128d f2 = _mm_set_sd(x)
#define SUM_FLOAT_PREP(x)  REGISTER __m128 f2 = _mm_set_ss(x)

#define XSUM_DOUBLE_PREP(x) \
    REGISTER __m128d f1 = _mm_setzero_pd();\
    REGISTER __m128d f2 = _mm_set_sd(x)
#define XSUM_FLOAT_PREP(x) \
    REGISTER __m128 f1 = _mm_setzero_ps(); \
    REGISTER __m128 f2 = _mm_set_ss(x)
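/*
 * Reduction preparation.  The plain SUM preps seed the running sum f2 with
 * the incoming scalar; the XSUM ("exact") preps additionally zero f1, which
 * accumulates a Kahan-style correction term alongside the running sum.
 */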
#if defined(HAVE_PMMINTRIN_H) && defined(__SSE3__)
# define SUM_DOUBLE_SIMD_FINX(f) \
    f = _mm_hadd_pd(f, f)
# define SUM_FLOAT_SIMD_FINX(f) \
    f = _mm_hadd_ps(f, f); \
    f = _mm_hadd_ps(f, f)
#else
# define SUM_DOUBLE_SIMD_FINX(f) \
    __m128d TM##f = f; \
    TM##f = _mm_unpackhi_pd(TM##f, f); \
    f = _mm_add_sd(f, TM##f)
# define SUM_FLOAT_SIMD_FINX(f) \
    __m128 TM##f = f; \
    TM##f = _mm_shuffle_ps(TM##f, f, 0xb1); \
    f = _mm_add_ps(f, TM##f); \
    TM##f = f; \
    TM##f = _mm_shuffle_ps(TM##f, f, 0x1b); \
    f = _mm_add_ss(f, TM##f)
# if defined(__GNUC__) && defined(WARN_SSE)
# warning Not using SSE3 -- consider passing -msse3
# endif
#endif
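/*
 * SUM_*_SIMD_FINX folds all lanes of a register into lane 0 at the end of
 * the unrolled loop: with SSE3 one or two hadd instructions suffice,
 * otherwise the same reduction is done with unpack/shuffle plus adds.
 */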
#define SUM_DOUBLE_SIMD_FIN SUM_DOUBLE_SIMD_FINX(f2)
#define SUM_FLOAT_SIMD_FIN  SUM_FLOAT_SIMD_FINX(f2)

#define SUM_DOUBLE_FINAL(x) \
    _mm_store_sd(&x, f2)
#define SUM_FLOAT_FINAL(x) \
    _mm_store_ss(&x, f2)

#define _mm_move_ps(f, x) x
#define _mm_move_pd(f, x) x
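/*
 * _mm_move_ps and _mm_move_pd are not intrinsics: they are local helpers
 * that simply evaluate to their second argument, so assignments of the new
 * running sum can be written uniformly; for the scalar tail (SUF = sd/ss)
 * the real _mm_move_sd/_mm_move_ss intrinsics are picked instead, which
 * replace only the low lane and preserve the rest of the accumulator.
 */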
#define XSUM_DOUBLE_SIMD_FIN_STORE \

#define XSUM_FLOAT_SIMD_FIN_STORE \

#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE(x) \
    SUM_DOUBLE_SIMD_FINX(f2); \
    SUM_DOUBLE_SIMD_FINX(f1); \
    f2 = _mm_sub_sd(f2, f1); \
    _mm_store_sd(&x, f2)
#define XSUM_FLOAT_SIMD_FINAL_COMPLETE(x) \
    SUM_FLOAT_SIMD_FINX(f2); \
    SUM_FLOAT_SIMD_FINX(f1); \
    f2 = _mm_sub_ss(f2, f1); \
    _mm_store_ss(&x, f2)

#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X(x) \
    SUM_DOUBLE_SIMD_FINX(f2); \
    COR = _mm_sub_sd(COR, TMP); \
    TMP = _mm_unpackhi_pd(TMP, TMP); \
    COR = _mm_sub_sd(COR, TMP); \
    f1 = _mm_add_sd(f1, COR); \
    SUM_DOUBLE_SIMD_FINX(f1); \
    f2 = _mm_sub_sd(f2, f1); \
    _mm_store_sd(&x, f2)
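/*
 * Dot-product kernels: MULT2 accumulates r[i] * v1[i] into the running sum
 * f2, XMULT2 is the compensated variant that also maintains the correction
 * register f1, and SUMMULT3 further above stores r[i] += v1[i] * v2[i] back
 * to memory instead of reducing.
 */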
#define MULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, LD); \
    f2 = _mm_add_##SUF(f2, TMP)

        SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
        SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,

#define XMULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
    TMP = _mm_load_##SUF(r); \
    LD = _mm_load##UNA1##_##SUF(v1); \
    TMP = _mm_mul_##SUF(TMP, LD); \
    LD = _mm_move_##SUF(LD, TMP); \
    TMP = _mm_add_##SUF(TMP, f2); \
    t = _mm_move_##SUF(t, TMP); \
    TMP = _mm_sub_##SUF(TMP, f2); \
    TMP = _mm_sub_##SUF(TMP, LD); \
    f1 = _mm_add_##SUF(f1, TMP); \
    f2 = _mm_move_##SUF(f2, t)
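/*
 * XMULT2 applies the same compensated (Kahan-style) update as XSUM1 below,
 * but to the product y = r[i] * v1[i]:
 *     t   = f2 + y;
 *     f1 += (t - f2) - y;    // rounding error of the addition
 *     f2  = t;
 * The corrected result is recovered as f2 - f1 by the *_FINAL_COMPLETE
 * macros.
 */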
        XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
        XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,
        XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
        XSUM_FLOAT_SIMD_FINAL_COMPLETE,
template <> inline
void do_vec_dot_exact<double>(const unsigned long sz, const double *const _v1,
                              const double *const _v2, double &_f2)
{
    do_vec_mult_exact<double>(sz, _v1, _v2, _f2);
}

template <>
inline void do_vec_dot_quick<double>(const unsigned long sz, const double *const _v1,
                                     const double *const _v2, double &_f2)
{
    do_vec_mult_quick<double>(sz, _v1, _v2, _f2);
}

template <>
inline void do_vec_dot_exact<float>(const unsigned long sz, const float *const _v1,
                                    const float *const _v2, float &_f2)
{
    do_vec_mult_exact<float>(sz, _v1, _v2, _f2);
}

template <>
inline void do_vec_dot_quick<float>(const unsigned long sz, const float *const _v1,
                                    const float *const _v2, float &_f2)
{
    do_vec_mult_quick<float>(sz, _v1, _v2, _f2);
}
#define SQR1_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_mul_##SUF(TMP, TMP); \
    f2 = _mm_add_##SUF(f2, TMP)

        SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
        SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,

#define XSQR1_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    TMP = _mm_mul_##SUF(TMP, TMP); \
    y = _mm_move_##SUF(y, TMP); \
    TMP = _mm_add_##SUF(TMP, f2); \
    t = _mm_move_##SUF(t, TMP); \
    TMP = _mm_sub_##SUF(TMP, f2); \
    TMP = _mm_sub_##SUF(TMP, y); \
    f1 = _mm_add_##SUF(f1, TMP); \
    f2 = _mm_move_##SUF(f2, t)

        XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
        XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,
        XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
        XSUM_FLOAT_SIMD_FINAL_COMPLETE,
#ifndef TBCI_NO_SIMD_FABSSQR
template <>
inline void do_vec_fabssqr_quick<double>(const unsigned long sz,
                                         const double *const _v1, double& _f2)
{
    do_vec_sumsqr_quick<double>(sz, _v1, F2);
}

template <>
inline void do_vec_fabssqr_exact<double>(const unsigned long sz,
                                         const double *const _v1, double& _f2)
{
    do_vec_sumsqr_exact<double>(sz, _v1, F2);
}
#endif // TBCI_NO_SIMD_FABSSQR

#ifdef TBCI_SIMD_FABSSQR_FLOAT // The loss of precision with float is unbearable
template <>
inline void do_vec_fabssqr_quick<float>(const unsigned long sz,
                                        const float *const _v1, double& _f2)
{
    do_vec_sumsqr_quick<float>(sz, _v1, F2);
}

template <>
inline void do_vec_fabssqr_exact<float>(const unsigned long sz,
                                        const float *const _v1, double& _f2)
{
    do_vec_sumsqr_exact<float>(sz, _v1, F2);
}
#endif // TBCI_SIMD_FABSSQR_FLOAT
#define SUM1_SIMD(r,f1,f2,SUF) \
    TMP = _mm_load_##SUF(r); \
    f2 = _mm_add_##SUF(f2, TMP)

        SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
        SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
#define XSUM1_SIMD(r,f1,f2,SUF) \
    y = _mm_load_##SUF(r); \
    t = _mm_add_##SUF(f2, y); \
    TMP = _mm_sub_##SUF(t, f2); \
    TMP = _mm_sub_##SUF(TMP, y); \
    f1 = _mm_add_##SUF(f1, TMP); \
    f2 = _mm_move_##SUF(f2, t)
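/*
 * XSUM1 is the classic compensated-summation recurrence applied lane-wise:
 *     y   = r[i];
 *     t   = f2 + y;
 *     f1 += (t - f2) - y;    // rounding error of the addition
 *     f2  = t;
 * so the corrected sum reported at the end is f2 - f1.
 */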
        XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
        XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,
        XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
        XSUM_FLOAT_SIMD_FINAL_COMPLETE,

#endif // TBCI_NO_SIMD_SUM

#endif // TBCI_SELECTIVE_INST

#endif // H_VEC_KERN_SPECIAL_H