// Unrolled SSE2/SSE3 vector kernel specialisations (x86 / x86-64 only).
// NOTE(review): this extract is incomplete -- the embedded original line
// numbers jump (e.g. 10 -> 64), so interior lines are missing throughout
// and several constructs below are partial.  Confirm against the original.
9#ifndef H_VEC_KERN_SPECIAL_H
10#define H_VEC_KERN_SPECIAL_H
// Entire header is active only on SSE2-capable x86 targets when the
// configure-time checks (emmintrin.h, weak attributes) succeeded.
64#if defined(__SSE2__) && defined(HAVE_EMMINTRIN_H) && defined(HAVE_WEAK_ATTR) && \
65 ( defined(__x86_64__) || defined(__i386__) )
// SSE3 horizontal-add intrinsics are pulled in only when available.
69#if defined(HAVE_PMMINTRIN_H) && defined(__SSE3__)
70# include <pmmintrin.h>
75#include "tbci/unroll_prefetch_simd_def.h"
80# include "vec_kern_special_gd.h"
// Optional build-time notice that the SIMD kernels are in use.
85#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
86# warning Info: Using unrolled SSE2 vector kernels
// No-op placeholders for the PREP / SFIN / FIN slots of the kernel
// templates; arity-0/1/2 variants.
94#define SIMD_EMPTY0 do {} while (0)
95#define SIMD_EMPTY1(x) do {} while (0)
96#define SIMD_EMPTY2(x,y) do {} while (0)
// Broadcast one or two scalar constants into SSE registers f2 (and f1).
98#define SIMD_CONST_DOUBLE_PREP(x) REGISTER __m128d f2 = _mm_set1_pd(x)
99#define SIMD_2CONST_DOUBLE_PREP(x,y) REGISTER __m128d f1 = _mm_set1_pd(x), f2 = _mm_set1_pd(y)
101#define SIMD_CONST_FLOAT_PREP(x) REGISTER __m128 f2 = _mm_set1_ps(x)
102#define SIMD_2CONST_FLOAT_PREP(x,y) REGISTER __m128 f1 = _mm_set1_ps(x), f2 = _mm_set1_ps(y)
// NOTE(review): __GNUC_MAJOR__ is not a macro GCC defines; inside #if an
// undefined identifier evaluates to 0, so this branch effectively selects
// GCC 4.0.0 -- presumably a workaround for that compiler.  Both visible
// _MM_STORE definitions are identical; the lines that distinguished them
// (original 111/113, likely #else/#endif) are missing from this extract.
107#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MAJOR__ == 0 && \
108 __GNUC_MINOR__ == 0 && \
109 (! defined(__GNUC_PATCHLEVEL__) || __GNUC_PATCHLEVEL__ == 0)
110# define _MM_STORE(mem, reg, SUF, UNA) \
112 _mm_store##UNA##_##SUF(mem, reg)
114# define _MM_STORE(mem, reg, SUF, UNA) \
115 _mm_store##UNA##_##SUF(mem, reg)
// Single-element (sd/ss) loads need no alignment; alias the unaligned
// names onto the plain loads so the UNA token pasting also works for the
// scalar loop tail.
119#define _mm_loadu_sd _mm_load_sd
120#define _mm_loadu_ss _mm_load_ss
// r = v1 (vector copy).  TMP is presumably an __m128/__m128d temporary
// declared by the enclosing VKERN_TEMPL_* expansion -- not visible here.
129#define COPY2_SIMD(r,v1,f1,f2,SUF,UNA1) \
130 TMP = _mm_load##UNA1##_##SUF(v1); \
131 _MM_STORE(r, TMP, SUF,)
// Orphaned PREP/SFIN/FIN argument rows of truncated VKERN_TEMPL_*
// instantiations; the surrounding invocation lines are missing.
133 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
136 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
// r = f2 (fill with the broadcast constant).
147#define FILL1_SIMD(r,f1,f2,SUF) \
148 _MM_STORE(r, f2, SUF,)
150 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
153 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r = v1 + v2, elementwise.
161#define ADD3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
162 TMP = _mm_load##UNA1##_##SUF(v1); \
163 LD = _mm_load##UNA2##_##SUF(v2); \
164 TMP = _mm_add_##SUF(TMP, LD); \
165 _MM_STORE(r, TMP, SUF,)
167 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
170 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
// r = v1 - v2, elementwise.
175#define SUB3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
176 TMP = _mm_load##UNA1##_##SUF(v1); \
177 LD = _mm_load##UNA2##_##SUF(v2); \
178 TMP = _mm_sub_##SUF(TMP, LD); \
179 _MM_STORE(r, TMP, SUF,)
181 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
184 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
// r = v1 * v2, elementwise.
189#define MUL3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
190 TMP = _mm_load##UNA1##_##SUF(v1); \
191 LD = _mm_load##UNA2##_##SUF(v2); \
192 TMP = _mm_mul_##SUF(TMP, LD); \
193 _MM_STORE(r, TMP, SUF,)
195 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
198 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
// NOTE(review): the specialisations below are truncated by extraction
// (parameter lists and bodies are partial).  They appear to forward the
// complex-conjugate variants (cmul/cdiv) to the plain elementwise
// kernels, which is valid for real element types -- confirm upstream.
201template <>
inline void do_vec_vec_cmul<double>(
const unsigned long sz,
205 do_vec_vec_mul<double>(sz,
res, v1, v2);
207template <>
inline void do_vec_vec_cmul<float>(
const unsigned long sz,
211 do_vec_vec_mul<float>(sz,
res, v1, v2);
// r = v1 / v2, elementwise.
216#define DIV3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
217 TMP = _mm_load##UNA1##_##SUF(v1); \
218 LD = _mm_load##UNA2##_##SUF(v2); \
219 TMP = _mm_div_##SUF(TMP, LD); \
220 _MM_STORE(r, TMP, SUF,)
222 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
225 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
// Truncated conjugate-division forwarders (partial in this extract).
228template <>
inline void do_vec_vec_cdiv<double>(
const unsigned long sz,
232 do_vec_vec_div<double>(sz,
res, v1, v2);
234template <>
inline void do_vec_vec_cdiv<float>(
const unsigned long sz,
238 do_vec_vec_div<float>(sz,
res, v1, v2);
// In-place two-operand kernels: destination r is loaded aligned, the
// source v1 with alignment suffix UNA1.
// r += v1.
244#define ADD2_SIMD(r,v1,f1,f2,SUF,UNA1) \
245 TMP = _mm_load_##SUF(r); \
246 LD = _mm_load##UNA1##_##SUF(v1); \
247 TMP = _mm_add_##SUF(TMP, LD); \
248 _MM_STORE(r, TMP, SUF,)
250 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
253 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
// r -= v1.
258#define SUB2_SIMD(r,v1,f1,f2,SUF,UNA1) \
259 TMP = _mm_load_##SUF(r); \
260 LD = _mm_load##UNA1##_##SUF(v1); \
261 TMP = _mm_sub_##SUF(TMP, LD); \
262 _MM_STORE(r, TMP, SUF,)
264 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
267 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
// r = v1 - r (reversed-operand in-place subtraction).
272#define SUB2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
273 TMP = _mm_load_##SUF(r); \
274 LD = _mm_load##UNA1##_##SUF(v1); \
275 LD = _mm_sub_##SUF(LD, TMP); \
276 _MM_STORE(r, LD, SUF,)
278 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
281 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
// r *= v1.
286#define MUL2_SIMD(r,v1,f1,f2,SUF,UNA1) \
287 TMP = _mm_load_##SUF(r); \
288 LD = _mm_load##UNA1##_##SUF(v1); \
289 TMP = _mm_mul_##SUF(TMP, LD); \
290 _MM_STORE(r, TMP, SUF,)
292 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
295 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
// Truncated specialisations forwarding conjugate in-place multiply to
// the real kernel (valid for real types) -- partial in this extract.
300template <>
inline void do_vec_cmul_vec<double>(
const unsigned long sz,
303 do_vec_mul_vec<double>(sz,
res, v1);
305template <>
inline void do_vec_cmul_vec<float>(
const unsigned long sz,
308 do_vec_mul_vec<float>(sz,
res, v1);
312template <>
inline void do_vec_cmul_vec_inv<double>(
const unsigned long sz,
315 do_vec_mul_vec<double>(sz,
res, v1);
317template <>
inline void do_vec_cmul_vec_inv<float>(
const unsigned long sz,
320 do_vec_mul_vec<float>(sz,
res, v1);
// r /= v1.
325#define DIV2_SIMD(r,v1,f1,f2,SUF,UNA1) \
326 TMP = _mm_load_##SUF(r); \
327 LD = _mm_load##UNA1##_##SUF(v1); \
328 TMP = _mm_div_##SUF(TMP, LD); \
329 _MM_STORE(r, TMP, SUF,)
331 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
334 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
// r = v1 / r (reversed-operand in-place division).
339#define DIV2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
340 TMP = _mm_load_##SUF(r); \
341 LD = _mm_load##UNA1##_##SUF(v1); \
342 LD = _mm_div_##SUF(LD, TMP); \
343 _MM_STORE(r, LD, SUF,)
345 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
348 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
// Truncated conjugate-division forwarders (partial in this extract).
353template <>
inline void do_vec_cdiv_vec<double>(
const unsigned long sz,
356 do_vec_div_vec<double>(sz,
res, v1);
358template <>
inline void do_vec_cdiv_vec<float>(
const unsigned long sz,
361 do_vec_div_vec<float>(sz,
res, v1);
367template <>
inline void do_vec_cdiv_vec_inv<double>(
const unsigned long sz,
370 do_vec_div_vec_inv<double>(sz,
res, v1);
372template <>
inline void do_vec_cdiv_vec_inv<float>(
const unsigned long sz,
375 do_vec_div_vec_inv<float>(sz,
res, v1);
// Vector-scalar kernels: the scalar is broadcast into f2 by the
// SIMD_CONST_*_PREP slot.  NV = scalar on the right, RV = scalar on the
// left (reversed); 2 = separate destination, 1 = in-place on r.
// r = v1 + f2.
380#define ADD2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
381 TMP = _mm_load##UNA1##_##SUF(v1); \
382 TMP = _mm_add_##SUF(TMP, f2); \
383 _MM_STORE(r, TMP, SUF,)
385 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
388 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r = v1 - f2.
393#define SUB2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
394 TMP = _mm_load##UNA1##_##SUF(v1); \
395 TMP = _mm_sub_##SUF(TMP, f2); \
396 _MM_STORE(r, TMP, SUF,)
398 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
401 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r = v1 * f2.
407#define MUL2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
408 TMP = _mm_load##UNA1##_##SUF(v1); \
409 TMP = _mm_mul_##SUF(TMP, f2); \
410 _MM_STORE(r, TMP, SUF,)
412 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
415 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// Truncated forwarders: scalar+vector commutes, so forward to the
// vector+scalar kernel (partial in this extract).
421template <>
inline void do_val_vec_add<double>(
const unsigned long sz,
425 do_vec_val_add<double>(sz,
res, v1, _f2);
427template <>
inline void do_val_vec_add<float>(
const unsigned long sz,
431 do_vec_val_add<float>(sz,
res, v1, _f2);
// r = f2 - v1 (scalar minus vector).
436#define SUB2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
437 TMP = _mm_load##UNA1##_##SUF(v1); \
438 TMP = _mm_sub_##SUF(f2, TMP); \
439 _MM_STORE(r, TMP, SUF,)
441 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
444 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// Truncated forwarders: scalar*vector commutes (partial in this extract).
449template <>
inline void do_val_vec_mul<double>(
const unsigned long sz,
453 do_vec_val_mul<double>(sz,
res, v1, _f2);
455template <>
inline void do_val_vec_mul<float>(
const unsigned long sz,
459 do_vec_val_mul<float>(sz,
res, v1, _f2);
// r = f2 / v1 (scalar divided by vector).
464#define DIV2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
465 TMP = _mm_load##UNA1##_##SUF(v1); \
466 TMP = _mm_div_##SUF(f2, TMP); \
467 _MM_STORE(r, TMP, SUF,)
469 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
472 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r += f2 (in-place).
481#define ADD1NV_SIMD(r,f1,f2,SUF) \
482 TMP = _mm_load_##SUF(r); \
483 TMP = _mm_add_##SUF(TMP, f2); \
484 _MM_STORE(r, TMP, SUF,)
486 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
489 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r -= f2 (in-place).
494#define SUB1NV_SIMD(r,f1,f2,SUF) \
495 TMP = _mm_load_##SUF(r); \
496 TMP = _mm_sub_##SUF(TMP, f2); \
497 _MM_STORE(r, TMP, SUF,)
499 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
502 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r = f2 - r (in-place, reversed).
507#define SUB1RV_SIMD(r,f1,f2,SUF) \
508 TMP = _mm_load_##SUF(r); \
509 TMP = _mm_sub_##SUF(f2, TMP); \
510 _MM_STORE(r, TMP, SUF,)
512 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
515 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r *= f2 (in-place).
520#define MUL1NV_SIMD(r,f1,f2,SUF) \
521 TMP = _mm_load_##SUF(r); \
522 TMP = _mm_mul_##SUF(TMP, f2); \
523 _MM_STORE(r, TMP, SUF,)
525 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
528 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r /= f2 (in-place).
533#define DIV1NV_SIMD(r,f1,f2,SUF) \
534 TMP = _mm_load_##SUF(r); \
535 TMP = _mm_div_##SUF(TMP, f2); \
536 _MM_STORE(r, TMP, SUF,)
538 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
541 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r = f2 / r (in-place, reversed).
546#define DIV1RV_SIMD(r,f1,f2,SUF) \
547 TMP = _mm_load_##SUF(r); \
548 TMP = _mm_div_##SUF(f2, TMP); \
549 _MM_STORE(r, TMP, SUF,)
551 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
554 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// Truncated forwarders (scalar+vector in place commutes).
559template <>
inline void do_val_add_vec<double>(
const unsigned long sz,
562 do_vec_add_val<double>(sz,
res, _f2);
564template <>
inline void do_val_add_vec<float>(
const unsigned long sz,
567 do_vec_add_val<float>(sz,
res, _f2);
// Scaled (AXPY-style) kernels.  Naming pattern (as evidenced by the
// operand order below): N = unscaled operand, S = operand scaled by a
// broadcast constant (f2, or f1 and f2 for the SS variants).
// r += f2 * v1.
579#define ADD2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
580 LD = _mm_load##UNA1##_##SUF(v1); \
581 TMP = _mm_load_##SUF(r); \
582 LD = _mm_mul_##SUF(LD, f2); \
583 TMP = _mm_add_##SUF(TMP, LD); \
584 _MM_STORE(r, TMP, SUF,)
586 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
589 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r -= f2 * v1.
594#define SUB2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
595 LD = _mm_load##UNA1##_##SUF(v1); \
596 TMP = _mm_load_##SUF(r); \
597 LD = _mm_mul_##SUF(LD, f2); \
598 TMP = _mm_sub_##SUF(TMP, LD); \
599 _MM_STORE(r, TMP, SUF,)
601 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
604 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r = f2 * v1 - r (reversed).
609#define SUB2RS_SIMD(r,v1,f1,f2,SUF,UNA1) \
610 LD = _mm_load##UNA1##_##SUF(v1); \
611 TMP = _mm_load_##SUF(r); \
612 LD = _mm_mul_##SUF(LD, f2); \
613 LD = _mm_sub_##SUF(LD, TMP); \
614 _MM_STORE(r, LD, SUF,)
616 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
619 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r = v1 + f2 * v2.
624#define ADD3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
625 LD = _mm_load##UNA2##_##SUF(v2); \
626 TMP = _mm_load##UNA1##_##SUF(v1); \
627 LD = _mm_mul_##SUF(LD, f2); \
628 TMP = _mm_add_##SUF(TMP, LD); \
629 _MM_STORE(r, TMP, SUF,)
631 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
634 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r = v1 - f2 * v2.
639#define SUB3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
640 LD = _mm_load##UNA2##_##SUF(v2); \
641 TMP = _mm_load##UNA1##_##SUF(v1); \
642 LD = _mm_mul_##SUF(LD, f2); \
643 TMP = _mm_sub_##SUF(TMP, LD); \
644 _MM_STORE(r, TMP, SUF,)
646 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
649 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r = f2 * v1 + v2.
655#define ADD3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
656 LD = _mm_load##UNA1##_##SUF(v1); \
657 TMP = _mm_load##UNA2##_##SUF(v2); \
658 LD = _mm_mul_##SUF(LD, f2); \
659 TMP = _mm_add_##SUF(TMP, LD); \
660 _MM_STORE(r, TMP, SUF,)
662 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
665 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r = f2 * v1 - v2.
670#define SUB3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
671 LD = _mm_load##UNA1##_##SUF(v1); \
672 TMP = _mm_load##UNA2##_##SUF(v2); \
673 LD = _mm_mul_##SUF(LD, f2); \
674 LD = _mm_sub_##SUF(LD, TMP); \
675 _MM_STORE(r, LD, SUF,)
677 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
680 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r = f1 * v1 + f2 * v2 (both operands scaled).
686#define ADD3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
687 LD = _mm_load##UNA1##_##SUF(v1); \
688 TMP = _mm_load##UNA2##_##SUF(v2); \
689 LD = _mm_mul_##SUF(LD, f1); \
690 TMP = _mm_mul_##SUF(TMP, f2); \
691 LD = _mm_add_##SUF(LD, TMP); \
692 _MM_STORE(r, LD, SUF,)
694 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
697 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
// r = f1 * v1 - f2 * v2.
702#define SUB3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
703 LD = _mm_load##UNA1##_##SUF(v1); \
704 TMP = _mm_load##UNA2##_##SUF(v2); \
705 LD = _mm_mul_##SUF(LD, f1); \
706 TMP = _mm_mul_##SUF(TMP, f2); \
707 LD = _mm_sub_##SUF(LD, TMP); \
708 _MM_STORE(r, LD, SUF,)
710 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
713 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
// Scaled in-place kernels where the destination r itself is scaled.
// r = v1 + f2 * r.
719#define ADD2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
720 LD = _mm_load_##SUF(r); \
721 TMP = _mm_load##UNA1##_##SUF(v1); \
722 LD = _mm_mul_##SUF(LD, f2); \
723 TMP = _mm_add_##SUF(TMP, LD); \
724 _MM_STORE(r, TMP, SUF,)
726 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
729 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r = f2 * r - v1.
734#define SUB2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
735 LD = _mm_load_##SUF(r); \
736 TMP = _mm_load##UNA1##_##SUF(v1); \
737 LD = _mm_mul_##SUF(LD, f2); \
738 LD = _mm_sub_##SUF(LD, TMP); \
739 _MM_STORE(r, LD, SUF,)
741 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
744 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
// r = f1 * r + f2 * v1.
749#define ADD2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
750 LD = _mm_load_##SUF(r); \
751 TMP = _mm_load##UNA1##_##SUF(v1); \
752 LD = _mm_mul_##SUF(LD, f1); \
753 TMP = _mm_mul_##SUF(TMP, f2); \
754 LD = _mm_add_##SUF(LD, TMP); \
755 _MM_STORE(r, LD, SUF,)
757 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
760 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
// r = f1 * r - f2 * v1.
765#define SUB2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
766 LD = _mm_load_##SUF(r); \
767 TMP = _mm_load##UNA1##_##SUF(v1); \
768 LD = _mm_mul_##SUF(LD, f1); \
769 TMP = _mm_mul_##SUF(TMP, f2); \
770 LD = _mm_sub_##SUF(LD, TMP); \
771 _MM_STORE(r, LD, SUF,)
773 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
776 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
// r = f1 * v1 + f2 (scale then shift).
782#define ADD2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
783 TMP = _mm_load##UNA1##_##SUF(v1); \
784 TMP = _mm_mul_##SUF(TMP, f1); \
785 TMP = _mm_add_##SUF(TMP, f2); \
786 _MM_STORE(r, TMP, SUF,)
788 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
791 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
// r = f1 * v1 - f2.
796#define SUB2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
797 TMP = _mm_load##UNA1##_##SUF(v1); \
798 TMP = _mm_mul_##SUF(TMP, f1); \
799 TMP = _mm_sub_##SUF(TMP, f2); \
800 _MM_STORE(r, TMP, SUF,)
802 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
805 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
// r = f1 * r + f2 (in-place scale then shift).
811#define ADD1SV_SIMD(r,f1,f2,SUF) \
812 TMP = _mm_load_##SUF(r); \
813 TMP = _mm_mul_##SUF(TMP, f1); \
814 TMP = _mm_add_##SUF(TMP, f2); \
815 _MM_STORE(r, TMP, SUF,)
817 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
820 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
// r = f1 * r - f2.
825#define SUB1SV_SIMD(r,f1,f2,SUF) \
826 TMP = _mm_load_##SUF(r); \
827 TMP = _mm_mul_##SUF(TMP, f1); \
828 TMP = _mm_sub_##SUF(TMP, f2); \
829 _MM_STORE(r, TMP, SUF,)
831 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
834 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
// Truncated forwarders; note the swapped scalar order (f2, f1) in the
// forwarded call -- partial in this extract, confirm upstream.
840template <>
inline void do_val_svc_add<double>(
const unsigned long sz,
844 do_svc_val_add<double>(sz,
res, v1, f2, f1);
846template <>
inline void do_val_svc_add<float>(
const unsigned long sz,
850 do_svc_val_add<float>(sz,
res, v1, f2, f1);
// r = f1 - f2 * v1.
855#define SUB2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
856 TMP = _mm_load##UNA1##_##SUF(v1); \
857 TMP = _mm_mul_##SUF(TMP, f2); \
858 TMP = _mm_sub_##SUF(f1, TMP); \
859 _MM_STORE(r, TMP, SUF,)
861 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
864 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
// r = f1 / (f2 * v1).
869#define DIV2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
870 TMP = _mm_load##UNA1##_##SUF(v1); \
871 TMP = _mm_mul_##SUF(TMP, f2); \
872 TMP = _mm_div_##SUF(f1, TMP); \
873 _MM_STORE(r, TMP, SUF,)
875 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
878 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
// Negation masks: IEEE-754 sign bits set in every lane; XOR with the
// mask flips the sign.  Two NEG_DOUBLE_PREP variants (presumably chosen
// by a 64-bit-literal availability #if whose lines are missing here).
// NOTE(review): the union member read by _mm_load_pd (negmask.dbl /
// negmask.flt) is declared on lines missing from this extract.
887#define NEG_DOUBLE_PREP \
888 static union _negmask { \
889 unsigned LONG_LONG lng[2]; \
892 } ALIGN(16) negmask = { {0x8000000000000000ULL, 0x8000000000000000ULL}, }; \
893 __m128d neg = _mm_load_pd(negmask.dbl)
// Variant building the same mask from 32-bit halves (little-endian:
// the 0x80000000 word is the high half of each double).
895#define NEG_DOUBLE_PREP \
896 static union _negmask { \
897 unsigned int lng[4]; \
900 } ALIGN(16) negmask = { {0x0U, 0x80000000U, 0x0U, 0x80000000U}, }; \
901 __m128d neg = _mm_load_pd(negmask.dbl)
903#define NEG_FLOAT_PREP \
904 static union _negmask { \
905 unsigned int itg[4]; \
908 } ALIGN(16) negmask = { {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}, }; \
909 __m128 neg = _mm_load_ps(negmask.flt)
// No scalar xor intrinsics exist; full-width xor is harmless for the
// single-element tail.
912#define _mm_xor_sd _mm_xor_pd
913#define _mm_xor_ss _mm_xor_ps
// r = -v1 (sign-bit flip via XOR with the mask above).
917#define NEG2_SIMD(r,v1,f1,f2,SUF,UNA1) \
918 TMP = _mm_load##UNA1##_##SUF(v1); \
919 TMP = _mm_xor_##SUF(TMP, neg); \
920 _MM_STORE(r, TMP, SUF,)
922 NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
925 NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
// r = -r (in-place negation).
930#define NEG1_SIMD(r,f1,f2,SUF) \
931 TMP = _mm_load_##SUF(r); \
932 TMP = _mm_xor_##SUF(TMP, neg); \
933 _MM_STORE(r, TMP, SUF,)
935 NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
938 NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
// Comparison kernel support: f2 counts mismatches (long); the sd/ss
// movemask wrappers mask the result to lane 0 via the appended rg &= 0x1.
945#define VL_PREP(x) long f2 = (x)
946#define VL_FIN(x) x = f2
947#define _mm_movemask_sd(x) \
948 _mm_movemask_pd(x); rg &= 0x1
949#define _mm_movemask_ss(x) \
950 _mm_movemask_ps(x); rg &= 0x1
// Elementwise inequality test: on any mismatching lane, bump f2 and
// bail out via goto _fin (label supplied by the enclosing template).
// NOTE(review): original line 955 is missing from this extract.
951#define COMP2_SIMD(r,v1,f1,f2,SUF,UNA) \
952 TMP = _mm_load_##SUF(r); \
953 LD = _mm_load_##SUF(v1); \
954 TMP = _mm_cmpneq_##SUF(TMP, LD); \
956 rg = _mm_movemask_##SUF(TMP); \
957 if (rg) { ++f2; goto _fin; }
959 VL_PREP, SIMD_EMPTY0, VL_FIN,
962 VL_PREP, SIMD_EMPTY0, VL_FIN,
// Extra temporary for the three-operand multiply-accumulate kernel.
967#define DECL_DOUBLE __m128d TM2
968#define DECL_FLOAT __m128 TM2
// r += v1 * v2 (elementwise multiply-accumulate into r).
971#define SUMMULT3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
972 TMP = _mm_load##UNA1##_##SUF(v1); \
973 LD = _mm_load##UNA2##_##SUF(v2); \
974 TM2 = _mm_load_##SUF(r); \
975 TMP = _mm_mul_##SUF(TMP, LD); \
976 TM2 = _mm_add_##SUF(TM2, TMP); \
977 _MM_STORE(r, TM2, SUF,)
980 DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
983 DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
987 DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
990 DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
// Truncated conjugate multiply-accumulate forwarders.
995template <>
inline void do_add_vec_vec_cmul<double>(
const unsigned long sz,
999 do_add_vec_vec_mul<double>(sz, r, v1, v2);
1001template <>
inline void do_add_vec_vec_cmul<float>(
const unsigned long sz,
1005 do_add_vec_vec_mul<float>(sz, r, v1, v2);
// ---- Reductions (sums, dot products) ----
1031#ifndef TBCI_NO_SIMD_SUM
1033#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
1034# warning Info: Using unrolled SSE2 vector kernels for sums (reductions)
// Accumulator preps: f2 starts with the caller's initial value in lane 0.
1037#define SUM_DOUBLE_PREP(x) REGISTER __m128d f2 = _mm_set_sd(x)
1038#define SUM_FLOAT_PREP(x) REGISTER __m128 f2 = _mm_set_ss(x)
// X-variants add a zeroed f1 used as the compensation (error)
// accumulator for the Kahan-style summation kernels below.
1040#define XSUM_DOUBLE_PREP(x) \
1041 REGISTER __m128d f1 = _mm_setzero_pd();\
1042 REGISTER __m128d f2 = _mm_set_sd(x)
1043#define XSUM_FLOAT_PREP(x) \
1044 REGISTER __m128 f1 = _mm_setzero_ps(); \
1045 REGISTER __m128 f2 = _mm_set_ss(x)
// Horizontal reduction of a register into lane 0 -- SSE3 path uses hadd.
1053# define SUM_DOUBLE_SIMD_FINX(f) \
1054 f = _mm_hadd_pd(f, f)
1055# define SUM_FLOAT_SIMD_FINX(f) \
1056 f = _mm_hadd_ps(f, f); \
1057 f = _mm_hadd_ps(f, f)
// SSE2 fallback via unpack/shuffle.
1059# define SUM_DOUBLE_SIMD_FINX(f) \
1060 __m128d TM##f = f; \
1061 TM##f = _mm_unpackhi_pd(TM##f, f); \
1062 f = _mm_add_sd(f, TM##f)
// NOTE(review): original lines 1064 and 1067 (presumably declaring /
// refreshing TM##f) are missing from this extract.
1063# define SUM_FLOAT_SIMD_FINX(f) \
1065 TM##f = _mm_shuffle_ps(TM##f, f, 0xb1); \
1066 f = _mm_add_ps(f, TM##f); \
1068 TM##f = _mm_shuffle_ps(TM##f, f, 0x1b); \
1069 f = _mm_add_ss(f, TM##f)
// Advise enabling SSE3 for the cheaper hadd path.
1070# if defined(__GNUC__) && defined(WARN_SSE)
1071# warning Not using SSE3 -- consider passing -msse3
1075#define SUM_DOUBLE_SIMD_FIN SUM_DOUBLE_SIMD_FINX(f2)
1076#define SUM_FLOAT_SIMD_FIN SUM_FLOAT_SIMD_FINX(f2)
// Write lane 0 of the accumulator back to the scalar result.
1078#define SUM_DOUBLE_FINAL(x) \
1079 _mm_store_sd(&x, f2)
1080#define SUM_FLOAT_FINAL(x) \
1081 _mm_store_ss(&x, f2)
// Trivial "move" used by the compensated kernels: result is just x.
1085#define _mm_move_ps(f, x) x
1086#define _mm_move_pd(f, x) x
// NOTE(review): the bodies of the two FIN_STORE macros (original lines
// 1097-1100 and 1102-1108) are missing from this extract.
1096#define XSUM_DOUBLE_SIMD_FIN_STORE \
1101#define XSUM_FLOAT_SIMD_FIN_STORE \
1109#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE(x) \
1112 SUM_DOUBLE_SIMD_FINX(f2); \
1113 SUM_DOUBLE_SIMD_FINX(f1); \
1114 f2 = _mm_sub_sd(f2, f1); \
1115 _mm_store_sd(&x, f2)
// Float finalisation: reduce sum and correction, subtract, store.
// (Lines 1117-1120 are missing from this extract.)
1116#define XSUM_FLOAT_SIMD_FINAL_COMPLETE(x) \
1121 SUM_FLOAT_SIMD_FINX(f2); \
1122 SUM_FLOAT_SIMD_FINX(f1); \
1123 f2 = _mm_sub_ss(f2, f1); \
1124 _mm_store_ss(&x, f2)
// Extended double finalisation folding an extra correction term COR
// (declarations on missing lines 1128-1132) into f1 before the subtract.
1127#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X(x) \
1131 SUM_DOUBLE_SIMD_FINX(f2); \
1133 COR = _mm_sub_sd(COR, TMP); \
1134 TMP = _mm_unpackhi_pd(TMP, TMP); \
1135 COR = _mm_sub_sd(COR, TMP); \
1136 f1 = _mm_add_sd(f1, COR); \
1137 SUM_DOUBLE_SIMD_FINX(f1); \
1138 f2 = _mm_sub_sd(f2, f1); \
1139 _mm_store_sd(&x, f2)
// Dot-product step: f2 += r * v1 (per-lane accumulate; reduced at FIN).
1147#define MULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
1148 TMP = _mm_load_##SUF(r); \
1149 LD = _mm_load##UNA1##_##SUF(v1); \
1150 TMP = _mm_mul_##SUF(TMP, LD); \
1151 f2 = _mm_add_##SUF(f2, TMP)
1153 SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
1156 SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
// Compensated (Kahan-style) dot-product step: the round-off of each
// addition is collected in f1.  NOTE(review): original line 1166 is
// missing; the t used below is presumably declared there or in the
// PREP/FIN_STORE machinery -- confirm upstream.
1160#define XMULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
1161 TMP = _mm_load_##SUF(r); \
1162 LD = _mm_load##UNA1##_##SUF(v1); \
1163 TMP = _mm_mul_##SUF(TMP, LD); \
1164 LD = _mm_move_##SUF(LD, TMP); \
1165 TMP = _mm_add_##SUF(TMP, f2); \
1167 TMP = _mm_sub_##SUF(TMP, f2); \
1168 TMP = _mm_sub_##SUF(TMP, LD); \
1169 f1 = _mm_add_##SUF(f1, TMP); \
1170 f2 = _mm_move_##SUF(f2, t)
1172 XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
1173 XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,
1176 XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
1177 XSUM_FLOAT_SIMD_FINAL_COMPLETE,
// Truncated dot-product forwarders: "exact" uses the compensated kernel,
// "quick" the plain one (partial in this extract).
1181template <>
inline void do_vec_dot_exact<double>(
const unsigned long sz,
1185 do_vec_mult_exact<double>(sz, _v1, _v2, _f2);
1188template <>
inline void do_vec_dot_quick<double>(
const unsigned long sz,
1192 do_vec_mult_quick<double>(sz, _v1, _v2, _f2);
1195template <>
inline void do_vec_dot_exact<float>(
const unsigned long sz,
1199 do_vec_mult_exact<float>(sz, _v1, _v2, _f2);
1202template <>
inline void do_vec_dot_quick<float>(
const unsigned long sz,
1206 do_vec_mult_quick<float>(sz, _v1, _v2, _f2);
// Sum-of-squares step: f2 += r * r.
1217#define SQR1_SIMD(r,f1,f2,SUF) \
1218 TMP = _mm_load_##SUF(r); \
1219 TMP = _mm_mul_##SUF(TMP, TMP); \
1220 f2 = _mm_add_##SUF(f2, TMP)
1223 SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
1226 SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
// Compensated sum-of-squares step (error accumulated in f1).
// NOTE(review): original lines 1233 and 1235 are missing; y and t used
// below are presumably set there -- confirm upstream.
1230#define XSQR1_SIMD(r,f1,f2,SUF) \
1231 TMP = _mm_load_##SUF(r); \
1232 TMP = _mm_mul_##SUF(TMP, TMP); \
1234 TMP = _mm_add_##SUF(TMP, f2); \
1236 TMP = _mm_sub_##SUF(TMP, f2); \
1237 TMP = _mm_sub_##SUF(TMP, y); \
1238 f1 = _mm_add_##SUF(f1, TMP); \
1239 f2 = _mm_move_##SUF(f2, t)
1241 XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
1242 XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,
1245 XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
1246 XSUM_FLOAT_SIMD_FINAL_COMPLETE,
// For real types |x|^2 == x^2, so fabssqr forwards to sumsqr.
// (Specialisations truncated by extraction; F2 is presumably an
// adapter around _f2 declared on a missing line -- confirm upstream.)
1250#ifndef TBCI_NO_SIMD_FABSSQR
1251template <>
inline void do_vec_fabssqr_quick<double>(
const unsigned long sz,
1252 const double *
const _v1,
double& _f2)
1255 do_vec_sumsqr_quick<double>(sz, _v1, F2);
1258template <>
inline void do_vec_fabssqr_exact<double>(
const unsigned long sz,
1259 const double *
const _v1,
double& _f2)
1262 do_vec_sumsqr_exact<double>(sz, _v1, F2);
// Float input accumulated into a double result; optional.
1266#ifdef TBCI_SIMD_FABSSQR_FLOAT
1267template <>
inline void do_vec_fabssqr_quick<float>(
const unsigned long sz,
1268 const float *
const _v1,
double& _f2)
1271 do_vec_sumsqr_quick<float>(sz, _v1, F2);
1274template <>
inline void do_vec_fabssqr_exact<float>(
const unsigned long sz,
1275 const float *
const _v1,
double& _f2)
1278 do_vec_sumsqr_exact<float>(sz, _v1, F2);
// Plain sum step: f2 += r.
1285#define SUM1_SIMD(r,f1,f2,SUF) \
1286 TMP = _mm_load_##SUF(r); \
1287 f2 = _mm_add_##SUF(f2, TMP)
1289 SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
1292 SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
// Compensated (Kahan) sum step: t = f2 + y; err = (t - f2) - y is
// accumulated into f1; f2 takes the new partial sum t.
1297#define XSUM1_SIMD(r,f1,f2,SUF) \
1298 y = _mm_load_##SUF(r); \
1299 t = _mm_add_##SUF(f2, y); \
1300 TMP = _mm_sub_##SUF(t, f2); \
1301 TMP = _mm_sub_##SUF(TMP, y); \
1302 f1 = _mm_add_##SUF(f1, TMP); \
1303 f2 = _mm_move_##SUF(f2, t)
1305 XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
1306 XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,
1309 XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
1310 XSUM_FLOAT_SIMD_FINAL_COMPLETE,
const unsigned TMatrix< T > * res
#define VKERN_TEMPL_2V_T(FNAME, OP2, TYPE)
Operations of type TYPE = VEC OP VEC.
#define VKERN_TEMPL_2V_C_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
#define VKERN_TEMPL_1V_CC_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
#define VKERN_TEMPL_3V_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
TODO: Check whether enabling the non-unrolled fixup (loop tail) is beneficial.
#define VKERN_TEMPL_3V_CC_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
#define VKERN_TEMPL_3V_SIMD_UA(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
Without the unaligned warning.
#define VKERN_TEMPL_1V_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
#define VKERN_TEMPL_1V_C_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
#define VKERN_TEMPL_2V_CC_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
#define VKERN_TEMPL_1V_T_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
#define VKERN_TEMPL_2V_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
#define VKERN_TEMPL_2V_T_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
#define VKERN_TEMPL_3V_C_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
#define VKERN_TEMPL_2V_T_SIMD_VL(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
#define XMULT2(r, v1, f1, f2)
#define MULT2(r, v1, f1, f2)