TBCI Numerical high perf. C++ Library  2.8.0
vec_kern_special.h
1 
9 #ifndef H_VEC_KERN_SPECIAL_H
10 #define H_VEC_KERN_SPECIAL_H
11 
64 #if defined(__SSE2__) && defined(HAVE_EMMINTRIN_H) && defined(HAVE_WEAK_ATTR) && \
65  ( defined(__x86_64__) || defined(__i386__) )
66 
67 #include <emmintrin.h>
68 
69 #if defined(HAVE_PMMINTRIN_H) && defined(__SSE3__)
70 # include <pmmintrin.h>
71 #else
72 # undef __SSE3__
73 #endif
74 
75 #include "tbci/unroll_prefetch_simd_def.h"
76 
77 /* TODO: Add a define controlling instantiation */
78 
79 #if 0 //defined(TBCI_SELECTIVE_INST) && !defined(TBCI_INSTANTIATE) && !defined(AUTO_DECL)
80 # include "vec_kern_special_gd.h"
81 #else
82 
84 
85 #if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
86 # warning Info: Using unrolled SSE2 vector kernels
87 #endif
88 
89 // TODO: Would integer kernels be useful as well?
90 // Unfortunately, the SSE integer intrinsics follow a slightly different
91 // naming scheme and systematics, so some manual work would be required.
92 // Maybe later ...
93 
94 #define SIMD_EMPTY0 do {} while (0)
95 #define SIMD_EMPTY1(x) do {} while (0)
96 #define SIMD_EMPTY2(x,y) do {} while (0)
97 
98 #define SIMD_CONST_DOUBLE_PREP(x) REGISTER __m128d f2 = _mm_set1_pd(x)
99 #define SIMD_2CONST_DOUBLE_PREP(x,y) REGISTER __m128d f1 = _mm_set1_pd(x), f2 = _mm_set1_pd(y)
100 
101 #define SIMD_CONST_FLOAT_PREP(x) REGISTER __m128 f2 = _mm_set1_ps(x)
102 #define SIMD_2CONST_FLOAT_PREP(x,y) REGISTER __m128 f1 = _mm_set1_ps(x), f2 = _mm_set1_ps(y)
103 
104 /* First the stuff from basics.h */
105 
106 /* Found a compiler bug in gcc 4.0.0 (PR 21239) */
107 #if defined(__GNUC__) && __GNUC__ == 4 && \
108  __GNUC_MINOR__ == 0 && \
109  (! defined(__GNUC_PATCHLEVEL__) || __GNUC_PATCHLEVEL__ == 0)
110 # define _MM_STORE(mem, reg, SUF, UNA) \
111  asm(""::"x"(reg)); \
112  _mm_store##UNA##_##SUF(mem, reg)
113 #else
114 # define _MM_STORE(mem, reg, SUF, UNA) \
115  _mm_store##UNA##_##SUF(mem, reg)
116 #endif
117 
118 // Unaligned accesses for single loads are harmless ...
119 #define _mm_loadu_sd _mm_load_sd
120 #define _mm_loadu_ss _mm_load_ss
121 
122 /* General policy: We'll always ensure alignment of the result
123  * vector, so no need to use unaligned insns for it. */
124 
125 #ifndef C_MEMALLOC
126 /* Template decl should be in basics.h */
128 //# define COPY2(res,v1,f1,f2) res = v1
129 #define COPY2_SIMD(r,v1,f1,f2,SUF,UNA1) \
130  TMP = _mm_load##UNA1##_##SUF(v1); \
131  _MM_STORE(r, TMP, SUF,)
132 VKERN_TEMPL_2V_SIMD(_tbci_copy, COPY2_SIMD, sd, pd,
133  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
134  2, double, __m128d)
135 VKERN_TEMPL_2V_SIMD(_tbci_copy, COPY2_SIMD, ss, ps,
136  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
137  4, float, __m128)
138 /* We could use more xmm registers here ... but even without that,
139  * the CPU-internal register renaming should hide the latencies. */
140 
141 /* TODO: Benchmark copy and fill ... */
142 /* TODO: Avoid gcc unrolling this even more ... */
143 /* TODO: Here integers and pointers certainly make sense */
144 
146 //# define FILL1(res,f1,f2) res = f2
147 #define FILL1_SIMD(r,f1,f2,SUF) \
148  _MM_STORE(r, f2, SUF,)
149 VKERN_TEMPL_1V_C_SIMD(_tbci_fill, FILL1_SIMD, sd, pd,
150  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
151  2, double, __m128d)
152 VKERN_TEMPL_1V_C_SIMD(_tbci_fill, FILL1_SIMD, ss, ps,
153  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
154  4, float, __m128)
155 /* TODO: Here integers and pointers certainly make sense */
156 #endif
157 
158 /* 3Vec operations */
160 //#define ADD3(r,v1,v2,f1,f2) r = v1 + v2
161 #define ADD3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
162  TMP = _mm_load##UNA1##_##SUF(v1); \
163  LD = _mm_load##UNA2##_##SUF(v2); \
164  TMP = _mm_add_##SUF(TMP, LD); \
165  _MM_STORE(r, TMP, SUF,)
166 VKERN_TEMPL_3V_SIMD(do_vec_vec_add, ADD3_SIMD, sd, pd,
167  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
168  2, double, __m128d)
169 VKERN_TEMPL_3V_SIMD(do_vec_vec_add, ADD3_SIMD, ss, ps,
170  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
171  4, float, __m128)
172 
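/* Illustrative sketch (not part of this header): the ADD3_SIMD step above shows
 * the pattern shared by all of these kernels -- load the operands (which may need
 * unaligned loads, hence the UNA1/UNA2 suffixes), apply the packed operation, and
 * store to the always-aligned result.  Stripped of the unrolling, prefetching and
 * loop preamble/tail that the VKERN_TEMPL_* macros generate, the per-step work
 * corresponds to the loop below; the loop shape, the alignment assumptions and the
 * function name are illustrative assumptions, not library code. */
#if 0 /* illustrative sketch only */
#include <emmintrin.h>

/* r and v1 assumed 16-byte aligned, v2 possibly unaligned, sz a multiple of 2 */
static void sketch_vec_vec_add(unsigned long sz, double *r,
                               const double *v1, const double *v2)
{
    for (unsigned long i = 0; i < sz; i += 2) {
        __m128d a = _mm_load_pd (v1 + i);       /* aligned load            */
        __m128d b = _mm_loadu_pd(v2 + i);       /* possibly unaligned load */
        _mm_store_pd(r + i, _mm_add_pd(a, b));  /* aligned store           */
    }
}
#endif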
174 //#define SUB3(r,v1,v2,f1,f2) r = v1 - v2
175 #define SUB3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
176  TMP = _mm_load##UNA1##_##SUF(v1); \
177  LD = _mm_load##UNA2##_##SUF(v2); \
178  TMP = _mm_sub_##SUF(TMP, LD); \
179  _MM_STORE(r, TMP, SUF,)
180 VKERN_TEMPL_3V_SIMD(do_vec_vec_sub, SUB3_SIMD, sd, pd,
181  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
182  2, double, __m128d)
183 VKERN_TEMPL_3V_SIMD(do_vec_vec_sub, SUB3_SIMD, ss, ps,
184  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
185  4, float, __m128)
186 
188 //#define MUL3(r,v1,v2,f1,f2) r = v1 * v2
189 #define MUL3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
190  TMP = _mm_load##UNA1##_##SUF(v1); \
191  LD = _mm_load##UNA2##_##SUF(v2); \
192  TMP = _mm_mul_##SUF(TMP, LD); \
193  _MM_STORE(r, TMP, SUF,)
194 VKERN_TEMPL_3V_SIMD(do_vec_vec_mul, MUL3_SIMD, sd, pd,
195  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
196  2, double, __m128d)
197 VKERN_TEMPL_3V_SIMD(do_vec_vec_mul, MUL3_SIMD, ss, ps,
198  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
199  4, float, __m128)
200 
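/* For real-valued element types, conjugation is the identity, so the
 * complex-multiply (and, further below, complex-divide) entry points simply
 * forward to the plain kernels. */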
201 template <> inline void do_vec_vec_cmul<double>(const unsigned long sz,
202  double* RESTRICT const res, const double* RESTRICT const v1,
203  const double* RESTRICT const v2)
204 {
205  do_vec_vec_mul<double>(sz, res, v1, v2);
206 }
207 template <> inline void do_vec_vec_cmul<float>(const unsigned long sz,
208  float* RESTRICT const res, const float* RESTRICT const v1,
209  const float* RESTRICT const v2)
210 {
211  do_vec_vec_mul<float>(sz, res, v1, v2);
212 }
213 
215 //#define DIV3(r,v1,v2,f1,f2) r = v1 / v2
216 #define DIV3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
217  TMP = _mm_load##UNA1##_##SUF(v1); \
218  LD = _mm_load##UNA2##_##SUF(v2); \
219  TMP = _mm_div_##SUF(TMP, LD); \
220  _MM_STORE(r, TMP, SUF,)
221 VKERN_TEMPL_3V_SIMD(do_vec_vec_div, DIV3_SIMD, sd, pd,
222  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
223  2, double, __m128d)
224 VKERN_TEMPL_3V_SIMD(do_vec_vec_div, DIV3_SIMD, ss, ps,
225  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
226  4, float, __m128)
227 
228 template <> inline void do_vec_vec_cdiv<double>(const unsigned long sz,
229  double* RESTRICT const res, const double* RESTRICT const v1,
230  const double* RESTRICT const v2)
231 {
232  do_vec_vec_div<double>(sz, res, v1, v2);
233 }
234 template <> inline void do_vec_vec_cdiv<float>(const unsigned long sz,
235  float* RESTRICT const res, const float* RESTRICT const v1,
236  const float* RESTRICT const v2)
237 {
238  do_vec_vec_div<float>(sz, res, v1, v2);
239 }
240 
241 
243 //#define ADD2(r,v1,f1,f2) r += v1
244 #define ADD2_SIMD(r,v1,f1,f2,SUF,UNA1) \
245  TMP = _mm_load_##SUF(r); \
246  LD = _mm_load##UNA1##_##SUF(v1); \
247  TMP = _mm_add_##SUF(TMP, LD); \
248  _MM_STORE(r, TMP, SUF,)
249 VKERN_TEMPL_2V_SIMD(do_vec_add_vec, ADD2_SIMD, sd, pd,
250  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
251  2, double, __m128d)
252 VKERN_TEMPL_2V_SIMD(do_vec_add_vec, ADD2_SIMD, ss, ps,
253  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
254  4, float, __m128)
255 
257 //#define SUB2(r,v1,f1,f2) r -= v1
258 #define SUB2_SIMD(r,v1,f1,f2,SUF,UNA1) \
259  TMP = _mm_load_##SUF(r); \
260  LD = _mm_load##UNA1##_##SUF(v1); \
261  TMP = _mm_sub_##SUF(TMP, LD); \
262  _MM_STORE(r, TMP, SUF,)
263 VKERN_TEMPL_2V_SIMD(do_vec_sub_vec, SUB2_SIMD, sd, pd,
264  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
265  2, double, __m128d)
266 VKERN_TEMPL_2V_SIMD(do_vec_sub_vec, SUB2_SIMD, ss, ps,
267  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
268  4, float, __m128)
269 
271 //#define SUB2I(r,v1,f1,f2) r = v1 - r
272 #define SUB2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
273  TMP = _mm_load_##SUF(r); \
274  LD = _mm_load##UNA1##_##SUF(v1); \
275  LD = _mm_sub_##SUF(LD, TMP); \
276  _MM_STORE(r, LD, SUF,)
277 VKERN_TEMPL_2V_SIMD(do_vec_sub_vec_inv, SUB2I_SIMD, sd, pd,
278  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
279  2, double, __m128d)
280 VKERN_TEMPL_2V_SIMD(do_vec_sub_vec_inv, SUB2I_SIMD, ss, ps,
281  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
282  4, float, __m128)
283 
285 //#define MUL2(r,v1,f1,f2) r *= v1
286 #define MUL2_SIMD(r,v1,f1,f2,SUF,UNA1) \
287  TMP = _mm_load_##SUF(r); \
288  LD = _mm_load##UNA1##_##SUF(v1); \
289  TMP = _mm_mul_##SUF(TMP, LD); \
290  _MM_STORE(r, TMP, SUF,)
291 VKERN_TEMPL_2V_SIMD(do_vec_mul_vec, MUL2_SIMD, sd, pd,
292  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
293  2, double, __m128d)
294 VKERN_TEMPL_2V_SIMD(do_vec_mul_vec, MUL2_SIMD, ss, ps,
295  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
296  4, float, __m128)
297 
299 //#define CMUL2(r,v1,f1,f2) r = CPLX__ conj(r) * v1
300 template <> inline void do_vec_cmul_vec<double>(const unsigned long sz,
301  double* RESTRICT const res, const double* RESTRICT const v1)
302 {
303  do_vec_mul_vec<double>(sz, res, v1);
304 }
305 template <> inline void do_vec_cmul_vec<float>(const unsigned long sz,
306  float* RESTRICT const res, const float* RESTRICT const v1)
307 {
308  do_vec_mul_vec<float>(sz, res, v1);
309 }
311 //#define CMUL2I(r,v1,f1,f2) r *= CPLX__ conj(v1)
312 template <> inline void do_vec_cmul_vec_inv<double>(const unsigned long sz,
313  double* RESTRICT const res, const double* RESTRICT const v1)
314 {
315  do_vec_mul_vec<double>(sz, res, v1);
316 }
317 template <> inline void do_vec_cmul_vec_inv<float>(const unsigned long sz,
318  float* RESTRICT const res, const float* RESTRICT const v1)
319 {
320  do_vec_mul_vec<float>(sz, res, v1);
321 }
322 
324 //#define DIV2(r,v1,f1,f2) r /= v1
325 #define DIV2_SIMD(r,v1,f1,f2,SUF,UNA1) \
326  TMP = _mm_load_##SUF(r); \
327  LD = _mm_load##UNA1##_##SUF(v1); \
328  TMP = _mm_div_##SUF(TMP, LD); \
329  _MM_STORE(r, TMP, SUF,)
330 VKERN_TEMPL_2V_SIMD(do_vec_div_vec, DIV2_SIMD, sd, pd,
331  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
332  2, double, __m128d)
333 VKERN_TEMPL_2V_SIMD(do_vec_div_vec, DIV2_SIMD, ss, ps,
334  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
335  4, float, __m128)
336 
338 //#define DIV2I(r,v1,f1,f2) r = v1 / r
339 #define DIV2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
340  TMP = _mm_load_##SUF(r); \
341  LD = _mm_load##UNA1##_##SUF(v1); \
342  LD = _mm_div_##SUF(LD, TMP); \
343  _MM_STORE(r, LD, SUF,)
344 VKERN_TEMPL_2V_SIMD(do_vec_div_vec_inv, DIV2I_SIMD, sd, pd,
345  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
346  2, double, __m128d)
347 VKERN_TEMPL_2V_SIMD(do_vec_div_vec_inv, DIV2I_SIMD, ss, ps,
348  SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
349  4, float, __m128)
350 
352 //#define CDIV2(r,v1,f1,f2) r = CPLX__ conj(r) / v1
353 template <> inline void do_vec_cdiv_vec<double>(const unsigned long sz,
354  double* RESTRICT const res, const double* RESTRICT const v1)
355 {
356  do_vec_div_vec<double>(sz, res, v1);
357 }
358 template <> inline void do_vec_cdiv_vec<float>(const unsigned long sz,
359  float* RESTRICT const res, const float* RESTRICT const v1)
360 {
361  do_vec_div_vec<float>(sz, res, v1);
362 }
363 
364 
366 //#define CDIV2I(r,v1,f1,f2) r = CPLX__ conj(v1) / r
367 template <> inline void do_vec_cdiv_vec_inv<double>(const unsigned long sz,
368  double* RESTRICT const res, const double* RESTRICT const v1)
369 {
370  do_vec_div_vec_inv<double>(sz, res, v1);
371 }
372 template <> inline void do_vec_cdiv_vec_inv<float>(const unsigned long sz,
373  float* RESTRICT const res, const float* RESTRICT const v1)
374 {
375  do_vec_div_vec_inv<float>(sz, res, v1);
376 }
377 
379 //#define ADD2NV(r,v1,f1,f2) r = v1 + f2
380 #define ADD2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
381  TMP = _mm_load##UNA1##_##SUF(v1); \
382  TMP = _mm_add_##SUF(TMP, f2); \
383  _MM_STORE(r, TMP, SUF,)
384 VKERN_TEMPL_2V_C_SIMD(do_vec_val_add, ADD2NV_SIMD, sd, pd,
385  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
386  2, double, __m128d)
387 VKERN_TEMPL_2V_C_SIMD(do_vec_val_add, ADD2NV_SIMD, ss, ps,
388  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
389  4, float, __m128)
390 
392 //#define SUB2NV(r,v1,f1,f2) r = v1 - f2
393 #define SUB2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
394  TMP = _mm_load##UNA1##_##SUF(v1); \
395  TMP = _mm_sub_##SUF(TMP, f2); \
396  _MM_STORE(r, TMP, SUF,)
397 VKERN_TEMPL_2V_C_SIMD(do_vec_val_sub, SUB2NV_SIMD, sd, pd,
398  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
399  2, double, __m128d)
400 VKERN_TEMPL_2V_C_SIMD(do_vec_val_sub, SUB2NV_SIMD, ss, ps,
401  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
402  4, float, __m128)
403 
405 //#define MUL2NV(r,v1,f1,f2) r = v1 * f2
406 //VKERN_TEMPL_2V_C(do_vec_val_mul, MUL2NV);
407 #define MUL2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
408  TMP = _mm_load##UNA1##_##SUF(v1); \
409  TMP = _mm_mul_##SUF(TMP, f2); \
410  _MM_STORE(r, TMP, SUF,)
411 VKERN_TEMPL_2V_C_SIMD(do_vec_val_mul, MUL2NV_SIMD, sd, pd,
412  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
413  2, double, __m128d)
414 VKERN_TEMPL_2V_C_SIMD(do_vec_val_mul, MUL2NV_SIMD, ss, ps,
415  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
416  4, float, __m128)
417 
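/* Illustrative sketch (not part of this header): the *_C_SIMD kernels combine a
 * constant broadcast once by SIMD_CONST_*_PREP (_mm_set1_pd / _mm_set1_ps) with one
 * packed operation per step.  Without unrolling and loop preamble/tail, the per-step
 * work of do_vec_val_mul corresponds to the loop below; the loop shape and name are
 * assumptions. */
#if 0 /* illustrative sketch only */
#include <emmintrin.h>

/* res assumed 16-byte aligned, v1 possibly unaligned, sz a multiple of 2 */
static void sketch_vec_val_mul(unsigned long sz, double *res,
                               const double *v1, double f2)
{
    const __m128d c = _mm_set1_pd(f2);          /* broadcast the constant once */
    for (unsigned long i = 0; i < sz; i += 2) {
        __m128d a = _mm_loadu_pd(v1 + i);
        _mm_store_pd(res + i, _mm_mul_pd(a, c));
    }
}
#endif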
418 
420 //#define ADD2RV(r,v1,f1,f2) r = f2 + v1
421 template <> inline void do_val_vec_add<double>(const unsigned long sz,
422  double* RESTRICT const res, const double* RESTRICT const v1,
423  LCTYPED(double) _f2)
424 {
425  do_vec_val_add<double>(sz, res, v1, _f2);
426 }
427 template <> inline void do_val_vec_add<float>(const unsigned long sz,
428  float* RESTRICT const res, const float* RESTRICT const v1,
429  LCTYPED(float) _f2)
430 {
431  do_vec_val_add<float>(sz, res, v1, _f2);
432 }
433 
435 //#define SUB2RV(r,v1,f1,f2) r = f2 - v1
436 #define SUB2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
437  TMP = _mm_load##UNA1##_##SUF(v1); \
438  TMP = _mm_sub_##SUF(f2, TMP); \
439  _MM_STORE(r, TMP, SUF,)
440 VKERN_TEMPL_2V_C_SIMD(do_val_vec_sub, SUB2RV_SIMD, sd, pd,
441  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
442  2, double, __m128d)
443 VKERN_TEMPL_2V_C_SIMD(do_val_vec_sub, SUB2RV_SIMD, ss, ps,
444  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
445  4, float, __m128)
446 
448 //#define MUL2RV(r,v1,f1,f2) r = f2 * v1
449 template <> inline void do_val_vec_mul<double>(const unsigned long sz,
450  double* RESTRICT const res, const double* RESTRICT const v1,
451  LCTYPED(double) _f2)
452 {
453  do_vec_val_mul<double>(sz, res, v1, _f2);
454 }
455 template <> inline void do_val_vec_mul<float>(const unsigned long sz,
456  float* RESTRICT const res, const float* RESTRICT const v1,
457  LCTYPED(float) _f2)
458 {
459  do_vec_val_mul<float>(sz, res, v1, _f2);
460 }
461 
463 //#define DIV2RV(r,v1,f1,f2) r = f2 / v1
464 #define DIV2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
465  TMP = _mm_load##UNA1##_##SUF(v1); \
466  TMP = _mm_div_##SUF(f2, TMP); \
467  _MM_STORE(r, TMP, SUF,)
468 VKERN_TEMPL_2V_C_SIMD(do_val_vec_div, DIV2RV_SIMD, sd, pd,
469  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
470  2, double, __m128d)
471 VKERN_TEMPL_2V_C_SIMD(do_val_vec_div, DIV2RV_SIMD, ss, ps,
472  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
473  4, float, __m128)
474 
475 /* The one-vector operations don't need unaligned versions,
476  * as the loop preamble can always advance us to an
477  * aligned position. */
478 
480 //#define ADD1NV(r,f1,f2) r += f2
481 #define ADD1NV_SIMD(r,f1,f2,SUF) \
482  TMP = _mm_load_##SUF(r); \
483  TMP = _mm_add_##SUF(TMP, f2); \
484  _MM_STORE(r, TMP, SUF,)
485 VKERN_TEMPL_1V_C_SIMD(do_vec_add_val, ADD1NV_SIMD, sd, pd,
486  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
487  2, double, __m128d)
488 VKERN_TEMPL_1V_C_SIMD(do_vec_add_val, ADD1NV_SIMD, ss, ps,
489  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
490  4, float, __m128)
491 
493 //#define SUB1NV(r,f1,f2) r -= f2
494 #define SUB1NV_SIMD(r,f1,f2,SUF) \
495  TMP = _mm_load_##SUF(r); \
496  TMP = _mm_sub_##SUF(TMP, f2); \
497  _MM_STORE(r, TMP, SUF,)
498 VKERN_TEMPL_1V_C_SIMD(do_vec_sub_val, SUB1NV_SIMD, sd, pd,
499  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
500  2, double, __m128d)
501 VKERN_TEMPL_1V_C_SIMD(do_vec_sub_val, SUB1NV_SIMD, ss, ps,
502  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
503  4, float, __m128)
504 
506 //#define SUB1RV(r,f1,f2) r = f2 - r
507 #define SUB1RV_SIMD(r,f1,f2,SUF) \
508  TMP = _mm_load_##SUF(r); \
509  TMP = _mm_sub_##SUF(f2, TMP); \
510  _MM_STORE(r, TMP, SUF,)
511 VKERN_TEMPL_1V_C_SIMD(do_val_sub_vec, SUB1RV_SIMD, sd, pd,
512  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
513  2, double, __m128d)
514 VKERN_TEMPL_1V_C_SIMD(do_val_sub_vec, SUB1RV_SIMD, ss, ps,
515  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
516  4, float, __m128)
517 
519 //#define MUL1NV(r,f1,f2) r *= f2
520 #define MUL1NV_SIMD(r,f1,f2,SUF) \
521  TMP = _mm_load_##SUF(r); \
522  TMP = _mm_mul_##SUF(TMP, f2); \
523  _MM_STORE(r, TMP, SUF,)
524 VKERN_TEMPL_1V_C_SIMD(do_vec_mul_val, MUL1NV_SIMD, sd, pd,
525  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
526  2, double, __m128d)
527 VKERN_TEMPL_1V_C_SIMD(do_vec_mul_val, MUL1NV_SIMD, ss, ps,
528  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
529  4, float, __m128)
530 
532 //#define DIV1NV(r,f1,f2) r /= f2
533 #define DIV1NV_SIMD(r,f1,f2,SUF) \
534  TMP = _mm_load_##SUF(r); \
535  TMP = _mm_div_##SUF(TMP, f2); \
536  _MM_STORE(r, TMP, SUF,)
537 VKERN_TEMPL_1V_C_SIMD(do_vec_div_val, DIV1NV_SIMD, sd, pd,
538  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
539  2, double, __m128d)
540 VKERN_TEMPL_1V_C_SIMD(do_vec_div_val, DIV1NV_SIMD, ss, ps,
541  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
542  4, float, __m128)
543 
545 //#define DIV1RV(r,f1,f2) r = f2 / r
546 #define DIV1RV_SIMD(r,f1,f2,SUF) \
547  TMP = _mm_load_##SUF(r); \
548  TMP = _mm_div_##SUF(f2, TMP); \
549  _MM_STORE(r, TMP, SUF,)
550 VKERN_TEMPL_1V_C_SIMD(do_val_div_vec, DIV1RV_SIMD, sd, pd,
551  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
552  2, double, __m128d)
553 VKERN_TEMPL_1V_C_SIMD(do_val_div_vec, DIV1RV_SIMD, ss, ps,
554  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
555  4, float, __m128)
556 
558 //#define ADD1RV(r,f1,f2) r = f2 + r
559 template <> inline void do_val_add_vec<double>(const unsigned long sz,
560  double* RESTRICT const res, LCTYPED(double) _f2)
561 {
562  do_vec_add_val<double>(sz, res, _f2);
563 }
564 template <> inline void do_val_add_vec<float>(const unsigned long sz,
565  float* RESTRICT const res, LCTYPED(float) _f2)
566 {
567  do_vec_add_val<float>(sz, res, _f2);
568 }
569 
570 // unused ...
571 //#define MUL1RV(r,f1,f2) r = f2 * r;
572 
573 
574 
575 /* TSVector stuff */
576 
578 //#define ADD2NS(r,v1,f1,f2) r += f2*v1
579 #define ADD2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
580  LD = _mm_load##UNA1##_##SUF(v1); \
581  TMP = _mm_load_##SUF(r); \
582  LD = _mm_mul_##SUF(LD, f2); \
583  TMP = _mm_add_##SUF(TMP, LD); \
584  _MM_STORE(r, TMP, SUF,)
585 VKERN_TEMPL_2V_C_SIMD(do_vec_add_svc, ADD2NS_SIMD, sd, pd,
586  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
587  2, double, __m128d)
588 VKERN_TEMPL_2V_C_SIMD(do_vec_add_svc, ADD2NS_SIMD, ss, ps,
589  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
590  4, float, __m128)
591 
593 //#define SUB2NS(r,v1,f1,f2) r -= f2*v1
594 #define SUB2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
595  LD = _mm_load##UNA1##_##SUF(v1); \
596  TMP = _mm_load_##SUF(r); \
597  LD = _mm_mul_##SUF(LD, f2); \
598  TMP = _mm_sub_##SUF(TMP, LD); \
599  _MM_STORE(r, TMP, SUF,)
600 VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc, SUB2NS_SIMD, sd, pd,
601  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
602  2, double, __m128d)
603 VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc, SUB2NS_SIMD, ss, ps,
604  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
605  4, float, __m128)
606 
608 //#define SUB2RS(r,v1,f1,f2) r = f2*v1 - r
609 #define SUB2RS_SIMD(r,v1,f1,f2,SUF,UNA1) \
610  LD = _mm_load##UNA1##_##SUF(v1); \
611  TMP = _mm_load_##SUF(r); \
612  LD = _mm_mul_##SUF(LD, f2); \
613  LD = _mm_sub_##SUF(LD, TMP); \
614  _MM_STORE(r, LD, SUF,)
615 VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc_inv, SUB2RS_SIMD, sd, pd,
616  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
617  2, double, __m128d)
618 VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc_inv, SUB2RS_SIMD, ss, ps,
619  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
620  4, float, __m128)
621 
623 //#define ADD3NS(r,v1,v2,f1,f2) r = v1 + f2*v2
624 #define ADD3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
625  LD = _mm_load##UNA2##_##SUF(v2); \
626  TMP = _mm_load##UNA1##_##SUF(v1); \
627  LD = _mm_mul_##SUF(LD, f2); \
628  TMP = _mm_add_##SUF(TMP, LD); \
629  _MM_STORE(r, TMP, SUF,)
630 VKERN_TEMPL_3V_C_SIMD(do_vec_svc_add, ADD3NS_SIMD, sd, pd,
631  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
632  2, double, __m128d)
633 VKERN_TEMPL_3V_C_SIMD(do_vec_svc_add, ADD3NS_SIMD, ss, ps,
634  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
635  4, float, __m128)
636 
638 //#define SUB3NS(r,v1,v2,f1,f2) r = v1 - f2*v2
639 #define SUB3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
640  LD = _mm_load##UNA2##_##SUF(v2); \
641  TMP = _mm_load##UNA1##_##SUF(v1); \
642  LD = _mm_mul_##SUF(LD, f2); \
643  TMP = _mm_sub_##SUF(TMP, LD); \
644  _MM_STORE(r, TMP, SUF,)
645 VKERN_TEMPL_3V_C_SIMD(do_vec_svc_sub, SUB3NS_SIMD, sd, pd,
646  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
647  2, double, __m128d)
648 VKERN_TEMPL_3V_C_SIMD(do_vec_svc_sub, SUB3NS_SIMD, ss, ps,
649  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
650  4, float, __m128)
651 
652 
654 //#define ADD3SN(r,v1,v2,f1,f2) r = f2*v1 + v2
655 #define ADD3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
656  LD = _mm_load##UNA1##_##SUF(v1); \
657  TMP = _mm_load##UNA2##_##SUF(v2); \
658  LD = _mm_mul_##SUF(LD, f2); \
659  TMP = _mm_add_##SUF(TMP, LD); \
660  _MM_STORE(r, TMP, SUF,)
661 VKERN_TEMPL_3V_C_SIMD(do_svc_vec_add, ADD3SN_SIMD, sd, pd,
662  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
663  2, double, __m128d)
664 VKERN_TEMPL_3V_C_SIMD(do_svc_vec_add, ADD3SN_SIMD, ss, ps,
665  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
666  4, float, __m128)
667 
669 //#define SUB3SN(r,v1,v2,f1,f2) r = f2*v1 - v2
670 #define SUB3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
671  LD = _mm_load##UNA1##_##SUF(v1); \
672  TMP = _mm_load##UNA2##_##SUF(v2); \
673  LD = _mm_mul_##SUF(LD, f2); \
674  LD = _mm_sub_##SUF(LD, TMP); \
675  _MM_STORE(r, LD, SUF,)
676 VKERN_TEMPL_3V_C_SIMD(do_svc_vec_sub, SUB3SN_SIMD, sd, pd,
677  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
678  2, double, __m128d)
679 VKERN_TEMPL_3V_C_SIMD(do_svc_vec_sub, SUB3SN_SIMD, ss, ps,
680  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
681  4, float, __m128)
682 
683 
685 //#define ADD3SS(r,v1,v2,f1,f2) r = f1*v1 + f2*v2
686 #define ADD3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
687  LD = _mm_load##UNA1##_##SUF(v1); \
688  TMP = _mm_load##UNA2##_##SUF(v2); \
689  LD = _mm_mul_##SUF(LD, f1); \
690  TMP = _mm_mul_##SUF(TMP, f2); \
691  LD = _mm_add_##SUF(LD, TMP); \
692  _MM_STORE(r, LD, SUF,)
693 VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_add, ADD3SS_SIMD, sd, pd,
694  SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
695  2, double, __m128d)
696 VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_add, ADD3SS_SIMD, ss, ps,
697  SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
698  4, float, __m128)
699 
701 //#define SUB3SS(r,v1,v2,f1,f2) r = f1*v1 - f2*v2
702 #define SUB3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
703  LD = _mm_load##UNA1##_##SUF(v1); \
704  TMP = _mm_load##UNA2##_##SUF(v2); \
705  LD = _mm_mul_##SUF(LD, f1); \
706  TMP = _mm_mul_##SUF(TMP, f2); \
707  LD = _mm_sub_##SUF(LD, TMP); \
708  _MM_STORE(r, LD, SUF,)
709 VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_sub, SUB3SS_SIMD, sd, pd,
710  SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
711  2, double, __m128d)
712 VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_sub, SUB3SS_SIMD, ss, ps,
713  SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
714  4, float, __m128)
715 
716 
718 //#define ADD2SN(r,v1,f1,f2) r = f2*r + v1
719 #define ADD2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
720  LD = _mm_load_##SUF(r); \
721  TMP = _mm_load##UNA1##_##SUF(v1); \
722  LD = _mm_mul_##SUF(LD, f2); \
723  TMP = _mm_add_##SUF(TMP, LD); \
724  _MM_STORE(r, TMP, SUF,)
725 VKERN_TEMPL_2V_C_SIMD(do_svc_add_vec, ADD2SN_SIMD, sd, pd,
726  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
727  2, double, __m128d)
728 VKERN_TEMPL_2V_C_SIMD(do_svc_add_vec, ADD2SN_SIMD, ss, ps,
729  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
730  4, float, __m128)
731 
733 //#define SUB2SN(r,v1,f1,f2) r = f2*r - v1
734 #define SUB2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
735  LD = _mm_load_##SUF(r); \
736  TMP = _mm_load##UNA1##_##SUF(v1); \
737  LD = _mm_mul_##SUF(LD, f2); \
738  LD = _mm_sub_##SUF(LD, TMP); \
739  _MM_STORE(r, LD, SUF,)
740 VKERN_TEMPL_2V_C_SIMD(do_svc_sub_vec, SUB2SN_SIMD, sd, pd,
741  SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
742  2, double, __m128d)
743 VKERN_TEMPL_2V_C_SIMD(do_svc_sub_vec, SUB2SN_SIMD, ss, ps,
744  SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
745  4, float, __m128)
746 
748 //#define ADD2SS(r,v1,f1,f2) r = f1*r + f2*v1
749 #define ADD2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
750  LD = _mm_load_##SUF(r); \
751  TMP = _mm_load##UNA1##_##SUF(v1); \
752  LD = _mm_mul_##SUF(LD, f1); \
753  TMP = _mm_mul_##SUF(TMP, f2); \
754  LD = _mm_add_##SUF(LD, TMP); \
755  _MM_STORE(r, LD, SUF,)
756 VKERN_TEMPL_2V_CC_SIMD(do_svc_add_svc, ADD2SS_SIMD, sd, pd,
757  SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
758  2, double, __m128d)
759 VKERN_TEMPL_2V_CC_SIMD(do_svc_add_svc, ADD2SS_SIMD, ss, ps,
760  SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
761  4, float, __m128)
762 
764 //#define SUB2SS(r,v1,f1,f2) r = f1*r - f2*v1
765 #define SUB2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
766  LD = _mm_load_##SUF(r); \
767  TMP = _mm_load##UNA1##_##SUF(v1); \
768  LD = _mm_mul_##SUF(LD, f1); \
769  TMP = _mm_mul_##SUF(TMP, f2); \
770  LD = _mm_sub_##SUF(LD, TMP); \
771  _MM_STORE(r, LD, SUF,)
772 VKERN_TEMPL_2V_CC_SIMD(do_svc_sub_svc, SUB2SS_SIMD, sd, pd,
773  SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
774  2, double, __m128d)
775 VKERN_TEMPL_2V_CC_SIMD(do_svc_sub_svc, SUB2SS_SIMD, ss, ps,
776  SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
777  4, float, __m128)
778 
779 
781 //#define ADD2SV(r,v1,f1,f2) r = f1*v1 + f2
782 #define ADD2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
783  TMP = _mm_load##UNA1##_##SUF(v1); \
784  TMP = _mm_mul_##SUF(TMP, f1); \
785  TMP = _mm_add_##SUF(TMP, f2); \
786  _MM_STORE(r, TMP, SUF,)
787 VKERN_TEMPL_2V_CC_SIMD(do_svc_val_add, ADD2SV_SIMD, sd, pd,
788  SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
789  2, double, __m128d)
790 VKERN_TEMPL_2V_CC_SIMD(do_svc_val_add, ADD2SV_SIMD, ss, ps,
791  SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
792  4, float, __m128)
793 
795 //#define SUB2SV(r,v1,f1,f2) r = f1*v1 - f2
796 #define SUB2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
797  TMP = _mm_load##UNA1##_##SUF(v1); \
798  TMP = _mm_mul_##SUF(TMP, f1); \
799  TMP = _mm_sub_##SUF(TMP, f2); \
800  _MM_STORE(r, TMP, SUF,)
801 VKERN_TEMPL_2V_CC_SIMD(do_svc_val_sub, SUB2SV_SIMD, sd, pd,
802  SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
803  2, double, __m128d)
804 VKERN_TEMPL_2V_CC_SIMD(do_svc_val_sub, SUB2SV_SIMD, ss, ps,
805  SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
806  4, float, __m128)
807 
808 
810 //#define ADD1SV(r,f1,f2) r = f1*r + f2
811 #define ADD1SV_SIMD(r,f1,f2,SUF) \
812  TMP = _mm_load_##SUF(r); \
813  TMP = _mm_mul_##SUF(TMP, f1); \
814  TMP = _mm_add_##SUF(TMP, f2); \
815  _MM_STORE(r, TMP, SUF,)
816 VKERN_TEMPL_1V_CC_SIMD(do_svc_add_val, ADD1SV_SIMD, sd, pd,
817  SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
818  2, double, __m128d)
819 VKERN_TEMPL_1V_CC_SIMD(do_svc_add_val, ADD1SV_SIMD, ss, ps,
820  SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
821  4, float, __m128)
822 
824 //#define SUB1SV(r,f1,f2) r = f1*r - f2
825 #define SUB1SV_SIMD(r,f1,f2,SUF) \
826  TMP = _mm_load_##SUF(r); \
827  TMP = _mm_mul_##SUF(TMP, f1); \
828  TMP = _mm_sub_##SUF(TMP, f2); \
829  _MM_STORE(r, TMP, SUF,)
830 VKERN_TEMPL_1V_CC_SIMD(do_svc_sub_val, SUB1SV_SIMD, sd, pd,
831  SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
832  2, double, __m128d)
833 VKERN_TEMPL_1V_CC_SIMD(do_svc_sub_val, SUB1SV_SIMD, ss, ps,
834  SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
835  4, float, __m128)
836 
837 
839 //#define ADD2VS(r,v1,f1,f2) r = f1 + f2*v1
840 template <> inline void do_val_svc_add<double>(const unsigned long sz,
841  double* RESTRICT const res, const double* RESTRICT const v1,
842  LCTYPED(double) f1, LCTYPED(double) f2)
843 {
844  do_svc_val_add<double>(sz, res, v1, f2, f1); // note the reverse order!
845 }
846 template <> inline void do_val_svc_add<float>(const unsigned long sz,
847  float* RESTRICT const res, const float* RESTRICT const v1,
848  LCTYPED(float) f1, LCTYPED(float) f2)
849 {
850  do_svc_val_add<float>(sz, res, v1, f2, f1); // note the reverse order!
851 }
852 
854 //#define SUB2VS(r,v1,f1,f2) r = f1 - f2*v1
855 #define SUB2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
856  TMP = _mm_load##UNA1##_##SUF(v1); \
857  TMP = _mm_mul_##SUF(TMP, f2); \
858  TMP = _mm_sub_##SUF(f1, TMP); \
859  _MM_STORE(r, TMP, SUF,)
860 VKERN_TEMPL_2V_CC_SIMD(do_val_svc_sub, SUB2VS_SIMD, sd, pd,
861  SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
862  2, double, __m128d)
863 VKERN_TEMPL_2V_CC_SIMD(do_val_svc_sub, SUB2VS_SIMD, ss, ps,
864  SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
865  4, float, __m128)
866 
868 //#define DIV2VS(r,v1,f1,f2) r = f1 / (f2*v1)
869 #define DIV2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
870  TMP = _mm_load##UNA1##_##SUF(v1); \
871  TMP = _mm_mul_##SUF(TMP, f2); \
872  TMP = _mm_div_##SUF(f1, TMP); \
873  _MM_STORE(r, TMP, SUF,)
874 VKERN_TEMPL_2V_CC_SIMD(do_val_svc_div, DIV2VS_SIMD, sd, pd,
875  SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
876  2, double, __m128d)
877 VKERN_TEMPL_2V_CC_SIMD(do_val_svc_div, DIV2VS_SIMD, ss, ps,
878  SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
879  4, float, __m128)
880 
881 
882 /* ... */
883 
884 
885 /* For negation, use our knowledge of the position of the sign bits */
886 #ifdef HAVE_LONG_LONG
887 #define NEG_DOUBLE_PREP \
888  static union _negmask { \
889  unsigned LONG_LONG lng[2]; \
890  double dbl[2]; \
891  __m128d m128d; \
892  } ALIGN(16) negmask = { {0x8000000000000000ULL, 0x8000000000000000ULL}, }; \
893  __m128d neg = _mm_load_pd(negmask.dbl)
894 #else
895 #define NEG_DOUBLE_PREP \
896  static union _negmask { \
897  unsigned int lng[4]; \
898  double dbl[2]; \
899  __m128d m128d; \
900  } ALIGN(16) negmask = { {0x0U, 0x80000000U, 0x0U, 0x80000000U}, }; \
901  __m128d neg = _mm_load_pd(negmask.dbl)
902 #endif
903 #define NEG_FLOAT_PREP \
904  static union _negmask { \
905  unsigned int itg[4]; \
906  float flt[4]; \
907  __m128 m128s; \
908  } ALIGN(16) negmask = { {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}, }; \
909  __m128 neg = _mm_load_ps(negmask.flt)
910 
911 /* Single-value xor intrinsics don't exist, but xor-ing the full register is just as good ... */
912 #define _mm_xor_sd _mm_xor_pd
913 #define _mm_xor_ss _mm_xor_ps
914 
916 //#define NEG2(r,v1,f1,f2) r = -v1
917 #define NEG2_SIMD(r,v1,f1,f2,SUF,UNA1) \
918  TMP = _mm_load##UNA1##_##SUF(v1); \
919  TMP = _mm_xor_##SUF(TMP, neg); \
920  _MM_STORE(r, TMP, SUF,)
921 VKERN_TEMPL_2V_SIMD(do_vec_neg_vec, NEG2_SIMD, sd, pd,
922  NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
923  2, double, __m128d)
924 VKERN_TEMPL_2V_SIMD(do_vec_neg_vec, NEG2_SIMD, ss, ps,
925  NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
926  4, float, __m128)
927 
929 //#define NEG1(r,f1,f2) r = -r
930 #define NEG1_SIMD(r,f1,f2,SUF) \
931  TMP = _mm_load_##SUF(r); \
932  TMP = _mm_xor_##SUF(TMP, neg); \
933  _MM_STORE(r, TMP, SUF,)
934 VKERN_TEMPL_1V_SIMD(do_vec_neg, NEG1_SIMD, sd, pd,
935  NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
936  2, double, __m128d)
937 VKERN_TEMPL_1V_SIMD(do_vec_neg, NEG1_SIMD, ss, ps,
938  NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
939  4, float, __m128)
940 
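/* Illustrative sketch (not part of this header): the NEG kernels negate by XOR-ing a
 * mask that has only the sign bits set, as prepared by NEG_DOUBLE_PREP / NEG_FLOAT_PREP
 * above.  A standalone rendering of the same trick; building the mask with
 * _mm_set1_pd(-0.0) instead of the static union is an assumption made for brevity. */
#if 0 /* illustrative sketch only */
#include <emmintrin.h>

/* Negate two doubles at once by flipping their sign bits. */
static __m128d sketch_neg_pd(__m128d x)
{
    const __m128d signmask = _mm_set1_pd(-0.0); /* only the sign bit set in each lane */
    return _mm_xor_pd(x, signmask);
}
#endif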
941 
943 //#define COMP2(r,v1,f1,f2) if (r != v1) { ++f2; break; }
944 //VKERN_TEMPL_2V_T(do_vv_comp, COMP2, volatile long);
945 #define VL_PREP(x) long f2 = (x)
946 #define VL_FIN(x) x = f2
947 #define _mm_movemask_sd(x) \
948  _mm_movemask_pd(x); rg &= 0x1
949 #define _mm_movemask_ss(x) \
950  _mm_movemask_ps(x); rg &= 0x1
951 #define COMP2_SIMD(r,v1,f1,f2,SUF,UNA) \
952  TMP = _mm_load_##SUF(r); \
953  LD = _mm_load_##SUF(v1); \
954  TMP = _mm_cmpneq_##SUF(TMP, LD); \
955  /* And now? movmskpd and bt? */ \
956  rg = _mm_movemask_##SUF(TMP); \
957  if (rg) { ++f2; /*fprintf(stderr, "DIFF @ %li: %i\n", sz-i, rg);*/ goto _fin; }
958 VKERN_TEMPL_2V_T_SIMD_VL(do_vv_comp, COMP2_SIMD, sd, pd,
959  VL_PREP, SIMD_EMPTY0, VL_FIN,
960  2, double, __m128d)
961 VKERN_TEMPL_2V_T_SIMD_VL(do_vv_comp, COMP2_SIMD, ss, ps,
962  VL_PREP, SIMD_EMPTY0, VL_FIN,
963  4, float, __m128)
964 
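/* Illustrative sketch (not part of this header): COMP2_SIMD detects differing elements
 * by comparing lanes with _mm_cmpneq_* and collapsing the per-lane masks into an integer
 * bitmask with _mm_movemask_*; any set bit means a mismatch.  A standalone
 * double-precision rendering, with an assumed function name: */
#if 0 /* illustrative sketch only */
#include <emmintrin.h>

/* Returns a nonzero bitmask (one bit per lane) if any lane of a and b differs. */
static int sketch_lanes_differ(__m128d a, __m128d b)
{
    __m128d neq = _mm_cmpneq_pd(a, b);   /* all-ones in each differing lane  */
    return _mm_movemask_pd(neq);         /* sign bit of each lane -> bits 0..1 */
}
#endif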
965 
966 // Used in do_bdmat_vec_mult
967 #define DECL_DOUBLE __m128d TM2
968 #define DECL_FLOAT __m128 TM2
969 
970 //#define SUMMULT3(r,v1,v2,f1,f2) r += v1*v2
971 #define SUMMULT3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
972  TMP = _mm_load##UNA1##_##SUF(v1); \
973  LD = _mm_load##UNA2##_##SUF(v2); \
974  TM2 = _mm_load_##SUF(r); \
975  TMP = _mm_mul_##SUF(TMP, LD); \
976  TM2 = _mm_add_##SUF(TM2, TMP); \
977  _MM_STORE(r, TM2, SUF,)
978 #if 1 /* These are used in bdmat_vec_mul -- unaligned accesses are unavoidable */
979 VKERN_TEMPL_3V_SIMD_UA(do_add_vec_vec_mul, SUMMULT3_SIMD, sd, pd,
980  DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
981  2, double, __m128d);
982 VKERN_TEMPL_3V_SIMD_UA(do_add_vec_vec_mul, SUMMULT3_SIMD, ss, ps,
983  DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
984  4, float, __m128);
985 #else
986 VKERN_TEMPL_3V_SIMD(do_add_vec_vec_mul, SUMMULT3_SIMD, sd, pd,
987  DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
988  2, double, __m128d)
989 VKERN_TEMPL_3V_SIMD(do_add_vec_vec_mul, SUMMULT3_SIMD, ss, ps,
990  DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
991  4, float, __m128)
992 #endif
993 
994 //#define SUMCMULT3(r,v1,v2,f1,f2) r += CPLX__ conj(v1)*v2
995 template <> inline void do_add_vec_vec_cmul<double>(const unsigned long sz,
996  double* RESTRICT const r, const double* RESTRICT const v1,
997  const double* RESTRICT const v2)
998 {
999  do_add_vec_vec_mul<double>(sz, r, v1, v2);
1000 }
1001 template <> inline void do_add_vec_vec_cmul<float>(const unsigned long sz,
1002  float* RESTRICT const r, const float* RESTRICT const v1,
1003  const float* RESTRICT const v2)
1004 {
1005  do_add_vec_vec_mul<float>(sz, r, v1, v2);
1006 }
1007 
1008 
1031 #ifndef TBCI_NO_SIMD_SUM
1032 
1033 #if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
1034 # warning Info: Using unrolled SSE2 vector kernels for sums (reductions)
1035 #endif
1036 
1037 #define SUM_DOUBLE_PREP(x) REGISTER __m128d f2 = _mm_set_sd(x)
1038 #define SUM_FLOAT_PREP(x) REGISTER __m128 f2 = _mm_set_ss(x)
1039 
1040 #define XSUM_DOUBLE_PREP(x) \
1041  REGISTER __m128d f1 = _mm_setzero_pd();\
1042  REGISTER __m128d f2 = _mm_set_sd(x)
1043 #define XSUM_FLOAT_PREP(x) \
1044  REGISTER __m128 f1 = _mm_setzero_ps(); \
1045  REGISTER __m128 f2 = _mm_set_ss(x)
1046 
1052 #ifdef __SSE3__
1053 # define SUM_DOUBLE_SIMD_FINX(f) \
1054  f = _mm_hadd_pd(f, f)
1055 # define SUM_FLOAT_SIMD_FINX(f) \
1056  f = _mm_hadd_ps(f, f); \
1057  f = _mm_hadd_ps(f, f)
1058 #else // __SSE3__
1059 # define SUM_DOUBLE_SIMD_FINX(f) \
1060  __m128d TM##f = f; \
1061  TM##f = _mm_unpackhi_pd(TM##f, f); \
1062  f = _mm_add_sd(f, TM##f)
1063 # define SUM_FLOAT_SIMD_FINX(f) \
1064  __m128 TM##f = f; \
1065  TM##f = _mm_shuffle_ps(TM##f, f, 0xb1); \
1066  f = _mm_add_ps(f, TM##f); \
1067  TM##f = f; \
1068  TM##f = _mm_shuffle_ps(TM##f, f, 0x1b); \
1069  f = _mm_add_ss(f, TM##f)
1070 # if defined(__GNUC__) && defined(WARN_SSE)
1071 # warning Not using SSE3 -- consider passing -msse3
1072 # endif
1073 #endif // __SSE3__
1074 
1075 #define SUM_DOUBLE_SIMD_FIN SUM_DOUBLE_SIMD_FINX(f2)
1076 #define SUM_FLOAT_SIMD_FIN SUM_FLOAT_SIMD_FINX(f2)
1077 
1078 #define SUM_DOUBLE_FINAL(x) \
1079  _mm_store_sd(&x, f2)
1080 #define SUM_FLOAT_FINAL(x) \
1081  _mm_store_ss(&x, f2)
1082 
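/* Illustrative sketch (not part of this header): SUM_DOUBLE_SIMD_FINX reduces the
 * partial sums held in a register to lane 0, either with the SSE3 horizontal add or,
 * on plain SSE2, by moving the high lane down and adding.  A standalone version that
 * also performs the final scalar store (cf. SUM_DOUBLE_FINAL): */
#if 0 /* illustrative sketch only */
#include <emmintrin.h>
#ifdef __SSE3__
# include <pmmintrin.h>
#endif

/* Sum the two lanes of acc into one double. */
static double sketch_hsum_pd(__m128d acc)
{
#ifdef __SSE3__
    acc = _mm_hadd_pd(acc, acc);              /* lane 0 = lane 0 + lane 1 */
#else
    __m128d hi = _mm_unpackhi_pd(acc, acc);   /* broadcast the high lane  */
    acc = _mm_add_sd(acc, hi);                /* add it onto lane 0       */
#endif
    double s;
    _mm_store_sd(&s, acc);
    return s;
}
#endif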
1083 
1084 /* Define missing intrinsics for full REGISTER copies */
1085 #define _mm_move_ps(f, x) x
1086 #define _mm_move_pd(f, x) x
1087 
1094 /* We don't need to save the upper values of f2 any more,
1095  * as the SISD (sd,ss) loop tails now preserve them */
1096 #define XSUM_DOUBLE_SIMD_FIN_STORE \
1097  /*double hif1, hif2;*/ \
1098  /*_mm_storeh_pd(&hif1, f1);*/ \
1099  /*_mm_storeh_pd(&hif2, f2)*/ \
1100  do {} while(0)
1101 #define XSUM_FLOAT_SIMD_FIN_STORE \
1102  /*float hif1[4], hif2[4];*/ \
1103  /*_mm_store_ps(hif1, f1);*/ \
1104  /*_mm_store_ps(hif2, f2)*/ \
1105  do {} while(0)
1106 
1107 /* Do the horizontal sums and the final application of the correction */
1108 
1109 #define XSUM_DOUBLE_SIMD_FINAL_COMPLETE(x) \
1110  /*f2 = _mm_loadh_pd(f2, &hif2);*/ \
1111  /*f1 = _mm_loadh_pd(f1, &hif1);*/ \
1112  SUM_DOUBLE_SIMD_FINX(f2); \
1113  SUM_DOUBLE_SIMD_FINX(f1); \
1114  f2 = _mm_sub_sd(f2, f1); \
1115  _mm_store_sd(&x, f2)
1116 #define XSUM_FLOAT_SIMD_FINAL_COMPLETE(x) \
1117  /*_mm_store_ss(hif2, f2);*/ \
1118  /*_mm_store_ss(hif1, f1);*/ \
1119  /*f2 = _mm_load_ps(hif2);*/ \
1120  /*f1 = _mm_load_ps(hif1);*/ \
1121  SUM_FLOAT_SIMD_FINX(f2); \
1122  SUM_FLOAT_SIMD_FINX(f1); \
1123  f2 = _mm_sub_ss(f2, f1); \
1124  _mm_store_ss(&x, f2)
1125 
1126 /* Variant with compensation for lost bits in hadd_pd */
1127 #define XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X(x) \
1128  /*f2 = _mm_loadh_pd(f2, &hif2);*/ \
1129  /*f1 = _mm_loadh_pd(f1, &hif1);*/ \
1130  __m128d TMP = f2; \
1131  SUM_DOUBLE_SIMD_FINX(f2); /* lane 0 of f2 = lo + hi (rounded) */ \
1132  __m128d COR = f2; \
1133  COR = _mm_sub_sd(COR, TMP); /* (lo + hi) - lo */ \
1134  TMP = _mm_unpackhi_pd(TMP, TMP); \
1135  COR = _mm_sub_sd(COR, TMP); /* ((lo + hi) - lo) - hi = rounding error of the horizontal add */ \
1136  f1 = _mm_add_sd(f1, COR); /* fold it into the error accumulator */ \
1137  SUM_DOUBLE_SIMD_FINX(f1); \
1138  f2 = _mm_sub_sd(f2, f1); /* final result = sum - accumulated error */ \
1139  _mm_store_sd(&x, f2)
1140 
1141 /* TODO: Variant for floats with compensation for lost bits in hadd_ps */
1142 
1143 
1144 
1146 // #define MULT2 (r,v1,f1,f2) f2 += r * v1
1147 #define MULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
1148  TMP = _mm_load_##SUF(r); \
1149  LD = _mm_load##UNA1##_##SUF(v1); \
1150  TMP = _mm_mul_##SUF(TMP, LD); \
1151  f2 = _mm_add_##SUF(f2, TMP)
1152 VKERN_TEMPL_2V_T_SIMD(do_vec_mult_quick, MULT2_SIMD, sd, pd,
1153  SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
1154  2, double, __m128d)
1155 VKERN_TEMPL_2V_T_SIMD(do_vec_mult_quick, MULT2_SIMD, ss, ps,
1156  SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
1157  4, float, __m128)
1158 
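/* Illustrative sketch (not part of this header): do_vec_mult_quick accumulates the
 * products in the packed register f2 (MULT2_SIMD) and only reduces to a scalar at the
 * very end (SUM_*_SIMD_FIN plus SUM_*_FINAL).  A standalone accumulate-then-reduce dot
 * product for doubles; the aligned-input assumption and the function name are
 * assumptions: */
#if 0 /* illustrative sketch only */
#include <emmintrin.h>

/* Both arrays assumed 16-byte aligned, sz a multiple of 2. */
static double sketch_dot_quick(unsigned long sz, const double *a, const double *b)
{
    __m128d acc = _mm_setzero_pd();
    for (unsigned long i = 0; i < sz; i += 2)
        acc = _mm_add_pd(acc, _mm_mul_pd(_mm_load_pd(a + i), _mm_load_pd(b + i)));
    /* horizontal reduction, as in SUM_DOUBLE_SIMD_FIN / SUM_DOUBLE_FINAL */
    __m128d hi = _mm_unpackhi_pd(acc, acc);
    acc = _mm_add_sd(acc, hi);
    double s;
    _mm_store_sd(&s, acc);
    return s;
}
#endif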
1159 //do_vec_mult_exact
1160 #define XMULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
1161  TMP = _mm_load_##SUF(r); \
1162  LD = _mm_load##UNA1##_##SUF(v1); \
1163  TMP = _mm_mul_##SUF(TMP, LD); \
1164  LD = _mm_move_##SUF(LD, TMP); \
1165  TMP = _mm_add_##SUF(TMP, f2); \
1166  t = TMP; \
1167  TMP = _mm_sub_##SUF(TMP, f2); \
1168  TMP = _mm_sub_##SUF(TMP, LD); \
1169  f1 = _mm_add_##SUF(f1, TMP); \
1170  f2 = _mm_move_##SUF(f2, t)
1171 VKERN_TEMPL_2V_T_SIMD(do_vec_mult_exact, XMULT2_SIMD, sd, pd,
1172  XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
1173  XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,
1174  2, double, __m128d)
1175 VKERN_TEMPL_2V_T_SIMD(do_vec_mult_exact, XMULT2_SIMD, ss, ps,
1176  XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
1177  XSUM_FLOAT_SIMD_FINAL_COMPLETE,
1178  4, float, __m128)
1179 
1180 
1181 template <> inline void do_vec_dot_exact<double>(const unsigned long sz,
1182  const double * RESTRICT const _v1, const double * RESTRICT const _v2,
1183  double& _f2)
1184 {
1185  do_vec_mult_exact<double>(sz, _v1, _v2, _f2);
1186 }
1187 
1188 template <> inline void do_vec_dot_quick<double>(const unsigned long sz,
1189  const double * RESTRICT const _v1, const double * RESTRICT const _v2,
1190  double& _f2)
1191 {
1192  do_vec_mult_quick<double>(sz, _v1, _v2, _f2);
1193 }
1194 
1195 template <> inline void do_vec_dot_exact<float>(const unsigned long sz,
1196  const float * RESTRICT const _v1, const float * RESTRICT const _v2,
1197  float& _f2)
1198 {
1199  do_vec_mult_exact<float>(sz, _v1, _v2, _f2);
1200 }
1201 
1202 template <> inline void do_vec_dot_quick<float>(const unsigned long sz,
1203  const float * RESTRICT const _v1, const float * RESTRICT const _v2,
1204  float& _f2)
1205 {
1206  do_vec_mult_quick<float>(sz, _v1, _v2, _f2);
1207 }
1208 
1210 VKERN_TEMPL_2V_T(do_vec_mult_unaligned_exact, XMULT2, T)
1211 VKERN_TEMPL_2V_T(do_vec_mult_unaligned_quick, MULT2, T)
1212 
1213 // TODO: Implement do_vec_sumsqr_exact
1214 
1216 // #define SQR1(r,f1,f2) f2 += r*r
1217 #define SQR1_SIMD(r,f1,f2,SUF) \
1218  TMP = _mm_load_##SUF(r); \
1219  TMP = _mm_mul_##SUF(TMP, TMP); \
1220  f2 = _mm_add_##SUF(f2, TMP)
1221 
1222 VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_quick, SQR1_SIMD, sd, pd,
1223  SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
1224  2, double, __m128d)
1225 VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_quick, SQR1_SIMD, ss, ps,
1226  SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
1227  4, float, __m128)
1228 
1229 //do_vec_sumsqr_exact
1230 #define XSQR1_SIMD(r,f1,f2,SUF) \
1231  TMP = _mm_load_##SUF(r); \
1232  TMP = _mm_mul_##SUF(TMP, TMP); \
1233  y = TMP; \
1234  TMP = _mm_add_##SUF(TMP, f2); \
1235  t = TMP; \
1236  TMP = _mm_sub_##SUF(TMP, f2); \
1237  TMP = _mm_sub_##SUF(TMP, y); \
1238  f1 = _mm_add_##SUF(f1, TMP); \
1239  f2 = _mm_move_##SUF(f2, t)
1240 VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_exact, XSQR1_SIMD, sd, pd,
1241  XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
1242  XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,
1243  2, double, __m128d)
1244 VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_exact, XSQR1_SIMD, ss, ps,
1245  XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
1246  XSUM_FLOAT_SIMD_FINAL_COMPLETE,
1247  4, float, __m128)
1248 
1249 
1250 #ifndef TBCI_NO_SIMD_FABSSQR
1251 template <> inline void do_vec_fabssqr_quick<double>(const unsigned long sz,
1252  const double * const _v1, double& _f2)
1253 {
1254  double F2 = _f2;
1255  do_vec_sumsqr_quick<double>(sz, _v1, F2);
1256  _f2 = F2;
1257 }
1258 template <> inline void do_vec_fabssqr_exact<double>(const unsigned long sz,
1259  const double * const _v1, double& _f2)
1260 {
1261  double F2 = _f2;
1262  do_vec_sumsqr_exact<double>(sz, _v1, F2);
1263  _f2 = F2;
1264 }
1265 #endif // TBCI_NO_SIMD_FABSSQR
1266 #ifdef TBCI_SIMD_FABSSQR_FLOAT // The loss of precision with float is unbearable
1267 template <> inline void do_vec_fabssqr_quick<float>(const unsigned long sz,
1268  const float * const _v1, double& _f2)
1269 {
1270  float F2 = _f2;
1271  do_vec_sumsqr_quick<float>(sz, _v1, F2);
1272  _f2 = F2;
1273 }
1274 template <> inline void do_vec_fabssqr_exact<float>(const unsigned long sz,
1275  const float * const _v1, double& _f2)
1276 {
1277  float F2 = _f2;
1278  do_vec_sumsqr_exact<float>(sz, _v1, F2);
1279  _f2 = F2;
1280 }
1281 #endif // TBCI_SIMD_FABSSQR_FLOAT
1282 
1284 //#define SUM1(r,f1,f2) f2 += r
1285 #define SUM1_SIMD(r,f1,f2,SUF) \
1286  TMP = _mm_load_##SUF(r); \
1287  f2 = _mm_add_##SUF(f2, TMP)
1288 VKERN_TEMPL_1V_T_SIMD(do_vec_sum_quick, SUM1_SIMD, sd, pd,
1289  SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
1290  2, double, __m128d)
1291 VKERN_TEMPL_1V_T_SIMD(do_vec_sum_quick, SUM1_SIMD, ss, ps,
1292  SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
1293  4, float, __m128)
1294 
1295 //#define XSUM1(r,f1,f2) { T t = f2+r; f1 += (t-f2)-r; f2 = t; }
1297 #define XSUM1_SIMD(r,f1,f2,SUF) \
1298  y = _mm_load_##SUF(r); \
1299  t = _mm_add_##SUF(f2, y); \
1300  TMP = _mm_sub_##SUF(t, f2); \
1301  TMP = _mm_sub_##SUF(TMP, y); \
1302  f1 = _mm_add_##SUF(f1, TMP); \
1303  f2 = _mm_move_##SUF(f2, t)
1304 VKERN_TEMPL_1V_T_SIMD(do_vec_sum_exact, XSUM1_SIMD, sd, pd,
1305  XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
1306  XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,
1307  2, double, __m128d)
1308 VKERN_TEMPL_1V_T_SIMD(do_vec_sum_exact, XSUM1_SIMD, ss, ps,
1309  XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
1310  XSUM_FLOAT_SIMD_FINAL_COMPLETE,
1311  4, float, __m128)
1312 
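/* Illustrative sketch (not part of this header): the XSUM1 comment above is the scalar
 * recurrence these _exact kernels vectorize -- the running sum lives in f2, the rounding
 * error of every addition is collected in f1, and the final result is f2 - f1 (see
 * XSUM_*_SIMD_FINAL_COMPLETE*).  A plain scalar rendering; it relies on strict IEEE
 * evaluation, so it must not be compiled with -ffast-math: */
#if 0 /* illustrative sketch only */
/* Compensated summation following the XSUM1 recurrence:
 *   t = f2 + r;  f1 += (t - f2) - r;  f2 = t;   result = f2 - f1  */
static double sketch_sum_exact(unsigned long sz, const double *v)
{
    double f1 = 0.0, f2 = 0.0;
    for (unsigned long i = 0; i < sz; ++i) {
        double t = f2 + v[i];
        f1 += (t - f2) - v[i];   /* rounding error of this addition */
        f2 = t;
    }
    return f2 - f1;
}
#endif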
1313 #endif // TBCI_NO_SIMD_SUM
1314 
1316 
1317 #endif // TBCI_SELECTIVE_INST
1318 
1319 #endif // __SSE2__
1320 
1321 #endif // H_VEC_KERN_SPECIAL_H