TBCI Numerical high perf. C++ Library 2.8.0
vec_kern_special.h
9#ifndef H_VEC_KERN_SPECIAL_H
10#define H_VEC_KERN_SPECIAL_H
11
63
64#if defined(__SSE2__) && defined(HAVE_EMMINTRIN_H) && defined(HAVE_WEAK_ATTR) && \
65 ( defined(__x86_64__) || defined(__i386__) )
66
67#include <emmintrin.h>
68
69#if defined(HAVE_PMMINTRIN_H) && defined(__SSE3__)
70# include <pmmintrin.h>
71#else
72# undef __SSE3__
73#endif
74
75#include "tbci/unroll_prefetch_simd_def.h"
76
77/* TODO: Add a define controlling instantiation */
78
79#if 0 //defined(TBCI_SELECTIVE_INST) && !defined(TBCI_INSTANTIATE) && !defined(AUTO_DECL)
80# include "vec_kern_special_gd.h"
81#else
82
84
85#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
86# warning Info: Using unrolled SSE2 vector kernels
87#endif
88
89// TODO: Are integer kernels useful as well?
90// Unfortunately, the integer SSE instructions follow a slightly different
91// naming scheme, so some manual work would be required.
92// Maybe later ...
93
94#define SIMD_EMPTY0 do {} while (0)
95#define SIMD_EMPTY1(x) do {} while (0)
96#define SIMD_EMPTY2(x,y) do {} while (0)
97
98#define SIMD_CONST_DOUBLE_PREP(x) REGISTER __m128d f2 = _mm_set1_pd(x)
99#define SIMD_2CONST_DOUBLE_PREP(x,y) REGISTER __m128d f1 = _mm_set1_pd(x), f2 = _mm_set1_pd(y)
100
101#define SIMD_CONST_FLOAT_PREP(x) REGISTER __m128 f2 = _mm_set1_ps(x)
102#define SIMD_2CONST_FLOAT_PREP(x,y) REGISTER __m128 f1 = _mm_set1_ps(x), f2 = _mm_set1_ps(y)
103
104/* First the stuff from basics.h */
105
106/* Work around a compiler bug in gcc 4.0.0 (PR 21239) */
107#if defined(__GNUC__) && __GNUC__ == 4 && \
108 __GNUC_MINOR__ == 0 && \
109 (! defined(__GNUC_PATCHLEVEL__) || __GNUC_PATCHLEVEL__ == 0)
110# define _MM_STORE(mem, reg, SUF, UNA) \
111 asm(""::"x"(reg)); \
112 _mm_store##UNA##_##SUF(mem, reg)
113#else
114# define _MM_STORE(mem, reg, SUF, UNA) \
115 _mm_store##UNA##_##SUF(mem, reg)
116#endif
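
/* Illustrative sketch (not part of the library): how _MM_STORE expands for a
 * packed double store.  The empty UNA argument selects the aligned
 * _mm_store_pd; passing "u" would select the unaligned _mm_storeu_pd.
 * The asm("" :: "x"(reg)) in the gcc 4.0.0 branch above only forces the
 * value into an xmm register to sidestep PR 21239; it emits no code. */
#if 0
#include <emmintrin.h>

inline void store_example(double* r /* 16-byte aligned */, __m128d TMP)
{
    _MM_STORE(r, TMP, pd,);    /* expands to _mm_store_pd(r, TMP); */
}
#endif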
117
118// Unaligned accesses for single loads are harmless ...
119#define _mm_loadu_sd _mm_load_sd
120#define _mm_loadu_ss _mm_load_ss
121
122/* General policy: We'll always ensure alignment of the result
123 * vector, so no need to use unaligned insns for it. */
124
125#ifndef C_MEMALLOC
127/* Template decl should be in basics.h */
128//# define COPY2(res,v1,f1,f2) res = v1
129#define COPY2_SIMD(r,v1,f1,f2,SUF,UNA1) \
130 TMP = _mm_load##UNA1##_##SUF(v1); \
131 _MM_STORE(r, TMP, SUF,)
132VKERN_TEMPL_2V_SIMD(_tbci_copy, COPY2_SIMD, sd, pd,
133 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
134 2, double, __m128d)
135VKERN_TEMPL_2V_SIMD(_tbci_copy, COPY2_SIMD, ss, ps,
136 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
137 4, float, __m128)
138/* We could use more xmm registers here ... but the CPU-internal
139 * register renaming should hide the latencies even without that. */
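
/* Conceptual sketch (an assumption, not the actual template expansion): the
 * VKERN_TEMPL_2V_SIMD instantiations above generate, roughly, a copy loop of
 * the following shape -- a packed SSE2 body plus a scalar tail.  The real
 * template in unroll_prefetch_simd_def.h additionally unrolls and prefetches. */
#if 0
#include <emmintrin.h>

static void copy_sketch(unsigned long sz, double* RESTRICT r,
                        const double* RESTRICT v1)
{
    unsigned long i = 0;
    for (; i + 2 <= sz; i += 2) {            /* two doubles per step */
        __m128d TMP = _mm_loadu_pd(v1 + i);  /* source may be unaligned */
        _mm_store_pd(r + i, TMP);            /* result is kept aligned */
    }
    for (; i < sz; ++i)                      /* scalar tail */
        r[i] = v1[i];
}
#endif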
140
141/* TODO: Benchmark copy and fill ... */
142/* TODO: Avoid gcc unrolling this even more ... */
143/* TODO: Here integers and pointers certainly make sense */
144
145
146//# define FILL1(res,f1,f2) res = f2
147#define FILL1_SIMD(r,f1,f2,SUF) \
148 _MM_STORE(r, f2, SUF,)
149VKERN_TEMPL_1V_C_SIMD(_tbci_fill, FILL1_SIMD, sd, pd,
150 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
151 2, double, __m128d)
152VKERN_TEMPL_1V_C_SIMD(_tbci_fill, FILL1_SIMD, ss, ps,
153 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
154 4, float, __m128)
155/* TODO: Here integers and pointers certainly make sense */
156#endif // C_MEMALLOC
157
158/* 3Vec operations */
160//#define ADD3(r,v1,v2,f1,f2) r = v1 + v2
161#define ADD3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
162 TMP = _mm_load##UNA1##_##SUF(v1); \
163 LD = _mm_load##UNA2##_##SUF(v2); \
164 TMP = _mm_add_##SUF(TMP, LD); \
165 _MM_STORE(r, TMP, SUF,)
166VKERN_TEMPL_3V_SIMD(do_vec_vec_add, ADD3_SIMD, sd, pd,
167 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
168 2, double, __m128d)
169VKERN_TEMPL_3V_SIMD(do_vec_vec_add, ADD3_SIMD, ss, ps,
170 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
171 4, float, __m128)
172
173
174//#define SUB3(r,v1,v2,f1,f2) r = v1 - v2
175#define SUB3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
176 TMP = _mm_load##UNA1##_##SUF(v1); \
177 LD = _mm_load##UNA2##_##SUF(v2); \
178 TMP = _mm_sub_##SUF(TMP, LD); \
179 _MM_STORE(r, TMP, SUF,)
180VKERN_TEMPL_3V_SIMD(do_vec_vec_sub, SUB3_SIMD, sd, pd,
181 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
182 2, double, __m128d)
183VKERN_TEMPL_3V_SIMD(do_vec_vec_sub, SUB3_SIMD, ss, ps,
184 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
185 4, float, __m128)
186
187
188//#define MUL3(r,v1,v2,f1,f2) r = v1 * v2
189#define MUL3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
190 TMP = _mm_load##UNA1##_##SUF(v1); \
191 LD = _mm_load##UNA2##_##SUF(v2); \
192 TMP = _mm_mul_##SUF(TMP, LD); \
193 _MM_STORE(r, TMP, SUF,)
194VKERN_TEMPL_3V_SIMD(do_vec_vec_mul, MUL3_SIMD, sd, pd,
195 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
196 2, double, __m128d)
197VKERN_TEMPL_3V_SIMD(do_vec_vec_mul, MUL3_SIMD, ss, ps,
198 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
199 4, float, __m128)
200
201template <> inline void do_vec_vec_cmul<double>(const unsigned long sz,
202 double* RESTRICT const res, const double* RESTRICT const v1,
203 const double* RESTRICT const v2)
204{
205 do_vec_vec_mul<double>(sz, res, v1, v2);
206}
207template <> inline void do_vec_vec_cmul<float>(const unsigned long sz,
208 float* RESTRICT const res, const float* RESTRICT const v1,
209 const float* RESTRICT const v2)
210{
211 do_vec_vec_mul<float>(sz, res, v1, v2);
212}
213
215//#define DIV3(r,v1,v2,f1,f2) r = v1 / v2
216#define DIV3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
217 TMP = _mm_load##UNA1##_##SUF(v1); \
218 LD = _mm_load##UNA2##_##SUF(v2); \
219 TMP = _mm_div_##SUF(TMP, LD); \
220 _MM_STORE(r, TMP, SUF,)
221VKERN_TEMPL_3V_SIMD(do_vec_vec_div, DIV3_SIMD, sd, pd,
222 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
223 2, double, __m128d)
224VKERN_TEMPL_3V_SIMD(do_vec_vec_div, DIV3_SIMD, ss, ps,
225 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
226 4, float, __m128)
227
228template <> inline void do_vec_vec_cdiv<double>(const unsigned long sz,
229 double* RESTRICT const res, const double* RESTRICT const v1,
230 const double* RESTRICT const v2)
231{
232 do_vec_vec_div<double>(sz, res, v1, v2);
233}
234template <> inline void do_vec_vec_cdiv<float>(const unsigned long sz,
235 float* RESTRICT const res, const float* RESTRICT const v1,
236 const float* RESTRICT const v2)
237{
238 do_vec_vec_div<float>(sz, res, v1, v2);
239}
240
241
243//#define ADD2(r,v1,f1,f2) r += v1
244#define ADD2_SIMD(r,v1,f1,f2,SUF,UNA1) \
245 TMP = _mm_load_##SUF(r); \
246 LD = _mm_load##UNA1##_##SUF(v1); \
247 TMP = _mm_add_##SUF(TMP, LD); \
248 _MM_STORE(r, TMP, SUF,)
249VKERN_TEMPL_2V_SIMD(do_vec_add_vec, ADD2_SIMD, sd, pd,
250 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
251 2, double, __m128d)
252VKERN_TEMPL_2V_SIMD(do_vec_add_vec, ADD2_SIMD, ss, ps,
253 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
254 4, float, __m128)
255
256
257//#define SUB2(r,v1,f1,f2) r -= v1
258#define SUB2_SIMD(r,v1,f1,f2,SUF,UNA1) \
259 TMP = _mm_load_##SUF(r); \
260 LD = _mm_load##UNA1##_##SUF(v1); \
261 TMP = _mm_sub_##SUF(TMP, LD); \
262 _MM_STORE(r, TMP, SUF,)
263VKERN_TEMPL_2V_SIMD(do_vec_sub_vec, SUB2_SIMD, sd, pd,
264 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
265 2, double, __m128d)
266VKERN_TEMPL_2V_SIMD(do_vec_sub_vec, SUB2_SIMD, ss, ps,
267 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
268 4, float, __m128)
269
270
271//#define SUB2I(r,v1,f1,f2) r = v1 - r
272#define SUB2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
273 TMP = _mm_load_##SUF(r); \
274 LD = _mm_load##UNA1##_##SUF(v1); \
275 LD = _mm_sub_##SUF(LD, TMP); \
276 _MM_STORE(r, LD, SUF,)
277VKERN_TEMPL_2V_SIMD(do_vec_sub_vec_inv, SUB2I_SIMD, sd, pd,
278 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
279 2, double, __m128d)
280VKERN_TEMPL_2V_SIMD(do_vec_sub_vec_inv, SUB2I_SIMD, ss, ps,
281 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
282 4, float, __m128)
283
284
285//#define MUL2(r,v1,f1,f2) r *= v1
286#define MUL2_SIMD(r,v1,f1,f2,SUF,UNA1) \
287 TMP = _mm_load_##SUF(r); \
288 LD = _mm_load##UNA1##_##SUF(v1); \
289 TMP = _mm_mul_##SUF(TMP, LD); \
290 _MM_STORE(r, TMP, SUF,)
291VKERN_TEMPL_2V_SIMD(do_vec_mul_vec, MUL2_SIMD, sd, pd,
292 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
293 2, double, __m128d)
294VKERN_TEMPL_2V_SIMD(do_vec_mul_vec, MUL2_SIMD, ss, ps,
295 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
296 4, float, __m128)
297
298
299//#define CMUL2(r,v1,f1,f2) r = CPLX__ conj(r) * v1
300template <> inline void do_vec_cmul_vec<double>(const unsigned long sz,
301 double* RESTRICT const res, const double* RESTRICT const v1)
302{
303 do_vec_mul_vec<double>(sz, res, v1);
304}
305template <> inline void do_vec_cmul_vec<float>(const unsigned long sz,
306 float* RESTRICT const res, const float* RESTRICT const v1)
307{
308 do_vec_mul_vec<float>(sz, res, v1);
309}
311//#define CMUL2I(r,v1,f1,f2) r *= CPLX__ conj(v1)
312template <> inline void do_vec_cmul_vec_inv<double>(const unsigned long sz,
313 double* RESTRICT const res, const double* RESTRICT const v1)
314{
315 do_vec_mul_vec<double>(sz, res, v1);
316}
317template <> inline void do_vec_cmul_vec_inv<float>(const unsigned long sz,
318 float* RESTRICT const res, const float* RESTRICT const v1)
319{
320 do_vec_mul_vec<float>(sz, res, v1);
321}
322
324//#define DIV2(r,v1,f1,f2) r /= v1
325#define DIV2_SIMD(r,v1,f1,f2,SUF,UNA1) \
326 TMP = _mm_load_##SUF(r); \
327 LD = _mm_load##UNA1##_##SUF(v1); \
328 TMP = _mm_div_##SUF(TMP, LD); \
329 _MM_STORE(r, TMP, SUF,)
330VKERN_TEMPL_2V_SIMD(do_vec_div_vec, DIV2_SIMD, sd, pd,
331 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
332 2, double, __m128d)
333VKERN_TEMPL_2V_SIMD(do_vec_div_vec, DIV2_SIMD, ss, ps,
334 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
335 4, float, __m128)
336
337
338//#define DIV2I(r,v1,f1,f2) r = v1 / r
339#define DIV2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
340 TMP = _mm_load_##SUF(r); \
341 LD = _mm_load##UNA1##_##SUF(v1); \
342 LD = _mm_div_##SUF(LD, TMP); \
343 _MM_STORE(r, LD, SUF,)
344VKERN_TEMPL_2V_SIMD(do_vec_div_vec_inv, DIV2I_SIMD, sd, pd,
345 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
346 2, double, __m128d)
347VKERN_TEMPL_2V_SIMD(do_vec_div_vec_inv, DIV2I_SIMD, ss, ps,
348 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
349 4, float, __m128)
350
351
352//#define CDIV2(r,v1,f1,f2) r = CPLX__ conj(r) / v1
353template <> inline void do_vec_cdiv_vec<double>(const unsigned long sz,
354 double* RESTRICT const res, const double* RESTRICT const v1)
355{
356 do_vec_div_vec<double>(sz, res, v1);
357}
358template <> inline void do_vec_cdiv_vec<float>(const unsigned long sz,
359 float* RESTRICT const res, const float* RESTRICT const v1)
360{
361 do_vec_div_vec<float>(sz, res, v1);
362}
363
364
366//#define CDIV2I(r,v1,f1,f2) r = CPLX__ conj(v1) / r
367template <> inline void do_vec_cdiv_vec_inv<double>(const unsigned long sz,
368 double* RESTRICT const res, const double* RESTRICT const v1)
369{
370 do_vec_div_vec_inv<double>(sz, res, v1);
371}
372template <> inline void do_vec_cdiv_vec_inv<float>(const unsigned long sz,
373 float* RESTRICT const res, const float* RESTRICT const v1)
374{
375 do_vec_div_vec_inv<float>(sz, res, v1);
376}
377
379//#define ADD2NV(r,v1,f1,f2) r = v1 + f2
380#define ADD2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
381 TMP = _mm_load##UNA1##_##SUF(v1); \
382 TMP = _mm_add_##SUF(TMP, f2); \
383 _MM_STORE(r, TMP, SUF,)
384VKERN_TEMPL_2V_C_SIMD(do_vec_val_add, ADD2NV_SIMD, sd, pd,
385 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
386 2, double, __m128d)
387VKERN_TEMPL_2V_C_SIMD(do_vec_val_add, ADD2NV_SIMD, ss, ps,
388 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
389 4, float, __m128)
390
391
392//#define SUB2NV(r,v1,f1,f2) r = v1 - f2
393#define SUB2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
394 TMP = _mm_load##UNA1##_##SUF(v1); \
395 TMP = _mm_sub_##SUF(TMP, f2); \
396 _MM_STORE(r, TMP, SUF,)
397VKERN_TEMPL_2V_C_SIMD(do_vec_val_sub, SUB2NV_SIMD, sd, pd,
398 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
399 2, double, __m128d)
400VKERN_TEMPL_2V_C_SIMD(do_vec_val_sub, SUB2NV_SIMD, ss, ps,
401 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
402 4, float, __m128)
403
404
405//#define MUL2NV(r,v1,f1,f2) r = v1 * f2
406//VKERN_TEMPL_2V_C(do_vec_val_mul, MUL2NV);
407#define MUL2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
408 TMP = _mm_load##UNA1##_##SUF(v1); \
409 TMP = _mm_mul_##SUF(TMP, f2); \
410 _MM_STORE(r, TMP, SUF,)
411VKERN_TEMPL_2V_C_SIMD(do_vec_val_mul, MUL2NV_SIMD, sd, pd,
412 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
413 2, double, __m128d)
414VKERN_TEMPL_2V_C_SIMD(do_vec_val_mul, MUL2NV_SIMD, ss, ps,
415 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
416 4, float, __m128)
417
418
419
420//#define ADD2RV(r,v1,f1,f2) r = f2 + v1
421template <> inline void do_val_vec_add<double>(const unsigned long sz,
422 double* RESTRICT const res, const double* RESTRICT const v1,
423 LCTYPED(double) _f2)
424{
425 do_vec_val_add<double>(sz, res, v1, _f2);
426}
427template <> inline void do_val_vec_add<float>(const unsigned long sz,
428 float* RESTRICT const res, const float* RESTRICT const v1,
429 LCTYPED(float) _f2)
430{
431 do_vec_val_add<float>(sz, res, v1, _f2);
432}
433
435//#define SUB2RV(r,v1,f1,f2) r = f2 - v1
436#define SUB2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
437 TMP = _mm_load##UNA1##_##SUF(v1); \
438 TMP = _mm_sub_##SUF(f2, TMP); \
439 _MM_STORE(r, TMP, SUF,)
440VKERN_TEMPL_2V_C_SIMD(do_val_vec_sub, SUB2RV_SIMD, sd, pd,
441 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
442 2, double, __m128d)
443VKERN_TEMPL_2V_C_SIMD(do_val_vec_sub, SUB2RV_SIMD, ss, ps,
444 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
445 4, float, __m128)
446
447
448//#define MUL2RV(r,v1,f1,f2) r = f2 * v1
449template <> inline void do_val_vec_mul<double>(const unsigned long sz,
450 double* RESTRICT const res, const double* RESTRICT const v1,
451 LCTYPED(double) _f2)
452{
453 do_vec_val_mul<double>(sz, res, v1, _f2);
454}
455template <> inline void do_val_vec_mul<float>(const unsigned long sz,
456 float* RESTRICT const res, const float* RESTRICT const v1,
457 LCTYPED(float) _f2)
458{
459 do_vec_val_mul<float>(sz, res, v1, _f2);
460}
461
463//#define DIV2RV(r,v1,f1,f2) r = f2 / v1
464#define DIV2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
465 TMP = _mm_load##UNA1##_##SUF(v1); \
466 TMP = _mm_div_##SUF(f2, TMP); \
467 _MM_STORE(r, TMP, SUF,)
468VKERN_TEMPL_2V_C_SIMD(do_val_vec_div, DIV2RV_SIMD, sd, pd,
469 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
470 2, double, __m128d)
471VKERN_TEMPL_2V_C_SIMD(do_val_vec_div, DIV2RV_SIMD, ss, ps,
472 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
473 4, float, __m128)
474
475/* The one-vector operations don't need unaligned versions,
476 * as the loop preamble can always bring us to an aligned
477 * position. */
478
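/* Illustrative sketch (an assumption about the preamble, not the actual
 * template code): for one-vector kernels the result pointer can always be
 * brought to 16-byte alignment by peeling a few scalar elements before the
 * packed loop, so only aligned loads/stores are needed for r. */
#if 0
#include <emmintrin.h>
#include <stdint.h>

static void scale_in_place_sketch(unsigned long sz, double* RESTRICT r,
                                  const double f2)
{
    unsigned long i = 0;
    while (i < sz && ((uintptr_t)(r + i) & 0xf) != 0)   /* scalar peel */
        r[i++] *= f2;
    const __m128d F2 = _mm_set1_pd(f2);
    for (; i + 2 <= sz; i += 2)                          /* aligned packed body */
        _mm_store_pd(r + i, _mm_mul_pd(_mm_load_pd(r + i), F2));
    for (; i < sz; ++i)                                  /* scalar tail */
        r[i] *= f2;
}
#endif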
479
480//#define ADD1NV(r,f1,f2) r += f2
481#define ADD1NV_SIMD(r,f1,f2,SUF) \
482 TMP = _mm_load_##SUF(r); \
483 TMP = _mm_add_##SUF(TMP, f2); \
484 _MM_STORE(r, TMP, SUF,)
485VKERN_TEMPL_1V_C_SIMD(do_vec_add_val, ADD1NV_SIMD, sd, pd,
486 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
487 2, double, __m128d)
488VKERN_TEMPL_1V_C_SIMD(do_vec_add_val, ADD1NV_SIMD, ss, ps,
489 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
490 4, float, __m128)
491
492
493//#define SUB1NV(r,f1,f2) r -= f2
494#define SUB1NV_SIMD(r,f1,f2,SUF) \
495 TMP = _mm_load_##SUF(r); \
496 TMP = _mm_sub_##SUF(TMP, f2); \
497 _MM_STORE(r, TMP, SUF,)
498VKERN_TEMPL_1V_C_SIMD(do_vec_sub_val, SUB1NV_SIMD, sd, pd,
499 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
500 2, double, __m128d)
501VKERN_TEMPL_1V_C_SIMD(do_vec_sub_val, SUB1NV_SIMD, ss, ps,
502 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
503 4, float, __m128)
504
505
506//#define SUB1RV(r,f1,f2) r = f2 - r
507#define SUB1RV_SIMD(r,f1,f2,SUF) \
508 TMP = _mm_load_##SUF(r); \
509 TMP = _mm_sub_##SUF(f2, TMP); \
510 _MM_STORE(r, TMP, SUF,)
511VKERN_TEMPL_1V_C_SIMD(do_val_sub_vec, SUB1RV_SIMD, sd, pd,
512 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
513 2, double, __m128d)
514VKERN_TEMPL_1V_C_SIMD(do_val_sub_vec, SUB1RV_SIMD, ss, ps,
515 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
516 4, float, __m128)
517
518
519//#define MUL1NV(r,f1,f2) r *= f2
520#define MUL1NV_SIMD(r,f1,f2,SUF) \
521 TMP = _mm_load_##SUF(r); \
522 TMP = _mm_mul_##SUF(TMP, f2); \
523 _MM_STORE(r, TMP, SUF,)
524VKERN_TEMPL_1V_C_SIMD(do_vec_mul_val, MUL1NV_SIMD, sd, pd,
525 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
526 2, double, __m128d)
527VKERN_TEMPL_1V_C_SIMD(do_vec_mul_val, MUL1NV_SIMD, ss, ps,
528 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
529 4, float, __m128)
530
531
532//#define DIV1NV(r,f1,f2) r /= f2
533#define DIV1NV_SIMD(r,f1,f2,SUF) \
534 TMP = _mm_load_##SUF(r); \
535 TMP = _mm_div_##SUF(TMP, f2); \
536 _MM_STORE(r, TMP, SUF,)
537VKERN_TEMPL_1V_C_SIMD(do_vec_div_val, DIV1NV_SIMD, sd, pd,
538 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
539 2, double, __m128d)
540VKERN_TEMPL_1V_C_SIMD(do_vec_div_val, DIV1NV_SIMD, ss, ps,
541 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
542 4, float, __m128)
543
544
545//#define DIV1RV(r,f1,f2) r = f2 / r
546#define DIV1RV_SIMD(r,f1,f2,SUF) \
547 TMP = _mm_load_##SUF(r); \
548 TMP = _mm_div_##SUF(f2, TMP); \
549 _MM_STORE(r, TMP, SUF,)
550VKERN_TEMPL_1V_C_SIMD(do_val_div_vec, DIV1RV_SIMD, sd, pd,
551 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
552 2, double, __m128d)
553VKERN_TEMPL_1V_C_SIMD(do_val_div_vec, DIV1RV_SIMD, ss, ps,
554 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
555 4, float, __m128)
556
557
558//#define ADD1RV(r,f1,f2) r = f2 + r
559template <> inline void do_val_add_vec<double>(const unsigned long sz,
560 double* RESTRICT const res, LCTYPED(double) _f2)
561{
562 do_vec_add_val<double>(sz, res, _f2);
563}
564template <> inline void do_val_add_vec<float>(const unsigned long sz,
565 float* RESTRICT const res, LCTYPED(float) _f2)
566{
567 do_vec_add_val<float>(sz, res, _f2);
568}
569
570// unused ...
571//#define MUL1RV(r,f1,f2) r = f2 * r;
572
573
574
575/* TSVector stuff */
576
578//#define ADD2NS(r,v1,f1,f2) r += f2*v1
579#define ADD2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
580 LD = _mm_load##UNA1##_##SUF(v1); \
581 TMP = _mm_load_##SUF(r); \
582 LD = _mm_mul_##SUF(LD, f2); \
583 TMP = _mm_add_##SUF(TMP, LD); \
584 _MM_STORE(r, TMP, SUF,)
585VKERN_TEMPL_2V_C_SIMD(do_vec_add_svc, ADD2NS_SIMD, sd, pd,
586 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
587 2, double, __m128d)
588VKERN_TEMPL_2V_C_SIMD(do_vec_add_svc, ADD2NS_SIMD, ss, ps,
589 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
590 4, float, __m128)
591
592
593//#define SUB2NS(r,v1,f1,f2) r -= f2*v1
594#define SUB2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
595 LD = _mm_load##UNA1##_##SUF(v1); \
596 TMP = _mm_load_##SUF(r); \
597 LD = _mm_mul_##SUF(LD, f2); \
598 TMP = _mm_sub_##SUF(TMP, LD); \
599 _MM_STORE(r, TMP, SUF,)
600VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc, SUB2NS_SIMD, sd, pd,
601 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
602 2, double, __m128d)
603VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc, SUB2NS_SIMD, ss, ps,
604 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
605 4, float, __m128)
606
607
608//#define SUB2RS(r,v1,f1,f2) r = f2*v1 - r
609#define SUB2RS_SIMD(r,v1,f1,f2,SUF,UNA1) \
610 LD = _mm_load##UNA1##_##SUF(v1); \
611 TMP = _mm_load_##SUF(r); \
612 LD = _mm_mul_##SUF(LD, f2); \
613 LD = _mm_sub_##SUF(LD, TMP); \
614 _MM_STORE(r, LD, SUF,)
615VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc_inv, SUB2RS_SIMD, sd, pd,
616 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
617 2, double, __m128d)
618VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc_inv, SUB2RS_SIMD, ss, ps,
619 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
620 4, float, __m128)
621
622
623//#define ADD3NS(r,v1,v2,f1,f2) r = v1 + f2*v2
624#define ADD3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
625 LD = _mm_load##UNA2##_##SUF(v2); \
626 TMP = _mm_load##UNA1##_##SUF(v1); \
627 LD = _mm_mul_##SUF(LD, f2); \
628 TMP = _mm_add_##SUF(TMP, LD); \
629 _MM_STORE(r, TMP, SUF,)
630VKERN_TEMPL_3V_C_SIMD(do_vec_svc_add, ADD3NS_SIMD, sd, pd,
631 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
632 2, double, __m128d)
633VKERN_TEMPL_3V_C_SIMD(do_vec_svc_add, ADD3NS_SIMD, ss, ps,
634 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
635 4, float, __m128)
636
637
638//#define SUB3NS(r,v1,v2,f1,f2) r = v1 - f2*v2
639#define SUB3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
640 LD = _mm_load##UNA2##_##SUF(v2); \
641 TMP = _mm_load##UNA1##_##SUF(v1); \
642 LD = _mm_mul_##SUF(LD, f2); \
643 TMP = _mm_sub_##SUF(TMP, LD); \
644 _MM_STORE(r, TMP, SUF,)
645VKERN_TEMPL_3V_C_SIMD(do_vec_svc_sub, SUB3NS_SIMD, sd, pd,
646 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
647 2, double, __m128d)
648VKERN_TEMPL_3V_C_SIMD(do_vec_svc_sub, SUB3NS_SIMD, ss, ps,
649 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
650 4, float, __m128)
651
652
653
654//#define ADD3SN(r,v1,v2,f1,f2) r = f2*v1 + v2
655#define ADD3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
656 LD = _mm_load##UNA1##_##SUF(v1); \
657 TMP = _mm_load##UNA2##_##SUF(v2); \
658 LD = _mm_mul_##SUF(LD, f2); \
659 TMP = _mm_add_##SUF(TMP, LD); \
660 _MM_STORE(r, TMP, SUF,)
661VKERN_TEMPL_3V_C_SIMD(do_svc_vec_add, ADD3SN_SIMD, sd, pd,
662 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
663 2, double, __m128d)
664VKERN_TEMPL_3V_C_SIMD(do_svc_vec_add, ADD3SN_SIMD, ss, ps,
665 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
666 4, float, __m128)
667
668
669//#define SUB3SN(r,v1,v2,f1,f2) r = f2*v1 - v2
670#define SUB3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
671 LD = _mm_load##UNA1##_##SUF(v1); \
672 TMP = _mm_load##UNA2##_##SUF(v2); \
673 LD = _mm_mul_##SUF(LD, f2); \
674 LD = _mm_sub_##SUF(LD, TMP); \
675 _MM_STORE(r, LD, SUF,)
676VKERN_TEMPL_3V_C_SIMD(do_svc_vec_sub, SUB3SN_SIMD, sd, pd,
677 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
678 2, double, __m128d)
679VKERN_TEMPL_3V_C_SIMD(do_svc_vec_sub, SUB3SN_SIMD, ss, ps,
680 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
681 4, float, __m128)
682
683
684
685//#define ADD3SS(r,v1,v2,f1,f2) r = f1*v1 + f2*v2
686#define ADD3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
687 LD = _mm_load##UNA1##_##SUF(v1); \
688 TMP = _mm_load##UNA2##_##SUF(v2); \
689 LD = _mm_mul_##SUF(LD, f1); \
690 TMP = _mm_mul_##SUF(TMP, f2); \
691 LD = _mm_add_##SUF(LD, TMP); \
692 _MM_STORE(r, LD, SUF,)
693VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_add, ADD3SS_SIMD, sd, pd,
694 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
695 2, double, __m128d)
696VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_add, ADD3SS_SIMD, ss, ps,
697 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
698 4, float, __m128)
699
700
701//#define SUB3SS(r,v1,v2,f1,f2) r = f1*v1 - f2*v2
702#define SUB3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
703 LD = _mm_load##UNA1##_##SUF(v1); \
704 TMP = _mm_load##UNA2##_##SUF(v2); \
705 LD = _mm_mul_##SUF(LD, f1); \
706 TMP = _mm_mul_##SUF(TMP, f2); \
707 LD = _mm_sub_##SUF(LD, TMP); \
708 _MM_STORE(r, LD, SUF,)
709VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_sub, SUB3SS_SIMD, sd, pd,
710 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
711 2, double, __m128d)
712VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_sub, SUB3SS_SIMD, ss, ps,
713 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
714 4, float, __m128)
715
716
717
718//#define ADD2SN(r,v1,f1,f2) r = f2*r + v1
719#define ADD2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
720 LD = _mm_load_##SUF(r); \
721 TMP = _mm_load##UNA1##_##SUF(v1); \
722 LD = _mm_mul_##SUF(LD, f2); \
723 TMP = _mm_add_##SUF(TMP, LD); \
724 _MM_STORE(r, TMP, SUF,)
725VKERN_TEMPL_2V_C_SIMD(do_svc_add_vec, ADD2SN_SIMD, sd, pd,
726 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
727 2, double, __m128d)
728VKERN_TEMPL_2V_C_SIMD(do_svc_add_vec, ADD2SN_SIMD, ss, ps,
729 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
730 4, float, __m128)
731
732
733//#define SUB2SN(r,v1,f1,f2) r = f2*r - v1
734#define SUB2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
735 LD = _mm_load_##SUF(r); \
736 TMP = _mm_load##UNA1##_##SUF(v1); \
737 LD = _mm_mul_##SUF(LD, f2); \
738 LD = _mm_sub_##SUF(LD, TMP); \
739 _MM_STORE(r, LD, SUF,)
740VKERN_TEMPL_2V_C_SIMD(do_svc_sub_vec, SUB2SN_SIMD, sd, pd,
741 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
742 2, double, __m128d)
743VKERN_TEMPL_2V_C_SIMD(do_svc_sub_vec, SUB2SN_SIMD, ss, ps,
744 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
745 4, float, __m128)
746
747
748//#define ADD2SS(r,v1,f1,f2) r = f1*r + f2*v1
749#define ADD2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
750 LD = _mm_load_##SUF(r); \
751 TMP = _mm_load##UNA1##_##SUF(v1); \
752 LD = _mm_mul_##SUF(LD, f1); \
753 TMP = _mm_mul_##SUF(TMP, f2); \
754 LD = _mm_add_##SUF(LD, TMP); \
755 _MM_STORE(r, LD, SUF,)
756VKERN_TEMPL_2V_CC_SIMD(do_svc_add_svc, ADD2SS_SIMD, sd, pd,
757 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
758 2, double, __m128d)
759VKERN_TEMPL_2V_CC_SIMD(do_svc_add_svc, ADD2SS_SIMD, ss, ps,
760 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
761 4, float, __m128)
762
763
764//#define SUB2SS(r,v1,f1,f2) r = f1*r - f2*v1
765#define SUB2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
766 LD = _mm_load_##SUF(r); \
767 TMP = _mm_load##UNA1##_##SUF(v1); \
768 LD = _mm_mul_##SUF(LD, f1); \
769 TMP = _mm_mul_##SUF(TMP, f2); \
770 LD = _mm_sub_##SUF(LD, TMP); \
771 _MM_STORE(r, LD, SUF,)
772VKERN_TEMPL_2V_CC_SIMD(do_svc_sub_svc, SUB2SS_SIMD, sd, pd,
773 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
774 2, double, __m128d)
775VKERN_TEMPL_2V_CC_SIMD(do_svc_sub_svc, SUB2SS_SIMD, ss, ps,
776 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
777 4, float, __m128)
778
779
780
781//#define ADD2SV(r,v1,f1,f2) r = f1*v1 + f2
782#define ADD2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
783 TMP = _mm_load##UNA1##_##SUF(v1); \
784 TMP = _mm_mul_##SUF(TMP, f1); \
785 TMP = _mm_add_##SUF(TMP, f2); \
786 _MM_STORE(r, TMP, SUF,)
787VKERN_TEMPL_2V_CC_SIMD(do_svc_val_add, ADD2SV_SIMD, sd, pd,
788 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
789 2, double, __m128d)
790VKERN_TEMPL_2V_CC_SIMD(do_svc_val_add, ADD2SV_SIMD, ss, ps,
791 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
792 4, float, __m128)
793
794
795//#define SUB2SV(r,v1,f1,f2) r = f1*v1 - f2
796#define SUB2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
797 TMP = _mm_load##UNA1##_##SUF(v1); \
798 TMP = _mm_mul_##SUF(TMP, f1); \
799 TMP = _mm_sub_##SUF(TMP, f2); \
800 _MM_STORE(r, TMP, SUF,)
801VKERN_TEMPL_2V_CC_SIMD(do_svc_val_sub, SUB2SV_SIMD, sd, pd,
802 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
803 2, double, __m128d)
804VKERN_TEMPL_2V_CC_SIMD(do_svc_val_sub, SUB2SV_SIMD, ss, ps,
805 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
806 4, float, __m128)
807
808
809
810//#define ADD1SV(r,f1,f2) r = f1*r + f2
811#define ADD1SV_SIMD(r,f1,f2,SUF) \
812 TMP = _mm_load_##SUF(r); \
813 TMP = _mm_mul_##SUF(TMP, f1); \
814 TMP = _mm_add_##SUF(TMP, f2); \
815 _MM_STORE(r, TMP, SUF,)
816VKERN_TEMPL_1V_CC_SIMD(do_svc_add_val, ADD1SV_SIMD, sd, pd,
817 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
818 2, double, __m128d)
819VKERN_TEMPL_1V_CC_SIMD(do_svc_add_val, ADD1SV_SIMD, ss, ps,
820 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
821 4, float, __m128)
822
823
824//#define SUB1SV(r,f1,f2) r = f1*r - f2
825#define SUB1SV_SIMD(r,f1,f2,SUF) \
826 TMP = _mm_load_##SUF(r); \
827 TMP = _mm_mul_##SUF(TMP, f1); \
828 TMP = _mm_sub_##SUF(TMP, f2); \
829 _MM_STORE(r, TMP, SUF,)
830VKERN_TEMPL_1V_CC_SIMD(do_svc_sub_val, SUB1SV_SIMD, sd, pd,
831 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
832 2, double, __m128d)
833VKERN_TEMPL_1V_CC_SIMD(do_svc_sub_val, SUB1SV_SIMD, ss, ps,
834 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
835 4, float, __m128)
836
837
838
839//#define ADD2VS(r,v1,f1,f2) r = f1 + f2*v1
840template <> inline void do_val_svc_add<double>(const unsigned long sz,
841 double* RESTRICT const res, const double* RESTRICT const v1,
842 LCTYPED(double) f1, LCTYPED(double) f2)
843{
844 do_svc_val_add<double>(sz, res, v1, f2, f1); // note the reverse order!
845}
846template <> inline void do_val_svc_add<float>(const unsigned long sz,
847 float* RESTRICT const res, const float* RESTRICT const v1,
848 LCTYPED(float) f1, LCTYPED(float) f2)
849{
850 do_svc_val_add<float>(sz, res, v1, f2, f1); // note the reverse order!
851}
852
854//#define SUB2VS(r,v1,f1,f2) r = f1 - f2*v1
855#define SUB2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
856 TMP = _mm_load##UNA1##_##SUF(v1); \
857 TMP = _mm_mul_##SUF(TMP, f2); \
858 TMP = _mm_sub_##SUF(f1, TMP); \
859 _MM_STORE(r, TMP, SUF,)
860VKERN_TEMPL_2V_CC_SIMD(do_val_svc_sub, SUB2VS_SIMD, sd, pd,
861 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
862 2, double, __m128d)
863VKERN_TEMPL_2V_CC_SIMD(do_val_svc_sub, SUB2VS_SIMD, ss, ps,
864 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
865 4, float, __m128)
866
867
868//#define DIV2VS(r,v1,f1,f2) r = f1 / (f2*v1)
869#define DIV2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
870 TMP = _mm_load##UNA1##_##SUF(v1); \
871 TMP = _mm_mul_##SUF(TMP, f2); \
872 TMP = _mm_div_##SUF(f1, TMP); \
873 _MM_STORE(r, TMP, SUF,)
874VKERN_TEMPL_2V_CC_SIMD(do_val_svc_div, DIV2VS_SIMD, sd, pd,
875 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
876 2, double, __m128d)
877VKERN_TEMPL_2V_CC_SIMD(do_val_svc_div, DIV2VS_SIMD, ss, ps,
878 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
879 4, float, __m128)
880
881
882/* ... */
883
884
885/* For negation, use our knowledge of the position of the sign bits */
886#ifdef HAVE_LONG_LONG
887#define NEG_DOUBLE_PREP \
888 static union _negmask { \
889 unsigned LONG_LONG lng[2]; \
890 double dbl[2]; \
891 __m128d m128d; \
892 } ALIGN(16) negmask = { {0x8000000000000000ULL, 0x8000000000000000ULL}, }; \
893 __m128d neg = _mm_load_pd(negmask.dbl)
894#else
895#define NEG_DOUBLE_PREP \
896 static union _negmask { \
897 unsigned int lng[4]; \
898 double dbl[2]; \
899 __m128d m128d; \
900 } ALIGN(16) negmask = { {0x0U, 0x80000000U, 0x0U, 0x80000000U}, }; \
901 __m128d neg = _mm_load_pd(negmask.dbl)
902#endif
903#define NEG_FLOAT_PREP \
904 static union _negmask { \
905 unsigned int itg[4]; \
906 float flt[4]; \
907 __m128 m128s; \
908 } ALIGN(16) negmask = { {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}, }; \
909 __m128 neg = _mm_load_ps(negmask.flt)
910
911/* Single-value xor intrinsics don't exist, but xor-ing the full register is harmless ... */
912#define _mm_xor_sd _mm_xor_pd
913#define _mm_xor_ss _mm_xor_ps
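
/* Illustrative sketch (not part of the library): negation via the sign bit.
 * Flipping the IEEE-754 sign bit with XOR, as NEG_DOUBLE_PREP / NEG2_SIMD do
 * for a whole register, negates every value -- including 0.0, infinities and
 * NaNs -- without raising any FP exception. */
#if 0
#include <emmintrin.h>

static __m128d negate_pd_sketch(__m128d x)
{
    const __m128d sign = _mm_set1_pd(-0.0);  /* only the sign bits set */
    return _mm_xor_pd(x, sign);              /* flips the sign of both lanes */
}
#endif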
914
916//#define NEG2(r,v1,f1,f2) r = -v1
917#define NEG2_SIMD(r,v1,f1,f2,SUF,UNA1) \
918 TMP = _mm_load##UNA1##_##SUF(v1); \
919 TMP = _mm_xor_##SUF(TMP, neg); \
920 _MM_STORE(r, TMP, SUF,)
921VKERN_TEMPL_2V_SIMD(do_vec_neg_vec, NEG2_SIMD, sd, pd,
922 NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
923 2, double, __m128d)
924VKERN_TEMPL_2V_SIMD(do_vec_neg_vec, NEG2_SIMD, ss, ps,
925 NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
926 4, float, __m128)
927
928
929//#define NEG1(r,f1,f2) r = -r
930#define NEG1_SIMD(r,f1,f2,SUF) \
931 TMP = _mm_load_##SUF(r); \
932 TMP = _mm_xor_##SUF(TMP, neg); \
933 _MM_STORE(r, TMP, SUF,)
934VKERN_TEMPL_1V_SIMD(do_vec_neg, NEG1_SIMD, sd, pd,
935 NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
936 2, double, __m128d)
937VKERN_TEMPL_1V_SIMD(do_vec_neg, NEG1_SIMD, ss, ps,
938 NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
939 4, float, __m128)
940
941
942
943//#define COMP2(r,v1,f1,f2) if (r != v1) { ++f2; break; }
944//VKERN_TEMPL_2V_T(do_vv_comp, COMP2, volatile long);
945#define VL_PREP(x) long f2 = (x)
946#define VL_FIN(x) x = f2
947#define _mm_movemask_sd(x) \
948 _mm_movemask_pd(x); rg &= 0x1
949#define _mm_movemask_ss(x) \
950 _mm_movemask_ps(x); rg &= 0x1
951#define COMP2_SIMD(r,v1,f1,f2,SUF,UNA) \
952 TMP = _mm_load_##SUF(r); \
953 LD = _mm_load_##SUF(v1); \
954 TMP = _mm_cmpneq_##SUF(TMP, LD); \
955 /* And now? movmskpd and bt? */ \
956 rg = _mm_movemask_##SUF(TMP); \
957 if (rg) { ++f2; /*fprintf(stderr, "DIFF @ %li: %i\n", sz-i, rg);*/ goto _fin; }
958VKERN_TEMPL_2V_T_SIMD_VL(do_vv_comp, COMP2_SIMD, sd, pd,
959 VL_PREP, SIMD_EMPTY0, VL_FIN,
960 2, double, __m128d)
961VKERN_TEMPL_2V_T_SIMD_VL(do_vv_comp, COMP2_SIMD, ss, ps,
962 VL_PREP, SIMD_EMPTY0, VL_FIN,
963 4, float, __m128)
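
/* Illustrative sketch (not part of the library): how one packed step of the
 * comparison kernel above detects a mismatch.  _mm_cmpneq_pd sets a lane to
 * all ones where the operands differ, and _mm_movemask_pd collapses the two
 * lane sign bits into an int, so any nonzero mask means "differs here".
 * As with the scalar !=, a NaN compares unequal to everything, itself included. */
#if 0
#include <emmintrin.h>

static int lanes_differ_sketch(const double* a, const double* b) /* both aligned */
{
    __m128d TMP = _mm_load_pd(a);
    __m128d LD  = _mm_load_pd(b);
    TMP = _mm_cmpneq_pd(TMP, LD);
    return _mm_movemask_pd(TMP);   /* bit 0: lane 0 differs, bit 1: lane 1 */
}
#endif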
964
965
966// Used in do_bdmat_vec_mult
967#define DECL_DOUBLE __m128d TM2
968#define DECL_FLOAT __m128 TM2
969
970//#define SUMMULT3(r,v1,v2,f1,f2) r += v1*v2
971#define SUMMULT3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
972 TMP = _mm_load##UNA1##_##SUF(v1); \
973 LD = _mm_load##UNA2##_##SUF(v2); \
974 TM2 = _mm_load_##SUF(r); \
975 TMP = _mm_mul_##SUF(TMP, LD); \
976 TM2 = _mm_add_##SUF(TM2, TMP); \
977 _MM_STORE(r, TM2, SUF,)
978#if 1 /* These are used in bdmat_vec_mul -- unaligned accesses are unavoidable */
979VKERN_TEMPL_3V_SIMD_UA(do_add_vec_vec_mul, SUMMULT3_SIMD, sd, pd,
980 DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
981 2, double, __m128d);
982VKERN_TEMPL_3V_SIMD_UA(do_add_vec_vec_mul, SUMMULT3_SIMD, ss, ps,
983 DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
984 4, float, __m128);
985#else
986VKERN_TEMPL_3V_SIMD(do_add_vec_vec_mul, SUMMULT3_SIMD, sd, pd,
987 DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
988 2, double, __m128d)
989VKERN_TEMPL_3V_SIMD(do_add_vec_vec_mul, SUMMULT3_SIMD, ss, ps,
990 DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
991 4, float, __m128)
992#endif
993
994//#define SUMCMULT3(r,v1,v2,f1,f2) r += CPLX__ conj(v1)*v2
995template <> inline void do_add_vec_vec_cmul<double>(const unsigned long sz,
996 double* RESTRICT const r, const double* RESTRICT const v1,
997 const double* RESTRICT const v2)
998{
999 do_add_vec_vec_mul<double>(sz, r, v1, v2);
1000}
1001template <> inline void do_add_vec_vec_cmul<float>(const unsigned long sz,
1002 float* RESTRICT const r, const float* RESTRICT const v1,
1003 const float* RESTRICT const v2)
1004{
1005 do_add_vec_vec_mul<float>(sz, r, v1, v2);
1006}
1007
1008
1030
1031#ifndef TBCI_NO_SIMD_SUM
1032
1033#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
1034# warning Info: Using unrolled SSE2 vector kernels for sums (reductions)
1035#endif
1036
1037#define SUM_DOUBLE_PREP(x) REGISTER __m128d f2 = _mm_set_sd(x)
1038#define SUM_FLOAT_PREP(x) REGISTER __m128 f2 = _mm_set_ss(x)
1039
1040#define XSUM_DOUBLE_PREP(x) \
1041 REGISTER __m128d f1 = _mm_setzero_pd();\
1042 REGISTER __m128d f2 = _mm_set_sd(x)
1043#define XSUM_FLOAT_PREP(x) \
1044 REGISTER __m128 f1 = _mm_setzero_ps(); \
1045 REGISTER __m128 f2 = _mm_set_ss(x)
1046
1051
1052#ifdef __SSE3__
1053# define SUM_DOUBLE_SIMD_FINX(f) \
1054 f = _mm_hadd_pd(f, f)
1055# define SUM_FLOAT_SIMD_FINX(f) \
1056 f = _mm_hadd_ps(f, f); \
1057 f = _mm_hadd_ps(f, f)
1058#else // __SSE3__
1059# define SUM_DOUBLE_SIMD_FINX(f) \
1060 __m128d TM##f = f; \
1061 TM##f = _mm_unpackhi_pd(TM##f, f); \
1062 f = _mm_add_sd(f, TM##f)
1063# define SUM_FLOAT_SIMD_FINX(f) \
1064 __m128 TM##f = f; \
1065 TM##f = _mm_shuffle_ps(TM##f, f, 0xb1); \
1066 f = _mm_add_ps(f, TM##f); \
1067 TM##f = f; \
1068 TM##f = _mm_shuffle_ps(TM##f, f, 0x1b); \
1069 f = _mm_add_ss(f, TM##f)
1070# if defined(__GNUC__) && defined(WARN_SSE)
1071# warning Not using SSE3 -- consider passing -msse3
1072# endif
1073#endif // __SSE3__
1074
1075#define SUM_DOUBLE_SIMD_FIN SUM_DOUBLE_SIMD_FINX(f2)
1076#define SUM_FLOAT_SIMD_FIN SUM_FLOAT_SIMD_FINX(f2)
1077
1078#define SUM_DOUBLE_FINAL(x) \
1079 _mm_store_sd(&x, f2)
1080#define SUM_FLOAT_FINAL(x) \
1081 _mm_store_ss(&x, f2)
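
/* Illustrative sketch (not part of the library): the horizontal reduction
 * performed by SUM_DOUBLE_SIMD_FINX / SUM_DOUBLE_FINAL, written out for a
 * single __m128d accumulator.  With SSE3 one hadd suffices; the SSE2
 * fallback moves the high lane down with unpackhi and adds the scalars. */
#if 0
#include <emmintrin.h>
#ifdef __SSE3__
# include <pmmintrin.h>
#endif

static double hsum_pd_sketch(__m128d acc)
{
    double out;
#ifdef __SSE3__
    acc = _mm_hadd_pd(acc, acc);             /* low lane = lane0 + lane1 */
#else
    __m128d hi = _mm_unpackhi_pd(acc, acc);  /* broadcast the high lane */
    acc = _mm_add_sd(acc, hi);               /* low lane = lane0 + lane1 */
#endif
    _mm_store_sd(&out, acc);
    return out;
}
#endif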
1082
1083
1084/* Define missing intrinsics for full register copies */
1085#define _mm_move_ps(f, x) x
1086#define _mm_move_pd(f, x) x
1087
1093
1094/* We don't need to save the upper values of f2 any more,
1095 * as the SISD (sd,ss) loop tails now preserve them */
1096#define XSUM_DOUBLE_SIMD_FIN_STORE \
1097 /*double hif1, hif2;*/ \
1098 /*_mm_storeh_pd(&hif1, f1);*/ \
1099 /*_mm_storeh_pd(&hif2, f2)*/ \
1100 do {} while(0)
1101#define XSUM_FLOAT_SIMD_FIN_STORE \
1102 /*float hif1[4], hif2[4];*/ \
1103 /*_mm_store_ps(hif1, f1);*/ \
1104 /*_mm_store_ps(hif2, f2)*/ \
1105 do {} while(0)
1106
1107/* Do the horizontal sums and the final application of the correction */
1108
1109#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE(x) \
1110 /*f2 = _mm_loadh_pd(f2, &hif2);*/ \
1111 /*f1 = _mm_loadh_pd(f1, &hif1);*/ \
1112 SUM_DOUBLE_SIMD_FINX(f2); \
1113 SUM_DOUBLE_SIMD_FINX(f1); \
1114 f2 = _mm_sub_sd(f2, f1); \
1115 _mm_store_sd(&x, f2)
1116#define XSUM_FLOAT_SIMD_FINAL_COMPLETE(x) \
1117 /*_mm_store_ss(hif2, f2);*/ \
1118 /*_mm_store_ss(hif1, f1);*/ \
1119 /*f2 = _mm_load_ps(hif2);*/ \
1120 /*f1 = _mm_load_ps(hif1);*/ \
1121 SUM_FLOAT_SIMD_FINX(f2); \
1122 SUM_FLOAT_SIMD_FINX(f1); \
1123 f2 = _mm_sub_ss(f2, f1); \
1124 _mm_store_ss(&x, f2)
1125
1126/* Variant with compensation for lost bits in hadd_pd */
1127#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X(x) \
1128 /*f2 = _mm_loadh_pd(f2, &hif2);*/ \
1129 /*f1 = _mm_loadh_pd(f1, &hif1);*/ \
1130 __m128d TMP = f2; \
1131 SUM_DOUBLE_SIMD_FINX(f2); \
1132 __m128d COR = f2; \
1133 COR = _mm_sub_sd(COR, TMP); \
1134 TMP = _mm_unpackhi_pd(TMP, TMP); \
1135 COR = _mm_sub_sd(COR, TMP); \
1136 f1 = _mm_add_sd(f1, COR); \
1137 SUM_DOUBLE_SIMD_FINX(f1); \
1138 f2 = _mm_sub_sd(f2, f1); \
1139 _mm_store_sd(&x, f2)
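
/* Illustrative scalar sketch (not part of the library) of the extra
 * compensation in XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X: folding the two lanes
 * of the sum register into one scalar can itself lose bits, so the rounding
 * error of that horizontal add is recovered (same formula as the macro uses)
 * and pushed into the error accumulator before the final correction. */
#if 0
static double fold_lanes_compensated_sketch(double sum0, double sum1,
                                            double err0, double err1)
{
    double s   = sum0 + sum1;          /* horizontal add of the sum lanes */
    double cor = (s - sum0) - sum1;    /* its rounding error */
    double e   = (err0 + cor) + err1;  /* fold it into the error lanes */
    return s - e;                      /* corrected result */
}
#endif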
1140
1141/* TODO: Variant for floats with compensation for lost bits in hadd_ps */
1142
1143
1144
1146// #define MULT2(r,v1,f1,f2) f2 += r * v1
1147#define MULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
1148 TMP = _mm_load_##SUF(r); \
1149 LD = _mm_load##UNA1##_##SUF(v1); \
1150 TMP = _mm_mul_##SUF(TMP, LD); \
1151 f2 = _mm_add_##SUF(f2, TMP)
1152VKERN_TEMPL_2V_T_SIMD(do_vec_mult_quick, MULT2_SIMD, sd, pd,
1153 SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
1154 2, double, __m128d)
1155VKERN_TEMPL_2V_T_SIMD(do_vec_mult_quick, MULT2_SIMD, ss, ps,
1156 SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
1157 4, float, __m128)
1158
1159//do_vec_mult_exact
1160#define XMULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
1161 TMP = _mm_load_##SUF(r); \
1162 LD = _mm_load##UNA1##_##SUF(v1); \
1163 TMP = _mm_mul_##SUF(TMP, LD); \
1164 LD = _mm_move_##SUF(LD, TMP); \
1165 TMP = _mm_add_##SUF(TMP, f2); \
1166 t = TMP; \
1167 TMP = _mm_sub_##SUF(TMP, f2); \
1168 TMP = _mm_sub_##SUF(TMP, LD); \
1169 f1 = _mm_add_##SUF(f1, TMP); \
1170 f2 = _mm_move_##SUF(f2, t)
1171VKERN_TEMPL_2V_T_SIMD(do_vec_mult_exact, XMULT2_SIMD, sd, pd,
1172 XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
1173 XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,
1174 2, double, __m128d)
1175VKERN_TEMPL_2V_T_SIMD(do_vec_mult_exact, XMULT2_SIMD, ss, ps,
1176 XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
1177 XSUM_FLOAT_SIMD_FINAL_COMPLETE,
1178 4, float, __m128)
1179
1180
1181template <> inline void do_vec_dot_exact<double>(const unsigned long sz,
1182 const double * RESTRICT const _v1, const double * RESTRICT const _v2,
1183 double& _f2)
1184{
1185 do_vec_mult_exact<double>(sz, _v1, _v2, _f2);
1186}
1187
1188template <> inline void do_vec_dot_quick<double>(const unsigned long sz,
1189 const double * RESTRICT const _v1, const double * RESTRICT const _v2,
1190 double& _f2)
1191{
1192 do_vec_mult_quick<double>(sz, _v1, _v2, _f2);
1193}
1194
1195template <> inline void do_vec_dot_exact<float>(const unsigned long sz,
1196 const float * RESTRICT const _v1, const float * RESTRICT const _v2,
1197 float& _f2)
1198{
1199 do_vec_mult_exact<float>(sz, _v1, _v2, _f2);
1200}
1201
1202template <> inline void do_vec_dot_quick<float>(const unsigned long sz,
1203 const float * RESTRICT const _v1, const float * RESTRICT const _v2,
1204 float& _f2)
1205{
1206 do_vec_mult_quick<float>(sz, _v1, _v2, _f2);
1207}
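
/* Hedged usage sketch (an assumption about alignment, not a documented
 * contract): the dot-product kernels accumulate onto the initial value of
 * the scalar argument, and the aligned loads in MULT2_SIMD expect suitably
 * aligned input vectors. */
#if 0
static void dot_usage_sketch(const unsigned long n,
                             const double* RESTRICT x,
                             const double* RESTRICT y)
{
    double dot = 0.0;
    do_vec_dot_quick<double>(n, x, y, dot);   /* dot += sum_i x[i]*y[i] */
}
#endif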
1208
1210VKERN_TEMPL_2V_T(do_vec_mult_unaligned_exact, XMULT2, T)
1211VKERN_TEMPL_2V_T(do_vec_mult_unaligned_quick, MULT2, T)
1212
1213// TODO: Implement do_vec_sumsqr_exact
1214
1215
1216// #define SQR1(r,f1,f2) f2 += r*r
1217#define SQR1_SIMD(r,f1,f2,SUF) \
1218 TMP = _mm_load_##SUF(r); \
1219 TMP = _mm_mul_##SUF(TMP, TMP); \
1220 f2 = _mm_add_##SUF(f2, TMP)
1221
1222VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_quick, SQR1_SIMD, sd, pd,
1223 SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
1224 2, double, __m128d)
1225VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_quick, SQR1_SIMD, ss, ps,
1226 SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
1227 4, float, __m128)
1228
1229//do_vec_sumsqr_exact
1230#define XSQR1_SIMD(r,f1,f2,SUF) \
1231 TMP = _mm_load_##SUF(r); \
1232 TMP = _mm_mul_##SUF(TMP, TMP); \
1233 y = TMP; \
1234 TMP = _mm_add_##SUF(TMP, f2); \
1235 t = TMP; \
1236 TMP = _mm_sub_##SUF(TMP, f2); \
1237 TMP = _mm_sub_##SUF(TMP, y); \
1238 f1 = _mm_add_##SUF(f1, TMP); \
1239 f2 = _mm_move_##SUF(f2, t)
1240VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_exact, XSQR1_SIMD, sd, pd,
1241 XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
1242 XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,
1243 2, double, __m128d)
1244VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_exact, XSQR1_SIMD, ss, ps,
1245 XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
1246 XSUM_FLOAT_SIMD_FINAL_COMPLETE,
1247 4, float, __m128)
1248
1249
1250#ifndef TBCI_NO_SIMD_FABSSQR
1251template <> inline void do_vec_fabssqr_quick<double>(const unsigned long sz,
1252 const double * const _v1, double& _f2)
1253{
1254 double F2 = _f2;
1255 do_vec_sumsqr_quick<double>(sz, _v1, F2);
1256 _f2 = F2;
1257}
1258template <> inline void do_vec_fabssqr_exact<double>(const unsigned long sz,
1259 const double * const _v1, double& _f2)
1260{
1261 double F2 = _f2;
1262 do_vec_sumsqr_exact<double>(sz, _v1, F2);
1263 _f2 = F2;
1264}
1265#endif // TBCI_NO_SIMD_FABSSQR
1266#ifdef TBCI_SIMD_FABSSQR_FLOAT // The loss of precision with float is unbearable
1267template <> inline void do_vec_fabssqr_quick<float>(const unsigned long sz,
1268 const float * const _v1, double& _f2)
1269{
1270 float F2 = _f2;
1271 do_vec_sumsqr_quick<float>(sz, _v1, F2);
1272 _f2 = F2;
1273}
1274template <> inline void do_vec_fabssqr_exact<float>(const unsigned long sz,
1275 const float * const _v1, double& _f2)
1276{
1277 float F2 = _f2;
1278 do_vec_sumsqr_exact<float>(sz, _v1, F2);
1279 _f2 = F2;
1280}
1281#endif // TBCI_SIMD_FABSSQR_FLOAT
1282
1284//#define SUM1(r,f1,f2) f2 += r
1285#define SUM1_SIMD(r,f1,f2,SUF) \
1286 TMP = _mm_load_##SUF(r); \
1287 f2 = _mm_add_##SUF(f2, TMP)
1288VKERN_TEMPL_1V_T_SIMD(do_vec_sum_quick, SUM1_SIMD, sd, pd,
1289 SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
1290 2, double, __m128d)
1291VKERN_TEMPL_1V_T_SIMD(do_vec_sum_quick, SUM1_SIMD, ss, ps,
1292 SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
1293 4, float, __m128)
1294
1295//#define XSUM1(r,f1,f2) { T t = f2+r; f1 += (t-f2)-r; f2 = t; }
1297#define XSUM1_SIMD(r,f1,f2,SUF) \
1298 y = _mm_load_##SUF(r); \
1299 t = _mm_add_##SUF(f2, y); \
1300 TMP = _mm_sub_##SUF(t, f2); \
1301 TMP = _mm_sub_##SUF(TMP, y); \
1302 f1 = _mm_add_##SUF(f1, TMP); \
1303 f2 = _mm_move_##SUF(f2, t)
1304VKERN_TEMPL_1V_T_SIMD(do_vec_sum_exact, XSUM1_SIMD, sd, pd,
1305 XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
1306 XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X,
1307 2, double, __m128d)
1308VKERN_TEMPL_1V_T_SIMD(do_vec_sum_exact, XSUM1_SIMD, ss, ps,
1309 XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
1310 XSUM_FLOAT_SIMD_FINAL_COMPLETE,
1311 4, float, __m128)
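
/* Illustrative scalar sketch (not part of the library) of the compensated
 * summation that XSUM1_SIMD performs in each SIMD lane: f2 carries the
 * running sum, f1 the accumulated rounding error, and the final result is
 * f2 - f1 (see XSUM_*_SIMD_FINAL_COMPLETE above). */
#if 0
static double sum_exact_sketch(const unsigned long sz, const double* v)
{
    double f2 = 0.0, f1 = 0.0;
    for (unsigned long i = 0; i < sz; ++i) {
        const double r = v[i];
        const double t = f2 + r;
        f1 += (t - f2) - r;    /* rounding error of this addition */
        f2 = t;
    }
    return f2 - f1;            /* apply the correction */
}
#endif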
1312
1313#endif // TBCI_NO_SIMD_SUM
1314
1316
1317#endif // TBCI_SELECTIVE_INST
1318
1319#endif // __SSE2__
1320
1321#endif // H_VEC_KERN_SPECIAL_H