TBCI Numerical high perf. C++ Library 2.8.0
vec_kern_special2.h
1
8
9#ifndef H_VEC_KERN_SPECIAL2_H
10#define H_VEC_KERN_SPECIAL2_H
11
63
64#if defined(__AVX__) && defined(HAVE_IMMINTRIN_H) && defined(HAVE_WEAK_ATTR) && \
65 ( defined(__x86_64__) || defined(__i386__) )
66
67#include <immintrin.h>
68
69#include "tbci/unroll_prefetch_simd_def.h"
70
71/* TODO: Add define, controlling instantiation */
72
73#if 0 //defined(TBCI_SELECTIVE_INST) && !defined(TBCI_INSTANTIATE) && !defined(AUTO_DECL)
74# include "vec_kern_special2_gd.h"
75#else
76
77#ifdef WARN_SSE
78# define WARN_SIMD
79#endif
80
82
83#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SIMD)
84# warning Info: Using unrolled AVX vector kernels
85#endif
86
87// TODO: Are integers useful as well?
88// Unfortunately, the integer SSE/AVX intrinsics follow a slightly
89// different naming scheme, so some manual work would be required.
90// Maybe later ...
91
92#ifdef AVX512
93# define __MAVXD __m512d
94# define __MAVXS __m512
95# define _MMAVX(x) _mm512_##x
96# define AVXMD 8
97# define AVXMS 16
98#else
99# define __MAVXD __m256d
100# define __MAVXS __m256
101# define _MMAVX(x) _mm256_##x
102# define AVXMD 4
103# define AVXMS 8
104#endif
105
106
107#define SIMD_EMPTY0 do {} while (0)
108#define SIMD_EMPTY1(x) do {} while (0)
109#define SIMD_EMPTY2(x,y) do {} while (0)
110
111// Note: The
112#define SIMD_CONST_DOUBLE_PREP(x) REGISTER __MAVXD f2 = _MMAVX(set1_pd(x))
113#define SIMD_2CONST_DOUBLE_PREP(x,y) REGISTER __MAVXD f1 = _MMAVX(set1_pd(x)), f2 = _MMAVX(set1_pd(y))
114
115#define SIMD_CONST_FLOAT_PREP(x) REGISTER __MAVXS f2 = _MMAVX(set1_ps(x))
116#define SIMD_2CONST_FLOAT_PREP(x,y) REGISTER __MAVXS f1 = _MMAVX(set1_ps(x)), f2 = _MMAVX(set1_ps(y))
117
118
119/* _mm256_load_sd, _ss do not exist */
120#define _mm256_set_sd(d) (__extension__ (__m256d){ d, 0.0, 0.0, 0.0 })
121#define _mm256_set_ss(f) (__extension__ (__m256 ){ f, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 })
122#define _mm256_load_sd(m) (__extension__ (__m256d){ *m, 0.0, 0.0, 0.0 })
123#define _mm256_loadu_sd(m) (__extension__ (__m256d_u){ *m, 0.0, 0.0, 0.0 })
124#define _mm256_load_ss(m) (__extension__ (__m256 ){ *m, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 })
125#define _mm256_loadu_ss(m) (__extension__ (__m256_u ){ *m, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 })
126#define _mm256_store_sd(m, r) *m = ((__v4df)r)[0]
127#define _mm256_store_ss(m, r) *m = ((__v8sf)r)[0]
128
129/* First the stuff from basics.h */
130
131/* Found a compiler bug in gcc 4.0.0 (PR 21239) */
132#if defined(__GNUC__) && __GNUC__ == 4 && \
133 __GNUC_MINOR__ == 0 && \
134 (! defined(__GNUC_PATCHLEVEL__) || __GNUC_PATCHLEVEL__ == 0)
135# define _MM_STORE(mem, reg, SUF, UNA) \
136 asm(""::"x"(reg)); \
137 _MMAVX(store##UNA##_##SUF(mem, reg))
138#else
139# define _MM_STORE(mem, reg, SUF, UNA) \
140 _MMAVX(store##UNA##_##SUF(mem, reg))
141#endif
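/* A rough sketch of what _MM_STORE(r, TMP, pd,) turns into on the affected
 * gcc 4.0.0 (illustration only; the empty asm presumably forces TMP to be
 * materialized in a register before the store, sidestepping the
 * miscompilation noted above):
 *
 *   asm("" :: "x"(TMP));        // dummy use of TMP, no code emitted
 *   _mm256_store_pd(r, TMP);    // the normal aligned store otherwise
 */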
142
143/* General policy: We'll always ensure alignment of the result
144 * vector, so no need to use unaligned insns for it. */
145
146#ifndef C_MEMALLOC
148/* Template decl should be in basics.h */
149//# define COPY2(res,v1,f1,f2) res = v1
150#define COPY2_SIMD(r,v1,f1,f2,SUF,UNA1) \
151 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
152 _MM_STORE(r, TMP, SUF,)
153VKERN_TEMPL_2V_SIMD(_tbci_copy, COPY2_SIMD, sd, pd,
154 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
155 AVXMD, double, __MAVXD)
156VKERN_TEMPL_2V_SIMD(_tbci_copy, COPY2_SIMD, ss, ps,
157 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
158 AVXMS, float, __MAVXS)
159/* We could use more SIMD registers here ... but the CPU-internal
160 * register renaming should hide the latencies even without doing so. */
161
162/* TODO: Benchmark copy and fill ... */
163/* TODO: Avoid gcc unrolling this even more ... */
164/* TODO: Here integers and pointers certainly make sense */
165
166
167//# define FILL1(res,f1,f2) res = f2
168#define FILL1_SIMD(r,f1,f2,SUF) \
169 _MM_STORE(r, f2, SUF,)
170VKERN_TEMPL_1V_C_SIMD(_tbci_fill, FILL1_SIMD, sd, pd,
171 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
172 AVXMD, double, __MAVXD)
173VKERN_TEMPL_1V_C_SIMD(_tbci_fill, FILL1_SIMD, ss, ps,
174 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
175 AVXMS, float, __MAVXS)
176/* TODO: Here integers and pointers certainly make sense */
177#endif
178
179// TODO: Should we rather use _mm_add_sd, _mm_sub_sd
180#define _mm256_add_sd _mm256_add_pd
181#define _mm256_add_ss _mm256_add_ps
182#define _mm256_sub_sd _mm256_sub_pd
183#define _mm256_sub_ss _mm256_sub_ps
184
185#define _mm256_mul_sd _mm256_mul_pd
186#define _mm256_mul_ss _mm256_mul_ps
187#define _mm256_div_sd _mm256_div_pd
188#define _mm256_div_ss _mm256_div_ps
189
190#define _mm512_add_sd _mm512_add_pd
191#define _mm512_add_ss _mm512_add_ps
192#define _mm512_sub_sd _mm512_sub_pd
193#define _mm512_sub_ss _mm512_sub_ps
194
195#define _mm512_mul_sd _mm512_mul_pd
196#define _mm512_mul_ss _mm512_mul_ps
197#define _mm512_div_sd _mm512_div_pd
198#define _mm512_div_ss _mm512_div_ps
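/* Why aliasing the sd/ss suffixes to the packed operations is safe: each
 * kernel macro below is expanded both for the unrolled vector body and for
 * the scalar loop tail, differing only in the SUF token. Illustrative
 * expansion of ADD3_SIMD (defined below):
 *
 *   // vector body, SUF = pd: full-width add, stores 4 doubles
 *   TMP = _mm256_load_pd(v1);  LD = _mm256_load_pd(v2);
 *   TMP = _mm256_add_pd(TMP, LD);  _mm256_store_pd(r, TMP);
 *
 *   // loop tail, SUF = sd: _mm256_add_sd is just add_pd, and
 *   // _mm256_store_sd writes only lane 0, so the upper lanes don't matter
 *   TMP = _mm256_load_sd(v1);  LD = _mm256_load_sd(v2);
 *   TMP = _mm256_add_sd(TMP, LD);  _mm256_store_sd(r, TMP);
 */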
199
200
201/* 3Vec operations */
203//#define ADD3(r,v1,v2,f1,f2) r = v1 + v2
204#define ADD3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
205 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
206 LD = _MMAVX(load##UNA2##_##SUF(v2)); \
207 TMP = _MMAVX(add_##SUF(TMP, LD)); \
208 _MM_STORE(r, TMP, SUF,)
209VKERN_TEMPL_3V_SIMD(do_vec_vec_add, ADD3_SIMD, sd, pd,
210 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
211 AVXMD, double, __MAVXD)
212VKERN_TEMPL_3V_SIMD(do_vec_vec_add, ADD3_SIMD, ss, ps,
213 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
214 AVXMS, float, __MAVXS)
215
216
217//#define SUB3(r,v1,v2,f1,f2) r = v1 - v2
218#define SUB3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
219 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
220 LD = _MMAVX(load##UNA2##_##SUF(v2)); \
221 TMP = _MMAVX(sub_##SUF(TMP, LD)); \
222 _MM_STORE(r, TMP, SUF,)
223VKERN_TEMPL_3V_SIMD(do_vec_vec_sub, SUB3_SIMD, sd, pd,
224 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
225 AVXMD, double, __MAVXD)
226VKERN_TEMPL_3V_SIMD(do_vec_vec_sub, SUB3_SIMD, ss, ps,
227 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
228 AVXMS, float, __MAVXS)
229
230
231//#define MUL3(r,v1,v2,f1,f2) r = v1 * v2
232#define MUL3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
233 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
234 LD = _MMAVX(load##UNA2##_##SUF(v2)); \
235 TMP = _MMAVX(mul_##SUF(TMP, LD)); \
236 _MM_STORE(r, TMP, SUF,)
237VKERN_TEMPL_3V_SIMD(do_vec_vec_mul, MUL3_SIMD, sd, pd,
238 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
239 AVXMD, double, __MAVXD)
240VKERN_TEMPL_3V_SIMD(do_vec_vec_mul, MUL3_SIMD, ss, ps,
241 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
242 AVXMS, float, __MAVXS)
243
244template <> inline void do_vec_vec_cmul<double>(const unsigned long sz,
245 double* RESTRICT const res, const double* RESTRICT const v1,
246 const double* RESTRICT const v2)
247{
248 do_vec_vec_mul<double>(sz, res, v1, v2);
249}
250template <> inline void do_vec_vec_cmul<float>(const unsigned long sz,
251 float* RESTRICT const res, const float* RESTRICT const v1,
252 const float* RESTRICT const v2)
253{
254 do_vec_vec_mul<float>(sz, res, v1, v2);
255}
256
258//#define DIV3(r,v1,v2,f1,f2) r = v1 / v2
259#define DIV3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
260 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
261 LD = _MMAVX(load##UNA2##_##SUF(v2)); \
262 TMP = _MMAVX(div_##SUF(TMP, LD)); \
263 _MM_STORE(r, TMP, SUF,)
264VKERN_TEMPL_3V_SIMD(do_vec_vec_div, DIV3_SIMD, sd, pd,
265 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
266 AVXMD, double, __MAVXD)
267VKERN_TEMPL_3V_SIMD(do_vec_vec_div, DIV3_SIMD, ss, ps,
268 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
269 AVXMS, float, __MAVXS)
270
271template <> inline void do_vec_vec_cdiv<double>(const unsigned long sz,
272 double* RESTRICT const res, const double* RESTRICT const v1,
273 const double* RESTRICT const v2)
274{
275 do_vec_vec_div<double>(sz, res, v1, v2);
276}
277template <> inline void do_vec_vec_cdiv<float>(const unsigned long sz,
278 float* RESTRICT const res, const float* RESTRICT const v1,
279 const float* RESTRICT const v2)
280{
281 do_vec_vec_div<float>(sz, res, v1, v2);
282}
283
284
286//#define ADD2(r,v1,f1,f2) r += v1
287#define ADD2_SIMD(r,v1,f1,f2,SUF,UNA1) \
288 TMP = _MMAVX(load_##SUF(r)); \
289 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
290 TMP = _MMAVX(add_##SUF(TMP, LD)); \
291 _MM_STORE(r, TMP, SUF,)
292VKERN_TEMPL_2V_SIMD(do_vec_add_vec, ADD2_SIMD, sd, pd,
293 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
294 AVXMD, double, __MAVXD)
295VKERN_TEMPL_2V_SIMD(do_vec_add_vec, ADD2_SIMD, ss, ps,
296 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
297 AVXMS, float, __MAVXS)
298
299
300//#define SUB2(r,v1,f1,f2) r -= v1
301#define SUB2_SIMD(r,v1,f1,f2,SUF,UNA1) \
302 TMP = _MMAVX(load_##SUF(r)); \
303 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
304 TMP = _MMAVX(sub_##SUF(TMP, LD)); \
305 _MM_STORE(r, TMP, SUF,)
306VKERN_TEMPL_2V_SIMD(do_vec_sub_vec, SUB2_SIMD, sd, pd,
307 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
308 AVXMD, double, __MAVXD)
309VKERN_TEMPL_2V_SIMD(do_vec_sub_vec, SUB2_SIMD, ss, ps,
310 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
311 AVXMS, float, __MAVXS)
312
313
314//#define SUB2I(r,v1,f1,f2) r = v1 - r
315#define SUB2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
316 TMP = _MMAVX(load_##SUF(r)); \
317 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
318 LD = _MMAVX(sub_##SUF(LD, TMP)); \
319 _MM_STORE(r, LD, SUF,)
320VKERN_TEMPL_2V_SIMD(do_vec_sub_vec_inv, SUB2I_SIMD, sd, pd,
321 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
322 AVXMD, double, __MAVXD)
323VKERN_TEMPL_2V_SIMD(do_vec_sub_vec_inv, SUB2I_SIMD, ss, ps,
324 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
325 AVXMS, float, __MAVXS)
326
327
328//#define MUL2(r,v1,f1,f2) r *= v1
329#define MUL2_SIMD(r,v1,f1,f2,SUF,UNA1) \
330 TMP = _MMAVX(load_##SUF(r)); \
331 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
332 TMP = _MMAVX(mul_##SUF(TMP, LD)); \
333 _MM_STORE(r, TMP, SUF,)
334VKERN_TEMPL_2V_SIMD(do_vec_mul_vec, MUL2_SIMD, sd, pd,
335 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
336 AVXMD, double, __MAVXD)
337VKERN_TEMPL_2V_SIMD(do_vec_mul_vec, MUL2_SIMD, ss, ps,
338 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
339 AVXMS, float, __MAVXS)
340
341
342//#define CMUL2(r,v1,f1,f2) r = CPLX__ conj(r) * v1
343template <> inline void do_vec_cmul_vec<double>(const unsigned long sz,
344 double* RESTRICT const res, const double* RESTRICT const v1)
345{
346 do_vec_mul_vec<double>(sz, res, v1);
347}
348template <> inline void do_vec_cmul_vec<float>(const unsigned long sz,
349 float* RESTRICT const res, const float* RESTRICT const v1)
350{
351 do_vec_mul_vec<float>(sz, res, v1);
352}
354//#define CMUL2I(r,v1,f1,f2) r *= CPLX__ conj(v1)
355template <> inline void do_vec_cmul_vec_inv<double>(const unsigned long sz,
356 double* RESTRICT const res, const double* RESTRICT const v1)
357{
358 do_vec_mul_vec<double>(sz, res, v1);
359}
360template <> inline void do_vec_cmul_vec_inv<float>(const unsigned long sz,
361 float* RESTRICT const res, const float* RESTRICT const v1)
362{
363 do_vec_mul_vec<float>(sz, res, v1);
364}
365
367//#define DIV2(r,v1,f1,f2) r /= v1
368#define DIV2_SIMD(r,v1,f1,f2,SUF,UNA1) \
369 TMP = _MMAVX(load_##SUF(r)); \
370 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
371 TMP = _MMAVX(div_##SUF(TMP, LD)); \
372 _MM_STORE(r, TMP, SUF,)
373VKERN_TEMPL_2V_SIMD(do_vec_div_vec, DIV2_SIMD, sd, pd,
374 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
375 AVXMD, double, __MAVXD)
376VKERN_TEMPL_2V_SIMD(do_vec_div_vec, DIV2_SIMD, ss, ps,
377 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
378 AVXMS, float, __MAVXS)
379
380
381//#define DIV2I(r,v1,f1,f2) r = v1 / r
382#define DIV2I_SIMD(r,v1,f1,f2,SUF,UNA1) \
383 TMP = _MMAVX(load_##SUF(r)); \
384 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
385 LD = _MMAVX(div_##SUF(LD, TMP)); \
386 _MM_STORE(r, LD, SUF,)
387VKERN_TEMPL_2V_SIMD(do_vec_div_vec_inv, DIV2I_SIMD, sd, pd,
388 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
389 AVXMD, double, __MAVXD)
390VKERN_TEMPL_2V_SIMD(do_vec_div_vec_inv, DIV2I_SIMD, ss, ps,
391 SIMD_EMPTY0, SIMD_EMPTY0, SIMD_EMPTY0,
392 AVXMS, float, __MAVXS)
393
394
395//#define CDIV2(r,v1,f1,f2) r = CPLX__ conj(r) / v1
396template <> inline void do_vec_cdiv_vec<double>(const unsigned long sz,
397 double* RESTRICT const res, const double* RESTRICT const v1)
398{
399 do_vec_div_vec<double>(sz, res, v1);
400}
401template <> inline void do_vec_cdiv_vec<float>(const unsigned long sz,
402 float* RESTRICT const res, const float* RESTRICT const v1)
403{
404 do_vec_div_vec<float>(sz, res, v1);
405}
406
407
409//#define CDIV2I(r,v1,f1,f2) r = CPLX__ conj(v1) / r
410template <> inline void do_vec_cdiv_vec_inv<double>(const unsigned long sz,
411 double* RESTRICT const res, const double* RESTRICT const v1)
412{
413 do_vec_div_vec_inv<double>(sz, res, v1);
414}
415template <> inline void do_vec_cdiv_vec_inv<float>(const unsigned long sz,
416 float* RESTRICT const res, const float* RESTRICT const v1)
417{
418 do_vec_div_vec_inv<float>(sz, res, v1);
419}
420
422//#define ADD2NV(r,v1,f1,f2) r = v1 + f2
423#define ADD2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
424 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
425 TMP = _MMAVX(add_##SUF(TMP, f2)); \
426 _MM_STORE(r, TMP, SUF,)
427VKERN_TEMPL_2V_C_SIMD(do_vec_val_add, ADD2NV_SIMD, sd, pd,
428 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
429 AVXMD, double, __MAVXD)
430VKERN_TEMPL_2V_C_SIMD(do_vec_val_add, ADD2NV_SIMD, ss, ps,
431 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
432 AVXMS, float, __MAVXS)
433
434
435//#define SUB2NV(r,v1,f1,f2) r = v1 - f2
436#define SUB2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
437 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
438 TMP = _MMAVX(sub_##SUF(TMP, f2)); \
439 _MM_STORE(r, TMP, SUF,)
440VKERN_TEMPL_2V_C_SIMD(do_vec_val_sub, SUB2NV_SIMD, sd, pd,
441 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
442 AVXMD, double, __MAVXD)
443VKERN_TEMPL_2V_C_SIMD(do_vec_val_sub, SUB2NV_SIMD, ss, ps,
444 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
445 AVXMS, float, __MAVXS)
446
447
448//#define MUL2NV(r,v1,f1,f2) r = v1 * f2
449//VKERN_TEMPL_2V_C(do_vec_val_mul, MUL2NV);
450#define MUL2NV_SIMD(r,v1,f1,f2,SUF,UNA1) \
451 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
452 TMP = _MMAVX(mul_##SUF(TMP, f2)); \
453 _MM_STORE(r, TMP, SUF,)
454VKERN_TEMPL_2V_C_SIMD(do_vec_val_mul, MUL2NV_SIMD, sd, pd,
455 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
456 AVXMD, double, __MAVXD)
457VKERN_TEMPL_2V_C_SIMD(do_vec_val_mul, MUL2NV_SIMD, ss, ps,
458 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
459 AVXMS, float, __MAVXS)
460
461
462
463//#define ADD2RV(r,v1,f1,f2) r = f2 + v1
464template <> inline void do_val_vec_add<double>(const unsigned long sz,
465 double* RESTRICT const res, const double* RESTRICT const v1,
466 LCTYPED(double) _f2)
467{
468 do_vec_val_add<double>(sz, res, v1, _f2);
469}
470template <> inline void do_val_vec_add<float>(const unsigned long sz,
471 float* RESTRICT const res, const float* RESTRICT const v1,
472 LCTYPED(float) _f2)
473{
474 do_vec_val_add<float>(sz, res, v1, _f2);
475}
476
478//#define SUB2RV(r,v1,f1,f2) r = f2 - v1
479#define SUB2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
480 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
481 TMP = _MMAVX(sub_##SUF(f2, TMP)); \
482 _MM_STORE(r, TMP, SUF,)
483VKERN_TEMPL_2V_C_SIMD(do_val_vec_sub, SUB2RV_SIMD, sd, pd,
484 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
485 AVXMD, double, __MAVXD)
486VKERN_TEMPL_2V_C_SIMD(do_val_vec_sub, SUB2RV_SIMD, ss, ps,
487 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
488 AVXMS, float, __MAVXS)
489
490
491//#define MUL2RV(r,v1,f1,f2) r = f2 * v1
492template <> inline void do_val_vec_mul<double>(const unsigned long sz,
493 double* RESTRICT const res, const double* RESTRICT const v1,
494 LCTYPED(double) _f2)
495{
496 do_vec_val_mul<double>(sz, res, v1, _f2);
497}
498template <> inline void do_val_vec_mul<float>(const unsigned long sz,
499 float* RESTRICT const res, const float* RESTRICT const v1,
500 LCTYPED(float) _f2)
501{
502 do_vec_val_mul<float>(sz, res, v1, _f2);
503}
504
506//#define DIV2RV(r,v1,f1,f2) r = f2 / v1
507#define DIV2RV_SIMD(r,v1,f1,f2,SUF,UNA1) \
508 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
509 TMP = _MMAVX(div_##SUF(f2, TMP)); \
510 _MM_STORE(r, TMP, SUF,)
511VKERN_TEMPL_2V_C_SIMD(do_val_vec_div, DIV2RV_SIMD, sd, pd,
512 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
513 AVXMD, double, __MAVXD)
514VKERN_TEMPL_2V_C_SIMD(do_val_vec_div, DIV2RV_SIMD, ss, ps,
515 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
516 AVXMS, float, __MAVXS)
517
518/* The single-vector operations don't need unaligned versions,
519 * as the loop preamble can always bring us to an aligned
520 * position; a sketch of that peeling idea follows. */
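/* A minimal sketch of the alignment peeling mentioned above (the real loop
 * lives in the VKERN_* templates from tbci/unroll_prefetch_simd_def.h; the
 * names and the "r += f2" operation here are purely illustrative, and
 * <cstdint> is assumed for uintptr_t):
 *
 *   __m256d vf2 = _mm256_set1_pd(f2);
 *   // scalar preamble: advance until r is 32-byte aligned
 *   while (sz && (reinterpret_cast<uintptr_t>(r) & 31u) != 0) {
 *       *r++ += f2;  --sz;
 *   }
 *   // aligned AVX body, 4 doubles per iteration
 *   for (; sz >= 4; sz -= 4, r += 4) {
 *       __m256d t = _mm256_load_pd(r);
 *       t = _mm256_add_pd(t, vf2);
 *       _mm256_store_pd(r, t);
 *   }
 *   // the remaining sz < 4 elements are handled by a scalar tail
 */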
521
522
523//#define ADD1NV(r,f1,f2) r += f2
524#define ADD1NV_SIMD(r,f1,f2,SUF) \
525 TMP = _MMAVX(load_##SUF(r)); \
526 TMP = _MMAVX(add_##SUF(TMP, f2)); \
527 _MM_STORE(r, TMP, SUF,)
528VKERN_TEMPL_1V_C_SIMD(do_vec_add_val, ADD1NV_SIMD, sd, pd,
529 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
530 AVXMD, double, __MAVXD)
531VKERN_TEMPL_1V_C_SIMD(do_vec_add_val, ADD1NV_SIMD, ss, ps,
532 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
533 AVXMS, float, __MAVXS)
534
535
536//#define SUB1NV(r,f1,f2) r -= f2
537#define SUB1NV_SIMD(r,f1,f2,SUF) \
538 TMP = _MMAVX(load_##SUF(r)); \
539 TMP = _MMAVX(sub_##SUF(TMP, f2)); \
540 _MM_STORE(r, TMP, SUF,)
541VKERN_TEMPL_1V_C_SIMD(do_vec_sub_val, SUB1NV_SIMD, sd, pd,
542 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
543 AVXMD, double, __MAVXD)
544VKERN_TEMPL_1V_C_SIMD(do_vec_sub_val, SUB1NV_SIMD, ss, ps,
545 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
546 AVXMS, float, __MAVXS)
547
548
549//#define SUB1RV(r,f1,f2) r = f2 - r
550#define SUB1RV_SIMD(r,f1,f2,SUF) \
551 TMP = _MMAVX(load_##SUF(r)); \
552 TMP = _MMAVX(sub_##SUF(f2, TMP)); \
553 _MM_STORE(r, TMP, SUF,)
554VKERN_TEMPL_1V_C_SIMD(do_val_sub_vec, SUB1RV_SIMD, sd, pd,
555 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
556 AVXMD, double, __MAVXD)
557VKERN_TEMPL_1V_C_SIMD(do_val_sub_vec, SUB1RV_SIMD, ss, ps,
558 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
559 AVXMS, float, __MAVXS)
560
561
562//#define MUL1NV(r,f1,f2) r *= f2
563#define MUL1NV_SIMD(r,f1,f2,SUF) \
564 TMP = _MMAVX(load_##SUF(r)); \
565 TMP = _MMAVX(mul_##SUF(TMP, f2)); \
566 _MM_STORE(r, TMP, SUF,)
567VKERN_TEMPL_1V_C_SIMD(do_vec_mul_val, MUL1NV_SIMD, sd, pd,
568 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
569 AVXMD, double, __MAVXD)
570VKERN_TEMPL_1V_C_SIMD(do_vec_mul_val, MUL1NV_SIMD, ss, ps,
571 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
572 AVXMS, float, __MAVXS)
573
574
575//#define DIV1NV(r,f1,f2) r /= f2
576#define DIV1NV_SIMD(r,f1,f2,SUF) \
577 TMP = _MMAVX(load_##SUF(r)); \
578 TMP = _MMAVX(div_##SUF(TMP, f2)); \
579 _MM_STORE(r, TMP, SUF,)
580VKERN_TEMPL_1V_C_SIMD(do_vec_div_val, DIV1NV_SIMD, sd, pd,
581 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
582 AVXMD, double, __MAVXD)
583VKERN_TEMPL_1V_C_SIMD(do_vec_div_val, DIV1NV_SIMD, ss, ps,
584 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
585 AVXMS, float, __MAVXS)
586
587
588//#define DIV1RV(r,f1,f2) r = f2 / r
589#define DIV1RV_SIMD(r,f1,f2,SUF) \
590 TMP = _MMAVX(load_##SUF(r)); \
591 TMP = _MMAVX(div_##SUF(f2, TMP)); \
592 _MM_STORE(r, TMP, SUF,)
593VKERN_TEMPL_1V_C_SIMD(do_val_div_vec, DIV1RV_SIMD, sd, pd,
594 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
595 AVXMD, double, __MAVXD)
596VKERN_TEMPL_1V_C_SIMD(do_val_div_vec, DIV1RV_SIMD, ss, ps,
597 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
598 AVXMS, float, __MAVXS)
599
600
601//#define ADD1RV(r,f1,f2) r = f2 + r
602template <> inline void do_val_add_vec<double>(const unsigned long sz,
603 double* RESTRICT const res, LCTYPED(double) _f2)
604{
605 do_vec_add_val<double>(sz, res, _f2);
606}
607template <> inline void do_val_add_vec<float>(const unsigned long sz,
608 float* RESTRICT const res, LCTYPED(float) _f2)
609{
610 do_vec_add_val<float>(sz, res, _f2);
611}
612
613// unused ...
614//#define MUL1RV(r,f1,f2) r = f2 * r;
615
616
617
618/* TSVector stuff */
619
621
623//#define ADD2NS(r,v1,f1,f2) r += f2*v1
624#if defined( __FMA__) && !defined(NO_FMA)
625#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SIMD)
626#warning Using FMA
627#endif
628#define _mm256_fmadd_sd _mm256_fmadd_pd
629#define _mm256_fmadd_ss _mm256_fmadd_ps
630#define _mm256_fmsub_sd _mm256_fmsub_pd
631#define _mm256_fmsub_ss _mm256_fmsub_ps
632
633#define _mm256_FMA(SUF,a,b,c,d) \
634 d = _mm256_fmadd_##SUF(a,b,c)
635#define _mm256_FMS(SUF,a,b,c,d) \
636 d = _mm256_fmsub_##SUF(a,b,c)
637#else
638#define _mm256_FMA(SUF,a,b,c,d) \
639 b = _mm256_mul_##SUF(a, b); \
640 d = _mm256_add_##SUF(b, c)
641#define _mm256_FMS(SUF,a,b,c,d) \
642 b = _mm256_mul_##SUF(a, b); \
643 d = _mm256_sub_##SUF(b, c)
644#endif
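/* Both branches compute d = a*b + c (FMA) resp. d = a*b - c (FMS); the FMA
 * path does it in one instruction without the intermediate rounding, while
 * the fallback also clobbers b. Illustrative expansion for doubles:
 *
 *   _mm256_FMA(pd, a, b, c, d);
 *   // with __FMA__:    d = _mm256_fmadd_pd(a, b, c);
 *   // without __FMA__: b = _mm256_mul_pd(a, b);  d = _mm256_add_pd(b, c);
 */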
645
646#define ADD2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
647 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
648 TMP = _MMAVX(load_##SUF(r)); \
649 _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
650 _MM_STORE(r, TMP, SUF,)
651VKERN_TEMPL_2V_C_SIMD(do_vec_add_svc, ADD2NS_SIMD, sd, pd,
652 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
653 AVXMD, double, __MAVXD)
654VKERN_TEMPL_2V_C_SIMD(do_vec_add_svc, ADD2NS_SIMD, ss, ps,
655 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
656 AVXMS, float, __MAVXS)
657
658
659//#define SUB2NS(r,v1,f1,f2) r -= f2*v1
660#define SUB2NS_SIMD(r,v1,f1,f2,SUF,UNA1) \
661 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
662 TMP = _MMAVX(load_##SUF(r)); \
663 LD = _MMAVX(mul_##SUF(f2,LD)); \
664 TMP = _MMAVX(sub_##SUF(TMP,LD)); \
665 _MM_STORE(r, TMP, SUF,)
666VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc, SUB2NS_SIMD, sd, pd,
667 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
668 AVXMD, double, __MAVXD)
669VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc, SUB2NS_SIMD, ss, ps,
670 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
671 AVXMS, float, __MAVXS)
672
673
674//#define SUB2RS(r,v1,f1,f2) r = f2*v1 - r
675#define SUB2RS_SIMD(r,v1,f1,f2,SUF,UNA1) \
676 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
677 TMP = _MMAVX(load_##SUF(r)); \
678 _MMAVX(FMS(SUF,f2,LD,TMP,LD)); \
679 _MM_STORE(r, LD, SUF,)
680VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc_inv, SUB2RS_SIMD, sd, pd,
681 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
682 AVXMD, double, __MAVXD)
683VKERN_TEMPL_2V_C_SIMD(do_vec_sub_svc_inv, SUB2RS_SIMD, ss, ps,
684 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
685 AVXMS, float, __MAVXS)
686
687
688//#define ADD3NS(r,v1,v2,f1,f2) r = v1 + f2*v2
689#define ADD3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
690 LD = _MMAVX(load##UNA2##_##SUF(v2)); \
691 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
692 _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
693 _MM_STORE(r, TMP, SUF,)
694VKERN_TEMPL_3V_C_SIMD(do_vec_svc_add, ADD3NS_SIMD, sd, pd,
695 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
696 AVXMD, double, __MAVXD)
697VKERN_TEMPL_3V_C_SIMD(do_vec_svc_add, ADD3NS_SIMD, ss, ps,
698 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
699 AVXMS, float, __MAVXS)
700
701
702//#define SUB3NS(r,v1,v2,f1,f2) r = v1 - f2*v2
703#define SUB3NS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
704 LD = _MMAVX(load##UNA2##_##SUF(v2)); \
705 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
706 LD = _MMAVX(mul_##SUF(LD, f2)); \
707 TMP = _MMAVX(sub_##SUF(TMP, LD)); \
708 _MM_STORE(r, TMP, SUF,)
709VKERN_TEMPL_3V_C_SIMD(do_vec_svc_sub, SUB3NS_SIMD, sd, pd,
710 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
711 AVXMD, double, __MAVXD)
712VKERN_TEMPL_3V_C_SIMD(do_vec_svc_sub, SUB3NS_SIMD, ss, ps,
713 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
714 AVXMS, float, __MAVXS)
715
716
717
718//#define ADD3SN(r,v1,v2,f1,f2) r = f2*v1 + v2
719#define ADD3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
720 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
721 TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
722 _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
723 _MM_STORE(r, TMP, SUF,)
724VKERN_TEMPL_3V_C_SIMD(do_svc_vec_add, ADD3SN_SIMD, sd, pd,
725 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
726 AVXMD, double, __MAVXD)
727VKERN_TEMPL_3V_C_SIMD(do_svc_vec_add, ADD3SN_SIMD, ss, ps,
728 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
729 AVXMS, float, __MAVXS)
730
731
732//#define SUB3SN(r,v1,v2,f1,f2) r = f2*v1 - v2
733#define SUB3SN_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
734 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
735 TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
736 _MMAVX(FMS(SUF,f2,LD,TMP,LD)); \
737 _MM_STORE(r, LD, SUF,)
738VKERN_TEMPL_3V_C_SIMD(do_svc_vec_sub, SUB3SN_SIMD, sd, pd,
739 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
740 AVXMD, double, __MAVXD)
741VKERN_TEMPL_3V_C_SIMD(do_svc_vec_sub, SUB3SN_SIMD, ss, ps,
742 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
743 AVXMS, float, __MAVXS)
744
745
746
747//#define ADD3SS(r,v1,v2,f1,f2) r = f1*v1 + f2*v2
748#define ADD3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
749 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
750 TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
751 TMP = _MMAVX(mul_##SUF(f2,TMP)); \
752 _MMAVX(FMA(SUF,f1,LD,TMP,LD)); \
753 _MM_STORE(r, LD, SUF,)
754VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_add, ADD3SS_SIMD, sd, pd,
755 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
756 AVXMD, double, __MAVXD)
757VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_add, ADD3SS_SIMD, ss, ps,
758 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
759 AVXMS, float, __MAVXS)
760
761
762//#define SUB3SS(r,v1,v2,f1,f2) r = f1*v1 - f2*v2
763#define SUB3SS_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2)\
764 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
765 TMP = _MMAVX(load##UNA2##_##SUF(v2)); \
766 TMP = _MMAVX(mul_##SUF(TMP, f2)); \
767 _MMAVX(FMS(SUF,f1,LD,TMP,LD)); \
768 _MM_STORE(r, LD, SUF,)
769VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_sub, SUB3SS_SIMD, sd, pd,
770 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
771 AVXMD, double, __MAVXD)
772VKERN_TEMPL_3V_CC_SIMD(do_svc_svc_sub, SUB3SS_SIMD, ss, ps,
773 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
774 AVXMS, float, __MAVXS)
775
776
777
778//#define ADD2SN(r,v1,f1,f2) r = f2*r + v1
779#define ADD2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
780 LD = _MMAVX(load_##SUF(r)); \
781 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
782 _MMAVX(FMA(SUF,f2,LD,TMP,TMP)); \
783 _MM_STORE(r, TMP, SUF,)
784VKERN_TEMPL_2V_C_SIMD(do_svc_add_vec, ADD2SN_SIMD, sd, pd,
785 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
786 AVXMD, double, __MAVXD)
787VKERN_TEMPL_2V_C_SIMD(do_svc_add_vec, ADD2SN_SIMD, ss, ps,
788 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
789 AVXMS, float, __MAVXS)
790
791
792//#define SUB2SN(r,v1,f1,f2) r = f2*r - v1
793#define SUB2SN_SIMD(r,v1,f1,f2,SUF,UNA1) \
794 LD = _MMAVX(load_##SUF(r)); \
795 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
796 _MMAVX(FMS(SUF,f2,LD,TMP,LD)); \
797 _MM_STORE(r, LD, SUF,)
798VKERN_TEMPL_2V_C_SIMD(do_svc_sub_vec, SUB2SN_SIMD, sd, pd,
799 SIMD_CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
800 AVXMD, double, __MAVXD)
801VKERN_TEMPL_2V_C_SIMD(do_svc_sub_vec, SUB2SN_SIMD, ss, ps,
802 SIMD_CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY1,
803 AVXMS, float, __MAVXS)
804
805
806//#define ADD2SS(r,v1,f1,f2) r = f1*r + f2*v1
807#define ADD2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
808 LD = _MMAVX(load_##SUF(r)); \
809 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
810 TMP = _MMAVX(mul_##SUF(TMP, f2)); \
811 _MMAVX(FMA(SUF,f1,LD,TMP,LD)); \
812 _MM_STORE(r, LD, SUF,)
813VKERN_TEMPL_2V_CC_SIMD(do_svc_add_svc, ADD2SS_SIMD, sd, pd,
814 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
815 AVXMD, double, __MAVXD)
816VKERN_TEMPL_2V_CC_SIMD(do_svc_add_svc, ADD2SS_SIMD, ss, ps,
817 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
818 AVXMS, float, __MAVXS)
819
820
821//#define SUB2SS(r,v1,f1,f2) r = f1*r - f2*v1
822#define SUB2SS_SIMD(r,v1,f1,f2,SUF,UNA1) \
823 LD = _MMAVX(load_##SUF(r)); \
824 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
825 TMP = _MMAVX(mul_##SUF(TMP, f2)); \
826 _MMAVX(FMS(SUF,f1,LD,TMP,LD)); \
827 _MM_STORE(r, LD, SUF,)
828VKERN_TEMPL_2V_CC_SIMD(do_svc_sub_svc, SUB2SS_SIMD, sd, pd,
829 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
830 AVXMD, double, __MAVXD)
831VKERN_TEMPL_2V_CC_SIMD(do_svc_sub_svc, SUB2SS_SIMD, ss, ps,
832 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
833 AVXMS, float, __MAVXS)
834
835
836
837//#define ADD2SV(r,v1,f1,f2) r = f1*v1 + f2
838#define ADD2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
839 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
840 _MMAVX(FMA(SUF,f1,TMP,f2,TMP)); \
841 _MM_STORE(r, TMP, SUF,)
842VKERN_TEMPL_2V_CC_SIMD(do_svc_val_add, ADD2SV_SIMD, sd, pd,
843 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
844 AVXMD, double, __MAVXD)
845VKERN_TEMPL_2V_CC_SIMD(do_svc_val_add, ADD2SV_SIMD, ss, ps,
846 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
847 AVXMS, float, __MAVXS)
848
849
850//#define SUB2SV(r,v1,f1,f2) r = f1*v1 - f2
851#define SUB2SV_SIMD(r,v1,f1,f2,SUF,UNA1) \
852 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
853 _MMAVX(FMS(SUF,f1,TMP,f2,TMP)); \
854 _MM_STORE(r, TMP, SUF,)
855VKERN_TEMPL_2V_CC_SIMD(do_svc_val_sub, SUB2SV_SIMD, sd, pd,
856 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
857 AVXMD, double, __MAVXD)
858VKERN_TEMPL_2V_CC_SIMD(do_svc_val_sub, SUB2SV_SIMD, ss, ps,
859 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
860 AVXMS, float, __MAVXS)
861
862
863
864//#define ADD1SV(r,f1,f2) r = f1*r + f2
865#define ADD1SV_SIMD(r,f1,f2,SUF) \
866 TMP = _MMAVX(load_##SUF(r)); \
867 _MMAVX(FMA(SUF,f1,TMP,f2,TMP)); \
868 _MM_STORE(r, TMP, SUF,)
869VKERN_TEMPL_1V_CC_SIMD(do_svc_add_val, ADD1SV_SIMD, sd, pd,
870 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
871 AVXMD, double, __MAVXD)
872VKERN_TEMPL_1V_CC_SIMD(do_svc_add_val, ADD1SV_SIMD, ss, ps,
873 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
874 AVXMS, float, __MAVXS)
875
876
877//#define SUB1SV(r,f1,f2) r = f1*r - f2
878#define SUB1SV_SIMD(r,f1,f2,SUF) \
879 TMP = _MMAVX(load_##SUF(r)); \
880 _MMAVX(FMS(SUF,f1,TMP,f2,TMP)); \
881 _MM_STORE(r, TMP, SUF,)
882VKERN_TEMPL_1V_CC_SIMD(do_svc_sub_val, SUB1SV_SIMD, sd, pd,
883 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
884 AVXMD, double, __MAVXD)
885VKERN_TEMPL_1V_CC_SIMD(do_svc_sub_val, SUB1SV_SIMD, ss, ps,
886 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
887 AVXMS, float, __MAVXS)
888
889
890
891//#define ADD2VS(r,v1,f1,f2) r = f1 + f2*v1
892template <> inline void do_val_svc_add<double>(const unsigned long sz,
893 double* RESTRICT const res, const double* RESTRICT const v1,
894 LCTYPED(double) f1, LCTYPED(double) f2)
895{
896 do_svc_val_add<double>(sz, res, v1, f2, f1); // note the reverse order!
897}
898template <> inline void do_val_svc_add<float>(const unsigned long sz,
899 float* RESTRICT const res, const float* RESTRICT const v1,
900 LCTYPED(float) f1, LCTYPED(float) f2)
901{
902 do_svc_val_add<float>(sz, res, v1, f2, f1); // note the reverse order!
903}
904
906//#define SUB2VS(r,v1,f1,f2) r = f1 - f2*v1
907#define SUB2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
908 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
909 TMP = _MMAVX(mul_##SUF(TMP, f2)); \
910 TMP = _MMAVX(sub_##SUF(f1, TMP)); \
911 _MM_STORE(r, TMP, SUF,)
912VKERN_TEMPL_2V_CC_SIMD(do_val_svc_sub, SUB2VS_SIMD, sd, pd,
913 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
914 AVXMD, double, __MAVXD)
915VKERN_TEMPL_2V_CC_SIMD(do_val_svc_sub, SUB2VS_SIMD, ss, ps,
916 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
917 AVXMS, float, __MAVXS)
918
919
920//#define DIV2VS(r,v1,f1,f2) r = f1 / (f2*v1)
921#define DIV2VS_SIMD(r,v1,f1,f2,SUF,UNA1) \
922 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
923 TMP = _MMAVX(mul_##SUF(TMP, f2)); \
924 TMP = _MMAVX(div_##SUF(f1, TMP)); \
925 _MM_STORE(r, TMP, SUF,)
926VKERN_TEMPL_2V_CC_SIMD(do_val_svc_div, DIV2VS_SIMD, sd, pd,
927 SIMD_2CONST_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
928 AVXMD, double, __MAVXD)
929VKERN_TEMPL_2V_CC_SIMD(do_val_svc_div, DIV2VS_SIMD, ss, ps,
930 SIMD_2CONST_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY2,
931 AVXMS, float, __MAVXS)
932
933
934/* ... */
935
936
937/* For negation, use our knowledge of the position of the sign bits */
938#ifdef AVX512
939#ifdef HAVE_LONG_LONG
940#define NEG_DOUBLE_PREP \
941 static union _negmask { \
942 unsigned LONG_LONG lng[8]; \
943 double dbl[8]; \
944 __m512d m512d; \
945 } ALIGN(64) negmask = { {0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL}, }; \
946 __m512d neg = _MMAVX(load_pd(negmask.dbl))
947#else
948#define NEG_DOUBLE_PREP \
949 static union _negmask { \
950 unsigned int lng[16]; \
951 double dbl[8]; \
952 __m512d m512d; \
953 } ALIGN(64) negmask = { {0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U}, }; \
954 __m512d neg = _MMAVX(load_pd(negmask.dbl))
955#endif
956#define NEG_FLOAT_PREP \
957 static union _negmask { \
958 unsigned int itg[16]; \
959 float flt[16]; \
960 __m512 m512s; \
961 } ALIGN(64) negmask = { {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}, }; \
962 __m512 neg = _MMAVX(load_ps(negmask.flt))
963#else
964#ifdef HAVE_LONG_LONG
965#define NEG_DOUBLE_PREP \
966 static union _negmask { \
967 unsigned LONG_LONG lng[4]; \
968 double dbl[4]; \
969 __m256d m256d; \
970 } ALIGN(32) negmask = { {0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL}, }; \
971 __m256d neg = _MMAVX(load_pd(negmask.dbl))
972#else
973#define NEG_DOUBLE_PREP \
974 static union _negmask { \
975 unsigned int lng[8]; \
976 double dbl[4]; \
977 __m256d m256d; \
978 } ALIGN(32) negmask = { {0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U, 0x0U, 0x80000000U}, }; \
979 __m256d neg = _MMAVX(load_pd(negmask.dbl))
980#endif
981#define NEG_FLOAT_PREP \
982 static union _negmask { \
983 unsigned int itg[8]; \
984 float flt[8]; \
985 __m256 m256s; \
986 } ALIGN(32) negmask = { {0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U}, }; \
987 __m256 neg = _MMAVX(load_ps(negmask.flt))
988
989#endif
990/* single val xor operations don't exist, but we don't care ... */
991#define _mm256_xor_sd _mm256_xor_pd
992#define _mm256_xor_ss _mm256_xor_ps
993#define _mm512_xor_sd _mm512_xor_pd
994#define _mm512_xor_ss _mm512_xor_ps
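/* The negation trick spelled out: IEEE-754 stores the sign in the top bit
 * of every element, so XORing with a mask that has only that bit set flips
 * the sign without any arithmetic. Illustration only (not library code):
 *
 *   __m256d v    = _mm256_set1_pd(1.5);
 *   __m256d mask = _mm256_set1_pd(-0.0);     // only the sign bit set
 *   __m256d negv = _mm256_xor_pd(v, mask);   // all four lanes now hold -1.5
 *
 * The NEG_*_PREP macros above build the equivalent mask explicitly from a
 * static union of integer bit patterns. */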
995
997//#define NEG2(r,v1,f1,f2) r = -v1
998#define NEG2_SIMD(r,v1,f1,f2,SUF,UNA1) \
999 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
1000 TMP = _MMAVX(xor_##SUF(TMP, neg)); \
1001 _MM_STORE(r, TMP, SUF,)
1002VKERN_TEMPL_2V_SIMD(do_vec_neg_vec, NEG2_SIMD, sd, pd,
1003 NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
1004 AVXMD, double, __MAVXD)
1005VKERN_TEMPL_2V_SIMD(do_vec_neg_vec, NEG2_SIMD, ss, ps,
1006 NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
1007 AVXMS, float, __MAVXS)
1008
1009
1010//#define NEG1(r,f1,f2) r = -r
1011#define NEG1_SIMD(r,f1,f2,SUF) \
1012 TMP = _MMAVX(load_##SUF(r)); \
1013 TMP = _MMAVX(xor_##SUF(TMP, neg)); \
1014 _MM_STORE(r, TMP, SUF,)
1015VKERN_TEMPL_1V_SIMD(do_vec_neg, NEG1_SIMD, sd, pd,
1016 NEG_DOUBLE_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
1017 AVXMD, double, __MAVXD)
1018VKERN_TEMPL_1V_SIMD(do_vec_neg, NEG1_SIMD, ss, ps,
1019 NEG_FLOAT_PREP, SIMD_EMPTY0, SIMD_EMPTY0,
1020 AVXMS, float, __MAVXS)
1021
1022
1023
1024//#define COMP2(r,v1,f1,f2) if (r != v1) { ++f2; break; }
1025//VKERN_TEMPL_2V_T(do_vv_comp, COMP2, volatile long);
1026#define VL_PREP(x) long f2 = (x)
1027#define VL_FIN(x) x = f2
1028#define _mm256_movemask_sd(x) \
1029 _mm256_movemask_pd(x); rg &= 0x1
1030#define _mm256_movemask_ss(x) \
1031 _mm256_movemask_ps(x); rg &= 0x1
1032#define _mm512_movemask_sd(x) \
1033 _mm512_movemask_pd(x); rg &= 0x1
1034#define _mm512_movemask_ss(x) \
1035 _mm512_movemask_ps(x); rg &= 0x1
1036
1037
1038#define _mm256_cmpneq_pd(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_UQ)
1039#define _mm256_cmpneq_ps(a,b) _mm256_cmp_ps(a,b,_CMP_NEQ_UQ)
1040#define _mm256_cmpneq_sd _mm256_cmpneq_pd
1041#define _mm256_cmpneq_ss _mm256_cmpneq_ps
1042//#define _mm256_cmp_pd(a,b) _mm256_cmp_pd(a,b,_CMP_EQ_OQ)
1043//#define _mm256_cmp_ps(a,b) _mm256_cmp_ps(a,b,_CMP_EQ_OQ)
1044#define _mm256_cmp_sd _mm256_cmp_pd
1045#define _mm256_cmp_ss _mm256_cmp_ps
1046
1047
1048#define COMP2_SIMD(r,v1,f1,f2,SUF,UNA) \
1049 TMP = _MMAVX(load_##SUF(r)); \
1050 LD = _MMAVX(load##UNA##_##SUF(v1)); \
1051 TMP = _MMAVX(cmpneq_##SUF(TMP, LD)); \
1052 /* And now? movmskpd and bt? */ \
1053 rg = _MMAVX(movemask_##SUF(TMP)); \
1054 if (rg) { ++f2; /*fprintf(stderr, "DIFF @ %li: %i\n", sz-i, rg);*/ goto _fin; }
1055VKERN_TEMPL_2V_T_SIMD_VL(do_vv_comp, COMP2_SIMD, sd, pd,
1056 VL_PREP, SIMD_EMPTY0, VL_FIN,
1057 AVXMD, double, __MAVXD)
1058VKERN_TEMPL_2V_T_SIMD_VL(do_vv_comp, COMP2_SIMD, ss, ps,
1059 VL_PREP, SIMD_EMPTY0, VL_FIN,
1060 AVXMS, float, __MAVXS)
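/* How the comparison kernel detects a mismatch: cmpneq sets a lane to
 * all-ones where the operands differ, and movemask gathers those sign bits
 * into an ordinary integer, so any non-zero mask means "vectors differ".
 * Illustration only (values are made up):
 *
 *   __m256d a  = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
 *   __m256d b  = _mm256_set_pd(4.0, 9.0, 2.0, 1.0);   // lane 2 differs
 *   __m256d ne = _mm256_cmp_pd(a, b, _CMP_NEQ_UQ);
 *   int rg     = _mm256_movemask_pd(ne);              // rg == 0x4, non-zero
 */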
1061
1062
1063// Used in do_bdmat_vec_mult
1064#define DECL_DOUBLE __MAVXD TM2
1065#define DECL_FLOAT __MAVXS TM2
1066// Unaligned accesses for single loads are harmless ...
1067#define _mm_loadu_sd _mm_load_sd
1068#define _mm_loadu_ss _mm_load_ss
1069
1070//#define SUMMULT3(r,v1,v2,f1,f2) r += v1*v2
1071#define SUMMULT3_SIMD(r,v1,v2,f1,f2,SUF,UNA1,UNA2) \
1072 TMP = _MMAVX(load##UNA1##_##SUF(v1)); \
1073 LD = _MMAVX(load##UNA2##_##SUF(v2)); \
1074 TM2 = _MMAVX(load_##SUF(r)); \
1075 _MMAVX(FMA(SUF,TMP,LD,TM2,TM2)); \
1076 _MM_STORE(r, TM2, SUF,)
1077#if 1 /* These are used in bdmat_vec_mul -- unaligned accesses are unavoidable */
1078VKERN_TEMPL_3V_SIMD_UA(do_add_vec_vec_mul, SUMMULT3_SIMD, sd, pd,
1079 DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
1080 AVXMD, double, __MAVXD);
1081VKERN_TEMPL_3V_SIMD_UA(do_add_vec_vec_mul, SUMMULT3_SIMD, ss, ps,
1082 DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
1083 AVXMS, float, __MAVXS);
1084#else
1085VKERN_TEMPL_3V_SIMD(do_add_vec_vec_mul, SUMMULT3_SIMD, sd, pd,
1086 DECL_DOUBLE, SIMD_EMPTY0, SIMD_EMPTY0,
1087 AVXMD, double, __MAVXD)
1088VKERN_TEMPL_3V_SIMD(do_add_vec_vec_mul, SUMMULT3_SIMD, ss, ps,
1089 DECL_FLOAT, SIMD_EMPTY0, SIMD_EMPTY0,
1090 AVXMS, float, __MAVXS)
1091#endif
1092
1093//#define SUMCMULT3(r,v1,v2,f1,f2) r += CPLX__ conj(v1)*v2
1094template <> inline void do_add_vec_vec_cmul<double>(const unsigned long sz,
1095 double* RESTRICT const r, const double* RESTRICT const v1,
1096 const double* RESTRICT const v2)
1097{
1098 do_add_vec_vec_mul<double>(sz, r, v1, v2);
1099}
1100template <> inline void do_add_vec_vec_cmul<float>(const unsigned long sz,
1101 float* RESTRICT const r, const float* RESTRICT const v1,
1102 const float* RESTRICT const v2)
1103{
1104 do_add_vec_vec_mul<float>(sz, r, v1, v2);
1105}
1106
1107
1129
1130#ifndef TBCI_NO_SIMD_SUM
1131
1132#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !defined(AUTO_DECL) && !defined(NOWARN) && defined(WARN_SSE)
1133# warning Info: Using unrolled AVX vector kernels for sums (reductions)
1134#endif
1135
1136#define SUM_DOUBLE_PREP(x) REGISTER __MAVXD f2 = _MMAVX(set_sd(x))
1137#define SUM_FLOAT_PREP(x) REGISTER __MAVXS f2 = _MMAVX(set_ss(x))
1138
1139#define XSUM_DOUBLE_PREP(x) \
1140 REGISTER __MAVXD f1 = _MMAVX(setzero_pd()); \
1141 REGISTER __MAVXD f2 = _MMAVX(set_sd(x))
1142#define XSUM_FLOAT_PREP(x) \
1143 REGISTER __MAVXS f1 = _MMAVX(setzero_ps()); \
1144 REGISTER __MAVXS f2 = _MMAVX(set_ss(x))
1145
1150
1151// FIXME: Do we need more horizontal summing for AVX512
1152#if 1 //def __SSE3__
1153#ifndef AVX512
1154# define SUM_DOUBLE_SIMD_FINX(f) \
1155 f = _mm256_hadd_pd(f, _mm256_permute2f128_pd(f, f, 0x33)); \
1156 f = _mm256_hadd_pd(f, f)
1157
1158# define SUM_FLOAT_SIMD_FINX(f) \
1159 f = _mm256_hadd_ps(f, _mm256_permute2f128_ps(f, f, 0x33)); \
1160 f = _mm256_hadd_ps(f, f); \
1161 f = _mm256_hadd_ps(f, f)
1162#else
1163# warning horizontal sum AVX512 not yet implemented
1164#endif
1165#else // __SSE3__
1166# define SUM_DOUBLE_SIMD_FINX(f) \
1167 __MAVXD TM##f = f; \
1168 TM##f = _MMAVX(unpackhi_pd(TM##f, f)); \
1169 f = _MMAVX(add_sd(f, TM##f))
1170# define SUM_FLOAT_SIMD_FINX(f) \
1171 __MAVXS TM##f = f; \
1172 TM##f = _MMAVX(shuffle_ps(TM##f, f, 0xb1)); \
1173 f = _MMAVX(add_ps(f, TM##f)); \
1174 TM##f = f; \
1175 TM##f = _MMAVX(shuffle_ps(TM##f, f, 0x1b)); \
1176 f = _MMAVX(add_ss(f, TM##f))
1177# if defined(__GNUC__) && defined(WARN_SSE)
1178# warning Not using SSE3 -- consider passing -msse3
1179# endif
1180#endif // __SSE3__
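/* Worked example of SUM_DOUBLE_SIMD_FINX for f = [a0, a1 | a2, a3]
 * (low 128-bit lane written first):
 *
 *   perm = permute2f128_pd(f, f, 0x33)  ->  [a2, a3 | a2, a3]
 *   f    = hadd_pd(f, perm)             ->  [a0+a1, a2+a3 | a2+a3, a2+a3]
 *   f    = hadd_pd(f, f)                ->  [a0+a1+a2+a3, ... | ...]
 *
 * so element 0 ends up holding the complete sum, which is what the
 * SUM_*_FINAL store_sd/store_ss below reads out. */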
1181
1182#define SUM_DOUBLE_SIMD_FIN SUM_DOUBLE_SIMD_FINX(f2)
1183#define SUM_FLOAT_SIMD_FIN SUM_FLOAT_SIMD_FINX(f2)
1184
1185#define SUM_DOUBLE_FINAL(x) \
1186 _MMAVX(store_sd(&x, f2))
1187#define SUM_FLOAT_FINAL(x) \
1188 _MMAVX(store_ss(&x, f2))
1189
1190
1191/* Define missing intrinsics for full REGISTER copies */
1192#define _mm256_move_ps(f, x) x
1193#define _mm256_move_pd(f, x) x
1194#define _mm256_move_ss(x, f) (__extension__ (__m256 ) { ((__v8sf)f)[0],((__v8sf)x)[1],((__v8sf)x)[2],((__v8sf)x)[3], \
1195 ((__v8sf)x)[4],((__v8sf)x)[5],((__v8sf)x)[6],((__v8sf)x)[7] })
1196#define _mm256_move_sd(x, f) (__extension__ (__m256d) { ((__v4df)f)[0],((__v4df)x)[1],((__v4df)x)[2],((__v4df)x)[3] })
1197
1198
1204
1205/* We don't need to save the upper values of f2 any more,
1206 * as the SISD (sd,ss) loop tails now preserve them */
1207#define XSUM_DOUBLE_SIMD_FIN_STORE \
1208 /*double hif1, hif2;*/ \
1209 /*_MMAVX(storeh_pd(&hif1, f1));*/ \
1210 /*_MMAVX(storeh_pd(&hif2, f2))*/ \
1211 do {} while(0)
1212#define XSUM_FLOAT_SIMD_FIN_STORE \
1213 /*float hif1[4], hif2[4];*/ \
1214 /*_MMAVX(store_ps(hif1, f1));*/ \
1215 /*_MMAVX(store_ps(hif2, f2))*/ \
1216 do {} while(0)
1217
1218/* Do the horizontal sums and the final application of the correction */
1219/* FIXME: These may need adjustment for AVX/AVX-512 */
1220#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE(x) \
1221 /*f2 = _MMAVX(loadh_pd(f2, &hif2));*/ \
1222 /*f1 = _MMAVX(loadh_pd(f1, &hif1));*/ \
1223 SUM_DOUBLE_SIMD_FINX(f2); \
1224 SUM_DOUBLE_SIMD_FINX(f1); \
1225 f2 = _MMAVX(sub_sd(f2, f1)); \
1226 _MMAVX(store_sd(&x, f2))
1227#define XSUM_FLOAT_SIMD_FINAL_COMPLETE(x) \
1228 /*_MMAVX(store_ss(hif2, f2));*/ \
1229 /*_MMAVX(store_ss(hif1, f1));*/ \
1230 /*f2 = _MMAVX(load_ps(hif2));*/ \
1231 /*f1 = _MMAVX(load_ps(hif1));*/ \
1232 SUM_FLOAT_SIMD_FINX(f2); \
1233 SUM_FLOAT_SIMD_FINX(f1); \
1234 f2 = _MMAVX(sub_ss(f2, f1)); \
1235 _MMAVX(store_ss(&x, f2))
1236
1237/* Variant with compensation for lost bits in hadd_pd */
1238// FIXME: This is not elegant ...
1239// ... and maybe not worth the trouble
1240#define XSUM_DOUBLE_SIMD_FINAL_COMPLETE_X(x) \
1241 /*f2 = _MMAVX(loadh_pd(f2, &hif2));*/ \
1242 /*f1 = _MMAVX(loadh_pd(f1, &hif1));*/ \
1243 __MAVXD TMP = f2; \
1244 SUM_DOUBLE_SIMD_FINX(f2); \
1245 __MAVXD COR = f2; \
1246 COR = _MMAVX(sub_sd(COR, TMP)); \
1247 TMP = _MMAVX(permute2f128_pd(TMP, TMP, 0x10)); \
1248 COR = _MMAVX(sub_sd(COR, TMP)); \
1249 TMP = _MMAVX(unpackhi_pd(TMP, TMP)); \
1250 COR = _MMAVX(sub_sd(COR, TMP)); \
1251 TMP = _MMAVX(permute2f128_pd(TMP, TMP, 0x10)); \
1252 COR = _MMAVX(sub_sd(COR, TMP)); \
1253 COR = _MMAVX(set_sd(((__v4df)COR)[0])); \
1254 f1 = _MMAVX(add_sd(f1, COR)); \
1255 SUM_DOUBLE_SIMD_FINX(f1); \
1256 f2 = _MMAVX(sub_sd(f2, f1)); \
1257 _MMAVX(store_sd(&x, f2))
1258
1259/* TODO: Variant for floats with compensation for lost bits in hadd_ps */
1260
1261
1263// #define MULT2 (r,v1,f1,f2) f2 += r * v1
1264#define MULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
1265 TMP = _MMAVX(load_##SUF(r)); \
1266 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
1267 _MMAVX(FMA(SUF,TMP,LD,f2,f2))
1268VKERN_TEMPL_2V_T_SIMD(do_vec_mult_quick, MULT2_SIMD, sd, pd,
1269 SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
1270 AVXMD, double, __MAVXD)
1271VKERN_TEMPL_2V_T_SIMD(do_vec_mult_quick, MULT2_SIMD, ss, ps,
1272 SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
1273 AVXMS, float, __MAVXS)
1274
1275//#define XMULT2(r,v1,f1,f2) { T y = r * v1; T t = f2+y; f1 += (t-f2)-y; f2 = t; }
1276 /* TMP = r*v1;
1277 * LD = TMP; // y = r*v1 // lower bits = v1 ?
1278 * TMP = TMP+f2; // y+f2
1279 * t = TMP; // t = y+f2
1280 * TMP = TMP-f2; // t-f2
1281 * TMP = TMP-LD; // (t-f2)-y
1282 * f1 = f1+TMP; // f1 += (t-f2)-y
1283 * f2 = t; // f2 += t; // lower bits = unchanged ?
1284 */
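/* The same compensated accumulation written as plain scalar code (a sketch
 * only; the names follow the comment above, with r and v1 treated as arrays,
 * and the correction term assumes the running sum f2 stays dominant):
 *
 *   double f2 = 0.0, f1 = 0.0;            // running sum and accumulated error
 *   for (unsigned long i = 0; i < sz; ++i) {
 *       double y = r[i] * v1[i];
 *       double t = f2 + y;                // rounded partial sum
 *       f1 += (t - f2) - y;               // the bits lost by that rounding
 *       f2 = t;
 *   }
 *   double result = f2 - f1;              // apply the correction once
 *
 * The SIMD kernel below keeps one such sum/correction pair per vector lane
 * and merges the lanes in XSUM_*_SIMD_FINAL_COMPLETE. */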
1285//do_vec_mult_exact
1286#define XMULT2_SIMD(r,v1,f1,f2,SUF,UNA1) \
1287 TMP = _MMAVX(load_##SUF(r)); \
1288 LD = _MMAVX(load##UNA1##_##SUF(v1)); \
1289 TMP = _MMAVX(mul_##SUF(TMP, LD)); \
1290 LD = _MMAVX(move_##SUF(LD, TMP)); \
1291 /* LD = TMP; */ \
1292 TMP = _MMAVX(add_##SUF(TMP, f2)); \
1293 t = TMP; \
1294 TMP = _MMAVX(sub_##SUF(TMP, f2)); \
1295 TMP = _MMAVX(sub_##SUF(TMP, LD)); \
1296 f1 = _MMAVX(add_##SUF(f1, TMP)); \
1297 /* f2 = t */ \
1298 f2 = _MMAVX(move_##SUF(f2, t))
1299VKERN_TEMPL_2V_T_SIMD(do_vec_mult_exact, XMULT2_SIMD, sd, pd,
1300 XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
1301 XSUM_DOUBLE_SIMD_FINAL_COMPLETE/*_X*/,
1302 AVXMD, double, __MAVXD)
1303VKERN_TEMPL_2V_T_SIMD(do_vec_mult_exact, XMULT2_SIMD, ss, ps,
1304 XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
1305 XSUM_FLOAT_SIMD_FINAL_COMPLETE,
1306 AVXMS, float, __MAVXS)
1307
1308
1309template <> inline void do_vec_dot_exact<double>(const unsigned long sz,
1310 const double * RESTRICT const _v1, const double * RESTRICT const _v2,
1311 double& _f2)
1312{
1313 do_vec_mult_exact<double>(sz, _v1, _v2, _f2);
1314}
1315
1316template <> inline void do_vec_dot_quick<double>(const unsigned long sz,
1317 const double * RESTRICT const _v1, const double * RESTRICT const _v2,
1318 double& _f2)
1319{
1320 do_vec_mult_quick<double>(sz, _v1, _v2, _f2);
1321}
1322
1323template <> inline void do_vec_dot_exact<float>(const unsigned long sz,
1324 const float * RESTRICT const _v1, const float * RESTRICT const _v2,
1325 float& _f2)
1326{
1327 do_vec_mult_exact<float>(sz, _v1, _v2, _f2);
1328}
1329
1330template <> inline void do_vec_dot_quick<float>(const unsigned long sz,
1331 const float * RESTRICT const _v1, const float * RESTRICT const _v2,
1332 float& _f2)
1333{
1334 do_vec_mult_quick<float>(sz, _v1, _v2, _f2);
1335}
1336
1338VKERN_TEMPL_2V_T(do_vec_mult_unaligned_exact, XMULT2, T)
1339VKERN_TEMPL_2V_T(do_vec_mult_unaligned_quick, MULT2, T)
1340
1341// TODO: Implement do_vec_sumsqr_exact
1342
1343
1344// #define SQR1(r,f1,f2) f2 += r*r
1345#define SQR1_SIMD(r,f1,f2,SUF) \
1346 TMP = _MMAVX(load_##SUF(r)); \
1347 _MMAVX(FMA(SUF,TMP,TMP,f2,f2))
1348
1349VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_quick, SQR1_SIMD, sd, pd,
1350 SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
1351 AVXMD, double, __MAVXD)
1352VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_quick, SQR1_SIMD, ss, ps,
1353 SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
1354 AVXMS, float, __MAVXS)
1355
1356//do_vec_sumsqr_exact
1357#define XSQR1_SIMD(r,f1,f2,SUF) \
1358 TMP = _MMAVX(load_##SUF(r)); \
1359 TMP = _MMAVX(mul_##SUF(TMP, TMP)); \
1360 y = TMP; \
1361 TMP = _MMAVX(add_##SUF(TMP, f2)); \
1362 t = TMP; \
1363 TMP = _MMAVX(sub_##SUF(TMP, f2)); \
1364 TMP = _MMAVX(sub_##SUF(TMP, y)); \
1365 f1 = _MMAVX(add_##SUF(f1, TMP)); \
1366 /* f2 = t */ \
1367 f2 = _MMAVX(move_##SUF(f2, t))
1368VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_exact, XSQR1_SIMD, sd, pd,
1369 XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
1370 XSUM_DOUBLE_SIMD_FINAL_COMPLETE/*_X*/,
1371 AVXMD, double, __MAVXD)
1372VKERN_TEMPL_1V_T_SIMD(do_vec_sumsqr_exact, XSQR1_SIMD, ss, ps,
1373 XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
1374 XSUM_FLOAT_SIMD_FINAL_COMPLETE,
1375 AVXMS, float, __MAVXS)
1376
1377
1378#ifndef TBCI_NO_SIMD_FABSSQR
1379template <> inline void do_vec_fabssqr_quick<double>(const unsigned long sz,
1380 const double * const _v1, double& _f2)
1381{
1382 double F2 = _f2;
1383 do_vec_sumsqr_quick<double>(sz, _v1, F2);
1384 _f2 = F2;
1385}
1386template <> inline void do_vec_fabssqr_exact<double>(const unsigned long sz,
1387 const double * const _v1, double& _f2)
1388{
1389 double F2 = _f2;
1390 do_vec_sumsqr_exact<double>(sz, _v1, F2);
1391 _f2 = F2;
1392}
1393#endif // TBCI_NO_SIMD_FABSSQR
1394#ifdef TBCI_SIMD_FABSSQR_FLOAT // The loss of precision with float is unbearable
1395template <> inline void do_vec_fabssqr_quick<float>(const unsigned long sz,
1396 const float * const _v1, double& _f2)
1397{
1398 float F2 = _f2;
1399 do_vec_sumsqr_quick<float>(sz, _v1, F2);
1400 _f2 = F2;
1401}
1402template <> inline void do_vec_fabssqr_exact<float>(const unsigned long sz,
1403 const float * const _v1, double& _f2)
1404{
1405 float F2 = _f2;
1406 do_vec_sumsqr_exact<float>(sz, _v1, F2);
1407 _f2 = F2;
1408}
1409#endif // TBCI_SIMD_FABSSQR_FLOAT
1410
1412//#define SUM1(r,f1,f2) f2 += r
1413#define SUM1_SIMD(r,f1,f2,SUF) \
1414 TMP = _MMAVX(load_##SUF(r)); \
1415 f2 = _MMAVX(add_##SUF(f2, TMP))
1416VKERN_TEMPL_1V_T_SIMD(do_vec_sum_quick, SUM1_SIMD, sd, pd,
1417 SUM_DOUBLE_PREP, SUM_DOUBLE_SIMD_FIN, SUM_DOUBLE_FINAL,
1418 AVXMD, double, __MAVXD)
1419VKERN_TEMPL_1V_T_SIMD(do_vec_sum_quick, SUM1_SIMD, ss, ps,
1420 SUM_FLOAT_PREP, SUM_FLOAT_SIMD_FIN, SUM_FLOAT_FINAL,
1421 AVXMS, float, __MAVXS)
1422
1423//#define XSUM1(r,f1,f2) { T t = f2+r; f1 += (t-f2)-r; f2 = t; }
1425#define XSUM1_SIMD(r,f1,f2,SUF) \
1426 y = _MMAVX(load_##SUF(r)); \
1427 t = _MMAVX(add_##SUF(f2, y)); \
1428 TMP = _MMAVX(sub_##SUF(t, f2)); \
1429 TMP = _MMAVX(sub_##SUF(TMP, y)); \
1430 f1 = _MMAVX(add_##SUF(f1, TMP)); \
1431 /*f2 = _MMAVX(move_##SUF(f2, t))*/ \
1432 f2 = t
1433VKERN_TEMPL_1V_T_SIMD(do_vec_sum_exact, XSUM1_SIMD, sd, pd,
1434 XSUM_DOUBLE_PREP, XSUM_DOUBLE_SIMD_FIN_STORE,
1435 XSUM_DOUBLE_SIMD_FINAL_COMPLETE/*_X*/,
1436 AVXMD, double, __MAVXD)
1437VKERN_TEMPL_1V_T_SIMD(do_vec_sum_exact, XSUM1_SIMD, ss, ps,
1438 XSUM_FLOAT_PREP, XSUM_FLOAT_SIMD_FIN_STORE,
1439 XSUM_FLOAT_SIMD_FINAL_COMPLETE,
1440 AVXMS, float, __MAVXS)
1441
1442#endif // TBCI_NO_SIMD_SUM
1443
1445
1446#endif // TBCI_SELECTIVE_INST
1447
1448#endif // __AVX__
1449
1450#endif // H_VEC_KERN_SPECIAL2_H