TBCI Numerical high perf. C++ Library 2.8.0
vec_kern_unr_pref.h
Go to the documentation of this file.
1
8
9#ifndef TBCI_VEC_KERN_UNR_PREF_H
10#define TBCI_VEC_KERN_UNR_PREF_H
11#include "tbci/basics.h"
12
13/* "unroll_prefetch_def.h" is #include'd from basics.h */
14
16
/*
 * Element operations of the form r = v1 OP v2. Each macro is handed to
 * VKERN_TEMPL_3V together with the kernel name (FNAME) to instantiate.
 * Every operation macro takes (r, v1, v2, f1, f2): r is assigned, v1 and v2
 * are only read, and f1/f2 are accepted but not used in this group.
 * NOTE(review): VKERN_TEMPL_3V and CPLX__ are defined outside this file;
 * CPLX__ presumably qualifies the complex conj() -- confirm.
 */
/* Sum: r = v1 + v2 */
17#define ADD3(r,v1,v2,f1,f2) r = v1 + v2
19VKERN_TEMPL_3V(do_vec_vec_add, ADD3);
20
/* Difference: r = v1 - v2 */
21#define SUB3(r,v1,v2,f1,f2) r = v1 - v2
23VKERN_TEMPL_3V(do_vec_vec_sub, SUB3);
24
/* Product: r = v1 * v2 */
25#define MUL3(r,v1,v2,f1,f2) r = v1 * v2
27VKERN_TEMPL_3V(do_vec_vec_mul, MUL3);
28
/* Product with the first operand conjugated: r = conj(v1) * v2 */
29#define CMUL3(r,v1,v2,f1,f2) r = CPLX__ conj(v1) * v2
31VKERN_TEMPL_3V(do_vec_vec_cmul, CMUL3);
32
/* Quotient: r = v1 / v2 */
33#define DIV3(r,v1,v2,f1,f2) r = v1 / v2
35VKERN_TEMPL_3V(do_vec_vec_div, DIV3);
36
/* Quotient with the numerator conjugated: r = conj(v1) / v2 */
37#define CDIV3(r,v1,v2,f1,f2) r = CPLX__ conj(v1) / v2
39VKERN_TEMPL_3V(do_vec_vec_cdiv, CDIV3);
40
41
/*
 * In-place element operations: r is both an input and the destination,
 * v1 is the second operand. Each macro takes (r, v1, f1, f2), with f1/f2
 * accepted but unused here, and is passed to VKERN_TEMPL_2V with the kernel
 * name to instantiate. The "_inv"/"I" variants swap the operand roles
 * relative to their plain counterparts, as spelled out per macro below.
 * NOTE(review): VKERN_TEMPL_2V and CPLX__ are defined outside this file.
 */
/* r += v1 */
42#define ADD2(r,v1,f1,f2) r += v1
44VKERN_TEMPL_2V(do_vec_add_vec, ADD2);
45
/* r -= v1 */
46#define SUB2(r,v1,f1,f2) r -= v1
48VKERN_TEMPL_2V(do_vec_sub_vec, SUB2);
49
/* Reversed subtraction: r = v1 - r */
50#define SUB2I(r,v1,f1,f2) r = v1 - r
52VKERN_TEMPL_2V(do_vec_sub_vec_inv, SUB2I);
53
/* r *= v1 */
54#define MUL2(r,v1,f1,f2) r *= v1
56VKERN_TEMPL_2V(do_vec_mul_vec, MUL2);
57
/* Conjugate r, then multiply: r = conj(r) * v1 */
58#define CMUL2(r,v1,f1,f2) r = CPLX__ conj(r) * v1
60VKERN_TEMPL_2V(do_vec_cmul_vec, CMUL2);
61
/* Multiply by the conjugated operand instead: r *= conj(v1) */
62#define CMUL2I(r,v1,f1,f2) r *= CPLX__ conj(v1)
64VKERN_TEMPL_2V(do_vec_cmul_vec_inv, CMUL2I);
65
/* r /= v1 */
66#define DIV2(r,v1,f1,f2) r /= v1
68VKERN_TEMPL_2V(do_vec_div_vec, DIV2);
69
/* Reversed division: r = v1 / r */
70#define DIV2I(r,v1,f1,f2) r = v1 / r
72VKERN_TEMPL_2V(do_vec_div_vec_inv, DIV2I);
73
/* Conjugate r, then divide: r = conj(r) / v1 */
74#define CDIV2(r,v1,f1,f2) r = CPLX__ conj(r) / v1
76VKERN_TEMPL_2V(do_vec_cdiv_vec, CDIV2);
77
/* Conjugated operand divided by r: r = conj(v1) / r */
78#define CDIV2I(r,v1,f1,f2) r = CPLX__ conj(v1) / r
80VKERN_TEMPL_2V(do_vec_cdiv_vec_inv, CDIV2I);
81
82
83
/*
 * Element operations combining the operand v1 with the value f2 and
 * storing the result in r. Each macro takes (r, v1, f1, f2); f1 is accepted
 * but unused. Each one is passed to VKERN_TEMPL_2V_C with the kernel name.
 * "NV" forms place the value on the right (v1 OP f2), "RV" forms place it
 * on the left (f2 OP v1).
 * NOTE(review): VKERN_TEMPL_2V_C is defined outside this file.
 */
/* r = v1 + f2 */
84#define ADD2NV(r,v1,f1,f2) r = v1 + f2
86VKERN_TEMPL_2V_C(do_vec_val_add, ADD2NV);
87
/* r = v1 - f2 */
88#define SUB2NV(r,v1,f1,f2) r = v1 - f2
90VKERN_TEMPL_2V_C(do_vec_val_sub, SUB2NV);
91
/* r = v1 * f2 */
92#define MUL2NV(r,v1,f1,f2) r = v1 * f2
94VKERN_TEMPL_2V_C(do_vec_val_mul, MUL2NV);
95
96
/* r = f2 + v1 */
97#define ADD2RV(r,v1,f1,f2) r = f2 + v1
99VKERN_TEMPL_2V_C(do_val_vec_add, ADD2RV);
100
/* r = f2 - v1 */
101#define SUB2RV(r,v1,f1,f2) r = f2 - v1
103VKERN_TEMPL_2V_C(do_val_vec_sub, SUB2RV);
104
/* r = f2 * v1 */
105#define MUL2RV(r,v1,f1,f2) r = f2 * v1
107VKERN_TEMPL_2V_C(do_val_vec_mul, MUL2RV);
108
/* r = f2 / v1 */
109#define DIV2RV(r,v1,f1,f2) r = f2 / v1
111VKERN_TEMPL_2V_C(do_val_vec_div, DIV2RV);
112
113
/*
 * In-place element operations combining r with the value f2. Each macro
 * takes (r, f1, f2); f1 is accepted but unused. Each one is passed to
 * VKERN_TEMPL_1V_C with the kernel name. "NV" forms keep r on the left
 * (r OP= f2), "RV" forms put the value on the left (r = f2 OP r).
 * NOTE(review): VKERN_TEMPL_1V_C is defined outside this file.
 */
/* r += f2 */
114#define ADD1NV(r,f1,f2) r += f2
116VKERN_TEMPL_1V_C(do_vec_add_val, ADD1NV);
117
/* r -= f2 */
118#define SUB1NV(r,f1,f2) r -= f2
120VKERN_TEMPL_1V_C(do_vec_sub_val, SUB1NV);
121
/* Reversed subtraction: r = f2 - r */
122#define SUB1RV(r,f1,f2) r = f2 - r
124VKERN_TEMPL_1V_C(do_val_sub_vec, SUB1RV);
125
/* r *= f2 */
126#define MUL1NV(r,f1,f2) r *= f2
128VKERN_TEMPL_1V_C(do_vec_mul_val, MUL1NV);
129
/* r /= f2 */
130#define DIV1NV(r,f1,f2) r /= f2
132VKERN_TEMPL_1V_C(do_vec_div_val, DIV1NV);
133
/* Reversed division: r = f2 / r */
134#define DIV1RV(r,f1,f2) r = f2 / r
136VKERN_TEMPL_1V_C(do_val_div_vec, DIV1RV);
137
/* Value on the left of the sum: r = f2 + r */
138#define ADD1RV(r,f1,f2) r = f2 + r
140VKERN_TEMPL_1V_C(do_val_add_vec, ADD1RV);
141
/*
 * MUL1RV: value-on-the-left product, r = f2 * r. f1 is accepted but unused.
 * No kernel is instantiated from this macro in this header (unused).
 * The expansion deliberately has no trailing semicolon, matching every other
 * operation macro here: the user of the macro supplies the terminator, so
 * "if (c) MUL1RV(x, a, b); else ..." stays well-formed.
 */
#define MUL1RV(r,f1,f2) r = f2 * r
144
145
146/* TSVector stuff */
147
/*
 * In-place operations combining r with the product f2*v1. Each macro takes
 * (r, v1, f1, f2); f1 is accepted but unused. Each one is passed to
 * VKERN_TEMPL_2V_C with the kernel name.
 * NOTE(review): "svc" in the kernel names presumably refers to a scaled
 * vector (TSVector), i.e. the pair (f2, v1) -- confirm.
 */
/* r += f2*v1 */
148#define ADD2NS(r,v1,f1,f2) r += f2*v1
150VKERN_TEMPL_2V_C(do_vec_add_svc, ADD2NS);
151
/* r -= f2*v1 */
152#define SUB2NS(r,v1,f1,f2) r -= f2*v1
154VKERN_TEMPL_2V_C(do_vec_sub_svc, SUB2NS);
155
/* Reversed subtraction: r = f2*v1 - r */
156#define SUB2RS(r,v1,f1,f2) r = f2*v1 - r
158VKERN_TEMPL_2V_C(do_vec_sub_svc_inv, SUB2RS);
159
160
/*
 * Three-operand operations in which exactly one operand is multiplied by
 * the value f2 before the sum or difference is stored in r. Each macro
 * takes (r, v1, v2, f1, f2); f1 is accepted but unused. Each one is passed
 * to VKERN_TEMPL_3V_C with the kernel name. "NS" forms scale the second
 * operand (v2), "SN" forms scale the first (v1).
 */
/* r = v1 + f2*v2 */
161#define ADD3NS(r,v1,v2,f1,f2) r = v1 + f2*v2
163VKERN_TEMPL_3V_C(do_vec_svc_add, ADD3NS);
164
/* r = v1 - f2*v2 */
165#define SUB3NS(r,v1,v2,f1,f2) r = v1 - f2*v2
167VKERN_TEMPL_3V_C(do_vec_svc_sub, SUB3NS);
168
169
/* r = f2*v1 + v2 */
170#define ADD3SN(r,v1,v2,f1,f2) r = f2*v1 + v2
172VKERN_TEMPL_3V_C(do_svc_vec_add, ADD3SN);
173
/* r = f2*v1 - v2 */
174#define SUB3SN(r,v1,v2,f1,f2) r = f2*v1 - v2
176VKERN_TEMPL_3V_C(do_svc_vec_sub, SUB3SN);
177
178
/*
 * Three-operand operations in which both operands are scaled: v1 by f1 and
 * v2 by f2. Each macro takes (r, v1, v2, f1, f2) and is passed to
 * VKERN_TEMPL_3V_CC with the kernel name.
 */
/* r = f1*v1 + f2*v2 */
179#define ADD3SS(r,v1,v2,f1,f2) r = f1*v1 + f2*v2
181VKERN_TEMPL_3V_CC(do_svc_svc_add, ADD3SS);
182
/* r = f1*v1 - f2*v2 */
183#define SUB3SS(r,v1,v2,f1,f2) r = f1*v1 - f2*v2
185VKERN_TEMPL_3V_CC(do_svc_svc_sub, SUB3SS);
186
187
/*
 * In-place operations in which r itself is scaled before being combined
 * with v1. Each macro takes (r, v1, f1, f2).
 * The "SN" forms scale r by f2 and use v1 as is (f1 unused); they go through
 * VKERN_TEMPL_2V_C. The "SS" forms scale r by f1 and v1 by f2; they go
 * through VKERN_TEMPL_2V_CC.
 */
/* r = f2*r + v1 */
188#define ADD2SN(r,v1,f1,f2) r = f2*r + v1
190VKERN_TEMPL_2V_C(do_svc_add_vec, ADD2SN);
191
/* r = f2*r - v1 */
192#define SUB2SN(r,v1,f1,f2) r = f2*r - v1
194VKERN_TEMPL_2V_C(do_svc_sub_vec, SUB2SN);
195
196
/* r = f1*r + f2*v1 */
197#define ADD2SS(r,v1,f1,f2) r = f1*r + f2*v1
199VKERN_TEMPL_2V_CC(do_svc_add_svc, ADD2SS);
200
/* r = f1*r - f2*v1 */
201#define SUB2SS(r,v1,f1,f2) r = f1*r - f2*v1
203VKERN_TEMPL_2V_CC(do_svc_sub_svc, SUB2SS);
204
205
/*
 * Scale-then-offset operations: one operand is multiplied by f1, then the
 * value f2 is added or subtracted. The two-operand forms take
 * (r, v1, f1, f2), read v1 and go through VKERN_TEMPL_2V_CC. The in-place
 * forms take (r, f1, f2), update r itself and go through VKERN_TEMPL_1V_CC.
 */
/* r = f1*v1 + f2 */
206#define ADD2SV(r,v1,f1,f2) r = f1*v1 + f2
208VKERN_TEMPL_2V_CC(do_svc_val_add, ADD2SV);
209
/* r = f1*v1 - f2 */
210#define SUB2SV(r,v1,f1,f2) r = f1*v1 - f2
212VKERN_TEMPL_2V_CC(do_svc_val_sub, SUB2SV);
213
214
/* In place: r = f1*r + f2 */
215#define ADD1SV(r,f1,f2) r = f1*r + f2
217VKERN_TEMPL_1V_CC(do_svc_add_val, ADD1SV);
218
/* In place: r = f1*r - f2 */
219#define SUB1SV(r,f1,f2) r = f1*r - f2
221VKERN_TEMPL_1V_CC(do_svc_sub_val, SUB1SV);
222
223
/*
 * Operations with the value f1 on the left and the product f2*v1 on the
 * right, stored in r. Each macro takes (r, v1, f1, f2) and is passed to
 * VKERN_TEMPL_2V_CC with the kernel name.
 */
/* r = f1 + f2*v1 */
224#define ADD2VS(r,v1,f1,f2) r = f1 + f2*v1
226VKERN_TEMPL_2V_CC(do_val_svc_add, ADD2VS);
227
/* r = f1 - f2*v1 */
228#define SUB2VS(r,v1,f1,f2) r = f1 - f2*v1
230VKERN_TEMPL_2V_CC(do_val_svc_sub, SUB2VS);
231
/* r = f1 / (f2*v1); the parentheses keep the whole product in the denominator */
232#define DIV2VS(r,v1,f1,f2) r = f1 / (f2*v1)
234VKERN_TEMPL_2V_CC(do_val_svc_div, DIV2VS);
235
236
/*
 * Negation. NEG2 takes (r, v1, f1, f2) and stores the negated operand in r;
 * NEG1 takes (r, f1, f2) and negates r in place. f1/f2 are accepted but
 * unused. They are passed to VKERN_TEMPL_2V and VKERN_TEMPL_1V respectively.
 */
/* r = -v1 */
237#define NEG2(r,v1,f1,f2) r = -v1
239VKERN_TEMPL_2V(do_vec_neg_vec, NEG2);
240
/* In place: r = -r */
241#define NEG1(r,f1,f2) r = -r
243VKERN_TEMPL_1V(do_vec_neg, NEG1);
244
/*
 * Accumulating two-operand operations: instead of writing to r, these read
 * r and v1 and add their product into the accumulator f2. They are passed
 * to VKERN_TEMPL_2V_T (and, for the MULT forms, also to
 * VKERN_TEMPL_2V_T_STRIDE) with the kernel name and the type T.
 *
 * The "quick" forms only update f2. The "exact" forms ("X" prefix)
 * additionally add (t - f2) - y to f1 on each step, where y is the new term
 * and t the updated sum, i.e. the difference between what the addition
 * actually contributed and y. Statement order inside the braces matters:
 * f1 must be updated from the old f2 before f2 is overwritten with t.
 * NOTE(review): how f1 and f2 are initialised and combined into the result
 * is decided by the template macros, which are not visible here.
 */
/* f2 += conj(r) * v1 */
245#define DOT2(r,v1,f1,f2) f2 += CPLX__ conj(r) * v1
247VKERN_TEMPL_2V_T(do_vec_dot_quick, DOT2, T);
248
/* As DOT2, additionally tracking the per-step difference in f1 */
249#define XDOT2(r,v1,f1,f2) { T y = CPLX__ conj(r) * v1; T t = f2+y; f1 += (t-f2)-y; f2 = t; }
251VKERN_TEMPL_2V_T(do_vec_dot_exact, XDOT2, T);
252
/* f2 += r * v1 (no conjugation) */
253#define MULT2(r,v1,f1,f2) f2 += r * v1
255VKERN_TEMPL_2V_T(do_vec_mult_quick, MULT2, T);
256VKERN_TEMPL_2V_T_STRIDE(do_vec_mult_stride_quick, MULT2, T);
257
/* As MULT2, additionally tracking the per-step difference in f1 */
258#define XMULT2(r,v1,f1,f2) { T y = r * v1; T t = f2+y; f1 += (t-f2)-y; f2 = t; }
260VKERN_TEMPL_2V_T(do_vec_mult_exact, XMULT2, T);
261VKERN_TEMPL_2V_T_STRIDE(do_vec_mult_stride_exact, XMULT2, T);
262
/*
 * Accumulating single-operand operations: each reads r and adds a term
 * derived from it into the accumulator f2. Each macro takes (r, f1, f2).
 * The "quick" forms only update f2. The "exact" forms ("X" prefix) also add
 * (t - f2) - term to f1 on each step, where t is the updated sum; f1 must be
 * updated before f2 is overwritten with t.
 *
 * The fabssqr kernels accumulate in double; the sumsqr and sum kernels
 * accumulate in T. The quick fabssqr kernel is the only one that goes
 * through VKERN_TEMPL_1V_T_LD; all others use VKERN_TEMPL_1V_T.
 * NOTE(review): fabssqr() and the template macros are defined outside this
 * file; how f1 and f2 are initialised and combined is not visible here.
 */
/* f2 += fabssqr(r) */
263#define FABS1(r,f1,f2) f2 += fabssqr(r)
265VKERN_TEMPL_1V_T_LD(do_vec_fabssqr_quick, FABS1, double);
266
/* As FABS1, additionally tracking the per-step difference in f1 */
267#define XFABS1(r,f1,f2) { double y = fabssqr(r); double t = f2+y; f1 += (t-f2)-y; f2 = t; }
269VKERN_TEMPL_1V_T(do_vec_fabssqr_exact, XFABS1, double);
270
/* f2 += r*r (plain square, no conjugation) */
271#define SQR1(r,f1,f2) f2 += r*r
273VKERN_TEMPL_1V_T(do_vec_sumsqr_quick, SQR1, T);
274
/* As SQR1, additionally tracking the per-step difference in f1 */
275#define XSQR1(r,f1,f2) { T y = r*r; T t = f2+y; f1 += (t-f2)-y; f2 = t; }
277VKERN_TEMPL_1V_T(do_vec_sumsqr_exact, XSQR1, T);
278
/* f2 += r */
279#define SUM1(r,f1,f2) f2 += r
281VKERN_TEMPL_1V_T(do_vec_sum_quick, SUM1, T);
282
/* As SUM1, additionally tracking the per-step difference in f1 */
283#define XSUM1(r,f1,f2) { T t = f2+r; f1 += (t-f2)-r; f2 = t; }
285VKERN_TEMPL_1V_T(do_vec_sum_exact, XSUM1, T);
286
287
/*
 * Multiply-accumulate into r: unlike the r = v1 OP v2 forms, r keeps its old
 * value and the product of v1 and v2 is added to it. Each macro takes
 * (r, v1, v2, f1, f2) with f1/f2 unused, and is passed to VKERN_TEMPL_3V.
 * SUMMULT3 does r += v1*v2; SUMCMULT3 does r += conj(v1)*v2.
 */
288// Used in do_bdmat_vec_mult
289#define SUMMULT3(r,v1,v2,f1,f2) r += v1*v2
290VKERN_TEMPL_3V(do_add_vec_vec_mul, SUMMULT3);
291#define SUMCMULT3(r,v1,v2,f1,f2) r += CPLX__ conj(v1)*v2
292VKERN_TEMPL_3V(do_add_vec_vec_cmul, SUMCMULT3);
293
294
296
297#endif /* TBCI_VEC_KERN_UNR_PREF_H */
298
#define NAMESPACE_END
Definition basics.h:323
#define NAMESPACE_TBCI
Definition basics.h:317
#define T
Definition bdmatlib.cc:20
#define VKERN_TEMPL_1V(FNAME, OP1)
Operations of type VEC = OP self.
Definition plain_def.h:159
#define VKERN_TEMPL_1V_C(FNAME, OP1)
Operations of type VEC OP= VAL.
Definition plain_def.h:172
#define VKERN_TEMPL_3V_CC(FNAME, OP3)
Operations of type vec = val * vec OP val * vec.
Definition plain_def.h:54
#define VKERN_TEMPL_2V_CC(FNAME, OP2)
Operations of type VEC = VEC OP VAL or VAL OP VEC.
Definition plain_def.h:102
#define VKERN_TEMPL_2V_T_STRIDE(FNAME, OP2, TYPE)
Operations of type TYPE = VEC OP VEC.
Definition plain_def.h:139
#define VKERN_TEMPL_2V(FNAME, OP2)
Operations of type vec OP= vec.
Definition plain_def.h:72
#define VKERN_TEMPL_2V_C(FNAME, OP2)
Operations of type VEC = VEC OP VAL or VAL OP VEC.
Definition plain_def.h:86
#define VKERN_TEMPL_1V_T_LD(FNAME, OP1, TYPE)
Operations of type TYPE = OP VEC (using LONG_DOUBLE internally).
Definition plain_def.h:219
#define VKERN_TEMPL_3V(FNAME, OP3)
We leave unrolling and prefetching to the compiler.
Definition plain_def.h:21
#define VKERN_TEMPL_3V_C(FNAME, OP3)
Operations of type vec = vec OP val * vec.
Definition plain_def.h:37
#define VKERN_TEMPL_1V_T(FNAME, OP1, TYPE)
Operations of type TYPE = OP VEC.
Definition plain_def.h:202
#define VKERN_TEMPL_2V_T(FNAME, OP2, TYPE)
Operations of type TYPE = VEC OP VEC.
Definition plain_def.h:119
#define VKERN_TEMPL_1V_CC(FNAME, OP1)
Operations of type VEC *= S OP= VAL.
Definition plain_def.h:186
#define SQR1(r, f1, f2)
#define SUB3(r, v1, v2, f1, f2)
#define MUL2RV(r, v1, f1, f2)
#define DIV3(r, v1, v2, f1, f2)
#define XSQR1(r, f1, f2)
#define XFABS1(r, f1, f2)
#define SUM1(r, f1, f2)
#define MUL2(r, v1, f1, f2)
#define ADD3NS(r, v1, v2, f1, f2)
#define SUB2RV(r, v1, f1, f2)
#define ADD1RV(r, f1, f2)
#define ADD1NV(r, f1, f2)
#define CDIV3(r, v1, v2, f1, f2)
#define XMULT2(r, v1, f1, f2)
#define ADD2SS(r, v1, f1, f2)
#define XSUM1(r, f1, f2)
#define SUB2VS(r, v1, f1, f2)
#define SUB2NV(r, v1, f1, f2)
#define ADD3(r, v1, v2, f1, f2)
#define MUL2NV(r, v1, f1, f2)
#define SUB1RV(r, f1, f2)
#define SUB2SV(r, v1, f1, f2)
#define ADD1SV(r, f1, f2)
#define SUB3SN(r, v1, v2, f1, f2)
#define DIV2VS(r, v1, f1, f2)
#define XDOT2(r, v1, f1, f2)
#define ADD2SV(r, v1, f1, f2)
#define MUL3(r, v1, v2, f1, f2)
#define DIV2I(r, v1, f1, f2)
#define ADD2NV(r, v1, f1, f2)
#define DIV1RV(r, f1, f2)
#define ADD2VS(r, v1, f1, f2)
#define SUB2(r, v1, f1, f2)
#define CMUL3(r, v1, v2, f1, f2)
#define CDIV2I(r, v1, f1, f2)
#define DIV2RV(r, v1, f1, f2)
#define SUB3SS(r, v1, v2, f1, f2)
#define SUB2NS(r, v1, f1, f2)
#define SUB1SV(r, f1, f2)
#define ADD2SN(r, v1, f1, f2)
#define DOT2(r, v1, f1, f2)
#define DIV1NV(r, f1, f2)
#define SUB2SS(r, v1, f1, f2)
#define MULT2(r, v1, f1, f2)
#define SUMCMULT3(r, v1, v2, f1, f2)
#define SUMMULT3(r, v1, v2, f1, f2)
#define CMUL2(r, v1, f1, f2)
#define CDIV2(r, v1, f1, f2)
#define SUB1NV(r, f1, f2)
#define SUB2SN(r, v1, f1, f2)
#define CMUL2I(r, v1, f1, f2)
#define MUL1NV(r, f1, f2)
#define SUB2I(r, v1, f1, f2)
#define ADD2RV(r, v1, f1, f2)
#define NEG2(r, v1, f1, f2)
#define SUB3NS(r, v1, v2, f1, f2)
#define SUB2RS(r, v1, f1, f2)
#define DIV2(r, v1, f1, f2)
#define FABS1(r, f1, f2)
#define ADD3SS(r, v1, v2, f1, f2)
#define ADD2NS(r, v1, f1, f2)
#define ADD2(r, v1, f1, f2)
#define NEG1(r, f1, f2)
#define ADD3SN(r, v1, v2, f1, f2)