TBCI Numerical high perf. C++ Library
2.8.0
Main Page
Related Pages
Namespaces
Classes
Files
File List
File Members
home
abuild
rpmbuild
BUILD
numerix-2.0
lina
include
unroll_prefetch_def2.h
Go to the documentation of this file.
1
8
#ifndef TBCI_UNROLL_PREFETCH_DEF2_H
#define TBCI_UNROLL_PREFETCH_DEF2_H

//#include "tbci/basics.h"

/* Loop-constant by-value/by-ref parameter type for element type T.
 * LCTYPE is for use inside templates (needs `typename`), LCTYPED for
 * non-dependent contexts.  REGISTER is a project macro — assumed to
 * expand to `register` or nothing; confirm in tbci/basics.h. */
#define LCTYPE(T)  REGISTER typename tbci_traits<T>::loop_const_refval_type
#define LCTYPED(T) REGISTER tbci_traits<T>::loop_const_refval_type

/* Default unroll factor for the vector kernels below (1, 2, 4 or 8). */
#ifndef UNROLL_DEPTH
# define UNROLL_DEPTH 4
#endif
45
46
47
/***********************************************************
 * 3 pointer operations
 ***********************************************************/

/* One step of res[k] = OPER(v1[k], v2[k], f1, f2) with software prefetch
 * of all three streams PREF_OFFS(T) elements ahead.  Expects res, v1, v2,
 * i, f1, f2 in scope at the expansion site; decrements i by 1. */
#define UNROLL1_PREF_KERNEL5(OPER,T,CA0,CA1,CA2)        \
    --i;                                                \
    OPER(res[0], v1[0], v2[0], f1, f2);                 \
    PREFETCH_R(v1 +PREF_OFFS(T), CA1);                  \
    ++v1;                                               \
    PREFETCH_R(v2 +PREF_OFFS(T), CA2);                  \
    ++v2;                                               \
    PREFETCH_W(res+PREF_OFFS(T), CA0);                  \
    ++res
61
63
/* One step of res[k] = OPER(v1[k], v2[k], f1, f2), no prefetch. */
#define UNROLL1_KERNEL5(OPER)                           \
    --i;                                                \
    OPER(res[0], v1[0], v2[0], f1, f2);                 \
    ++v1; ++v2; ++res

/* No setup/teardown is needed for the depth-1 kernel. */
#define UNROLL1_KERNEL5_PREPARE do {} while(0)
#define UNROLL1_KERNEL5_FIXUP   do {} while(0)
70
71
73
/* Two steps of res[k] = OPER(v1[k], v2[k], f1, f2) with prefetch.
 * EL_PER_CL(T) (elements per cache line) picks how many distinct cache
 * lines a 2-element batch touches, hence how many prefetches to issue.
 * Decrements i by 2.  (Original ended with a stray line continuation
 * after the closing brace; removed.) */
#define UNROLL2_PREF_KERNEL5(OPER,T,CA0,CA1,CA2)        \
    if (EL_PER_CL(T) <= 1) {                            \
        OPER(res[0], v1[0], v2[0], f1, f2);             \
        PREFETCH_R(v1 +PREF_OFFS(T),   CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1);            \
        i -= 2;                                         \
        PREFETCH_R(v2 +PREF_OFFS(T),   CA2);            \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2);            \
        OPER(res[1], v1[1], v2[1], f1, f2);             \
        v1 += 2; v2 += 2;                               \
        PREFETCH_W(res+PREF_OFFS(T),   CA0);            \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0);            \
        res += 2;                                       \
    } else {                                            \
        OPER(res[0], v1[0], v2[0], f1, f2);             \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1);              \
        i -= 2;                                         \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2);              \
        OPER(res[1], v1[1], v2[1], f1, f2);             \
        v1 += 2; v2 += 2;                               \
        PREFETCH_W(res+PREF_OFFS(T), CA0);              \
        res += 2;                                       \
    }
96
97
99
/* Two steps of res[k] = OPER(v1[k], v2[k], f1, f2), no prefetch. */
#define UNROLL2_KERNEL5(OPER)                           \
    OPER(res[0], v1[0], v2[0], f1, f2);                 \
    i -= 2;                                             \
    OPER(res[1], v1[1], v2[1], f1, f2);                 \
    v1 += 2; v2 += 2; res += 2

/* No setup/teardown is needed for the depth-2 kernel. */
#define UNROLL2_KERNEL5_PREPARE do {} while(0)
#define UNROLL2_KERNEL5_FIXUP   do {} while(0)
107
108
110
/* Four steps of res[k] = OPER(v1[k], v2[k], f1, f2) with prefetch.
 * Branch on EL_PER_CL(T): a 4-element batch spans 4, 2 or 1 cache
 * line(s), so the number of prefetch instructions is scaled to match.
 * Prefetches are interleaved with the operations to spread the issue
 * bandwidth.  Decrements i by 4. */
#define UNROLL4_PREF_KERNEL5(OPER,T,CA0,CA1,CA2)        \
    if (EL_PER_CL(T) <= 1) {                            \
        OPER(res[0], v1[0], v2[0], f1, f2);             \
        PREFETCH_R(v1 +PREF_OFFS(T),   CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1);            \
        OPER(res[1], v1[1], v2[1], f1, f2);             \
        i -= 4;                                         \
        PREFETCH_R(v2 +PREF_OFFS(T),   CA2);            \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2);            \
        OPER(res[2], v1[2], v2[2], f1, f2);             \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2);            \
        PREFETCH_R(v2 +PREF_OFFS(T)+3, CA2);            \
        PREFETCH_W(res+PREF_OFFS(T),   CA0);            \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0);            \
        OPER(res[3], v1[3], v2[3], f1, f2);             \
        v1 += 4; v2 += 4;                               \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0);            \
        PREFETCH_W(res+PREF_OFFS(T)+3, CA0);            \
        res += 4;                                       \
    } else if (EL_PER_CL(T) <= 2) {                     \
        OPER(res[0], v1[0], v2[0], f1, f2);             \
        PREFETCH_R(v1 +PREF_OFFS(T),   CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1);            \
        OPER(res[1], v1[1], v2[1], f1, f2);             \
        i -= 4;                                         \
        PREFETCH_R(v2 +PREF_OFFS(T),   CA2);            \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2);            \
        OPER(res[2], v1[2], v2[2], f1, f2);             \
        PREFETCH_W(res+PREF_OFFS(T),   CA0);            \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0);            \
        OPER(res[3], v1[3], v2[3], f1, f2);             \
        v1 += 4; v2 += 4;                               \
        res += 4;                                       \
    } else {                                            \
        OPER(res[0], v1[0], v2[0], f1, f2);             \
        i -= 4;                                         \
        OPER(res[1], v1[1], v2[1], f1, f2);             \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1);              \
        OPER(res[2], v1[2], v2[2], f1, f2);             \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2);              \
        OPER(res[3], v1[3], v2[3], f1, f2);             \
        v1 += 4; v2 += 4;                               \
        PREFETCH_W(res+PREF_OFFS(T), CA0);              \
        res += 4;                                       \
    }
157
159
/* Four steps of res[k] = OPER(v1[k], v2[k], f1, f2), no prefetch. */
#define UNROLL4_KERNEL5(OPER)                           \
    OPER(res[0], v1[0], v2[0], f1, f2);                 \
    OPER(res[1], v1[1], v2[1], f1, f2);                 \
    i -= 4;                                             \
    OPER(res[2], v1[2], v2[2], f1, f2);                 \
    OPER(res[3], v1[3], v2[3], f1, f2);                 \
    v1 += 4; v2 += 4;                                   \
    res += 4

/* No setup/teardown is needed for the depth-4 kernel. */
#define UNROLL4_KERNEL5_PREPARE do {} while(0)
#define UNROLL4_KERNEL5_FIXUP   do {} while(0)
170
171
173
/* Eight steps of res[k] = OPER(v1[k], v2[k], f1, f2) with prefetch.
 * Branch on EL_PER_CL(T): an 8-element batch spans 8, 4, 2 or 1 cache
 * line(s); the prefetch count per stream is scaled accordingly and the
 * prefetches are interleaved with the operations.  Decrements i by 8. */
#define UNROLL8_PREF_KERNEL5(OPER,T,CA0,CA1,CA2)        \
    if (EL_PER_CL(T) <= 1) {                            \
        OPER(res[0], v1[0], v2[0], f1, f2);             \
        PREFETCH_R(v1 +PREF_OFFS(T),   CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1);            \
        OPER(res[1], v1[1], v2[1], f1, f2);             \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+5, CA1);            \
        OPER(res[2], v1[2], v2[2], f1, f2);             \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+7, CA1);            \
        PREFETCH_R(v2 +PREF_OFFS(T),   CA2);            \
        OPER(res[3], v1[3], v2[3], f1, f2);             \
        i -= 8;                                         \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2);            \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2);            \
        PREFETCH_R(v2 +PREF_OFFS(T)+3, CA2);            \
        OPER(res[4], v1[4], v2[4], f1, f2);             \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2);            \
        PREFETCH_R(v2 +PREF_OFFS(T)+5, CA2);            \
        PREFETCH_R(v2 +PREF_OFFS(T)+6, CA2);            \
        OPER(res[5], v1[5], v2[5], f1, f2);             \
        PREFETCH_R(v2 +PREF_OFFS(T)+7, CA2);            \
        PREFETCH_W(res+PREF_OFFS(T),   CA0);            \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0);            \
        OPER(res[6], v1[6], v2[6], f1, f2);             \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0);            \
        PREFETCH_W(res+PREF_OFFS(T)+3, CA0);            \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0);            \
        OPER(res[7], v1[7], v2[7], f1, f2);             \
        v1 += 8; v2 += 8;                               \
        PREFETCH_W(res+PREF_OFFS(T)+5, CA0);            \
        PREFETCH_W(res+PREF_OFFS(T)+6, CA0);            \
        PREFETCH_W(res+PREF_OFFS(T)+7, CA0);            \
        res += 8;                                       \
    } else if (EL_PER_CL(T) <= 2) {                     \
        OPER(res[0], v1[0], v2[0], f1, f2);             \
        PREFETCH_R(v1 +PREF_OFFS(T),   CA1);            \
        OPER(res[1], v1[1], v2[1], f1, f2);             \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1);            \
        OPER(res[2], v1[2], v2[2], f1, f2);             \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1);            \
        OPER(res[3], v1[3], v2[3], f1, f2);             \
        i -= 8;                                         \
        PREFETCH_R(v2 +PREF_OFFS(T),   CA2);            \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2);            \
        OPER(res[4], v1[4], v2[4], f1, f2);             \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2);            \
        PREFETCH_R(v2 +PREF_OFFS(T)+6, CA2);            \
        OPER(res[5], v1[5], v2[5], f1, f2);             \
        PREFETCH_W(res+PREF_OFFS(T),   CA0);            \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0);            \
        OPER(res[6], v1[6], v2[6], f1, f2);             \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0);            \
        PREFETCH_W(res+PREF_OFFS(T)+6, CA0);            \
        OPER(res[7], v1[7], v2[7], f1, f2);             \
        v1 += 8; v2 += 8;                               \
        res += 8;                                       \
    } else if (EL_PER_CL(T) <= 4) {                     \
        OPER(res[0], v1[0], v2[0], f1, f2);             \
        PREFETCH_R(v1 +PREF_OFFS(T),   CA1);            \
        OPER(res[1], v1[1], v2[1], f1, f2);             \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1);            \
        OPER(res[2], v1[2], v2[2], f1, f2);             \
        i -= 8;                                         \
        OPER(res[3], v1[3], v2[3], f1, f2);             \
        PREFETCH_R(v2 +PREF_OFFS(T),   CA2);            \
        OPER(res[4], v1[4], v2[4], f1, f2);             \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2);            \
        OPER(res[5], v1[5], v2[5], f1, f2);             \
        PREFETCH_W(res+PREF_OFFS(T),   CA0);            \
        OPER(res[6], v1[6], v2[6], f1, f2);             \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0);            \
        OPER(res[7], v1[7], v2[7], f1, f2);             \
        v1 += 8; v2 += 8;                               \
        res += 8;                                       \
    } else {                                            \
        OPER(res[0], v1[0], v2[0], f1, f2);             \
        OPER(res[1], v1[1], v2[1], f1, f2);             \
        i -= 8;                                         \
        OPER(res[2], v1[2], v2[2], f1, f2);             \
        OPER(res[3], v1[3], v2[3], f1, f2);             \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1);              \
        OPER(res[4], v1[4], v2[4], f1, f2);             \
        OPER(res[5], v1[5], v2[5], f1, f2);             \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2);              \
        OPER(res[6], v1[6], v2[6], f1, f2);             \
        OPER(res[7], v1[7], v2[7], f1, f2);             \
        v1 += 8; v2 += 8;                               \
        PREFETCH_W(res+PREF_OFFS(T), CA0);              \
        res += 8;                                       \
    }
268
269
271
/* Eight steps of res[k] = OPER(v1[k], v2[k], f1, f2), no prefetch. */
#define UNROLL8_KERNEL5(OPER)                           \
    OPER(res[0], v1[0], v2[0], f1, f2);                 \
    OPER(res[1], v1[1], v2[1], f1, f2);                 \
    OPER(res[2], v1[2], v2[2], f1, f2);                 \
    OPER(res[3], v1[3], v2[3], f1, f2);                 \
    i -= 8;                                             \
    OPER(res[4], v1[4], v2[4], f1, f2);                 \
    OPER(res[5], v1[5], v2[5], f1, f2);                 \
    OPER(res[6], v1[6], v2[6], f1, f2);                 \
    OPER(res[7], v1[7], v2[7], f1, f2);                 \
    v1 += 8; v2 += 8;                                   \
    res += 8

/* No setup/teardown is needed for the depth-8 kernel. */
#define UNROLL8_KERNEL5_PREPARE do {} while(0)
#define UNROLL8_KERNEL5_FIXUP   do {} while(0)
286
288
/* Warm the caches before entering a 3-stream kernel loop: prefetch the
 * next PREFETCH_AHEAD cache lines of v1, v2 (read) and res (write).
 * Line 0 of each stream is assumed already prefetched by the caller. */
#define PREF_AHEAD3(T,CA0,CA1,CA2)                      \
    if (PREFETCH_AHEAD >= 16) {                         \
        PREFETCH_R(v1 +EL_PER_CL(T),     CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 2,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 3,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 4,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 5,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 6,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 7,  CA1);          \
        PREFETCH_R(v2 +EL_PER_CL(T),     CA2);          \
        PREFETCH_R(v2 +EL_PER_CL(T)* 2,  CA2);          \
        PREFETCH_R(v2 +EL_PER_CL(T)* 3,  CA2);          \
        PREFETCH_R(v2 +EL_PER_CL(T)* 4,  CA2);          \
        PREFETCH_R(v2 +EL_PER_CL(T)* 5,  CA2);          \
        PREFETCH_R(v2 +EL_PER_CL(T)* 6,  CA2);          \
        PREFETCH_R(v2 +EL_PER_CL(T)* 7,  CA2);          \
        PREFETCH_W(res+EL_PER_CL(T),     CA0);          \
        PREFETCH_W(res+EL_PER_CL(T)* 2,  CA0);          \
        PREFETCH_W(res+EL_PER_CL(T)* 3,  CA0);          \
        PREFETCH_W(res+EL_PER_CL(T)* 4,  CA0);          \
        PREFETCH_W(res+EL_PER_CL(T)* 5,  CA0);          \
        PREFETCH_W(res+EL_PER_CL(T)* 6,  CA0);          \
        PREFETCH_W(res+EL_PER_CL(T)* 7,  CA0);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 8,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 9,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)*10,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)*11,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)*12,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)*13,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)*14,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)*15,  CA1);          \
        PREFETCH_R(v2 +EL_PER_CL(T)* 8,  CA2);          \
        PREFETCH_R(v2 +EL_PER_CL(T)* 9,  CA2);          \
        PREFETCH_R(v2 +EL_PER_CL(T)*10,  CA2);          \
        PREFETCH_R(v2 +EL_PER_CL(T)*11,  CA2);          \
        PREFETCH_R(v2 +EL_PER_CL(T)*12,  CA2);          \
        PREFETCH_R(v2 +EL_PER_CL(T)*13,  CA2);          \
        PREFETCH_R(v2 +EL_PER_CL(T)*14,  CA2);          \
        PREFETCH_R(v2 +EL_PER_CL(T)*15,  CA2);          \
        PREFETCH_W(res+EL_PER_CL(T)* 8,  CA0);          \
        PREFETCH_W(res+EL_PER_CL(T)* 9,  CA0);          \
        PREFETCH_W(res+EL_PER_CL(T)*10,  CA0);          \
        PREFETCH_W(res+EL_PER_CL(T)*11,  CA0);          \
        PREFETCH_W(res+EL_PER_CL(T)*12,  CA0);          \
        PREFETCH_W(res+EL_PER_CL(T)*13,  CA0);          \
        PREFETCH_W(res+EL_PER_CL(T)*14,  CA0);          \
        PREFETCH_W(res+EL_PER_CL(T)*15,  CA0);          \
    } else if (PREFETCH_AHEAD >= 8) {                   \
        PREFETCH_R(v1 +EL_PER_CL(T),   CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*4, CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*5, CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*6, CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*7, CA1);            \
        PREFETCH_R(v2 +EL_PER_CL(T),   CA2);            \
        PREFETCH_R(v2 +EL_PER_CL(T)*2, CA2);            \
        PREFETCH_R(v2 +EL_PER_CL(T)*3, CA2);            \
        PREFETCH_R(v2 +EL_PER_CL(T)*4, CA2);            \
        PREFETCH_R(v2 +EL_PER_CL(T)*5, CA2);            \
        PREFETCH_R(v2 +EL_PER_CL(T)*6, CA2);            \
        PREFETCH_R(v2 +EL_PER_CL(T)*7, CA2);            \
        PREFETCH_W(res+EL_PER_CL(T),   CA0);            \
        PREFETCH_W(res+EL_PER_CL(T)*2, CA0);            \
        PREFETCH_W(res+EL_PER_CL(T)*3, CA0);            \
        PREFETCH_W(res+EL_PER_CL(T)*4, CA0);            \
        PREFETCH_W(res+EL_PER_CL(T)*5, CA0);            \
        PREFETCH_W(res+EL_PER_CL(T)*6, CA0);            \
        PREFETCH_W(res+EL_PER_CL(T)*7, CA0);            \
    } else if (PREFETCH_AHEAD >= 4) {                   \
        PREFETCH_R(v1 +EL_PER_CL(T),   CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1);            \
        PREFETCH_R(v2 +EL_PER_CL(T),   CA2);            \
        PREFETCH_R(v2 +EL_PER_CL(T)*2, CA2);            \
        PREFETCH_R(v2 +EL_PER_CL(T)*3, CA2);            \
        PREFETCH_W(res+EL_PER_CL(T),   CA0);            \
        PREFETCH_W(res+EL_PER_CL(T)*2, CA0);            \
        PREFETCH_W(res+EL_PER_CL(T)*3, CA0);            \
    } else if (PREFETCH_AHEAD >= 2) {                   \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1);              \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2);              \
        PREFETCH_W(res+EL_PER_CL(T), CA0);              \
    }
372
373
374
/***********************************************************
 * 2 pointer operations
 ***********************************************************/

/* One step of res[k] = OPER(v1[k], f1, f2) with prefetch.  PREFETCH_X
 * is PREFETCH_R or PREFETCH_W depending on whether OPER reads res. */
#define UNROLL1_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    OPER(res[0], v1[0], f1, f2);                        \
    --i;                                                \
    PREFETCH_R(v1 +PREF_OFFS(T), CA1);                  \
    ++v1;                                               \
    PREFETCH_X(res+PREF_OFFS(T), CA0);                  \
    ++res
386
388
/* One step of res[k] = OPER(v1[k], f1, f2), no prefetch. */
#define UNROLL1_KERNEL4(OPER)                           \
    --i;                                                \
    OPER(res[0], v1[0], f1, f2);                        \
    ++v1; ++res

/* No setup/teardown is needed for the depth-1 kernel. */
#define UNROLL1_KERNEL4_PREPARE do {} while(0)
#define UNROLL1_KERNEL4_FIXUP   do {} while(0)
395
396
398
/* Two steps of res[k] = OPER(v1[k], f1, f2) with prefetch; prefetch
 * count per stream scaled by EL_PER_CL(T).  Decrements i by 2.
 * (Original ended with a stray line continuation after the closing
 * brace; removed.) */
#define UNROLL2_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    if (EL_PER_CL(T) <= 1) {                            \
        OPER(res[0], v1[0], f1, f2);                    \
        i -= 2;                                         \
        PREFETCH_R(v1 +PREF_OFFS(T),   CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1);            \
        OPER(res[1], v1[1], f1, f2);                    \
        v1 += 2;                                        \
        PREFETCH_X(res+PREF_OFFS(T),   CA0);            \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0);            \
        res += 2;                                       \
    } else {                                            \
        OPER(res[0], v1[0], f1, f2);                    \
        i -= 2;                                         \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1);              \
        OPER(res[1], v1[1], f1, f2);                    \
        v1 += 2;                                        \
        PREFETCH_X(res+PREF_OFFS(T), CA0);              \
        res += 2;                                       \
    }
418
419
421
/* Two steps of res[k] = OPER(v1[k], f1, f2), no prefetch.  v1 is bumped
 * early, so the second element is addressed as v1[-1]. */
#define UNROLL2_KERNEL4(OPER)                           \
    OPER(res[0], v1[0], f1, f2);                        \
    v1 += 2; i -= 2;                                    \
    OPER(res[1], v1[-1], f1, f2);                       \
    res += 2

/* No setup/teardown is needed for the depth-2 kernel. */
#define UNROLL2_KERNEL4_PREPARE do {} while(0)
#define UNROLL2_KERNEL4_FIXUP   do {} while(0)
429
430
432
/* Four steps of res[k] = OPER(v1[k], f1, f2) with prefetch; prefetch
 * count scaled by EL_PER_CL(T).  v1 is bumped before the last OPER, so
 * the final element is addressed as v1[-1].  Decrements i by 4. */
#define UNROLL4_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    if (EL_PER_CL(T) <= 1) {                            \
        OPER(res[0], v1[0], f1, f2);                    \
        i -= 4;                                         \
        PREFETCH_R(v1 +PREF_OFFS(T),   CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1);            \
        OPER(res[1], v1[1], f1, f2);                    \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1);            \
        PREFETCH_X(res+PREF_OFFS(T),   CA0);            \
        OPER(res[2], v1[2], f1, f2);                    \
        v1 += 4;                                        \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0);            \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0);            \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0);            \
        OPER(res[3], v1[-1], f1, f2);                   \
        res += 4;                                       \
    } else if (EL_PER_CL(T) <= 2) {                     \
        OPER(res[0], v1[0], f1, f2);                    \
        i -= 4;                                         \
        PREFETCH_R(v1 +PREF_OFFS(T),   CA1);            \
        OPER(res[1], v1[1], f1, f2);                    \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1);            \
        OPER(res[2], v1[2], f1, f2);                    \
        v1 += 4;                                        \
        PREFETCH_X(res+PREF_OFFS(T),   CA0);            \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0);            \
        OPER(res[3], v1[-1], f1, f2);                   \
        res += 4;                                       \
    } else {                                            \
        OPER(res[0], v1[0], f1, f2);                    \
        i -= 4;                                         \
        OPER(res[1], v1[1], f1, f2);                    \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1);              \
        OPER(res[2], v1[2], f1, f2);                    \
        v1 += 4;                                        \
        PREFETCH_X(res+PREF_OFFS(T), CA0);              \
        OPER(res[3], v1[-1], f1, f2);                   \
        res += 4;                                       \
    }
472
474
/* Four steps of res[k] = OPER(v1[k], f1, f2), no prefetch.  v1 is bumped
 * midway, so the last two elements are addressed as v1[-2], v1[-1]. */
#define UNROLL4_KERNEL4(OPER)                           \
    OPER(res[0], v1[0], f1, f2);                        \
    OPER(res[1], v1[1], f1, f2);                        \
    v1 += 4; i -= 4;                                    \
    OPER(res[2], v1[-2], f1, f2);                       \
    OPER(res[3], v1[-1], f1, f2);                       \
    res += 4

/* No setup/teardown is needed for the depth-4 kernel. */
#define UNROLL4_KERNEL4_PREPARE do {} while(0)
#define UNROLL4_KERNEL4_FIXUP   do {} while(0)
484
485
487
/* Eight steps of res[k] = OPER(v1[k], f1, f2) with prefetch; prefetch
 * count per stream scaled by EL_PER_CL(T).  v1 is bumped mid-batch, so
 * trailing elements are addressed with negative offsets.  i -= 8. */
#define UNROLL8_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    if (EL_PER_CL(T) <= 1) {                            \
        OPER(res[0], v1[0], f1, f2);                    \
        PREFETCH_R(v1 +PREF_OFFS(T),   CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1);            \
        OPER(res[1], v1[1], f1, f2);                    \
        i -= 8;                                         \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1);            \
        OPER(res[2], v1[2], f1, f2);                    \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+5, CA1);            \
        OPER(res[3], v1[3], f1, f2);                    \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1);            \
        PREFETCH_R(v1 +PREF_OFFS(T)+7, CA1);            \
        OPER(res[4], v1[4], f1, f2);                    \
        v1 += 8;                                        \
        PREFETCH_X(res+PREF_OFFS(T),   CA0);            \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0);            \
        OPER(res[5], v1[-3], f1, f2);                   \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0);            \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0);            \
        OPER(res[6], v1[-2], f1, f2);                   \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0);            \
        PREFETCH_X(res+PREF_OFFS(T)+5, CA0);            \
        OPER(res[7], v1[-1], f1, f2);                   \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0);            \
        PREFETCH_X(res+PREF_OFFS(T)+7, CA0);            \
        res += 8;                                       \
    } else if (EL_PER_CL(T) <= 2) {                     \
        OPER(res[0], v1[0], f1, f2);                    \
        PREFETCH_R(v1 +PREF_OFFS(T),   CA1);            \
        OPER(res[1], v1[1], f1, f2);                    \
        i -= 8;                                         \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1);            \
        OPER(res[2], v1[2], f1, f2);                    \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1);            \
        OPER(res[3], v1[3], f1, f2);                    \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1);            \
        OPER(res[4], v1[4], f1, f2);                    \
        v1 += 8;                                        \
        PREFETCH_X(res+PREF_OFFS(T),   CA0);            \
        OPER(res[5], v1[-3], f1, f2);                   \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0);            \
        OPER(res[6], v1[-2], f1, f2);                   \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0);            \
        OPER(res[7], v1[-1], f1, f2);                   \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0);            \
        res += 8;                                       \
    } else if (EL_PER_CL(T) <= 4) {                     \
        OPER(res[0], v1[0], f1, f2);                    \
        PREFETCH_R(v1 +PREF_OFFS(T),   CA1);            \
        OPER(res[1], v1[1], f1, f2);                    \
        OPER(res[2], v1[2], f1, f2);                    \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1);            \
        OPER(res[3], v1[3], f1, f2);                    \
        OPER(res[4], v1[4], f1, f2);                    \
        v1 += 8;                                        \
        PREFETCH_X(res+PREF_OFFS(T),   CA0);            \
        OPER(res[5], v1[-3], f1, f2);                   \
        i -= 8;                                         \
        OPER(res[6], v1[-2], f1, f2);                   \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0);            \
        OPER(res[7], v1[-1], f1, f2);                   \
        res += 8;                                       \
    } else {                                            \
        OPER(res[0], v1[0], f1, f2);                    \
        OPER(res[1], v1[1], f1, f2);                    \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1);              \
        OPER(res[2], v1[2], f1, f2);                    \
        OPER(res[3], v1[3], f1, f2);                    \
        v1 += 8;                                        \
        PREFETCH_X(res+PREF_OFFS(T), CA0);              \
        OPER(res[4], v1[-4], f1, f2);                   \
        OPER(res[5], v1[-3], f1, f2);                   \
        i -= 8;                                         \
        OPER(res[6], v1[-2], f1, f2);                   \
        OPER(res[7], v1[-1], f1, f2);                   \
        res += 8;                                       \
    }
567
568
570
/* Eight steps of res[k] = OPER(v1[k], f1, f2), no prefetch.  v1 is
 * bumped midway; trailing elements use negative offsets. */
#define UNROLL8_KERNEL4(OPER)                           \
    OPER(res[0], v1[0], f1, f2);                        \
    OPER(res[1], v1[1], f1, f2);                        \
    OPER(res[2], v1[2], f1, f2);                        \
    OPER(res[3], v1[3], f1, f2);                        \
    v1 += 8; i -= 8;                                    \
    OPER(res[4], v1[-4], f1, f2);                       \
    OPER(res[5], v1[-3], f1, f2);                       \
    OPER(res[6], v1[-2], f1, f2);                       \
    OPER(res[7], v1[-1], f1, f2);                       \
    res += 8

/* No setup/teardown is needed for the depth-8 kernel. */
#define UNROLL8_KERNEL4_PREPARE do {} while(0)
#define UNROLL8_KERNEL4_FIXUP   do {} while(0)
584
585
587
/* Warm the caches before a 2-stream kernel loop: prefetch the next
 * PREFETCH_AHEAD cache lines of v1 (read) and res (PREFETCH_X, which is
 * PREFETCH_R or PREFETCH_W depending on whether OPER reads res). */
#define PREF_AHEAD2(T,PREFETCH_X,CA0,CA1)               \
    if (PREFETCH_AHEAD >= 16) {                         \
        PREFETCH_R(v1 +EL_PER_CL(T),     CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 2,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 3,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 4,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 5,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 6,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 7,  CA1);          \
        PREFETCH_X(res+EL_PER_CL(T),     CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 2,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 3,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 4,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 5,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 6,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 7,  CA0);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 8,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)* 9,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)*10,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)*11,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)*12,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)*13,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)*14,  CA1);          \
        PREFETCH_R(v1 +EL_PER_CL(T)*15,  CA1);          \
        PREFETCH_X(res+EL_PER_CL(T)* 8,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 9,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)*10,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)*11,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)*12,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)*13,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)*14,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)*15,  CA0);          \
    } else if (PREFETCH_AHEAD >= 8) {                   \
        PREFETCH_R(v1 +EL_PER_CL(T),   CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*4, CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*5, CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*6, CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*7, CA1);            \
        PREFETCH_X(res+EL_PER_CL(T),   CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*4, CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*5, CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*6, CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*7, CA0);            \
    } else if (PREFETCH_AHEAD >= 4) {                   \
        PREFETCH_R(v1 +EL_PER_CL(T),   CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1);            \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1);            \
        PREFETCH_X(res+EL_PER_CL(T),   CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0);            \
    } else if (PREFETCH_AHEAD >= 2) {                   \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1);              \
        PREFETCH_X(res+EL_PER_CL(T), CA0);              \
    }
645
646
647
/***********************************************************
 * 1 pointer operations
 ***********************************************************/

/* One step of res[k] = OPER(f1, f2) with prefetch of the res stream. */
#define UNROLL1_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0)     \
    OPER(res[0], f1, f2);                               \
    --i;                                                \
    PREFETCH_X(res+PREF_OFFS(T), CA0);                  \
    ++res
657
659
/* One step of res[k] = OPER(f1, f2), no prefetch. */
#define UNROLL1_KERNEL3(OPER)                           \
    --i;                                                \
    OPER(res[0], f1, f2);                               \
    ++res

/* No setup/teardown is needed for the depth-1 kernel. */
#define UNROLL1_KERNEL3_PREPARE do {} while(0)
#define UNROLL1_KERNEL3_FIXUP   do {} while(0)
666
667
669
/* Two steps of res[k] = OPER(f1, f2) with prefetch of res; prefetch
 * count scaled by EL_PER_CL(T).  Decrements i by 2.  (Original ended
 * with a stray line continuation after the closing brace; removed.) */
#define UNROLL2_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0)     \
    if (EL_PER_CL(T) <= 1) {                            \
        OPER(res[0], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T),   CA0);            \
        i -= 2;                                         \
        OPER(res[1], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0);            \
        res += 2;                                       \
    } else {                                            \
        OPER(res[0], f1, f2);                           \
        i -= 2;                                         \
        OPER(res[1], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T), CA0);              \
        res += 2;                                       \
    }
684
685
687
/* Two steps of res[k] = OPER(f1, f2), no prefetch. */
#define UNROLL2_KERNEL3(OPER)                           \
    OPER(res[0], f1, f2);                               \
    i -= 2;                                             \
    OPER(res[1], f1, f2);                               \
    res += 2

/* No setup/teardown is needed for the depth-2 kernel. */
#define UNROLL2_KERNEL3_PREPARE do {} while(0)
#define UNROLL2_KERNEL3_FIXUP   do {} while(0)
695
696
698
/* Four steps of res[k] = OPER(f1, f2) with prefetch of res; prefetch
 * count scaled by EL_PER_CL(T).  Decrements i by 4. */
#define UNROLL4_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0)     \
    if (EL_PER_CL(T) <= 1) {                            \
        OPER(res[0], f1, f2);                           \
        i -= 4;                                         \
        PREFETCH_X(res+PREF_OFFS(T),   CA0);            \
        OPER(res[1], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0);            \
        OPER(res[2], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0);            \
        OPER(res[3], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0);            \
        res += 4;                                       \
    } else if (EL_PER_CL(T) <= 2) {                     \
        OPER(res[0], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T),   CA0);            \
        OPER(res[1], f1, f2);                           \
        i -= 4;                                         \
        OPER(res[2], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0);            \
        OPER(res[3], f1, f2);                           \
        res += 4;                                       \
    } else {                                            \
        OPER(res[0], f1, f2);                           \
        i -= 4;                                         \
        OPER(res[1], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T), CA0);              \
        OPER(res[2], f1, f2);                           \
        OPER(res[3], f1, f2);                           \
        res += 4;                                       \
    }
728
730
/* Four steps of res[k] = OPER(f1, f2), no prefetch. */
#define UNROLL4_KERNEL3(OPER)                           \
    OPER(res[0], f1, f2);                               \
    OPER(res[1], f1, f2);                               \
    i -= 4;                                             \
    OPER(res[2], f1, f2);                               \
    OPER(res[3], f1, f2);                               \
    res += 4

/* No setup/teardown is needed for the depth-4 kernel. */
#define UNROLL4_KERNEL3_PREPARE do {} while(0)
#define UNROLL4_KERNEL3_FIXUP   do {} while(0)
740
741
743
/* Eight steps of res[k] = OPER(f1, f2) with prefetch of res; prefetch
 * count scaled by EL_PER_CL(T).  Decrements i by 8. */
#define UNROLL8_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0)     \
    if (EL_PER_CL(T) <= 1) {                            \
        OPER(res[0], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T),   CA0);            \
        OPER(res[1], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0);            \
        OPER(res[2], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0);            \
        OPER(res[3], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0);            \
        OPER(res[4], f1, f2);                           \
        i -= 8;                                         \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0);            \
        OPER(res[5], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+5, CA0);            \
        OPER(res[6], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0);            \
        OPER(res[7], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+7, CA0);            \
        res += 8;                                       \
    } else if (EL_PER_CL(T) <= 2) {                     \
        OPER(res[0], f1, f2);                           \
        OPER(res[1], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T),   CA0);            \
        OPER(res[2], f1, f2);                           \
        OPER(res[3], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0);            \
        OPER(res[4], f1, f2);                           \
        i -= 8;                                         \
        OPER(res[5], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0);            \
        OPER(res[6], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0);            \
        OPER(res[7], f1, f2);                           \
        res += 8;                                       \
    } else if (EL_PER_CL(T) <= 4) {                     \
        OPER(res[0], f1, f2);                           \
        OPER(res[1], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T),   CA0);            \
        OPER(res[2], f1, f2);                           \
        OPER(res[3], f1, f2);                           \
        i -= 8;                                         \
        OPER(res[4], f1, f2);                           \
        OPER(res[5], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0);            \
        OPER(res[6], f1, f2);                           \
        OPER(res[7], f1, f2);                           \
        res += 8;                                       \
    } else {                                            \
        OPER(res[0], f1, f2);                           \
        OPER(res[1], f1, f2);                           \
        OPER(res[2], f1, f2);                           \
        PREFETCH_X(res+PREF_OFFS(T), CA0);              \
        OPER(res[3], f1, f2);                           \
        OPER(res[4], f1, f2);                           \
        OPER(res[5], f1, f2);                           \
        i -= 8;                                         \
        OPER(res[6], f1, f2);                           \
        OPER(res[7], f1, f2);                           \
        res += 8;                                       \
    }
804
805
807
/* Eight steps of res[k] = OPER(f1, f2), no prefetch. */
#define UNROLL8_KERNEL3(OPER)                           \
    OPER(res[0], f1, f2);                               \
    OPER(res[1], f1, f2);                               \
    OPER(res[2], f1, f2);                               \
    OPER(res[3], f1, f2);                               \
    i -= 8;                                             \
    OPER(res[4], f1, f2);                               \
    OPER(res[5], f1, f2);                               \
    OPER(res[6], f1, f2);                               \
    OPER(res[7], f1, f2);                               \
    res += 8

/* No setup/teardown is needed for the depth-8 kernel. */
#define UNROLL8_KERNEL3_PREPARE do {} while(0)
#define UNROLL8_KERNEL3_FIXUP   do {} while(0)
821
822
824
/* Warm the cache before a 1-stream kernel loop: prefetch the next
 * PREFETCH_AHEAD cache lines of res via PREFETCH_X. */
#define PREF_AHEAD1(T,PREFETCH_X,CA0)                   \
    if (PREFETCH_AHEAD >= 16) {                         \
        PREFETCH_X(res+EL_PER_CL(T),     CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 2,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 3,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 4,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 5,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 6,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 7,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 8,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)* 9,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)*10,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)*11,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)*12,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)*13,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)*14,  CA0);          \
        PREFETCH_X(res+EL_PER_CL(T)*15,  CA0);          \
    } else if (PREFETCH_AHEAD >= 8) {                   \
        PREFETCH_X(res+EL_PER_CL(T),   CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*4, CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*5, CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*6, CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*7, CA0);            \
    } else if (PREFETCH_AHEAD >= 4) {                   \
        PREFETCH_X(res+EL_PER_CL(T),   CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0);            \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0);            \
    } else if (PREFETCH_AHEAD >= 2) {                   \
        PREFETCH_X(res+EL_PER_CL(T), CA0);              \
    }
856
857
858
859
// Select default kernels: alias the generic UNR_* names to the
// UNROLL<depth>_* kernels matching the configured UNROLL_DEPTH.
#if UNROLL_DEPTH == 1

# define UNR_PREF_KERNEL5  UNROLL1_PREF_KERNEL5
# define UNR_KERNEL5       UNROLL1_KERNEL5
# define UNR_KERNEL5_PREP  UNROLL1_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX   UNROLL1_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4  UNROLL1_PREF_KERNEL4
# define UNR_KERNEL4       UNROLL1_KERNEL4
# define UNR_KERNEL4_PREP  UNROLL1_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX   UNROLL1_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3  UNROLL1_PREF_KERNEL3
# define UNR_KERNEL3       UNROLL1_KERNEL3
# define UNR_KERNEL3_PREP  UNROLL1_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX   UNROLL1_KERNEL3_FIXUP

#elif UNROLL_DEPTH == 2

# define UNR_PREF_KERNEL5  UNROLL2_PREF_KERNEL5
# define UNR_KERNEL5       UNROLL2_KERNEL5
# define UNR_KERNEL5_PREP  UNROLL2_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX   UNROLL2_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4  UNROLL2_PREF_KERNEL4
# define UNR_KERNEL4       UNROLL2_KERNEL4
# define UNR_KERNEL4_PREP  UNROLL2_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX   UNROLL2_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3  UNROLL2_PREF_KERNEL3
# define UNR_KERNEL3       UNROLL2_KERNEL3
# define UNR_KERNEL3_PREP  UNROLL2_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX   UNROLL2_KERNEL3_FIXUP

#elif UNROLL_DEPTH == 4

# define UNR_PREF_KERNEL5  UNROLL4_PREF_KERNEL5
# define UNR_KERNEL5       UNROLL4_KERNEL5
# define UNR_KERNEL5_PREP  UNROLL4_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX   UNROLL4_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4  UNROLL4_PREF_KERNEL4
# define UNR_KERNEL4       UNROLL4_KERNEL4
# define UNR_KERNEL4_PREP  UNROLL4_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX   UNROLL4_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3  UNROLL4_PREF_KERNEL3
# define UNR_KERNEL3       UNROLL4_KERNEL3
# define UNR_KERNEL3_PREP  UNROLL4_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX   UNROLL4_KERNEL3_FIXUP

#elif UNROLL_DEPTH == 8

# define UNR_PREF_KERNEL5  UNROLL8_PREF_KERNEL5
# define UNR_KERNEL5       UNROLL8_KERNEL5
# define UNR_KERNEL5_PREP  UNROLL8_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX   UNROLL8_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4  UNROLL8_PREF_KERNEL4
# define UNR_KERNEL4       UNROLL8_KERNEL4
# define UNR_KERNEL4_PREP  UNROLL8_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX   UNROLL8_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3  UNROLL8_PREF_KERNEL3
# define UNR_KERNEL3       UNROLL8_KERNEL3
# define UNR_KERNEL3_PREP  UNROLL8_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX   UNROLL8_KERNEL3_FIXUP

#else

# error "UNROLL_DEPTH may only be 1, 2, 4, 8"

#endif /* UNROLL_DEPTH */
933
952
/****************************************************************
 * Macros with fragments for the implementation
 ****************************************************************/

#ifdef USE_PREFETCH

/* Prefetching main loop for 3-pointer kernels: runs the unrolled,
 * prefetching kernel while at least UNROLL_DEPTH+PREF_OFFS(T) elements
 * remain (so prefetches never run past the end of the arrays). */
# define VKERN_TEMPL_3V_PREF(OP3,T)                                     \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) {                       \
        PREFETCH_W (res, 3);                                            \
        PREF_AHEAD3(T,3,MAX(1,CACHE_LOC_READ),MAX(1,CACHE_LOC_READ));   \
        UNR_KERNEL5_PREP;                                               \
        do {                                                            \
            UNR_PREF_KERNEL5(OP3,T,CACHE_LOC_WRITE,CACHE_LOC_READ,CACHE_LOC_READ); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T));                       \
        UNR_KERNEL5_FIX;                                                \
    }

/* Prefetching main loop for 2-pointer kernels.  PREFETCH_X/CW select
 * read- or write-prefetch and cache locality for the res stream. */
# define VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_X,CW)                       \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) {                       \
        PREFETCH_X (res, 3);                                            \
        PREF_AHEAD2(T,PREFETCH_X,CW,MAX(1,CACHE_LOC_READ));             \
        UNR_KERNEL4_PREP;                                               \
        do {                                                            \
            UNR_PREF_KERNEL4(OP2,T,PREFETCH_X,CW,CACHE_LOC_READ);       \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T));                       \
        UNR_KERNEL4_FIX;                                                \
    }

/* Prefetching main loop for 1-pointer kernels. */
# define VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_X,CW)                       \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) {                       \
        PREFETCH_X (res, 3);                                            \
        PREF_AHEAD1(T,PREFETCH_X,CW);                                   \
        UNR_KERNEL3_PREP;                                               \
        do {                                                            \
            UNR_PREF_KERNEL3(OP1,T,PREFETCH_X,CW);                      \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T));                       \
        UNR_KERNEL3_FIX;                                                \
    }

#else
/* Without USE_PREFETCH the prefetch loops collapse to no-ops. */
# define VKERN_TEMPL_3V_PREF(OP,T)              do {} while (0)
# define VKERN_TEMPL_2V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
# define VKERN_TEMPL_1V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
#endif /* USE_PREFETCH */
995
996
997
/****************************************************************
 * Templates for routines
 ****************************************************************/

/* Generate FNAME(sz, res, v1, v2): elementwise res[k] = OP3(v1[k], v2[k],
 * f1, f2) over sz elements — prefetching unrolled loop, then plain
 * unrolled loop, then scalar remainder.  NOTE(review): OP3 is invoked
 * with f1/f2 although this variant takes no factor parameters; the OP3
 * macros used here presumably ignore them — confirm at call sites. */
#define VKERN_TEMPL_3V(FNAME,OP3)                                       \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, T* RESTRICT const, const T* RESTRICT const, const T* RESTRICT const);) \
template <typename T>                                                   \
VEC_INLINE void FNAME (const unsigned long sz,                          \
                       T* RESTRICT const _res,                          \
                       const T* RESTRICT const _v1,                     \
                       const T* RESTRICT const _v2)                     \
{                                                                       \
    PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3);                             \
    REGISTER const T *v1 = _v1, *v2 = _v2;                              \
    REGISTER T *res = _res;                                             \
    REGISTER long i = sz;                                               \
    VKERN_TEMPL_3V_PREF(OP3,T);                                         \
                                                                        \
    if (LIKELY(i >= UNROLL_DEPTH)) {                                    \
        UNR_KERNEL5_PREP;                                               \
        do {                                                            \
            UNR_KERNEL5(OP3);                                           \
        } while (i >= UNROLL_DEPTH);                                    \
        UNR_KERNEL5_FIX;                                                \
    }                                                                   \
                                                                        \
    for (; i; --i) {                                                    \
        OP3(*res, *v1, *v2, f1, f2);                                    \
        ++v1; ++v2; ++res;                                              \
    }                                                                   \
}
1041
1043
/* Routine template, 3 pointers + 1 scalar: applies OP3 elementwise over   */
/* res/v1/v2 with loop-constant f2, in the usual three stages              */
/* (prefetched unrolled, plain unrolled, scalar remainder).                */
/* FIX: the declaration of v1/v2 used '.' instead of ',' between the two   */
/* declarators ("_v1. *v2"), which is a syntax error on expansion.         */
/* NOTE(review): OP3 is also invoked with f1, which this template does     */
/* not declare -- presumably ignored by the OP3 variants used here.        */
#define VKERN_TEMPL_3V_C(FNAME,OP3) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
  (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
   const T* RESTRICT const, LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       const T* RESTRICT const _v2, \
                       LCTYPE(T) f2) \
{ \
  PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
  REGISTER const T *v1 = _v1, *v2 = _v2; \
  REGISTER T *res = _res; \
  REGISTER long i = sz; \
  VKERN_TEMPL_3V_PREF(OP3,T); \
  \
  if (LIKELY(i >= UNROLL_DEPTH)) { \
    UNR_KERNEL5_PREP; \
    do { \
      UNR_KERNEL5(OP3); \
    } while (i >= UNROLL_DEPTH); \
    UNR_KERNEL5_FIX; \
  } \
  \
  for (; i; --i) { \
    OP3(*res, *v1, *v2, f1, f2); \
    ++v1; ++v2; ++res; \
  } \
}
1073
1075
/* Routine template, 3 pointers + 2 scalars: applies OP3 elementwise       */
/* over res/v1/v2 with loop-constants f1 and f2, in the usual three        */
/* stages (prefetched unrolled, plain unrolled, scalar remainder).         */
#define VKERN_TEMPL_3V_CC(FNAME,OP3) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
  (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
   const T* RESTRICT const, LCTYPED(T), LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       const T* RESTRICT const _v2, \
                       LCTYPE(T) f1, \
                       LCTYPE(T) f2) \
{ \
  PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
  REGISTER long i = sz; \
  REGISTER const T *v1 = _v1, *v2 = _v2; \
  REGISTER T *res = _res; \
  VKERN_TEMPL_3V_PREF(OP3,T); \
  \
  if (LIKELY(i >= UNROLL_DEPTH)) { \
    UNR_KERNEL5_PREP; \
    do { \
      UNR_KERNEL5(OP3); \
    } while (i >= UNROLL_DEPTH); \
    UNR_KERNEL5_FIX; \
  } \
  \
  for (; i; --i) { \
    OP3(*res, *v1, *v2, f1, f2); \
    ++v1; ++v2; ++res; \
  } \
}
1106
1108
/* Routine template, 2 pointers: applies OP2 elementwise from v1 into      */
/* res (sz elements) in three stages (prefetched unrolled, plain           */
/* unrolled, scalar remainder).                                            */
/* FIX: the prefetch driver now receives PREFETCH_W for the res stream,    */
/* matching VKERN_TEMPL_2V_C/_CC -- res is the write destination here,     */
/* while the read-prefetch variant is only used by the const-res           */
/* reduction template VKERN_TEMPL_2V_T.  (Prefetches are hints, so this    */
/* cannot change results.)                                                 */
/* NOTE(review): OP2 is invoked with f1/f2 which this template never       */
/* declares -- presumably ignored by the OP2 variants used here.           */
#define VKERN_TEMPL_2V(FNAME,OP2) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
  (const unsigned long, T* RESTRICT const, const T* RESTRICT const);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1) \
{ \
  PREFETCH_R(_v1, 3); \
  REGISTER const T *v1 = _v1; \
  REGISTER T* res = _res; \
  REGISTER long i = sz; \
  VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
  \
  if (LIKELY(i >= UNROLL_DEPTH)) { \
    UNR_KERNEL4_PREP; \
    do { \
      UNR_KERNEL4(OP2); \
    } while (i >= UNROLL_DEPTH); \
    UNR_KERNEL4_FIX; \
  } \
  \
  for (; i; --i) { \
    OP2(*res, *v1, f1, f2); \
    ++v1; ++res; \
  } \
}
1135
1137
/* Routine template, 2 pointers + 1 scalar: applies OP2 elementwise from   */
/* v1 into res with loop-constant f2, in the usual three stages.  res is   */
/* written, hence the PREFETCH_W / CACHE_LOC_WRITE hints for its stream.   */
#define VKERN_TEMPL_2V_C(FNAME,OP2) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
  (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
   LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       LCTYPE(T) f2) \
{ \
  PREFETCH_R(_v1, 3); \
  REGISTER const T *v1 = _v1; \
  REGISTER T* res = _res; \
  REGISTER long i = sz; \
  VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
  \
  if (LIKELY(i >= UNROLL_DEPTH)) { \
    UNR_KERNEL4_PREP; \
    do { \
      UNR_KERNEL4(OP2); \
    } while (i >= UNROLL_DEPTH); \
    UNR_KERNEL4_FIX; \
  } \
  \
  for (; i; --i) { \
    OP2(*res, *v1, f1, f2); \
    ++v1; ++res; \
  } \
}
1166
1168
/* Routine template, 2 pointers + 2 scalars: applies OP2 elementwise from  */
/* v1 into res with loop-constants f1 and f2, in the usual three stages.   */
#define VKERN_TEMPL_2V_CC(FNAME,OP2) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
  (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
   LCTYPED(T), LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       LCTYPE(T) f1, \
                       LCTYPE(T) f2) \
{ \
  PREFETCH_R(_v1, 3); \
  REGISTER const T *v1 = _v1; \
  REGISTER T* res = _res; \
  REGISTER long i = sz; \
  VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
  \
  if (LIKELY(i >= UNROLL_DEPTH)) { \
    UNR_KERNEL4_PREP; \
    do { \
      UNR_KERNEL4(OP2); \
    } while (i >= UNROLL_DEPTH); \
    UNR_KERNEL4_FIX; \
  } \
  \
  for (; i; --i) { \
    OP2(*res, *v1, f1, f2); \
    ++v1; ++res; \
  } \
}
1198
1200
/* Reduction template, 2 const pointers -> scalar: folds OP2 over          */
/* res/v1 into the accumulator pair (f2, f1) of explicit TYPE, then        */
/* stores f2 - f1 back through _f2.  Both vectors are read-only, hence     */
/* the PREFETCH_R / CACHE_LOC_READ hints.                                  */
/* NOTE(review): the _fin label has no goto in this macro -- presumably    */
/* some OP2 variants jump to it for early termination; confirm against    */
/* the operator macro definitions.                                         */
#define VKERN_TEMPL_2V_T(FNAME,OP2,TYPE) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
  (const unsigned long, const T* RESTRICT const, \
   const T* RESTRICT const, TYPE&);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       const T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       TYPE &_f2) \
{ \
  PREFETCH_R(_v1, 3); \
  /* f1 is a secondary accumulator (e.g. for sign-split sums); it is */ \
  /* initialised to zero and subtracted in the fixup below.          */ \
  REGISTER TYPE f2(_f2), f1(0.0); \
  REGISTER const T *v1 = _v1; \
  REGISTER const T* res = _res; \
  REGISTER long i = sz; \
  VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_READ); \
  \
  if (LIKELY(i >= UNROLL_DEPTH)) { \
    UNR_KERNEL4_PREP; \
    do { \
      UNR_KERNEL4(OP2); \
    } while (i >= UNROLL_DEPTH); \
    UNR_KERNEL4_FIX; \
  } \
  \
  for (; i; --i) { \
    OP2(*res, *v1, f1, f2); \
    ++v1; ++res; \
  } \
_fin: \
  _f2 = f2 - f1; \
}
1233
1235
/* Routine template, 1 pointer: applies OP1 to each of the sz elements     */
/* of res in place, in the usual three stages (prefetched unrolled,        */
/* plain unrolled, scalar remainder).                                      */
/* NOTE(review): OP1 is invoked with f1/f2 which this template never       */
/* declares -- presumably ignored by the OP1 variants used here.           */
#define VKERN_TEMPL_1V(FNAME,OP1) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
  (const unsigned long, T* RESTRICT const);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res) \
{ \
  REGISTER long i = sz; \
  REGISTER T* res = _res; \
  VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
  \
  if (LIKELY(i >= UNROLL_DEPTH)) { \
    UNR_KERNEL3_PREP; \
    do { \
      UNR_KERNEL3(OP1); \
    } while (i >= UNROLL_DEPTH); \
    UNR_KERNEL3_FIX; \
  } \
  \
  for (; i; --i) { \
    OP1(*res, f1, f2); \
    ++res; \
  } \
}
1259
1261
/* Routine template, 1 pointer + 1 scalar: applies OP1 with loop-constant  */
/* f2 to each of the sz elements of res in place.                          */
#define VKERN_TEMPL_1V_C(FNAME,OP1) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
  (const unsigned long, T* RESTRICT const, LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       LCTYPE(T) f2) \
{ \
  REGISTER long i = sz; \
  REGISTER T* res = _res; \
  VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
  \
  if (LIKELY(i >= UNROLL_DEPTH)) { \
    UNR_KERNEL3_PREP; \
    do { \
      UNR_KERNEL3(OP1); \
    } while (i >= UNROLL_DEPTH); \
    UNR_KERNEL3_FIX; \
  } \
  \
  for (; i; --i) { \
    OP1(*res, f1, f2); \
    ++res; \
  } \
}
1286
1288
/* Routine template, 1 pointer + 2 scalars: applies OP1 with               */
/* loop-constants f1 and f2 to each of the sz elements of res in place.    */
#define VKERN_TEMPL_1V_CC(FNAME,OP1) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
  (const unsigned long, T* RESTRICT const, LCTYPED(T), \
   LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       LCTYPE(T) f1, \
                       LCTYPE(T) f2) \
{ \
  REGISTER long i = sz; \
  REGISTER T* res = _res; \
  VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
  \
  if (LIKELY(i >= UNROLL_DEPTH)) { \
    UNR_KERNEL3_PREP; \
    do { \
      UNR_KERNEL3(OP1); \
    } while (i >= UNROLL_DEPTH); \
    UNR_KERNEL3_FIX; \
  } \
  \
  for (; i; --i) { \
    OP1(*res, f1, f2); \
    ++res; \
  } \
}
1315
1317
/* Reduction template, 1 const pointer -> scalar: folds OP1 over the sz    */
/* elements of res into the accumulator pair (f2, f1) of explicit TYPE,    */
/* then stores f2 - f1 back through _f2.  res is read-only, hence the      */
/* PREFETCH_R / CACHE_LOC_READ hints.                                      */
#define VKERN_TEMPL_1V_T(FNAME,OP1,TYPE) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
  (const unsigned long, const T* const, TYPE&);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       const T* const _res, \
                       TYPE &_f2) \
{ \
  /* f1 is a secondary accumulator (e.g. for sign-split sums); it is */ \
  /* initialised to zero and subtracted in the fixup below.          */ \
  REGISTER TYPE f2(_f2), f1(0.0); \
  REGISTER const T* res = _res; \
  REGISTER long i = sz; \
  VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); \
  \
  if (LIKELY(i >= UNROLL_DEPTH)) { \
    UNR_KERNEL3_PREP; \
    do { \
      UNR_KERNEL3(OP1); \
    } while (i >= UNROLL_DEPTH); \
    UNR_KERNEL3_FIX; \
  } \
  \
  for (; i; --i) { \
    OP1(*res, f1, f2); \
    ++res; \
  } \
  _f2 = f2 - f1; \
}
1345
1347
/* Reduction template, 1 const pointer -> scalar, accumulating in          */
/* LONG_DOUBLE for extra precision; result is converted back on the        */
/* final store through _f2.                                                */
/* FIX: declare the secondary accumulator f1 (as VKERN_TEMPL_1V_T does).   */
/* OP1 and the unrolled kernels are invoked with f1, which this macro      */
/* never declared, so any instantiation would fail to compile.  The        */
/* fixup now subtracts f1, matching VKERN_TEMPL_1V_T; this is a no-op      */
/* for operators that leave f1 at its initial 0.0.                         */
#define VKERN_TEMPL_1V_T_LD(FNAME,OP1,TYPE) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
  (const unsigned long, const T* const, TYPE&);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       const T* const _res, \
                       TYPE &_f2) \
{ \
  REGISTER LONG_DOUBLE f2(_f2), f1(0.0); \
  REGISTER const T* res = _res; \
  REGISTER long i = sz; \
  VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); \
  \
  if (LIKELY(i >= UNROLL_DEPTH)) { \
    UNR_KERNEL3_PREP; \
    do { \
      UNR_KERNEL3(OP1); \
    } while (i >= UNROLL_DEPTH); \
    UNR_KERNEL3_FIX; \
  } \
  \
  for (; i; --i) { \
    OP1(*res, f1, f2); \
    ++res; \
  } \
  _f2 = f2 - f1; \
}
1375
1376
#endif
/* TBCI_UNROLL_PREFETCH_DEF2_H */
Generated by
1.8.5