TBCI Numerical high perf. C++ Library
2.8.0
unroll_prefetch_def.h
#ifndef TBCI_UNROLL_PREFETCH_DEF_H
#define TBCI_UNROLL_PREFETCH_DEF_H

//#include "tbci/basics.h"

#define LCTYPE(T) REGISTER typename tbci_traits<T>::loop_const_refval_type
#define LCTYPED(T) REGISTER tbci_traits<T>::loop_const_refval_type
#ifndef UNROLL_DEPTH
# define UNROLL_DEPTH 4
#endif
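
/*
 * Note (illustrative, not part of the original header): the UNROLLn_*KERNEL5
 * macros below expand an operation macro OPER with the fixed argument shape
 * OPER(dst, a, b, f1, f2); the KERNEL4 and KERNEL3 families use
 * OPER(dst, a, f1, f2) and OPER(dst, f1, f2) respectively.  A hypothetical
 * element-wise operation plugged into the 3-pointer kernels could look like
 * this sketch (the name is an assumption, not TBCI API):
 *
 *   // d = f1*a + f2*b, evaluated once per unrolled element
 *   #define EXAMPLE_AXPBY(d, a, b, f1, f2)  (d) = (f1)*(a) + (f2)*(b)
 *
 * UNROLL_DEPTH (1, 2, 4 or 8) selects which of the unrolled variants is bound
 * to the UNR_* aliases further down in this file.
 */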

/***********************************************************
 * 3 pointer operations
 ***********************************************************/

#define UNROLL1_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    --i; \
    PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
    ++v1; \
    PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
    ++v2; \
    PREFETCH_W(res+PREF_OFFS(T), CA0); \
    ++res

#define UNROLL1_KERNEL5(OPER) \
    --i; \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    ++v1; ++v2; ++res

#define UNROLL1_KERNEL5_PREPARE do {} while(0)
#define UNROLL1_KERNEL5_FIXUP do {} while(0)

#define UNROLL2_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        i -= 2; \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        v1 += 2; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
        OPER(res[1], v1[-1], v2[1], f1, f2); \
        v2 += 2; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
        res += 2; \
    } else { \
        i -= 2; \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        v1 += 2; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[1], v1[-1], v2[1], f1, f2); \
        v2 += 2; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        res += 2; \
    } \

#define UNROLL2_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    v1 += 2; i -= 2; \
    OPER(res[1], v1[-1], v2[1], f1, f2); \
    v2 += 2; res += 2

#define UNROLL2_KERNEL5_PREPARE do {} while(0)
#define UNROLL2_KERNEL5_FIXUP do {} while(0)

#define UNROLL4_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        v1 += 4; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+3, CA2); \
        OPER(res[2], v1[-2], v2[2], f1, f2); \
        v2 += 4; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+3, CA0); \
        OPER(res[3], v1[-1], v2[-1], f1, f2); \
        res += 4; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        v1 += 4; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        OPER(res[2], v1[-2], v2[2], f1, f2); \
        v2 += 4; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], v1[-1], v2[-1], f1, f2); \
        res += 4; \
    } else { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        v1 += 4; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[2], v1[-2], v2[2], f1, f2); \
        v2 += 4; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        OPER(res[3], v1[-1], v2[-1], f1, f2); \
        res += 4; \
    }

#define UNROLL4_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    i -= 4; \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    v1 += 4; \
    OPER(res[2], v1[-2], v2[2], f1, f2); \
    v2 += 4; \
    OPER(res[3], v1[-1], v2[-1], f1, f2); \
    res += 4

#define UNROLL4_KERNEL5_PREPARE do {} while(0)
#define UNROLL4_KERNEL5_FIXUP do {} while(0)

#define UNROLL8_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        i -= 8; \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+5, CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+7, CA1); \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 8; \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+3, CA2); \
        OPER(res[4], v1[-4], v2[4], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+5, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+6, CA2); \
        OPER(res[5], v1[-3], v2[5], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+7, CA2); \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[6], v1[-2], v2[6], f1, f2); \
        v2 += 8; \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+3, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[7], v1[-1], v2[-1], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T)+5, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+6, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+7, CA0); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        i -= 8; \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 8; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        OPER(res[4], v1[-4], v2[4], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+6, CA2); \
        OPER(res[5], v1[-3], v2[5], f1, f2); \
        v2 += 8; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[6], v1[-2], v2[-2], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+6, CA0); \
        OPER(res[7], v1[-1], v2[-1], f1, f2); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        i -= 8; \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 8; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[4], v1[-4], v2[4], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
        OPER(res[5], v1[-3], v2[5], f1, f2); \
        v2 += 8; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        OPER(res[6], v1[-2], v2[-2], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[7], v1[-1], v2[-1], f1, f2); \
        res += 8; \
    } else { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        i -= 8; \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 8; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[4], v1[-4], v2[4], f1, f2); \
        OPER(res[5], v1[-3], v2[5], f1, f2); \
        v2 += 8; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        OPER(res[6], v1[-2], v2[-2], f1, f2); \
        OPER(res[7], v1[-1], v2[-1], f1, f2); \
        res += 8; \
    }

#define UNROLL8_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    i -= 8; \
    OPER(res[2], v1[2], v2[2], f1, f2); \
    OPER(res[3], v1[3], v2[3], f1, f2); \
    v1 += 8; \
    OPER(res[4], v1[-4], v2[4], f1, f2); \
    OPER(res[5], v1[-3], v2[5], f1, f2); \
    v2 += 8; \
    OPER(res[6], v1[-2], v2[-2], f1, f2); \
    OPER(res[7], v1[-1], v2[-1], f1, f2); \
    res += 8

#define UNROLL8_KERNEL5_PREPARE do {} while(0)
#define UNROLL8_KERNEL5_FIXUP do {} while(0)

#define PREF_AHEAD3(T,CA0,CA1,CA2) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 7, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 2, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 3, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 4, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 5, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 6, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 7, CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 2, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 3, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 4, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 5, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 6, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 7, CA0); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 8, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 9, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*10, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*11, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*12, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*13, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*14, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*15, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 8, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 9, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*10, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*11, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*12, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*13, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*14, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*15, CA2); \
        PREFETCH_W(res+EL_PER_CL(T)* 8, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 9, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*10, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*11, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*12, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*13, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*14, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*7, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*2, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*3, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*4, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*5, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*6, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*7, CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*3, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*4, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*5, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*6, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*2, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*3, CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
    }
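
/*
 * Illustrative sketch (not from this header): the kernels above only assume
 * that PREFETCH_R(p, loc) / PREFETCH_W(p, loc) expand to a data-prefetch hint
 * for address p with cache-locality hint loc, that EL_PER_CL(T) is the number
 * of elements of type T per cache line, that PREF_OFFS(T) is the element
 * offset prefetched ahead of the current position, and that PREFETCH_AHEAD
 * counts the cache lines fetched before the main loop starts.  On a
 * GCC-compatible compiler such macros could plausibly be written along these
 * lines (hypothetical fallback definitions, not the TBCI ones):
 */
#if 0
#define EXAMPLE_CACHE_LINE_SIZE 64
#define EXAMPLE_EL_PER_CL(T)    (EXAMPLE_CACHE_LINE_SIZE / sizeof(T))
#define EXAMPLE_PREFETCH_R(p,l) __builtin_prefetch((p), 0, (l))  /* read hint  */
#define EXAMPLE_PREFETCH_W(p,l) __builtin_prefetch((p), 1, (l))  /* write hint */
#endif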

/***********************************************************
 * 2 pointer operations
 ***********************************************************/

#define UNROLL1_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,RI) \
    OPER(res[0], v1[0], f1, f2); \
    --i; \
    PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
    ++v1; \
    PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
    res+=RI

#define UNROLL1_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    UNROLL1_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)

#define UNROLL1_KERNEL4_STRIDE(OPER,RI) \
    --i; \
    OPER(res[0], v1[0], f1, f2); \
    ++v1; res+=RI

#define UNROLL1_KERNEL4(OPER) \
    UNROLL1_KERNEL4_STRIDE(OPER,1)

#define UNROLL1_KERNEL4_PREPARE do {} while(0)
#define UNROLL1_KERNEL4_FIXUP do {} while(0)

#define UNROLL2_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,RI) \
    if (EL_PER_CL(T) <= 1) { \
        i -= 2; \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        OPER(res[RI],v1[1], f1, f2); \
        v1 += 2; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+1), CA0); \
        res += 2*RI; \
    } else { \
        i -= 2; \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[RI],v1[1], f1, f2); \
        v1 += 2; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        res += 2*RI; \
    } \

#define UNROLL2_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    UNROLL2_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)

#define UNROLL2_KERNEL4_STRIDE(OPER,RI) \
    OPER(res[0], v1[0], f1, f2); \
    v1 += 2; i -= 2; \
    OPER(res[RI],v1[-1],f1, f2); \
    res += 2*RI

#define UNROLL2_KERNEL4(OPER) \
    UNROLL2_KERNEL4_STRIDE(OPER,1) \

#define UNROLL2_KERNEL4_PREPARE do {} while(0)
#define UNROLL2_KERNEL4_FIXUP do {} while(0)

#define UNROLL4_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,RI) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        OPER(res[RI],v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        OPER(res[2*RI], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+1), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+2), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+3), CA0); \
        OPER(res[3*RI], v1[-1], f1, f2); \
        res += 4*RI; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[RI],v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+2), CA0); \
        OPER(res[3*RI], v1[-1], f1, f2); \
        res += 4*RI; \
    } else { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        OPER(res[RI],v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        OPER(res[3*RI], v1[-1], f1, f2); \
        res += 4*RI; \
    }

#define UNROLL4_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    UNROLL4_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)

#define UNROLL4_KERNEL4_STRIDE(OPER,RI) \
    OPER(res[0], v1[0], f1, f2); \
    OPER(res[RI],v1[1], f1, f2); \
    v1 += 4; i -= 4; \
    OPER(res[2*RI], v1[-2], f1, f2); \
    OPER(res[3*RI], v1[-1], f1, f2); \
    res += 4*RI

#define UNROLL4_KERNEL4(OPER) \
    UNROLL4_KERNEL4_STRIDE(OPER,1)

#define UNROLL4_KERNEL4_PREPARE do {} while(0)
#define UNROLL4_KERNEL4_FIXUP do {} while(0)

#define UNROLL8_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,RI) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        OPER(res[RI],v1[1], f1, f2); \
        i -= 8; \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+5, CA1); \
        OPER(res[3*RI], v1[3], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+7, CA1); \
        OPER(res[4*RI], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+1), CA0); \
        OPER(res[5*RI], v1[-3], f1, f2); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+2), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+3), CA0); \
        OPER(res[6*RI], v1[-2], f1, f2); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+4), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+5), CA0); \
        OPER(res[7*RI], v1[-1], f1, f2); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+6), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+7), CA0); \
        res += 8*RI; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[RI],v1[1], f1, f2); \
        i -= 8; \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[3*RI], v1[3], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        OPER(res[4*RI], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        OPER(res[5*RI], v1[-3], f1, f2); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+2), CA0); \
        OPER(res[6*RI], v1[-2], f1, f2); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+4), CA0); \
        OPER(res[7*RI], v1[-1], f1, f2); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+6), CA0); \
        res += 8*RI; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[RI],v1[1], f1, f2); \
        OPER(res[2*RI], v1[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[3*RI], v1[3], f1, f2); \
        OPER(res[4*RI], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        OPER(res[5*RI], v1[-3], f1, f2); \
        i -= 8; \
        OPER(res[6*RI], v1[-2], f1, f2); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+4), CA0); \
        OPER(res[7*RI], v1[-1], f1, f2); \
        res += 8*RI; \
    } else { \
        OPER(res[0], v1[0], f1, f2); \
        OPER(res[RI],v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        OPER(res[3*RI], v1[3], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        OPER(res[4*RI], v1[-4], f1, f2); \
        OPER(res[5*RI], v1[-3], f1, f2); \
        i -= 8; \
        OPER(res[6*RI], v1[-2], f1, f2); \
        OPER(res[7*RI], v1[-1], f1, f2); \
        res += 8*RI; \
    }

#define UNROLL8_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    UNROLL8_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)

#define UNROLL8_KERNEL4_STRIDE(OPER,RI) \
    OPER(res[0], v1[0], f1, f2); \
    OPER(res[RI],v1[1], f1, f2); \
    OPER(res[2*RI], v1[2], f1, f2); \
    OPER(res[3*RI], v1[3], f1, f2); \
    v1 += 8; i -= 8; \
    OPER(res[4*RI], v1[-4], f1, f2); \
    OPER(res[5*RI], v1[-3], f1, f2); \
    OPER(res[6*RI], v1[-2], f1, f2); \
    OPER(res[7*RI], v1[-1], f1, f2); \
    res += 8*RI

#define UNROLL8_KERNEL4(OPER) \
    UNROLL8_KERNEL4_STRIDE(OPER,1)

#define UNROLL8_KERNEL4_PREPARE do {} while(0)
#define UNROLL8_KERNEL4_FIXUP do {} while(0)

#define PREF_AHEAD2_STRIDE(T,PREFETCH_X,CA0,CA1,RI) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 7, CA1); \
        PREFETCH_X(res+RI*EL_PER_CL(T), CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 2, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 3, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 4, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 5, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 6, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 7, CA0); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 8, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 9, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*10, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*11, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*12, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*13, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*14, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*15, CA1); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 8, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 9, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*10, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*11, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*12, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*13, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*14, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*7, CA1); \
        PREFETCH_X(res+RI*EL_PER_CL(T), CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*3, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*4, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*5, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*6, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_X(res+RI*EL_PER_CL(T), CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_X(res+RI*EL_PER_CL(T), CA0); \
    }

#define PREF_AHEAD2(T,PREFETCH_X,CA0,CA1) \
    PREF_AHEAD2_STRIDE(T,PREFETCH_X,CA0,CA1,1)

/***********************************************************
 * 1 pointer operations
 ***********************************************************/

#define UNROLL1_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    OPER(res[0], f1, f2); \
    --i; \
    PREFETCH_X(res+PREF_OFFS(T), CA0); \
    ++res

#define UNROLL1_KERNEL3(OPER) \
    --i; \
    OPER(res[0], f1, f2); \
    ++res

#define UNROLL1_KERNEL3_PREPARE do {} while(0)
#define UNROLL1_KERNEL3_FIXUP do {} while(0)

#define UNROLL2_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        i -= 2; \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        res += 2; \
    } else { \
        OPER(res[0], f1, f2); \
        i -= 2; \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        res += 2; \
    } \

#define UNROLL2_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    i -= 2; \
    OPER(res[1], f1, f2); \
    res += 2

#define UNROLL2_KERNEL3_PREPARE do {} while(0)
#define UNROLL2_KERNEL3_FIXUP do {} while(0)

#define UNROLL4_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        i -= 4; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
        res += 4; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        i -= 4; \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        res += 4; \
    } else { \
        OPER(res[0], f1, f2); \
        i -= 4; \
        OPER(res[1], f1, f2); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[3], f1, f2); \
        res += 4; \
    }

#define UNROLL4_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    i -= 4; \
    OPER(res[2], f1, f2); \
    OPER(res[3], f1, f2); \
    res += 4

#define UNROLL4_KERNEL3_PREPARE do {} while(0)
#define UNROLL4_KERNEL3_FIXUP do {} while(0)

#define UNROLL8_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
        OPER(res[4], f1, f2); \
        i -= 8; \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+5, CA0); \
        OPER(res[6], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
        OPER(res[7], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+7, CA0); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[2], f1, f2); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[4], f1, f2); \
        i -= 8; \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[6], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
        OPER(res[7], f1, f2); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[2], f1, f2); \
        OPER(res[3], f1, f2); \
        i -= 8; \
        OPER(res[4], f1, f2); \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[6], f1, f2); \
        OPER(res[7], f1, f2); \
        res += 8; \
    } else { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[3], f1, f2); \
        OPER(res[4], f1, f2); \
        OPER(res[5], f1, f2); \
        i -= 8; \
        OPER(res[6], f1, f2); \
        OPER(res[7], f1, f2); \
        res += 8; \
    }

#define UNROLL8_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    OPER(res[2], f1, f2); \
    OPER(res[3], f1, f2); \
    i -= 8; \
    OPER(res[4], f1, f2); \
    OPER(res[5], f1, f2); \
    OPER(res[6], f1, f2); \
    OPER(res[7], f1, f2); \
    res += 8

#define UNROLL8_KERNEL3_PREPARE do {} while(0)
#define UNROLL8_KERNEL3_FIXUP do {} while(0)

#define PREF_AHEAD1(T,PREFETCH_X,CA0) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 3, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 4, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 5, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 6, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 7, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 8, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 9, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*10, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*11, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*12, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*13, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*14, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*4, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*5, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*6, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
    }


// Select default kernels
#if UNROLL_DEPTH == 1

# define UNR_PREF_KERNEL5 UNROLL1_PREF_KERNEL5
# define UNR_KERNEL5 UNROLL1_KERNEL5
# define UNR_KERNEL5_PREP UNROLL1_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX UNROLL1_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4 UNROLL1_PREF_KERNEL4
# define UNR_PREF_KERNEL4_STRIDE UNROLL1_PREF_KERNEL4_STRIDE
# define UNR_KERNEL4 UNROLL1_KERNEL4
# define UNR_KERNEL4_STRIDE UNROLL1_KERNEL4_STRIDE
# define UNR_KERNEL4_PREP UNROLL1_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX UNROLL1_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3 UNROLL1_PREF_KERNEL3
# define UNR_KERNEL3 UNROLL1_KERNEL3
# define UNR_KERNEL3_PREP UNROLL1_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX UNROLL1_KERNEL3_FIXUP

#elif UNROLL_DEPTH == 2

# define UNR_PREF_KERNEL5 UNROLL2_PREF_KERNEL5
# define UNR_KERNEL5 UNROLL2_KERNEL5
# define UNR_KERNEL5_PREP UNROLL2_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX UNROLL2_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4 UNROLL2_PREF_KERNEL4
# define UNR_PREF_KERNEL4_STRIDE UNROLL2_PREF_KERNEL4_STRIDE
# define UNR_KERNEL4 UNROLL2_KERNEL4
# define UNR_KERNEL4_STRIDE UNROLL2_KERNEL4_STRIDE
# define UNR_KERNEL4_PREP UNROLL2_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX UNROLL2_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3 UNROLL2_PREF_KERNEL3
# define UNR_KERNEL3 UNROLL2_KERNEL3
# define UNR_KERNEL3_PREP UNROLL2_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX UNROLL2_KERNEL3_FIXUP

#elif UNROLL_DEPTH == 4

# define UNR_PREF_KERNEL5 UNROLL4_PREF_KERNEL5
# define UNR_KERNEL5 UNROLL4_KERNEL5
# define UNR_KERNEL5_PREP UNROLL4_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX UNROLL4_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4 UNROLL4_PREF_KERNEL4
# define UNR_PREF_KERNEL4_STRIDE UNROLL4_PREF_KERNEL4_STRIDE
# define UNR_KERNEL4 UNROLL4_KERNEL4
# define UNR_KERNEL4_STRIDE UNROLL4_KERNEL4_STRIDE
# define UNR_KERNEL4_PREP UNROLL4_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX UNROLL4_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3 UNROLL4_PREF_KERNEL3
# define UNR_KERNEL3 UNROLL4_KERNEL3
# define UNR_KERNEL3_PREP UNROLL4_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX UNROLL4_KERNEL3_FIXUP

#elif UNROLL_DEPTH == 8

# define UNR_PREF_KERNEL5 UNROLL8_PREF_KERNEL5
# define UNR_KERNEL5 UNROLL8_KERNEL5
# define UNR_KERNEL5_PREP UNROLL8_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX UNROLL8_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4 UNROLL8_PREF_KERNEL4
# define UNR_PREF_KERNEL4_STRIDE UNROLL8_PREF_KERNEL4_STRIDE
# define UNR_KERNEL4 UNROLL8_KERNEL4
# define UNR_KERNEL4_STRIDE UNROLL8_KERNEL4_STRIDE
# define UNR_KERNEL4_PREP UNROLL8_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX UNROLL8_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3 UNROLL8_PREF_KERNEL3
# define UNR_KERNEL3 UNROLL8_KERNEL3
# define UNR_KERNEL3_PREP UNROLL8_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX UNROLL8_KERNEL3_FIXUP

#else

# error "UNROLL_DEPTH may only be 1, 2, 4, 8"

#endif /* UNROLL_DEPTH */
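
/*
 * UNROLL_DEPTH is a compile-time choice: it defaults to 4 near the top of this
 * file and may be overridden when the library is built, e.g. (illustrative
 * GCC-style invocation, not taken from the TBCI build system):
 *
 *   g++ -O2 -DUNROLL_DEPTH=8 -c some_translation_unit.cc
 *
 * Any value other than 1, 2, 4 or 8 triggers the #error above.
 */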

/****************************************************************
 * Macros with fragments for the implementation
 ****************************************************************/

#ifdef USE_PREFETCH

# define VKERN_TEMPL_3V_PREF(OP3,T) \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
        PREFETCH_W (res, 3); \
        PREF_AHEAD3(T,3,MAX(1,CACHE_LOC_READ),MAX(1,CACHE_LOC_READ)); \
        UNR_KERNEL5_PREP; \
        do { \
            UNR_PREF_KERNEL5(OP3,T,CACHE_LOC_WRITE,CACHE_LOC_READ,CACHE_LOC_READ); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
        UNR_KERNEL5_FIX; \
    }

# define VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_X,CW) \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
        PREFETCH_X (res, 3); \
        PREF_AHEAD2(T,PREFETCH_X,CW,MAX(1,CACHE_LOC_READ)); \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_PREF_KERNEL4(OP2,T,PREFETCH_X,CW,CACHE_LOC_READ); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
        UNR_KERNEL4_FIX; \
    }

# define VKERN_TEMPL_2V_PREF_STRIDE(OP2,T,PREFETCH_X,CW,RI) \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
        PREFETCH_X (res, 3); \
        PREF_AHEAD2_STRIDE(T,PREFETCH_X,CW,MAX(1,CACHE_LOC_READ),RI); \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_PREF_KERNEL4_STRIDE(OP2,T,PREFETCH_X,CW,CACHE_LOC_READ,RI); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
        UNR_KERNEL4_FIX; \
    }

# define VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_X,CW) \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
        PREFETCH_X (res, 3); \
        PREF_AHEAD1(T,PREFETCH_X,CW); \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_PREF_KERNEL3(OP1,T,PREFETCH_X,CW); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
        UNR_KERNEL3_FIX; \
    }

#else
# define VKERN_TEMPL_3V_PREF(OP,T) do {} while (0)
# define VKERN_TEMPL_2V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
# define VKERN_TEMPL_2V_PREF_STRIDE(OP2,T,PREFETCH_X,CW,RI) do {} while (0)
# define VKERN_TEMPL_1V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
#endif /* USE_PREFETCH */


/****************************************************************
 * Templates for routines
 ****************************************************************/

#define VKERN_TEMPL_3V(FNAME,OP3) \
    INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
        (const unsigned long, T* RESTRICT const, const T* RESTRICT const, const T* RESTRICT const);) \
    template <typename T> \
    VEC_INLINE void FNAME (const unsigned long sz, \
            T* RESTRICT const _res, \
            const T* RESTRICT const _v1, \
            const T* RESTRICT const _v2) \
    { \
        PREFETCH_W(_res, 3); \
        PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
        REGISTER const T *v1 = _v1, *v2 = _v2; \
        REGISTER T *res = _res; \
        REGISTER long i = sz; \
        VKERN_TEMPL_3V_PREF(OP3,T); \
        \
        if (LIKELY(i >= UNROLL_DEPTH)) { \
            UNR_KERNEL5_PREP; \
            do { \
                UNR_KERNEL5(OP3); \
            } while (i >= UNROLL_DEPTH); \
            UNR_KERNEL5_FIX; \
        } \
        \
        for (; i; --i) { \
            OP3(*res, *v1, *v2, f1, f2); \
            ++v1; ++v2; ++res; \
        } \
    }
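
/*
 * Illustrative use of the routine templates (hypothetical, not part of this
 * header): an operation macro that simply ignores the unused f1/f2 slots can
 * be passed to VKERN_TEMPL_3V to stamp out an unrolled, prefetching
 * element-wise routine.  This assumes the supporting macros (INST, VEC_INLINE,
 * RESTRICT, REGISTER, LIKELY, the PREFETCH_*, PREF_OFFS and EL_PER_CL macros)
 * are defined elsewhere in the library, e.g. via "tbci/basics.h".
 */
#if 0
#define EXAMPLE_OP_ADD(d, a, b, f1, f2)  (d) = (a) + (b)
VKERN_TEMPL_3V(example_vec_add, EXAMPLE_OP_ADD)
/* Roughly expands to: template <typename T> void example_vec_add(
   const unsigned long sz, T* res, const T* v1, const T* v2)
   computing res[k] = v1[k] + v2[k] for k < sz, with an unrolled main loop
   and a scalar tail loop. */
#endif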

#define VKERN_TEMPL_3V_C(FNAME,OP3) \
    INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
        (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
         const T* RESTRICT const, LCTYPED(T));) \
    template <typename T> \
    VEC_INLINE void FNAME (const unsigned long sz, \
            T* RESTRICT const _res, \
            const T* RESTRICT const _v1, \
            const T* RESTRICT const _v2, \
            LCTYPE(T) f2) \
    { \
        PREFETCH_W(_res, 3); \
        PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
        REGISTER const T *v1 = _v1, *v2 = _v2; \
        REGISTER T *res = _res; \
        REGISTER long i = sz; \
        VKERN_TEMPL_3V_PREF(OP3,T); \
        \
        if (LIKELY(i >= UNROLL_DEPTH)) { \
            UNR_KERNEL5_PREP; \
            do { \
                UNR_KERNEL5(OP3); \
            } while (i >= UNROLL_DEPTH); \
            UNR_KERNEL5_FIX; \
        } \
        \
        for (; i; --i) { \
            OP3(*res, *v1, *v2, f1, f2); \
            ++v1; ++v2; ++res; \
        } \
    }

#define VKERN_TEMPL_3V_CC(FNAME,OP3) \
    INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
        (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
         const T* RESTRICT const, LCTYPED(T), LCTYPED(T));) \
    template <typename T> \
    VEC_INLINE void FNAME (const unsigned long sz, \
            T* RESTRICT const _res, \
            const T* RESTRICT const _v1, \
            const T* RESTRICT const _v2, \
            LCTYPE(T) f1, \
            LCTYPE(T) f2) \
    { \
        PREFETCH_W(_res, 3); \
        PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
        REGISTER long i = sz; \
        REGISTER const T *v1 = _v1, *v2 = _v2; \
        REGISTER T *res = _res; \
        VKERN_TEMPL_3V_PREF(OP3,T); \
        \
        if (LIKELY(i >= UNROLL_DEPTH)) { \
            UNR_KERNEL5_PREP; \
            do { \
                UNR_KERNEL5(OP3); \
            } while (i >= UNROLL_DEPTH); \
            UNR_KERNEL5_FIX; \
        } \
        \
        for (; i; --i) { \
            OP3(*res, *v1, *v2, f1, f2); \
            ++v1; ++v2; ++res; \
        } \
    }

#define VKERN_TEMPL_2V(FNAME,OP2) \
    INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
        (const unsigned long, T* RESTRICT const, const T* RESTRICT const);) \
    template <typename T> \
    VEC_INLINE void FNAME (const unsigned long sz, \
            T* RESTRICT const _res, \
            const T* RESTRICT const _v1) \
    { \
        PREFETCH_W(_res, 3); \
        PREFETCH_R(_v1, 3); \
        REGISTER const T *v1 = _v1; \
        REGISTER T* res = _res; \
        REGISTER long i = sz; \
        VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_WRITE); \
        \
        if (LIKELY(i >= UNROLL_DEPTH)) { \
            UNR_KERNEL4_PREP; \
            do { \
                UNR_KERNEL4(OP2); \
            } while (i >= UNROLL_DEPTH); \
            UNR_KERNEL4_FIX; \
        } \
        \
        for (; i; --i) { \
            OP2(*res, *v1, f1, f2); \
            ++v1; ++res; \
        } \
    }

#define VKERN_TEMPL_2V_C(FNAME,OP2) \
    INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
        (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
         LCTYPED(T));) \
    template <typename T> \
    VEC_INLINE void FNAME (const unsigned long sz, \
            T* RESTRICT const _res, \
            const T* RESTRICT const _v1, \
            LCTYPE(T) f2) \
    { \
        PREFETCH_W(_res, 3); \
        PREFETCH_R(_v1, 3); \
        REGISTER const T *v1 = _v1; \
        REGISTER T* res = _res; \
        REGISTER long i = sz; \
        VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
        \
        if (LIKELY(i >= UNROLL_DEPTH)) { \
            UNR_KERNEL4_PREP; \
            do { \
                UNR_KERNEL4(OP2); \
            } while (i >= UNROLL_DEPTH); \
            UNR_KERNEL4_FIX; \
        } \
        \
        for (; i; --i) { \
            OP2(*res, *v1, f1, f2); \
            ++v1; ++res; \
        } \
    }

#define VKERN_TEMPL_2V_CC(FNAME,OP2) \
    INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
        (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
         LCTYPED(T), LCTYPED(T));) \
    template <typename T> \
    VEC_INLINE void FNAME (const unsigned long sz, \
            T* RESTRICT const _res, \
            const T* RESTRICT const _v1, \
            LCTYPE(T) f1, \
            LCTYPE(T) f2) \
    { \
        PREFETCH_W(_res, 3); \
        PREFETCH_R(_v1, 3); \
        REGISTER const T *v1 = _v1; \
        REGISTER T* res = _res; \
        REGISTER long i = sz; \
        VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
        \
        if (LIKELY(i >= UNROLL_DEPTH)) { \
            UNR_KERNEL4_PREP; \
            do { \
                UNR_KERNEL4(OP2); \
            } while (i >= UNROLL_DEPTH); \
            UNR_KERNEL4_FIX; \
        } \
        \
        for (; i; --i) { \
            OP2(*res, *v1, f1, f2); \
            ++v1; ++res; \
        } \
    }

#define VKERN_TEMPL_2V_T(FNAME,OP2,TYPE) \
    INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
        (const unsigned long, const T* RESTRICT const, \
         const T* RESTRICT const, TYPE&);) \
    template <typename T> \
    VEC_INLINE void FNAME (const unsigned long sz, \
            const T* RESTRICT const _res, \
            const T* RESTRICT const _v1, \
            TYPE &_f2) \
    { \
        PREFETCH_R(_res, 3); \
        PREFETCH_R(_v1, 3); \
        REGISTER const T *v1 = _v1; \
        REGISTER const T* res = _res; \
        /* REGISTER typename tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
        REGISTER TYPE f2(_f2), f1(0.0); \
        REGISTER long i = sz; \
        VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_READ); \
        \
        if (LIKELY(i >= UNROLL_DEPTH)) { \
            UNR_KERNEL4_PREP; \
            do { \
                UNR_KERNEL4(OP2); \
            } while (i >= UNROLL_DEPTH); \
            UNR_KERNEL4_FIX; \
        } \
        \
        for (; i; --i) { \
            OP2(*res, *v1, f1, f2); \
            ++v1; ++res; \
        } \
    _fin: \
        _f2 = f2 - f1; \
    }

#define VKERN_TEMPL_2V_T_STRIDE(FNAME,OP2,TYPE) \
    INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
        (const unsigned long, const T* RESTRICT const, \
         const T* RESTRICT const, TYPE&, const unsigned);) \
    template <typename T> \
    VEC_INLINE void FNAME (const unsigned long sz, \
            const T* RESTRICT const _res, \
            const T* RESTRICT const _v1, \
            TYPE &_f2, const unsigned rincr) \
    { \
        PREFETCH_R(_res, 3); \
        PREFETCH_R(_v1, 3); \
        REGISTER const T *v1 = _v1; \
        REGISTER const T* res = _res; \
        /* REGISTER typename tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
        REGISTER TYPE f2(_f2), f1(0.0); \
        REGISTER long i = sz; \
        VKERN_TEMPL_2V_PREF_STRIDE(OP2,T,PREFETCH_R,CACHE_LOC_READ,rincr); \
        \
        if (LIKELY(i >= UNROLL_DEPTH)) { \
            UNR_KERNEL4_PREP; \
            do { \
                UNR_KERNEL4_STRIDE(OP2,rincr); \
            } while (i >= UNROLL_DEPTH); \
            UNR_KERNEL4_FIX; \
        } \
        \
        for (; i; --i) { \
            OP2(*res, *v1, f1, f2); \
            ++v1; res += rincr; \
        } \
    _fin: \
        _f2 = f2 - f1; \
    }

#define VKERN_TEMPL_1V(FNAME,OP1) \
    INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
        (const unsigned long, T* RESTRICT const);) \
    template <typename T> \
    VEC_INLINE void FNAME (const unsigned long sz, \
            T* RESTRICT const _res) \
    { \
        PREFETCH_W(_res, 3); \
        REGISTER long i = sz; \
        REGISTER T* res = _res; \
        VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
        \
        if (LIKELY(i >= UNROLL_DEPTH)) { \
            UNR_KERNEL3_PREP; \
            do { \
                UNR_KERNEL3(OP1); \
            } while (i >= UNROLL_DEPTH); \
            UNR_KERNEL3_FIX; \
        } \
        \
        for (; i; --i) { \
            OP1(*res, f1, f2); \
            ++res; \
        } \
    }

#define VKERN_TEMPL_1V_C(FNAME,OP1) \
    INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
        (const unsigned long, T* RESTRICT const, LCTYPED(T));) \
    template <typename T> \
    VEC_INLINE void FNAME (const unsigned long sz, \
            T* RESTRICT const _res, \
            LCTYPE(T) f2) \
    { \
        PREFETCH_W(_res, 3); \
        REGISTER long i = sz; \
        REGISTER T* res = _res; \
        VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
        \
        if (LIKELY(i >= UNROLL_DEPTH)) { \
            UNR_KERNEL3_PREP; \
            do { \
                UNR_KERNEL3(OP1); \
            } while (i >= UNROLL_DEPTH); \
            UNR_KERNEL3_FIX; \
        } \
        \
        for (; i; --i) { \
            OP1(*res, f1, f2); \
            ++res; \
        } \
    }

#define VKERN_TEMPL_1V_CC(FNAME,OP1) \
    INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
        (const unsigned long, T* RESTRICT const, \
         LCTYPED(T), LCTYPED(T));) \
    template <typename T> \
    VEC_INLINE void FNAME (const unsigned long sz, \
            T* RESTRICT const _res, \
            LCTYPE(T) f1, \
            LCTYPE(T) f2) \
    { \
        PREFETCH_W(_res, 3); \
        REGISTER long i = sz; \
        REGISTER T* res = _res; \
        VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
        \
        if (LIKELY(i >= UNROLL_DEPTH)) { \
            UNR_KERNEL3_PREP; \
            do { \
                UNR_KERNEL3(OP1); \
            } while (i >= UNROLL_DEPTH); \
            UNR_KERNEL3_FIX; \
        } \
        \
        for (; i; --i) { \
            OP1(*res, f1, f2); \
            ++res; \
        } \
    }

#define VKERN_TEMPL_1V_T(FNAME,OP1,TYPE) \
    INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
        (const unsigned long, const T* const, TYPE&);) \
    template <typename T> \
    VEC_INLINE void FNAME (const unsigned long sz, \
            const T* const _res, \
            TYPE &_f2) \
    { \
        PREFETCH_R(_res, 3); \
        /* REGISTER typename tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
        REGISTER TYPE f2(_f2), f1(0.0); \
        REGISTER const T* res = _res; \
        REGISTER long i = sz; \
        VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); \
        \
        if (LIKELY(i >= UNROLL_DEPTH)) { \
            UNR_KERNEL3_PREP; \
            do { \
                UNR_KERNEL3(OP1); \
            } while (i >= UNROLL_DEPTH); \
            UNR_KERNEL3_FIX; \
        } \
        \
        for (; i; --i) { \
            OP1(*res, f1, f2); \
            ++res; \
        } \
        _f2 = f2 - f1; \
    }

#define VKERN_TEMPL_1V_T_LD(FNAME,OP1,TYPE) \
    INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
        (const unsigned long, const T* const, TYPE&);) \
    template <typename T> \
    VEC_INLINE void FNAME (const unsigned long sz, \
            const T* const _res, \
            TYPE &_f2) \
    { \
        PREFETCH_R(_res, 3); \
        /* REGISTER typename tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
        REGISTER LONG_DOUBLE f2(_f2); \
        REGISTER const T* res = _res; \
        REGISTER long i = sz; \
        VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); \
        \
        if (LIKELY(i >= UNROLL_DEPTH)) { \
            UNR_KERNEL3_PREP; \
            do { \
                UNR_KERNEL3(OP1); \
            } while (i >= UNROLL_DEPTH); \
            UNR_KERNEL3_FIX; \
        } \
        \
        for (; i; --i) { \
            OP1(*res, f1, f2); \
            ++res; \
        } \
        _f2 = f2; \
    }

#endif /* TBCI_UNROLL_PREFETCH_DEF_H */