/*
 * TBCI Numerical high-performance C++ library, version 2.8.0
 * unroll_prefetch_def.h -- loop-unrolling and software-prefetch kernel macros.
 */
8#ifndef TBCI_UNROLL_PREFETCH_DEF_H
9#define TBCI_UNROLL_PREFETCH_DEF_H
10
11//#include "tbci/basics.h"
12
/* Declarator for a loop-constant value-or-const-reference of element type T,
 * as selected by the tbci_traits<T> policy.  LCTYPE is for dependent
 * (template) contexts and therefore needs 'typename'; LCTYPED is the
 * non-dependent variant.
 * NOTE(review): REGISTER is presumably a register-hint macro from
 * "tbci/basics.h" (include commented out above) -- confirm. */
#define LCTYPE(T) REGISTER typename tbci_traits<T>::loop_const_refval_type
#define LCTYPED(T) REGISTER tbci_traits<T>::loop_const_refval_type

/* Default unroll factor for all kernels below; the build may predefine it
 * as 1, 2, 4 or 8 (validated by the #if chain at the end of this header). */
#ifndef UNROLL_DEPTH
# define UNROLL_DEPTH 4
#endif
42
43/***********************************************************
44 * 3 pointer operations
45 ***********************************************************/
46
/* Depth-1 kernel for the 3-stream operation OPER(res[k], v1[k], v2[k], f1, f2)
 * with software prefetch interleaved between the pointer updates.
 * Uses in-scope variables: res, v1, v2 (streams), i (remaining count),
 * f1, f2 (scalar factors).  PREF_OFFS(T) is the prefetch distance in
 * elements; CA0/CA1/CA2 are cache-locality hints for res/v1/v2
 * (PREFETCH_R/PREFETCH_W are defined elsewhere, presumably in basics.h). */
#define UNROLL1_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    --i; \
    PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
    ++v1; \
    PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
    ++v2; \
    PREFETCH_W(res+PREF_OFFS(T), CA0); \
    ++res
57
/* Depth-1 (no-prefetch) 3-stream kernel: applies
 * OPER(res[0], v1[0], v2[0], f1, f2) to one element, then advances all
 * three stream pointers and decrements the remaining count i. */
#define UNROLL1_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    ++res; \
    ++v1; \
    ++v2; \
    --i

/* Depth 1 needs neither setup nor remainder fix-up. */
#define UNROLL1_KERNEL5_PREPARE do {} while(0)
#define UNROLL1_KERNEL5_FIXUP   do {} while(0)
66
67
/* Depth-2 3-stream kernel with prefetch.  EL_PER_CL(T) (elements per cache
 * line, a compile-time constant, so the branch folds away) selects how many
 * prefetches cover the two elements consumed per iteration: two prefetch
 * lines when a cache line holds at most one element, one otherwise. */
#define UNROLL2_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        i -= 2; \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        v1 += 2; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
        OPER(res[1], v1[-1], v2[1], f1, f2); /* v1[-1] == pre-bump v1[1] */ \
        v2 += 2; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
        res += 2; \
    } else { \
        i -= 2; \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        v1 += 2; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[1], v1[-1], v2[1], f1, f2); \
        v2 += 2; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        res += 2; \
    }
95
/* Depth-2 (no-prefetch) 3-stream kernel: two applications of OPER, then
 * one batched advance of every stream pointer and the count. */
#define UNROLL2_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    i -= 2; \
    v1 += 2; \
    v2 += 2; \
    res += 2

/* No setup / fix-up required at depth 2. */
#define UNROLL2_KERNEL5_PREPARE do {} while(0)
#define UNROLL2_KERNEL5_FIXUP   do {} while(0)
105
106
/* Depth-4 3-stream kernel with prefetch.  The compile-time EL_PER_CL(T)
 * branch emits 4, 2 or 1 prefetches per stream per iteration -- one per
 * cache line spanned by the 4 elements consumed.  Negative indices after a
 * pointer bump address the same elements as positive indices before it. */
#define UNROLL4_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        v1 += 4; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+3, CA2); \
        OPER(res[2], v1[-2], v2[2], f1, f2); \
        v2 += 4; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+3, CA0); \
        OPER(res[3], v1[-1], v2[-1], f1, f2); \
        res += 4; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        v1 += 4; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        OPER(res[2], v1[-2], v2[2], f1, f2); \
        v2 += 4; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], v1[-1], v2[-1], f1, f2); \
        res += 4; \
    } else { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        v1 += 4; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[2], v1[-2], v2[2], f1, f2); \
        v2 += 4; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        OPER(res[3], v1[-1], v2[-1], f1, f2); \
        res += 4; \
    }
158
/* Depth-4 (no-prefetch) 3-stream kernel: four applications of OPER on
 * consecutive elements, then one batched pointer/count update. */
#define UNROLL4_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    OPER(res[2], v1[2], v2[2], f1, f2); \
    OPER(res[3], v1[3], v2[3], f1, f2); \
    i -= 4; \
    v1 += 4; \
    v2 += 4; \
    res += 4

/* No setup / fix-up required at depth 4. */
#define UNROLL4_KERNEL5_PREPARE do {} while(0)
#define UNROLL4_KERNEL5_FIXUP   do {} while(0)
172
173
/* Depth-8 3-stream kernel with prefetch.  Four compile-time branches on
 * EL_PER_CL(T) emit 8, 4, 2 or 1 prefetches per stream per iteration; the
 * prefetches are hand-interleaved between the OPER invocations to spread
 * memory-subsystem pressure.  Statement order is deliberate -- do not
 * reorder. */
#define UNROLL8_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        i -= 8; \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+5, CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+7, CA1); \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 8; \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+3, CA2); \
        OPER(res[4], v1[-4], v2[4], f1, f2); /* v1[-4] == pre-bump v1[4] */ \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+5, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+6, CA2); \
        OPER(res[5], v1[-3], v2[5], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+7, CA2); \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[6], v1[-2], v2[6], f1, f2); \
        v2 += 8; \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+3, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[7], v1[-1], v2[-1], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T)+5, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+6, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+7, CA0); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        i -= 8; \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 8; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        OPER(res[4], v1[-4], v2[4], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+6, CA2); \
        OPER(res[5], v1[-3], v2[5], f1, f2); \
        v2 += 8; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[6], v1[-2], v2[-2], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+6, CA0); \
        OPER(res[7], v1[-1], v2[-1], f1, f2); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        i -= 8; \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 8; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[4], v1[-4], v2[4], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
        OPER(res[5], v1[-3], v2[5], f1, f2); \
        v2 += 8; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        OPER(res[6], v1[-2], v2[-2], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[7], v1[-1], v2[-1], f1, f2); \
        res += 8; \
    } else { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        i -= 8; \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 8; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[4], v1[-4], v2[4], f1, f2); \
        OPER(res[5], v1[-3], v2[5], f1, f2); \
        v2 += 8; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        OPER(res[6], v1[-2], v2[-2], f1, f2); \
        OPER(res[7], v1[-1], v2[-1], f1, f2); \
        res += 8; \
    }
274
275
/* Depth-8 (no-prefetch) 3-stream kernel: eight applications of OPER on
 * consecutive elements, then one batched pointer/count update. */
#define UNROLL8_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    OPER(res[2], v1[2], v2[2], f1, f2); \
    OPER(res[3], v1[3], v2[3], f1, f2); \
    OPER(res[4], v1[4], v2[4], f1, f2); \
    OPER(res[5], v1[5], v2[5], f1, f2); \
    OPER(res[6], v1[6], v2[6], f1, f2); \
    OPER(res[7], v1[7], v2[7], f1, f2); \
    i -= 8; \
    v1 += 8; \
    v2 += 8; \
    res += 8

/* No setup / fix-up required at depth 8. */
#define UNROLL8_KERNEL5_PREPARE do {} while(0)
#define UNROLL8_KERNEL5_FIXUP   do {} while(0)
293
/* Warm the cache ahead of the 3-stream main loop: prefetch the next
 * PREFETCH_AHEAD cache lines (capped at 16) of v1, v2 (read) and res
 * (write) before the unrolled loop starts.  PREFETCH_AHEAD is a
 * compile-time constant, so exactly one branch survives.  Element 0 of each
 * stream is not prefetched here -- the callers issue that separately. */
#define PREF_AHEAD3(T,CA0,CA1,CA2) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 7, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 2, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 3, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 4, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 5, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 6, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 7, CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 2, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 3, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 4, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 5, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 6, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 7, CA0); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 8, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 9, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*10, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*11, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*12, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*13, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*14, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*15, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 8, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 9, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*10, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*11, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*12, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*13, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*14, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*15, CA2); \
        PREFETCH_W(res+EL_PER_CL(T)* 8, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 9, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*10, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*11, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*12, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*13, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*14, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*7, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*2, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*3, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*4, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*5, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*6, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*7, CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*3, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*4, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*5, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*6, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*2, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*3, CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
    }
379
380
381/***********************************************************
382 * 2 pointer operations
383 ***********************************************************/
384
/* Depth-1 2-stream kernel (res, v1) with prefetch; res advances by RI
 * elements per step.  PREFETCH_X is PREFETCH_R or PREFETCH_W depending on
 * whether OPER reads or only writes res; CA0/CA1 are cache-locality hints. */
#define UNROLL1_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,RI) \
    OPER(res[0], v1[0], f1, f2); \
    --i; \
    PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
    ++v1; \
    PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
    res+=RI

/* Unit-stride wrapper.
 * BUG FIX: this previously expanded to itself (UNROLL1_PREF_KERNEL4) instead
 * of UNROLL1_PREF_KERNEL4_STRIDE, and declared a spurious trailing RI
 * parameter that none of the UNROLL2/4/8 wrappers have -- any use failed to
 * compile.  Signature now matches its siblings and the VKERN_TEMPL_2V_PREF
 * call site, which passes five arguments. */
#define UNROLL1_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    UNROLL1_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)
396
/* Depth-1 (no-prefetch) 2-stream kernel; res advances by RI per element. */
#define UNROLL1_KERNEL4_STRIDE(OPER,RI) \
    OPER(res[0], v1[0], f1, f2); \
    ++v1; \
    res += RI; \
    --i

/* Unit-stride convenience wrapper. */
#define UNROLL1_KERNEL4(OPER) \
    UNROLL1_KERNEL4_STRIDE(OPER,1)

/* No setup / fix-up required at depth 1. */
#define UNROLL1_KERNEL4_PREPARE do {} while(0)
#define UNROLL1_KERNEL4_FIXUP   do {} while(0)
408
409
/* Depth-2 2-stream kernel with prefetch; res advances by RI per element.
 * The compile-time EL_PER_CL(T) branch emits two or one prefetches per
 * stream, one per cache line spanned by the two elements consumed. */
#define UNROLL2_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,RI) \
    if (EL_PER_CL(T) <= 1) { \
        i -= 2; \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        OPER(res[RI],v1[1], f1, f2); \
        v1 += 2; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+1), CA0); \
        res += 2*RI; \
    } else { \
        i -= 2; \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[RI],v1[1], f1, f2); \
        v1 += 2; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        res += 2*RI; \
    }

/* Unit-stride convenience wrapper. */
#define UNROLL2_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    UNROLL2_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)
434
/* Depth-2 (no-prefetch) 2-stream kernel; res advances by RI per element. */
#define UNROLL2_KERNEL4_STRIDE(OPER,RI) \
    OPER(res[0], v1[0], f1, f2); \
    OPER(res[RI], v1[1], f1, f2); \
    i -= 2; \
    v1 += 2; \
    res += 2*RI

/* Unit-stride convenience wrapper. */
#define UNROLL2_KERNEL4(OPER) \
    UNROLL2_KERNEL4_STRIDE(OPER,1)

/* No setup / fix-up required at depth 2. */
#define UNROLL2_KERNEL4_PREPARE do {} while(0)
#define UNROLL2_KERNEL4_FIXUP   do {} while(0)
447
448
/* Depth-4 2-stream kernel with prefetch; res advances by RI per element.
 * EL_PER_CL(T) (compile-time) selects 4, 2 or 1 prefetches per stream per
 * iteration.  v1[-1] after the bump addresses pre-bump v1[3]. */
#define UNROLL4_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,RI) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        OPER(res[RI],v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        OPER(res[2*RI], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+1), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+2), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+3), CA0); \
        OPER(res[3*RI], v1[-1], f1, f2); \
        res += 4*RI; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[RI],v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+2), CA0); \
        OPER(res[3*RI], v1[-1], f1, f2); \
        res += 4*RI; \
    } else { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        OPER(res[RI],v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        OPER(res[3*RI], v1[-1], f1, f2); \
        res += 4*RI; \
    }

/* Unit-stride convenience wrapper. */
#define UNROLL4_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    UNROLL4_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)
493
/* Depth-4 (no-prefetch) 2-stream kernel; res advances by RI per element. */
#define UNROLL4_KERNEL4_STRIDE(OPER,RI) \
    OPER(res[0], v1[0], f1, f2); \
    OPER(res[RI], v1[1], f1, f2); \
    OPER(res[2*RI], v1[2], f1, f2); \
    OPER(res[3*RI], v1[3], f1, f2); \
    i -= 4; \
    v1 += 4; \
    res += 4*RI

/* Unit-stride convenience wrapper. */
#define UNROLL4_KERNEL4(OPER) \
    UNROLL4_KERNEL4_STRIDE(OPER,1)

/* No setup / fix-up required at depth 4. */
#define UNROLL4_KERNEL4_PREPARE do {} while(0)
#define UNROLL4_KERNEL4_FIXUP   do {} while(0)
508
509
/* Depth-8 2-stream kernel with prefetch; res advances by RI per element.
 * Four compile-time branches on EL_PER_CL(T) emit 8, 4, 2 or 1 prefetches
 * per stream per iteration, hand-interleaved with the OPER invocations.
 * Statement order is deliberate -- do not reorder. */
#define UNROLL8_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,RI) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        OPER(res[RI],v1[1], f1, f2); \
        i -= 8; \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+5, CA1); \
        OPER(res[3*RI], v1[3], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+7, CA1); \
        OPER(res[4*RI], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+1), CA0); \
        OPER(res[5*RI], v1[-3], f1, f2); /* v1[-3] == pre-bump v1[5] */ \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+2), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+3), CA0); \
        OPER(res[6*RI], v1[-2], f1, f2); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+4), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+5), CA0); \
        OPER(res[7*RI], v1[-1], f1, f2); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+6), CA0); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+7), CA0); \
        res += 8*RI; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[RI],v1[1], f1, f2); \
        i -= 8; \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[3*RI], v1[3], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        OPER(res[4*RI], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        OPER(res[5*RI], v1[-3], f1, f2); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+2), CA0); \
        OPER(res[6*RI], v1[-2], f1, f2); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+4), CA0); \
        OPER(res[7*RI], v1[-1], f1, f2); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+6), CA0); \
        res += 8*RI; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[RI],v1[1], f1, f2); \
        OPER(res[2*RI], v1[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[3*RI], v1[3], f1, f2); \
        OPER(res[4*RI], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        OPER(res[5*RI], v1[-3], f1, f2); \
        i -= 8; \
        OPER(res[6*RI], v1[-2], f1, f2); \
        PREFETCH_X(res+RI*(PREF_OFFS(T)+4), CA0); \
        OPER(res[7*RI], v1[-1], f1, f2); \
        res += 8*RI; \
    } else { \
        OPER(res[0], v1[0], f1, f2); \
        OPER(res[RI],v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        OPER(res[3*RI], v1[3], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
        OPER(res[4*RI], v1[-4], f1, f2); \
        OPER(res[5*RI], v1[-3], f1, f2); \
        i -= 8; \
        OPER(res[6*RI], v1[-2], f1, f2); \
        OPER(res[7*RI], v1[-1], f1, f2); \
        res += 8*RI; \
    }

/* Unit-stride convenience wrapper. */
#define UNROLL8_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    UNROLL8_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)
594
/* Depth-8 (no-prefetch) 2-stream kernel; res advances by RI per element. */
#define UNROLL8_KERNEL4_STRIDE(OPER,RI) \
    OPER(res[0], v1[0], f1, f2); \
    OPER(res[RI], v1[1], f1, f2); \
    OPER(res[2*RI], v1[2], f1, f2); \
    OPER(res[3*RI], v1[3], f1, f2); \
    OPER(res[4*RI], v1[4], f1, f2); \
    OPER(res[5*RI], v1[5], f1, f2); \
    OPER(res[6*RI], v1[6], f1, f2); \
    OPER(res[7*RI], v1[7], f1, f2); \
    i -= 8; \
    v1 += 8; \
    res += 8*RI

/* Unit-stride convenience wrapper. */
#define UNROLL8_KERNEL4(OPER) \
    UNROLL8_KERNEL4_STRIDE(OPER,1)

/* No setup / fix-up required at depth 8. */
#define UNROLL8_KERNEL4_PREPARE do {} while(0)
#define UNROLL8_KERNEL4_FIXUP   do {} while(0)
613
614
/* Warm the cache ahead of the 2-stream main loop: prefetch the next
 * PREFETCH_AHEAD cache lines (capped at 16) of v1 (read) and res (via
 * PREFETCH_X, read or write), with res stepped by RI elements per line.
 * PREFETCH_AHEAD is compile-time, so one branch survives. */
#define PREF_AHEAD2_STRIDE(T,PREFETCH_X,CA0,CA1,RI) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 7, CA1); \
        PREFETCH_X(res+RI*EL_PER_CL(T), CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 2, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 3, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 4, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 5, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 6, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 7, CA0); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 8, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 9, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*10, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*11, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*12, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*13, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*14, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*15, CA1); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 8, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)* 9, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*10, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*11, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*12, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*13, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*14, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*7, CA1); \
        PREFETCH_X(res+RI*EL_PER_CL(T), CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*3, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*4, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*5, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*6, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_X(res+RI*EL_PER_CL(T), CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+RI*EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_X(res+RI*EL_PER_CL(T), CA0); \
    }

/* Unit-stride convenience wrapper. */
#define PREF_AHEAD2(T,PREFETCH_X,CA0,CA1) \
    PREF_AHEAD2_STRIDE(T,PREFETCH_X,CA0,CA1,1)
677
678/***********************************************************
679 * 1 pointer operations
680 ***********************************************************/
681
/* Depth-1 1-stream kernel with prefetch: OPER(res[k], f1, f2) writes (or
 * updates) res only; PREFETCH_X (PREFETCH_R or PREFETCH_W) and hint CA0
 * cover the res stream at distance PREF_OFFS(T). */
#define UNROLL1_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    OPER(res[0], f1, f2); \
    --i; \
    PREFETCH_X(res+PREF_OFFS(T), CA0); \
    ++res
688
/* Depth-1 (no-prefetch) 1-stream kernel: one OPER on res[0], then advance. */
#define UNROLL1_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    ++res; \
    --i

/* No setup / fix-up required at depth 1. */
#define UNROLL1_KERNEL3_PREPARE do {} while(0)
#define UNROLL1_KERNEL3_FIXUP   do {} while(0)
697
698
/* Depth-2 1-stream kernel with prefetch; the compile-time EL_PER_CL(T)
 * branch emits two or one res-stream prefetches per iteration. */
#define UNROLL2_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        i -= 2; \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        res += 2; \
    } else { \
        OPER(res[0], f1, f2); \
        i -= 2; \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        res += 2; \
    }
715
716
/* Depth-2 (no-prefetch) 1-stream kernel. */
#define UNROLL2_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    res += 2; \
    i -= 2

/* No setup / fix-up required at depth 2. */
#define UNROLL2_KERNEL3_PREPARE do {} while(0)
#define UNROLL2_KERNEL3_FIXUP   do {} while(0)
726
727
/* Depth-4 1-stream kernel with prefetch; EL_PER_CL(T) (compile-time)
 * selects 4, 2 or 1 res-stream prefetches per iteration. */
#define UNROLL4_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        i -= 4; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
        res += 4; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        i -= 4; \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        res += 4; \
    } else { \
        OPER(res[0], f1, f2); \
        i -= 4; \
        OPER(res[1], f1, f2); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[3], f1, f2); \
        res += 4; \
    }
759
/* Depth-4 (no-prefetch) 1-stream kernel. */
#define UNROLL4_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    OPER(res[2], f1, f2); \
    OPER(res[3], f1, f2); \
    res += 4; \
    i -= 4

/* No setup / fix-up required at depth 4. */
#define UNROLL4_KERNEL3_PREPARE do {} while(0)
#define UNROLL4_KERNEL3_FIXUP   do {} while(0)
771
772
/* Depth-8 1-stream kernel with prefetch; four compile-time branches on
 * EL_PER_CL(T) emit 8, 4, 2 or 1 res-stream prefetches per iteration,
 * interleaved with the OPER invocations.  Order is deliberate. */
#define UNROLL8_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
        OPER(res[4], f1, f2); \
        i -= 8; \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+5, CA0); \
        OPER(res[6], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
        OPER(res[7], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+7, CA0); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[2], f1, f2); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[4], f1, f2); \
        i -= 8; \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[6], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
        OPER(res[7], f1, f2); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[2], f1, f2); \
        OPER(res[3], f1, f2); \
        i -= 8; \
        OPER(res[4], f1, f2); \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[6], f1, f2); \
        OPER(res[7], f1, f2); \
        res += 8; \
    } else { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[3], f1, f2); \
        OPER(res[4], f1, f2); \
        OPER(res[5], f1, f2); \
        i -= 8; \
        OPER(res[6], f1, f2); \
        OPER(res[7], f1, f2); \
        res += 8; \
    }
835
836
/* Depth-8 (no-prefetch) 1-stream kernel. */
#define UNROLL8_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    OPER(res[2], f1, f2); \
    OPER(res[3], f1, f2); \
    OPER(res[4], f1, f2); \
    OPER(res[5], f1, f2); \
    OPER(res[6], f1, f2); \
    OPER(res[7], f1, f2); \
    res += 8; \
    i -= 8

/* No setup / fix-up required at depth 8. */
#define UNROLL8_KERNEL3_PREPARE do {} while(0)
#define UNROLL8_KERNEL3_FIXUP   do {} while(0)
852
853
/* Warm the cache ahead of the 1-stream main loop: prefetch the next
 * PREFETCH_AHEAD cache lines (capped at 16) of res via PREFETCH_X with
 * hint CA0.  PREFETCH_AHEAD is compile-time, so one branch survives. */
#define PREF_AHEAD1(T,PREFETCH_X,CA0) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 3, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 4, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 5, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 6, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 7, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 8, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 9, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*10, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*11, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*12, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*13, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*14, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*4, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*5, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*6, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
    }
887
888
889
// Select default kernels: map the generic UNR_* names used by the
// VKERN_TEMPL_* templates onto the UNROLLn_* implementations matching the
// configured UNROLL_DEPTH (must be 1, 2, 4 or 8).
#if UNROLL_DEPTH == 1

# define UNR_PREF_KERNEL5        UNROLL1_PREF_KERNEL5
# define UNR_KERNEL5             UNROLL1_KERNEL5
# define UNR_KERNEL5_PREP        UNROLL1_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX         UNROLL1_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4        UNROLL1_PREF_KERNEL4
# define UNR_PREF_KERNEL4_STRIDE UNROLL1_PREF_KERNEL4_STRIDE
# define UNR_KERNEL4             UNROLL1_KERNEL4
# define UNR_KERNEL4_STRIDE      UNROLL1_KERNEL4_STRIDE
# define UNR_KERNEL4_PREP        UNROLL1_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX         UNROLL1_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3        UNROLL1_PREF_KERNEL3
# define UNR_KERNEL3             UNROLL1_KERNEL3
# define UNR_KERNEL3_PREP        UNROLL1_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX         UNROLL1_KERNEL3_FIXUP

#elif UNROLL_DEPTH == 2

# define UNR_PREF_KERNEL5        UNROLL2_PREF_KERNEL5
# define UNR_KERNEL5             UNROLL2_KERNEL5
# define UNR_KERNEL5_PREP        UNROLL2_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX         UNROLL2_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4        UNROLL2_PREF_KERNEL4
# define UNR_PREF_KERNEL4_STRIDE UNROLL2_PREF_KERNEL4_STRIDE
# define UNR_KERNEL4             UNROLL2_KERNEL4
# define UNR_KERNEL4_STRIDE      UNROLL2_KERNEL4_STRIDE
# define UNR_KERNEL4_PREP        UNROLL2_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX         UNROLL2_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3        UNROLL2_PREF_KERNEL3
# define UNR_KERNEL3             UNROLL2_KERNEL3
# define UNR_KERNEL3_PREP        UNROLL2_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX         UNROLL2_KERNEL3_FIXUP

#elif UNROLL_DEPTH == 4

# define UNR_PREF_KERNEL5        UNROLL4_PREF_KERNEL5
# define UNR_KERNEL5             UNROLL4_KERNEL5
# define UNR_KERNEL5_PREP        UNROLL4_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX         UNROLL4_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4        UNROLL4_PREF_KERNEL4
# define UNR_PREF_KERNEL4_STRIDE UNROLL4_PREF_KERNEL4_STRIDE
# define UNR_KERNEL4             UNROLL4_KERNEL4
# define UNR_KERNEL4_STRIDE      UNROLL4_KERNEL4_STRIDE
# define UNR_KERNEL4_PREP        UNROLL4_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX         UNROLL4_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3        UNROLL4_PREF_KERNEL3
# define UNR_KERNEL3             UNROLL4_KERNEL3
# define UNR_KERNEL3_PREP        UNROLL4_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX         UNROLL4_KERNEL3_FIXUP

#elif UNROLL_DEPTH == 8

# define UNR_PREF_KERNEL5        UNROLL8_PREF_KERNEL5
# define UNR_KERNEL5             UNROLL8_KERNEL5
# define UNR_KERNEL5_PREP        UNROLL8_KERNEL5_PREPARE
# define UNR_KERNEL5_FIX         UNROLL8_KERNEL5_FIXUP

# define UNR_PREF_KERNEL4        UNROLL8_PREF_KERNEL4
# define UNR_PREF_KERNEL4_STRIDE UNROLL8_PREF_KERNEL4_STRIDE
# define UNR_KERNEL4             UNROLL8_KERNEL4
# define UNR_KERNEL4_STRIDE      UNROLL8_KERNEL4_STRIDE
# define UNR_KERNEL4_PREP        UNROLL8_KERNEL4_PREPARE
# define UNR_KERNEL4_FIX         UNROLL8_KERNEL4_FIXUP

# define UNR_PREF_KERNEL3        UNROLL8_PREF_KERNEL3
# define UNR_KERNEL3             UNROLL8_KERNEL3
# define UNR_KERNEL3_PREP        UNROLL8_KERNEL3_PREPARE
# define UNR_KERNEL3_FIX         UNROLL8_KERNEL3_FIXUP

#else

# error "UNROLL_DEPTH may only be 1, 2, 4, 8"

#endif /* UNROLL_DEPTH */
972
990
991/****************************************************************
992 * Macros with fragments for the implementation
993 ****************************************************************/
994
#ifdef USE_PREFETCH
/* Prefetching main-loop fragments: each runs the unrolled prefetch kernel
 * while at least UNROLL_DEPTH + PREF_OFFS(T) elements remain, so every
 * prefetch lands within the array; the non-prefetching remainder loop in
 * the enclosing routine template finishes the tail. */
# define VKERN_TEMPL_3V_PREF(OP3,T) \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
        PREFETCH_W (res, 3); \
        /* NOTE(review): CA0 is the literal 3 here (max locality), while \
         * the 2V variant passes the caller-supplied CW -- verify this \
         * asymmetry is intended. */ \
        PREF_AHEAD3(T,3,MAX(1,CACHE_LOC_READ),MAX(1,CACHE_LOC_READ)); \
        UNR_KERNEL5_PREP; \
        do { \
            UNR_PREF_KERNEL5(OP3,T,CACHE_LOC_WRITE,CACHE_LOC_READ,CACHE_LOC_READ); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
        UNR_KERNEL5_FIX; \
    }
/* 2-stream variant: PREFETCH_X/CW chosen by the caller for the res stream. */
# define VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_X,CW) \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
        PREFETCH_X (res, 3); \
        PREF_AHEAD2(T,PREFETCH_X,CW,MAX(1,CACHE_LOC_READ)); \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_PREF_KERNEL4(OP2,T,PREFETCH_X,CW,CACHE_LOC_READ); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
        UNR_KERNEL4_FIX; \
    }
/* 2-stream variant with res stride RI. */
# define VKERN_TEMPL_2V_PREF_STRIDE(OP2,T,PREFETCH_X,CW,RI) \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
        PREFETCH_X (res, 3); \
        PREF_AHEAD2_STRIDE(T,PREFETCH_X,CW,MAX(1,CACHE_LOC_READ),RI); \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_PREF_KERNEL4_STRIDE(OP2,T,PREFETCH_X,CW,CACHE_LOC_READ,RI); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
        UNR_KERNEL4_FIX; \
    }
/* 1-stream variant: only the res stream exists. */
# define VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_X,CW) \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
        PREFETCH_X (res, 3); \
        PREF_AHEAD1(T,PREFETCH_X,CW); \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_PREF_KERNEL3(OP1,T,PREFETCH_X,CW); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
        UNR_KERNEL3_FIX; \
    }
#else
/* Prefetching disabled: the fragments collapse to no-ops and the plain
 * unrolled loops in the routine templates do all the work. */
# define VKERN_TEMPL_3V_PREF(OP,T) do {} while (0)
# define VKERN_TEMPL_2V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
# define VKERN_TEMPL_2V_PREF_STRIDE(OP2,T,PREFETCH_X,CW,RI) do {} while (0)
# define VKERN_TEMPL_1V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
#endif /* USE_PREFETCH */
1045
1046
/****************************************************************
 * Templates for routines
 ****************************************************************/
1050
1060
1061
1063#define VKERN_TEMPL_3V(FNAME,OP3) \
1064INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
1065 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, const T* RESTRICT const);) \
1066template <typename T> \
1067VEC_INLINE void FNAME (const unsigned long sz, \
1068 T* RESTRICT const _res, \
1069 const T* RESTRICT const _v1, \
1070 const T* RESTRICT const _v2) \
1071{ \
1072 PREFETCH_W(_res, 3); \
1073 PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
1074 REGISTER const T *v1 = _v1, *v2 = _v2; \
1075 REGISTER T *res = _res; \
1076 REGISTER long i = sz; \
1077 VKERN_TEMPL_3V_PREF(OP3,T); \
1078 \
1079 if (LIKELY(i >= UNROLL_DEPTH)) { \
1080 UNR_KERNEL5_PREP; \
1081 do { \
1082 UNR_KERNEL5(OP3); \
1083 } while (i >= UNROLL_DEPTH); \
1084 UNR_KERNEL5_FIX; \
1085 } \
1086 \
1087 for (; i; --i) { \
1088 OP3(*res, *v1, *v2, f1, f2); \
1089 ++v1; ++v2; ++res; \
1090 } \
1091}
1092
1094#define VKERN_TEMPL_3V_C(FNAME,OP3) \
1095INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
1096 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
1097 const T* RESTRICT const, LCTYPED(T));) \
1098template <typename T> \
1099VEC_INLINE void FNAME (const unsigned long sz, \
1100 T* RESTRICT const _res, \
1101 const T* RESTRICT const _v1, \
1102 const T* RESTRICT const _v2, \
1103 LCTYPE(T) f2) \
1104{ \
1105 PREFETCH_W(_res, 3); \
1106 PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
1107 REGISTER const T *v1 = _v1, *v2 = _v2; \
1108 REGISTER T *res = _res; \
1109 REGISTER long i = sz; \
1110 VKERN_TEMPL_3V_PREF(OP3,T); \
1111 \
1112 if (LIKELY(i >= UNROLL_DEPTH)) { \
1113 UNR_KERNEL5_PREP; \
1114 do { \
1115 UNR_KERNEL5(OP3); \
1116 } while (i >= UNROLL_DEPTH); \
1117 UNR_KERNEL5_FIX; \
1118 } \
1119 \
1120 for (; i; --i) { \
1121 OP3(*res, *v1, *v2, f1, f2); \
1122 ++v1; ++v2; ++res; \
1123 } \
1124}
1125
1127#define VKERN_TEMPL_3V_CC(FNAME,OP3) \
1128INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
1129 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
1130 const T* RESTRICT const, LCTYPED(T), LCTYPED(T));) \
1131template <typename T> \
1132VEC_INLINE void FNAME (const unsigned long sz, \
1133 T* RESTRICT const _res, \
1134 const T* RESTRICT const _v1, \
1135 const T* RESTRICT const _v2, \
1136 LCTYPE(T) f1, \
1137 LCTYPE(T) f2) \
1138{ \
1139 PREFETCH_W(_res, 3); \
1140 PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
1141 REGISTER long i = sz; \
1142 REGISTER const T *v1 = _v1, *v2 = _v2; \
1143 REGISTER T *res = _res; \
1144 VKERN_TEMPL_3V_PREF(OP3,T); \
1145 \
1146 if (LIKELY(i >= UNROLL_DEPTH)) { \
1147 UNR_KERNEL5_PREP; \
1148 do { \
1149 UNR_KERNEL5(OP3); \
1150 } while (i >= UNROLL_DEPTH); \
1151 UNR_KERNEL5_FIX; \
1152 } \
1153 \
1154 for (; i; --i) { \
1155 OP3(*res, *v1, *v2, f1, f2); \
1156 ++v1; ++v2; ++res; \
1157 } \
1158}
1159
1161#define VKERN_TEMPL_2V(FNAME,OP2) \
1162INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
1163 (const unsigned long, T* RESTRICT const, const T* RESTRICT const);) \
1164template <typename T> \
1165VEC_INLINE void FNAME (const unsigned long sz, \
1166 T* RESTRICT const _res, \
1167 const T* RESTRICT const _v1) \
1168{ \
1169 PREFETCH_W(_res, 3); \
1170 PREFETCH_R(_v1, 3); \
1171 REGISTER const T *v1 = _v1; \
1172 REGISTER T* res = _res; \
1173 REGISTER long i = sz; \
1174 VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_WRITE); \
1175 \
1176 if (LIKELY(i >= UNROLL_DEPTH)) { \
1177 UNR_KERNEL4_PREP; \
1178 do { \
1179 UNR_KERNEL4(OP2); \
1180 } while (i >= UNROLL_DEPTH); \
1181 UNR_KERNEL4_FIX; \
1182 } \
1183 \
1184 for (; i; --i) { \
1185 OP2(*res, *v1, f1, f2); \
1186 ++v1; ++res; \
1187 } \
1188}
1189
1191#define VKERN_TEMPL_2V_C(FNAME,OP2) \
1192INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
1193 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
1194 LCTYPED(T));) \
1195template <typename T> \
1196VEC_INLINE void FNAME (const unsigned long sz, \
1197 T* RESTRICT const _res, \
1198 const T* RESTRICT const _v1, \
1199 LCTYPE(T) f2) \
1200{ \
1201 PREFETCH_W(_res, 3); \
1202 PREFETCH_R(_v1, 3); \
1203 REGISTER const T *v1 = _v1; \
1204 REGISTER T* res = _res; \
1205 REGISTER long i = sz; \
1206 VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
1207 \
1208 if (LIKELY(i >= UNROLL_DEPTH)) { \
1209 UNR_KERNEL4_PREP; \
1210 do { \
1211 UNR_KERNEL4(OP2); \
1212 } while (i >= UNROLL_DEPTH); \
1213 UNR_KERNEL4_FIX; \
1214 } \
1215 \
1216 for (; i; --i) { \
1217 OP2(*res, *v1, f1, f2); \
1218 ++v1; ++res; \
1219 } \
1220}
1221
1223#define VKERN_TEMPL_2V_CC(FNAME,OP2) \
1224INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
1225 (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
1226 LCTYPED(T), LCTYPED(T));) \
1227template <typename T> \
1228VEC_INLINE void FNAME (const unsigned long sz, \
1229 T* RESTRICT const _res, \
1230 const T* RESTRICT const _v1, \
1231 LCTYPE(T) f1, \
1232 LCTYPE(T) f2) \
1233{ \
1234 PREFETCH_W(_res, 3); \
1235 PREFETCH_R(_v1, 3); \
1236 REGISTER const T *v1 = _v1; \
1237 REGISTER T* res = _res; \
1238 REGISTER long i = sz; \
1239 VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
1240 \
1241 if (LIKELY(i >= UNROLL_DEPTH)) { \
1242 UNR_KERNEL4_PREP; \
1243 do { \
1244 UNR_KERNEL4(OP2); \
1245 } while (i >= UNROLL_DEPTH); \
1246 UNR_KERNEL4_FIX; \
1247 } \
1248 \
1249 for (; i; --i) { \
1250 OP2(*res, *v1, f1, f2); \
1251 ++v1; ++res; \
1252 } \
1253}
1254
1256#define VKERN_TEMPL_2V_T(FNAME,OP2,TYPE) \
1257INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
1258 (const unsigned long, const T* RESTRICT const, \
1259 const T* RESTRICT const, TYPE&);) \
1260template <typename T> \
1261VEC_INLINE void FNAME (const unsigned long sz, \
1262 const T* RESTRICT const _res, \
1263 const T* RESTRICT const _v1, \
1264 TYPE &_f2) \
1265{ \
1266 PREFETCH_R(_res, 3); \
1267 PREFETCH_R(_v1, 3); \
1268 REGISTER const T *v1 = _v1; \
1269 REGISTER const T* res = _res; \
1270 /* REGISTER typename tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
1271 REGISTER TYPE f2(_f2), f1(0.0); \
1272 REGISTER long i = sz; \
1273 VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_READ); \
1274 \
1275 if (LIKELY(i >= UNROLL_DEPTH)) { \
1276 UNR_KERNEL4_PREP; \
1277 do { \
1278 UNR_KERNEL4(OP2); \
1279 } while (i >= UNROLL_DEPTH); \
1280 UNR_KERNEL4_FIX; \
1281 } \
1282 \
1283 for (; i; --i) { \
1284 OP2(*res, *v1, f1, f2); \
1285 ++v1; ++res; \
1286 } \
1287 _fin: \
1288 _f2 = f2 - f1; \
1289}
1290
1292#define VKERN_TEMPL_2V_T_STRIDE(FNAME,OP2,TYPE) \
1293INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
1294 (const unsigned long, const T* RESTRICT const, \
1295 const T* RESTRICT const, TYPE&, const unsigned);) \
1296template <typename T> \
1297VEC_INLINE void FNAME (const unsigned long sz, \
1298 const T* RESTRICT const _res, \
1299 const T* RESTRICT const _v1, \
1300 TYPE &_f2, const unsigned rincr) \
1301{ \
1302 PREFETCH_R(_res, 3); \
1303 PREFETCH_R(_v1, 3); \
1304 REGISTER const T *v1 = _v1; \
1305 REGISTER const T* res = _res; \
1306 /* REGISTER typename tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
1307 REGISTER TYPE f2(_f2), f1(0.0); \
1308 REGISTER long i = sz; \
1309 VKERN_TEMPL_2V_PREF_STRIDE(OP2,T,PREFETCH_R,CACHE_LOC_READ,rincr); \
1310 \
1311 if (LIKELY(i >= UNROLL_DEPTH)) { \
1312 UNR_KERNEL4_PREP; \
1313 do { \
1314 UNR_KERNEL4_STRIDE(OP2,rincr); \
1315 } while (i >= UNROLL_DEPTH); \
1316 UNR_KERNEL4_FIX; \
1317 } \
1318 \
1319 for (; i; --i) { \
1320 OP2(*res, *v1, f1, f2); \
1321 ++v1; res += rincr; \
1322 } \
1323 _fin: \
1324 _f2 = f2 - f1; \
1325}
1326
1328#define VKERN_TEMPL_1V(FNAME,OP1) \
1329INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
1330 (const unsigned long, T* RESTRICT const);) \
1331template <typename T> \
1332VEC_INLINE void FNAME (const unsigned long sz, \
1333 T* RESTRICT const _res) \
1334{ \
1335 PREFETCH_W(_res, 3); \
1336 REGISTER long i = sz; \
1337 REGISTER T* res = _res; \
1338 VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
1339 \
1340 if (LIKELY(i >= UNROLL_DEPTH)) { \
1341 UNR_KERNEL3_PREP; \
1342 do { \
1343 UNR_KERNEL3(OP1); \
1344 } while (i >= UNROLL_DEPTH); \
1345 UNR_KERNEL3_FIX; \
1346 } \
1347 \
1348 for (; i; --i) { \
1349 OP1(*res, f1, f2); \
1350 ++res; \
1351 } \
1352}
1353
1355#define VKERN_TEMPL_1V_C(FNAME,OP1) \
1356INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
1357 (const unsigned long, T* RESTRICT const, LCTYPED(T));) \
1358template <typename T> \
1359VEC_INLINE void FNAME (const unsigned long sz, \
1360 T* RESTRICT const _res, \
1361 LCTYPE(T) f2) \
1362{ \
1363 PREFETCH_W(_res, 3); \
1364 REGISTER long i = sz; \
1365 REGISTER T* res = _res; \
1366 VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
1367 \
1368 if (LIKELY(i >= UNROLL_DEPTH)) { \
1369 UNR_KERNEL3_PREP; \
1370 do { \
1371 UNR_KERNEL3(OP1); \
1372 } while (i >= UNROLL_DEPTH); \
1373 UNR_KERNEL3_FIX; \
1374 } \
1375 \
1376 for (; i; --i) { \
1377 OP1(*res, f1, f2); \
1378 ++res; \
1379 } \
1380}
1381
1383#define VKERN_TEMPL_1V_CC(FNAME,OP1) \
1384INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
1385 (const unsigned long, T* RESTRICT const, \
1386 LCTYPED(T), LCTYPED(T));) \
1387template <typename T> \
1388VEC_INLINE void FNAME (const unsigned long sz, \
1389 T* RESTRICT const _res, \
1390 LCTYPE(T) f1, \
1391 LCTYPE(T) f2) \
1392{ \
1393 PREFETCH_W(_res, 3); \
1394 REGISTER long i = sz; \
1395 REGISTER T* res = _res; \
1396 VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
1397 \
1398 if (LIKELY(i >= UNROLL_DEPTH)) { \
1399 UNR_KERNEL3_PREP; \
1400 do { \
1401 UNR_KERNEL3(OP1); \
1402 } while (i >= UNROLL_DEPTH); \
1403 UNR_KERNEL3_FIX; \
1404 } \
1405 \
1406 for (; i; --i) { \
1407 OP1(*res, f1, f2); \
1408 ++res; \
1409 } \
1410}
1411
1412
1416#define VKERN_TEMPL_1V_T(FNAME,OP1,TYPE) \
1417INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
1418 (const unsigned long, const T* const, TYPE&);) \
1419template <typename T> \
1420VEC_INLINE void FNAME (const unsigned long sz, \
1421 const T* const _res, \
1422 TYPE &_f2) \
1423{ \
1424 PREFETCH_R(_res, 3); \
1425 /* REGISTER typename tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
1426 REGISTER TYPE f2(_f2), f1(0.0); \
1427 REGISTER const T* res = _res; \
1428 REGISTER long i = sz; \
1429 VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); \
1430 \
1431 if (LIKELY(i >= UNROLL_DEPTH)) { \
1432 UNR_KERNEL3_PREP; \
1433 do { \
1434 UNR_KERNEL3(OP1); \
1435 } while (i >= UNROLL_DEPTH); \
1436 UNR_KERNEL3_FIX; \
1437 } \
1438 \
1439 for (; i; --i) { \
1440 OP1(*res, f1, f2); \
1441 ++res; \
1442 } \
1443 _f2 = f2 - f1; \
1444}
1445
1450#define VKERN_TEMPL_1V_T_LD(FNAME,OP1,TYPE) \
1451INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
1452 (const unsigned long, const T* const, TYPE&);) \
1453template <typename T> \
1454VEC_INLINE void FNAME (const unsigned long sz, \
1455 const T* const _res, \
1456 TYPE &_f2) \
1457{ \
1458 PREFETCH_R(_res, 3); \
1459 /* REGISTER typename tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
1460 REGISTER LONG_DOUBLE f2(_f2); \
1461 REGISTER const T* res = _res; \
1462 REGISTER long i = sz; \
1463 VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); \
1464 \
1465 if (LIKELY(i >= UNROLL_DEPTH)) { \
1466 UNR_KERNEL3_PREP; \
1467 do { \
1468 UNR_KERNEL3(OP1); \
1469 } while (i >= UNROLL_DEPTH); \
1470 UNR_KERNEL3_FIX; \
1471 } \
1472 \
1473 for (; i; --i) { \
1474 OP1(*res, f1, f2); \
1475 ++res; \
1476 } \
1477 _f2 = f2; \
1478}
1479
1480#endif /* TBCI_UNROLL_PREFETCH_DEF_H */