TBCI Numerical high perf. C++ Library 2.8.0
unroll_prefetch_def2.h
Go to the documentation of this file.
1
7
8#ifndef TBCI_UNROLL_PREFETCH_DEF2_H
9#define TBCI_UNROLL_PREFETCH_DEF2_H
10
11//#include "tbci/basics.h"
12
14#define LCTYPE(T) REGISTER typename tbci_traits<T>::loop_const_refval_type
15#define LCTYPED(T) REGISTER tbci_traits<T>::loop_const_refval_type
16
41
42#ifndef UNROLL_DEPTH
43# define UNROLL_DEPTH 4
44#endif
45
46
/***********************************************************
 * 3 pointer operations
 ***********************************************************/

/* One element of a 3-pointer kernel (res <- OPER(v1, v2, f1, f2)) with
 * software prefetch.  OPER(dst, a, b, f1, f2) is the per-element
 * operation, T the element type, CA0/CA1/CA2 the cache-locality hints
 * for res/v1/v2.  Decrements i by one, advances all three pointers by
 * one and prefetches the lines PREF_OFFS(T) elements ahead.  The
 * prefetches are interleaved between statements on purpose - do not
 * reorder. */
#define UNROLL1_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    --i; \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
    ++v1; \
    PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
    ++v2; \
    PREFETCH_W(res+PREF_OFFS(T), CA0); \
    ++res

/* Same single step without prefetching (tail / non-prefetch loop). */
#define UNROLL1_KERNEL5(OPER) \
    --i; \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    ++v1; ++v2; ++res

/* No per-loop setup or fixup is needed at unroll depth 1. */
#define UNROLL1_KERNEL5_PREPARE do {} while(0)
#define UNROLL1_KERNEL5_FIXUP do {} while(0)
71
/* Two elements of the 3-pointer kernel with prefetch.  The branch on
 * EL_PER_CL(T) (elements per cache line, a compile-time constant) picks
 * how many cache lines the 2-element step spans, so dead branches fold
 * away.  Note: the trailing line-continuation after the closing brace
 * was removed; it spliced the following blank line into the macro. */
#define UNROLL2_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        i -= 2; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        v1 += 2; v2 += 2; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
        res += 2; \
    } else { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        i -= 2; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        v1 += 2; v2 += 2; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        res += 2; \
    }

/* Two elements of the 3-pointer kernel without prefetching. */
#define UNROLL2_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    i -= 2; \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    v1 += 2; v2 += 2; res += 2

/* No per-loop setup or fixup is needed at unroll depth 2. */
#define UNROLL2_KERNEL5_PREPARE do {} while(0)
#define UNROLL2_KERNEL5_FIXUP do {} while(0)
108
/* Four elements of the 3-pointer kernel with prefetch.  One branch per
 * cache-line density (EL_PER_CL(T) is a compile-time constant, so only
 * one branch survives): denser lines need fewer prefetches.  Statement
 * interleaving is deliberate scheduling - do not reorder. */
#define UNROLL4_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        i -= 4; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+3, CA2); \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 4; v2 += 4; \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+3, CA0); \
        res += 4; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        i -= 4; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 4; v2 += 4; \
        res += 4; \
    } else { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        i -= 4; \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 4; v2 += 4; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        res += 4; \
    }

/* Four elements of the 3-pointer kernel without prefetching. */
#define UNROLL4_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    i -= 4; \
    OPER(res[2], v1[2], v2[2], f1, f2); \
    OPER(res[3], v1[3], v2[3], f1, f2); \
    v1 += 4; v2 += 4; \
    res += 4

/* No per-loop setup or fixup is needed at unroll depth 4. */
#define UNROLL4_KERNEL5_PREPARE do {} while(0)
#define UNROLL4_KERNEL5_FIXUP do {} while(0)
171
/* Eight elements of the 3-pointer kernel with prefetch.  Four branches
 * for 1, 2, 4 and >4 elements per cache line; EL_PER_CL(T) is a
 * compile-time constant so only one branch is emitted.  The OPER /
 * prefetch interleaving is deliberate scheduling - do not reorder. */
#define UNROLL8_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+5, CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+7, CA1); \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        i -= 8; \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+3, CA2); \
        OPER(res[4], v1[4], v2[4], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+5, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+6, CA2); \
        OPER(res[5], v1[5], v2[5], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+7, CA2); \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[6], v1[6], v2[6], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+3, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[7], v1[7], v2[7], f1, f2); \
        v1 += 8; v2 += 8; \
        PREFETCH_W(res+PREF_OFFS(T)+5, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+6, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+7, CA0); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        i -= 8; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        OPER(res[4], v1[4], v2[4], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+6, CA2); \
        OPER(res[5], v1[5], v2[5], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[6], v1[6], v2[6], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+6, CA0); \
        OPER(res[7], v1[7], v2[7], f1, f2); \
        v1 += 8; v2 += 8; \
        res += 8; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        i -= 8; \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[4], v1[4], v2[4], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
        OPER(res[5], v1[5], v2[5], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        OPER(res[6], v1[6], v2[6], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[7], v1[7], v2[7], f1, f2); \
        v1 += 8; v2 += 8; \
        res += 8; \
    } else { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        i -= 8; \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[4], v1[4], v2[4], f1, f2); \
        OPER(res[5], v1[5], v2[5], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[6], v1[6], v2[6], f1, f2); \
        OPER(res[7], v1[7], v2[7], f1, f2); \
        v1 += 8; v2 += 8; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        res += 8; \
    }

/* Eight elements of the 3-pointer kernel without prefetching. */
#define UNROLL8_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    OPER(res[2], v1[2], v2[2], f1, f2); \
    OPER(res[3], v1[3], v2[3], f1, f2); \
    i -= 8; \
    OPER(res[4], v1[4], v2[4], f1, f2); \
    OPER(res[5], v1[5], v2[5], f1, f2); \
    OPER(res[6], v1[6], v2[6], f1, f2); \
    OPER(res[7], v1[7], v2[7], f1, f2); \
    v1 += 8; v2 += 8; \
    res += 8

/* No per-loop setup or fixup is needed at unroll depth 8. */
#define UNROLL8_KERNEL5_PREPARE do {} while(0)
#define UNROLL8_KERNEL5_FIXUP do {} while(0)
286
/* Warm-up prefetch for the 3-pointer kernels: before the main loop,
 * fetch the next PREFETCH_AHEAD-1 cache lines of v1, v2 and res with
 * hints CA1/CA2/CA0.  PREFETCH_AHEAD is a compile-time constant, so the
 * if/else chain collapses to one branch. */
#define PREF_AHEAD3(T,CA0,CA1,CA2) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 7, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 2, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 3, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 4, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 5, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 6, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 7, CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 2, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 3, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 4, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 5, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 6, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 7, CA0); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 8, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 9, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*10, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*11, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*12, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*13, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*14, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*15, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 8, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 9, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*10, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*11, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*12, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*13, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*14, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*15, CA2); \
        PREFETCH_W(res+EL_PER_CL(T)* 8, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 9, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*10, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*11, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*12, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*13, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*14, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*7, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*2, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*3, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*4, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*5, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*6, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*7, CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*3, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*4, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*5, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*6, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*2, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*3, CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
    }
372
373
/***********************************************************
 * 2 pointer operations
 ***********************************************************/

/* One element of a 2-pointer kernel (res <- OPER(v1, f1, f2)) with
 * prefetch.  PREFETCH_X selects read vs. write prefetch for res (the
 * destination may also be read by OPER, e.g. res += v1). */
#define UNROLL1_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    OPER(res[0], v1[0], f1, f2); \
    --i; \
    PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
    ++v1; \
    PREFETCH_X(res+PREF_OFFS(T), CA0); \
    ++res

/* One element of the 2-pointer kernel without prefetching. */
#define UNROLL1_KERNEL4(OPER) \
    --i; \
    OPER(res[0], v1[0], f1, f2); \
    ++v1; ++res

/* No per-loop setup or fixup is needed at unroll depth 1. */
#define UNROLL1_KERNEL4_PREPARE do {} while(0)
#define UNROLL1_KERNEL4_FIXUP do {} while(0)
396
/* Two elements of the 2-pointer kernel with prefetch; branch picks the
 * prefetch count from the cache-line density.  The trailing
 * line-continuation after the closing brace was removed (it spliced the
 * following blank line into the macro). */
#define UNROLL2_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 2; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        OPER(res[1], v1[1], f1, f2); \
        v1 += 2; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        res += 2; \
    } else { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 2; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], f1, f2); \
        v1 += 2; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        res += 2; \
    }

/* Two elements of the 2-pointer kernel without prefetching; v1 is
 * advanced early, so element 1 is read through v1[-1]. */
#define UNROLL2_KERNEL4(OPER) \
    OPER(res[0], v1[0], f1, f2); \
    v1 += 2; i -= 2; \
    OPER(res[1], v1[-1],f1, f2); \
    res += 2

/* No per-loop setup or fixup is needed at unroll depth 2. */
#define UNROLL2_KERNEL4_PREPARE do {} while(0)
#define UNROLL2_KERNEL4_FIXUP do {} while(0)
430
/* Four elements of the 2-pointer kernel with prefetch; one branch per
 * cache-line density.  v1 is advanced before the last OPER, hence the
 * v1[-1] access.  Statement interleaving is deliberate scheduling. */
#define UNROLL4_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        OPER(res[1], v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[2], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
        OPER(res[3], v1[-1], f1, f2); \
        res += 4; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[2], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], v1[-1], f1, f2); \
        res += 4; \
    } else { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        OPER(res[1], v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[2], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[3], v1[-1], f1, f2); \
        res += 4; \
    }

/* Four elements of the 2-pointer kernel without prefetching; v1 is
 * advanced mid-way, so the last two elements use negative offsets. */
#define UNROLL4_KERNEL4(OPER) \
    OPER(res[0], v1[0], f1, f2); \
    OPER(res[1], v1[1], f1, f2); \
    v1 += 4; i -= 4; \
    OPER(res[2], v1[-2], f1, f2); \
    OPER(res[3], v1[-1], f1, f2); \
    res += 4

/* No per-loop setup or fixup is needed at unroll depth 4. */
#define UNROLL4_KERNEL4_PREPARE do {} while(0)
#define UNROLL4_KERNEL4_FIXUP do {} while(0)
485
/* Eight elements of the 2-pointer kernel with prefetch; four branches
 * for 1, 2, 4 and >4 elements per cache line.  v1 is advanced mid-way,
 * so later elements are read through negative offsets.  Statement
 * interleaving is deliberate scheduling - do not reorder. */
#define UNROLL8_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        OPER(res[1], v1[1], f1, f2); \
        i -= 8; \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        OPER(res[2], v1[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+5, CA1); \
        OPER(res[3], v1[3], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+7, CA1); \
        OPER(res[4], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[5], v1[-3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
        OPER(res[6], v1[-2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+5, CA0); \
        OPER(res[7], v1[-1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+7, CA0); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], f1, f2); \
        i -= 8; \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[2], v1[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[3], v1[3], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        OPER(res[4], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[5], v1[-3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[6], v1[-2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[7], v1[-1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], f1, f2); \
        OPER(res[2], v1[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[3], v1[3], f1, f2); \
        OPER(res[4], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[5], v1[-3], f1, f2); \
        i -= 8; \
        OPER(res[6], v1[-2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[7], v1[-1], f1, f2); \
        res += 8; \
    } else { \
        OPER(res[0], v1[0], f1, f2); \
        OPER(res[1], v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[2], v1[2], f1, f2); \
        OPER(res[3], v1[3], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[4], v1[-4], f1, f2); \
        OPER(res[5], v1[-3], f1, f2); \
        i -= 8; \
        OPER(res[6], v1[-2], f1, f2); \
        OPER(res[7], v1[-1], f1, f2); \
        res += 8; \
    }

/* Eight elements of the 2-pointer kernel without prefetching. */
#define UNROLL8_KERNEL4(OPER) \
    OPER(res[0], v1[0], f1, f2); \
    OPER(res[1], v1[1], f1, f2); \
    OPER(res[2], v1[2], f1, f2); \
    OPER(res[3], v1[3], f1, f2); \
    v1 += 8; i -= 8; \
    OPER(res[4], v1[-4], f1, f2); \
    OPER(res[5], v1[-3], f1, f2); \
    OPER(res[6], v1[-2], f1, f2); \
    OPER(res[7], v1[-1], f1, f2); \
    res += 8

/* No per-loop setup or fixup is needed at unroll depth 8. */
#define UNROLL8_KERNEL4_PREPARE do {} while(0)
#define UNROLL8_KERNEL4_FIXUP do {} while(0)
585
/* Warm-up prefetch for the 2-pointer kernels: fetch the next
 * PREFETCH_AHEAD-1 cache lines of v1 (read) and res (via PREFETCH_X).
 * PREFETCH_AHEAD is a compile-time constant; only one branch survives. */
#define PREF_AHEAD2(T,PREFETCH_X,CA0,CA1) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 7, CA1); \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 3, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 4, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 5, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 6, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 7, CA0); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 8, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 9, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*10, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*11, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*12, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*13, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*14, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*15, CA1); \
        PREFETCH_X(res+EL_PER_CL(T)* 8, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 9, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*10, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*11, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*12, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*13, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*14, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*7, CA1); \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*4, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*5, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*6, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
    }
646
/***********************************************************
 * 1 pointer operations
 ***********************************************************/

/* One element of a 1-pointer kernel (res <- OPER(f1, f2)) with prefetch
 * of the destination via PREFETCH_X (read or write variant). */
#define UNROLL1_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    OPER(res[0], f1, f2); \
    --i; \
    PREFETCH_X(res+PREF_OFFS(T), CA0); \
    ++res

/* One element of the 1-pointer kernel without prefetching. */
#define UNROLL1_KERNEL3(OPER) \
    --i; \
    OPER(res[0], f1, f2); \
    ++res

/* No per-loop setup or fixup is needed at unroll depth 1. */
#define UNROLL1_KERNEL3_PREPARE do {} while(0)
#define UNROLL1_KERNEL3_FIXUP do {} while(0)
667
/* Two elements of the 1-pointer kernel with prefetch; branch on
 * cache-line density.  The trailing line-continuation after the closing
 * brace was removed (it spliced the following blank line in). */
#define UNROLL2_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        i -= 2; \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        res += 2; \
    } else { \
        OPER(res[0], f1, f2); \
        i -= 2; \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        res += 2; \
    }

/* Two elements of the 1-pointer kernel without prefetching. */
#define UNROLL2_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    i -= 2; \
    OPER(res[1], f1, f2); \
    res += 2

/* No per-loop setup or fixup is needed at unroll depth 2. */
#define UNROLL2_KERNEL3_PREPARE do {} while(0)
#define UNROLL2_KERNEL3_FIXUP do {} while(0)
696
/* Four elements of the 1-pointer kernel with prefetch; one branch per
 * cache-line density.  Statement interleaving is deliberate scheduling. */
#define UNROLL4_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        i -= 4; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
        res += 4; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        i -= 4; \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        res += 4; \
    } else { \
        OPER(res[0], f1, f2); \
        i -= 4; \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[2], f1, f2); \
        OPER(res[3], f1, f2); \
        res += 4; \
    }

/* Four elements of the 1-pointer kernel without prefetching. */
#define UNROLL4_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    i -= 4; \
    OPER(res[2], f1, f2); \
    OPER(res[3], f1, f2); \
    res += 4

/* No per-loop setup or fixup is needed at unroll depth 4. */
#define UNROLL4_KERNEL3_PREPARE do {} while(0)
#define UNROLL4_KERNEL3_FIXUP do {} while(0)
741
/* Eight elements of the 1-pointer kernel with prefetch; four branches
 * for 1, 2, 4 and >4 elements per cache line.  Statement interleaving
 * is deliberate scheduling - do not reorder. */
#define UNROLL8_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
        OPER(res[4], f1, f2); \
        i -= 8; \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+5, CA0); \
        OPER(res[6], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
        OPER(res[7], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+7, CA0); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[2], f1, f2); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[4], f1, f2); \
        i -= 8; \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[6], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
        OPER(res[7], f1, f2); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[2], f1, f2); \
        OPER(res[3], f1, f2); \
        i -= 8; \
        OPER(res[4], f1, f2); \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[6], f1, f2); \
        OPER(res[7], f1, f2); \
        res += 8; \
    } else { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[3], f1, f2); \
        OPER(res[4], f1, f2); \
        OPER(res[5], f1, f2); \
        i -= 8; \
        OPER(res[6], f1, f2); \
        OPER(res[7], f1, f2); \
        res += 8; \
    }

/* Eight elements of the 1-pointer kernel without prefetching. */
#define UNROLL8_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    OPER(res[2], f1, f2); \
    OPER(res[3], f1, f2); \
    i -= 8; \
    OPER(res[4], f1, f2); \
    OPER(res[5], f1, f2); \
    OPER(res[6], f1, f2); \
    OPER(res[7], f1, f2); \
    res += 8

/* No per-loop setup or fixup is needed at unroll depth 8. */
#define UNROLL8_KERNEL3_PREPARE do {} while(0)
#define UNROLL8_KERNEL3_FIXUP do {} while(0)
822
/* Warm-up prefetch for the 1-pointer kernels: fetch the next
 * PREFETCH_AHEAD-1 cache lines of res via PREFETCH_X.  PREFETCH_AHEAD
 * is a compile-time constant; only one branch survives. */
#define PREF_AHEAD1(T,PREFETCH_X,CA0) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 3, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 4, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 5, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 6, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 7, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 8, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 9, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*10, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*11, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*12, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*13, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*14, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*4, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*5, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*6, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
    }
857
858
859// Select default kernels
860#if UNROLL_DEPTH == 1
861
862# define UNR_PREF_KERNEL5 UNROLL1_PREF_KERNEL5
863# define UNR_KERNEL5 UNROLL1_KERNEL5
864# define UNR_KERNEL5_PREP UNROLL1_KERNEL5_PREPARE
865# define UNR_KERNEL5_FIX UNROLL1_KERNEL5_FIXUP
866
867# define UNR_PREF_KERNEL4 UNROLL1_PREF_KERNEL4
868# define UNR_KERNEL4 UNROLL1_KERNEL4
869# define UNR_KERNEL4_PREP UNROLL1_KERNEL4_PREPARE
870# define UNR_KERNEL4_FIX UNROLL1_KERNEL4_FIXUP
871
872# define UNR_PREF_KERNEL3 UNROLL1_PREF_KERNEL3
873# define UNR_KERNEL3 UNROLL1_KERNEL3
874# define UNR_KERNEL3_PREP UNROLL1_KERNEL3_PREPARE
875# define UNR_KERNEL3_FIX UNROLL1_KERNEL3_FIXUP
876
877#elif UNROLL_DEPTH == 2
878
879# define UNR_PREF_KERNEL5 UNROLL2_PREF_KERNEL5
880# define UNR_KERNEL5 UNROLL2_KERNEL5
881# define UNR_KERNEL5_PREP UNROLL2_KERNEL5_PREPARE
882# define UNR_KERNEL5_FIX UNROLL2_KERNEL5_FIXUP
883
884# define UNR_PREF_KERNEL4 UNROLL2_PREF_KERNEL4
885# define UNR_KERNEL4 UNROLL2_KERNEL4
886# define UNR_KERNEL4_PREP UNROLL2_KERNEL4_PREPARE
887# define UNR_KERNEL4_FIX UNROLL2_KERNEL4_FIXUP
888
889# define UNR_PREF_KERNEL3 UNROLL2_PREF_KERNEL3
890# define UNR_KERNEL3 UNROLL2_KERNEL3
891# define UNR_KERNEL3_PREP UNROLL2_KERNEL3_PREPARE
892# define UNR_KERNEL3_FIX UNROLL2_KERNEL3_FIXUP
893
894#elif UNROLL_DEPTH == 4
895
896# define UNR_PREF_KERNEL5 UNROLL4_PREF_KERNEL5
897# define UNR_KERNEL5 UNROLL4_KERNEL5
898# define UNR_KERNEL5_PREP UNROLL4_KERNEL5_PREPARE
899# define UNR_KERNEL5_FIX UNROLL4_KERNEL5_FIXUP
900
901# define UNR_PREF_KERNEL4 UNROLL4_PREF_KERNEL4
902# define UNR_KERNEL4 UNROLL4_KERNEL4
903# define UNR_KERNEL4_PREP UNROLL4_KERNEL4_PREPARE
904# define UNR_KERNEL4_FIX UNROLL4_KERNEL4_FIXUP
905
906# define UNR_PREF_KERNEL3 UNROLL4_PREF_KERNEL3
907# define UNR_KERNEL3 UNROLL4_KERNEL3
908# define UNR_KERNEL3_PREP UNROLL4_KERNEL3_PREPARE
909# define UNR_KERNEL3_FIX UNROLL4_KERNEL3_FIXUP
910
911#elif UNROLL_DEPTH == 8
912
913# define UNR_PREF_KERNEL5 UNROLL8_PREF_KERNEL5
914# define UNR_KERNEL5 UNROLL8_KERNEL5
915# define UNR_KERNEL5_PREP UNROLL8_KERNEL5_PREPARE
916# define UNR_KERNEL5_FIX UNROLL8_KERNEL5_FIXUP
917
918# define UNR_PREF_KERNEL4 UNROLL8_PREF_KERNEL4
919# define UNR_KERNEL4 UNROLL8_KERNEL4
920# define UNR_KERNEL4_PREP UNROLL8_KERNEL4_PREPARE
921# define UNR_KERNEL4_FIX UNROLL8_KERNEL4_FIXUP
922
923# define UNR_PREF_KERNEL3 UNROLL8_PREF_KERNEL3
924# define UNR_KERNEL3 UNROLL8_KERNEL3
925# define UNR_KERNEL3_PREP UNROLL8_KERNEL3_PREPARE
926# define UNR_KERNEL3_FIX UNROLL8_KERNEL3_FIXUP
927
928#else
929
930# error "UNROLL_DEPTH may only be 1, 2, 4, 8"
931
932#endif /* UNROLL_DEPTH */
933
951
/****************************************************************
 * Macros with fragments for the implementation
 ****************************************************************/

#ifdef USE_PREFETCH
/* Prefetching main loop over the 3-pointer kernel: runs while at least
 * UNROLL_DEPTH+PREF_OFFS(T) elements remain, so the prefetches never
 * reach past the end of the arrays. */
# define VKERN_TEMPL_3V_PREF(OP3,T) \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
        PREFETCH_W (res, 3); \
        PREF_AHEAD3(T,3,MAX(1,CACHE_LOC_READ),MAX(1,CACHE_LOC_READ)); \
        UNR_KERNEL5_PREP; \
        do { \
            UNR_PREF_KERNEL5(OP3,T,CACHE_LOC_WRITE,CACHE_LOC_READ,CACHE_LOC_READ); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
        UNR_KERNEL5_FIX; \
    }
/* Prefetching main loop over the 2-pointer kernel; PREFETCH_X selects
 * read or write prefetch for res, CW its cache-locality hint. */
# define VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_X,CW) \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
        PREFETCH_X (res, 3); \
        PREF_AHEAD2(T,PREFETCH_X,CW,MAX(1,CACHE_LOC_READ)); \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_PREF_KERNEL4(OP2,T,PREFETCH_X,CW,CACHE_LOC_READ); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
        UNR_KERNEL4_FIX; \
    }
/* Prefetching main loop over the 1-pointer kernel. */
# define VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_X,CW) \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
        PREFETCH_X (res, 3); \
        PREF_AHEAD1(T,PREFETCH_X,CW); \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_PREF_KERNEL3(OP1,T,PREFETCH_X,CW); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
        UNR_KERNEL3_FIX; \
    }
#else
/* Without USE_PREFETCH the prefetch loops compile away to no-ops. */
# define VKERN_TEMPL_3V_PREF(OP,T) do {} while (0)
# define VKERN_TEMPL_2V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
# define VKERN_TEMPL_1V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
#endif /* USE_PREFETCH */
995
996
/****************************************************************
 * Templates for routines
 ****************************************************************/

/* Expands to the full definition of a 3-pointer vector routine
 * FNAME(sz, res, v1, v2) applying OP3 element-wise: optional prefetched
 * main loop, unrolled loop, then a scalar tail loop.
 * NOTE(review): the tail passes f1/f2 although this variant declares
 * neither - OP3 is presumably a macro that ignores them; confirm
 * against the operation macros. */
#define VKERN_TEMPL_3V(FNAME,OP3) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, T* RESTRICT const, const T* RESTRICT const, const T* RESTRICT const);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       const T* RESTRICT const _v2) \
{ \
    PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
    REGISTER const T *v1 = _v1, *v2 = _v2; \
    REGISTER T *res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_3V_PREF(OP3,T); \
 \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL5_PREP; \
        do { \
            UNR_KERNEL5(OP3); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL5_FIX; \
    } \
 \
    for (; i; --i) { \
        OP3(*res, *v1, *v2, f1, f2); \
        ++v1; ++v2; ++res; \
    } \
}
1041
/* Expands to a 3-pointer vector routine with one loop-constant scalar
 * f2: FNAME(sz, res, v1, v2, f2).  Same structure as VKERN_TEMPL_3V.
 * BUG FIX: the declaration of v1/v2 used a period instead of a comma
 * ("_v1. *v2 = _v2") which cannot compile once instantiated (it parses
 * toward the pointer-to-member operator); corrected to a comma. */
#define VKERN_TEMPL_3V_C(FNAME,OP3) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
      const T* RESTRICT const, LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       const T* RESTRICT const _v2, \
                       LCTYPE(T) f2) \
{ \
    PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
    REGISTER const T *v1 = _v1, *v2 = _v2; \
    REGISTER T *res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_3V_PREF(OP3,T); \
 \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL5_PREP; \
        do { \
            UNR_KERNEL5(OP3); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL5_FIX; \
    } \
 \
    for (; i; --i) { \
        OP3(*res, *v1, *v2, f1, f2); \
        ++v1; ++v2; ++res; \
    } \
}
1073
/** Expands to the definition of FNAME(sz, _res, _v1, _v2, f1, f2): applies
 *  the statement macro OP3(dst, a, b, f1, f2) element-wise over two source
 *  vectors and two scalar constants into res.
 *  Layout: optional prefetching main loop, plain unrolled loop, then a
 *  scalar cleanup loop for the remaining i < UNROLL_DEPTH elements.
 *  (Declaration order of i vs. the pointers differs from the siblings;
 *  harmless.) */
#define VKERN_TEMPL_3V_CC(FNAME,OP3) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
     const T* RESTRICT const, LCTYPED(T), LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       const T* RESTRICT const _v2, \
                       LCTYPE(T) f1, \
                       LCTYPE(T) f2) \
{ \
    /* warm the first cache lines of both sources */ \
    PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
    REGISTER long i = sz; \
    REGISTER const T *v1 = _v1, *v2 = _v2; \
    REGISTER T *res = _res; \
    VKERN_TEMPL_3V_PREF(OP3,T); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL5_PREP; \
        do { \
            UNR_KERNEL5(OP3); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL5_FIX; \
    } \
    \
    for (; i; --i) { \
        OP3(*res, *v1, *v2, f1, f2); \
        ++v1; ++v2; ++res; \
    } \
}
1106
/** Expands to the definition of FNAME(sz, _res, _v1): applies the statement
 *  macro OP2(dst, a, f1, f2) element-wise from one source vector into res.
 *  NOTE(review): f1/f2 are not declared in this template; the cleanup call
 *  only expands correctly for OP2 macros that ignore those arguments. */
#define VKERN_TEMPL_2V(FNAME,OP2) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const, const T* RESTRICT const);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1) \
{ \
    PREFETCH_R(_v1, 3); \
    REGISTER const T *v1 = _v1; \
    REGISTER T* res = _res; \
    REGISTER long i = sz; \
    /* NOTE(review): siblings _C/_CC pass PREFETCH_W here although res is \
     * written in all three templates; PREFETCH_R may be unintentional -- \
     * confirm before changing (performance hint only). */ \
    VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_KERNEL4(OP2); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL4_FIX; \
    } \
    \
    for (; i; --i) { \
        OP2(*res, *v1, f1, f2); \
        ++v1; ++res; \
    } \
}
1135
/** Expands to the definition of FNAME(sz, _res, _v1, f2): applies the
 *  statement macro OP2(dst, a, f1, f2) element-wise from one source vector
 *  and one scalar constant f2 into res.
 *  NOTE(review): f1 is not declared in this template; the cleanup call
 *  only expands correctly for OP2 macros that ignore that argument. */
#define VKERN_TEMPL_2V_C(FNAME,OP2) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
     LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       LCTYPE(T) f2) \
{ \
    PREFETCH_R(_v1, 3); \
    REGISTER const T *v1 = _v1; \
    REGISTER T* res = _res; \
    REGISTER long i = sz; \
    /* res is a write destination: prefetch it with write intent */ \
    VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_KERNEL4(OP2); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL4_FIX; \
    } \
    \
    for (; i; --i) { \
        OP2(*res, *v1, f1, f2); \
        ++v1; ++res; \
    } \
}
1166
/** Expands to the definition of FNAME(sz, _res, _v1, f1, f2): applies the
 *  statement macro OP2(dst, a, f1, f2) element-wise from one source vector
 *  and two scalar constants into res. */
#define VKERN_TEMPL_2V_CC(FNAME,OP2) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
     LCTYPED(T), LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       LCTYPE(T) f1, \
                       LCTYPE(T) f2) \
{ \
    PREFETCH_R(_v1, 3); \
    REGISTER const T *v1 = _v1; \
    REGISTER T* res = _res; \
    REGISTER long i = sz; \
    /* res is a write destination: prefetch it with write intent */ \
    VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_KERNEL4(OP2); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL4_FIX; \
    } \
    \
    for (; i; --i) { \
        OP2(*res, *v1, f1, f2); \
        ++v1; ++res; \
    } \
}
1198
/** Expands to the definition of a two-vector reduction
 *  FNAME(sz, _res, _v1, &_f2): folds OP2(res[k], v1[k], f1, f2) over all
 *  elements; both vectors are read-only.  f2 is the running accumulator
 *  (seeded from _f2) and f1 a correction accumulator seeded with 0.0;
 *  the result written back is f2 - f1 (compensated-summation style). */
#define VKERN_TEMPL_2V_T(FNAME,OP2,TYPE) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, const T* RESTRICT const, \
     const T* RESTRICT const, TYPE&);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       const T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       TYPE &_f2) \
{ \
    PREFETCH_R(_v1, 3); \
    /* REGISTER tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
    REGISTER TYPE f2(_f2), f1(0.0); \
    REGISTER const T *v1 = _v1; \
    REGISTER const T* res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_READ); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_KERNEL4(OP2); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL4_FIX; \
    } \
    \
    for (; i; --i) { \
        OP2(*res, *v1, f1, f2); \
        ++v1; ++res; \
    } \
/* NOTE(review): no goto targets `_fin' in this template; presumably an \
 * early-exit target for some OP2 expansions -- verify before removing \
 * (otherwise it only draws an unused-label warning; sibling \
 * VKERN_TEMPL_1V_T has no such label). */ \
_fin: \
    _f2 = f2 - f1; \
}
1233
/** Expands to the definition of FNAME(sz, _res): applies the statement
 *  macro OP1(dst, f1, f2) in place over res.
 *  NOTE(review): f1/f2 are not declared in this template; the cleanup call
 *  only expands correctly for OP1 macros that ignore those arguments.
 *  NOTE(review): res is written yet prefetched with PREFETCH_R; plausible
 *  if OP1 kernels read-modify-write res -- confirm. */
#define VKERN_TEMPL_1V(FNAME,OP1) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res) \
{ \
    REGISTER long i = sz; \
    REGISTER T* res = _res; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
}
1259
/** Expands to the definition of FNAME(sz, _res, f2): applies the statement
 *  macro OP1(dst, f1, f2) in place over res with one scalar constant f2.
 *  NOTE(review): f1 is not declared in this template; the cleanup call
 *  only expands correctly for OP1 macros that ignore that argument. */
#define VKERN_TEMPL_1V_C(FNAME,OP1) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const, LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       LCTYPE(T) f2) \
{ \
    REGISTER long i = sz; \
    REGISTER T* res = _res; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
}
1286
/** Expands to the definition of FNAME(sz, _res, f1, f2): applies the
 *  statement macro OP1(dst, f1, f2) in place over res with two scalar
 *  constants. */
#define VKERN_TEMPL_1V_CC(FNAME,OP1) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const, LCTYPED(T), \
     LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       LCTYPE(T) f1, \
                       LCTYPE(T) f2) \
{ \
    REGISTER long i = sz; \
    REGISTER T* res = _res; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
}
1315
/** Expands to the definition of a one-vector reduction FNAME(sz, _res, &_f2):
 *  folds OP1(res[k], f1, f2) over all elements of the read-only vector res.
 *  f2 is the running accumulator (seeded from _f2) and f1 a correction
 *  accumulator seeded with 0.0; the result written back is f2 - f1
 *  (compensated-summation style). */
#define VKERN_TEMPL_1V_T(FNAME,OP1,TYPE) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, const T* const, TYPE&);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       const T* const _res, \
                       TYPE &_f2) \
{ \
    /* REGISTER tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
    REGISTER TYPE f2(_f2), f1(0.0); \
    REGISTER const T* res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
    _f2 = f2 - f1; \
}
1345
/** Long-double variant of the one-vector reduction FNAME(sz, _res, &_f2):
 *  folds OP1(res[k], f1, f2) into a LONG_DOUBLE accumulator f2 (no
 *  correction term is kept; the extended precision replaces compensated
 *  summation) and writes f2 back to _f2.
 *  NOTE(review): unlike VKERN_TEMPL_1V_T, f1 is never declared here yet is
 *  passed to OP1 -- this only expands correctly for OP1 macros that ignore
 *  their f1 argument; confirm against the OP1s used with this template. */
#define VKERN_TEMPL_1V_T_LD(FNAME,OP1,TYPE) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, const T* const, TYPE&);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       const T* const _res, \
                       TYPE &_f2) \
{ \
    /* REGISTER tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
    REGISTER LONG_DOUBLE f2(_f2); \
    REGISTER const T* res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
    _f2 = f2; \
}
1375
1376#endif /* TBCI_UNROLL_PREFETCH_DEF2_H */