/*
 * TBCI Numerical high-performance C++ library, version 2.8.0
 * unroll_prefetch_def.h — loop-unrolling and software-prefetch kernel macros.
 */
8 #ifndef TBCI_UNROLL_PREFETCH_DEF_H
9 #define TBCI_UNROLL_PREFETCH_DEF_H
10 
11 //#include "tbci/basics.h"
12 
14 #define LCTYPE(T) REGISTER typename tbci_traits<T>::loop_const_refval_type
15 #define LCTYPED(T) REGISTER tbci_traits<T>::loop_const_refval_type
16 
39 #ifndef UNROLL_DEPTH
40 # define UNROLL_DEPTH 4
41 #endif
42 
/***********************************************************
 * 3 pointer operations:  OPER(res[k], v1[k], v2[k], f1, f2)
 * The kernels assume int i (remaining count), pointers res/v1/v2
 * and factors f1/f2 in the enclosing scope.
 ***********************************************************/

/* One step with software prefetch: apply OPER once, advance all three
 * pointers by one element, decrement i.  CA0/CA1/CA2 are the cache
 * locality hints for res/v1/v2; PREF_OFFS(T) is the prefetch distance
 * in elements. */
#define UNROLL1_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    --i; \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    ++v1; \
    PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
    ++v2; \
    PREFETCH_W(res + PREF_OFFS(T), CA0); \
    ++res

/* One step without prefetch. */
#define UNROLL1_KERNEL5(OPER) \
    --i; \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    ++v1; ++v2; ++res

/* No setup/teardown is needed at unroll depth 1. */
#define UNROLL1_KERNEL5_PREPARE do {} while(0)
#define UNROLL1_KERNEL5_FIXUP do {} while(0)
66 
67 
/* Two steps with prefetch.  EL_PER_CL(T) (elements per cache line) is a
 * compile-time constant, so the branch folds away: when an element fills
 * a whole cache line, every touched line gets its own prefetch;
 * otherwise one prefetch per pointer covers both elements.
 * Note v1[-1]/v2[1] address element 1 relative to the interleaved
 * pointer updates. */
#define UNROLL2_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        i -= 2; \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+1, CA1); \
        v1 += 2; \
        PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+1, CA2); \
        OPER(res[1], v1[-1], v2[1], f1, f2); \
        v2 += 2; \
        PREFETCH_W(res + PREF_OFFS(T), CA0); \
        PREFETCH_W(res + PREF_OFFS(T)+1, CA0); \
        res += 2; \
    } else { \
        i -= 2; \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        v1 += 2; \
        PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
        OPER(res[1], v1[-1], v2[1], f1, f2); \
        v2 += 2; \
        PREFETCH_W(res + PREF_OFFS(T), CA0); \
        res += 2; \
    }

/* Two steps without prefetch; pointer updates interleaved with the
 * operations to help instruction scheduling. */
#define UNROLL2_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    v1 += 2; i -= 2; \
    OPER(res[1], v1[-1], v2[1], f1, f2); \
    v2 += 2; res += 2

#define UNROLL2_KERNEL5_PREPARE do {} while(0)
#define UNROLL2_KERNEL5_FIXUP do {} while(0)
105 
106 
/* Four steps with prefetch.  The compile-time branch on EL_PER_CL(T)
 * issues exactly one prefetch per cache line touched by these four
 * elements (4, 2 or 1 lines per pointer). */
#define UNROLL4_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+1, CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+3, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        v1 += 4; \
        PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+1, CA2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+2, CA2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+3, CA2); \
        OPER(res[2], v1[-2], v2[2], f1, f2); \
        v2 += 4; \
        PREFETCH_W(res + PREF_OFFS(T), CA0); \
        PREFETCH_W(res + PREF_OFFS(T)+1, CA0); \
        PREFETCH_W(res + PREF_OFFS(T)+2, CA0); \
        PREFETCH_W(res + PREF_OFFS(T)+3, CA0); \
        OPER(res[3], v1[-1], v2[-1], f1, f2); \
        res += 4; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+2, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        v1 += 4; \
        PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+2, CA2); \
        OPER(res[2], v1[-2], v2[2], f1, f2); \
        v2 += 4; \
        PREFETCH_W(res + PREF_OFFS(T), CA0); \
        PREFETCH_W(res + PREF_OFFS(T)+2, CA0); \
        OPER(res[3], v1[-1], v2[-1], f1, f2); \
        res += 4; \
    } else { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        v1 += 4; \
        PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
        OPER(res[2], v1[-2], v2[2], f1, f2); \
        v2 += 4; \
        PREFETCH_W(res + PREF_OFFS(T), CA0); \
        OPER(res[3], v1[-1], v2[-1], f1, f2); \
        res += 4; \
    }

/* Four steps without prefetch.  Negative indices address elements 2 and
 * 3 after the corresponding pointer has already advanced by 4. */
#define UNROLL4_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    i -= 4; \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    v1 += 4; \
    OPER(res[2], v1[-2], v2[2], f1, f2); \
    v2 += 4; \
    OPER(res[3], v1[-1], v2[-1], f1, f2); \
    res += 4

#define UNROLL4_KERNEL5_PREPARE do {} while(0)
#define UNROLL4_KERNEL5_FIXUP do {} while(0)
172 
173 
/* Eight steps with prefetch.  Prefetches are spread between the
 * operations to fill load/store slots; the compile-time branch on
 * EL_PER_CL(T) issues one prefetch per cache line actually touched
 * (8, 4, 2 or 1 lines per pointer). */
#define UNROLL8_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+1, CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+2, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        i -= 8; \
        PREFETCH_R(v1 + PREF_OFFS(T)+3, CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+4, CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+5, CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T)+6, CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+7, CA1); \
        PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 8; \
        PREFETCH_R(v2 + PREF_OFFS(T)+1, CA2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+2, CA2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+3, CA2); \
        OPER(res[4], v1[-4], v2[4], f1, f2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+4, CA2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+5, CA2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+6, CA2); \
        OPER(res[5], v1[-3], v2[5], f1, f2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+7, CA2); \
        PREFETCH_W(res + PREF_OFFS(T), CA0); \
        PREFETCH_W(res + PREF_OFFS(T)+1, CA0); \
        OPER(res[6], v1[-2], v2[6], f1, f2); \
        v2 += 8; \
        PREFETCH_W(res + PREF_OFFS(T)+2, CA0); \
        PREFETCH_W(res + PREF_OFFS(T)+3, CA0); \
        PREFETCH_W(res + PREF_OFFS(T)+4, CA0); \
        OPER(res[7], v1[-1], v2[-1], f1, f2); \
        PREFETCH_W(res + PREF_OFFS(T)+5, CA0); \
        PREFETCH_W(res + PREF_OFFS(T)+6, CA0); \
        PREFETCH_W(res + PREF_OFFS(T)+7, CA0); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        i -= 8; \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+4, CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T)+6, CA1); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 8; \
        PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+2, CA2); \
        OPER(res[4], v1[-4], v2[4], f1, f2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+4, CA2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+6, CA2); \
        OPER(res[5], v1[-3], v2[5], f1, f2); \
        v2 += 8; \
        PREFETCH_W(res + PREF_OFFS(T), CA0); \
        PREFETCH_W(res + PREF_OFFS(T)+2, CA0); \
        OPER(res[6], v1[-2], v2[-2], f1, f2); \
        PREFETCH_W(res + PREF_OFFS(T)+4, CA0); \
        PREFETCH_W(res + PREF_OFFS(T)+6, CA0); \
        OPER(res[7], v1[-1], v2[-1], f1, f2); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        i -= 8; \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T)+4, CA1); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 8; \
        PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
        OPER(res[4], v1[-4], v2[4], f1, f2); \
        PREFETCH_R(v2 + PREF_OFFS(T)+4, CA2); \
        OPER(res[5], v1[-3], v2[5], f1, f2); \
        v2 += 8; \
        PREFETCH_W(res + PREF_OFFS(T), CA0); \
        OPER(res[6], v1[-2], v2[-2], f1, f2); \
        PREFETCH_W(res + PREF_OFFS(T)+4, CA0); \
        OPER(res[7], v1[-1], v2[-1], f1, f2); \
        res += 8; \
    } else { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        i -= 8; \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 8; \
        PREFETCH_R(v2 + PREF_OFFS(T), CA2); \
        OPER(res[4], v1[-4], v2[4], f1, f2); \
        OPER(res[5], v1[-3], v2[5], f1, f2); \
        v2 += 8; \
        PREFETCH_W(res + PREF_OFFS(T), CA0); \
        OPER(res[6], v1[-2], v2[-2], f1, f2); \
        OPER(res[7], v1[-1], v2[-1], f1, f2); \
        res += 8; \
    }

/* Eight steps without prefetch. */
#define UNROLL8_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    i -= 8; \
    OPER(res[2], v1[2], v2[2], f1, f2); \
    OPER(res[3], v1[3], v2[3], f1, f2); \
    v1 += 8; \
    OPER(res[4], v1[-4], v2[4], f1, f2); \
    OPER(res[5], v1[-3], v2[5], f1, f2); \
    v2 += 8; \
    OPER(res[6], v1[-2], v2[-2], f1, f2); \
    OPER(res[7], v1[-1], v2[-1], f1, f2); \
    res += 8

#define UNROLL8_KERNEL5_PREPARE do {} while(0)
#define UNROLL8_KERNEL5_FIXUP do {} while(0)
293 
/* Warm the caches before entering a 3-pointer kernel loop: prefetch the
 * next PREFETCH_AHEAD-1 cache lines of v1, v2 and res (line 0 is assumed
 * already touched).  PREFETCH_AHEAD and EL_PER_CL(T) are compile-time
 * constants, so exactly one branch survives. */
#define PREF_AHEAD3(T,CA0,CA1,CA2) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 2, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 3, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 4, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 5, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 6, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 7, CA1); \
        PREFETCH_R(v2 + EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)* 2, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)* 3, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)* 4, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)* 5, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)* 6, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)* 7, CA2); \
        PREFETCH_W(res + EL_PER_CL(T), CA0); \
        PREFETCH_W(res + EL_PER_CL(T)* 2, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)* 3, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)* 4, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)* 5, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)* 6, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)* 7, CA0); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 8, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 9, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*10, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*11, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*12, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*13, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*14, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*15, CA1); \
        PREFETCH_R(v2 + EL_PER_CL(T)* 8, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)* 9, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)*10, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)*11, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)*12, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)*13, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)*14, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)*15, CA2); \
        PREFETCH_W(res + EL_PER_CL(T)* 8, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)* 9, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)*10, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)*11, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)*12, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)*13, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)*14, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*4, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*5, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*6, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*7, CA1); \
        PREFETCH_R(v2 + EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)*2, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)*3, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)*4, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)*5, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)*6, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)*7, CA2); \
        PREFETCH_W(res + EL_PER_CL(T), CA0); \
        PREFETCH_W(res + EL_PER_CL(T)*2, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)*3, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)*4, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)*5, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)*6, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v2 + EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)*2, CA2); \
        PREFETCH_R(v2 + EL_PER_CL(T)*3, CA2); \
        PREFETCH_W(res + EL_PER_CL(T), CA0); \
        PREFETCH_W(res + EL_PER_CL(T)*2, CA0); \
        PREFETCH_W(res + EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
        PREFETCH_R(v2 + EL_PER_CL(T), CA2); \
        PREFETCH_W(res + EL_PER_CL(T), CA0); \
    }
379 
380 
/***********************************************************
 * 2 pointer operations:  OPER(res[k*RI], v1[k], f1, f2)
 * RI is the (compile-time) stride of the result pointer.
 ***********************************************************/

/* One step with prefetch, result stride RI.  PREFETCH_X is the prefetch
 * flavor for res (read or write). */
#define UNROLL1_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,RI) \
    OPER(res[0], v1[0], f1, f2); \
    --i; \
    PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
    ++v1; \
    PREFETCH_X(res + RI*PREF_OFFS(T), CA0); \
    res += RI

/* Unit-stride convenience wrapper.
 * BUG FIX: this previously expanded to itself (self-referential macros
 * are never re-expanded, so every call site was left with an unresolved
 * UNROLL1_PREF_KERNEL4(...)) and took a spurious sixth RI argument.
 * It now forwards to the _STRIDE variant with RI = 1, matching
 * UNROLL2/4/8_PREF_KERNEL4 and the VKERN_TEMPL_2V_PREF call site. */
#define UNROLL1_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    UNROLL1_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)

/* One step without prefetch, result stride RI. */
#define UNROLL1_KERNEL4_STRIDE(OPER,RI) \
    --i; \
    OPER(res[0], v1[0], f1, f2); \
    ++v1; res += RI

#define UNROLL1_KERNEL4(OPER) \
    UNROLL1_KERNEL4_STRIDE(OPER,1)

#define UNROLL1_KERNEL4_PREPARE do {} while(0)
#define UNROLL1_KERNEL4_FIXUP do {} while(0)
408 
409 
/* Two steps with prefetch, result stride RI.  Compile-time branch on
 * EL_PER_CL(T): one prefetch per touched cache line. */
#define UNROLL2_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,RI) \
    if (EL_PER_CL(T) <= 1) { \
        i -= 2; \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+1, CA1); \
        OPER(res[RI], v1[1], f1, f2); \
        v1 += 2; \
        PREFETCH_X(res + RI*PREF_OFFS(T), CA0); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+1), CA0); \
        res += 2*RI; \
    } else { \
        i -= 2; \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        OPER(res[RI], v1[1], f1, f2); \
        v1 += 2; \
        PREFETCH_X(res + RI*PREF_OFFS(T), CA0); \
        res += 2*RI; \
    }

#define UNROLL2_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    UNROLL2_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)

/* Two steps without prefetch, result stride RI. */
#define UNROLL2_KERNEL4_STRIDE(OPER,RI) \
    OPER(res[0], v1[0], f1, f2); \
    v1 += 2; i -= 2; \
    OPER(res[RI], v1[-1], f1, f2); \
    res += 2*RI

#define UNROLL2_KERNEL4(OPER) \
    UNROLL2_KERNEL4_STRIDE(OPER,1)

#define UNROLL2_KERNEL4_PREPARE do {} while(0)
#define UNROLL2_KERNEL4_FIXUP do {} while(0)
447 
448 
/* Four steps with prefetch, result stride RI. */
#define UNROLL4_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,RI) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+1, CA1); \
        OPER(res[RI], v1[1], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+3, CA1); \
        PREFETCH_X(res + RI*PREF_OFFS(T), CA0); \
        OPER(res[2*RI], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+1), CA0); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+2), CA0); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+3), CA0); \
        OPER(res[3*RI], v1[-1], f1, f2); \
        res += 4*RI; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        OPER(res[RI], v1[1], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T)+2, CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res + RI*PREF_OFFS(T), CA0); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+2), CA0); \
        OPER(res[3*RI], v1[-1], f1, f2); \
        res += 4*RI; \
    } else { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        OPER(res[RI], v1[1], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res + RI*PREF_OFFS(T), CA0); \
        OPER(res[3*RI], v1[-1], f1, f2); \
        res += 4*RI; \
    }

#define UNROLL4_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    UNROLL4_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)

/* Four steps without prefetch, result stride RI. */
#define UNROLL4_KERNEL4_STRIDE(OPER,RI) \
    OPER(res[0], v1[0], f1, f2); \
    OPER(res[RI], v1[1], f1, f2); \
    v1 += 4; i -= 4; \
    OPER(res[2*RI], v1[-2], f1, f2); \
    OPER(res[3*RI], v1[-1], f1, f2); \
    res += 4*RI

#define UNROLL4_KERNEL4(OPER) \
    UNROLL4_KERNEL4_STRIDE(OPER,1)

#define UNROLL4_KERNEL4_PREPARE do {} while(0)
#define UNROLL4_KERNEL4_FIXUP do {} while(0)
508 
509 
/* Eight steps with prefetch, result stride RI.  One prefetch per cache
 * line actually touched, interleaved with the operations. */
#define UNROLL8_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,RI) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+1, CA1); \
        OPER(res[RI], v1[1], f1, f2); \
        i -= 8; \
        PREFETCH_R(v1 + PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+3, CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T)+4, CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+5, CA1); \
        OPER(res[3*RI], v1[3], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T)+6, CA1); \
        PREFETCH_R(v1 + PREF_OFFS(T)+7, CA1); \
        OPER(res[4*RI], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res + RI*PREF_OFFS(T), CA0); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+1), CA0); \
        OPER(res[5*RI], v1[-3], f1, f2); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+2), CA0); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+3), CA0); \
        OPER(res[6*RI], v1[-2], f1, f2); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+4), CA0); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+5), CA0); \
        OPER(res[7*RI], v1[-1], f1, f2); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+6), CA0); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+7), CA0); \
        res += 8*RI; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        OPER(res[RI], v1[1], f1, f2); \
        i -= 8; \
        PREFETCH_R(v1 + PREF_OFFS(T)+2, CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T)+4, CA1); \
        OPER(res[3*RI], v1[3], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T)+6, CA1); \
        OPER(res[4*RI], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res + RI*PREF_OFFS(T), CA0); \
        OPER(res[5*RI], v1[-3], f1, f2); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+2), CA0); \
        OPER(res[6*RI], v1[-2], f1, f2); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+4), CA0); \
        OPER(res[7*RI], v1[-1], f1, f2); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+6), CA0); \
        res += 8*RI; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        OPER(res[RI], v1[1], f1, f2); \
        OPER(res[2*RI], v1[2], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T)+4, CA1); \
        OPER(res[3*RI], v1[3], f1, f2); \
        OPER(res[4*RI], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res + RI*PREF_OFFS(T), CA0); \
        OPER(res[5*RI], v1[-3], f1, f2); \
        i -= 8; \
        OPER(res[6*RI], v1[-2], f1, f2); \
        PREFETCH_X(res + RI*(PREF_OFFS(T)+4), CA0); \
        OPER(res[7*RI], v1[-1], f1, f2); \
        res += 8*RI; \
    } else { \
        OPER(res[0], v1[0], f1, f2); \
        OPER(res[RI], v1[1], f1, f2); \
        PREFETCH_R(v1 + PREF_OFFS(T), CA1); \
        OPER(res[2*RI], v1[2], f1, f2); \
        OPER(res[3*RI], v1[3], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res + RI*PREF_OFFS(T), CA0); \
        OPER(res[4*RI], v1[-4], f1, f2); \
        OPER(res[5*RI], v1[-3], f1, f2); \
        i -= 8; \
        OPER(res[6*RI], v1[-2], f1, f2); \
        OPER(res[7*RI], v1[-1], f1, f2); \
        res += 8*RI; \
    }

#define UNROLL8_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    UNROLL8_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)

/* Eight steps without prefetch, result stride RI. */
#define UNROLL8_KERNEL4_STRIDE(OPER,RI) \
    OPER(res[0], v1[0], f1, f2); \
    OPER(res[RI], v1[1], f1, f2); \
    OPER(res[2*RI], v1[2], f1, f2); \
    OPER(res[3*RI], v1[3], f1, f2); \
    v1 += 8; i -= 8; \
    OPER(res[4*RI], v1[-4], f1, f2); \
    OPER(res[5*RI], v1[-3], f1, f2); \
    OPER(res[6*RI], v1[-2], f1, f2); \
    OPER(res[7*RI], v1[-1], f1, f2); \
    res += 8*RI

#define UNROLL8_KERNEL4(OPER) \
    UNROLL8_KERNEL4_STRIDE(OPER,1)

#define UNROLL8_KERNEL4_PREPARE do {} while(0)
#define UNROLL8_KERNEL4_FIXUP do {} while(0)
613 
614 
/* Warm the caches before entering a 2-pointer kernel loop: prefetch the
 * next PREFETCH_AHEAD-1 cache lines of v1 and res (res with stride RI).
 * All conditions are compile-time constant; one branch survives. */
#define PREF_AHEAD2_STRIDE(T,PREFETCH_X,CA0,CA1,RI) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 2, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 3, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 4, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 5, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 6, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 7, CA1); \
        PREFETCH_X(res + RI*EL_PER_CL(T), CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)* 2, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)* 3, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)* 4, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)* 5, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)* 6, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)* 7, CA0); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 8, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)* 9, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*10, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*11, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*12, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*13, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*14, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*15, CA1); \
        PREFETCH_X(res + RI*EL_PER_CL(T)* 8, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)* 9, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)*10, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)*11, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)*12, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)*13, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)*14, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*4, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*5, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*6, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*7, CA1); \
        PREFETCH_X(res + RI*EL_PER_CL(T), CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)*3, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)*4, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)*5, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)*6, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 + EL_PER_CL(T)*3, CA1); \
        PREFETCH_X(res + RI*EL_PER_CL(T), CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res + RI*EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_R(v1 + EL_PER_CL(T), CA1); \
        PREFETCH_X(res + RI*EL_PER_CL(T), CA0); \
    }

#define PREF_AHEAD2(T,PREFETCH_X,CA0,CA1) \
    PREF_AHEAD2_STRIDE(T,PREFETCH_X,CA0,CA1,1)
677 
/***********************************************************
 * 1 pointer operations:  OPER(res[k], f1, f2)
 ***********************************************************/

/* One step with prefetch on the result pointer only. */
#define UNROLL1_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    OPER(res[0], f1, f2); \
    --i; \
    PREFETCH_X(res + PREF_OFFS(T), CA0); \
    ++res

/* One step without prefetch. */
#define UNROLL1_KERNEL3(OPER) \
    --i; \
    OPER(res[0], f1, f2); \
    ++res

#define UNROLL1_KERNEL3_PREPARE do {} while(0)
#define UNROLL1_KERNEL3_FIXUP do {} while(0)
697 
698 
/* Two steps with prefetch on the result pointer; one prefetch per
 * touched cache line (compile-time branch on EL_PER_CL(T)). */
#define UNROLL2_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T), CA0); \
        i -= 2; \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+1, CA0); \
        res += 2; \
    } else { \
        OPER(res[0], f1, f2); \
        i -= 2; \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T), CA0); \
        res += 2; \
    }

/* Two steps without prefetch. */
#define UNROLL2_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    i -= 2; \
    OPER(res[1], f1, f2); \
    res += 2

#define UNROLL2_KERNEL3_PREPARE do {} while(0)
#define UNROLL2_KERNEL3_FIXUP do {} while(0)
726 
727 
/* Four steps with prefetch on the result pointer. */
#define UNROLL4_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        i -= 4; \
        PREFETCH_X(res + PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+1, CA0); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+3, CA0); \
        res += 4; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        i -= 4; \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        res += 4; \
    } else { \
        OPER(res[0], f1, f2); \
        i -= 4; \
        OPER(res[1], f1, f2); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T), CA0); \
        OPER(res[3], f1, f2); \
        res += 4; \
    }

/* Four steps without prefetch. */
#define UNROLL4_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    i -= 4; \
    OPER(res[2], f1, f2); \
    OPER(res[3], f1, f2); \
    res += 4

#define UNROLL4_KERNEL3_PREPARE do {} while(0)
#define UNROLL4_KERNEL3_FIXUP do {} while(0)
771 
772 
/* Eight steps with prefetch on the result pointer; one prefetch per
 * cache line actually touched (compile-time branch on EL_PER_CL(T)). */
#define UNROLL8_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+1, CA0); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+3, CA0); \
        OPER(res[4], f1, f2); \
        i -= 8; \
        PREFETCH_X(res + PREF_OFFS(T)+4, CA0); \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+5, CA0); \
        OPER(res[6], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+6, CA0); \
        OPER(res[7], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+7, CA0); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T), CA0); \
        OPER(res[2], f1, f2); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+2, CA0); \
        OPER(res[4], f1, f2); \
        i -= 8; \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+4, CA0); \
        OPER(res[6], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+6, CA0); \
        OPER(res[7], f1, f2); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T), CA0); \
        OPER(res[2], f1, f2); \
        OPER(res[3], f1, f2); \
        i -= 8; \
        OPER(res[4], f1, f2); \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T)+4, CA0); \
        OPER(res[6], f1, f2); \
        OPER(res[7], f1, f2); \
        res += 8; \
    } else { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res + PREF_OFFS(T), CA0); \
        OPER(res[3], f1, f2); \
        OPER(res[4], f1, f2); \
        OPER(res[5], f1, f2); \
        i -= 8; \
        OPER(res[6], f1, f2); \
        OPER(res[7], f1, f2); \
        res += 8; \
    }

/* Eight steps without prefetch. */
#define UNROLL8_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    OPER(res[2], f1, f2); \
    OPER(res[3], f1, f2); \
    i -= 8; \
    OPER(res[4], f1, f2); \
    OPER(res[5], f1, f2); \
    OPER(res[6], f1, f2); \
    OPER(res[7], f1, f2); \
    res += 8

#define UNROLL8_KERNEL3_PREPARE do {} while(0)
#define UNROLL8_KERNEL3_FIXUP do {} while(0)
852 
853 
/* Warm the cache before entering a 1-pointer kernel loop: prefetch the
 * next PREFETCH_AHEAD-1 cache lines of res.  Compile-time constants;
 * one branch survives. */
#define PREF_AHEAD1(T,PREFETCH_X,CA0) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_X(res + EL_PER_CL(T), CA0); \
        PREFETCH_X(res + EL_PER_CL(T)* 2, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)* 3, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)* 4, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)* 5, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)* 6, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)* 7, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)* 8, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)* 9, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)*10, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)*11, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)*12, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)*13, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)*14, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_X(res + EL_PER_CL(T), CA0); \
        PREFETCH_X(res + EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)*3, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)*4, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)*5, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)*6, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_X(res + EL_PER_CL(T), CA0); \
        PREFETCH_X(res + EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res + EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_X(res + EL_PER_CL(T), CA0); \
    }
887 
888 
889 
890 // Select default kernels
891 #if UNROLL_DEPTH == 1
892 
893 # define UNR_PREF_KERNEL5 UNROLL1_PREF_KERNEL5
894 # define UNR_KERNEL5 UNROLL1_KERNEL5
895 # define UNR_KERNEL5_PREP UNROLL1_KERNEL5_PREPARE
896 # define UNR_KERNEL5_FIX UNROLL1_KERNEL5_FIXUP
897 
898 # define UNR_PREF_KERNEL4 UNROLL1_PREF_KERNEL4
899 # define UNR_PREF_KERNEL4_STRIDE UNROLL1_PREF_KERNEL4_STRIDE
900 # define UNR_KERNEL4 UNROLL1_KERNEL4
901 # define UNR_KERNEL4_STRIDE UNROLL1_KERNEL4_STRIDE
902 # define UNR_KERNEL4_PREP UNROLL1_KERNEL4_PREPARE
903 # define UNR_KERNEL4_FIX UNROLL1_KERNEL4_FIXUP
904 
905 # define UNR_PREF_KERNEL3 UNROLL1_PREF_KERNEL3
906 # define UNR_KERNEL3 UNROLL1_KERNEL3
907 # define UNR_KERNEL3_PREP UNROLL1_KERNEL3_PREPARE
908 # define UNR_KERNEL3_FIX UNROLL1_KERNEL3_FIXUP
909 
910 #elif UNROLL_DEPTH == 2
911 
912 # define UNR_PREF_KERNEL5 UNROLL2_PREF_KERNEL5
913 # define UNR_KERNEL5 UNROLL2_KERNEL5
914 # define UNR_KERNEL5_PREP UNROLL2_KERNEL5_PREPARE
915 # define UNR_KERNEL5_FIX UNROLL2_KERNEL5_FIXUP
916 
917 # define UNR_PREF_KERNEL4 UNROLL2_PREF_KERNEL4
918 # define UNR_PREF_KERNEL4_STRIDE UNROLL2_PREF_KERNEL4_STRIDE
919 # define UNR_KERNEL4 UNROLL2_KERNEL4
920 # define UNR_KERNEL4_STRIDE UNROLL2_KERNEL4_STRIDE
921 # define UNR_KERNEL4_PREP UNROLL2_KERNEL4_PREPARE
922 # define UNR_KERNEL4_FIX UNROLL2_KERNEL4_FIXUP
923 
924 # define UNR_PREF_KERNEL3 UNROLL2_PREF_KERNEL3
925 # define UNR_KERNEL3 UNROLL2_KERNEL3
926 # define UNR_KERNEL3_PREP UNROLL2_KERNEL3_PREPARE
927 # define UNR_KERNEL3_FIX UNROLL2_KERNEL3_FIXUP
928 
929 #elif UNROLL_DEPTH == 4
930 
931 # define UNR_PREF_KERNEL5 UNROLL4_PREF_KERNEL5
932 # define UNR_KERNEL5 UNROLL4_KERNEL5
933 # define UNR_KERNEL5_PREP UNROLL4_KERNEL5_PREPARE
934 # define UNR_KERNEL5_FIX UNROLL4_KERNEL5_FIXUP
935 
936 # define UNR_PREF_KERNEL4 UNROLL4_PREF_KERNEL4
937 # define UNR_PREF_KERNEL4_STRIDE UNROLL4_PREF_KERNEL4_STRIDE
938 # define UNR_KERNEL4 UNROLL4_KERNEL4
939 # define UNR_KERNEL4_STRIDE UNROLL4_KERNEL4_STRIDE
940 # define UNR_KERNEL4_PREP UNROLL4_KERNEL4_PREPARE
941 # define UNR_KERNEL4_FIX UNROLL4_KERNEL4_FIXUP
942 
943 # define UNR_PREF_KERNEL3 UNROLL4_PREF_KERNEL3
944 # define UNR_KERNEL3 UNROLL4_KERNEL3
945 # define UNR_KERNEL3_PREP UNROLL4_KERNEL3_PREPARE
946 # define UNR_KERNEL3_FIX UNROLL4_KERNEL3_FIXUP
947 
948 #elif UNROLL_DEPTH == 8
949 
950 # define UNR_PREF_KERNEL5 UNROLL8_PREF_KERNEL5
951 # define UNR_KERNEL5 UNROLL8_KERNEL5
952 # define UNR_KERNEL5_PREP UNROLL8_KERNEL5_PREPARE
953 # define UNR_KERNEL5_FIX UNROLL8_KERNEL5_FIXUP
954 
955 # define UNR_PREF_KERNEL4 UNROLL8_PREF_KERNEL4
956 # define UNR_PREF_KERNEL4_STRIDE UNROLL8_PREF_KERNEL4_STRIDE
957 # define UNR_KERNEL4 UNROLL8_KERNEL4
958 # define UNR_KERNEL4_STRIDE UNROLL8_KERNEL4_STRIDE
959 # define UNR_KERNEL4_PREP UNROLL8_KERNEL4_PREPARE
960 # define UNR_KERNEL4_FIX UNROLL8_KERNEL4_FIXUP
961 
962 # define UNR_PREF_KERNEL3 UNROLL8_PREF_KERNEL3
963 # define UNR_KERNEL3 UNROLL8_KERNEL3
964 # define UNR_KERNEL3_PREP UNROLL8_KERNEL3_PREPARE
965 # define UNR_KERNEL3_FIX UNROLL8_KERNEL3_FIXUP
966 
967 #else
968 
969 # error "UNROLL_DEPTH may only be 1, 2, 4, 8"
970 
971 #endif /* UNROLL_DEPTH */
972 
991 /****************************************************************
992  * Macros with fragments for the implementation
993  ****************************************************************/
994 
#ifdef USE_PREFETCH

/* First pipeline stage of the 3-pointer routines: run the unrolled kernel
 * with software prefetch as long as at least UNROLL_DEPTH+PREF_OFFS(T)
 * elements remain, so that prefetching PREF_OFFS(T) elements ahead never
 * runs past the end of the operands.  Expects the locals res/v1/v2/i
 * declared by the enclosing routine template. */
# define VKERN_TEMPL_3V_PREF(OP3,T) \
  if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
    PREFETCH_W (res, 3); \
    PREF_AHEAD3(T,3,MAX(1,CACHE_LOC_READ),MAX(1,CACHE_LOC_READ)); \
    UNR_KERNEL5_PREP; \
    do { \
      UNR_PREF_KERNEL5(OP3,T,CACHE_LOC_WRITE,CACHE_LOC_READ,CACHE_LOC_READ); \
    } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
    UNR_KERNEL5_FIX; \
  }

/* Same for the 2-pointer routines.  PREFETCH_X (PREFETCH_R or PREFETCH_W)
 * and the locality hint CW select how the `res' stream is prefetched:
 * read-only for reductions, for-write for modified destinations. */
# define VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_X,CW) \
  if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
    PREFETCH_X (res, 3); \
    PREF_AHEAD2(T,PREFETCH_X,CW,MAX(1,CACHE_LOC_READ)); \
    UNR_KERNEL4_PREP; \
    do { \
      UNR_PREF_KERNEL4(OP2,T,PREFETCH_X,CW,CACHE_LOC_READ); \
    } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
    UNR_KERNEL4_FIX; \
  }
/* Variant for a strided `res' stream; RI is the stride in elements. */
# define VKERN_TEMPL_2V_PREF_STRIDE(OP2,T,PREFETCH_X,CW,RI) \
  if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
    PREFETCH_X (res, 3); \
    PREF_AHEAD2_STRIDE(T,PREFETCH_X,CW,MAX(1,CACHE_LOC_READ),RI); \
    UNR_KERNEL4_PREP; \
    do { \
      UNR_PREF_KERNEL4_STRIDE(OP2,T,PREFETCH_X,CW,CACHE_LOC_READ,RI); \
    } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
    UNR_KERNEL4_FIX; \
  }

/* Same for the 1-pointer routines (only the `res' stream exists). */
# define VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_X,CW) \
  if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
    PREFETCH_X (res, 3); \
    PREF_AHEAD1(T,PREFETCH_X,CW); \
    UNR_KERNEL3_PREP; \
    do { \
      UNR_PREF_KERNEL3(OP1,T,PREFETCH_X,CW); \
    } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
    UNR_KERNEL3_FIX; \
  }
#else
/* Prefetching disabled: the first stage is a no-op and all of the work is
 * done by the plain unrolled loop plus the scalar tail loop. */
# define VKERN_TEMPL_3V_PREF(OP,T) do {} while (0)
# define VKERN_TEMPL_2V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
# define VKERN_TEMPL_2V_PREF_STRIDE(OP2,T,PREFETCH_X,CW,RI) do {} while (0)
# define VKERN_TEMPL_1V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
#endif /* USE_PREFETCH */
1045 
1046 
1047 /****************************************************************
1048  * Templates for routines
1049  ****************************************************************/
1050 
/*! Routine template for an elementwise 3-pointer kernel:
 *  applies OP3(res[k], v1[k], v2[k], f1, f2) for k = 0..sz-1.
 *  Three stages: prefetching unrolled loop (no-op without USE_PREFETCH),
 *  plain unrolled loop, scalar tail.  The kernel macros advance the
 *  pointers and decrement i themselves.
 *  NOTE(review): the tail passes the tokens `f1'/`f2' although no such
 *  locals exist here -- OP3 macros used with this template apparently
 *  must not expand those arguments; confirm at the call sites. */
#define VKERN_TEMPL_3V(FNAME,OP3) \
/* optional explicit-instantiation/friend declaration */ \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, T* RESTRICT const, const T* RESTRICT const, const T* RESTRICT const);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       const T* RESTRICT const _v2) \
{ \
    /* touch the first cache lines before the loop setup */ \
    PREFETCH_W(_res, 3); \
    PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
    REGISTER const T *v1 = _v1, *v2 = _v2; \
    REGISTER T *res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_3V_PREF(OP3,T); \
    \
    /* unrolled loop without prefetch for the remaining full groups */ \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL5_PREP; \
        do { \
            UNR_KERNEL5(OP3); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL5_FIX; \
    } \
    \
    /* scalar tail: fewer than UNROLL_DEPTH elements left */ \
    for (; i; --i) { \
        OP3(*res, *v1, *v2, f1, f2); \
        ++v1; ++v2; ++res; \
    } \
}
1092 
/*! As VKERN_TEMPL_3V, but with one scalar parameter f2 (loop-constant,
 *  see LCTYPE) passed through to OP3.
 *  NOTE(review): the token `f1' in the tail has no local declaration
 *  here; OP3 macros used with this template must not expand it. */
#define VKERN_TEMPL_3V_C(FNAME,OP3) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
      const T* RESTRICT const, LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       const T* RESTRICT const _v2, \
                       LCTYPE(T) f2) \
{ \
    PREFETCH_W(_res, 3); \
    PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
    REGISTER const T *v1 = _v1, *v2 = _v2; \
    REGISTER T *res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_3V_PREF(OP3,T); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL5_PREP; \
        do { \
            UNR_KERNEL5(OP3); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL5_FIX; \
    } \
    \
    for (; i; --i) { \
        OP3(*res, *v1, *v2, f1, f2); \
        ++v1; ++v2; ++res; \
    } \
}
1125 
/*! As VKERN_TEMPL_3V, but with two loop-constant scalar parameters
 *  f1 and f2 passed through to OP3 (e.g. res = f1*v1 OP f2*v2). */
#define VKERN_TEMPL_3V_CC(FNAME,OP3) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
      const T* RESTRICT const, LCTYPED(T), LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       const T* RESTRICT const _v2, \
                       LCTYPE(T) f1, \
                       LCTYPE(T) f2) \
{ \
    PREFETCH_W(_res, 3); \
    PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
    REGISTER long i = sz; \
    REGISTER const T *v1 = _v1, *v2 = _v2; \
    REGISTER T *res = _res; \
    VKERN_TEMPL_3V_PREF(OP3,T); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL5_PREP; \
        do { \
            UNR_KERNEL5(OP3); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL5_FIX; \
    } \
    \
    for (; i; --i) { \
        OP3(*res, *v1, *v2, f1, f2); \
        ++v1; ++v2; ++res; \
    } \
}
1159 
/*! Routine template for an elementwise 2-pointer kernel:
 *  applies OP2(res[k], v1[k], f1, f2) for k = 0..sz-1, with res the
 *  (read-modify-)written destination.
 *  FIX: the prefetch stage previously used PREFETCH_R on `res' while
 *  requesting CACHE_LOC_WRITE locality; the sibling templates
 *  VKERN_TEMPL_2V_C/_CC prefetch the written destination with
 *  PREFETCH_W (read-only reductions use PREFETCH_R+CACHE_LOC_READ,
 *  see VKERN_TEMPL_2V_T).  Changed to PREFETCH_W for consistency --
 *  a prefetch-hint-only change with no semantic effect.
 *  NOTE(review): `f1'/`f2' in the tail have no local declarations;
 *  OP2 macros used here must not expand those arguments. */
#define VKERN_TEMPL_2V(FNAME,OP2) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, T* RESTRICT const, const T* RESTRICT const);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1) \
{ \
    PREFETCH_W(_res, 3); \
    PREFETCH_R(_v1, 3); \
    REGISTER const T *v1 = _v1; \
    REGISTER T* res = _res; \
    REGISTER long i = sz; \
    /* was: VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_WRITE) */ \
    VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_KERNEL4(OP2); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL4_FIX; \
    } \
    \
    for (; i; --i) { \
        OP2(*res, *v1, f1, f2); \
        ++v1; ++res; \
    } \
}
1189 
/*! As VKERN_TEMPL_2V, but with one loop-constant scalar parameter f2
 *  passed through to OP2.  `res' is prefetched for write (PREFETCH_W).
 *  NOTE(review): the token `f1' in the tail has no local declaration
 *  here; OP2 macros used with this template must not expand it. */
#define VKERN_TEMPL_2V_C(FNAME,OP2) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
      LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       LCTYPE(T) f2) \
{ \
    PREFETCH_W(_res, 3); \
    PREFETCH_R(_v1, 3); \
    REGISTER const T *v1 = _v1; \
    REGISTER T* res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_KERNEL4(OP2); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL4_FIX; \
    } \
    \
    for (; i; --i) { \
        OP2(*res, *v1, f1, f2); \
        ++v1; ++res; \
    } \
}
1221 
/*! As VKERN_TEMPL_2V, but with two loop-constant scalar parameters
 *  f1 and f2 passed through to OP2. */
#define VKERN_TEMPL_2V_CC(FNAME,OP2) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
      LCTYPED(T), LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       LCTYPE(T) f1, \
                       LCTYPE(T) f2) \
{ \
    PREFETCH_W(_res, 3); \
    PREFETCH_R(_v1, 3); \
    REGISTER const T *v1 = _v1; \
    REGISTER T* res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_KERNEL4(OP2); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL4_FIX; \
    } \
    \
    for (; i; --i) { \
        OP2(*res, *v1, f1, f2); \
        ++v1; ++res; \
    } \
}
1254 
/*! Reduction template over two read-only vectors (e.g. dot product):
 *  accumulates OP2(res[k], v1[k], f1, f2) into the scalars, then stores
 *  f2 - f1 back through _f2.  f1 is a secondary accumulator initialised
 *  to zero (its role depends on OP2; the difference is the result).
 *  Both streams are prefetched read-only.
 *  NOTE(review): the `_fin:' label has no goto in this template --
 *  presumably some OP2 variants jump to it for early exit; confirm,
 *  otherwise compilers will warn about an unused label. */
#define VKERN_TEMPL_2V_T(FNAME,OP2,TYPE) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, const T* RESTRICT const, \
      const T* RESTRICT const, TYPE&);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       const T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       TYPE &_f2) \
{ \
    PREFETCH_R(_res, 3); \
    PREFETCH_R(_v1, 3); \
    REGISTER const T *v1 = _v1; \
    REGISTER const T* res = _res; \
    /* REGISTER typename tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
    REGISTER TYPE f2(_f2), f1(0.0); \
    REGISTER long i = sz; \
    VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_READ); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_KERNEL4(OP2); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL4_FIX; \
    } \
    \
    for (; i; --i) { \
        OP2(*res, *v1, f1, f2); \
        ++v1; ++res; \
    } \
 _fin: \
    _f2 = f2 - f1; \
}
1290 
/*! As VKERN_TEMPL_2V_T, but the `res' stream is strided: it advances by
 *  rincr elements per step (v1 stays contiguous).
 *  NOTE(review): `_fin:' label -- see the note on VKERN_TEMPL_2V_T. */
#define VKERN_TEMPL_2V_T_STRIDE(FNAME,OP2,TYPE) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, const T* RESTRICT const, \
      const T* RESTRICT const, TYPE&, const unsigned);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       const T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       TYPE &_f2, const unsigned rincr) \
{ \
    PREFETCH_R(_res, 3); \
    PREFETCH_R(_v1, 3); \
    REGISTER const T *v1 = _v1; \
    REGISTER const T* res = _res; \
    /* REGISTER typename tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
    REGISTER TYPE f2(_f2), f1(0.0); \
    REGISTER long i = sz; \
    VKERN_TEMPL_2V_PREF_STRIDE(OP2,T,PREFETCH_R,CACHE_LOC_READ,rincr); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_KERNEL4_STRIDE(OP2,rincr); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL4_FIX; \
    } \
    \
    for (; i; --i) { \
        OP2(*res, *v1, f1, f2); \
        ++v1; res += rincr; \
    } \
 _fin: \
    _f2 = f2 - f1; \
}
1326 
/*! Routine template for an elementwise 1-pointer kernel:
 *  applies OP1(res[k], f1, f2) for k = 0..sz-1.
 *  NOTE(review): the prefetch stage pairs PREFETCH_R with
 *  CACHE_LOC_WRITE for the written destination `res' (all three 1V
 *  variants do this); PREFETCH_W would match the 2V_C/_CC pattern --
 *  verify whether this is intentional.
 *  NOTE(review): `f1'/`f2' in the tail have no local declarations;
 *  OP1 macros used here must not expand those arguments. */
#define VKERN_TEMPL_1V(FNAME,OP1) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, T* RESTRICT const);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res) \
{ \
    PREFETCH_W(_res, 3); \
    REGISTER long i = sz; \
    REGISTER T* res = _res; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
}
1353 
/*! As VKERN_TEMPL_1V, but with one loop-constant scalar parameter f2
 *  passed through to OP1 (e.g. fill or scale by a constant).
 *  NOTE(review): `f1' in the tail has no local declaration; OP1 macros
 *  used with this template must not expand it. */
#define VKERN_TEMPL_1V_C(FNAME,OP1) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, T* RESTRICT const, LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       LCTYPE(T) f2) \
{ \
    PREFETCH_W(_res, 3); \
    REGISTER long i = sz; \
    REGISTER T* res = _res; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
}
1381 
/*! As VKERN_TEMPL_1V, but with two loop-constant scalar parameters
 *  f1 and f2 passed through to OP1. */
#define VKERN_TEMPL_1V_CC(FNAME,OP1) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, T* RESTRICT const, \
      LCTYPED(T), LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       LCTYPE(T) f1, \
                       LCTYPE(T) f2) \
{ \
    PREFETCH_W(_res, 3); \
    REGISTER long i = sz; \
    REGISTER T* res = _res; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
}
1411 
1412 
/*! Reduction template over a single read-only vector (e.g. sum, norm):
 *  accumulates OP1(res[k], f1, f2) into the scalars, then stores
 *  f2 - f1 back through _f2.  f1 is a secondary accumulator initialised
 *  to zero; its role depends on OP1, the difference is the result. */
#define VKERN_TEMPL_1V_T(FNAME,OP1,TYPE) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, const T* const, TYPE&);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       const T* const _res, \
                       TYPE &_f2) \
{ \
    PREFETCH_R(_res, 3); \
    /* REGISTER typename tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
    REGISTER TYPE f2(_f2), f1(0.0); \
    REGISTER const T* res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
    _f2 = f2 - f1; \
}
1445 
/*! As VKERN_TEMPL_1V_T, but accumulating in LONG_DOUBLE regardless of
 *  TYPE for extra precision; the result is converted back on the final
 *  store through _f2.
 *  FIX: this variant passed the token `f1' to OP1 without ever
 *  declaring it (VKERN_TEMPL_1V_T declares f1(0.0)), so any OP1 that
 *  expands its f1 argument failed to compile here or picked up an
 *  unrelated outer name.  Declare the secondary accumulator and apply
 *  the same f2 - f1 correction as VKERN_TEMPL_1V_T; for OP1 macros
 *  that ignore f1 this is exactly f2 - 0.0, i.e. unchanged. */
#define VKERN_TEMPL_1V_T_LD(FNAME,OP1,TYPE) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
     (const unsigned long, const T* const, TYPE&);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       const T* const _res, \
                       TYPE &_f2) \
{ \
    PREFETCH_R(_res, 3); \
    /* REGISTER typename tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
    REGISTER LONG_DOUBLE f2(_f2), f1(0.0); \
    REGISTER const T* res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
    _f2 = f2 - f1; \
}
1479 
1480 #endif /* TBCI_UNROLL_PREFETCH_DEF_H */