/*
 * TBCI Numerical high-performance C++ Library 2.8.0
 * unroll_prefetch_def2.h -- unrolled loop kernels with software prefetch.
 */
8 #ifndef TBCI_UNROLL_PREFETCH_DEF2_H
9 #define TBCI_UNROLL_PREFETCH_DEF2_H
10 
11 //#include "tbci/basics.h"
12 
/* Loop-constant scalar argument type, resolved through tbci_traits<T>.
 * LCTYPE is for dependent-name contexts requiring `typename`; LCTYPED for
 * contexts where `typename` is not allowed (e.g. explicit declarations).
 * NOTE(review): REGISTER and tbci_traits are project-wide macros/templates
 * defined elsewhere (see the commented tbci/basics.h include). */
#define LCTYPE(T) REGISTER typename tbci_traits<T>::loop_const_refval_type
#define LCTYPED(T) REGISTER tbci_traits<T>::loop_const_refval_type

/* Default unroll factor; may be overridden at compile time (1, 2, 4 or 8). */
#ifndef UNROLL_DEPTH
# define UNROLL_DEPTH 4
#endif

/***********************************************************
 * 3 pointer operations
 ***********************************************************/

/* Unroll-by-1 three-pointer kernel: res[0] = OPER(v1[0], v2[0], f1, f2),
 * with one prefetch per stream PREF_OFFS(T) elements ahead.  Decrements i,
 * advances all three pointers by one; the caller supplies the trailing ';'. */
#define UNROLL1_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    --i; \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
    ++v1; \
    PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
    ++v2; \
    PREFETCH_W(res+PREF_OFFS(T), CA0); \
    ++res

/* Unroll-by-1 three-pointer kernel without prefetch; caller adds the ';'. */
#define UNROLL1_KERNEL5(OPER) \
    --i; \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    ++v1; ++v2; ++res

/* No setup/teardown is needed by the unroll-by-1 variant. */
#define UNROLL1_KERNEL5_PREPARE do {} while(0)
#define UNROLL1_KERNEL5_FIXUP do {} while(0)

/* Unroll-by-2 three-pointer kernel with prefetch.  The prefetch count per
 * pass scales with cache-line size: for EL_PER_CL(T) <= 1 the two elements
 * span two cache lines per stream, otherwise one prefetch per stream covers
 * both.  (Stray trailing line-continuation after the closing brace removed.) */
#define UNROLL2_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        i -= 2; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        v1 += 2; v2 += 2; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
        res += 2; \
    } else { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        i -= 2; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        v1 += 2; v2 += 2; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        res += 2; \
    }

/* Unroll-by-2 three-pointer kernel without prefetch; caller adds the ';'. */
#define UNROLL2_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    i -= 2; \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    v1 += 2; v2 += 2; res += 2

/* No setup/teardown is needed by the unroll-by-2 variant. */
#define UNROLL2_KERNEL5_PREPARE do {} while(0)
#define UNROLL2_KERNEL5_FIXUP do {} while(0)

/* Unroll-by-4 three-pointer kernel with prefetch, software-pipelined so the
 * prefetches interleave with the OPER calls.  Branches on EL_PER_CL(T) so
 * exactly one prefetch is issued per cache line touched per stream. */
#define UNROLL4_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        i -= 4; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+3, CA2); \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 4; v2 += 4; \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+3, CA0); \
        res += 4; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        i -= 4; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 4; v2 += 4; \
        res += 4; \
    } else { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        i -= 4; \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        v1 += 4; v2 += 4; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        res += 4; \
    }

/* Unroll-by-4 three-pointer kernel without prefetch; caller adds the ';'. */
#define UNROLL4_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    i -= 4; \
    OPER(res[2], v1[2], v2[2], f1, f2); \
    OPER(res[3], v1[3], v2[3], f1, f2); \
    v1 += 4; v2 += 4; \
    res += 4

/* No setup/teardown is needed by the unroll-by-4 variant. */
#define UNROLL4_KERNEL5_PREPARE do {} while(0)
#define UNROLL4_KERNEL5_FIXUP do {} while(0)

/* Unroll-by-8 three-pointer kernel with prefetch.  Four branches on
 * EL_PER_CL(T) (1, 2, 4, larger) so each cache line per stream receives
 * exactly one prefetch; prefetches are interleaved with the OPER calls. */
#define UNROLL8_PREF_KERNEL5(OPER,T,CA0,CA1,CA2) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+5, CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+7, CA1); \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        i -= 8; \
        PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+3, CA2); \
        OPER(res[4], v1[4], v2[4], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+5, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+6, CA2); \
        OPER(res[5], v1[5], v2[5], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+7, CA2); \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[6], v1[6], v2[6], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+3, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[7], v1[7], v2[7], f1, f2); \
        v1 += 8; v2 += 8; \
        PREFETCH_W(res+PREF_OFFS(T)+5, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+6, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+7, CA0); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        i -= 8; \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+2, CA2); \
        OPER(res[4], v1[4], v2[4], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+6, CA2); \
        OPER(res[5], v1[5], v2[5], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[6], v1[6], v2[6], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
        PREFETCH_W(res+PREF_OFFS(T)+6, CA0); \
        OPER(res[7], v1[7], v2[7], f1, f2); \
        v1 += 8; v2 += 8; \
        res += 8; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        i -= 8; \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[4], v1[4], v2[4], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T)+4, CA2); \
        OPER(res[5], v1[5], v2[5], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        OPER(res[6], v1[6], v2[6], f1, f2); \
        PREFETCH_W(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[7], v1[7], v2[7], f1, f2); \
        v1 += 8; v2 += 8; \
        res += 8; \
    } else { \
        OPER(res[0], v1[0], v2[0], f1, f2); \
        OPER(res[1], v1[1], v2[1], f1, f2); \
        i -= 8; \
        OPER(res[2], v1[2], v2[2], f1, f2); \
        OPER(res[3], v1[3], v2[3], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[4], v1[4], v2[4], f1, f2); \
        OPER(res[5], v1[5], v2[5], f1, f2); \
        PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
        OPER(res[6], v1[6], v2[6], f1, f2); \
        OPER(res[7], v1[7], v2[7], f1, f2); \
        v1 += 8; v2 += 8; \
        PREFETCH_W(res+PREF_OFFS(T), CA0); \
        res += 8; \
    }

/* Unroll-by-8 three-pointer kernel without prefetch; caller adds the ';'. */
#define UNROLL8_KERNEL5(OPER) \
    OPER(res[0], v1[0], v2[0], f1, f2); \
    OPER(res[1], v1[1], v2[1], f1, f2); \
    OPER(res[2], v1[2], v2[2], f1, f2); \
    OPER(res[3], v1[3], v2[3], f1, f2); \
    i -= 8; \
    OPER(res[4], v1[4], v2[4], f1, f2); \
    OPER(res[5], v1[5], v2[5], f1, f2); \
    OPER(res[6], v1[6], v2[6], f1, f2); \
    OPER(res[7], v1[7], v2[7], f1, f2); \
    v1 += 8; v2 += 8; \
    res += 8

/* No setup/teardown is needed by the unroll-by-8 variant. */
#define UNROLL8_KERNEL5_PREPARE do {} while(0)
#define UNROLL8_KERNEL5_FIXUP do {} while(0)

/* Warm-up prefetch for the three-stream loops: touch the next
 * PREFETCH_AHEAD cache lines of v1, v2 and res before entering the main
 * loop.  Only prefetches are issued; no pointer is modified. */
#define PREF_AHEAD3(T,CA0,CA1,CA2) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 7, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 2, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 3, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 4, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 5, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 6, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 7, CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 2, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 3, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 4, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 5, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 6, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 7, CA0); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 8, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 9, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*10, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*11, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*12, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*13, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*14, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*15, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 8, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)* 9, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*10, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*11, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*12, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*13, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*14, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*15, CA2); \
        PREFETCH_W(res+EL_PER_CL(T)* 8, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)* 9, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*10, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*11, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*12, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*13, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*14, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*7, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*2, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*3, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*4, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*5, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*6, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*7, CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*3, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*4, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*5, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*6, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*2, CA2); \
        PREFETCH_R(v2 +EL_PER_CL(T)*3, CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_W(res+EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v2 +EL_PER_CL(T), CA2); \
        PREFETCH_W(res+EL_PER_CL(T), CA0); \
    }

/***********************************************************
 * 2 pointer operations
 ***********************************************************/

/* Unroll-by-1 two-pointer kernel: res[0] = OPER(v1[0], f1, f2), prefetching
 * both streams.  PREFETCH_X selects read vs write prefetch for res (res may
 * be read-modify-write depending on OPER).  Caller adds the trailing ';'. */
#define UNROLL1_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    OPER(res[0], v1[0], f1, f2); \
    --i; \
    PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
    ++v1; \
    PREFETCH_X(res+PREF_OFFS(T), CA0); \
    ++res

/* Unroll-by-1 two-pointer kernel without prefetch; caller adds the ';'. */
#define UNROLL1_KERNEL4(OPER) \
    --i; \
    OPER(res[0], v1[0], f1, f2); \
    ++v1; ++res

/* No setup/teardown is needed by the unroll-by-1 variant. */
#define UNROLL1_KERNEL4_PREPARE do {} while(0)
#define UNROLL1_KERNEL4_FIXUP do {} while(0)

/* Unroll-by-2 two-pointer kernel with prefetch; one or two prefetches per
 * stream depending on cache-line size.  (Stray trailing line-continuation
 * after the closing brace removed.) */
#define UNROLL2_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 2; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        OPER(res[1], v1[1], f1, f2); \
        v1 += 2; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        res += 2; \
    } else { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 2; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], f1, f2); \
        v1 += 2; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        res += 2; \
    }

/* Unroll-by-2 two-pointer kernel without prefetch.  v1 is advanced early;
 * the second element is addressed as v1[-1].  Caller adds the ';'. */
#define UNROLL2_KERNEL4(OPER) \
    OPER(res[0], v1[0], f1, f2); \
    v1 += 2; i -= 2; \
    OPER(res[1], v1[-1],f1, f2); \
    res += 2

/* No setup/teardown is needed by the unroll-by-2 variant. */
#define UNROLL2_KERNEL4_PREPARE do {} while(0)
#define UNROLL2_KERNEL4_FIXUP do {} while(0)

/* Unroll-by-4 two-pointer kernel with prefetch, branching on EL_PER_CL(T)
 * so one prefetch per cache line is issued per stream.  v1 is advanced
 * before the last OPER, which reads v1[-1]. */
#define UNROLL4_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        OPER(res[1], v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[2], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
        OPER(res[3], v1[-1], f1, f2); \
        res += 4; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[2], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], v1[-1], f1, f2); \
        res += 4; \
    } else { \
        OPER(res[0], v1[0], f1, f2); \
        i -= 4; \
        OPER(res[1], v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[2], v1[2], f1, f2); \
        v1 += 4; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[3], v1[-1], f1, f2); \
        res += 4; \
    }

/* Unroll-by-4 two-pointer kernel without prefetch; v1 advanced mid-block,
 * later elements read via negative offsets.  Caller adds the ';'. */
#define UNROLL4_KERNEL4(OPER) \
    OPER(res[0], v1[0], f1, f2); \
    OPER(res[1], v1[1], f1, f2); \
    v1 += 4; i -= 4; \
    OPER(res[2], v1[-2], f1, f2); \
    OPER(res[3], v1[-1], f1, f2); \
    res += 4

/* No setup/teardown is needed by the unroll-by-4 variant. */
#define UNROLL4_KERNEL4_PREPARE do {} while(0)
#define UNROLL4_KERNEL4_FIXUP do {} while(0)

/* Unroll-by-8 two-pointer kernel with prefetch; four EL_PER_CL(T) branches
 * tune the prefetch density.  v1 is advanced mid-block, the tail elements
 * are read via negative offsets. */
#define UNROLL8_PREF_KERNEL4(OPER,T,PREFETCH_X,CA0,CA1) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
        OPER(res[1], v1[1], f1, f2); \
        i -= 8; \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+3, CA1); \
        OPER(res[2], v1[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+5, CA1); \
        OPER(res[3], v1[3], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        PREFETCH_R(v1 +PREF_OFFS(T)+7, CA1); \
        OPER(res[4], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[5], v1[-3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
        OPER(res[6], v1[-2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+5, CA0); \
        OPER(res[7], v1[-1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
        PREFETCH_X(res+PREF_OFFS(T)+7, CA0); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], f1, f2); \
        i -= 8; \
        PREFETCH_R(v1 +PREF_OFFS(T)+2, CA1); \
        OPER(res[2], v1[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[3], v1[3], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+6, CA1); \
        OPER(res[4], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[5], v1[-3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[6], v1[-2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[7], v1[-1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], v1[0], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[1], v1[1], f1, f2); \
        OPER(res[2], v1[2], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T)+4, CA1); \
        OPER(res[3], v1[3], f1, f2); \
        OPER(res[4], v1[4], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[5], v1[-3], f1, f2); \
        i -= 8; \
        OPER(res[6], v1[-2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[7], v1[-1], f1, f2); \
        res += 8; \
    } else { \
        OPER(res[0], v1[0], f1, f2); \
        OPER(res[1], v1[1], f1, f2); \
        PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
        OPER(res[2], v1[2], f1, f2); \
        OPER(res[3], v1[3], f1, f2); \
        v1 += 8; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[4], v1[-4], f1, f2); \
        OPER(res[5], v1[-3], f1, f2); \
        i -= 8; \
        OPER(res[6], v1[-2], f1, f2); \
        OPER(res[7], v1[-1], f1, f2); \
        res += 8; \
    }

/* Unroll-by-8 two-pointer kernel without prefetch; v1 advanced mid-block,
 * later elements read via negative offsets.  Caller adds the ';'. */
#define UNROLL8_KERNEL4(OPER) \
    OPER(res[0], v1[0], f1, f2); \
    OPER(res[1], v1[1], f1, f2); \
    OPER(res[2], v1[2], f1, f2); \
    OPER(res[3], v1[3], f1, f2); \
    v1 += 8; i -= 8; \
    OPER(res[4], v1[-4], f1, f2); \
    OPER(res[5], v1[-3], f1, f2); \
    OPER(res[6], v1[-2], f1, f2); \
    OPER(res[7], v1[-1], f1, f2); \
    res += 8

/* No setup/teardown is needed by the unroll-by-8 variant. */
#define UNROLL8_KERNEL4_PREPARE do {} while(0)
#define UNROLL8_KERNEL4_FIXUP do {} while(0)

/* Warm-up prefetch for the two-stream loops: touch the next PREFETCH_AHEAD
 * cache lines of v1 and res before the main loop.  PREFETCH_X selects read
 * vs write prefetch for res.  No pointer is modified. */
#define PREF_AHEAD2(T,PREFETCH_X,CA0,CA1) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 7, CA1); \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 3, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 4, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 5, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 6, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 7, CA0); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 8, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)* 9, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*10, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*11, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*12, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*13, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*14, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*15, CA1); \
        PREFETCH_X(res+EL_PER_CL(T)* 8, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 9, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*10, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*11, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*12, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*13, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*14, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*4, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*5, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*6, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*7, CA1); \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*4, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*5, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*6, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*2, CA1); \
        PREFETCH_R(v1 +EL_PER_CL(T)*3, CA1); \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_R(v1 +EL_PER_CL(T), CA1); \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
    }

/***********************************************************
 * 1 pointer operations
 ***********************************************************/

/* Unroll-by-1 one-pointer kernel: res[0] = OPER(f1, f2), prefetching the
 * res stream with PREFETCH_X (read or write).  Caller adds the ';'. */
#define UNROLL1_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    OPER(res[0], f1, f2); \
    --i; \
    PREFETCH_X(res+PREF_OFFS(T), CA0); \
    ++res

/* Unroll-by-1 one-pointer kernel without prefetch; caller adds the ';'. */
#define UNROLL1_KERNEL3(OPER) \
    --i; \
    OPER(res[0], f1, f2); \
    ++res

/* No setup/teardown is needed by the unroll-by-1 variant. */
#define UNROLL1_KERNEL3_PREPARE do {} while(0)
#define UNROLL1_KERNEL3_FIXUP do {} while(0)

/* Unroll-by-2 one-pointer kernel with prefetch; one or two prefetches per
 * pass depending on cache-line size.  (Stray trailing line-continuation
 * after the closing brace removed.) */
#define UNROLL2_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        i -= 2; \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        res += 2; \
    } else { \
        OPER(res[0], f1, f2); \
        i -= 2; \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        res += 2; \
    }

/* Unroll-by-2 one-pointer kernel without prefetch; caller adds the ';'. */
#define UNROLL2_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    i -= 2; \
    OPER(res[1], f1, f2); \
    res += 2

/* No setup/teardown is needed by the unroll-by-2 variant. */
#define UNROLL2_KERNEL3_PREPARE do {} while(0)
#define UNROLL2_KERNEL3_FIXUP do {} while(0)

/* Unroll-by-4 one-pointer kernel with prefetch, branching on EL_PER_CL(T)
 * to issue one prefetch per cache line of res. */
#define UNROLL4_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        i -= 4; \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
        res += 4; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        i -= 4; \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        res += 4; \
    } else { \
        OPER(res[0], f1, f2); \
        i -= 4; \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[2], f1, f2); \
        OPER(res[3], f1, f2); \
        res += 4; \
    }

/* Unroll-by-4 one-pointer kernel without prefetch; caller adds the ';'. */
#define UNROLL4_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    i -= 4; \
    OPER(res[2], f1, f2); \
    OPER(res[3], f1, f2); \
    res += 4

/* No setup/teardown is needed by the unroll-by-4 variant. */
#define UNROLL4_KERNEL3_PREPARE do {} while(0)
#define UNROLL4_KERNEL3_FIXUP do {} while(0)

/* Unroll-by-8 one-pointer kernel with prefetch; four EL_PER_CL(T) branches
 * issue one prefetch per cache line of res. */
#define UNROLL8_PREF_KERNEL3(OPER,T,PREFETCH_X,CA0) \
    if (EL_PER_CL(T) <= 1) { \
        OPER(res[0], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
        OPER(res[4], f1, f2); \
        i -= 8; \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+5, CA0); \
        OPER(res[6], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
        OPER(res[7], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+7, CA0); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 2) { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[2], f1, f2); \
        OPER(res[3], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
        OPER(res[4], f1, f2); \
        i -= 8; \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[6], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+6, CA0); \
        OPER(res[7], f1, f2); \
        res += 8; \
    } else if (EL_PER_CL(T) <= 4) { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[2], f1, f2); \
        OPER(res[3], f1, f2); \
        i -= 8; \
        OPER(res[4], f1, f2); \
        OPER(res[5], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T)+4, CA0); \
        OPER(res[6], f1, f2); \
        OPER(res[7], f1, f2); \
        res += 8; \
    } else { \
        OPER(res[0], f1, f2); \
        OPER(res[1], f1, f2); \
        OPER(res[2], f1, f2); \
        PREFETCH_X(res+PREF_OFFS(T), CA0); \
        OPER(res[3], f1, f2); \
        OPER(res[4], f1, f2); \
        OPER(res[5], f1, f2); \
        i -= 8; \
        OPER(res[6], f1, f2); \
        OPER(res[7], f1, f2); \
        res += 8; \
    }

/* Unroll-by-8 one-pointer kernel without prefetch; caller adds the ';'. */
#define UNROLL8_KERNEL3(OPER) \
    OPER(res[0], f1, f2); \
    OPER(res[1], f1, f2); \
    OPER(res[2], f1, f2); \
    OPER(res[3], f1, f2); \
    i -= 8; \
    OPER(res[4], f1, f2); \
    OPER(res[5], f1, f2); \
    OPER(res[6], f1, f2); \
    OPER(res[7], f1, f2); \
    res += 8

/* No setup/teardown is needed by the unroll-by-8 variant. */
#define UNROLL8_KERNEL3_PREPARE do {} while(0)
#define UNROLL8_KERNEL3_FIXUP do {} while(0)

/* Warm-up prefetch for the single-stream loops: touch the next
 * PREFETCH_AHEAD cache lines of res before the main loop.  PREFETCH_X
 * selects read vs write prefetch.  No pointer is modified. */
#define PREF_AHEAD1(T,PREFETCH_X,CA0) \
    if (PREFETCH_AHEAD >= 16) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 3, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 4, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 5, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 6, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 7, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 8, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)* 9, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*10, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*11, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*12, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*13, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*14, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*15, CA0); \
    } else if (PREFETCH_AHEAD >= 8) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*4, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*5, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*6, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*7, CA0); \
    } else if (PREFETCH_AHEAD >= 4) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*2, CA0); \
        PREFETCH_X(res+EL_PER_CL(T)*3, CA0); \
    } else if (PREFETCH_AHEAD >= 2) { \
        PREFETCH_X(res+EL_PER_CL(T), CA0); \
    }

// Select default kernels
860 #if UNROLL_DEPTH == 1
861 
862 # define UNR_PREF_KERNEL5 UNROLL1_PREF_KERNEL5
863 # define UNR_KERNEL5 UNROLL1_KERNEL5
864 # define UNR_KERNEL5_PREP UNROLL1_KERNEL5_PREPARE
865 # define UNR_KERNEL5_FIX UNROLL1_KERNEL5_FIXUP
866 
867 # define UNR_PREF_KERNEL4 UNROLL1_PREF_KERNEL4
868 # define UNR_KERNEL4 UNROLL1_KERNEL4
869 # define UNR_KERNEL4_PREP UNROLL1_KERNEL4_PREPARE
870 # define UNR_KERNEL4_FIX UNROLL1_KERNEL4_FIXUP
871 
872 # define UNR_PREF_KERNEL3 UNROLL1_PREF_KERNEL3
873 # define UNR_KERNEL3 UNROLL1_KERNEL3
874 # define UNR_KERNEL3_PREP UNROLL1_KERNEL3_PREPARE
875 # define UNR_KERNEL3_FIX UNROLL1_KERNEL3_FIXUP
876 
877 #elif UNROLL_DEPTH == 2
878 
879 # define UNR_PREF_KERNEL5 UNROLL2_PREF_KERNEL5
880 # define UNR_KERNEL5 UNROLL2_KERNEL5
881 # define UNR_KERNEL5_PREP UNROLL2_KERNEL5_PREPARE
882 # define UNR_KERNEL5_FIX UNROLL2_KERNEL5_FIXUP
883 
884 # define UNR_PREF_KERNEL4 UNROLL2_PREF_KERNEL4
885 # define UNR_KERNEL4 UNROLL2_KERNEL4
886 # define UNR_KERNEL4_PREP UNROLL2_KERNEL4_PREPARE
887 # define UNR_KERNEL4_FIX UNROLL2_KERNEL4_FIXUP
888 
889 # define UNR_PREF_KERNEL3 UNROLL2_PREF_KERNEL3
890 # define UNR_KERNEL3 UNROLL2_KERNEL3
891 # define UNR_KERNEL3_PREP UNROLL2_KERNEL3_PREPARE
892 # define UNR_KERNEL3_FIX UNROLL2_KERNEL3_FIXUP
893 
894 #elif UNROLL_DEPTH == 4
895 
896 # define UNR_PREF_KERNEL5 UNROLL4_PREF_KERNEL5
897 # define UNR_KERNEL5 UNROLL4_KERNEL5
898 # define UNR_KERNEL5_PREP UNROLL4_KERNEL5_PREPARE
899 # define UNR_KERNEL5_FIX UNROLL4_KERNEL5_FIXUP
900 
901 # define UNR_PREF_KERNEL4 UNROLL4_PREF_KERNEL4
902 # define UNR_KERNEL4 UNROLL4_KERNEL4
903 # define UNR_KERNEL4_PREP UNROLL4_KERNEL4_PREPARE
904 # define UNR_KERNEL4_FIX UNROLL4_KERNEL4_FIXUP
905 
906 # define UNR_PREF_KERNEL3 UNROLL4_PREF_KERNEL3
907 # define UNR_KERNEL3 UNROLL4_KERNEL3
908 # define UNR_KERNEL3_PREP UNROLL4_KERNEL3_PREPARE
909 # define UNR_KERNEL3_FIX UNROLL4_KERNEL3_FIXUP
910 
911 #elif UNROLL_DEPTH == 8
912 
913 # define UNR_PREF_KERNEL5 UNROLL8_PREF_KERNEL5
914 # define UNR_KERNEL5 UNROLL8_KERNEL5
915 # define UNR_KERNEL5_PREP UNROLL8_KERNEL5_PREPARE
916 # define UNR_KERNEL5_FIX UNROLL8_KERNEL5_FIXUP
917 
918 # define UNR_PREF_KERNEL4 UNROLL8_PREF_KERNEL4
919 # define UNR_KERNEL4 UNROLL8_KERNEL4
920 # define UNR_KERNEL4_PREP UNROLL8_KERNEL4_PREPARE
921 # define UNR_KERNEL4_FIX UNROLL8_KERNEL4_FIXUP
922 
923 # define UNR_PREF_KERNEL3 UNROLL8_PREF_KERNEL3
924 # define UNR_KERNEL3 UNROLL8_KERNEL3
925 # define UNR_KERNEL3_PREP UNROLL8_KERNEL3_PREPARE
926 # define UNR_KERNEL3_FIX UNROLL8_KERNEL3_FIXUP
927 
928 #else
929 
930 # error "UNROLL_DEPTH may only be 1, 2, 4, 8"
931 
932 #endif /* UNROLL_DEPTH */
933 
/****************************************************************
 * Macros with fragments for the implementation
 ****************************************************************/

#ifdef USE_PREFETCH

/* Prefetching main loops: run the unrolled+prefetching kernel while at
 * least UNROLL_DEPTH+PREF_OFFS(T) elements remain, so all prefetches stay
 * near the live data.  Compile to no-ops when USE_PREFETCH is not set. */
# define VKERN_TEMPL_3V_PREF(OP3,T) \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
        PREFETCH_W (res, 3); \
        PREF_AHEAD3(T,3,MAX(1,CACHE_LOC_READ),MAX(1,CACHE_LOC_READ)); \
        UNR_KERNEL5_PREP; \
        do { \
            UNR_PREF_KERNEL5(OP3,T,CACHE_LOC_WRITE,CACHE_LOC_READ,CACHE_LOC_READ); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
        UNR_KERNEL5_FIX; \
    }

# define VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_X,CW) \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
        PREFETCH_X (res, 3); \
        PREF_AHEAD2(T,PREFETCH_X,CW,MAX(1,CACHE_LOC_READ)); \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_PREF_KERNEL4(OP2,T,PREFETCH_X,CW,CACHE_LOC_READ); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
        UNR_KERNEL4_FIX; \
    }

# define VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_X,CW) \
    if (LIKELY(i >= UNROLL_DEPTH+PREF_OFFS(T))) { \
        PREFETCH_X (res, 3); \
        PREF_AHEAD1(T,PREFETCH_X,CW); \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_PREF_KERNEL3(OP1,T,PREFETCH_X,CW); \
        } while (i >= UNROLL_DEPTH+PREF_OFFS(T)); \
        UNR_KERNEL3_FIX; \
    }
#else
/* Without USE_PREFETCH the prefetch loops vanish entirely. */
# define VKERN_TEMPL_3V_PREF(OP,T) do {} while (0)
# define VKERN_TEMPL_2V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
# define VKERN_TEMPL_1V_PREF(OP,T,PREFETCH_X,CW) do {} while (0)
#endif /* USE_PREFETCH */

/****************************************************************
 * Templates for routines
 ****************************************************************/

/* Instantiate `FNAME(sz, res, v1, v2)`: applies OP3 element-wise over two
 * source vectors into res.  Runs the prefetching loop (if enabled), then
 * the plain unrolled loop, then a scalar remainder loop.
 * NOTE(review): OP3 is invoked with f1/f2, which are not parameters here;
 * OP3 macros used with this template presumably ignore them -- confirm. */
#define VKERN_TEMPL_3V(FNAME,OP3) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const, const T* RESTRICT const, const T* RESTRICT const);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       const T* RESTRICT const _v2) \
{ \
    PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
    REGISTER const T *v1 = _v1, *v2 = _v2; \
    REGISTER T *res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_3V_PREF(OP3,T); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL5_PREP; \
        do { \
            UNR_KERNEL5(OP3); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL5_FIX; \
    } \
    \
    for (; i; --i) { \
        OP3(*res, *v1, *v2, f1, f2); \
        ++v1; ++v2; ++res; \
    } \
}

/** Routine template: element-wise 3-pointer kernel with one scalar
 *  constant f2.  Expands to a friend declaration (via INST) plus the
 *  definition of FNAME(sz, res, v1, v2, f2).  As in VKERN_TEMPL_3V,
 *  the token f1 handed to OP3 is never declared here and must be
 *  ignored by OP3's expansion.
 */
#define VKERN_TEMPL_3V_C(FNAME,OP3) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
     const T* RESTRICT const, LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       const T* RESTRICT const _v2, \
                       LCTYPE(T) f2) \
{ \
    PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
    /* FIX: was "_v1. *v2 = _v2" -- '.' typo for ',' made the expansion \
     * a syntax error whenever this template was instantiated. */ \
    REGISTER const T *v1 = _v1, *v2 = _v2; \
    REGISTER T *res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_3V_PREF(OP3,T); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL5_PREP; \
        do { \
            UNR_KERNEL5(OP3); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL5_FIX; \
    } \
    \
    /* scalar remainder: fewer than UNROLL_DEPTH elements left */ \
    for (; i; --i) { \
        OP3(*res, *v1, *v2, f1, f2); \
        ++v1; ++v2; ++res; \
    } \
}
1073 
/** Routine template: element-wise 3-pointer kernel with two scalar
 *  constants f1 and f2.  Expands to a friend declaration (via INST)
 *  plus the definition of FNAME(sz, res, v1, v2, f1, f2).  Same loop
 *  structure as VKERN_TEMPL_3V: optional prefetching main loop,
 *  plain unrolled loop, scalar remainder loop.
 */
#define VKERN_TEMPL_3V_CC(FNAME,OP3) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
     const T* RESTRICT const, LCTYPED(T), LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       const T* RESTRICT const _v2, \
                       LCTYPE(T) f1, \
                       LCTYPE(T) f2) \
{ \
    PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
    REGISTER long i = sz; \
    REGISTER const T *v1 = _v1, *v2 = _v2; \
    REGISTER T *res = _res; \
    VKERN_TEMPL_3V_PREF(OP3,T); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL5_PREP; \
        do { \
            UNR_KERNEL5(OP3); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL5_FIX; \
    } \
    \
    /* scalar remainder: fewer than UNROLL_DEPTH elements left */ \
    for (; i; --i) { \
        OP3(*res, *v1, *v2, f1, f2); \
        ++v1; ++v2; ++res; \
    } \
}
1106 
/** Routine template: element-wise 2-pointer kernel, no scalar constants.
 *  Expands to FNAME(sz, res, v1) applying OP2 per element.  The tokens
 *  f1/f2 passed to OP2 are never declared here -- OP2 must textually
 *  ignore them in this variant.
 *
 *  NOTE(review): the prefetch stage is invoked with PREFETCH_R for `res`
 *  although `res` is written (the _C/_CC siblings use PREFETCH_W here).
 *  Possibly intentional for read-modify-write operators -- confirm
 *  against the OP2 users before changing.
 */
#define VKERN_TEMPL_2V(FNAME,OP2) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const, const T* RESTRICT const);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1) \
{ \
    PREFETCH_R(_v1, 3); \
    REGISTER const T *v1 = _v1; \
    REGISTER T* res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_KERNEL4(OP2); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL4_FIX; \
    } \
    \
    /* scalar remainder: fewer than UNROLL_DEPTH elements left */ \
    for (; i; --i) { \
        OP2(*res, *v1, f1, f2); \
        ++v1; ++res; \
    } \
}
1135 
/** Routine template: element-wise 2-pointer kernel with one scalar
 *  constant f2.  Expands to FNAME(sz, res, v1, f2).  The token f1
 *  handed to OP2 is never declared here and must be ignored by OP2's
 *  expansion.  `res` is prefetched for writing (PREFETCH_W).
 */
#define VKERN_TEMPL_2V_C(FNAME,OP2) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
     LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       LCTYPE(T) f2) \
{ \
    PREFETCH_R(_v1, 3); \
    REGISTER const T *v1 = _v1; \
    REGISTER T* res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_KERNEL4(OP2); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL4_FIX; \
    } \
    \
    /* scalar remainder: fewer than UNROLL_DEPTH elements left */ \
    for (; i; --i) { \
        OP2(*res, *v1, f1, f2); \
        ++v1; ++res; \
    } \
}
1166 
/** Routine template: element-wise 2-pointer kernel with two scalar
 *  constants f1 and f2.  Expands to FNAME(sz, res, v1, f1, f2).
 *  `res` is prefetched for writing (PREFETCH_W).
 */
#define VKERN_TEMPL_2V_CC(FNAME,OP2) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
     LCTYPED(T), LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       LCTYPE(T) f1, \
                       LCTYPE(T) f2) \
{ \
    PREFETCH_R(_v1, 3); \
    REGISTER const T *v1 = _v1; \
    REGISTER T* res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_W,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_KERNEL4(OP2); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL4_FIX; \
    } \
    \
    /* scalar remainder: fewer than UNROLL_DEPTH elements left */ \
    for (; i; --i) { \
        OP2(*res, *v1, f1, f2); \
        ++v1; ++res; \
    } \
}
1198 
/** Routine template: 2-pointer reduction kernel.  Both pointers are
 *  const (nothing is written to memory); OP2 accumulates into the local
 *  registers f1/f2 and the result _f2 = f2 - f1 is written back on exit
 *  (f1 initialized to 0.0 -- presumably a correction/partial term;
 *  confirm against the OP2 definitions).  Prefetches `res` for reading,
 *  matching its const-ness.
 *
 *  NOTE(review): the `_fin:` label has no goto in this expansion -- it
 *  is presumably the target of an early-exit `goto _fin;` inside some
 *  OP2 or UNR_KERNEL4_* expansion; verify, since unused it only draws
 *  an "unused label" warning.
 */
#define VKERN_TEMPL_2V_T(FNAME,OP2,TYPE) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, const T* RESTRICT const, \
     const T* RESTRICT const, TYPE&);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       const T* RESTRICT const _res, \
                       const T* RESTRICT const _v1, \
                       TYPE &_f2) \
{ \
    PREFETCH_R(_v1, 3); \
    /* REGISTER tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
    REGISTER TYPE f2(_f2), f1(0.0); \
    REGISTER const T *v1 = _v1; \
    REGISTER const T* res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_2V_PREF(OP2,T,PREFETCH_R,CACHE_LOC_READ); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL4_PREP; \
        do { \
            UNR_KERNEL4(OP2); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL4_FIX; \
    } \
    \
    for (; i; --i) { \
        OP2(*res, *v1, f1, f2); \
        ++v1; ++res; \
    } \
_fin: \
    _f2 = f2 - f1; \
}
1233 
/** Routine template: 1-pointer kernel, no scalar constants.  Expands to
 *  FNAME(sz, res) applying OP1 to each element of res in place.  The
 *  tokens f1/f2 passed to OP1 are never declared here -- OP1 must
 *  textually ignore them in this variant.
 */
#define VKERN_TEMPL_1V(FNAME,OP1) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res) \
{ \
    REGISTER long i = sz; \
    REGISTER T* res = _res; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    /* scalar remainder: fewer than UNROLL_DEPTH elements left */ \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
}
1259 
/** Routine template: 1-pointer kernel with one scalar constant f2.
 *  Expands to FNAME(sz, res, f2).  The token f1 handed to OP1 is never
 *  declared here and must be ignored by OP1's expansion.
 */
#define VKERN_TEMPL_1V_C(FNAME,OP1) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const, LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       LCTYPE(T) f2) \
{ \
    REGISTER long i = sz; \
    REGISTER T* res = _res; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    /* scalar remainder: fewer than UNROLL_DEPTH elements left */ \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
}
1286 
/** Routine template: 1-pointer kernel with two scalar constants f1 and
 *  f2.  Expands to FNAME(sz, res, f1, f2).
 */
#define VKERN_TEMPL_1V_CC(FNAME,OP1) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, T* RESTRICT const, LCTYPED(T), \
     LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       T* RESTRICT const _res, \
                       LCTYPE(T) f1, \
                       LCTYPE(T) f2) \
{ \
    REGISTER long i = sz; \
    REGISTER T* res = _res; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_WRITE); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    /* scalar remainder: fewer than UNROLL_DEPTH elements left */ \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
}
1315 
/** Routine template: 1-pointer reduction kernel.  `res` is const; OP1
 *  accumulates into the local registers f1/f2 and _f2 = f2 - f1 is
 *  written back on exit (f1 initialized to 0.0 -- presumably a
 *  correction/partial term; confirm against the OP1 definitions).
 *  Prefetches for reading, matching the const `res`.
 */
#define VKERN_TEMPL_1V_T(FNAME,OP1,TYPE) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, const T* const, TYPE&);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       const T* const _res, \
                       TYPE &_f2) \
{ \
    /* REGISTER tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
    REGISTER TYPE f2(_f2), f1(0.0); \
    REGISTER const T* res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
    _f2 = f2 - f1; \
}
1345 
/** Routine template: 1-pointer reduction kernel accumulating in
 *  LONG_DOUBLE precision regardless of TYPE; the final sum is narrowed
 *  back into _f2 on exit.
 *
 *  NOTE(review): unlike VKERN_TEMPL_1V_T, only f2 is declared here --
 *  the token f1 handed to OP1 has no declaration, so any OP1 whose
 *  expansion actually uses its f1 argument will not compile with this
 *  template.  Presumably intentional (no correction term needed at
 *  long-double precision) -- verify against the OP1 users.
 */
#define VKERN_TEMPL_1V_T_LD(FNAME,OP1,TYPE) \
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
    (const unsigned long, const T* const, TYPE&);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
                       const T* const _res, \
                       TYPE &_f2) \
{ \
    /* REGISTER tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
    REGISTER LONG_DOUBLE f2(_f2); \
    REGISTER const T* res = _res; \
    REGISTER long i = sz; \
    VKERN_TEMPL_1V_PREF(OP1,T,PREFETCH_R,CACHE_LOC_READ); \
    \
    if (LIKELY(i >= UNROLL_DEPTH)) { \
        UNR_KERNEL3_PREP; \
        do { \
            UNR_KERNEL3(OP1); \
        } while (i >= UNROLL_DEPTH); \
        UNR_KERNEL3_FIX; \
    } \
    \
    for (; i; --i) { \
        OP1(*res, f1, f2); \
        ++res; \
    } \
    _f2 = f2; \
}
1375 
1376 #endif /* TBCI_UNROLL_PREFETCH_DEF2_H */