TBCI Numerical high perf. C++ Library  2.8.0
Macros
unroll_prefetch_def.h File Reference

macros for composing unrolled prefetching loops over arrays. More...

Go to the source code of this file.

Macros

#define LCTYPE(T)   REGISTER typename tbci_traits<T>::loop_const_refval_type
 Shortcut for loop const ref type. More...
 
#define LCTYPED(T)   REGISTER tbci_traits<T>::loop_const_refval_type
 
#define UNROLL_DEPTH   4
 When unrolling the loops, I had the following architectural details in mind: More...
 
#define UNROLL1_PREF_KERNEL5(OPER, T, CA0, CA1, CA2)
 Non-unrolled kernel for 5 args with prefetching. More...
 
#define UNROLL1_KERNEL5(OPER)
 Non-unrolled kernel for 5 args without prefetching. More...
 
#define UNROLL1_KERNEL5_PREPARE   do {} while(0)
 
#define UNROLL1_KERNEL5_FIXUP   do {} while(0)
 
#define UNROLL2_PREF_KERNEL5(OPER, T, CA0, CA1, CA2)
 Twice unrolled kernel for 5 args with prefetching. More...
 
#define UNROLL2_KERNEL5(OPER)
 Twice unrolled kernel for 5 args without prefetching. More...
 
#define UNROLL2_KERNEL5_PREPARE   do {} while(0)
 
#define UNROLL2_KERNEL5_FIXUP   do {} while(0)
 
#define UNROLL4_PREF_KERNEL5(OPER, T, CA0, CA1, CA2)
 Four times unrolled kernel for 5 args with prefetching. More...
 
#define UNROLL4_KERNEL5(OPER)
 Four times unrolled kernel for 5 args without prefetching. More...
 
#define UNROLL4_KERNEL5_PREPARE   do {} while(0)
 
#define UNROLL4_KERNEL5_FIXUP   do {} while(0)
 
#define UNROLL8_PREF_KERNEL5(OPER, T, CA0, CA1, CA2)
 Eight times unrolled kernel for 5 args with prefetching. More...
 
#define UNROLL8_KERNEL5(OPER)
 Eight times unrolled kernel for 5 args without prefetching. More...
 
#define UNROLL8_KERNEL5_PREPARE   do {} while(0)
 
#define UNROLL8_KERNEL5_FIXUP   do {} while(0)
 
#define PREF_AHEAD3(T, CA0, CA1, CA2)
 Initial prefetch ahead (3 pointers) More...
 
#define UNROLL1_PREF_KERNEL4_STRIDE(OPER, T, PREFETCH_X, CA0, CA1, RI)
 Non-unrolled kernel for 4 args with prefetching. More...
 
#define UNROLL1_PREF_KERNEL4(OPER, T, PREFETCH_X, CA0, CA1)   UNROLL1_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)
 
#define UNROLL1_KERNEL4_STRIDE(OPER, RI)
 Non-unrolled kernel for 4 args without prefetching. More...
 
#define UNROLL1_KERNEL4(OPER)   UNROLL1_KERNEL4_STRIDE(OPER,1)
 
#define UNROLL1_KERNEL4_PREPARE   do {} while(0)
 
#define UNROLL1_KERNEL4_FIXUP   do {} while(0)
 
#define UNROLL2_PREF_KERNEL4_STRIDE(OPER, T, PREFETCH_X, CA0, CA1, RI)
 Twice unrolled kernel for 4 args with prefetching. More...
 
#define UNROLL2_PREF_KERNEL4(OPER, T, PREFETCH_X, CA0, CA1)   UNROLL2_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)
 
#define UNROLL2_KERNEL4_STRIDE(OPER, RI)
 Twice unrolled kernel for 4 args without prefetching. More...
 
#define UNROLL2_KERNEL4(OPER)   UNROLL2_KERNEL4_STRIDE(OPER,1)
 
#define UNROLL2_KERNEL4_PREPARE   do {} while(0)
 
#define UNROLL2_KERNEL4_FIXUP   do {} while(0)
 
#define UNROLL4_PREF_KERNEL4_STRIDE(OPER, T, PREFETCH_X, CA0, CA1, RI)
 Four times unrolled kernel for 4 args with prefetching. More...
 
#define UNROLL4_PREF_KERNEL4(OPER, T, PREFETCH_X, CA0, CA1)   UNROLL4_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)
 
#define UNROLL4_KERNEL4_STRIDE(OPER, RI)
 Four times unrolled kernel for 4 args without prefetching. More...
 
#define UNROLL4_KERNEL4(OPER)   UNROLL4_KERNEL4_STRIDE(OPER,1)
 
#define UNROLL4_KERNEL4_PREPARE   do {} while(0)
 
#define UNROLL4_KERNEL4_FIXUP   do {} while(0)
 
#define UNROLL8_PREF_KERNEL4_STRIDE(OPER, T, PREFETCH_X, CA0, CA1, RI)
 Eight times unrolled kernel for 4 args with prefetching. More...
 
#define UNROLL8_PREF_KERNEL4(OPER, T, PREFETCH_X, CA0, CA1)   UNROLL8_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)
 
#define UNROLL8_KERNEL4_STRIDE(OPER, RI)
 Eight times unrolled kernel for 4 args without prefetching. More...
 
#define UNROLL8_KERNEL4(OPER)   UNROLL8_KERNEL4_STRIDE(OPER,1)
 
#define UNROLL8_KERNEL4_PREPARE   do {} while(0)
 
#define UNROLL8_KERNEL4_FIXUP   do {} while(0)
 
#define PREF_AHEAD2_STRIDE(T, PREFETCH_X, CA0, CA1, RI)
 Initial prefetch ahead (2 pointers) More...
 
#define PREF_AHEAD2(T, PREFETCH_X, CA0, CA1)   PREF_AHEAD2_STRIDE(T,PREFETCH_X,CA0,CA1,1)
 
#define UNROLL1_PREF_KERNEL3(OPER, T, PREFETCH_X, CA0)
 Non-unrolled kernel for 3 args with prefetching. More...
 
#define UNROLL1_KERNEL3(OPER)
 Non-unrolled kernel for 3 args without prefetching. More...
 
#define UNROLL1_KERNEL3_PREPARE   do {} while(0)
 
#define UNROLL1_KERNEL3_FIXUP   do {} while(0)
 
#define UNROLL2_PREF_KERNEL3(OPER, T, PREFETCH_X, CA0)
 Twice unrolled kernel for 3 args with prefetching. More...
 
#define UNROLL2_KERNEL3(OPER)
 Twice unrolled kernel for 3 args without prefetching. More...
 
#define UNROLL2_KERNEL3_PREPARE   do {} while(0)
 
#define UNROLL2_KERNEL3_FIXUP   do {} while(0)
 
#define UNROLL4_PREF_KERNEL3(OPER, T, PREFETCH_X, CA0)
 Four times unrolled kernel for 3 args with prefetching. More...
 
#define UNROLL4_KERNEL3(OPER)
 Four times unrolled kernel for 3 args without prefetching. More...
 
#define UNROLL4_KERNEL3_PREPARE   do {} while(0)
 
#define UNROLL4_KERNEL3_FIXUP   do {} while(0)
 
#define UNROLL8_PREF_KERNEL3(OPER, T, PREFETCH_X, CA0)
 Eight times unrolled kernel for 3 args with prefetching. More...
 
#define UNROLL8_KERNEL3(OPER)
 Eight times unrolled kernel for 3 args without prefetching. More...
 
#define UNROLL8_KERNEL3_PREPARE   do {} while(0)
 
#define UNROLL8_KERNEL3_FIXUP   do {} while(0)
 
#define PREF_AHEAD1(T, PREFETCH_X, CA0)
 Initial prefetch ahead (1 pointer) More...
 
#define UNR_PREF_KERNEL5   UNROLL4_PREF_KERNEL5
 
#define UNR_KERNEL5   UNROLL4_KERNEL5
 
#define UNR_KERNEL5_PREP   UNROLL4_KERNEL5_PREPARE
 
#define UNR_KERNEL5_FIX   UNROLL4_KERNEL5_FIXUP
 
#define UNR_PREF_KERNEL4   UNROLL4_PREF_KERNEL4
 
#define UNR_PREF_KERNEL4_STRIDE   UNROLL4_PREF_KERNEL4_STRIDE
 
#define UNR_KERNEL4   UNROLL4_KERNEL4
 
#define UNR_KERNEL4_STRIDE   UNROLL4_KERNEL4_STRIDE
 
#define UNR_KERNEL4_PREP   UNROLL4_KERNEL4_PREPARE
 
#define UNR_KERNEL4_FIX   UNROLL4_KERNEL4_FIXUP
 
#define UNR_PREF_KERNEL3   UNROLL4_PREF_KERNEL3
 
#define UNR_KERNEL3   UNROLL4_KERNEL3
 
#define UNR_KERNEL3_PREP   UNROLL4_KERNEL3_PREPARE
 
#define UNR_KERNEL3_FIX   UNROLL4_KERNEL3_FIXUP
 
#define VKERN_TEMPL_3V_PREF(OP, T)   do {} while (0)
 Fragments to be combined for different cases 1,2,3 vector fields 0,1,2 scalars to multiply with variable number of data elements per cacheline 1,2,4,8,16 cachelines ahead prefetch 1,2,4,8 fold unrolling. More...
 
#define VKERN_TEMPL_2V_PREF(OP, T, PREFETCH_X, CW)   do {} while (0)
 
#define VKERN_TEMPL_2V_PREF_STRIDE(OP2, T, PREFETCH_X, CW, RI)   do {} while (0)
 
#define VKERN_TEMPL_1V_PREF(OP, T, PREFETCH_X, CW)   do {} while (0)
 
#define VKERN_TEMPL_3V(FNAME, OP3)
 gcc-2.95.x seems to fail caching a const double& in a REGISTER. More...
 
#define VKERN_TEMPL_3V_C(FNAME, OP3)
 Operations of type vec = vec OP val * vec. More...
 
#define VKERN_TEMPL_3V_CC(FNAME, OP3)
 Operations of type vec = val * vec OP val * vec. More...
 
#define VKERN_TEMPL_2V(FNAME, OP2)
 Operations of type vec OP= vec. More...
 
#define VKERN_TEMPL_2V_C(FNAME, OP2)
 Operations of type VEC = VEC OP VAL or VAL OP VEC. More...
 
#define VKERN_TEMPL_2V_CC(FNAME, OP2)
 Operations of type VEC = VEC OP VAL or VAL OP VEC. More...
 
#define VKERN_TEMPL_2V_T(FNAME, OP2, TYPE)
 Operations of type TYPE = VEC OP VEC. More...
 
#define VKERN_TEMPL_2V_T_STRIDE(FNAME, OP2, TYPE)
 Operations of type TYPE = VEC OP VEC. More...
 
#define VKERN_TEMPL_1V(FNAME, OP1)
 Operations of type VEC = OP self. More...
 
#define VKERN_TEMPL_1V_C(FNAME, OP1)
 Operations of type VEC OP= VAL. More...
 
#define VKERN_TEMPL_1V_CC(FNAME, OP1)
 Operations of type VEC *= S OP= VAL. More...
 
#define VKERN_TEMPL_1V_T(FNAME, OP1, TYPE)
 Operations of type TYPE = OP VEC This includes the option to compensate for summation errors using f1. More...
 
#define VKERN_TEMPL_1V_T_LD(FNAME, OP1, TYPE)
 Operations of type TYPE = OP VEC This does exclude the option to compensate for summation errors using f1 but instead uses LONG_DOUBLE internally. More...
 

Detailed Description

macros for composing unrolled prefetching loops over arrays.

(c) Kurt Garloff, kurt@.nosp@m.garl.nosp@m.off.d.nosp@m.e, 7/2002, GNU LGPL v2

Id:
unroll_prefetch_def.h,v 1.1.2.34 2022/11/03 17:28:11 garloff Exp

Definition in file unroll_prefetch_def.h.

Macro Definition Documentation

#define LCTYPE (   T)    REGISTER typename tbci_traits<T>::loop_const_refval_type

Shortcut for loop const ref type.

Definition at line 14 of file unroll_prefetch_def.h.

#define LCTYPED (   T)    REGISTER tbci_traits<T>::loop_const_refval_type

Definition at line 15 of file unroll_prefetch_def.h.

#define PREF_AHEAD1 (   T,
  PREFETCH_X,
  CA0 
)

Initial prefetch ahead (1 pointer)

Definition at line 855 of file unroll_prefetch_def.h.

#define PREF_AHEAD2 (   T,
  PREFETCH_X,
  CA0,
  CA1 
)    PREF_AHEAD2_STRIDE(T,PREFETCH_X,CA0,CA1,1)

Definition at line 675 of file unroll_prefetch_def.h.

#define PREF_AHEAD2_STRIDE (   T,
  PREFETCH_X,
  CA0,
  CA1,
  RI 
)

Initial prefetch ahead (2 pointers)

Definition at line 616 of file unroll_prefetch_def.h.

#define PREF_AHEAD3 (   T,
  CA0,
  CA1,
  CA2 
)

Initial prefetch ahead (3 pointers)

Definition at line 295 of file unroll_prefetch_def.h.

#define UNR_KERNEL3   UNROLL4_KERNEL3

Definition at line 944 of file unroll_prefetch_def.h.

#define UNR_KERNEL3_FIX   UNROLL4_KERNEL3_FIXUP

Definition at line 946 of file unroll_prefetch_def.h.

#define UNR_KERNEL3_PREP   UNROLL4_KERNEL3_PREPARE

Definition at line 945 of file unroll_prefetch_def.h.

#define UNR_KERNEL4   UNROLL4_KERNEL4

Definition at line 938 of file unroll_prefetch_def.h.

#define UNR_KERNEL4_FIX   UNROLL4_KERNEL4_FIXUP

Definition at line 941 of file unroll_prefetch_def.h.

#define UNR_KERNEL4_PREP   UNROLL4_KERNEL4_PREPARE

Definition at line 940 of file unroll_prefetch_def.h.

#define UNR_KERNEL4_STRIDE   UNROLL4_KERNEL4_STRIDE

Definition at line 939 of file unroll_prefetch_def.h.

#define UNR_KERNEL5   UNROLL4_KERNEL5

Definition at line 932 of file unroll_prefetch_def.h.

#define UNR_KERNEL5_FIX   UNROLL4_KERNEL5_FIXUP

Definition at line 934 of file unroll_prefetch_def.h.

#define UNR_KERNEL5_PREP   UNROLL4_KERNEL5_PREPARE

Definition at line 933 of file unroll_prefetch_def.h.

#define UNR_PREF_KERNEL3   UNROLL4_PREF_KERNEL3

Definition at line 943 of file unroll_prefetch_def.h.

#define UNR_PREF_KERNEL4   UNROLL4_PREF_KERNEL4

Definition at line 936 of file unroll_prefetch_def.h.

#define UNR_PREF_KERNEL4_STRIDE   UNROLL4_PREF_KERNEL4_STRIDE

Definition at line 937 of file unroll_prefetch_def.h.

#define UNR_PREF_KERNEL5   UNROLL4_PREF_KERNEL5

Definition at line 931 of file unroll_prefetch_def.h.

#define UNROLL1_KERNEL3 (   OPER)
Value:
--i; \
OPER(res[0], f1, f2); \
++res
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Non-unrolled kernel for 3 args without prefetching.

Definition at line 690 of file unroll_prefetch_def.h.

#define UNROLL1_KERNEL3_FIXUP   do {} while(0)

Definition at line 696 of file unroll_prefetch_def.h.

#define UNROLL1_KERNEL3_PREPARE   do {} while(0)

Definition at line 695 of file unroll_prefetch_def.h.

#define UNROLL1_KERNEL4 (   OPER)    UNROLL1_KERNEL4_STRIDE(OPER,1)

Definition at line 403 of file unroll_prefetch_def.h.

#define UNROLL1_KERNEL4_FIXUP   do {} while(0)

Definition at line 407 of file unroll_prefetch_def.h.

#define UNROLL1_KERNEL4_PREPARE   do {} while(0)

Definition at line 406 of file unroll_prefetch_def.h.

#define UNROLL1_KERNEL4_STRIDE (   OPER,
  RI 
)
Value:
--i; \
OPER(res[0], v1[0], f1, f2); \
++v1; res+=RI
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Non-unrolled kernel for 4 args without prefetching.

Definition at line 398 of file unroll_prefetch_def.h.

#define UNROLL1_KERNEL5 (   OPER)
Value:
--i; \
OPER(res[0], v1[0], v2[0], f1, f2); \
++v1; ++v2; ++res
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Non-unrolled kernel for 5 args without prefetching.

Definition at line 59 of file unroll_prefetch_def.h.

#define UNROLL1_KERNEL5_FIXUP   do {} while(0)

Definition at line 65 of file unroll_prefetch_def.h.

#define UNROLL1_KERNEL5_PREPARE   do {} while(0)

Definition at line 64 of file unroll_prefetch_def.h.

#define UNROLL1_PREF_KERNEL3 (   OPER,
  T,
  PREFETCH_X,
  CA0 
)
Value:
OPER(res[0], f1, f2); \
--i; \
PREFETCH_X(res+PREF_OFFS(T), CA0); \
++res
#define PREF_OFFS(T)
Definition: perf_opt.h:173
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20

Non-unrolled kernel for 3 args with prefetching.

Definition at line 683 of file unroll_prefetch_def.h.

#define UNROLL1_PREF_KERNEL4 (   OPER,
  T,
  PREFETCH_X,
  CA0,
  CA1 
)    UNROLL1_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)

Definition at line 394 of file unroll_prefetch_def.h.

#define UNROLL1_PREF_KERNEL4_STRIDE (   OPER,
  T,
  PREFETCH_X,
  CA0,
  CA1,
  RI 
)
Value:
OPER(res[0], v1[0], f1, f2); \
--i; \
PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
++v1; \
PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
res+=RI
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
#define PREF_OFFS(T)
Definition: perf_opt.h:173
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20

Non-unrolled kernel for 4 args with prefetching.

Definition at line 386 of file unroll_prefetch_def.h.

#define UNROLL1_PREF_KERNEL5 (   OPER,
  T,
  CA0,
  CA1,
  CA2 
)
Value:
OPER(res[0], v1[0], v2[0], f1, f2); \
--i; \
PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
++v1; \
PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
++v2; \
++res
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
#define PREF_OFFS(T)
Definition: perf_opt.h:173
#define PREFETCH_W(addr, loc)
Definition: basics.h:749
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20

Non-unrolled kernel for 5 args with prefetching.

Definition at line 48 of file unroll_prefetch_def.h.

#define UNROLL2_KERNEL3 (   OPER)
Value:
OPER(res[0], f1, f2); \
i -= 2; \
OPER(res[1], f1, f2); \
res += 2
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Twice unrolled kernel for 3 args without prefetching.

Definition at line 718 of file unroll_prefetch_def.h.

#define UNROLL2_KERNEL3_FIXUP   do {} while(0)

Definition at line 725 of file unroll_prefetch_def.h.

#define UNROLL2_KERNEL3_PREPARE   do {} while(0)

Definition at line 724 of file unroll_prefetch_def.h.

#define UNROLL2_KERNEL4 (   OPER)    UNROLL2_KERNEL4_STRIDE(OPER,1)

Definition at line 442 of file unroll_prefetch_def.h.

#define UNROLL2_KERNEL4_FIXUP   do {} while(0)

Definition at line 446 of file unroll_prefetch_def.h.

#define UNROLL2_KERNEL4_PREPARE   do {} while(0)

Definition at line 445 of file unroll_prefetch_def.h.

#define UNROLL2_KERNEL4_STRIDE (   OPER,
  RI 
)
Value:
OPER(res[0], v1[0], f1, f2); \
v1 += 2; i -= 2; \
OPER(res[RI],v1[-1],f1, f2); \
res += 2*RI
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Twice unrolled kernel for 4 args without prefetching.

Definition at line 436 of file unroll_prefetch_def.h.

#define UNROLL2_KERNEL5 (   OPER)
Value:
OPER(res[0], v1[0], v2[0], f1, f2); \
v1 += 2; i -= 2; \
OPER(res[1], v1[-1], v2[1], f1, f2); \
v2 += 2; res += 2
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Twice unrolled kernel for 5 args without prefetching.

Definition at line 97 of file unroll_prefetch_def.h.

#define UNROLL2_KERNEL5_FIXUP   do {} while(0)

Definition at line 104 of file unroll_prefetch_def.h.

#define UNROLL2_KERNEL5_PREPARE   do {} while(0)

Definition at line 103 of file unroll_prefetch_def.h.

#define UNROLL2_PREF_KERNEL3 (   OPER,
  T,
  PREFETCH_X,
  CA0 
)
Value:
if (EL_PER_CL(T) <= 1) { \
OPER(res[0], f1, f2); \
PREFETCH_X(res+PREF_OFFS(T), CA0); \
i -= 2; \
OPER(res[1], f1, f2); \
PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
res += 2; \
} else { \
OPER(res[0], f1, f2); \
i -= 2; \
OPER(res[1], f1, f2); \
PREFETCH_X(res+PREF_OFFS(T), CA0); \
res += 2; \
} \
#define EL_PER_CL(T)
Definition: perf_opt.h:172
#define PREF_OFFS(T)
Definition: perf_opt.h:173
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20

Twice unrolled kernel for 3 args with prefetching.

Definition at line 700 of file unroll_prefetch_def.h.

#define UNROLL2_PREF_KERNEL4 (   OPER,
  T,
  PREFETCH_X,
  CA0,
  CA1 
)    UNROLL2_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)

Definition at line 432 of file unroll_prefetch_def.h.

#define UNROLL2_PREF_KERNEL4_STRIDE (   OPER,
  T,
  PREFETCH_X,
  CA0,
  CA1,
  RI 
)
Value:
if (EL_PER_CL(T) <= 1) { \
i -= 2; \
OPER(res[0], v1[0], f1, f2); \
PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
OPER(res[RI],v1[1], f1, f2); \
v1 += 2; \
PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
PREFETCH_X(res+RI*(PREF_OFFS(T)+1), CA0); \
res += 2*RI; \
} else { \
i -= 2; \
OPER(res[0], v1[0], f1, f2); \
PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
OPER(res[RI],v1[1], f1, f2); \
v1 += 2; \
PREFETCH_X(res+RI*PREF_OFFS(T), CA0); \
res += 2*RI; \
} \
#define EL_PER_CL(T)
Definition: perf_opt.h:172
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
#define PREF_OFFS(T)
Definition: perf_opt.h:173
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20

Twice unrolled kernel for 4 args with prefetching.

Definition at line 411 of file unroll_prefetch_def.h.

#define UNROLL2_PREF_KERNEL5 (   OPER,
  T,
  CA0,
  CA1,
  CA2 
)
Value:
if (EL_PER_CL(T) <= 1) { \
i -= 2; \
OPER(res[0], v1[0], v2[0], f1, f2); \
PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
PREFETCH_R(v1 +PREF_OFFS(T)+1, CA1); \
v1 += 2; \
PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
PREFETCH_R(v2 +PREF_OFFS(T)+1, CA2); \
OPER(res[1], v1[-1], v2[1], f1, f2); \
v2 += 2; \
res += 2; \
} else { \
i -= 2; \
OPER(res[0], v1[0], v2[0], f1, f2); \
PREFETCH_R(v1 +PREF_OFFS(T), CA1); \
v1 += 2; \
PREFETCH_R(v2 +PREF_OFFS(T), CA2); \
OPER(res[1], v1[-1], v2[1], f1, f2); \
v2 += 2; \
res += 2; \
} \
#define EL_PER_CL(T)
Definition: perf_opt.h:172
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
#define PREF_OFFS(T)
Definition: perf_opt.h:173
#define PREFETCH_W(addr, loc)
Definition: basics.h:749
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20

Twice unrolled kernel for 5 args with prefetching.

Definition at line 69 of file unroll_prefetch_def.h.

#define UNROLL4_KERNEL3 (   OPER)
Value:
OPER(res[0], f1, f2); \
OPER(res[1], f1, f2); \
i -= 4; \
OPER(res[2], f1, f2); \
OPER(res[3], f1, f2); \
res += 4
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Four times unrolled kernel for 3 args without prefetching.

Definition at line 761 of file unroll_prefetch_def.h.

#define UNROLL4_KERNEL3_FIXUP   do {} while(0)

Definition at line 770 of file unroll_prefetch_def.h.

#define UNROLL4_KERNEL3_PREPARE   do {} while(0)

Definition at line 769 of file unroll_prefetch_def.h.

#define UNROLL4_KERNEL4 (   OPER)    UNROLL4_KERNEL4_STRIDE(OPER,1)

Definition at line 503 of file unroll_prefetch_def.h.

#define UNROLL4_KERNEL4_FIXUP   do {} while(0)

Definition at line 507 of file unroll_prefetch_def.h.

#define UNROLL4_KERNEL4_PREPARE   do {} while(0)

Definition at line 506 of file unroll_prefetch_def.h.

#define UNROLL4_KERNEL4_STRIDE (   OPER,
  RI 
)
Value:
OPER(res[0], v1[0], f1, f2); \
OPER(res[RI],v1[1], f1, f2); \
v1 += 4; i -= 4; \
OPER(res[2*RI], v1[-2], f1, f2); \
OPER(res[3*RI], v1[-1], f1, f2); \
res += 4*RI
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Four times unrolled kernel for 4 args without prefetching.

Definition at line 495 of file unroll_prefetch_def.h.

#define UNROLL4_KERNEL5 (   OPER)
Value:
OPER(res[0], v1[0], v2[0], f1, f2); \
i -= 4; \
OPER(res[1], v1[1], v2[1], f1, f2); \
v1 += 4; \
OPER(res[2], v1[-2], v2[2], f1, f2); \
v2 += 4; \
OPER(res[3], v1[-1], v2[-1], f1, f2); \
res += 4
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Four times unrolled kernel for 5 args without prefetching.

Definition at line 160 of file unroll_prefetch_def.h.

#define UNROLL4_KERNEL5_FIXUP   do {} while(0)

Definition at line 171 of file unroll_prefetch_def.h.

#define UNROLL4_KERNEL5_PREPARE   do {} while(0)

Definition at line 170 of file unroll_prefetch_def.h.

#define UNROLL4_PREF_KERNEL3 (   OPER,
  T,
  PREFETCH_X,
  CA0 
)
Value:
if (EL_PER_CL(T) <= 1) { \
OPER(res[0], f1, f2); \
i -= 4; \
PREFETCH_X(res+PREF_OFFS(T), CA0); \
OPER(res[1], f1, f2); \
PREFETCH_X(res+PREF_OFFS(T)+1, CA0); \
OPER(res[2], f1, f2); \
PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
OPER(res[3], f1, f2); \
PREFETCH_X(res+PREF_OFFS(T)+3, CA0); \
res += 4; \
} else if (EL_PER_CL(T) <= 2) { \
OPER(res[0], f1, f2); \
PREFETCH_X(res+PREF_OFFS(T), CA0); \
OPER(res[1], f1, f2); \
i -= 4; \
OPER(res[2], f1, f2); \
PREFETCH_X(res+PREF_OFFS(T)+2, CA0); \
OPER(res[3], f1, f2); \
res += 4; \
} else { \
OPER(res[0], f1, f2); \
i -= 4; \
OPER(res[1], f1, f2); \
OPER(res[2], f1, f2); \
PREFETCH_X(res+PREF_OFFS(T), CA0); \
OPER(res[3], f1, f2); \
res += 4; \
}
#define EL_PER_CL(T)
Definition: perf_opt.h:172
#define PREF_OFFS(T)
Definition: perf_opt.h:173
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20

Four times unrolled kernel for 3 args with prefetching.

Definition at line 729 of file unroll_prefetch_def.h.

#define UNROLL4_PREF_KERNEL4 (   OPER,
  T,
  PREFETCH_X,
  CA0,
  CA1 
)    UNROLL4_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)

Definition at line 491 of file unroll_prefetch_def.h.

#define UNROLL4_PREF_KERNEL4_STRIDE (   OPER,
  T,
  PREFETCH_X,
  CA0,
  CA1,
  RI 
)

Four times unrolled kernel for 4 args with prefetching.

Definition at line 450 of file unroll_prefetch_def.h.

#define UNROLL4_PREF_KERNEL5 (   OPER,
  T,
  CA0,
  CA1,
  CA2 
)

Four times unrolled kernel for 5 args with prefetching.

Definition at line 108 of file unroll_prefetch_def.h.

#define UNROLL8_KERNEL3 (   OPER)
Value:
OPER(res[0], f1, f2); \
OPER(res[1], f1, f2); \
OPER(res[2], f1, f2); \
OPER(res[3], f1, f2); \
i -= 8; \
OPER(res[4], f1, f2); \
OPER(res[5], f1, f2); \
OPER(res[6], f1, f2); \
OPER(res[7], f1, f2); \
res += 8
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Eight times unrolled kernel for 3 args without prefetching.

Definition at line 838 of file unroll_prefetch_def.h.

#define UNROLL8_KERNEL3_FIXUP   do {} while(0)

Definition at line 851 of file unroll_prefetch_def.h.

#define UNROLL8_KERNEL3_PREPARE   do {} while(0)

Definition at line 850 of file unroll_prefetch_def.h.

#define UNROLL8_KERNEL4 (   OPER)    UNROLL8_KERNEL4_STRIDE(OPER,1)

Definition at line 608 of file unroll_prefetch_def.h.

#define UNROLL8_KERNEL4_FIXUP   do {} while(0)

Definition at line 612 of file unroll_prefetch_def.h.

#define UNROLL8_KERNEL4_PREPARE   do {} while(0)

Definition at line 611 of file unroll_prefetch_def.h.

#define UNROLL8_KERNEL4_STRIDE (   OPER,
  RI 
)
Value:
OPER(res[0], v1[0], f1, f2); \
OPER(res[RI],v1[1], f1, f2); \
OPER(res[2*RI], v1[2], f1, f2); \
OPER(res[3*RI], v1[3], f1, f2); \
v1 += 8; i -= 8; \
OPER(res[4*RI], v1[-4], f1, f2); \
OPER(res[5*RI], v1[-3], f1, f2); \
OPER(res[6*RI], v1[-2], f1, f2); \
OPER(res[7*RI], v1[-1], f1, f2); \
res += 8*RI
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Eight times unrolled kernel for 4 args without prefetching.

Definition at line 596 of file unroll_prefetch_def.h.

#define UNROLL8_KERNEL5 (   OPER)
Value:
OPER(res[0], v1[0], v2[0], f1, f2); \
OPER(res[1], v1[1], v2[1], f1, f2); \
i -= 8; \
OPER(res[2], v1[2], v2[2], f1, f2); \
OPER(res[3], v1[3], v2[3], f1, f2); \
v1 += 8; \
OPER(res[4], v1[-4], v2[4], f1, f2); \
OPER(res[5], v1[-3], v2[5], f1, f2); \
v2 += 8; \
OPER(res[6], v1[-2], v2[-2], f1, f2); \
OPER(res[7], v1[-1], v2[-1], f1, f2); \
res += 8
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Eight times unrolled kernel for 5 args without prefetching.

Definition at line 277 of file unroll_prefetch_def.h.

#define UNROLL8_KERNEL5_FIXUP   do {} while(0)

Definition at line 292 of file unroll_prefetch_def.h.

#define UNROLL8_KERNEL5_PREPARE   do {} while(0)

Definition at line 291 of file unroll_prefetch_def.h.

#define UNROLL8_PREF_KERNEL3 (   OPER,
  T,
  PREFETCH_X,
  CA0 
)

Eight times unrolled kernel for 3 args with prefetching.

Definition at line 774 of file unroll_prefetch_def.h.

#define UNROLL8_PREF_KERNEL4 (   OPER,
  T,
  PREFETCH_X,
  CA0,
  CA1 
)    UNROLL8_PREF_KERNEL4_STRIDE(OPER,T,PREFETCH_X,CA0,CA1,1)

Definition at line 592 of file unroll_prefetch_def.h.

#define UNROLL8_PREF_KERNEL4_STRIDE (   OPER,
  T,
  PREFETCH_X,
  CA0,
  CA1,
  RI 
)

Eight times unrolled kernel for 4 args with prefetching.

Definition at line 511 of file unroll_prefetch_def.h.

#define UNROLL8_PREF_KERNEL5 (   OPER,
  T,
  CA0,
  CA1,
  CA2 
)

Eight times unrolled kernel for 5 args with prefetching.

Definition at line 175 of file unroll_prefetch_def.h.

#define UNROLL_DEPTH   4

When unrolling the loops, I had the following architectural details in mind:

  • We have a superscalar pipelined instruction execution. Which means
    • We can execute more than one instruction in parallel per cycle. This was the reason to mix FP and Integer insns.
    • That we should have some delay between doing a computation and using the result, as the computation has to go through the pipeline before the result becomes available.
  • We have a relatively slow memory and fast caches; therefore we issue prefetch instructions to trigger memory loads before the data is needed. These prefetch insns are supposed to trigger the data to be transferred from memory into the local cache without causing the pipelines to stall. When the data is actually accessed, it should be in local cache and not cause any delay.
  • Though prefetching beyond the array should not cause segfaults, we avoid it for performance reasons. Especially important on SMP.

Funny enough, with this little knowledge, we do better than any compiler I found. Compaq cxx on alpha comes close, though. KG.

Definition at line 40 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_1V (   FNAME,
  OP1 
)
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
(const unsigned long, T* RESTRICT const);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
T* RESTRICT const _res) \
{ \
PREFETCH_W(_res, 3); \
REGISTER long i = sz; \
REGISTER T* res = _res; \
do { \
UNR_KERNEL3(OP1); \
} while (i >= UNROLL_DEPTH); \
} \ \
for (; i; --i) { \
OP1(*res, f1, f2); \
++res; \
} \
}
#define CACHE_LOC_WRITE
Definition: perf_opt.h:168
#define REGISTER
Definition: basics.h:108
#define UNR_KERNEL3_FIX
#define UNROLL_DEPTH
When unrolling the loops, I had the following architectural details in mind:
#define UNR_KERNEL3_PREP
#define VKERN_TEMPL_1V_PREF(OP, T, PREFETCH_X, CW)
for(REGISTER T *p1=c.vec,*p2=b.vec;p1< c.endvec;p1++, p2++)*p1
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define UNR_KERNEL3
#define PREFETCH_W(addr, loc)
Definition: basics.h:749
#define INST(x)
Definition: basics.h:238
int i
Definition: LM_fit.h:71
Definition: bvector.h:54
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20
#define RESTRICT
Definition: basics.h:89
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100

Operations of type VEC = OP self.

Definition at line 1328 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_1V_C (   FNAME,
  OP1 
)
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
(const unsigned long, T* RESTRICT const, LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
T* RESTRICT const _res, \
LCTYPE(T) f2) \
{ \
PREFETCH_W(_res, 3); \
REGISTER long i = sz; \
REGISTER T* res = _res; \
do { \
UNR_KERNEL3(OP1); \
} while (i >= UNROLL_DEPTH); \
} \ \
for (; i; --i) { \
OP1(*res, f1, f2); \
++res; \
} \
}
#define CACHE_LOC_WRITE
Definition: perf_opt.h:168
#define REGISTER
Definition: basics.h:108
#define UNR_KERNEL3_FIX
#define UNROLL_DEPTH
When unrolling the loops, I had the following architectural details in mind:
#define UNR_KERNEL3_PREP
#define LCTYPED(T)
#define VKERN_TEMPL_1V_PREF(OP, T, PREFETCH_X, CW)
for(REGISTER T *p1=c.vec,*p2=b.vec;p1< c.endvec;p1++, p2++)*p1
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
#define LCTYPE(T)
Shortcut for loop const ref type.
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define UNR_KERNEL3
#define PREFETCH_W(addr, loc)
Definition: basics.h:749
#define INST(x)
Definition: basics.h:238
int i
Definition: LM_fit.h:71
Definition: bvector.h:54
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20
#define RESTRICT
Definition: basics.h:89
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100

Operations of type VEC OP= VAL.

Definition at line 1355 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_1V_CC (   FNAME,
  OP1 
)
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
(const unsigned long, T* RESTRICT const, \
LCTYPED(T), LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
T* RESTRICT const _res, \
LCTYPE(T) f1, \
LCTYPE(T) f2) \
{ \
PREFETCH_W(_res, 3); \
REGISTER long i = sz; \
REGISTER T* res = _res; \
do { \
UNR_KERNEL3(OP1); \
} while (i >= UNROLL_DEPTH); \
} \ \
for (; i; --i) { \
OP1(*res, f1, f2); \
++res; \
} \
}
#define CACHE_LOC_WRITE
Definition: perf_opt.h:168
#define REGISTER
Definition: basics.h:108
#define UNR_KERNEL3_FIX
#define UNROLL_DEPTH
When unrolling the loops, I had the following architectural details in mind:
#define UNR_KERNEL3_PREP
#define LCTYPED(T)
#define VKERN_TEMPL_1V_PREF(OP, T, PREFETCH_X, CW)
for(REGISTER T *p1=c.vec,*p2=b.vec;p1< c.endvec;p1++, p2++)*p1
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
#define LCTYPE(T)
Shortcut for loop const ref type.
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define UNR_KERNEL3
#define PREFETCH_W(addr, loc)
Definition: basics.h:749
#define INST(x)
Definition: basics.h:238
int i
Definition: LM_fit.h:71
Definition: bvector.h:54
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20
#define RESTRICT
Definition: basics.h:89
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100

Operations of type VEC *= S OP= VAL.

Definition at line 1383 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_1V_PREF (   OP,
  T,
  PREFETCH_X,
  CW 
)    do {} while (0)

Definition at line 1043 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_1V_T (   FNAME,
  OP1,
  TYPE 
)
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
(const unsigned long, const T* const, TYPE&);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
const T* const _res, \
TYPE &_f2) \
{ \
PREFETCH_R(_res, 3); \
/* REGISTER typename tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
REGISTER TYPE f2(_f2), f1(0.0); \
REGISTER const T* res = _res; \
REGISTER long i = sz; \
do { \
UNR_KERNEL3(OP1); \
} while (i >= UNROLL_DEPTH); \
} \ \
for (; i; --i) { \
OP1(*res, f1, f2); \
++res; \
} \
_f2 = f2 - f1; \
}
#define REGISTER
Definition: basics.h:108
#define UNR_KERNEL3_FIX
#define UNROLL_DEPTH
When unrolling the loops, I had the following architectural details in mind:
#define UNR_KERNEL3_PREP
#define VKERN_TEMPL_1V_PREF(OP, T, PREFETCH_X, CW)
for(REGISTER T *p1=c.vec,*p2=b.vec;p1< c.endvec;p1++, p2++)*p1
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define UNR_KERNEL3
#define INST(x)
Definition: basics.h:238
int i
Definition: LM_fit.h:71
Definition: bvector.h:54
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20
#define CACHE_LOC_READ
Cache locality for read-from and written-to pointers. 0: don't cache (streaming data, only accessed once).
Definition: perf_opt.h:165
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100

Operations of type TYPE = OP VEC. This includes the option to compensate for summation errors using f1.

Definition at line 1416 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_1V_T_LD (   FNAME,
  OP1,
  TYPE 
)
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
(const unsigned long, const T* const, TYPE&);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
const T* const _res, \
TYPE &_f2) \
{ \
PREFETCH_R(_res, 3); \
/* REGISTER typename tbci_traits<TYPE>::loop_refval_type f2(_f2); */ \
REGISTER const T* res = _res; \
REGISTER long i = sz; \
do { \
UNR_KERNEL3(OP1); \
} while (i >= UNROLL_DEPTH); \
} \ \
for (; i; --i) { \
OP1(*res, f1, f2); \
++res; \
} \
_f2 = f2; \
}
#define REGISTER
Definition: basics.h:108
#define UNR_KERNEL3_FIX
#define UNROLL_DEPTH
When unrolling the loops, I had the following architectural details in mind:
#define UNR_KERNEL3_PREP
#define VKERN_TEMPL_1V_PREF(OP, T, PREFETCH_X, CW)
for(REGISTER T *p1=c.vec,*p2=b.vec;p1< c.endvec;p1++, p2++)*p1
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define UNR_KERNEL3
#define INST(x)
Definition: basics.h:238
int i
Definition: LM_fit.h:71
Definition: bvector.h:54
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20
#define CACHE_LOC_READ
Cache locality for read-from and written-to pointers. 0: don't cache (streaming data, only accessed once).
Definition: perf_opt.h:165
#define LONG_DOUBLE
Definition: basics.h:219
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100

Operations of type TYPE = OP VEC. This excludes the option to compensate for summation errors using f1 but instead uses LONG_DOUBLE internally.

Definition at line 1450 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_2V (   FNAME,
  OP2 
)
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
(const unsigned long, T* RESTRICT const, const T* RESTRICT const);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
T* RESTRICT const _res, \
const T* RESTRICT const _v1) \
{ \
PREFETCH_W(_res, 3); \
PREFETCH_R(_v1, 3); \
REGISTER const T *v1 = _v1; \
REGISTER T* res = _res; \
REGISTER long i = sz; \
do { \
UNR_KERNEL4(OP2); \
} while (i >= UNROLL_DEPTH); \
} \ \
for (; i; --i) { \
OP2(*res, *v1, f1, f2); \
++v1; ++res; \
} \
}
#define VKERN_TEMPL_2V_PREF(OP, T, PREFETCH_X, CW)
#define CACHE_LOC_WRITE
Definition: perf_opt.h:168
#define REGISTER
Definition: basics.h:108
#define UNROLL_DEPTH
When unrolling the loops, I had the following architectural details in mind:
#define UNR_KERNEL4_FIX
for(REGISTER T *p1=c.vec,*p2=b.vec;p1< c.endvec;p1++, p2++)*p1
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
#define UNR_KERNEL4
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define PREFETCH_W(addr, loc)
Definition: basics.h:749
#define INST(x)
Definition: basics.h:238
#define UNR_KERNEL4_PREP
int i
Definition: LM_fit.h:71
Definition: bvector.h:54
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20
#define RESTRICT
Definition: basics.h:89
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100

Operations of type vec OP= vec.

Definition at line 1161 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_2V_C (   FNAME,
  OP2 
)
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
(const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
T* RESTRICT const _res, \
const T* RESTRICT const _v1, \
LCTYPE(T) f2) \
{ \
PREFETCH_W(_res, 3); \
PREFETCH_R(_v1, 3); \
REGISTER const T *v1 = _v1; \
REGISTER T* res = _res; \
REGISTER long i = sz; \
do { \
UNR_KERNEL4(OP2); \
} while (i >= UNROLL_DEPTH); \
} \ \
for (; i; --i) { \
OP2(*res, *v1, f1, f2); \
++v1; ++res; \
} \
}
#define VKERN_TEMPL_2V_PREF(OP, T, PREFETCH_X, CW)
#define CACHE_LOC_WRITE
Definition: perf_opt.h:168
#define REGISTER
Definition: basics.h:108
#define UNROLL_DEPTH
When unrolling the loops, I had the following architectural details in mind:
#define LCTYPED(T)
#define UNR_KERNEL4_FIX
for(REGISTER T *p1=c.vec,*p2=b.vec;p1< c.endvec;p1++, p2++)*p1
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
#define UNR_KERNEL4
#define LCTYPE(T)
Shortcut for loop const ref type.
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define PREFETCH_W(addr, loc)
Definition: basics.h:749
#define INST(x)
Definition: basics.h:238
#define UNR_KERNEL4_PREP
int i
Definition: LM_fit.h:71
Definition: bvector.h:54
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20
#define RESTRICT
Definition: basics.h:89
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100

Operations of type VEC = VEC OP VAL or VAL OP VEC.

Definition at line 1191 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_2V_CC (   FNAME,
  OP2 
)
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
(const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
LCTYPED(T), LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
T* RESTRICT const _res, \
const T* RESTRICT const _v1, \
LCTYPE(T) f1, \
LCTYPE(T) f2) \
{ \
PREFETCH_W(_res, 3); \
PREFETCH_R(_v1, 3); \
REGISTER const T *v1 = _v1; \
REGISTER T* res = _res; \
REGISTER long i = sz; \
do { \
UNR_KERNEL4(OP2); \
} while (i >= UNROLL_DEPTH); \
} \ \
for (; i; --i) { \
OP2(*res, *v1, f1, f2); \
++v1; ++res; \
} \
}
#define VKERN_TEMPL_2V_PREF(OP, T, PREFETCH_X, CW)
#define CACHE_LOC_WRITE
Definition: perf_opt.h:168
#define REGISTER
Definition: basics.h:108
#define UNROLL_DEPTH
When unrolling the loops, I had the following architectural details in mind:
#define LCTYPED(T)
#define UNR_KERNEL4_FIX
for(REGISTER T *p1=c.vec,*p2=b.vec;p1< c.endvec;p1++, p2++)*p1
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
#define UNR_KERNEL4
#define LCTYPE(T)
Shortcut for loop const ref type.
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define PREFETCH_W(addr, loc)
Definition: basics.h:749
#define INST(x)
Definition: basics.h:238
#define UNR_KERNEL4_PREP
int i
Definition: LM_fit.h:71
Definition: bvector.h:54
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20
#define RESTRICT
Definition: basics.h:89
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100

Operations of type VEC = VEC OP VAL or VAL OP VEC.

Definition at line 1223 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_2V_PREF (   OP,
  T,
  PREFETCH_X,
  CW 
)    do {} while (0)

Definition at line 1041 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_2V_PREF_STRIDE (   OP2,
  T,
  PREFETCH_X,
  CW,
  RI 
)    do {} while (0)

Definition at line 1042 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_2V_T (   FNAME,
  OP2,
  TYPE 
)

Operations of type TYPE = VEC OP VEC.

Definition at line 1256 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_2V_T_STRIDE (   FNAME,
  OP2,
  TYPE 
)

Operations of type TYPE = VEC OP VEC.

Definition at line 1292 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_3V (   FNAME,
  OP3 
)
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
(const unsigned long, T* RESTRICT const, const T* RESTRICT const, const T* RESTRICT const);) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
T* RESTRICT const _res, \
const T* RESTRICT const _v1, \
const T* RESTRICT const _v2) \
{ \
PREFETCH_W(_res, 3); \
PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
REGISTER const T *v1 = _v1, *v2 = _v2; \
REGISTER T *res = _res; \
REGISTER long i = sz; \
do { \
UNR_KERNEL5(OP3); \
} while (i >= UNROLL_DEPTH); \
} \ \
for (; i; --i) { \
OP3(*res, *v1, *v2, f1, f2); \
++v1; ++v2; ++res; \
} \
}
#define REGISTER
Definition: basics.h:108
#define UNROLL_DEPTH
When unrolling the loops, I had the following architectural details in mind:
#define UNR_KERNEL5_FIX
for(REGISTER T *p1=c.vec,*p2=b.vec;p1< c.endvec;p1++, p2++)*p1
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define PREFETCH_W(addr, loc)
Definition: basics.h:749
#define INST(x)
Definition: basics.h:238
int i
Definition: LM_fit.h:71
Definition: bvector.h:54
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20
#define RESTRICT
Definition: basics.h:89
#define UNR_KERNEL5_PREP
#define VKERN_TEMPL_3V_PREF(OP, T)
Fragments to be combined for different cases 1,2,3 vector fields 0,1,2 scalars to multiply with varia...
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100
#define UNR_KERNEL5

gcc-2.95.x seems to fail caching a const double& in a REGISTER.

So we have to use a local REGISTER var to force it to do so, for maximum performance. However, this is only beneficial in case we have an elementary type that fits into a REGISTER. It would be nice to have macros that automatically do it when needed. However, sizeof(T) can't be evaluated by the preprocessor, so we can't know. Instead we use explicit specialization of our templates. Operations of type vec = vec OP vec.

Definition at line 1063 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_3V_C (   FNAME,
  OP3 
)
Value:
INST(template <typename T> class Vector<T> friend VEC_INLINE void FNAME \
(const unsigned long, T* RESTRICT const, const T* RESTRICT const, \
const T* RESTRICT const, LCTYPED(T));) \
template <typename T> \
VEC_INLINE void FNAME (const unsigned long sz, \
T* RESTRICT const _res, \
const T* RESTRICT const _v1, \
const T* RESTRICT const _v2, \
LCTYPE(T) f2) \
{ \
PREFETCH_W(_res, 3); \
PREFETCH_R(_v1, 3); PREFETCH_R(_v2, 3); \
REGISTER const T *v1 = _v1, *v2 = _v2; \
REGISTER T *res = _res; \
REGISTER long i = sz; \
do { \
UNR_KERNEL5(OP3); \
} while (i >= UNROLL_DEPTH); \
} \ \
for (; i; --i) { \
OP3(*res, *v1, *v2, f1, f2); \
++v1; ++v2; ++res; \
} \
}
#define REGISTER
Definition: basics.h:108
#define UNROLL_DEPTH
When unrolling the loops, I had the following architectural details in mind:
#define LCTYPED(T)
#define UNR_KERNEL5_FIX
for(REGISTER T *p1=c.vec,*p2=b.vec;p1< c.endvec;p1++, p2++)*p1
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
#define LCTYPE(T)
Shortcut for loop const ref type.
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define PREFETCH_W(addr, loc)
Definition: basics.h:749
#define INST(x)
Definition: basics.h:238
int i
Definition: LM_fit.h:71
Definition: bvector.h:54
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20
#define RESTRICT
Definition: basics.h:89
#define UNR_KERNEL5_PREP
#define VKERN_TEMPL_3V_PREF(OP, T)
Fragments to be combined for different cases 1,2,3 vector fields 0,1,2 scalars to multiply with varia...
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100
#define UNR_KERNEL5

Operations of type vec = vec OP val * vec.

Definition at line 1094 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_3V_CC (   FNAME,
  OP3 
)

Operations of type vec = val * vec OP val * vec.

Definition at line 1127 of file unroll_prefetch_def.h.

#define VKERN_TEMPL_3V_PREF (   OP,
  T 
)    do {} while (0)

Fragments to be combined for the different cases: 1, 2, or 3 vector fields; 0, 1, or 2 scalars to multiply with; a variable number of data elements per cacheline; prefetch 1, 2, 4, 8, or 16 cachelines ahead; 1-, 2-, 4-, or 8-fold unrolling.

The structure is the same, always. (1) Before anything else, start read prefetching. (2) Unrolled and (both read and write) prefetching loop. (3) Unrolled loop without prefetching (for the elements where prefetching would reach beyond the array, which could be a performance problem — and for write prefetching maybe a real problem). (4) Non-unrolled loop for the remaining elements.

Definition at line 1040 of file unroll_prefetch_def.h.