TBCI Numerical high perf. C++ Library  2.8.0
Macros
unroll_prefetch_simd_def.h File Reference

Macros for composing unrolled prefetching loops over arrays using SIMD intrinsics. More...

Go to the source code of this file.

Macros

#define UNROLL4_PREF_KERNEL5_SIMD(MDOP, ADV, T, SUF, UNA1, UNA2)
 TODO: Should be merged with unroll_prefetch_def.h. More...
 
#define UNROLL4_KERNEL5_SIMD(MDOP, ADV, SUF, UNA1, UNA2)
 Four times unrolled kernel for 5 args without prefetching. More...
 
#define VKERN_TEMPL_3V_NP_SIMD(MDOP, ADV, STP, SUF, UNA1, UNA2)
 
#define VKERN_TEMPL_3V_PLAIN_SIMD(MDOP, ADV, STP, SUF, UNA1, UNA2)
 
#define VKERN_TEMPL_3V_SISD(SDOP, COND, STP, SUF)
 
#define UNROLL4_PREF_KERNEL4_SIMD(MDOP, ADV, T, SUF, UNA)
 Four times unrolled kernel for 4 args with prefetching. More...
 
#define UNROLL4_KERNEL4_SIMD(MDOP, ADV, SUF, UNA)
 Four times unrolled kernel for 4 args without prefetching. More...
 
#define VKERN_TEMPL_2V_NP_SIMD(MDOP, ADV, STP, SUF, UNA)
 
#define VKERN_TEMPL_2V_PLAIN_SIMD(MDOP, ADV, STP, SUF, UNA)
 
#define VKERN_TEMPL_2V_SISD(SDOP, COND, STP, SUF)
 
#define UNROLL4_PREF_KERNEL3_SIMD(MDOP, ADV, T, SUF)
 Four times unrolled kernel for 3 args with prefetching. TODO: Prefetching. More...
 
#define UNROLL4_KERNEL3_SIMD(MDOP, ADV, SUF)
 Four times unrolled kernel for 3 args without prefetching. More...
 
#define VKERN_TEMPL_1V_NP_SIMD(MDOP, ADV, STP, SUF)
 
#define VKERN_TEMPL_1V_PLAIN_SIMD(MDOP, ADV, STP, SUF)
 
#define VKERN_TEMPL_1V_SISD(SDOP, COND, STP, SUF)
 
#define NO_TBCI_SIMD_UNROLL
 To unroll or not to unroll: define TBCI_SIMD_UNROLL if it's beneficial for your CPU (it's not for most newer ones) More...
 
#define VKERN_TEMPL_3V_K_SIMD(m, a, s, f, u1, u2)   VKERN_TEMPL_3V_PLAIN_SIMD(m,a,s,f,u1,u2)
 
#define VKERN_TEMPL_2V_K_SIMD(m, a, s, f, u)   VKERN_TEMPL_2V_PLAIN_SIMD(m,a,s,f,u)
 
#define VKERN_TEMPL_1V_K_SIMD(m, a, s, f)   VKERN_TEMPL_1V_PLAIN_SIMD(m,a,s,f)
 
#define ALIGN_REQ   0x0f
 
#define MISALIGNMENT_CHECK(x)   ((unsigned long)x & ALIGN_REQ)
 
#define WARN_UNALIGN(v)   do {} while (0)
 WARN_UNALIGNED macro: If defined, the TBCI library will print a warning to stderr for unaligned SIMD accesses, which will be slower ... More...
 
#define VKERN_TEMPL_3V_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
 TODO: Check whether enabling the non-unrolled fixup (loop tail) is beneficial. More...
 
#define VKERN_TEMPL_3V_SIMD_UA(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
 Without the unaligned warning. More...
 
#define VKERN_TEMPL_3V_C_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
 
#define VKERN_TEMPL_3V_CC_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
 
#define VKERN_TEMPL_2V_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
 
#define VKERN_TEMPL_2V_C_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
 
#define VKERN_TEMPL_2V_CC_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
 
#define VKERN_TEMPL_2V_T_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
 
#define VKERN_TEMPL_2V_T_SIMD_VL(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
 
#define VKERN_TEMPL_1V_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
 
#define VKERN_TEMPL_1V_C_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
 
#define VKERN_TEMPL_1V_CC_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
 
#define VKERN_TEMPL_1V_T_SIMD(FNAME, OP, SSUF, MSUF, PREP, SFIN, FIN, ADV, TYPE, STP)
 

Detailed Description

Macros for composing unrolled prefetching loops over arrays using SIMD intrinsics.

(c) Kurt Garloff, kurt@.nosp@m.garl.nosp@m.off.d.nosp@m.e, 4/2005, GNU LGPL v2

Id:
unroll_prefetch_simd_def.h,v 1.1.2.28 2022/11/03 17:28:11 garloff Exp

Definition in file unroll_prefetch_simd_def.h.

Macro Definition Documentation

#define ALIGN_REQ   0x0f

Definition at line 257 of file unroll_prefetch_simd_def.h.

#define MISALIGNMENT_CHECK (   x)    ((unsigned long)x & ALIGN_REQ)

Definition at line 266 of file unroll_prefetch_simd_def.h.

#define NO_TBCI_SIMD_UNROLL

To unroll or not to unroll: define TBCI_SIMD_UNROLL if it's beneficial for your CPU (it's not for most newer ones)

Definition at line 230 of file unroll_prefetch_simd_def.h.

#define UNROLL4_KERNEL3_SIMD (   MDOP,
  ADV,
  SUF 
)
Value:
MDOP(res, f1, f2, SUF); \
MDOP(res+ADV, f1, f2, SUF); \
i -= 4*ADV; \
MDOP(res+2*ADV, f1, f2, SUF); \
MDOP(res+3*ADV, f1, f2, SUF); \
res += 4*ADV
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Four times unrolled kernel for 3 args without prefetching.

Definition at line 186 of file unroll_prefetch_simd_def.h.

#define UNROLL4_KERNEL4_SIMD (   MDOP,
  ADV,
  SUF,
  UNA 
)
Value:
MDOP(res, v1, f1, f2, SUF, UNA); \
MDOP(res+ADV, v1+ADV, f1, f2, SUF, UNA); \
i -= 4*ADV; \
MDOP(res+2*ADV, v1+2*ADV, f1, f2, SUF, UNA); \
MDOP(res+3*ADV, v1+3*ADV, f1, f2, SUF, UNA); \
v1 += 4*ADV; res += 4*ADV
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Four times unrolled kernel for 4 args without prefetching.

Definition at line 119 of file unroll_prefetch_simd_def.h.

#define UNROLL4_KERNEL5_SIMD (   MDOP,
  ADV,
  SUF,
  UNA1,
  UNA2 
)
Value:
MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
i -= 4*ADV; \
MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2);\
v1 += 4*ADV; \
MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
v2 += 4*ADV; \
MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
res += 4*ADV
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Four times unrolled kernel for 5 args without prefetching.

Definition at line 51 of file unroll_prefetch_simd_def.h.

#define UNROLL4_PREF_KERNEL3_SIMD (   MDOP,
  ADV,
  T,
  SUF 
)
Value:
if (EL_PER_CL(T) <= 1) { \
MDOP(res, f1, f2, SUF); \
MDOP(res+ADV, f1, f2, SUF); \
i -= 4*ADV; \
MDOP(res+2*ADV, f1, f2, SUF); \
MDOP(res+3*ADV, f1, f2, SUF); \
res += 4*ADV; \
} else if (EL_PER_CL(T) <= 2) { \
MDOP(res, f1, f2, SUF); \
MDOP(res+ADV, f1, f2, SUF); \
i -= 4*ADV; \
MDOP(res+2*ADV, f1, f2, SUF); \
MDOP(res+3*ADV, f1, f2, SUF); \
res += 4*ADV; \
} else { \
MDOP(res, f1, f2, SUF); \
MDOP(res+ADV, f1, f2, SUF); \
i -= 4*ADV; \
MDOP(res+2*ADV, f1, f2, SUF); \
MDOP(res+3*ADV, f1, f2, SUF); \
res += 4*ADV; \
}
#define EL_PER_CL(T)
Definition: perf_opt.h:172
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20

Four times unrolled kernel for 3 args with prefetching. TODO: Prefetching.

(FIXME: Is it needed? SSE2 capable CPUs do hardware prefetching, no???)

Definition at line 160 of file unroll_prefetch_simd_def.h.

#define UNROLL4_PREF_KERNEL4_SIMD (   MDOP,
  ADV,
  T,
  SUF,
  UNA 
)
Value:
if (EL_PER_CL(T) <= 1) { \
MDOP(res,v1,f1,f2,SUF,UNA); \
i -= 4*ADV; \
MDOP(res+ADV,v1+ADV,f1,f2,SUF,UNA); \
MDOP(res+2*ADV,v1+2*ADV,f1,f2,SUF,UNA); \
v1 += 4*ADV; \
MDOP(res+3*ADV,v1-ADV,f1,f2,SUF,UNA); \
res += 4*ADV; \
} else if (EL_PER_CL(T) <= 2) { \
MDOP(res,v1,f1,f2, SUF,UNA); \
i -= 4*ADV; \
MDOP(res+ADV,v1+ADV,f1,f2,SUF,UNA); \
MDOP(res+2*ADV,v1+2*ADV,f1,f2,SUF,UNA); \
v1 += 4*ADV; \
MDOP(res+3*ADV,v1-ADV,f1,f2,SUF,UNA); \
res += 4*ADV; \
} else { \
MDOP(res,v1,f1,f2,SUF,UNA); \
i -= 4*ADV; \
MDOP(res+ADV,v1+ADV,f1,f2,SUF,UNA); \
MDOP(res+2*ADV,v1+2*ADV,f1,f2,SUF,UNA); \
v1 += 4*ADV; \
MDOP(res+3*ADV,v1-ADV,f1,f2,SUF,UNA); \
res += 4*ADV; \
}
#define EL_PER_CL(T)
Definition: perf_opt.h:172
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20

Four times unrolled kernel for 4 args with prefetching.

Definition at line 90 of file unroll_prefetch_simd_def.h.

#define UNROLL4_PREF_KERNEL5_SIMD (   MDOP,
  ADV,
  T,
  SUF,
  UNA1,
  UNA2 
)
Value:
if (EL_PER_CL(T) <= 1) { \
MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
i -= 4*ADV; \
MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
v1 += 4*ADV; \
MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
v2 += 4*ADV; \
MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
res += 4*ADV; \
} else if (EL_PER_CL(T) <= 2) { \
MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
i -= 4*ADV; \
MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
v1 += 4*ADV; \
MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
v2 += 4*ADV; \
MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
res += 4*ADV; \
} else { \
MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
i -= 4*ADV; \
MDOP(res+ADV,v1+ADV,v2+ADV,f1,f2,SUF,UNA1,UNA2); \
v1 += 4*ADV; \
MDOP(res+2*ADV,v1-2*ADV,v2+2*ADV,f1,f2,SUF,UNA1,UNA2); \
v2 += 4*ADV; \
MDOP(res+3*ADV,v1-ADV,v2-ADV,f1,f2,SUF,UNA1,UNA2); \
res += 4*ADV; \
}
#define EL_PER_CL(T)
Definition: perf_opt.h:172
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20

TODO: Should be merged with unroll_prefetch_def.h.

Note that we dropped all PREFETCH insns; HW that does SSE2 in general does prefetching as well, so we rather settle for smaller kernels. Four times unrolled kernel for 5 args with prefetching.

Definition at line 19 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_1V_C_SIMD (   FNAME,
  OP,
  SSUF,
  MSUF,
  PREP,
  SFIN,
  FIN,
  ADV,
  TYPE,
  STP 
)
Value:
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
TYPE* RESTRICT const _res, \
LCTYPED(TYPE) _f2)) \
{ \
REGISTER TYPE *res= _res; \
PREP(_f2); \
REGISTER long i = sz; \
/* Make sure we have proper alignment */ \
VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
SFIN; \
VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
FIN(_f2); \
}
#define VKERN_TEMPL_1V_SISD(SDOP, COND, STP, SUF)
#define REGISTER
Definition: basics.h:108
#define VKERN_TEMPL_1V_K_SIMD(m, a, s, f)
#define MISALIGNMENT_CHECK(x)
#define VEC_INLINE
Definition: basics.h:1266
#define TWEAK(x)
Definition: basics.h:486
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define LCTYPED(T)
Definition: plain_def.h:14
#define RESTRICT
Definition: basics.h:89

Definition at line 571 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_1V_CC_SIMD (   FNAME,
  OP,
  SSUF,
  MSUF,
  PREP,
  SFIN,
  FIN,
  ADV,
  TYPE,
  STP 
)
Value:
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
TYPE* RESTRICT const _res, \
LCTYPED(TYPE) _f1, \
LCTYPED(TYPE) _f2)) \
{ \
REGISTER TYPE *res= _res; \
PREP(_f1, _f2); \
REGISTER long i = sz; \
/* Make sure we have proper alignment */ \
VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
SFIN; \
VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
FIN(_f1, _f2); \
}
#define VKERN_TEMPL_1V_SISD(SDOP, COND, STP, SUF)
#define REGISTER
Definition: basics.h:108
#define VKERN_TEMPL_1V_K_SIMD(m, a, s, f)
#define MISALIGNMENT_CHECK(x)
#define VEC_INLINE
Definition: basics.h:1266
#define TWEAK(x)
Definition: basics.h:486
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define LCTYPED(T)
Definition: plain_def.h:14
#define RESTRICT
Definition: basics.h:89

Definition at line 588 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_1V_K_SIMD (   m,
  a,
  s,
  f 
)    VKERN_TEMPL_1V_PLAIN_SIMD(m,a,s,f)

Definition at line 241 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_1V_NP_SIMD (   MDOP,
  ADV,
  STP,
  SUF 
)
Value:
if (LIKELY(i >= 4*ADV)) { \
STP TMP UNUSED; \
STP y UNUSED; \
STP t UNUSED; \
do { \
UNROLL4_KERNEL3_SIMD(MDOP,ADV,SUF); \
} while (i >= 4*ADV); \
}
#define UNROLL4_KERNEL3_SIMD(MDOP, ADV, SUF)
Four times unrolled kernel for 3 args without prefetching.
#define UNUSED
Definition: basics.h:471
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > & y
Definition: LM_fit.h:172
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100

Definition at line 195 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_1V_PLAIN_SIMD (   MDOP,
  ADV,
  STP,
  SUF 
)
Value:
while (i >= ADV) { \
STP TMP UNUSED; \
STP y UNUSED; \
STP t UNUSED; \
MDOP(res,f1,f2,SUF); \
i -= ADV; res += ADV; \
}
#define UNUSED
Definition: basics.h:471
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > & y
Definition: LM_fit.h:172
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Definition at line 206 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_1V_SIMD (   FNAME,
  OP,
  SSUF,
  MSUF,
  PREP,
  SFIN,
  FIN,
  ADV,
  TYPE,
  STP 
)
Value:
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
TYPE* RESTRICT const _res)) \
{ \
REGISTER TYPE *res= _res; \
PREP; \
REGISTER long i = sz; \
/* Make sure we have proper alignment */ \
VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
SFIN; \
VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
FIN; \
}
#define VKERN_TEMPL_1V_SISD(SDOP, COND, STP, SUF)
#define REGISTER
Definition: basics.h:108
#define VKERN_TEMPL_1V_K_SIMD(m, a, s, f)
#define MISALIGNMENT_CHECK(x)
#define VEC_INLINE
Definition: basics.h:1266
#define TWEAK(x)
Definition: basics.h:486
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define RESTRICT
Definition: basics.h:89

Definition at line 555 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_1V_SISD (   SDOP,
  COND,
  STP,
  SUF 
)
Value:
while (COND && i) { \
STP TMP UNUSED; \
STP y UNUSED; \
STP t UNUSED; \
SDOP(res,f1,f2,SUF); \
--i; ++res; \
}
#define UNUSED
Definition: basics.h:471
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > & y
Definition: LM_fit.h:172
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Definition at line 216 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_1V_T_SIMD (   FNAME,
  OP,
  SSUF,
  MSUF,
  PREP,
  SFIN,
  FIN,
  ADV,
  TYPE,
  STP 
)
Value:
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
const TYPE* const _res, \
TYPE &_f2)) \
{ \
REGISTER const TYPE *res= _res; \
/* PREP(0.0,_f2); */ \
PREP(_f2); \
REGISTER long i = sz; \
/* Make sure we have proper alignment */ \
VKERN_TEMPL_1V_K_SIMD(OP,ADV,STP,MSUF); \
SFIN; \
VKERN_TEMPL_1V_SISD(OP,true,STP,SSUF); \
/* FIN(_f1,_f2); */ \
FIN(_f2); \
}
#define VKERN_TEMPL_1V_SISD(SDOP, COND, STP, SUF)
#define REGISTER
Definition: basics.h:108
#define VKERN_TEMPL_1V_K_SIMD(m, a, s, f)
#define MISALIGNMENT_CHECK(x)
#define VEC_INLINE
Definition: basics.h:1266
#define TWEAK(x)
Definition: basics.h:486
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Definition at line 606 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_2V_C_SIMD (   FNAME,
  OP,
  SSUF,
  MSUF,
  PREP,
  SFIN,
  FIN,
  ADV,
  TYPE,
  STP 
)
Value:
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
TYPE* RESTRICT const _res, \
const TYPE* RESTRICT const _v1, \
LCTYPED(TYPE) _f2)) \
{ \
REGISTER const TYPE *v1 = _v1; \
REGISTER TYPE *res= _res; \
PREP(_f2); \
REGISTER long i = sz; \
/* Make sure we have proper alignment */ \
VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u) \
} else { \
VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
} \
SFIN; \
VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
FIN(_f2); \
}
#define REGISTER
Definition: basics.h:108
#define WARN_UNALIGN(v)
WARN_UNALIGNED macro: If defined, the TBCI library will print a warning to stderr for unaligned SIMD ...
#define VKERN_TEMPL_2V_K_SIMD(m, a, s, f, u)
#define MISALIGNMENT_CHECK(x)
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define TWEAK(x)
Definition: basics.h:486
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define LCTYPED(T)
Definition: plain_def.h:14
#define RESTRICT
Definition: basics.h:89
#define VKERN_TEMPL_2V_SISD(SDOP, COND, STP, SUF)

Definition at line 454 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_2V_CC_SIMD (   FNAME,
  OP,
  SSUF,
  MSUF,
  PREP,
  SFIN,
  FIN,
  ADV,
  TYPE,
  STP 
)
Value:
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
TYPE* RESTRICT const _res, \
const TYPE* RESTRICT const _v1, \
LCTYPED(TYPE) _f1, \
LCTYPED(TYPE) _f2)) \
{ \
REGISTER const TYPE *v1 = _v1; \
REGISTER TYPE *res= _res; \
PREP(_f1, _f2); \
REGISTER long i = sz; \
/* Make sure we have proper alignment */ \
VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u) \
} else { \
VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
} \
SFIN; \
VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
FIN(_f1, _f2); \
}
#define REGISTER
Definition: basics.h:108
#define WARN_UNALIGN(v)
WARN_UNALIGNED macro: If defined, the TBCI library will print a warning to stderr for unaligned SIMD ...
#define VKERN_TEMPL_2V_K_SIMD(m, a, s, f, u)
#define MISALIGNMENT_CHECK(x)
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define TWEAK(x)
Definition: basics.h:486
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define LCTYPED(T)
Definition: plain_def.h:14
#define RESTRICT
Definition: basics.h:89
#define VKERN_TEMPL_2V_SISD(SDOP, COND, STP, SUF)

Definition at line 478 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_2V_K_SIMD (   m,
  a,
  s,
  f,
  u 
)    VKERN_TEMPL_2V_PLAIN_SIMD(m,a,s,f,u)

Definition at line 240 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_2V_NP_SIMD (   MDOP,
  ADV,
  STP,
  SUF,
  UNA 
)
Value:
if (LIKELY(i >= 4*ADV)) { \
STP TMP, LD UNUSED, t UNUSED; \
do { \
UNROLL4_KERNEL4_SIMD(MDOP,ADV,SUF,UNA); \
} while (i >= 4*ADV); \
}
#define UNUSED
Definition: basics.h:471
int i
Definition: LM_fit.h:71
#define UNROLL4_KERNEL4_SIMD(MDOP, ADV, SUF, UNA)
Four times unrolled kernel for 4 args without prefetching.
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100

Definition at line 129 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_2V_PLAIN_SIMD (   MDOP,
  ADV,
  STP,
  SUF,
  UNA 
)
Value:
while (i >= ADV) { \
STP TMP, LD UNUSED, t UNUSED; \
MDOP(res, v1, f1, f2, SUF, UNA); \
i -= ADV; v1 += ADV; res += ADV; \
}
#define UNUSED
Definition: basics.h:471
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Definition at line 138 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_2V_SIMD (   FNAME,
  OP,
  SSUF,
  MSUF,
  PREP,
  SFIN,
  FIN,
  ADV,
  TYPE,
  STP 
)
Value:
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
TYPE* RESTRICT const _res, \
const TYPE* RESTRICT const _v1)) \
{ \
REGISTER TYPE *res = _res; \
REGISTER const TYPE *v1 = _v1; \
PREP; \
REGISTER long i = sz; \
/* Make sure we have proper alignment */ \
VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u) \
} else { \
VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
} \
SFIN; \
VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
FIN; \
}
#define REGISTER
Definition: basics.h:108
#define WARN_UNALIGN(v)
WARN_UNALIGNED macro: If defined, the TBCI library will print a warning to stderr for unaligned SIMD ...
#define VKERN_TEMPL_2V_K_SIMD(m, a, s, f, u)
#define MISALIGNMENT_CHECK(x)
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define TWEAK(x)
Definition: basics.h:486
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define RESTRICT
Definition: basics.h:89
#define VKERN_TEMPL_2V_SISD(SDOP, COND, STP, SUF)

Definition at line 431 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_2V_SISD (   SDOP,
  COND,
  STP,
  SUF 
)
Value:
while (COND && i) { \
STP TMP, LD UNUSED, t UNUSED; \
SDOP(res,v1,f1,f2,SUF,u); \
--i; ++v1; ++res; \
}
#define UNUSED
Definition: basics.h:471
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Definition at line 146 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_2V_T_SIMD (   FNAME,
  OP,
  SSUF,
  MSUF,
  PREP,
  SFIN,
  FIN,
  ADV,
  TYPE,
  STP 
)
Value:
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
const TYPE* RESTRICT const _res, \
const TYPE* RESTRICT const _v1, \
TYPE &_f2)) \
{ \
REGISTER const TYPE *res= _res, *v1 = _v1; \
/* PREP(0.0,_f2); */ \
PREP(_f2); \
REGISTER long i = sz; \
/* Make sure we have proper alignment */ \
VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u) \
} else { \
VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
} \
SFIN; \
VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
/* FIN(_f1,_f2); */ \
FIN(_f2); \
}
#define REGISTER
Definition: basics.h:108
#define WARN_UNALIGN(v)
WARN_UNALIGNED macro: If defined, the TBCI library will print a warning to stderr for unaligned SIMD ...
#define VKERN_TEMPL_2V_K_SIMD(m, a, s, f, u)
#define MISALIGNMENT_CHECK(x)
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define TWEAK(x)
Definition: basics.h:486
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define RESTRICT
Definition: basics.h:89
#define VKERN_TEMPL_2V_SISD(SDOP, COND, STP, SUF)

Definition at line 503 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_2V_T_SIMD_VL (   FNAME,
  OP,
  SSUF,
  MSUF,
  PREP,
  SFIN,
  FIN,
  ADV,
  TYPE,
  STP 
)
Value:
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
const TYPE* RESTRICT const _res, \
const TYPE* RESTRICT const _v1, \
volatile long &_f2)) \
{ \
REGISTER const TYPE *res= _res, *v1 = _v1; \
/* PREP(0.0,_f2); */ \
PREP(_f2); \
REGISTER long i = sz; \
REGISTER int rg = 0; \
/* Make sure we have proper alignment */ \
VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,u) \
} else { \
VKERN_TEMPL_2V_K_SIMD(OP,ADV,STP,MSUF,); \
} \
SFIN; \
VKERN_TEMPL_2V_SISD(OP,true,STP,SSUF); \
/* FIN(_f1,_f2); */ \
_fin: \
FIN(_f2); \
}
#define REGISTER
Definition: basics.h:108
#define WARN_UNALIGN(v)
WARN_UNALIGNED macro: If defined, the TBCI library will print a warning to stderr for unaligned SIMD ...
#define VKERN_TEMPL_2V_K_SIMD(m, a, s, f, u)
#define MISALIGNMENT_CHECK(x)
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define TWEAK(x)
Definition: basics.h:486
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define RESTRICT
Definition: basics.h:89
#define VKERN_TEMPL_2V_SISD(SDOP, COND, STP, SUF)

Definition at line 528 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_3V_C_SIMD (   FNAME,
  OP,
  SSUF,
  MSUF,
  PREP,
  SFIN,
  FIN,
  ADV,
  TYPE,
  STP 
)
Value:
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
TYPE* RESTRICT const _res, \
const TYPE* RESTRICT const _v1, \
const TYPE* RESTRICT const _v2, \
LCTYPED(TYPE) _f2)) \
{ \
REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
REGISTER TYPE *res = _res; \
PREP(_f2); \
REGISTER long i = sz; \
/* Make sure we have proper alignment */ \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
} else if (MISALIGNMENT_CHECK(v1)) { \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
} else if (MISALIGNMENT_CHECK(v2)) { \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
} else { \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
} \
SFIN; \
VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
FIN(_f2); \
}
#define REGISTER
Definition: basics.h:108
#define WARN_UNALIGN(v)
WARN_UNALIGNED macro: If defined, the TBCI library will print a warning to stderr for unaligned SIMD ...
#define VKERN_TEMPL_3V_SISD(SDOP, COND, STP, SUF)
#define MISALIGNMENT_CHECK(x)
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define TWEAK(x)
Definition: basics.h:486
int i
Definition: LM_fit.h:71
#define VKERN_TEMPL_3V_K_SIMD(m, a, s, f, u1, u2)
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define LCTYPED(T)
Definition: plain_def.h:14
#define RESTRICT
Definition: basics.h:89

Definition at line 367 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_3V_CC_SIMD (   FNAME,
  OP,
  SSUF,
  MSUF,
  PREP,
  SFIN,
  FIN,
  ADV,
  TYPE,
  STP 
)
Value:
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
TYPE* RESTRICT const _res, \
const TYPE* RESTRICT const _v1, \
const TYPE* RESTRICT const _v2, \
LCTYPED(TYPE) _f1, \
LCTYPED(TYPE) _f2)) \
{ \
REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
REGISTER TYPE *res = _res; \
PREP(_f1, _f2); \
REGISTER long i = sz; \
/* Make sure we have proper alignment */ \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
} else if (MISALIGNMENT_CHECK(v1)) { \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
} else if (MISALIGNMENT_CHECK(v2)) { \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
} else { \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
} \
SFIN; \
VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
FIN(_f1, _f2); \
}
#define REGISTER
Definition: basics.h:108
#define WARN_UNALIGN(v)
WARN_UNALIGNED macro: If defined, the TBCI library will print a warning to stderr for unaligned SIMD ...
#define VKERN_TEMPL_3V_SISD(SDOP, COND, STP, SUF)
#define MISALIGNMENT_CHECK(x)
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define TWEAK(x)
Definition: basics.h:486
int i
Definition: LM_fit.h:71
#define VKERN_TEMPL_3V_K_SIMD(m, a, s, f, u1, u2)
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define LCTYPED(T)
Definition: plain_def.h:14
#define RESTRICT
Definition: basics.h:89

Definition at line 398 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_3V_K_SIMD (   m,
  a,
  s,
  f,
  u1,
  u2 
)    VKERN_TEMPL_3V_PLAIN_SIMD(m,a,s,f,u1,u2)

Definition at line 239 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_3V_NP_SIMD (   MDOP,
  ADV,
  STP,
  SUF,
  UNA1,
  UNA2 
)
Value:
if (LIKELY(i >= 4*ADV)) { \
STP TMP, LD UNUSED; \
do { \
UNROLL4_KERNEL5_SIMD(MDOP,ADV,SUF,UNA1,UNA2); \
} while (i >= 4*ADV); \
}
#define UNROLL4_KERNEL5_SIMD(MDOP, ADV, SUF, UNA1, UNA2)
Four times unrolled kernel for 5 args without prefetching.
#define UNUSED
Definition: basics.h:471
int i
Definition: LM_fit.h:71
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100

Definition at line 63 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_3V_PLAIN_SIMD (   MDOP,
  ADV,
  STP,
  SUF,
  UNA1,
  UNA2 
)
Value:
while (i >= ADV) { \
STP TMP, LD UNUSED; \
MDOP(res,v1,v2,f1,f2,SUF,UNA1,UNA2); \
i -= ADV; res += ADV; v1 += ADV; v2 +=ADV; \
}
#define UNUSED
Definition: basics.h:471
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Definition at line 72 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_3V_SIMD (   FNAME,
  OP,
  SSUF,
  MSUF,
  PREP,
  SFIN,
  FIN,
  ADV,
  TYPE,
  STP 
)
Value:
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
TYPE* RESTRICT const _res, \
const TYPE* RESTRICT const _v1, \
const TYPE* RESTRICT const _v2)) \
{ \
REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
REGISTER TYPE *res = _res; \
PREP; \
REGISTER long i = sz; \
/* Make sure we have proper alignment */ \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
} else if (MISALIGNMENT_CHECK(v1)) { \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
} else if (MISALIGNMENT_CHECK(v2)) { \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
} else { \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
} \
SFIN; \
VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
FIN; \
}
#define REGISTER
Definition: basics.h:108
#define WARN_UNALIGN(v)
WARN_UNALIGNED macro: If defined, the TBCI library will print a warning to stderr for unaligned SIMD ...
#define VKERN_TEMPL_3V_SISD(SDOP, COND, STP, SUF)
#define MISALIGNMENT_CHECK(x)
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define TWEAK(x)
Definition: basics.h:486
int i
Definition: LM_fit.h:71
#define VKERN_TEMPL_3V_K_SIMD(m, a, s, f, u1, u2)
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define RESTRICT
Definition: basics.h:89

TODO: Check whether enabling the non-unrolled fixup (loop tail) is beneficial.

Macro abuse ... FNAME: Function name; OP: operation for each loop (macro), SSE2 intrinsics; SSUF: argument passed to OP macro (suffix for single data operation); MSUF: ditto (suffix used for multiple data operation (SIMD)); PREP: Preparation macro before loop, called with _f1, _f2 as args (as available); SFIN: Cleanup macro after we're done with SIMD part; FIN: Cleanup macro before leaving, called with _f1, _f2 (as avail); ADV: How many elements the SIMD instructions handle per insn OP (2/4); TYPE: Standard C data type (float/double); STP: SIMD data type (__m128/__m128d).

Definition at line 306 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_3V_SIMD_UA (   FNAME,
  OP,
  SSUF,
  MSUF,
  PREP,
  SFIN,
  FIN,
  ADV,
  TYPE,
  STP 
)
Value:
TWEAK(template <> \
VEC_INLINE void FNAME < TYPE > (const unsigned long sz, \
TYPE* RESTRICT const _res, \
const TYPE* RESTRICT const _v1, \
const TYPE* RESTRICT const _v2)) \
{ \
REGISTER const TYPE *v1 = _v1, *v2 = _v2; \
REGISTER TYPE *res = _res; \
PREP; \
REGISTER long i = sz; \
/* Make sure we have proper alignment */ \
/*WARN_UNALIGN(v1); WARN_UNALIGN(v2);*/ \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,u); \
} else if (MISALIGNMENT_CHECK(v1)) { \
/*WARN_UNALIGN(v1);*/ \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,u,); \
} else if (MISALIGNMENT_CHECK(v2)) { \
/*WARN_UNALIGN(v2);*/ \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,u); \
} else { \
VKERN_TEMPL_3V_K_SIMD(OP,ADV,STP,MSUF,,); \
} \
SFIN; \
VKERN_TEMPL_3V_SISD(OP,true,STP,SSUF); \
FIN; \
}
#define REGISTER
Definition: basics.h:108
#define VKERN_TEMPL_3V_SISD(SDOP, COND, STP, SUF)
#define MISALIGNMENT_CHECK(x)
#define VEC_INLINE
Definition: basics.h:1266
if(value==0) return 1
#define TWEAK(x)
Definition: basics.h:486
int i
Definition: LM_fit.h:71
#define VKERN_TEMPL_3V_K_SIMD(m, a, s, f, u1, u2)
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define RESTRICT
Definition: basics.h:89

Without the unaligned warning.

Definition at line 337 of file unroll_prefetch_simd_def.h.

#define VKERN_TEMPL_3V_SISD (   SDOP,
  COND,
  STP,
  SUF 
)
Value:
while (COND && i) { \
STP TMP, LD UNUSED; \
SDOP(res,v1,v2,f1,f2,SUF,,); \
--i; ++res; ++v1; ++v2; \
}
#define UNUSED
Definition: basics.h:471
int i
Definition: LM_fit.h:71
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199

Definition at line 80 of file unroll_prefetch_simd_def.h.

#define WARN_UNALIGN (   v)    do {} while (0)

WARN_UNALIGNED macro: If defined, the TBCI library will print a warning to stderr for unaligned SIMD accesses, which will be slower ...

Definition at line 283 of file unroll_prefetch_simd_def.h.