/*
 * TBCI Numerical high perf. C++ Library 2.8.0
 * perf_opt.h
 */
8#ifndef TBCI_PERF_OPT_H
9#define TBCI_PERF_OPT_H
10
/*
 * gcc-4 and later:
 * Default to USE_PLAIN_VEC_KERNELS unless one of the three kernel
 * selection macros (USE_PLAIN_VEC_KERNELS, USE_UNR_VEC_KERNELS,
 * USE_UNR_VEC_KERNELS2) has already been defined.
 * The user should use -funroll-loops, -ftree-vectorize
 * and -fprefetch-loop-arrays as needed.
 */
#if defined(__GNUC__) && __GNUC__ >= 4 \
 && !defined(USE_PLAIN_VEC_KERNELS) \
 && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
# define USE_PLAIN_VEC_KERNELS
#endif
19
/*
 * DEC Alpha architecture
 *
 * Cache line default is 32.  Every compiler branch selects
 * USE_UNR_VEC_KERNELS when no kernel selection macro has been defined
 * before this point, so that default is set once for the architecture.
 * Unroll depth / prefetch distance defaults:
 *   gcc (any version)        : 8 / 8
 *   DEC/Compaq/HP compiler   : 1 / 4
 */
#ifdef __alpha__
# define DEF_CACHELINE_SZ 32
# if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
#  define USE_UNR_VEC_KERNELS
# endif
# ifdef __GNUC__ /* gcc <= 2 and gcc 3 use the same values */
#  define DEF_UNROLL_DEPTH 8
#  define DEF_PREFETCH_AHEAD 8
# else /* DEC/Compaq/HP compiled */
#  define DEF_UNROLL_DEPTH 1
#  define DEF_PREFETCH_AHEAD 4
# endif
#endif
46
/*
 * iA32 arch (also taken for x86-64)
 *
 * Cache line default: 64 for OPT_PENTIUM4, OPT_CORE2 or any x86-64
 * build, 32 otherwise.
 *
 * Unroll depth / prefetch distance / kernel defaults per branch:
 *   gcc, OPT_PENTIUM4, gcc <= 2             : 1 / 4  / UNR
 *   gcc, OPT_PENTIUM4, gcc >= 3             : 4 / 16 / PLAIN, NO_PREFETCH
 *   gcc, OPT_PENTIUM3|OPT_CORE2|SSE_PREFETCH: 4 / 8  / UNR,   NO_PREFETCH
 *   gcc, OPT_ATHLON|AMD_PREFETCH            : 1 / 8  / UNR,   NO_PREFETCH on x86-64
 *   gcc, no OPT_ selection                  : 8 / 4  / UNR
 *   other compilers                         : 8 / 8  / UNR
 *
 * A kernel default only applies while none of USE_PLAIN_VEC_KERNELS,
 * USE_UNR_VEC_KERNELS, USE_UNR_VEC_KERNELS2 is defined yet.
 * NO_PREFETCH is never set when FORCE_PREFETCH is defined.
 *
 * NOTE(review): the automatic SSE_PREFETCH / AMD_PREFETCH selection
 * tests OPT_ARCH_PENTIUM3 / OPT_ARCH_ATHLON, while the branches are
 * chosen by OPT_PENTIUM3 / OPT_ATHLON.  The OPT_ARCH_ names are not
 * defined anywhere in this header -- confirm they are set elsewhere
 * and are not a misspelling of the OPT_ names.
 */
#if defined(__i386__) || defined(__x86_64__)
# if defined(OPT_PENTIUM4) || defined(OPT_CORE2) || defined(__x86_64__)
#  define DEF_CACHELINE_SZ 64
# else
#  define DEF_CACHELINE_SZ 32
# endif
# ifdef __GNUC__
#  ifdef OPT_PENTIUM4
#   if __GNUC__ <= 2
#    if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
#     define USE_UNR_VEC_KERNELS
#    endif
#    define DEF_UNROLL_DEPTH 1 /* Hopefully -funroll-loops is enabled */
#    define DEF_PREFETCH_AHEAD 4
#   else /* gcc >= 3 */
#    if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
#     define USE_PLAIN_VEC_KERNELS /* P4 does HW prefetch */
#    endif
#    define DEF_UNROLL_DEPTH 4
#    define DEF_PREFETCH_AHEAD 16
#    if !defined(FORCE_PREFETCH) && !defined(NO_PREFETCH)
#     define NO_PREFETCH /* P4 does HW prefetch */
#    endif
#   endif
#  elif defined(OPT_PENTIUM3) || defined(OPT_CORE2) || defined(SSE_PREFETCH) /* Pentium M / 3 */
#   if !defined(FORCE_PREFETCH) && !defined(NO_PREFETCH)
#    define NO_PREFETCH /* Pentium-M does HW prefetch, Pentium-3 does not ... */
#   endif
    /* NO_PREFETCH can only still be undefined here if FORCE_PREFETCH is set */
#   if !defined(SSE_PREFETCH) && defined(OPT_ARCH_PENTIUM3) && !defined(NO_PREFETCH)
#    define SSE_PREFETCH
#   endif
#   if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
#    define USE_UNR_VEC_KERNELS
#   endif
#   define DEF_UNROLL_DEPTH 4
#   define DEF_PREFETCH_AHEAD 8
#  elif defined(OPT_ATHLON) || defined(AMD_PREFETCH) /* Athlon / Opteron */
#   if !defined(FORCE_PREFETCH) && !defined(NO_PREFETCH) && defined(__x86_64__)
#    define NO_PREFETCH /* AMD64 does HW prefetch */
#   endif
#   if !defined(AMD_PREFETCH) && defined(OPT_ARCH_ATHLON) && !defined(NO_PREFETCH)
#    define AMD_PREFETCH
#   endif
#   if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
#    define USE_UNR_VEC_KERNELS
#   endif
#   define DEF_UNROLL_DEPTH 1 /* Hopefully -funroll-loops is enabled */
#   define DEF_PREFETCH_AHEAD 8
#  else /* deflt proc */
#   if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
#    define USE_UNR_VEC_KERNELS
#   endif
#   define DEF_UNROLL_DEPTH 8
#   define DEF_PREFETCH_AHEAD 4
#  endif
# else /* ! __GNUC__ */
#  if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
#   define USE_UNR_VEC_KERNELS
#  endif
#  define DEF_UNROLL_DEPTH 8
#  define DEF_PREFETCH_AHEAD 8
# endif
#endif /* __i386__ || __x86_64__ */
111
112/* defaults */
113#if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
114# define USE_PLAIN_VEC_KERNELS
115#endif
116#ifndef DEF_UNROLL_DEPTH
117# define DEF_UNROLL_DEPTH 4
118#endif
119#ifndef DEF_PREFETCH_AHEAD
120# define DEF_PREFETCH_AHEAD 4
121#endif
122#ifndef DEF_CACHELINE_SZ
123# ifdef __WORDSIZE // Heuristics: Most 64bit arches have 64byte cache lines
124# define DEF_CACHELINE_SZ __WORDSIZE
125# else
126# define DEF_CACHELINE_SZ 32
127# endif
128#endif
129
/*
 * Default values for CACHE_LOC_READ and CACHE_LOC_WRITE.
 * NOTE(review): presumably cache locality hints for prefetches issued
 * ahead of reads and writes (GCC's __builtin_prefetch takes a locality
 * argument in the range 0..3) -- confirm against the code that uses
 * CACHE_LOC_READ / CACHE_LOC_WRITE.
 */
#define DEF_CACHE_LOC_READ 2
#define DEF_CACHE_LOC_WRITE 3
134
/*
 * Apply defaults
 *
 * Each tuning macro below keeps a value that was defined before this
 * point (e.g. with -D on the compiler command line) and otherwise
 * takes the matching DEF_ value.
 */

/* Prefetch distance; PREF_OFFS() multiplies it with the number of
 * elements per cache line, so it counts cache lines. */
#ifndef PREFETCH_AHEAD
# define PREFETCH_AHEAD DEF_PREFETCH_AHEAD
#endif
/* Unroll depth (not used inside this header). */
#ifndef UNROLL_DEPTH
# define UNROLL_DEPTH DEF_UNROLL_DEPTH
#endif
/* Cache line size; EL_PER_CL() divides it by sizeof(T), so it is in
 * the units of sizeof. */
#ifndef CACHELINE_SZ
# define CACHELINE_SZ DEF_CACHELINE_SZ
#endif
/* Cache locality values for reads and writes (not used inside this
 * header). */
#ifndef CACHE_LOC_READ
# define CACHE_LOC_READ DEF_CACHE_LOC_READ
#endif
#ifndef CACHE_LOC_WRITE
# define CACHE_LOC_WRITE DEF_CACHE_LOC_WRITE
#endif
170
/*
 * Those should be evaluated at compile time
 *
 * EL_PER_CL(T): number of objects of type T that fit into one cache
 *               line (CACHELINE_SZ / sizeof(T)), but never less than 1;
 *               the result has type signed int.
 * PREF_OFFS(T): EL_PER_CL(T) * PREFETCH_AHEAD, i.e. a prefetch offset
 *               counted in elements of type T.
 *
 * CACHELINE_SZ and PREFETCH_AHEAD may be supplied from outside as
 * arbitrary constant expressions (e.g. -DCACHELINE_SZ=32+32), so both
 * are parenthesized before use; the whole expansion is parenthesized
 * as well so the macros behave like a single primary expression.
 */
#define EL_PER_CL(T) ((signed)(((CACHELINE_SZ)/sizeof( T )) ? ((CACHELINE_SZ)/sizeof( T )) : 1))
#define PREF_OFFS(T) (EL_PER_CL(T)*(PREFETCH_AHEAD))
174
175#endif /* TBCI_PERF_OPT_H */