TBCI Numerical high perf. C++ Library  2.8.0
perf_opt.h
Go to the documentation of this file.
1 
8 #ifndef TBCI_PERF_OPT_H
9 #define TBCI_PERF_OPT_H
10 
11 /* gcc-4:
12  * The user should use -funroll-loops -ftree-vectorize
13  * and -fprefetch-loop-arrays as needed.
14  */
/* With GCC 4 or newer, select USE_PLAIN_VEC_KERNELS unless one of the three
 * kernel-selection macros (USE_PLAIN_VEC_KERNELS, USE_UNR_VEC_KERNELS,
 * USE_UNR_VEC_KERNELS2) is already defined. Because this runs before the
 * per-architecture sections, their "!defined(...)" guards then leave the
 * kernel choice alone for GCC >= 4. */
15 #if defined(__GNUC__) && __GNUC__ >= 4 && !defined(USE_PLAIN_VEC_KERNELS) \
16  && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
17 # define USE_PLAIN_VEC_KERNELS
18 #endif
19 
20 /* DEC Alpha architecture */
/* Tuning defaults when __alpha__ is defined:
 *   - DEF_CACHELINE_SZ is fixed at 32.
 *   - Every compiler branch selects USE_UNR_VEC_KERNELS, but only if no
 *     kernel-selection macro has been defined yet.
 *   - GCC (any version):  DEF_UNROLL_DEPTH 8, DEF_PREFETCH_AHEAD 8.
 *   - Other compilers:    DEF_UNROLL_DEPTH 1, DEF_PREFETCH_AHEAD 4.
 * The "gcc <= 2" and "gcc >= 3" branches currently set identical values. */
21 #ifdef __alpha__
22 # define DEF_CACHELINE_SZ 32
23 # ifdef __GNUC__
24 # if __GNUC__ <= 2 /* gcc 2.x and older */
25 # if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
26 //# define USE_PLAIN_VEC_KERNELS
27 # define USE_UNR_VEC_KERNELS
28 # endif
29 # define DEF_UNROLL_DEPTH 8
30 # define DEF_PREFETCH_AHEAD 8
31 # else /* gcc >= 3 */
32 # if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
33 # define USE_UNR_VEC_KERNELS
34 # endif
35 # define DEF_UNROLL_DEPTH 8
36 # define DEF_PREFETCH_AHEAD 8
37 # endif
38 # else /* not GCC: DEC/Compaq/HP compiler */
39 # if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
40 # define USE_UNR_VEC_KERNELS
41 # endif
42 # define DEF_UNROLL_DEPTH 1
43 # define DEF_PREFETCH_AHEAD 4
44 # endif
45 #endif
46 
47 /* iA32 arch */
/* Tuning defaults for 32- and 64-bit x86 (__i386__ or __x86_64__).
 *
 * DEF_CACHELINE_SZ: 64 if OPT_PENTIUM4, OPT_CORE2 or __x86_64__ is defined,
 * otherwise 32.
 *
 * Under GCC the first matching CPU branch wins (a kernel macro is only
 * defined when none of the three kernel-selection macros exists yet):
 *
 *   branch                                  kernels  unroll  ahead
 *   OPT_PENTIUM4, gcc <= 2                  UNR      1       4
 *   OPT_PENTIUM4, gcc >= 3                  PLAIN    4       16
 *   OPT_PENTIUM3 / OPT_CORE2 / SSE_PREFETCH UNR      4       8
 *   OPT_ATHLON / AMD_PREFETCH               UNR      1       8
 *   none of the above                       UNR      8       4
 *
 * Non-GCC compilers always get UNR kernels, unroll 8, prefetch-ahead 8.
 *
 * Several branches define NO_PREFETCH; this is always skipped when the
 * user has defined FORCE_PREFETCH (or NO_PREFETCH already exists). */
48 #if defined(__i386__) || defined(__x86_64__)
49 # if defined(OPT_PENTIUM4) || defined(OPT_CORE2) || defined(__x86_64__)
50 # define DEF_CACHELINE_SZ 64
51 # else
52 # define DEF_CACHELINE_SZ 32
53 # endif
54 # ifdef __GNUC__
55 # ifdef OPT_PENTIUM4
56 # if __GNUC__ <= 2
57 # if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
58 # define USE_UNR_VEC_KERNELS
59 # endif
60 # define DEF_UNROLL_DEPTH 1 /* Hopefully -funroll-loops is enabled */
61 # define DEF_PREFETCH_AHEAD 4
62 # else /* gcc >= 3 */
63 # if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
64 # define USE_PLAIN_VEC_KERNELS /* P4 does HW prefetch */
65 # endif
66 # define DEF_UNROLL_DEPTH 4
67 # define DEF_PREFETCH_AHEAD 16
68 # if !defined(FORCE_PREFETCH) && !defined(NO_PREFETCH)
69 # define NO_PREFETCH /* P4 does HW prefetch */
70 # endif
71 # endif
72 # elif defined(OPT_PENTIUM3) || defined(OPT_CORE2) || defined(SSE_PREFETCH) /* Pentium M / 3 */
73 # if !defined(FORCE_PREFETCH) && !defined(NO_PREFETCH)
74 # define NO_PREFETCH /* Pentium-M does HW prefetch, Pentium-3 does not ... */
75 # endif
/* Because NO_PREFETCH was just defined above unless FORCE_PREFETCH is set,
 * SSE_PREFETCH is auto-enabled here only when FORCE_PREFETCH is defined
 * (and NO_PREFETCH was not predefined) together with OPT_ARCH_PENTIUM3.
 * NOTE(review): this tests OPT_ARCH_PENTIUM3, not the OPT_PENTIUM3 used in
 * the #elif above -- presumably a separate macro set elsewhere; confirm. */
76 # if !defined(SSE_PREFETCH) && defined(OPT_ARCH_PENTIUM3) &&!defined(NO_PREFETCH)
77 # define SSE_PREFETCH
78 # endif
79 # if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
80 # define USE_UNR_VEC_KERNELS
81 # endif
82 # define DEF_UNROLL_DEPTH 4
83 # define DEF_PREFETCH_AHEAD 8
84 # elif defined(OPT_ATHLON) || defined(AMD_PREFETCH) /* Athlon / Opteron */
/* Software prefetch is disabled by default only for 64-bit builds. */
85 # if !defined(FORCE_PREFETCH) && !defined(NO_PREFETCH) && defined(__x86_64__)
86 # define NO_PREFETCH /* AMD64 does HW prefetch */
87 # endif
/* AMD_PREFETCH is auto-enabled only if NO_PREFETCH is still undefined.
 * NOTE(review): tests OPT_ARCH_ATHLON, not OPT_ATHLON -- confirm intended. */
88 # if !defined(AMD_PREFETCH) && defined(OPT_ARCH_ATHLON) && !defined(NO_PREFETCH)
89 # define AMD_PREFETCH
90 # endif
91 # if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
92 # define USE_UNR_VEC_KERNELS
93 # endif
94 # define DEF_UNROLL_DEPTH 1 /* Hopefully -funroll-loops is enabled */
95 # define DEF_PREFETCH_AHEAD 8
96 # else /* default processor: no OPT_* / *_PREFETCH hint given */
97 # if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
98 # define USE_UNR_VEC_KERNELS
99 # endif
100 # define DEF_UNROLL_DEPTH 8
101 # define DEF_PREFETCH_AHEAD 4
102 # endif
103 # else /* ! __GNUC__ */
104 # if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
105 # define USE_UNR_VEC_KERNELS
106 # endif
107 # define DEF_UNROLL_DEPTH 8
108 # define DEF_PREFETCH_AHEAD 8
109 # endif
110 #endif /* __i386__ || __x86_64__*/
111 
112 /* defaults */
/* Fallbacks for anything the architecture sections above left undefined:
 *   - kernels:            USE_PLAIN_VEC_KERNELS
 *   - DEF_UNROLL_DEPTH:   4
 *   - DEF_PREFETCH_AHEAD: 4
 *   - DEF_CACHELINE_SZ:   the value of __WORDSIZE when that macro exists,
 *                         otherwise 32
 * Must stay after the architecture sections, since each fallback only
 * applies when the macro is still undefined at this point. */
113 #if !defined(USE_PLAIN_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS) && !defined(USE_UNR_VEC_KERNELS2)
114 # define USE_PLAIN_VEC_KERNELS
115 #endif
116 #ifndef DEF_UNROLL_DEPTH
117 # define DEF_UNROLL_DEPTH 4
118 #endif
119 #ifndef DEF_PREFETCH_AHEAD
120 # define DEF_PREFETCH_AHEAD 4
121 #endif
122 #ifndef DEF_CACHELINE_SZ
/* The word-size number is reused directly as the cache line size
 * (presumably __WORDSIZE is the word size in bits, e.g. 64 -> 64 bytes). */
123 # ifdef __WORDSIZE // Heuristics: Most 64bit arches have 64byte cache lines
124 # define DEF_CACHELINE_SZ __WORDSIZE
125 # else
126 # define DEF_CACHELINE_SZ 32
127 # endif
128 #endif
129 
/* Default values for the two CACHE_LOC_* knobs. Their meaning is not visible
 * in this header (presumably cache-locality hints passed to the prefetch
 * primitives -- confirm at the point of use). */
132 #define DEF_CACHE_LOC_READ 2
133 #define DEF_CACHE_LOC_WRITE 3
134 
135 /* Apply defaults */
/* Each public tuning macro below keeps a value the user defined beforehand
 * (e.g. on the compiler command line) and otherwise takes the matching
 * DEF_* value chosen earlier in this file. */
136 
139 #ifndef PREFETCH_AHEAD
140 # define PREFETCH_AHEAD DEF_PREFETCH_AHEAD
141 #endif
142 #ifndef UNROLL_DEPTH
146 # define UNROLL_DEPTH DEF_UNROLL_DEPTH
147 #endif
148 #ifndef CACHELINE_SZ
152 # define CACHELINE_SZ DEF_CACHELINE_SZ
153 #endif
154 
164 #ifndef CACHE_LOC_READ
165 # define CACHE_LOC_READ DEF_CACHE_LOC_READ
166 #endif
167 #ifndef CACHE_LOC_WRITE
168 # define CACHE_LOC_WRITE DEF_CACHE_LOC_WRITE
169 #endif
170 
/*
 * Compile-time helpers derived from CACHELINE_SZ and PREFETCH_AHEAD.
 * Both expand to integer constant expressions as long as those two macros do.
 *
 * EL_PER_CL(T): CACHELINE_SZ / sizeof(T), i.e. how many objects of type T
 *               fit into one cache line, clamped to a minimum of 1 when T
 *               is larger than a cache line. The result is a signed int.
 *               T must be a type name.
 *
 * PREF_OFFS(T): EL_PER_CL(T) multiplied by PREFETCH_AHEAD (presumably the
 *               element distance to prefetch ahead of the current index).
 *
 * CACHELINE_SZ, PREFETCH_AHEAD and the complete expansions are
 * parenthesized so that expression-valued overrides (for instance
 * -DCACHELINE_SZ=32+32) and use inside larger expressions evaluate as
 * intended.
 */
#define EL_PER_CL(T) \
    ((signed)(((CACHELINE_SZ) / sizeof(T)) ? ((CACHELINE_SZ) / sizeof(T)) : 1))
#define PREF_OFFS(T) (EL_PER_CL(T) * (PREFETCH_AHEAD))
174 
175 #endif /* TBCI_PERF_OPT_H */