/* TBCI Numerical high perf. C++ Library 2.8.0
 * matrix_kernels.h
 */
7#ifndef TBCI_MATRIX_KERNELS_H
8#define TBCI_MATRIX_KERNELS_H
9#include "tbci/basics.h"
10
11template <typename T> class Matrix;
12template <typename T> class TMatrix;
13template <typename T> class TSMatrix;
14template <typename T> class Vector;
15template <typename T> class TVector;
16template <typename T> class TSVector;
17
18//#include "vec_kern_unr_pref.h"
19
20INST(template <typename T> class Matrix friend void do_old_mat_mat_mult
21 (const unsigned start, const unsigned end,
22 TMatrix<T> *res, const Matrix<T> *a,
23 const Matrix<T> *b);)
24template <typename T>
25void do_old_mat_mat_mult (const unsigned start, const unsigned end,
27 const Matrix<T> *b)
28{
29 const unsigned bc = b->columns();
30 const unsigned ac = a->columns();
31 if (do_exactsum2()) {
32 for (unsigned i=start; i<end; ++i) {
33 for (unsigned j=0; j<bc; ++j) {
34 PREFETCH_R(a->getrowptr(i), 2);
35 PREFETCH_W(&res->setval(i,j), 2);
36 REGISTER T val (a->get(i,0) * b->get(0,j));
37 REGISTER T comp(0.0);
38 for (REGISTER unsigned l=1; l<ac; ++l) {
39 const REGISTER T y = a->get(i,l) * b->get(l,j);
40 const REGISTER T t = val + y;
41 comp += (t-val) - y;
42 val = t;
43 }
44 res->set(val-comp,i,j);
45 }
46 }
47 } else {
48 for (unsigned i=start; i<end; ++i) {
49 for (unsigned j=0; j<bc; ++j) {
50 PREFETCH_R(a->getrowptr(i), 2);
51 PREFETCH_W(&res->setval(i,j), 2);
52 REGISTER T val (a->get(i,0) * b->get(0,j));
53 for (REGISTER unsigned l=1; l<ac; ++l)
54 val += a->get(i,l) * b->get(l,j);
55 res->set(val,i,j);
56 }
57 }
58 }
59}
/* Following Dowd/Severance, ch. 8: Friendlier mem access pattern */
/* Should be faster in general. However, on iPII, it's faster (+)
 * or slower (-), depending on the data type T:
 * int: -, float: +, double: -, long long double: +, cplx<double>: +
 * AXP: Generally faster
 * (KG, 99/07/28) */
69#if defined(USE_PREFETCH) || defined(USE_UNR_VEC_KERNELS)
70template <typename T>
71void do_mat_mat_mult (const unsigned start, const unsigned end,
73 const Matrix<T> *a, const Matrix<T> *b)
74{
75#ifdef OLD_MAT_MAT_MULT
76 do_old_mat_mat_mult(start, end, res, a, b);
77#else
78 if (do_exactsum2()) {
79 do_old_mat_mat_mult(start, end, res, a, b);
80 return;
81 }
82 const unsigned bc = b->columns();
83 const unsigned ac = a->columns();
84 for (unsigned i=start; i<end; ++i) {
85 const T* RESTRICT aptr = a->getrowptr(i);
86 PREFETCH_R(aptr,2);
87 for (unsigned l=0; l<ac; ++l) {
88 T* RESTRICT rptr = res->getrowptr(i);
89 const T* RESTRICT bptr = b->getrowptr(l);
90 int n = bc - 8;
91 PREFETCH_R(rptr,2); PREFETCH_R(bptr,1);
92 ALIGN3(const REGISTER T tmp, *aptr, MIN_ALIGN2);
93 if (LIKELY(n >= 0)) {
94 if (sizeof(T)*2 >= CACHELINE_SZ) {
95 PREFETCH_R(rptr+2,2); PREFETCH_R(bptr+2,1);
96 }
97 PREFETCH_R(rptr+4,2); PREFETCH_R(bptr+4,1);
98 if (sizeof(T)*2 >= CACHELINE_SZ) {
99 PREFETCH_R(rptr+6,2); PREFETCH_R(bptr+6,1);
100 }
101 for (; n > 3; n -= 4) {
102 *rptr += tmp * *bptr;
103 PREFETCH_R(rptr+ 8,2);
104 *(rptr+1) += tmp * *(bptr+1);
105 PREFETCH_R(bptr+ 8,1);
106 *(rptr+2) += tmp * *(bptr+2);
107 if (sizeof(T)*2 >= CACHELINE_SZ)
108 PREFETCH_R(rptr+10,2);
109 rptr += 4;
110 if (sizeof(T)*2 >= CACHELINE_SZ)
111 PREFETCH_R(bptr+10,1);
112 *(rptr-1) += tmp * *(bptr+3);
113 bptr += 4;
114 }
115 }
116 if (LIKELY(l+4 < ac-CACHELINE_SZ/sizeof(T)))
117 PREFETCH_R(aptr+4, 2);
118 for (n += 8; n; --n)
119 *rptr++ += tmp * *bptr++;
120 aptr++;
121 }
122 }
123#endif
124}
125#else
126template <typename T>
127void do_mat_mat_mult (const unsigned start, const unsigned end,
128 TMatrix<T> *res, const Matrix<T> *a, const Matrix<T> *b)
129{
130 if (do_exactsum2()) {
131 do_old_mat_mat_mult(start, end, res, a, b);
132 return;
133 }
134 const unsigned bc = b->columns();
135 const unsigned ac = a->columns();
136 Vector<T> corr((T)0, end-start);
137 for (unsigned i=start; i<end; ++i) {
138 const T* RESTRICT aptr = a->getrowptr(i);
139 for (unsigned l=0; l<ac; ++l) {
140 T* RESTRICT rptr = res->getrowptr(i);
141 const T* RESTRICT bptr = b->getrowptr(l);
142 const REGISTER T tmp = *aptr;
143 for (int n = bc; n; --n)
144 *rptr++ += tmp * *bptr++;
145 aptr++;
146 }
147 }
148}
149#endif

/* Cost estimate of a mat-vec multiply with r rows and c columns, in the
 * abstract COST_* units from tbci/basics.h -- presumably consumed by the
 * thread work-splitting logic; verify against the scheduler. */
#define COST_MATVEC(r,c) (r*(COST_UNIT_STORE+COST_LOOP \
        +c*(3*COST_UNIT_LOAD+COST_CALL+COST_ADD+COST_MULT+COST_LOOP)))
154
155template <typename T>
156/*inline*/ void do_mat_vec_mult (const unsigned start, const unsigned end,
157 TVector<T> *res,
158 const Matrix<T> *mat, const Vector<T> *vec)
159{
160#ifdef DEBUG_THREAD
161 fprintf (stderr, "do_mat_vec_mult (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
162#endif
163 PREFETCH_R(vec->vec,3);
164 const unsigned mc = mat->col;
165#if 0 //def TBCI_SIMD_SUM
166 if (UNLIKELY(sizeof(T) < TBCI_SIMD_ALIGN && mc % (TBCI_SIMD_ALIGN/sizeof(T)))) {
167 /* OK, so we can not ensure alignment -- should do something clever ? */
168 }
169#endif
170 for (unsigned rw = start; rw < end; ++rw) {
171 //PREFETCH_W(res->vec+rw,3);
172 T val(0.0);
173 if (do_exactsum2())
174 do_vec_mult_exact<T> (mc, mat->mat[rw], vec->vec, val);
175 else
176 do_vec_mult_quick<T> (mc, mat->mat[rw], vec->vec, val);
177 res->vec[rw] = val;
178 }
179}
180
181template <typename T>
182/*inline*/ void do_mat_tsv_mult (const unsigned start, const unsigned end,
183 TVector<T> *res,
184 const Matrix<T> *mat, const TSVector<T> *tsv)
185{
186 PREFETCH_R(tsv->vec,3);
187 const unsigned mc = mat->col;
188 const T fact = tsv->getfac();
189#if 0 //def TBCI_SIMD_SUM
190 if (UNLIKELY(sizeof(T) < TBCI_SIMD_ALIGN && mc % (TBCI_SIMD_ALIGN/sizeof(T)))) {
191 for (unsigned rw = start; rw < end; ++rw) {
192 T val(0.0);
193 if (do_exactsum2())
194 do_vec_mult_exact<T>(mc, mat->mat[rw], tsv->vec, val);
195 else
196 do_vec_mult_quick<T>(mc, mat->mat[rw], tsv->vec, val);
197 res->vec[rw] = val*fact;
198 }
199 } else
200#endif
201 for (unsigned rw = start; rw < end; ++rw) {
202 //PREFETCH_W(res->vec+rw,3);
203 T val(0.0);
204 // TODO: May be unaligned!
205 if (do_exactsum2())
206 do_vec_mult_exact<T> (mc, mat->mat[rw], tsv->vec, val);
207 else
208 do_vec_mult_quick<T> (mc, mat->mat[rw], tsv->vec, val);
209 res->vec[rw] = val*fact;
210 }
211}
212
213template <typename T>
214/*inline*/ void do_mat_vec_transmult_exact (const unsigned start, const unsigned end,
215 TVector<T> *res,
216 const Matrix<T> *mat, const Vector<T> *vec)
217{
218#ifdef DEBUG_THREAD
219 fprintf (stderr, "do_mat_vec_transmult_exact (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
220#endif
221 PREFETCH_R(&vec->getcref(0),3);
222 const unsigned mr = mat->row;
223 Vector<T> corr((T)0, end-start);
224 for (unsigned cl = start; cl < end; ++cl)
225 res->setval(cl) = (T)0;
226 for (unsigned rw = 0; rw < mr; ++rw) {
227 const REGISTER T fac = vec->get(rw);
228 //do_vec_add_svc<T> (end-start, &res->setval(start), mat->mat[rw], fac);
229 for (unsigned off = start; off < end; ++off) {
230 const REGISTER T y = mat->get(rw, off) * fac;
231 const REGISTER T t = res->get(off) + y;
232 corr(off-start) += (t - res->get(off)) - y;
233 res->setval(off) = t;
234 }
235 }
236 for (unsigned cl = start; cl < end; ++cl)
237 res->setval(cl) -= corr(cl-start);
238}
239
240#if 1
241template <typename T>
242/*inline*/ void do_mat_vec_transmult (const unsigned start, const unsigned end,
243 TVector<T> *res,
244 const Matrix<T> *mat, const Vector<T> *vec)
245{
246 if (do_exactsum2()) {
247 do_mat_vec_transmult_exact(start, end, res, mat, vec);
248 return;
249 }
250#ifdef DEBUG_THREAD
251 fprintf (stderr, "do_mat_vec_transmult (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
252#endif
253 PREFETCH_R(vec->vec,3);
254 const unsigned mr = mat->row;
255 for (unsigned cl = start; cl < end; ++cl)
256 res->setval(cl) = (T)0;
257 for (unsigned rw = 0; rw < mr; ++rw) {
258 const REGISTER T fac = vec->get(rw);
259 do_vec_add_svc<T> (end-start, &res->setval(start), mat->mat[rw], fac);
260 }
261}
262
263#else
264template <typename T>
265/*inline*/ void do_mat_vec_transmult (const unsigned start, const unsigned end,
266 TVector<T> *res,
267 const Matrix<T> *mat, const Vector<T> *vec)
268{
269#if 1
270 PREFETCH_R(vec->vec,3);
271 const unsigned mr = mat->row;
272 //const unsigned mstr = mr;
273 const unsigned mstr = mat->mat[1] - mat->mat[0];
274 //fprintf(stderr, "m-v-transmult: stride %i (%ix%i)\n",
275 // mstr, mr, mat->col);
276 /* We don't have SIMD routines for non=unit strides */
277 for (unsigned cl = start; cl < end; ++cl) {
278 //PREFETCH_W(res->vec+rw,3);
279 T val(0.0);
280 if (do_exactsum2())
281 do_vec_mult_stride_exact<T> (mr, mat->mat[0]+cl, vec->vec, val, mstr);
282 else
283 do_vec_mult_stride_quick<T> (mr, mat->mat[0]+cl, vec->vec, val, mstr);
284 res->vec[cl] = val;
285 }
286#else /* OLD IMPLEMENTATION */
287
288 PREFETCH_R(vec->vec,3);
289 const unsigned mr = mat->row;
290 for (unsigned cl = start; cl < end; cl++)
291 {
292 PREFETCH_R(mat->mat[0]+cl,1);
293 REGISTER T* vecptr = vec->vec;
294 REGISTER int i = mr - 9;
295 int r = 1;
296 ALIGN3(T el, mat->mat[0][cl] * *vecptr++, MIN_ALIGN2);
297 /* FIXME: Exact summing needs to be implemented
298 * -- and a SIMD version would be nice as well ...
299 * This calls for a do_vec_mult_stride kernel */
300 //T comp(0.0);
301 if (LIKELY(i >= 0)) {
302 PREFETCH_R(mat->mat[1]+cl,1);
303 PREFETCH_R(mat->mat[2]+cl,1);
304 PREFETCH_R(mat->mat[3]+cl,1);
305 PREFETCH_R(mat->mat[4]+cl,1);
306 PREFETCH_R(mat->mat[5]+cl,1);
307 PREFETCH_R(mat->mat[6]+cl,1);
308 PREFETCH_R(mat->mat[7]+cl,1);
309 for (; i > 3; i -= 4) {
310 el += mat->mat[r][cl] * *vecptr;
311 PREFETCH_R(mat->mat[r+ 8]+cl, 1);
312 el += mat->mat[r+1][cl] * *(vecptr+1);
313 PREFETCH_R(mat->mat[r+ 9]+cl, 1);
314 el += mat->mat[r+2][cl] * *(vecptr+2);
315 PREFETCH_R(mat->mat[r+10]+cl, 1);
316 el += mat->mat[r+3][cl] * *(vecptr+3);
317 PREFETCH_R(mat->mat[r+11]+cl, 1);
318 r += 4; vecptr += 4;
319 }
320 }
321 if (LIKELY(cl < end-CACHELINE_SZ/sizeof(T))) {
322 PREFETCH_W(res->vec + cl, 2);
323 }
324 for (i += 8; i; --i, r++)
325 el += mat->mat[r][cl] * *vecptr++;
326
327 res->vec[cl] = el;
328 }
329#endif
330}
331#endif

/* TODO:
 * Divide and Conquer algorithm for matrix-matrix multiplication:
 * (Strassen's algorithm)
 *
 * [C11 C12 ]   [A11 A12 ]   [B11 B12 ]
 * |         | = |         | × |         |
 * [C21 C22 ]   [A21 A22 ]   [B21 B22 ]
 *
 * P1 = (A11 + A22) (B11 + B22)
 * P2 = (A21 + A22) B11
 * P3 = A11 (B12 - B22)
 * P4 = A22 (B21 - B11)
 * P5 = (A11 + A12) B22
 * P6 = (A21 - A11) (B11 + B12)
 * P7 = (A12 - A22) (B21 + B22)
 *
 * C11 = P1 + P4 - P5 + P7
 * C12 = P3 + P5
 * C21 = P2 + P4
 * C22 = P1 + P3 - P2 + P6
 *
 * O(n^log2(7)) ~= O(n^2.807) multiplications
 *
 * http://www.cse.ohio-state.edu/~gurari/course/cse693s04/cse693s04su61.html#x84-7200011.4
 * http://www.brpreiss.com/books/opus5/html/page457.html
 */
359
360#endif /* TBCI_MATRIX_KERNELS_H */
/* ------------------------------------------------------------------------
 * Doxygen cross-reference residue (hyperlink index from the generated
 * documentation page this file was extracted from; not original source):
 *
 *   int i                              -- LM_fit.h:71
 *   doublereal y                       -- TOMS_707.C:27
 *   #define LIKELY(expr)               -- basics.h:100 (branch prediction;
 *       note: sometimes the unlikely possibility is deliberately marked
 *       likely and vice versa)
 *   #define UNLIKELY(expr)             -- basics.h:101
 *   #define MIN_ALIGN2                 -- basics.h:424
 *   #define ALIGN3(v, i, x)            -- basics.h:442
 *   #define INST(x)                    -- basics.h:238
 *   #define PREFETCH_R(addr, loc)      -- basics.h:748 (handcoded asm
 *       fallback where gcc lacks __builtin_prefetch())
 *   #define PREFETCH_W(addr, loc)      -- basics.h:749
 *   #define REGISTER                   -- basics.h:108
 *   #define RESTRICT                   -- basics.h:89
 *   #define T                          -- bdmatlib.cc:20
 *   unsigned int col, row              -- matrix.h:116
 *   Matrix<T>::get(r, c)               -- matrix.h:288 (get, set and
 *       getcref are used internally and not for public consumption)
 *   T **mat                            -- matrix.h:119 (C storage layout:
 *       mat[row][col])
 *   T *vec                             -- vector.h:1029
 *   T *const &vecptr() const           -- vector.h:1163
 *   const T &getfac() const            -- vector.h:1073
 *   class TVector                      -- vector.h:73 (Temporary Base
 *       Class Idiom: TVector is used for temporary variables)
 *   tm fac                             -- f_matrix.h:1052
 *   F_TMatrix<T> b                     -- f_matrix.h:736
 *   #define TBCI_SIMD_ALIGN
 *   long double fact(const double x)   -- mathplus.h:101
 *   do_mat_tsv_mult / do_mat_vec_mult / do_mat_vec_transmult declarations
 *   #define CACHELINE_SZ               -- perf_opt.h:152 ((L1) cache line
 *       size in bytes)
 *   unsigned int do_exactsum2()        -- tbci_param.h:151
 * --------------------------------------------------------------------- */