TBCI Numerical high perf. C++ Library  2.8.0
matrix_kernels.h
Go to the documentation of this file.
1 
7 #ifndef TBCI_MATRIX_KERNELS_H
8 #define TBCI_MATRIX_KERNELS_H
9 #include "tbci/basics.h"
10 
11 template <typename T> class Matrix;
12 template <typename T> class TMatrix;
13 template <typename T> class TSMatrix;
14 template <typename T> class Vector;
15 template <typename T> class TVector;
16 template <typename T> class TSVector;
17 
18 //#include "vec_kern_unr_pref.h"
19 
/* Explicit template-instantiation hook (INST macro, see basics.h):
 * declares do_old_mat_mat_mult() as a friend of class Matrix so the
 * kernel can reach Matrix internals where needed. */
INST(template <typename T> class Matrix friend void do_old_mat_mat_mult
     (const unsigned start, const unsigned end,
      TMatrix<T> *res, const Matrix<T> *a,
      const Matrix<T> *b);)

/* Reference i-j-l matrix-matrix multiply kernel:
 * computes res(i,j) = sum_l a(i,l) * b(l,j) for rows i in [start,end);
 * the [start,end) row range lets callers split the work across threads.
 * When do_exactsum2() is enabled, the inner dot product is accumulated
 * with Kahan-style compensated summation: comp gathers the round-off
 * lost in each addition and the final value is val - comp.  The exact
 * statement order in that loop is essential and must not be reordered.
 * NOTE(review): assumes a->columns() >= 1 -- the accumulator is seeded
 * with the l==0 term before looping from l=1; confirm callers never
 * pass an empty matrix. */
template <typename T>
void do_old_mat_mat_mult (const unsigned start, const unsigned end,
                          TMatrix<T> *res, const Matrix<T> *a,
                          const Matrix<T> *b)
{
  const unsigned bc = b->columns();
  const unsigned ac = a->columns();
  if (do_exactsum2()) {
    for (unsigned i=start; i<end; ++i) {
      for (unsigned j=0; j<bc; ++j) {
        PREFETCH_R(a->getrowptr(i), 2);
        PREFETCH_W(&res->setval(i,j), 2);
        /* seed with the first product, then compensate the rest */
        REGISTER T val (a->get(i,0) * b->get(0,j));
        REGISTER T comp(0.0);
        for (REGISTER unsigned l=1; l<ac; ++l) {
          const REGISTER T y = a->get(i,l) * b->get(l,j);
          const REGISTER T t = val + y;
          comp += (t-val) - y;   /* accumulated low-order round-off */
          val = t;
        }
        res->set(val-comp,i,j);  /* fold the correction back in */
      }
    }
  } else {
    /* plain accumulation, no compensation */
    for (unsigned i=start; i<end; ++i) {
      for (unsigned j=0; j<bc; ++j) {
        PREFETCH_R(a->getrowptr(i), 2);
        PREFETCH_W(&res->setval(i,j), 2);
        REGISTER T val (a->get(i,0) * b->get(0,j));
        for (REGISTER unsigned l=1; l<ac; ++l)
          val += a->get(i,l) * b->get(l,j);
        res->set(val,i,j);
      }
    }
  }
}
60 
62 
63 /* Following Dowd/Severance, ch. 8: Friendlier mem access pattern */
64 /* Should be faster in general. However, on iPII, it's faster (+)
65  * or slower (-), depending on the data type T:
66  * int: -, float: +, double: -, long long double: +, cplx<double>: +
67  * AXP: Generally faster
68  * (KG, 99/07/28) */
69 #if defined(USE_PREFETCH) || defined(USE_UNR_VEC_KERNELS)
/* Cache-friendlier matrix-matrix multiply (Dowd/Severance ch. 8 access
 * pattern, see comment above): i-l-j loop order, so both b and res are
 * walked row-wise.  This variant is compiled when prefetching or the
 * unrolled vector kernels are available; the inner j-loop is hand-
 * unrolled by 4 with software prefetches of the res and b rows.
 * Compensated summation cannot be expressed in this accumulation order,
 * so do_exactsum2() requests fall back to do_old_mat_mat_mult().
 * NOTE(review): res is only ever accumulated with "+=" here -- the
 * caller is presumably expected to zero it first; confirm.
 * NOTE(review): "ac-CACHELINE_SZ/sizeof(T)" below wraps around for very
 * small ac (unsigned arithmetic), making the guarded prefetch fire
 * unconditionally; a prefetch hint is harmless, but worth confirming. */
template <typename T>
void do_mat_mat_mult (const unsigned start, const unsigned end,
                      TMatrix<T> *res,
                      const Matrix<T> *a, const Matrix<T> *b)
{
#ifdef OLD_MAT_MAT_MULT
  do_old_mat_mat_mult(start, end, res, a, b);
#else
  if (do_exactsum2()) {
    /* exact summation only exists in the old i-j-l kernel */
    do_old_mat_mat_mult(start, end, res, a, b);
    return;
  }
  const unsigned bc = b->columns();
  const unsigned ac = a->columns();
  for (unsigned i=start; i<end; ++i) {
    const T* RESTRICT aptr = a->getrowptr(i);
    PREFETCH_R(aptr,2);
    for (unsigned l=0; l<ac; ++l) {
      T* RESTRICT rptr = res->getrowptr(i);
      const T* RESTRICT bptr = b->getrowptr(l);
      int n = bc - 8;            /* n >= 0 iff a full unrolled pass fits */
      PREFETCH_R(rptr,2); PREFETCH_R(bptr,1);
      ALIGN3(const REGISTER T tmp, *aptr, MIN_ALIGN2);  /* tmp = a(i,l) */
      if (LIKELY(n >= 0)) {
        /* warm the upcoming cache lines of res and b */
        if (sizeof(T)*2 >= CACHELINE_SZ) {
          PREFETCH_R(rptr+2,2); PREFETCH_R(bptr+2,1);
        }
        PREFETCH_R(rptr+4,2); PREFETCH_R(bptr+4,1);
        if (sizeof(T)*2 >= CACHELINE_SZ) {
          PREFETCH_R(rptr+6,2); PREFETCH_R(bptr+6,1);
        }
        /* 4-way unrolled: res(i,j..j+3) += a(i,l) * b(l,j..j+3),
         * with prefetches interleaved between the updates */
        for (; n > 3; n -= 4) {
          *rptr += tmp * *bptr;
          PREFETCH_R(rptr+ 8,2);
          *(rptr+1) += tmp * *(bptr+1);
          PREFETCH_R(bptr+ 8,1);
          *(rptr+2) += tmp * *(bptr+2);
          if (sizeof(T)*2 >= CACHELINE_SZ)
            PREFETCH_R(rptr+10,2);
          rptr += 4;
          if (sizeof(T)*2 >= CACHELINE_SZ)
            PREFETCH_R(bptr+10,1);
          *(rptr-1) += tmp * *(bptr+3);
          bptr += 4;
        }
      }
      if (LIKELY(l+4 < ac-CACHELINE_SZ/sizeof(T)))
        PREFETCH_R(aptr+4, 2);
      /* scalar tail: the remaining n+8 elements of the row */
      for (n += 8; n; --n)
        *rptr++ += tmp * *bptr++;
      aptr++;
    }
  }
#endif
}
125 #else
126 template <typename T>
127 void do_mat_mat_mult (const unsigned start, const unsigned end,
128  TMatrix<T> *res, const Matrix<T> *a, const Matrix<T> *b)
129 {
130  if (do_exactsum2()) {
131  do_old_mat_mat_mult(start, end, res, a, b);
132  return;
133  }
134  const unsigned bc = b->columns();
135  const unsigned ac = a->columns();
136  Vector<T> corr((T)0, end-start);
137  for (unsigned i=start; i<end; ++i) {
138  const T* RESTRICT aptr = a->getrowptr(i);
139  for (unsigned l=0; l<ac; ++l) {
140  T* RESTRICT rptr = res->getrowptr(i);
141  const T* RESTRICT bptr = b->getrowptr(l);
142  const REGISTER T tmp = *aptr;
143  for (int n = bc; n; --n)
144  *rptr++ += tmp * *bptr++;
145  aptr++;
146  }
147  }
148 }
149 #endif
150 
/* Estimated cost of a matrix-vector multiply with r rows and c columns,
 * expressed in abstract COST_* units (load/store/call/add/mult/loop).
 * (fix) r and c are now parenthesised so the macro expands correctly
 * for expression arguments such as COST_MATVEC(end-start, mat->col);
 * previously "c*(...)" bound only to the last term of a sum. */
#define COST_MATVEC(r,c) ((r)*(COST_UNIT_STORE+COST_LOOP \
 +(c)*(3*COST_UNIT_LOAD+COST_CALL+COST_ADD+COST_MULT+COST_LOOP)))
154 
155 template <typename T>
156 /*inline*/ void do_mat_vec_mult (const unsigned start, const unsigned end,
157  TVector<T> *res,
158  const Matrix<T> *mat, const Vector<T> *vec)
159 {
160 #ifdef DEBUG_THREAD
161  fprintf (stderr, "do_mat_vec_mult (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
162 #endif
163  PREFETCH_R(vec->vec,3);
164  const unsigned mc = mat->col;
165 #if 0 //def TBCI_SIMD_SUM
166  if (UNLIKELY(sizeof(T) < TBCI_SIMD_ALIGN && mc % (TBCI_SIMD_ALIGN/sizeof(T)))) {
167  /* OK, so we can not ensure alignment -- should do something clever ? */
168  }
169 #endif
170  for (unsigned rw = start; rw < end; ++rw) {
171  //PREFETCH_W(res->vec+rw,3);
172  T val(0.0);
173  if (do_exactsum2())
174  do_vec_mult_exact<T> (mc, mat->mat[rw], vec->vec, val);
175  else
176  do_vec_mult_quick<T> (mc, mat->mat[rw], vec->vec, val);
177  res->vec[rw] = val;
178  }
179 }
180 
/* Matrix times "temporary scaled vector": for rows rw in [start,end)
 * computes res[rw] = (mat(rw,:) . tsv->vec) * fact, where fact is the
 * scalar factor the TSVector carries alongside its data (getfac()).
 * Applying the factor once per result element avoids pre-scaling the
 * whole vector. */
template <typename T>
/*inline*/ void do_mat_tsv_mult (const unsigned start, const unsigned end,
                                 TVector<T> *res,
                                 const Matrix<T> *mat, const TSVector<T> *tsv)
{
  PREFETCH_R(tsv->vec,3);
  const unsigned mc = mat->col;
  /* scalar factor of the TSVector, applied once per row below */
  const T fact = tsv->getfac();
#if 0 //def TBCI_SIMD_SUM
  /* disabled draft of an alignment-aware special case */
  if (UNLIKELY(sizeof(T) < TBCI_SIMD_ALIGN && mc % (TBCI_SIMD_ALIGN/sizeof(T)))) {
    for (unsigned rw = start; rw < end; ++rw) {
      T val(0.0);
      if (do_exactsum2())
        do_vec_mult_exact<T>(mc, mat->mat[rw], tsv->vec, val);
      else
        do_vec_mult_quick<T>(mc, mat->mat[rw], tsv->vec, val);
      res->vec[rw] = val*fact;
    }
  } else
#endif
  for (unsigned rw = start; rw < end; ++rw) {
    //PREFETCH_W(res->vec+rw,3);
    T val(0.0);
    // TODO: May be unaligned!
    /* per-row dot product via the exact or quick kernel */
    if (do_exactsum2())
      do_vec_mult_exact<T> (mc, mat->mat[rw], tsv->vec, val);
    else
      do_vec_mult_quick<T> (mc, mat->mat[rw], tsv->vec, val);
    res->vec[rw] = val*fact;
  }
}
212 
/* Exact (compensated) transposed matrix-vector product:
 * res[cl] = sum_rw mat(rw,cl) * vec[rw] for columns cl in [start,end).
 * Accumulates row-by-row so mat is traversed in its natural row-major
 * order; corr(cl-start) gathers the Kahan round-off for column cl and
 * is subtracted from the result at the end.  The statement order in the
 * inner loop is essential for the compensation -- do not reorder. */
template <typename T>
/*inline*/ void do_mat_vec_transmult_exact (const unsigned start, const unsigned end,
                                            TVector<T> *res,
                                            const Matrix<T> *mat, const Vector<T> *vec)
{
#ifdef DEBUG_THREAD
  fprintf (stderr, "do_mat_vec_transmult_exact (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
#endif
  PREFETCH_R(&vec->getcref(0),3);
  const unsigned mr = mat->row;
  Vector<T> corr((T)0, end-start);     /* per-column correction terms */
  /* clear the destination slice; we accumulate into it below */
  for (unsigned cl = start; cl < end; ++cl)
    res->setval(cl) = (T)0;
  for (unsigned rw = 0; rw < mr; ++rw) {
    const REGISTER T fac = vec->get(rw);
    //do_vec_add_svc<T> (end-start, &res->setval(start), mat->mat[rw], fac);
    for (unsigned off = start; off < end; ++off) {
      const REGISTER T y = mat->get(rw, off) * fac;
      const REGISTER T t = res->get(off) + y;
      corr(off-start) += (t - res->get(off)) - y;   /* lost low-order bits */
      res->setval(off) = t;
    }
  }
  /* fold the accumulated corrections back into the result */
  for (unsigned cl = start; cl < end; ++cl)
    res->setval(cl) -= corr(cl-start);
}
239 
240 #if 1
241 template <typename T>
242 /*inline*/ void do_mat_vec_transmult (const unsigned start, const unsigned end,
243  TVector<T> *res,
244  const Matrix<T> *mat, const Vector<T> *vec)
245 {
246  if (do_exactsum2()) {
247  do_mat_vec_transmult_exact(start, end, res, mat, vec);
248  return;
249  }
250 #ifdef DEBUG_THREAD
251  fprintf (stderr, "do_mat_vec_transmult (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
252 #endif
253  PREFETCH_R(vec->vec,3);
254  const unsigned mr = mat->row;
255  for (unsigned cl = start; cl < end; ++cl)
256  res->setval(cl) = (T)0;
257  for (unsigned rw = 0; rw < mr; ++rw) {
258  const REGISTER T fac = vec->get(rw);
259  do_vec_add_svc<T> (end-start, &res->setval(start), mat->mat[rw], fac);
260  }
261 }
262 
263 #else
/* Alternative implementation of do_mat_vec_transmult, compiled out by
 * the surrounding "#if 1/#else" and kept for reference.  It computes
 * each result column as a strided dot product down the matrix instead
 * of accumulating scaled rows. */
template <typename T>
/*inline*/ void do_mat_vec_transmult (const unsigned start, const unsigned end,
                                      TVector<T> *res,
                                      const Matrix<T> *mat, const Vector<T> *vec)
{
#if 1
  PREFETCH_R(vec->vec,3);
  const unsigned mr = mat->row;
  //const unsigned mstr = mr;
  /* column stride = distance between consecutive row pointers.
   * NOTE(review): assumes mat has >= 2 rows with contiguous storage --
   * confirm before re-enabling this branch. */
  const unsigned mstr = mat->mat[1] - mat->mat[0];
  //fprintf(stderr, "m-v-transmult: stride %i (%ix%i)\n",
  // mstr, mr, mat->col);
  /* We don't have SIMD routines for non=unit strides */
  for (unsigned cl = start; cl < end; ++cl) {
    //PREFETCH_W(res->vec+rw,3);
    T val(0.0);
    if (do_exactsum2())
      do_vec_mult_stride_exact<T> (mr, mat->mat[0]+cl, vec->vec, val, mstr);
    else
      do_vec_mult_stride_quick<T> (mr, mat->mat[0]+cl, vec->vec, val, mstr);
    res->vec[cl] = val;
  }
#else /* OLD IMPLEMENTATION */

  PREFETCH_R(vec->vec,3);
  const unsigned mr = mat->row;
  for (unsigned cl = start; cl < end; cl++)
  {
    PREFETCH_R(mat->mat[0]+cl,1);
    /* NOTE(review): vec is const here, so this non-const pointer looks
     * like it would not compile if this dead branch were re-enabled --
     * verify before use. */
    REGISTER T* vecptr = vec->vec;
    REGISTER int i = mr - 9;
    int r = 1;
    /* seed with the row-0 term */
    ALIGN3(T el, mat->mat[0][cl] * *vecptr++, MIN_ALIGN2);
    /* FIXME: Exact summing needs to be implemented
     * -- and a SIMD version would be nice as well ...
     * This calls for a do_vec_mult_stride kernel */
    //T comp(0.0);
    if (LIKELY(i >= 0)) {
      /* prefetch the same column in the next seven rows */
      PREFETCH_R(mat->mat[1]+cl,1);
      PREFETCH_R(mat->mat[2]+cl,1);
      PREFETCH_R(mat->mat[3]+cl,1);
      PREFETCH_R(mat->mat[4]+cl,1);
      PREFETCH_R(mat->mat[5]+cl,1);
      PREFETCH_R(mat->mat[6]+cl,1);
      PREFETCH_R(mat->mat[7]+cl,1);
      /* 4-way unrolled walk down the column */
      for (; i > 3; i -= 4) {
        el += mat->mat[r][cl] * *vecptr;
        PREFETCH_R(mat->mat[r+ 8]+cl, 1);
        el += mat->mat[r+1][cl] * *(vecptr+1);
        PREFETCH_R(mat->mat[r+ 9]+cl, 1);
        el += mat->mat[r+2][cl] * *(vecptr+2);
        PREFETCH_R(mat->mat[r+10]+cl, 1);
        el += mat->mat[r+3][cl] * *(vecptr+3);
        PREFETCH_R(mat->mat[r+11]+cl, 1);
        r += 4; vecptr += 4;
      }
    }
    if (LIKELY(cl < end-CACHELINE_SZ/sizeof(T))) {
      PREFETCH_W(res->vec + cl, 2);
    }
    /* scalar tail: remaining i+8 rows */
    for (i += 8; i; --i, r++)
      el += mat->mat[r][cl] * *vecptr++;

    res->vec[cl] = el;
  }
#endif
}
331 #endif
332 
333 /* TODO:
334  * Divide and Conquer algorithm for matrix-matrix multiplication:
335  * (Strassen's algorithm)
336  *
337  * [C11 C12 ] [A11 A12 ] [B11 B12 ]
338  * | | = | | × | |
339  * [C21 C22 ] [A21 A22 ] [B21 B22 ]
340  *
341  * P1 = (A11 + A22) (B11 + B22)
342  * P2 = (A21 + A22) B11
343  * P3 = A11 (B12 - B22)
 344  * P4 = A22 (B21 - B11)
345  * P5 = (A11 + A12) B22
346  * P6 = (A21 - A11) (B11 + B12)
347  * P7 = (A12 - A22) (B21 + B22)
348  *
349  * C11 = P1 + P4 - P5 + P7
350  * C12 = P3 + P5
351  * C21 = P2 + P4
352  * C22 = P1 + P3 - P2 + P6
353  *
 354  * O(n^log2(7)) ≈ O(n^2.807) multiplications
355  *
356  * http://www.cse.ohio-state.edu/~gurari/course/cse693s04/cse693s04su61.html#x84-7200011.4
357  * http://www.brpreiss.com/books/opus5/html/page457.html
358  */
359 
360 #endif /* TBCI_MATRIX_KERNELS_H */
T ** mat
C storage layout: mat[row][col].
Definition: matrix.h:119
void do_mat_tsv_mult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const TSVector< T > *vec)
tm fac
Definition: f_matrix.h:1052
void do_mat_vec_mult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const Vector< T > *vec)
T & setval(const unsigned long i) const
Definition: vector.h:192
#define CACHELINE_SZ
(L1) Cache line size in bytes.
Definition: perf_opt.h:152
long double fact(const double x)
Definition: mathplus.h:101
const unsigned end
#define REGISTER
Definition: basics.h:108
const unsigned ac
T *const & vecptr() const
Definition: vector.h:1163
tbci_traits< T >::const_refval_type get(const unsigned long i) const
Definition: vector.h:190
#define MIN_ALIGN2
Definition: basics.h:424
const T & getfac() const
Definition: vector.h:1073
unsigned int col
Definition: matrix.h:116
#define UNLIKELY(expr)
Definition: basics.h:101
T & setval(const T &val, const unsigned int r, const unsigned int c)
Definition: matrix.h:281
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
unsigned int columns() const
number of columns
Definition: matrix.h:315
#define TBCI_SIMD_ALIGN
Definition: malloc_cache.h:122
F_TMatrix< T > b
Definition: f_matrix.h:736
void do_mat_vec_transmult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const Vector< T > *vec)
T * vec
Definition: vector.h:1029
const T & getcref(const unsigned long i) const
Definition: vector.h:188
Definition: bvector.h:49
#define PREFETCH_W(addr, loc)
Definition: basics.h:749
#define INST(x)
Definition: basics.h:238
int i
Definition: LM_fit.h:71
Temporary Base Class Idiom: Class TVector is used for temporary variables.
Definition: bvector.h:52
Definition: bvector.h:54
const Vector< T > Vector< T > Vector< T > Vector< T > & y
Definition: LM_fit.h:172
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
T * vec
Definition: bvector.h:73
#define ALIGN3(v, i, x)
Definition: basics.h:442
#define T
Definition: bdmatlib.cc:20
unsigned int do_exactsum2()
Definition: tbci_param.h:151
const T * getrowptr(const unsigned r) const
Helpers for matvecmul.
Definition: matrix.h:296
T & set(const T &val, const unsigned r, const unsigned c)
Definition: matrix.h:290
#define RESTRICT
Definition: basics.h:89
const unsigned TMatrix< T > const Matrix< T > * a
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100
unsigned int row
Definition: matrix.h:116
tbci_traits< T >::const_refval_type get(const unsigned r, const unsigned c) const
get, set and getcref are used internally and not for public consumption
Definition: matrix.h:288