7 #ifndef TBCI_MATRIX_KERNELS_H
8 #define TBCI_MATRIX_KERNELS_H
9 #include "tbci/basics.h"
11 template <
typename T>
class Matrix;
12 template <
typename T>
class TMatrix;
13 template <
typename T>
class TSMatrix;
14 template <
typename T>
class Vector;
15 template <
typename T>
class TVector;
16 template <
typename T>
class TSVector;
20 INST(template <typename T>
class Matrix friend void do_old_mat_mat_mult
21 (
const unsigned start,
const unsigned end,
25 void do_old_mat_mat_mult (
const unsigned start,
const unsigned end,
29 const unsigned bc = b->
columns();
32 for (
unsigned i=start;
i<
end; ++
i) {
33 for (
unsigned j=0; j<bc; ++j) {
44 res->
set(val-comp,
i,j);
48 for (
unsigned i=start;
i<
end; ++
i) {
49 for (
unsigned j=0; j<bc; ++j) {
54 val += a->
get(
i,l) * b->
get(l,j);
69 #if defined(USE_PREFETCH) || defined(USE_UNR_VEC_KERNELS)
71 void do_mat_mat_mult (
const unsigned start,
const unsigned end,
75 #ifdef OLD_MAT_MAT_MULT
76 do_old_mat_mat_mult(start, end, res, a, b);
79 do_old_mat_mat_mult(start, end, res, a, b);
82 const unsigned bc = b->
columns();
83 const unsigned ac = a->
columns();
84 for (
unsigned i=start;
i<
end; ++
i) {
87 for (
unsigned l=0; l<
ac; ++l) {
101 for (; n > 3; n -= 4) {
102 *rptr += tmp * *bptr;
104 *(rptr+1) += tmp * *(bptr+1);
106 *(rptr+2) += tmp * *(bptr+2);
112 *(rptr-1) += tmp * *(bptr+3);
119 *rptr++ += tmp * *bptr++;
126 template <
typename T>
127 void do_mat_mat_mult (
const unsigned start,
const unsigned end,
131 do_old_mat_mat_mult(start, end, res, a, b);
134 const unsigned bc = b->
columns();
135 const unsigned ac = a->
columns();
137 for (
unsigned i=start;
i<
end; ++
i) {
139 for (
unsigned l=0; l<
ac; ++l) {
143 for (
int n = bc; n; --n)
144 *rptr++ += tmp * *bptr++;
152 #define COST_MATVEC(r,c) (r*(COST_UNIT_STORE+COST_LOOP \
153 +c*(3*COST_UNIT_LOAD+COST_CALL+COST_ADD+COST_MULT+COST_LOOP)))
155 template <
typename T>
161 fprintf (stderr,
"do_mat_vec_mult (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
164 const unsigned mc = mat->
col;
165 #if 0 //def TBCI_SIMD_SUM
170 for (
unsigned rw = start; rw <
end; ++rw) {
174 do_vec_mult_exact<T> (mc, mat->
mat[rw], vec->
vec, val);
176 do_vec_mult_quick<T> (mc, mat->
mat[rw], vec->
vec, val);
181 template <
typename T>
187 const unsigned mc = mat->
col;
189 #if 0 //def TBCI_SIMD_SUM
191 for (
unsigned rw = start; rw <
end; ++rw) {
194 do_vec_mult_exact<T>(mc, mat->
mat[rw], tsv->
vec, val);
196 do_vec_mult_quick<T>(mc, mat->
mat[rw], tsv->
vec, val);
201 for (
unsigned rw = start; rw <
end; ++rw) {
206 do_vec_mult_exact<T> (mc, mat->
mat[rw], tsv->
vec, val);
208 do_vec_mult_quick<T> (mc, mat->
mat[rw], tsv->
vec, val);
213 template <
typename T>
214 void do_mat_vec_transmult_exact (
const unsigned start,
const unsigned end,
219 fprintf (stderr,
"do_mat_vec_transmult_exact (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
222 const unsigned mr = mat->
row;
224 for (
unsigned cl = start; cl <
end; ++cl)
226 for (
unsigned rw = 0; rw < mr; ++rw) {
229 for (
unsigned off = start; off <
end; ++off) {
232 corr(off-start) += (t - res->
get(off)) - y;
236 for (
unsigned cl = start; cl <
end; ++cl)
237 res->
setval(cl) -= corr(cl-start);
241 template <
typename T>
247 do_mat_vec_transmult_exact(start, end, res, mat, vec);
251 fprintf (stderr,
"do_mat_vec_transmult (pid %i): %p %p %p, %i - %i\n", getpid(), res, mat, vec, start, end);
254 const unsigned mr = mat->
row;
255 for (
unsigned cl = start; cl <
end; ++cl)
257 for (
unsigned rw = 0; rw < mr; ++rw) {
259 do_vec_add_svc<T> (end-start, &res->
setval(start), mat->
mat[rw],
fac);
264 template <
typename T>
271 const unsigned mr = mat->
row;
273 const unsigned mstr = mat->
mat[1] - mat->
mat[0];
277 for (
unsigned cl = start; cl <
end; ++cl) {
281 do_vec_mult_stride_exact<T> (mr, mat->
mat[0]+cl, vec->
vec, val, mstr);
283 do_vec_mult_stride_quick<T> (mr, mat->
mat[0]+cl, vec->
vec, val, mstr);
289 const unsigned mr = mat->
row;
290 for (
unsigned cl = start; cl <
end; cl++)
309 for (; i > 3; i -= 4) {
312 el += mat->
mat[r+1][cl] * *(vecptr+1);
314 el += mat->
mat[r+2][cl] * *(vecptr+2);
316 el += mat->
mat[r+3][cl] * *(vecptr+3);
324 for (i += 8;
i; --
i, r++)
325 el += mat->
mat[r][cl] * *vecptr++;
T ** mat
C storage layout: mat[row][col].
void do_mat_tsv_mult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const TSVector< T > *vec)
void do_mat_vec_mult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const Vector< T > *vec)
T & setval(const unsigned long i) const
#define CACHELINE_SZ
(L1) Cache line size in bytes.
long double fact(const double x)
T *const & vecptr() const
tbci_traits< T >::const_refval_type get(const unsigned long i) const
T & setval(const T &val, const unsigned int r, const unsigned int c)
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
unsigned int columns() const
number of columns
void do_mat_vec_transmult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const Vector< T > *vec)
const T & getcref(const unsigned long i) const
#define PREFETCH_W(addr, loc)
Temporary Base Class Idiom: Class TVector is used for temporary variables.
const Vector< T > Vector< T > Vector< T > Vector< T > & y
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
unsigned int do_exactsum2()
const T * getrowptr(const unsigned r) const
Helpers for matvecmul.
T & set(const T &val, const unsigned r, const unsigned c)
const unsigned TMatrix< T > const Matrix< T > * a
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
tbci_traits< T >::const_refval_type get(const unsigned r, const unsigned c) const
get, set and getcref are used internally and not for public consumption