7 #ifndef TBCI_MATRIX_KERNELS_H 8 #define TBCI_MATRIX_KERNELS_H 9 #include "tbci/basics.h" 11 template <
typename T>
class Matrix;
12 template <
typename T>
class TMatrix;
13 template <
typename T>
class TSMatrix;
14 template <
typename T>
class Vector;
15 template <
typename T>
class TVector;
16 template <
typename T>
class TSVector;
20 INST(template <typename T>
class Matrix friend void do_old_mat_mat_mult
21 (
const unsigned start,
const unsigned end,
25 void do_old_mat_mat_mult (
const unsigned start,
const unsigned end,
29 const unsigned bc =
b->columns();
30 const unsigned ac =
a->columns();
32 for (
unsigned i=start;
i<
end; ++
i) {
33 for (
unsigned j=0; j<bc; ++j) {
44 res->set(val-comp,
i,j);
48 for (
unsigned i=start;
i<
end; ++
i) {
49 for (
unsigned j=0; j<bc; ++j) {
54 val +=
a->get(
i,l) *
b->get(l,j);
69 #if defined(USE_PREFETCH) || defined(USE_UNR_VEC_KERNELS) 71 void do_mat_mat_mult (
const unsigned start,
const unsigned end,
75 #ifdef OLD_MAT_MAT_MULT 76 do_old_mat_mat_mult(start,
end,
res,
a,
b);
79 do_old_mat_mat_mult(start,
end,
res,
a,
b);
82 const unsigned bc =
b->columns();
83 const unsigned ac =
a->columns();
84 for (
unsigned i=start;
i<
end; ++
i) {
87 for (
unsigned l=0; l<
ac; ++l) {
101 for (; n > 3; n -= 4) {
102 *rptr += tmp * *bptr;
104 *(rptr+1) += tmp * *(bptr+1);
106 *(rptr+2) += tmp * *(bptr+2);
112 *(rptr-1) += tmp * *(bptr+3);
119 *rptr++ += tmp * *bptr++;
126 template <
typename T>
127 void do_mat_mat_mult (
const unsigned start,
const unsigned end,
131 do_old_mat_mat_mult(start,
end,
res,
a,
b);
134 const unsigned bc =
b->columns();
135 const unsigned ac =
a->columns();
137 for (
unsigned i=start;
i<
end; ++
i) {
139 for (
unsigned l=0; l<
ac; ++l) {
143 for (
int n = bc; n; --n)
144 *rptr++ += tmp * *bptr++;
152 #define COST_MATVEC(r,c) (r*(COST_UNIT_STORE+COST_LOOP \ 153 +c*(3*COST_UNIT_LOAD+COST_CALL+COST_ADD+COST_MULT+COST_LOOP))) 155 template <
typename T>
161 fprintf (stderr,
"do_mat_vec_mult (pid %i): %p %p %p, %i - %i\n", getpid(),
res, mat,
vec, start,
end);
164 const unsigned mc = mat->
col;
165 #if 0 //def TBCI_SIMD_SUM 170 for (
unsigned rw = start; rw <
end; ++rw) {
174 do_vec_mult_exact<T> (mc, mat->
mat[rw],
vec->vec, val);
176 do_vec_mult_quick<T> (mc, mat->
mat[rw],
vec->vec, val);
181 template <
typename T>
187 const unsigned mc = mat->
col;
189 #if 0 //def TBCI_SIMD_SUM 191 for (
unsigned rw = start; rw <
end; ++rw) {
194 do_vec_mult_exact<T>(mc, mat->
mat[rw], tsv->
vec, val);
196 do_vec_mult_quick<T>(mc, mat->
mat[rw], tsv->
vec, val);
201 for (
unsigned rw = start; rw <
end; ++rw) {
206 do_vec_mult_exact<T> (mc, mat->
mat[rw], tsv->
vec, val);
208 do_vec_mult_quick<T> (mc, mat->
mat[rw], tsv->
vec, val);
213 template <
typename T>
214 void do_mat_vec_transmult_exact (
const unsigned start,
const unsigned end,
219 fprintf (stderr,
"do_mat_vec_transmult_exact (pid %i): %p %p %p, %i - %i\n", getpid(),
res, mat,
vec, start,
end);
222 const unsigned mr = mat->
row;
224 for (
unsigned cl = start; cl <
end; ++cl)
225 res->setval(cl) = (
T)0;
226 for (
unsigned rw = 0; rw < mr; ++rw) {
229 for (
unsigned off = start; off <
end; ++off) {
232 corr(off-start) += (t -
res->get(off)) -
y;
233 res->setval(off) = t;
236 for (
unsigned cl = start; cl <
end; ++cl)
237 res->setval(cl) -= corr(cl-start);
241 template <
typename T>
247 do_mat_vec_transmult_exact(start,
end,
res, mat,
vec);
251 fprintf (stderr,
"do_mat_vec_transmult (pid %i): %p %p %p, %i - %i\n", getpid(),
res, mat,
vec, start,
end);
254 const unsigned mr = mat->
row;
255 for (
unsigned cl = start; cl <
end; ++cl)
256 res->setval(cl) = (
T)0;
257 for (
unsigned rw = 0; rw < mr; ++rw) {
259 do_vec_add_svc<T> (
end-start, &
res->setval(start), mat->
mat[rw],
fac);
264 template <
typename T>
271 const unsigned mr = mat->
row;
273 const unsigned mstr = mat->
mat[1] - mat->
mat[0];
277 for (
unsigned cl = start; cl <
end; ++cl) {
281 do_vec_mult_stride_exact<T> (mr, mat->
mat[0]+cl,
vec->vec, val, mstr);
283 do_vec_mult_stride_quick<T> (mr, mat->
mat[0]+cl,
vec->vec, val, mstr);
289 const unsigned mr = mat->
row;
290 for (
unsigned cl = start; cl <
end; cl++)
309 for (;
i > 3;
i -= 4) {
324 for (
i += 8;
i; --
i, r++)
T ** mat
C storage layout: mat[row][col].
void do_mat_tsv_mult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const TSVector< T > *vec)
void do_mat_vec_mult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const Vector< T > *vec)
#define CACHELINE_SZ
(L1) Cache line size in bytes.
long double fact(const double x)
const unsigned TMatrix< T > * res
const unsigned TMatrix< T > const Matrix< T > const Matrix< T > * b
INST(template< typename T > class Matrix friend void do_old_mat_mat_mult(const unsigned start, const unsigned end, TMatrix< T > *res, const Matrix< T > *a, const Matrix< T > *b);) template< typename T > void do_old_mat_mat_mult(const unsigned start
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
void do_mat_vec_transmult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const Vector< T > *vec)
#define PREFETCH_W(addr, loc)
Temporary Base Class Idiom: Class TVector is used for temporary variables.
T *const & vecptr() const
tbci_traits< T >::const_refval_type get(const unsigned r, const unsigned c) const
get, set and getcref are used internally and not for public consumption
unsigned int do_exactsum2()
const unsigned TMatrix< T > const Matrix< T > * a
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...