7#ifndef TBCI_MATRIX_KERNELS_H
8#define TBCI_MATRIX_KERNELS_H
9#include "tbci/basics.h"
11template <
typename T>
class Matrix;
12template <
typename T>
class TMatrix;
14template <
typename T>
class Vector;
15template <
typename T>
class TVector;
20INST(template <typename T>
class Matrix friend void do_old_mat_mat_mult
21 (
const unsigned start,
const unsigned end,
25void do_old_mat_mat_mult (
const unsigned start,
const unsigned end,
29 const unsigned bc =
b->columns();
30 const unsigned ac =
a->columns();
32 for (
unsigned i=start;
i<
end; ++
i) {
33 for (
unsigned j=0; j<bc; ++j) {
44 res->set(val-comp,
i,j);
48 for (
unsigned i=start;
i<
end; ++
i) {
49 for (
unsigned j=0; j<bc; ++j) {
54 val +=
a->get(
i,l) *
b->get(l,j);
69#if defined(USE_PREFETCH) || defined(USE_UNR_VEC_KERNELS)
71void do_mat_mat_mult (
const unsigned start,
const unsigned end,
75#ifdef OLD_MAT_MAT_MULT
76 do_old_mat_mat_mult(start,
end,
res,
a,
b);
79 do_old_mat_mat_mult(start,
end,
res,
a,
b);
82 const unsigned bc =
b->columns();
83 const unsigned ac =
a->columns();
84 for (
unsigned i=start;
i<
end; ++
i) {
87 for (
unsigned l=0; l<
ac; ++l) {
101 for (; n > 3; n -= 4) {
102 *rptr += tmp * *bptr;
104 *(rptr+1) += tmp * *(bptr+1);
106 *(rptr+2) += tmp * *(bptr+2);
112 *(rptr-1) += tmp * *(bptr+3);
119 *rptr++ += tmp * *bptr++;
127void do_mat_mat_mult (
const unsigned start,
const unsigned end,
131 do_old_mat_mat_mult(start,
end,
res,
a,
b);
134 const unsigned bc =
b->columns();
135 const unsigned ac =
a->columns();
137 for (
unsigned i=start;
i<
end; ++
i) {
139 for (
unsigned l=0; l<
ac; ++l) {
143 for (
int n = bc; n; --n)
144 *rptr++ += tmp * *bptr++;
152#define COST_MATVEC(r,c) (r*(COST_UNIT_STORE+COST_LOOP \
153 +c*(3*COST_UNIT_LOAD+COST_CALL+COST_ADD+COST_MULT+COST_LOOP)))
161 fprintf (stderr,
"do_mat_vec_mult (pid %i): %p %p %p, %i - %i\n", getpid(),
res, mat,
vec, start,
end);
164 const unsigned mc = mat->
col;
170 for (
unsigned rw = start; rw <
end; ++rw) {
174 do_vec_mult_exact<T> (mc, mat->
mat[rw],
vec->vec, val);
176 do_vec_mult_quick<T> (mc, mat->
mat[rw],
vec->vec, val);
187 const unsigned mc = mat->
col;
191 for (
unsigned rw = start; rw <
end; ++rw) {
194 do_vec_mult_exact<T>(mc, mat->
mat[rw], tsv->
vec, val);
196 do_vec_mult_quick<T>(mc, mat->
mat[rw], tsv->
vec, val);
201 for (
unsigned rw = start; rw <
end; ++rw) {
206 do_vec_mult_exact<T> (mc, mat->
mat[rw], tsv->
vec, val);
208 do_vec_mult_quick<T> (mc, mat->
mat[rw], tsv->
vec, val);
214 void do_mat_vec_transmult_exact (
const unsigned start,
const unsigned end,
219 fprintf (stderr,
"do_mat_vec_transmult_exact (pid %i): %p %p %p, %i - %i\n", getpid(),
res, mat,
vec, start,
end);
222 const unsigned mr = mat->
row;
224 for (
unsigned cl = start; cl <
end; ++cl)
225 res->setval(cl) = (
T)0;
226 for (
unsigned rw = 0; rw < mr; ++rw) {
229 for (
unsigned off = start; off <
end; ++off) {
232 corr(off-start) += (t -
res->get(off)) -
y;
233 res->setval(off) = t;
236 for (
unsigned cl = start; cl <
end; ++cl)
237 res->setval(cl) -= corr(cl-start);
247 do_mat_vec_transmult_exact(start,
end,
res, mat,
vec);
251 fprintf (stderr,
"do_mat_vec_transmult (pid %i): %p %p %p, %i - %i\n", getpid(),
res, mat,
vec, start,
end);
254 const unsigned mr = mat->
row;
255 for (
unsigned cl = start; cl <
end; ++cl)
256 res->setval(cl) = (
T)0;
257 for (
unsigned rw = 0; rw < mr; ++rw) {
259 do_vec_add_svc<T> (
end-start, &
res->setval(start), mat->
mat[rw],
fac);
271 const unsigned mr = mat->
row;
273 const unsigned mstr = mat->
mat[1] - mat->
mat[0];
277 for (
unsigned cl = start; cl <
end; ++cl) {
281 do_vec_mult_stride_exact<T> (mr, mat->
mat[0]+cl,
vec->vec, val, mstr);
283 do_vec_mult_stride_quick<T> (mr, mat->
mat[0]+cl,
vec->vec, val, mstr);
289 const unsigned mr = mat->
row;
290 for (
unsigned cl = start; cl <
end; cl++)
309 for (;
i > 3;
i -= 4) {
324 for (
i += 8;
i; --
i, r++)
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
#define PREFETCH_W(addr, loc)
tbci_traits< T >::const_refval_type get(const unsigned r, const unsigned c) const
get, set and getcref are used internally and not for public consumption
T ** mat
C storage layout: mat[row][col].
T *const & vecptr() const
Temporary Base Class Idiom: Class TVector is used for temporary variables.
long double fact(const double x)
void do_mat_tsv_mult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const TSVector< T > *vec)
void do_mat_vec_mult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const Vector< T > *vec)
void do_mat_vec_transmult(const unsigned start, const unsigned end, TVector< T > *res, const Matrix< T > *mat, const Vector< T > *vec)
const unsigned TMatrix< T > const Matrix< T > * a
const unsigned TMatrix< T > * res
#define CACHELINE_SZ
(L1) Cache line size in bytes.
unsigned int do_exactsum2()