// TBCI Numerical high perf. C++ Library 2.8.0
// band_matrix.h
// (Recovered from the generated "Go to the documentation of this file"
//  Doxygen view; stray standalone line numbers were extraction artifacts.)
6// Last change: 98/11/03 KG
7/* $Id: band_matrix.h,v 1.64.2.119 2022/11/03 17:28:10 garloff Exp $ */
8
9#ifndef TBCI_BAND_MATRIX_H
10#define TBCI_BAND_MATRIX_H
11
12//#define BDMAT_NEW_ALLOC
13
#include <cstddef>	/* ptrdiff_t, used for portable pointer re-basing in adddiag() */
14#include "tbci/basics.h"
15#include "tbci/vector.h"
16#include "tbci/matrix.h"
17
18// Important: Values smaller than this will be considered ZERO
19// The default is reasonable for doubles which are of order 1
20#ifndef BD_MINVAL
21# define BD_MINVAL 1e-16
22#endif
23
25#ifndef BDMAT_NEW_ALLOC
26# define BDMAT_ONE_BLOCK
27#endif
28
29#ifdef BDMAT_MEM_TRACE
30# define BDMATDBG(x) x
31#else
32# define BDMATDBG(x)
33#endif
34
35// Avoid -fguiding-decls
36#if !defined(NO_GD) && !defined(AUTO_DECL)
37# include "tbci/band_matrix_gd.h"
38#endif
39
41
42#ifndef TBCI_DISABLE_EXCEPT
43//# include "except.h" is in basics.h
44
46class BdMatrixErr : public NumErr
47{
48 public:
50 : NumErr("Error in var conf Band_Matrix class") {}
51 BdMatrixErr(const char* t, const long i = 0)
52 : NumErr(t, i) {}
54 : NumErr(be) {}
55 virtual ~BdMatrixErr() throw() {}
56};
57#endif /* TBCI_DISABLE_EXCEPT */
58
59#ifdef PRAGMA_I
60# pragma interface "band_matrix.h"
61#endif
62
63// Forward - declaration
64template <typename T> class ILU0_BdMatrixPreconditioner;
65template <typename T> class DILU_BdMatrixPreconditioner;
66
// ---------------------------------------------------------------------------
// BdMatrix<T>: square band matrix with a variable diagonal configuration.
// Each offset d listed in diagconf owns an upper slice (adiag[d], length
// dim-d) and a lower slice (bdiag[d]); the main diagonal lives in the
// member `diag`.  Elements outside every allocated diagonal read as zero
// through the shared `dummy` slot (writes there are caught in ~BdMatrix).
// NOTE(review): this declaration was damaged by extraction -- several lines
// were dropped (flagged inline below); the surviving code is kept verbatim.
// ---------------------------------------------------------------------------
102template <typename T>
103class BdMatrix : public Matrix_Sig<T>
104{
105 friend class ILU0_BdMatrixPreconditioner<T>;
106 friend class DILU_BdMatrixPreconditioner<T>;
107 protected:
108
// dim: matrix dimension; maxoff: largest allocated off-diagonal offset.
109 unsigned int dim, maxoff;
// arsz: total number of stored elements (main diagonal + all offdiag pairs).
110 unsigned long arsz;
// Offsets of the allocated off-diagonal pairs.
111 BVector <unsigned> diagconf;
// Per-offset pointers to the upper (adiag) / lower (bdiag) diagonal slices.
112 BVector <T*> adiag, bdiag;
// NOTE(review): a member declaration was dropped here by extraction; the
// implementation below uses `T* diag` (main-diagonal / block storage) --
// presumably declared on the missing line 113.
114 //BVector<unsigned int> bestrow;
115
116 //protected constructor
117 void constructor(const unsigned int, const bool = true);
// check(): bounds-checked address of element (r,c), NULL if off-band.
118 T* check(const unsigned r, const unsigned c) const HOT;
// check_internal(): same lookup without the range check.
119 T* check_internal(const unsigned r, const unsigned c) const HOT;
120 inline void free_diags(const unsigned);
121 inline void do_copy(const BdMatrix<T>&);
122
123 private:
// Fallback element aliased by off-band accesses; must stay zero.
124 mutable T dummy ALIGN(MIN_ALIGN);
// Cached stream formatting (width/precision/fill) set via setoutopts().
125 mutable int outw, outp, outf;
126 mutable bool outset;
127
128 inline bool search2(unsigned int&, const unsigned int, const unsigned int) HOT;
129
130 public:
131
132 typedef T value_type;
133 typedef T element_type;
134 typedef T aligned_value_type TALIGN(MIN_ALIGN2);
135 // constructors and destructors
136
138 explicit BdMatrix(const unsigned int = 0);
140 explicit BdMatrix(const unsigned int, const unsigned int);
142 explicit BdMatrix(const unsigned int, const BVector<unsigned int>&);
144 BdMatrix(const BdMatrix<T>&);
146 BdMatrix(const T&, const unsigned int);
147 BdMatrix(const T&, const unsigned int, const BVector<unsigned int>&);
149 BdMatrix(const T&, const unsigned int, unsigned int);
151 BdMatrix(const Matrix<T>&);
// NOTE(review): the destructor declaration (~BdMatrix();) appears to have
// been dropped by extraction around line 152 -- its definition exists below.
153
154
// Release all storage and reset to an empty 0x0 matrix.
156 void destroy()
157 {
158 free_diags(diagconf.size());
159 dim = 0; arsz = 0;
160 adiag.resize(0); bdiag.resize(0); diagconf.resize(0);
161 }
162
163 // allow instantiation (Matrix_Sig)
164 /*virtual*/ static const char* mat_info()
165 { return "BdMatrix"; }
166
167#ifndef HAVE_PROMOTION_BUG
168 // Promotion (only explicit)
169# ifndef HAVE_GCC295_TMPLFRNDCLS_BUG
170 template <typename U> friend class BdMatrix;
// Element-wise converting copy from a BdMatrix of another element type.
171 template <typename U> explicit BdMatrix(const BdMatrix<U>& bm)
172 : diagconf(bm.diagconf), adiag(bm.dim), bdiag(bm.dim)
173 {
174 constructor(bm.dim);
175 for (unsigned k=0; k<dim; ++k)
176 diag[k] = bm.diag[k];
177 for (unsigned i=0; i<diagconf.size(); ++i) {
178 const unsigned di = diagconf.get(i);
179 for (unsigned j=0; j<dim-di; ++j) {
180 adiag.get(di)[j] = bm.adiag.get(di)[j];
181 bdiag.get(di)[j] = bm.bdiag.get(di)[j];
182 }
183 }
184 if (bm.outset) {
185 outw = bm.outw; outp = bm.outp;
186 outf = bm.outf; outset = true;
187 }
188 }
189# endif
190#endif
191
193 operator TMatrix<T>() const;
194 //operator Matrix<T>() const;
195
197 T& operator() (const unsigned int, const unsigned int) HOT;
199 typename tbci_traits<T>::const_refval_type
200 operator() (const unsigned int, const unsigned int) const HOT;
201 typename tbci_traits<T>::const_refval_type
202 get(const unsigned int, const unsigned int) const HOT;
203
// setval(v,r,c): assignment with auto-allocation of missing diagonals.
205 BdMatrix<T>& setval(const T& v, const unsigned int r,
206 const unsigned int c)
207 { return autoinsert(v,r,c); }
208 T& setval(const unsigned int r, const unsigned int c);
209 //const T& getcref(const unsigned r, const unsigned c) const
210 //{ return (*this)(r,c); }
211
212 //T val(const unsigned int i, const unsigned int j) const
213 // { return this->operator() (i, j); }
214
215 // Basic operations
218
219 BdMatrix<T>& resize(const unsigned int);
220 BdMatrix<T>& resize(const T&, const unsigned int);
221 BdMatrix<T>& resize(const unsigned int, const BVector<unsigned int>&);
222 BdMatrix<T>& resize(const T&, const unsigned int, const BVector<unsigned int>&);
// NOTE(review): the signature line for the inline body below was dropped by
// extraction (around line 224) -- presumably a copy()-style member taking
// `const BdMatrix<T>& bdm`; only its body survived.
225 { resize(bdm.dim, bdm.diagconf); return *this = bdm; }
226
228 BdMatrix<T>& setunit(const T& = 1); //unity Matrix
229 BdMatrix<T>& fill(const T& = 0);
// NOTE(review): a `clear();` declaration was presumably dropped here
// (line 230) -- its definitions exist below.
231 BdMatrix<T>& adddiag(const unsigned);
232 BdMatrix<T>& removediag(const unsigned);
233 BdMatrix<T>& autoinsert(const T&, const unsigned, const unsigned);
234 BdMatrix<T>& expand(unsigned = 0);
236 BdMatrix<T>& set_row(const Vector<T>& val, const unsigned, const unsigned = 0);
238 TVector<T> get_row(const unsigned int) const;
240 TVector<T> get_col(const unsigned int) const;
241
243 inline unsigned int size() const { return dim; }
245 inline unsigned int noelem() const { return arsz; }
247 inline unsigned int rows() const { return dim; }
249 inline unsigned int columns() const { return dim; }
250
252 const BVector<unsigned int>& getcfg() const { return diagconf; }
254 unsigned int getmaxoff() const { return maxoff; }
255
// Sum of the main diagonal.
256 T trace() const
257 {
258 T r ALIGN(MIN_ALIGN) = 0;
259 for (REGISTER unsigned i = 0; i < dim; ++i)
260 r += diag[i];
261 return r;
262 }
263 //T sdet(unsigned int, BVector<unsigned int>&, BVector<unsigned int>&);
264 //T subdet(BVector<unsigned int>, BVector<unsigned int>);
265 //T det() { return subdet(BVector<unsigned>(0), BVector<unsigned>(0)); }
266
// NOTE(review): declaration lines 267-268 and 270-271 were dropped --
// presumably `operator = (const BdMatrix<T>&)` and `operator +=`/`-=`.
269 BdMatrix<T>& operator = (const T&);
272 //BdMatrix<T>& operator *= (BdMatrix<T>&);
273 BdMatrix<T>& operator *= (const T&);
274 BdMatrix<T>& operator /= (const T&);
275
276 // Don't need friendship here. Good!
277#ifdef BDMAT_TRANS_FRIENDS
279#endif
280
281 bool operator == (const BdMatrix<T>&) const;
282 bool operator != (const BdMatrix<T>& ma) const { return !(*this == ma); }
283
284 BdMatrix<T> operator + (const BdMatrix<T>&) const;
285 BdMatrix<T> operator - (const BdMatrix<T>&) const;
286 BdMatrix<T> operator * (const BdMatrix<T>&) const;
287 BdMatrix<T> operator * (const T&) const;
288 // Friendship also not required
289#ifdef BDMAT_OP_FRIENDS
290 friend NOINST BdMatrix<T> operator * FGD (const T&, const BdMatrix<T>&);
291#endif
292
293 BdMatrix<T> operator / (const T&) const;
294 BdMatrix<T> operator - () const;
295
296 /*
297 * friend Vector<T> rvec FGD (const BdMatrix<T>&);
298 * friend Vector<T> cvec FGD (const BdMatrix<T>&);
299 */
300
302 TVector<T> operator * (const Vector<T>&) const HOT;
304 TVector<T> transMult(const Vector<T>&) const HOT;
305 //TVector<T> transMult(const TVector<T>&) const;
306 //TVector<T> transMult(const TSVector<T>&) const;
307 TVector<T> dotMult(const Vector<T>&) const;
308
310
314 BdMatrix<T>& mult_row(const T&, const unsigned);
315 BdMatrix<T>& div_row(const T&, const unsigned);
316
317 // I/O operations
318
319 friend STD__ istream& operator >> FGD (STD__ istream&, BdMatrix<T>&);
320 friend STD__ ostream& operator << FGD (STD__ ostream&, const BdMatrix<T>&);
// Cache output width/precision/fill used by operator<<.
321 const BdMatrix<T>& setoutopts(int w = 5, int p = 4, char f= ' ') const
322 { outw = w; outp = p; outf = f; outset = true; return *this; }
323
324 // Solver
325 friend NOINST void /* FRIEND_TBCI2__ */ gaussj FGDT (const BdMatrix<T>&,
326 const Matrix<T>&);
327 friend NOINST int /* FRIEND_TBCI2__ */ lu_decomp FGDT (BdMatrix<T>&) HOT;
328
329 // Inversion by determinants
331
332 // Friends for bdmat-vec mult
333 friend void FRIEND_TBCI2__ do_bdmat_vec_mult_lnw FGDT (const unsigned, const unsigned,
334 TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
335 friend void FRIEND_TBCI2__ do_bdmat_vec_mult_lnw_opt FGDT (const unsigned, const unsigned,
336 TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
337 friend void FRIEND_TBCI2__ do_bdmat_vec_mult_diagw_exact FGDT (const unsigned, const unsigned,
338 TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
339 friend void FRIEND_TBCI2__ do_bdmat_vec_mult FGDT (const unsigned, const unsigned,
340 TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
341 friend void FRIEND_TBCI2__ do_bdmat_vec_transmult_lnw FGDT (const unsigned, const unsigned,
342 TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
343 friend void FRIEND_TBCI2__ do_bdmat_vec_transmult_lnw_opt FGDT (const unsigned, const unsigned,
344 TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
345 friend void FRIEND_TBCI2__ do_bdmat_vec_transmult_diagw_exact FGDT (const unsigned, const unsigned,
346 TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
347 friend void FRIEND_TBCI2__ do_bdmat_vec_transmult FGDT (const unsigned, const unsigned,
348 TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
349 friend void FRIEND_TBCI2__ do_bdmat_vec_dotmult FGDT (const unsigned, const unsigned,
350 TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
351};
352
353
355#ifdef BDMAT_ONE_BLOCK
356
357template <typename T>
358inline void BdMatrix<T>::free_diags(const unsigned dummy)
359{
360 BDMATDBG(STD__ cerr << "BdMat free " << diag << " (" << dim << ")" << STD__ endl);
361 if (dim)
363}
364
365template <typename T>
366inline void BdMatrix<T>::do_copy(const BdMatrix<T>& bdm)
367{
368 TBCICOPY(diag, bdm.diag, T, arsz);
369 // In case standard setup has been changed, e.g. by transpose
370 for (unsigned i = 0; i < diagconf.size(); ++i) {
371 const unsigned di = diagconf.get(i);
372 adiag.set(di) = (T*)((long)diag + (long)bdm.adiag.get(di) - (long)bdm.diag);
373 bdiag.set(di) = (T*)((long)diag + (long)bdm.bdiag.get(di) - (long)bdm.diag);
374 }
375}
376
377template <typename T>
378void BdMatrix<T>::constructor(const unsigned int d, const bool delout)
379{
380 dim = d; maxoff = 0;
381 //maxoff = findmax(diagconf);
382 BCHKNR(dim > 0 && maxoff >= dim, BdMatrixErr, Wrong config Vector, maxoff);
383 adiag.resize((T*)0, d); bdiag.resize((T*)0, d); //initialization with NULL Vectors
384 unsigned long sz = dim; int t;
385 const unsigned long dcs = diagconf.size();
386 for (REGISTER unsigned j = 0; j < dcs; ++j)
387 if ((t = ((signed)dim - (signed)diagconf.get(j))) > 0)
388 sz += 2 * t;
389 if (UNLIKELY(sz)) {
390 diag = NEW(T, sz);
391 if (LIKELY(diag)) {
392 adiag.set(0) = diag; bdiag.set(0) = diag; sz = dim;
393 for (unsigned i = 0; i < dcs; ++i) {
394 const unsigned int di = diagconf.get(i);
395 int s = (signed)dim - (signed)di;
396 if (UNLIKELY(s <= 0 || s >= (signed)dim)) {
397 STD__ cerr << "Wrong config. Vector supplied!" << STD__ endl;
398 s = 0;
399 } else {
400 adiag.set(di) = diag+sz; sz += s;
401 bdiag.set(di) = diag+sz; sz += s;
402 }
403 if (LIKELY(di > maxoff))
404 maxoff = di;
405 }
406 } else { // alloc failure
407 adiag.resize(0); bdiag.resize(0); dim = 0;
408 STD__ cerr << "BdMat ctor: malloc failed to get " << sz << " bytes!" << STD__ endl;
409 BCHKNR(!diag, BdMatrixErr, memory alloc failure, sz);
410 sz = 0;
411 }
412 }
413 else
414 diag = (T*)0;
415
416 arsz = sz; dummy = (T)0;
417 BDMATDBG(STD__ cerr << "BdMat alloc " << diag << " (" << sz << ")" << STD__ endl);
418 if (delout)
419 outset = false;
420}
421
422#else /* ! BDMAT_ONE_BLOCK */
423
424/* Free diags until diagconf(max)-1 */
425template <typename T>
426inline void BdMatrix<T>::free_diags(const unsigned u)
427{
428 for (int i = u-1; i >= 0; --i) {
429 // The diags could have been swapped by transpose ()
430 REGISTER const unsigned di = diagconf.get(i);
431 REGISTER T* ptr = MIN(adiag.get(di), bdiag.get(di));
432 BDMATDBG(STD__ cerr << "BdMat free " << ptr << STD__ endl);
433 TBCIDELETE(T, ptr, 2*(dim-di));
434 }
435 BDMATDBG(STD__ cerr << "BdMat free " << diag << " (" << dim << ")" << STD__ endl);
436 if (LIKELY(dim))
437 TBCIDELETE(T, diag, dim);
438}
439
440template <typename T>
441inline void BdMatrix<T>::do_copy(const BdMatrix<T>& bdm)
442{
443 TBCICOPY(diag, bdm.diag, T, dim);
444 for (unsigned i = 0; i < diagconf.size(); ++i) {
445 REGISTER const unsigned di = diagconf.get(i);
446 TBCICOPY(adiag.get(di), bdm.adiag.get(di), T, (dim-di));
447 TBCICOPY(bdiag.get(di), bdm.bdiag.get(di), T, (dim-di));
448 }
449}
450
451template <typename T>
452void BdMatrix<T>::constructor(const unsigned int d, const bool delout)
453{
454 unsigned long sz = 0; unsigned i = 0;
455 dim = d; maxoff = 0;
456 //maxoff = findmax(diagconf);
457 BCHKNR (dim > 0 && maxoff >= dim, BdMatrixErr, Wrong config Vector, maxoff);
458 //initialization with NULL Vectors
459 adiag.resize((T*)0, d);
460 bdiag.resize((T*)0, d);
461
462 if (LIKELY(!dim))
463 diag = (T*)0;
464 else {
465 diag = NEW(T,dim);
466 if (UNLIKELY(!diag))
467 goto memallocfail;
468 sz += dim;
469 adiag.set(0) = diag; bdiag.set(0) = diag; // paranoia
470 }
471
472 while (i < diagconf.size()) {
473 const unsigned di = diagconf.get(i);
474 int s = (signed)dim - (signed)di;
475 if (UNLIKELY(s <= 0 || s >= (signed)dim)) {
476 BCHKNR(true, BdMatrixErr, Illegal Config BVector, di);
477 // Try to salvage
479 } else {
480 T* ptr = NEW(T, 2*s);
481 if (UNLIKELY(!ptr))
482 goto memallocfail;
483 adiag.set(di) = ptr;
484 bdiag.set(di) = ptr+s;
485 if (LIKELY(di > maxoff))
486 maxoff = di;
487 sz += 2*s; ++i;
488 }
489 }
490 arsz = sz; dummy = (T)0;
491 if (delout)
492 outset = false;
493 BDMATDBG(STD__ cerr << "BdMat alloc " << diag << " (" << sz << ")" << STD__ endl);
494 return;
495 memallocfail:
496 free_diags(i);
497 adiag.resize(0); bdiag.resize(0); dim = 0; //sz = 0;
498 STD__ cerr << "BdMat ctor: mem alloc (" << sz << ") failed!" << STD__ endl;
499 BCHKNR(1, BdMatrixErr, mem alloc failure, sz);
500 return;
501}
502
503#endif /* BDMAT_ONE_BLOCK */
504
505template <typename T>
506inline BdMatrix<T>::BdMatrix(const unsigned int d)
507: diagconf(1, 1), adiag(d), bdiag(d)
508{
509 if (UNLIKELY(d <= 1))
510 diagconf.resize(0); // default: Tridiagonal shape
511 constructor(d); clear();
512}
513
514template <typename T>
515inline BdMatrix<T>::BdMatrix(const unsigned int d, const unsigned int e)
516: diagconf(1, 1), adiag(d), bdiag(d)
517{
518 BCHKNR(e != d, BdMatrixErr, "Non square BdMatrix shape impossible!", e);
519 if (UNLIKELY(d <= 1))
520 diagconf.resize(0); // default: Tridiagonal shape
521 constructor(d); clear();
522}
523
524template <typename T>
525inline BdMatrix<T>::BdMatrix(const unsigned int d, const BVector<unsigned int>& conf)
526: diagconf(conf), adiag(d), bdiag(d)
527{
528 constructor(d); clear();
529}
530
531
532template <typename T>
534: diagconf(mat.diagconf), adiag(mat.dim), bdiag(mat.dim)
535{
536 constructor(mat.dim); do_copy(mat);
537
538 if (UNLIKELY(mat.outset)) {
539 outw = mat.outw; outp = mat.outp;
540 outf = mat.outf; outset = true;
541 }
542}
543
544template <typename T>
545inline BdMatrix<T>::BdMatrix(const T& val, const unsigned int d)
546: diagconf(1, 1), adiag(d), bdiag(d)
547{
548 if (UNLIKELY(d <= 1))
549 diagconf.resize(0); // default: Tridiagonal shape
550 constructor(d);
551 setunit(val);
552}
553
554template <typename T>
555inline BdMatrix<T>::BdMatrix(const T& val, const unsigned int d, unsigned int mo)
556: diagconf(mo), adiag(d), bdiag(d)
557{
558 if (mo >= d)
559 mo = d - 1;
560 diagconf.resize(mo);
561 for (REGISTER unsigned int t = 0; t < mo; ++t)
562 diagconf.set(t) = t+1;
563
564 constructor(d);
565 setunit(val);
566}
567
568
569template <typename T>
570inline BdMatrix<T>::BdMatrix(const T& val, const unsigned int d,
571 const BVector<unsigned int>& conf)
572: diagconf(conf), adiag(d), bdiag(d)
573{
574 constructor(d);
575 setunit(val);
576}
577
578
579template <typename T>
581: diagconf(0, 0), adiag(mat.rows()), bdiag(mat.columns())
582{
583 BCHKNR(mat.columns() != mat.rows(), BdMatrixErr, Only quadratic matrices, 0);
584
585 //set up conf Vector
586 unsigned int ds = 0;
587 for (unsigned int l = 1; l < mat.columns(); ++l) {
588 for (unsigned REGISTER j = 0;
589 (signed)j < ((signed)(mat.rows() - l)); ++j) {
590 if (mat(j, j+l) != T(0) || mat(j+l, j) != T(0)) {
591 diagconf.resize(++ds);
592 diagconf.set(ds - 1) = l;
593 break;
594 }
595 }
596 }
597
598 //std::cout << diagconf << std::endl;
599 constructor(mat.columns());
600
601 //copy diagonal
602 for (REGISTER unsigned int k = 0; k < dim; ++k)
603 diag[k] = mat(k, k);
604 //copy configured offdiags
605 for (unsigned int i = 0; i < diagconf.size(); ++i) {
606 const unsigned int dci = diagconf.get(i);
607 for (REGISTER unsigned int j = 0; j < dim - dci; ++j) {
608 adiag.get(dci)[j] = mat(j, j + dci);
609 bdiag.get(dci)[j] = mat(dci + j, j);
610 }
611 }
612}
613
614template <typename T>
616{
617 BCHKNR(dummy!=(T)0, BdMatrixErr, dummy not zero, 0);
618 free_diags(diagconf.size());
619}
620
621
622template <typename T>
623inline bool BdMatrix<T>::search2(unsigned int& cfg, const unsigned int i,
624 const unsigned int j)
625{
626 for (REGISTER int t = 0; (unsigned)t < diagconf.size(); ++t) {
627 if (UNLIKELY(CSTD__ abs((int)(j - i)) == (int)diagconf.get(t))) {
628 cfg = t;
629 return true;
630 }
631 }
632 return false;
633}
634
635template <typename T>
636inline T* BdMatrix<T>::check_internal(const unsigned i, const unsigned j) const
637{
638 const int df = i-j;
639 if (LIKELY(!df))
640 return diag+i;
641 if (CSTD__ abs(df) > (int)maxoff)
642 return (T*)0;
643
644 T* ptr;
645 if (df > 0) {
646 if (LIKELY(ptr = bdiag.get( df)))
647 return ptr+j;
648 else
649 return (T*)0;
650 } else {
651 if (LIKELY(ptr = adiag.get(-df)))
652 return ptr+i;
653 else
654 return (T*)0;
655 }
656}
657
658template <typename T>
659inline T* BdMatrix<T>::check(const unsigned i, const unsigned j) const
660{
661 EXPCHK(i>=dim, BdMatrixErr, Invalid row index, i, (T*)0);
662 EXPCHK(j>=dim, BdMatrixErr, Invalid col index, j, (T*)0);
663 return check_internal(i, j);
664}
665
666
667template <typename T>
668inline T& BdMatrix<T>::operator() (const unsigned int r, const unsigned int c)
669{
670 REGISTER T* pt = check(r, c);
671 if (LIKELY(pt))
672 return *pt;
673 else {
674 //EXPCHK(dummy != (T)0, BdMatrixErr, dummy not zero, 0, (dummy=(T)0));
675 return (dummy = (T)0);
676 }
677}
678
679template <typename T>
680inline typename tbci_traits<T>::const_refval_type
681BdMatrix<T>::operator() (const unsigned int r, const unsigned int c) const
682{
683 REGISTER T* pt = check(r, c);
684 if (LIKELY(pt))
685 return *pt;
686 else {
687 //EXPCHK(dummy != (T)0, BdMatrixErr, dummy not zero, 0, (dummy=(T)0));
688 return (dummy = (T)0);
689 }
690}
691
692template <typename T>
693inline typename tbci_traits<T>::const_refval_type
694BdMatrix<T>::get(const unsigned int r, const unsigned int c) const
695{
696 REGISTER T* pt = check_internal(r, c);
697 if (LIKELY(pt))
698 return *pt;
699 else {
700 //EXPCHK(dummy != (T)0, BdMatrixErr, dummy not zero, 0, (dummy=(T)0));
701 return (dummy = (T)0);
702 }
703}
704
705
706template <typename T>
707T& BdMatrix<T>::setval(const unsigned int r, const unsigned int c)
708{
709 //Auto-insert
710 REGISTER T* pt = check(r, c);
711 if (LIKELY(pt))
712 return *pt;
713 else {
714 adddiag(CSTD__ abs((int)(r-c)));
715 pt = check(r, c);
716 BCHK(!pt, BdMatrixErr, Internal autoinsert error, 0, dummy=(T)0);
717 //*pt = (T)0;
718 }
719 return *pt;
720}
721
722
723#ifdef BDMAT_ONE_BLOCK
724template <typename T>
726{
727 TBCIFILL(diag, val, T, dim);
728 if (arsz > dim)
729 TBCIFILL((diag+dim), (T)0, T, (arsz-dim));
730 dummy = (T)0;
731 return *this;
732}
733#else
734template <typename T>
735inline BdMatrix<T>& BdMatrix<T>::setunit(const T& val)
736{
737 TBCIFILL(diag, val, T, dim);
738 for (unsigned i = 0; i < diagconf.size(); ++i) {
739 const REGISTER unsigned di = diagconf.get(i);
740 TBCIFILL(adiag.get(di), (T)0, T, (dim-di));
741 TBCIFILL(bdiag.get(di), (T)0, T, (dim-di));
742 }
743 dummy = (T)0;
744 return *this;
745}
746#endif
747
748template <typename T>
749BdMatrix<T>& BdMatrix<T>::resize(const unsigned int nd)
750{
751 if (nd == dim)
752 return *this;
753
754 T* oldd(diag); unsigned int od = dim;
755#ifdef BDMAT_ONE_BLOCK
756 unsigned int oldar = arsz;
757#endif
758 BVector<T*> oldad(adiag), oldbd(bdiag);
760
761 //compatibility with constructor
762 if (od <= 1 && nd > 1)
763 diagconf.resize(1, 1);
764 constructor(nd, false); //dim = nd;
765 int min = MIN(nd, od);
766
767 if (min) {
768 //copy diagonal
769 TBCICOPY(diag, oldd, T, min);
770 if (nd > od)
771 TBCIFILL((diag+min), (T)0, T, (nd-dim));
772 //copy offdiags
773 for (unsigned int t = 0; t < diagconf.size(); ++t) {
774 const REGISTER int s = diagconf.get(t);
775 if (min - s > 0) {
776 TBCICOPY(adiag.get(s), oldad.get(s), T, (min-s));
777 TBCICOPY(bdiag.get(s), oldbd.get(s), T, (min-s));
778 if (nd > od) {
779 TBCIFILL((adiag.get(s)+(min-s)), (T)0, T, (nd-dim));
780 TBCIFILL((bdiag.get(s)+(min-s)), (T)0, T, (nd-dim));
781 }
782 }
783 }
784 }
785
786#ifndef BDMAT_ONE_BLOCK
787 for (unsigned i = 0; i < oldconf.size(); ++i)
788 TBCIDELETE(T, MIN(oldad.get(oldconf.get(i)), oldbd.get(oldconf.get(i))),
789 2*(od-oldconf.get(i)));
790 if (od)
791 TBCIDELETE(T, oldd, od);
792#else
793 if (oldar)
794 TBCIDELETE(T, oldd, oldar);
795#endif
796 return *this;
797}
798
799
800template <typename T>
801BdMatrix<T>& BdMatrix<T>::resize(const T& val, const unsigned int nd)
802{
803 if (nd != dim) {
804 if (dim)
805 free_diags(diagconf.size());
806 //compatibility with constructor
807 if (dim <= 1 && nd > 1)
808 diagconf.resize(1, 1);
809 constructor(nd, false);
810 }
811 setunit(val);
812 return *this;
813}
814
815
816//Warning: This resize() will NOT preserve data
817template <typename T>
818BdMatrix<T>& BdMatrix<T>::resize(const unsigned int nd, const BVector<unsigned int>& cfg)
819{
820 if (nd == dim && cfg == diagconf)
821 return *this;
822 if (dim)
823 free_diags(diagconf.size());
824 diagconf.copy(cfg);
825 constructor(nd, false);
826 clear();
827 return *this;
828}
829
830
831template <typename T>
832BdMatrix<T>& BdMatrix<T>::resize(const T& val, const unsigned int nd,
833 const BVector<unsigned int>& cfg)
834{
835 resize(nd, cfg);
836 //setunit(val);
837 //clear(); is called by resize(nd, cfg);
838 for (unsigned i = 0; i < dim; ++i)
839 diag[i] = val;
840 return *this;
841}
842
843
844//reconfigures a BdMatrix and preserves its contents
845template <typename T>
847{
848 if (dim == 0 || cfg == diagconf)
849 return *this;
850 T* oldd = diag;
851#ifdef BDMAT_ONE_BLOCK
852 unsigned int oldar = arsz;
853#else
854 unsigned int olddim = dim;
855#endif
856 BVector<T*> oldad(adiag), oldbd(bdiag);
858
859 diagconf.resize(cfg); //diagconf = cfg;
860 constructor(dim, false);
861 //delete junk
862 clear();
863 //copy elements: diagonal
864 TBCICOPY(diag, oldd, T, dim);
865 // offdiag
866 for (REGISTER unsigned t = 0; t < diagconf.size(); ++t) {
867 const unsigned int dct = diagconf.get(t);
868 if (oldad(dct)) {
869 TBCICOPY(adiag.get(dct), oldad.get(dct), T, (dim-dct));
870 TBCICOPY(bdiag.get(dct), oldbd.get(dct), T, (dim-dct));
871 }
872 }
873#ifndef BDMAT_ONE_BLOCK
874 for (unsigned i = 0; i < oldconf.size(); ++i)
875 TBCIDELETE(T, MIN(oldad.get(oldconf.get(i)), oldbd.get(oldconf.get(i))),
876 2*(olddim-oldconf.get(i)));
877 if (oldd)
878 TBCIDELETE(T, oldd, olddim);
879#else
880 if (oldar)
881 TBCIDELETE(T, oldd, oldar);
882#endif
883 return *this;
884}
885
886#ifdef BDMAT_ONE_BLOCK
887template <typename T>
889{
890 const int newsz = dim - ndg; /* no of elements in new diagonal */
891 BCHK(newsz <= 0, BdMatrixErr, Diag out of range, ndg, *this);
892 // Diag already stored ?
893 if (LIKELY(diagconf.contains(ndg)))
894 return *this;
895 // Save old position (for diff)
896 T* olddiag = diag;
897 REALLOC(diag, arsz, T, (arsz + 2 * newsz));
898 if (UNLIKELY(!diag)) {
899 diag = olddiag;
900 STD__ cerr << "Allocation error" << STD__ endl;
901 return *this;
902 }
903 BDMATDBG(STD__ cerr << "BdMat realloc " << diag << " ("
904 << arsz+2*newsz << ")" << STD__ endl);
905 // Delete new elements
906 // Calculate new pointers
907 /* difference in units of sizeof(T) could fail and actually did, as NO
908 * alignment is guaranteed by new ! Ughh ! Hard to find bug ! KG, 98/09/04
909 * (Note: Stroustrup (3rd ed.) does not say anything about alignment for
910 * operator new, so I can't tell whether this is a bug or not.)
911 * maybe overloading of the new operator would be a good idea to ensure
912 * alignment */
913 long diff = (long)diag - (long)olddiag; //long is better for 64 bit platforms ..
914 if (LIKELY(diff)) {
915 //STD__ cout << "DEBUG: diag " << olddiag << " -> " << diag << " , ad0 " << adiag(0) << " -> " << adiag(0) + diff << STD__ endl;
916 /* ANSI C++ is really stupid about forbidding pointer arithmetic,
917 * never produced such ugly code before! KG, 98/09/04 */
918 adiag.set(0) = (T*)((long)adiag.get(0) + diff);
919 bdiag.set(0) = (T*)((long)bdiag.get(0) + diff);
920 const unsigned dcs = diagconf.size();
921 for (unsigned i = 0; i < dcs; ++i) {
922 const unsigned di = diagconf.get(i);
923 adiag.set(di) = (T*)((long)adiag.get(di) + diff);
924 bdiag.set(di) = (T*)((long)bdiag.get(di) + diff);
925 }
926 //STD__ cout << " -> " << adiag(0) << STD__ endl;
927 }
928 // Paranoia
929 BCHKNR(adiag.get(0) != diag || bdiag.get(0) != diag, BdMatrixErr, adddiag: mismatch while doing pointer arithmetic, diag - adiag.get(0));
930 // Set pointers for new diagonals
931 diagconf.append(ndg);
932 adiag.set(ndg) = diag+arsz; bdiag.set(ndg) = diag+arsz+newsz;
933 TBCIFILL((diag+arsz), (T)0, T, (unsigned)(2*newsz));
934 // New sizes ...
935 if (ndg > maxoff)
936 maxoff = ndg;
937 arsz += 2 * newsz;
938 //STD__ cout << "DEBUG: diagconf, diag, arsz: " << diagconf << " - " << diag << " - "<< arsz << "\n adiag: " << adiag << "\n bdiag: " << bdiag << STD__ endl;
939 return *this;
940}
941
942#else /* BDMAT_ONE_BLOCK */
943
944template <typename T>
945BdMatrix<T>& BdMatrix<T>::adddiag(const unsigned ndg)
946{
947 const int newsz = dim - ndg; /* no of elements in new diagonal */
948 BCHK(newsz <= 0, BdMatrixErr, Diag out of range, ndg, *this);
949 // Diag already stored ?
950 if (LIKELY(diagconf.contains(ndg)))
951 return *this;
952 T* ptr = NEW(T, 2*newsz);
953 if (UNLIKELY(!ptr)) {
954 BCHK(1, BdMatrixErr, Allocation failed in adddiag, 2*newsz*sizeof(T), *this);
955 STD__ cerr << "BdMat adddiag allocation failed!" << STD__ endl;
956 return *this;
957 }
958 adiag.set(ndg) = ptr; bdiag.set(ndg) = ptr+newsz;
959 TBCIFILL(ptr, (T)0, T, (2*newsz));
960 diagconf.append(ndg);
961 // New sizes ...
962 if (ndg > maxoff)
963 maxoff = ndg;
964 arsz += 2 * newsz;
965 BDMATDBG(STD__ cerr << "BdMat adddiag " << ptr << " (" << arsz << ")" << STD__ endl);
966 return *this;
967}
968
969#endif /* BDMAT_ONE_BLOCK */
970
971// Only really functional for new (not ONE_BLOCK) mem alloc strategy
972template <typename T>
974{
975 if (dg > maxoff)
976 return *this;
977#ifdef BDMAT_ONE_BLOCK
978 TBCIFILL(adiag.get(dg), (T)0, T, (dim-dg));
979 TBCIFILL(bdiag.get(dg), (T)0, T, (dim-dg));
980#else
981 if (adiag.get(dg)) {
982 BDMATDBG(STD__ cerr << "BdMat rmvdiag "
983 << MIN(adiag.get(dg), bdiag.get(dg)) << STD__ endl);
984 TBCIDELETE(T, MIN(adiag.get(dg),bdiag.get(dg)), 2*(dim-dg));
985 adiag.set(dg) = (T*)0; bdiag.set(dg) = (T*)0;
986 unsigned ix; diagconf.contains(dg, &ix);
987 diagconf.remove(ix); arsz -= 2*(dim-dg);
988 if (maxoff == dg)
989 maxoff = diagconf.max();
990 }
991#endif
992 return *this;
993}
994
995template <typename T>
996BdMatrix<T>& BdMatrix<T>::autoinsert(const T& val, const unsigned r, const unsigned c)
997{
998 REGISTER T* pt = check(r, c);
999 if (LIKELY(pt)) {
1000 *pt = val;
1001 return *this;
1002 } else {
1003 if (LIKELY(MATH__ fabs(val) >= BD_MINVAL)) {
1004 adddiag(CSTD__ abs((int)(r-c)));
1005 pt = check(r, c);
1006 BCHK(!pt, BdMatrixErr, Internal autoinsert error, 0, *this);
1007 *pt = val;
1008 }
1009 }
1010 return *this;
1011}
1012
1013template <typename T>
1014BdMatrix<T>& BdMatrix<T>::set_row(const Vector<T>& val, const unsigned r, const unsigned s)
1015{
1016 for (unsigned i=0; i<val.size(); ++i)
1017 autoinsert(val(i), r, i+s);
1018 return *this;
1019}
1020
1021template <typename T>
1022TVector<T> BdMatrix<T>::get_row(const unsigned int r) const
1023{
1024 TVector<T> ret(0, dim);
1025 BCHK(r >= dim, BdMatrixErr, get_row idx out of bounds, r, ret);
1026 ret.set(diag[r], r);
1027 for (unsigned i = 0; i < diagconf.size(); ++i) {
1028 const REGISTER unsigned di = diagconf.get(i);
1029 if (LIKELY(di < dim-r))
1030 ret.set(adiag.get(di)[r] , r+di);
1031 if (LIKELY(r >= di))
1032 ret.set(bdiag.get(di)[r-di], r-di);
1033 }
1034 return ret;
1035}
1036
1037template <typename T>
1038TVector<T> BdMatrix<T>::get_col(const unsigned int c) const
1039{
1040 TVector<T> ret(0, dim);
1041 BCHK(c >= dim, BdMatrixErr, get_col idx out of bounds, c, ret);
1042 ret.set(diag[c], c);
1043 for (unsigned i = 0; i < diagconf.size(); i++) {
1044 const REGISTER unsigned di = diagconf.get(i);
1045 if (LIKELY(di < dim-c))
1046 ret.set(bdiag.get(di)[c] , c+di);
1047 if (LIKELY(c >= di))
1048 ret.set(adiag.get(di)[c-di], c-di);
1049 }
1050 return ret;
1051}
1052
1053#ifdef BDMAT_ONE_BLOCK
1054template <typename T>
1056{
1057 TBCIFILL(diag, val, T, arsz);
1058 return *this;
1059}
1060
1061template <typename T>
1063{
1064 TBCIFILL(diag, (T)0, T, arsz);
1065 return *this;
1066}
1067
1068#else
1069
1070template <typename T>
1071inline BdMatrix<T>& BdMatrix<T>::fill(const T& val)
1072{
1073 TBCIFILL(diag, val, T, dim);
1074 for (unsigned i = 0; i < diagconf.size(); ++i) {
1075 const REGISTER unsigned di = diagconf.get(i);
1076 TBCIFILL(adiag.get(di), val, T, (dim-di));
1077 TBCIFILL(bdiag.get(di), val, T, (dim-di));
1078 }
1079 return *this;
1080}
1081
1082template <typename T>
1084{
1085 TBCIFILL(diag, (T)0, T, dim);
1086 for (unsigned i = 0; i < diagconf.size(); ++i) {
1087 const REGISTER unsigned di = diagconf.get(i);
1088 TBCIFILL(adiag.get(di), (T)0, T, (dim-di));
1089 TBCIFILL(bdiag.get(di), (T)0, T, (dim-di));
1090 }
1091 return *this;
1092}
1093#endif
1094
1095// expand to outermost diagonal (or m, if given)
1096template <typename T>
1098{
1099 if (UNLIKELY(m == 0))
1100 m = maxoff;
1101 BCHK(m>dim, BdMatrixErr, Wrong expansion size, m, *this);
1102 if (LIKELY(maxoff == m && arsz == (dim + 2*dim*m - m*(m+1))))
1103 return *this;
1104#ifdef BDMAT_ONE_BLOCK
1105 BVector<unsigned> newconf(m);
1106 for (unsigned i = 0; i < m; ++i)
1107 newconf.set(i) = i+1;
1108 return reconfig(newconf);
1109#else
1110 for (unsigned i=1; i<=m; ++i)
1111 adddiag(i);
1112 return *this;
1113#endif
1114}
1115
1116
1117//#warning "Calculation of (sub)determinants not yet implemented!"
1118#if 0
// Recursive sub-determinant by Laplace expansion along the row chosen in
// `bestrow`.  DISABLED code (inside the surrounding `#if 0`) -- kept for
// reference only.  NOTE(review): several lines were lost in extraction:
// the signature continuation (presumably `BVector<unsigned int>& acol)`)
// and the declarations of the scratch vectors `ar`/`ac` (presumably sized
// sz-1) used below.  Do not enable without restoring them.
1119template <typename T>
1120T BdMatrix<T>::sdet(unsigned int level, BVector<unsigned int>& arow,
1122{
1123 T el;
1124 int sz = arow.size();
1125 //if (sz < 1)
1126 // STD__ cerr << "FEHLER!" << STD__ endl;
// Base cases: 1x1 and 2x2 sub-determinants computed directly.
1127 if (sz == 1)
1128 return this->operator() (arow(0), acol(0));
1129 if (sz == 2) {
1130 el = (this->operator() (arow(0), acol(0)) \
1131 * this->operator() (arow(1), acol(1)) \
1132 - this->operator() (arow(0), acol(1)) \
1133 * this->operator() (arow(1), acol(0)));
1134 return el;
1135 }
1136 T tmp = 0;
1137 //if (sz>=20) STD__ cout << sz << " ";
1139 int r = bestrow(level++); //row to expand
1140 int rw = 0; int mul;
1141 //setup rows
// Build the row-index list with row r removed; mul carries the sign of
// r's position in the expansion.
1142 for (REGISTER int i = 0; i < sz; i++) {
1143 if (arow(i) == r) {
1144 mul = (i%2? -1: 1); rw--;
1145 } else
1146 ar(rw) = arow(i);
1147 rw++;
1148 }
// Expand along row r: skip (near-)zero pivots below BD_MINVAL.
1149 for (REGISTER int j = 0; j < sz; j++) {
1150 el = this->operator() (r, acol(j));
1151 if (MATH__ fabs(el) < BD_MINVAL)
1152 continue;
1153 //setup cols if necessary
1154#if 0
1155 int cl = 0;
1156 for (REGISTER int t = 0; t < sz; t++) {
1157 if (t == j)
1158 cl--;
1159 else
1160 ac(cl) = acol(t);
1161 cl++;
1162 }
1163#else
// Faster variant: copy the column list around index j in two blocks.
1164 TBCICOPY(&(ac(0)), &(acol(0)), int, j);
1165 TBCICOPY(&(ac(j)), &(acol(j+1)), int, (sz-j-1));
1166#endif
1167
1168 tmp += (j%2? -1: 1) * sdet(level, ar, ac) * el;
1169 }
1170 return tmp * mul;
1171}
1172
1173template <typename T>
1175{
1176 if (ri.size() != ci.size()) {
1177#ifndef TBCI_DISABLE_EXCEPT
1178 throw (BdMatrixErr("Unequal # of lines and rows to ignore!"));
1179#else
1180 STD__ cerr << "Unequal # of lines and rows to ignore!" << STD__ endl;
1181 return 0;
1182#endif
1183 }
1184
1185 BVector<unsigned int> actrow(dim - ri.size());
1186 BVector<unsigned int> actcol(dim - ci.size());
1187 BVector<unsigned int> rowfill(dim);
1188 bestrow.resize(dim);
1189
1190 unsigned int rw = 0; unsigned int cl = 0;
1191 for (REGISTER int j = 0; j < dim; j++) {
1192 if (ri.contains (j))
1193 rw--;
1194 else
1195 actrow(rw) = j;
1196 if (ci.contains(j))
1197 cl--;
1198 else
1199 actcol(cl) = j;
1200 rw++; cl++;
1201 }
1202
1203 //STD__ cout << actrow << STD__ endl; STD__ cout << actcol << STD__ endl;
1204
1205 // (pseudo-) count # of non-zero elements of rows
1206 matmem<T>* el = offdiag.setfirst(); int elemi = (el? el->row: -1);
1207
1208 for (int i = 0; i < dim; i++) {
1209 if (!(actrow.contains(i))) {
1210 rowfill(i) = dim + 1; continue;
1211 }
1212
1213 int fill = 0;
1214 if (actcol.contains(i))
1215 fill++;
1216 if (actcol.contains(i+1) && i+1 < dim)
1217 fill++;
1218 if (actcol.contains(i-1) && i-1 >= 0)
1219 fill++;
1220
1221 for (REGISTER int s = 0; s < diagconf.size(); s++) {
1222 if (actcol.contains(i+diagconf.get(s)) && i+diagconf.get(s) < dim)
1223 fill++;
1224 if (actcol.contains(i-diagconf.get(s)) && i-diagconf.get(s) >= 0)
1225 fill++;
1226 }
1227
1228 while (el && elemi == i) {
1229 if (actcol.contains(el->col))
1230 fill++;
1231 el = offdiag.setnext(); elemi = (el? el->row: -1);
1232 }
1233
1234 rowfill(i) = fill;
1235 }
1236 //STD__ cout << "rowfill:" << rowfill << STD__ endl;
1237 //sort
1238 for (int k = 0; k < dim; k++) {
1239 int min = dim + 2; int minr = -1;
1240 for (int s = 0; s < dim; s++)
1241 if (rowfill(s) < min) {
1242 min = rowfill(s); minr = s;
1243 }
1244 bestrow(k) = minr;
1245 if (minr >= 0)
1246 rowfill(minr) = dim + 1;
1247 }
1248 return sdet(0, actrow, actcol);
1249}
1250#endif /* 0 */
1251
1252
// NOTE(review): signature line (rendered 1254) lost in extraction — this is
// the assignment from another BdMatrix<T>, returning BdMatrix<T>&.  The line
// rendered as 1265 is also missing; it most likely (re)allocated the
// diagonal storage after reconfiguring — confirm against the original.
1253template <typename T>
1255{
1256 BCHK(mat.dim != dim, BdMatrixErr, Assignment from wrong size Matrix, mat.dim, *this);
// Different diagonal layout: drop our diagonals and adopt the source's.
1257 if (diagconf != mat.diagconf) {
1258#ifdef BOUNDCHECK
1259 STD__ cout << "old:" << mat.diagconf << "\nnew:" << diagconf << STD__ endl;
1260 STD__ cout << "Warning: Automatic reconfiguring BdMatrix in assignment!" << STD__ endl;
1261#endif
1262 free_diags(diagconf.size());
1263 diagconf.resize(mat.diagconf.size());
1264 diagconf = mat.diagconf;
1266 }
1267
1268 dummy = mat.dummy;
// Element-wise copy of all diagonals.
1269 do_copy(mat);
1270 return *this;
1271}
1272
1273
1274//writes value to diagonal and deletes off-diags
// NOTE(review): signature line (rendered 1276) lost in extraction — this is
// the scalar assignment (val * identity), returning BdMatrix<T>&.
1275template <typename T>
1277{
1278 setunit(val); return *this;
1279}
1280
1281
1282// Fully flexible now!
// SFORALL_S(op): generates the element-wise compound operator 'op'
// (e.g. += or -=) taking another BdMatrix of the same dimension.
// Pass 1 applies 'op' over the main diagonal and over every diagonal
// configured in *this that the argument also stores (the get() NULL check).
// Pass 2 (only if the configurations differ) adds any diagonals that exist
// only in 'mat' via adddiag() and applies 'op' to them as well, so the
// result's band structure is the union of both operands.
// (Comments must stay outside the macro: a // inside would swallow the
// line-continuation backslash.)
1283#define SFORALL_S(op) \
1284template <typename T> \
1285BdMatrix<T>& BdMatrix<T>::operator op (const BdMatrix<T>& mat) \
1286{ \
1287 BCHK(mat.dim != dim, BdMatrixErr, Adding wrong size BdMatrix, mat.dim, *this); \
1288 unsigned i, j; \
1289 for (i = 0; i < dim; ++i) \
1290 diag[i] op mat.diag[i]; \
1291 for (j = 0; j < diagconf.size(); ++j) { \
1292 const REGISTER unsigned di = diagconf.get(j); \
1293 if (LIKELY(mat.adiag.get(di))) \
1294 for (i = 0; i < (dim-di); ++i) { \
1295 adiag.set(di)[i] op mat.adiag.get(di)[i]; \
1296 bdiag.set(di)[i] op mat.bdiag.get(di)[i]; \
1297 } \
1298 } \
1299 if (diagconf == mat.diagconf) \
1300 return *this; \
1301 for (j = 0; j < mat.diagconf.size(); ++j) { \
1302 const REGISTER unsigned di = mat.diagconf.get(j); \
1303 if (!diagconf.contains(di)) { \
1304 adddiag(di); \
1305 for (i = 0; i < (dim-di); ++i) { \
1306 adiag.set(di)[i] op mat.adiag.get(di)[i]; \
1307 bdiag.set(di)[i] op mat.bdiag.get(di)[i]; \
1308 } \
1309 } \
1310 } \
1311 return *this; \
1312}
1313
1316
1317
// SFORALL_T(op): generates the compound operator 'op' (e.g. *=) with a
// scalar right-hand side, applied to the main diagonal and to every stored
// upper (adiag) and lower (bdiag) diagonal.  The invocation lines that
// followed (rendered around 1334) were lost in the doxygen extraction.
#define SFORALL_T(op) \
1319template <typename T> \
1320BdMatrix<T>& BdMatrix<T>::operator op (const T& mult) \
1321{ \
1322 for (REGISTER unsigned int i = 0; i < dim; ++i) \
1323 diag[i] op mult; \
1324 for (unsigned j = 0; j < diagconf.size(); ++j) { \
1325 const REGISTER unsigned di = diagconf.get(j); \
1326 for (unsigned k = 0; k < (dim-di); ++k) { \
1327 adiag.set(di)[k] op mult; \
1328 bdiag.set(di)[k] op mult; \
1329 } \
1330 } \
1331 return *this; \
1332}
1333
1335//SFORALL_T(+=)
1336//SFORALL_T(-=)
1337
1338//The inline is necessary to prevent linker errors
1339template <typename T>
1340inline BdMatrix<T>& BdMatrix<T>::operator /= (const T& div)
1341{
1342 BCHK((div==(T)0), BdMatrixErr, "Divide BdMatrix by zero", 0, *this);
1343 REGISTER T mult ALIGN(MIN_ALIGN) = (T)1 / div;
1344 for (REGISTER unsigned int i = 0; i < dim; ++i)
1345 diag[i] *= mult;
1346 for (unsigned j = 0; j < diagconf.size(); ++j) {
1347 const REGISTER unsigned di = diagconf.get(j);
1348 for (unsigned k = 0; k < (dim-di); ++k) {
1349 adiag.set(di)[k] *= mult;
1350 bdiag.set(di)[k] *= mult;
1351 }
1352 }
1353 return *this;
1354}
1355
// INTDIVEQ(T): full specialization of operator/= for integer element types,
// where multiplying by a reciprocal would truncate to zero — divide each
// element directly instead.  NOTE(review): the rendered line 1373 was lost
// in extraction; it presumably instantiated further integer types
// (e.g. INTDIVEQ(int)) — confirm against the original header.
#define INTDIVEQ(T) \
1357template <> \
1358inline BdMatrix < T >& BdMatrix< T >::operator /= (const T & div) \
1359{ \
1360 BCHK((div==( T )0), BdMatrixErr, "Divide BdMatrix by zero", 0, *this); \
1361 for (REGISTER unsigned i = 0; i < dim; ++i) \
1362 diag[i] /= div; \
1363 for (unsigned j = 0; j < diagconf.size(); ++j) { \
1364 const REGISTER unsigned di = diagconf.get(j); \
1365 for (unsigned k = 0; k < (dim-di); ++k) { \
1366 adiag.set(di)[k] /= div; \
1367 bdiag.set(di)[k] /= div; \
1368 } \
1369 } \
1370 return *this; \
1371}
1372
1374INTDIVEQ(unsigned int)
1375
// NOTE(review): signature line (rendered 1377) lost in extraction — this is
// the in-place transpose() member returning BdMatrix<T>&.  For a band
// matrix, transposing simply exchanges the upper (adiag) and lower (bdiag)
// diagonal storage; the main diagonal is unaffected.
1376template <typename T>
1378{
1379#if 0 //def BDMAT_ONE_BLOCK
1380 // A swap_vectors implementation would be faster!
1381 for (REGISTER unsigned int t = 0; t < diagconf.size(); ++t) {
1382 const unsigned int s = diagconf.get(t);
1383 T* tmp = adiag.get(s); adiag.set(s) = bdiag.get(s); bdiag.set(s) = tmp;
1384 }
1385#else
// Swap the two pointer vectors wholesale — O(#diagonals).
1386 adiag.swap(bdiag);
1387#endif
1388 return *this;
1389}
1390
INST(template <typename T> class BdMatrix friend BdMatrix<T> transpose(BdMatrix<T>&);)
// NOTE(review): signature line (rendered 1393) lost in extraction — the free
// transpose(), which copies its argument and transposes the copy in place,
// leaving the original untouched.
1392template <typename T>
1394{
1395 return (BdMatrix<T>(mat)).transpose();
1396}
1397
// STDDEF_ST(op): generates the non-mutating matrix-scalar operator 'op'
// (e.g. * or /) as copy-then-compound: copy *this, apply op= with the
// scalar, return the copy.  The invocation lines (rendered around 1405/1406)
// were lost in the doxygen extraction.
#define STDDEF_ST(op) \
1399template <typename T> \
1400inline BdMatrix<T> BdMatrix<T>::operator op (const T& val) const \
1401{ \
1402 return (BdMatrix<T> (*this)) op##= val; \
1403}
1404
1407//STDDEF_ST(+)
1408//STDDEF_ST(-)
1409
// STDDEF_SS(op): same copy-then-compound pattern as STDDEF_ST, but with a
// BdMatrix right-hand side (e.g. + and -).  The invocation lines (rendered
// around 1417/1418) were lost in the doxygen extraction.
#define STDDEF_SS(op) \
1411template <typename T> \
1412BdMatrix<T> BdMatrix<T>::operator op (const BdMatrix<T>& m2) const \
1413{ \
1414 return (BdMatrix<T> (*this)) op##= m2; \
1415}
1416
1419
1420
1421template <typename T>
1422BdMatrix<T> BdMatrix<T>::operator * (const BdMatrix<T>& mat2) const
1423{
1424 const unsigned int newoff = maxoff + mat2.maxoff;
1425 //const unsigned int moff = MAX(maxoff, mat2.maxoff);
1426 BdMatrix<T> prd(0, dim, newoff);
1427 BCHK(dim != mat2.dim, BdMatrixErr, Multiplying diff size matrices, 0, prd);
1428
1429 for (int i = 0; i < (signed)dim; ++i) {
1430 /* A really good compiler would do this for us */
1431 const int jend = MIN(dim, i+newoff+1);
1432 const int kstart = MAX(0, i-(signed)maxoff);
1433 const int kend = MIN(dim, i+maxoff+1);
1434 /* TODO: Exact summing */
1435 for (int j = MAX(0,i-(signed)newoff); j < jend; ++j) {
1436 REGISTER T el ALIGN(MIN_ALIGN) = T(0);
1437 for (REGISTER int k = kstart; k < kend; ++k)
1438 el += this->get(i, k) * mat2.get(k, j);
1439 //prd.setval(i, j, el);
1440 if (el != T(0))
1441 prd(i, j) = el; /* autoinsert ? */
1442 }
1443 }
1444 return prd;
1445}
1446
INST(template <typename T> class BdMatrix friend BdMatrix<T> operator * (const T&, const BdMatrix<T>&);)
// Left scalar multiplication, forwarded to the member operator m*v — correct
// only while T's multiplication is commutative (see note below).
1448template <typename T>
1449inline BdMatrix<T> operator * (const T& v, const BdMatrix<T>& m)
1450{ return m*v; }
1451// Assuming commutativity here!
1452
// NOTE(review): dead code (#if 0) — band matrix times dense TMatrix.  The
// signature line (rendered 1455) was lost in extraction.  If ever
// re-enabled: the 'return tm' near the end is missing its semicolon (a
// syntax error), and the dense inner loop over j ignores the band structure
// entirely, so it would be O(dim) per element.
1453#if 0
1454template <typename T>
1456{
1457 TMatrix<T> tm (mat.rows(), mat.columns());
1458 if (UNLIKELY(tm.rows() != dim)) {
1459#ifndef TBCI_DISABLE_EXCEPT
1460 throw (BdMatrixErr("Multiplying Matrix with Matrix: wrong sizes!"));
1461#else
1462 STD__ cerr << "Multiplying Matrix with Matrix: wrong sizes!" << STD__ endl;
1463 return tm;
1464#endif
1465 }
1466
1467 T val ALIGN(MIN_ALIGN);
1468 for (int c = 0; c < tm.columns(); c++)
1469 for (int i = 0; i < mat.rows(); i++) {
1470 val = 0;
1471 for (REGISTER int j=0; j < dim; j++)
1472 val += this->operator()(i, j) * mat(j, c);
1473 tm(i, c) = val;
1474 }
// NOTE(review): missing semicolon below.
1475 return tm
1476}
1477#endif /* 0 */
1478
// MAY_PREFETCH_R/W: prefetch hints that compile to the real PREFETCH macros
// when USE_PREFETCH is set, and to a harmless empty statement otherwise, so
// call sites need no #ifdefs of their own.
#ifdef USE_PREFETCH
# define MAY_PREFETCH_R(adr,loc) PREFETCH_R(adr,loc)
# define MAY_PREFETCH_W(adr,loc) PREFETCH_W(adr,loc)
#else
# define MAY_PREFETCH_R(adr,loc) do {} while (0)
# define MAY_PREFETCH_W(adr,loc) do {} while (0)
#endif
1486
1487
// NOTE(review): signature line (rendered 1489) lost in extraction — from the
// body this is the unary minus, returning a new BdMatrix with the same
// diagonal configuration and every stored element negated.
1488template <typename T>
1490{
1491 BdMatrix<T> neg(dim, diagconf);
1492 //diagonal
1493 for (REGISTER unsigned int j = 0; j < dim; ++j)
1494 neg.diag[j] = -diag[j];
1495 //offdiag
// Negate each configured upper/lower diagonal pair (dim-dc elements each).
1496 for (unsigned int t = 0; t < diagconf.size(); ++t) {
1497 const unsigned int dc = diagconf.get(t);
1498 for (REGISTER unsigned int i = 0; i < (dim - dc); ++i) {
1499 neg.adiag.set(dc)[i] = -adiag.get(dc)[i];
1500 neg.bdiag.set(dc)[i] = -bdiag.get(dc)[i];
1501 }
1502 }
1503 return neg;
1504}
1505
/* Default if optimized linewise multiply */
#if !defined(BDMATVEC_LNWISE) && !defined(BDMATVEC_LNWISE_OPT) && !defined(BDMATVEC_DIAGWISE)
# define BDMATVEC_LNWISE_OPT
#endif

/* Cost of these ops w/ dim dxd, o offdiags and a maxoffset of m */
// Abstract cost models (in COST_* units) for the three matrix-vector
// multiply variants below: plain line-wise (LN), optimized line-wise with
// the boundary checks peeled out of the main loop (LNO, ~half the branches),
// and diagonal-wise (DIAG, streaming loads/stores per diagonal).
#define COST_BDMATVEC_LN(d,o,m) (d*(COST_MULT+2*COST_UNIT_LOAD+COST_UNIT_STORE+COST_LOOP\
1513 +o*(COST_LOOP+2*COST_BRANCH+COST_UNIT_LOAD))+(2*d-m)*o*(3*COST_NU_LOAD+COST_ADD+COST_MULT))
#define COST_BDMATVEC_LNO(d,o,m) (d*(COST_MULT+2*COST_UNIT_LOAD+COST_UNIT_STORE+COST_LOOP\
1515 +o*(COST_LOOP+COST_UNIT_LOAD))+(2*d-m)*o*(3*COST_NU_LOAD+COST_ADD+COST_MULT+COST_BRANCH/2))
#define COST_BDMATVEC_DIAG(d,o,m) (d*(2*COST_UNIT_LOAD+COST_MULT+COST_UNIT_STORE+COST_LOOP)\
1517 +o*(6*COST_NU_LOAD+COST_LOOP+3*COST_UNIT_LOAD+2*COST_NU_STORE\
1518 +(2*d-m)*(3*COST_UNIT_LOAD+COST_LOOP+COST_MULT+COST_ADD+COST_UNIT_STORE)))
1519
// Band matrix-vector multiply, plain row-wise variant: computes
// res[i] = sum_k mat(i,k)*vec[k] for i in [start,end).  For each row the
// main diagonal is taken first, then each configured offset s contributes
// the lower-diagonal element (bdiag, column i-s, guarded by s <= i) and the
// upper-diagonal element (adiag, column i+s, guarded by i+s < dim).
// If do_exactsum2() is enabled, a Kahan-style compensated sum is used:
// comp accumulates the rounding error (t-val)-y of each addition and is
// subtracted once at the end — the statement order here is what makes the
// compensation work, so do not "simplify" it.
template <typename T>
inline void do_bdmat_vec_mult_lnw(const unsigned start, const unsigned end,
1522 TVector<T> *res, const BdMatrix<T> *mat,
1523 const Vector<T> *vec)

{
 const unsigned int od = mat->diagconf.size();
 const unsigned int dim = mat->dim;
 // Row-wise, straightforward (bounds checked on every element)
 // TODO: Exact summation
 if (do_exactsum2()) {
1531 for (unsigned i = start; i < end; ++i) {
1532 // diag
1533 REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
1534 REGISTER T comp(0.0);
1535 // offdiag
1536 for (unsigned j = 0; j < od; ++j) {
1537 REGISTER const unsigned s = mat->diagconf.get(j);
1538 if (LIKELY(s <= i)) {
1539 const REGISTER T y = mat->bdiag.get(s)[i-s] * vec->get(i-s);
1540 const REGISTER T t = val+y;
1541 comp += (t-val) - y;
1542 val = t;
1543 }
1544 if (LIKELY(i+s < dim)) {
1545 const REGISTER T y = mat->adiag.get(s)[i] * vec->get(i+s);
1546 const REGISTER T t = val+y;
1547 comp += (t-val) - y;
1548 val = t;
1549 }
1550 }
1551 res->set(val-comp, i);
1552 }
1553 } else {
1554 for (unsigned i = start; i < end; ++i) {
1555 // diag
1556 REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
1557 // offdiag
1558 for (unsigned j = 0; j < od; ++j) {
1559 REGISTER const unsigned s = mat->diagconf.get(j);
1560 if (LIKELY(s <= i))
1561 val += mat->bdiag.get(s)[i-s] * vec->get(i-s);
1562 if (LIKELY(i+s < dim))
1563 val += mat->adiag.get(s)[i] * vec->get(i+s);
1564 }
1565 res->set(val, i);
1566 }
1567 }
}
1569
// Band matrix-vector multiply, optimized row-wise variant.  Same result as
// do_bdmat_vec_mult_lnw, but the row range [start,end) is peeled into three
// regions so the bounds checks disappear from the hot middle loop:
//   1) i < min(maxoff, dim-maxoff): only the lower-diagonal (bdiag) access
//      needs a check (i-s could be negative);
//   2) middle region: no checks at all (exists only if 2*maxoff <= dim;
//      otherwise a both-checks fallback loop runs instead);
//   3) i >= dim-maxoff: only the upper-diagonal (adiag) access needs a
//      check (i+s could be >= dim).
// The whole structure is duplicated for the do_exactsum2() case, which
// carries a Kahan-style compensation term 'comp' per row (order-sensitive —
// do not reassociate).  MAY_PREFETCH_* are hints only and have no effect on
// the result.
template <typename T>
inline void do_bdmat_vec_mult_lnw_opt(const unsigned start, const unsigned end,
1572 TVector<T> *res, const BdMatrix<T> *mat,
1573 const Vector<T> *vec)

{
 const unsigned int od = mat->diagconf.size();
 const unsigned int dim = mat->dim;
 // Row-wise, optimized
 unsigned int i;
 //const unsigned dconf0 = mat->diagconf.get(0);
 const unsigned int maxoff = mat->maxoff;
 /* A very good compiler would do this for us */
 unsigned iend = MIN(maxoff, dim-maxoff);
 iend = MIN(iend, end);
 MAY_PREFETCH_R(&mat->diagconf.getcref(0), 2);
 //MAY_PREFETCH_R(&mat->adiag.get(0), 3);
 //MAY_PREFETCH_R(&mat->bdiag.get(0), 3);
#ifdef USE_PREFETCH
1589 for (unsigned int _j = 0; _j < od; ++_j) {
1590 const unsigned s = mat->diagconf.get(_j);
1591 PREFETCH_R(&mat->adiag.getcref(s), 2);
1592 PREFETCH_R(&mat->bdiag.getcref(s), 2);
1593 PREFETCH_R(mat->adiag.get(s), 2);
1594 PREFETCH_R(mat->bdiag.get(s), 2);
1595 }
#endif
 MAY_PREFETCH_R(mat->diag, 2);
 MAY_PREFETCH_R(&vec->getcref(0), 2);
 MAY_PREFETCH_W(&res->setval(0), 3);
 if (do_exactsum2()) {
1601 /* First loop: Need to check bdiag, as i-s = i-(i-j) = j could be negative
1602 * adiag safe, as i < iend <=> i < N-maxoff and s <= maxoff => i+s < N
1603 */
1604 for (i = start; i < iend; ++i) {
1605 // diag
1606 REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
1607 REGISTER T comp(0.0);
1608 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
1609 MAY_PREFETCH_R(mat->diag+i+1, 2);
1610 // offdiag
1611 for (unsigned int j = 0; j < od; ++j) {
1612 REGISTER const unsigned int s = mat->diagconf.get(j);
1613 if (s <= i) {
1614 const T y = mat->bdiag.get(s)[i-s] * vec->get(i-s);
1615 const T t = val + y;
1616 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
1617 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
1618 comp += (t-val) - y;
1619 val = t;
1620 }
1621 // s <= maxoff; i < N-maxoff => s+i < N
1622 /*if (s+i < dim)*/
1623 const T y = mat->adiag.get(s)[i] * vec->get(i+s);
1624 const T t = val + y;
1625 MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
1626 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
1627 comp += (t-val) - y;
1628 val = t;
1629 }
1630 res->set(val-comp, i);
1631 MAY_PREFETCH_W(&res->setval(i+1), 3);
1632 }
1633 iend = MIN(end, dim - maxoff);
1634 if (i < iend) {
1635 /* Second loop: No checks needed; only executed if 2*maxoff <= mat->dim */
1636 for (; i < iend; ++i) {
1637 // diag
1638 REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
1639 REGISTER T comp(0.0);
1640 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
1641 MAY_PREFETCH_R(mat->diag+i+1, 2);
1642 // offdiag, no checking necessary
1643 for (unsigned int j = 0; j < od; ++j) {
1644 REGISTER const unsigned int s = mat->diagconf.get(j);
1645 T y = mat->bdiag.get(s)[i-s] * vec->get(i-s);
1646 T t = val + y;
1647 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
1648 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
1649 comp += (t-val) - y;
1650 val = t;
1651 y = mat->adiag.get(s)[i] * vec->get(i+s);
1652 t = val + y;
1653 MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
1654 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
1655 comp += (t-val) - y;
1656 val = t;
1657 }
1658 res->set(val-comp, i);
1659 MAY_PREFETCH_W(&res->setval(i+1), 3);
1660 }
1661 } else {
1662 iend = MIN(maxoff, end);
1663 /* Second loop: Both checks needed; only executed if 2*maxoff > mat->dim */
1664 // TODO: Exact summation
1665 for (; i < iend; ++i) {
1666 // diag
1667 REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
1668 REGISTER T comp(0.0);
1669 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
1670 MAY_PREFETCH_R(mat->diag+i+1, 2);
 // offdiag, both checks needed here (was mislabelled "no checking")
1672 for (unsigned int j = 0; j < od; ++j) {
1673 REGISTER const unsigned int s = mat->diagconf.get(j);
1674 if (s <= i) {
1675 const T y = mat->bdiag.get(s)[i-s] * vec->get(i-s);
1676 const T t = val + y;
1677 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
1678 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
1679 comp += (t-val) - y;
1680 val = t;
1681 }
1682 if (s < (signed)dim - i) {
1683 const T y = mat->adiag.get(s)[i] * vec->get(i+s);
1684 const T t = val + y;
1685 MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
1686 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
1687 comp += (t-val) - y;
1688 val = t;
1689 }
1690 }
1691 res->set(val-comp, i);
1692 MAY_PREFETCH_W(&res->setval(i+1), 3);
1693 }
1694 }
1695 /* Third loop: Need to check to adiag, as i+s = i+(j-i) = j could be >= N
1696 * bdiag safe, as s <= maxoff and i >= maxoff => i-s >= 0
1697 */
1698 for (; i < end; ++i) {
1699 // diag
1700 REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
1701 REGISTER T comp(0.0);
1702 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
1703 MAY_PREFETCH_R(mat->diag+i+1, 2);
1704 // offdiag
1705 for (unsigned int j = 0; j < od; ++j) {
1706 REGISTER const unsigned int s = mat->diagconf.get(j);
1707 /* if (s <= i) */
1708 T y = mat->bdiag.get(s)[i-s] * vec->get(i-s);
1709 T t = val + y;
1710 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
1711 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
1712 comp += (t-val) - y;
1713 val = t;
1714 if (s < (signed)dim - i) {
1715 y = mat->adiag.get(s)[i] * vec->get(i+s);
1716 t = val + y;
1717 MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
1718 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
1719 comp += (t-val) - y;
1720 val = t;
1721 }
1722 }
1723 res->set(val-comp, i);
1724 MAY_PREFETCH_W(&res->setval(i+1), 3);
1725 }
 } else { /* Not exactsum2 */
1727 /* First loop: Need to check bdiag, as i-s = i-(i-j) = j could be negative
1728 * adiag safe, as i < iend <=> i < N-maxoff and s <= maxoff => i+s < N
1729 */
1730 for (i = start; i < iend; ++i) {
1731 // diag
1732 REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
1733 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
1734 MAY_PREFETCH_R(mat->diag+i+1, 2);
1735 // offdiag
1736 for (unsigned int j = 0; j < od; ++j) {
1737 REGISTER const unsigned int s = mat->diagconf.get(j);
1738 if (s <= i) {
1739 val += mat->bdiag.get(s)[i-s] * vec->get(i-s);
1740 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
1741 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
1742 }
1743 // s <= maxoff; i < N-maxoff => s+i < N
1744 /*if (s+i < dim)*/
1745 val += mat->adiag.get(s)[i] * vec->get(i+s);
1746 MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
1747 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
1748 }
1749 res->set(val, i);
1750 MAY_PREFETCH_W(&res->setval(i+1), 3);
1751 }
1752 iend = MIN(end, dim - maxoff);
1753 if (i < iend) {
1754 /* Second loop: No checks needed; only executed if 2*maxoff <= mat->dim */
1755 for (; i < iend; ++i) {
1756 // diag
1757 REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
1758 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
1759 MAY_PREFETCH_R(mat->diag+i+1, 2);
1760 // offdiag, no checking necessary
1761 for (unsigned int j = 0; j < od; ++j) {
1762 REGISTER const unsigned int s = mat->diagconf.get(j);
1763 val += mat->bdiag.get(s)[i-s] * vec->get(i-s);
1764 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
1765 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
1766 val += mat->adiag.get(s)[i] * vec->get(i+s);
1767 MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
1768 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
1769 }
1770 res->set(val, i);
1771 MAY_PREFETCH_W(&res->setval(i+1), 3);
1772 }
1773 } else {
1774 iend = MIN(maxoff, end);
1775 /* Second loop: Both checks needed; only executed if 2*maxoff > mat->dim */
1776 for (; i < iend; ++i) {
1777 // diag
1778 REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
1779 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
1780 MAY_PREFETCH_R(mat->diag+i+1, 2);
 // offdiag, both checks needed here (was mislabelled "no checking")
1782 for (unsigned int j = 0; j < od; ++j) {
1783 REGISTER const unsigned int s = mat->diagconf.get(j);
1784 if (s <= i) {
1785 val += mat->bdiag.get(s)[i-s] * vec->get(i-s);
1786 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
1787 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
1788 }
1789 if (s < (signed)dim - i) {
1790 val += mat->adiag.get(s)[i] * vec->get(i+s);
1791 MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
1792 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
1793 }
1794 }
1795 res->set(val, i);
1796 MAY_PREFETCH_W(&res->setval(i+1), 3);
1797 }
1798 }
1799 /* Third loop: Need to check to adiag, as i+s = i+(j-i) = j could be >= N
1800 * bdiag safe, as s <= maxoff and i >= maxoff => i-s >= 0
1801 */
1802 for (; i < end; ++i) {
1803 // diag
1804 REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
1805 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
1806 MAY_PREFETCH_R(mat->diag+i+1, 2);
1807 // offdiag
1808 for (unsigned int j = 0; j < od; ++j) {
1809 REGISTER const unsigned int s = mat->diagconf.get(j);
1810 /* if (s <= i) */
1811 val += mat->bdiag.get(s)[i-s] * vec->get(i-s);
1812 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
1813 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
1814 if (s < (signed)dim - i) {
1815 val += mat->adiag.get(s)[i] * vec->get(i+s);
1816 MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
1817 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
1818 }
1819 }
1820 res->set(val, i);
1821 MAY_PREFETCH_W(&res->setval(i+1), 3);
1822 }
 }
}
1825
// Band matrix-vector multiply, diagonal-wise with compensated summation.
// Streams over whole diagonals instead of rows: first res = diag .* vec,
// then each configured diagonal pair is added in.  Because contributions to
// one result entry arrive across several passes, the Kahan compensation
// cannot live in a register — it is kept in a full correction vector 'corr'
// and applied once at the end.  The add/compensate statement order is what
// makes this exact; do not reassociate.
template <typename T>
inline void do_bdmat_vec_mult_diagw_exact(const unsigned start, const unsigned end,
1828 TVector<T> *res, const BdMatrix<T> *mat,
1829 const Vector<T> *vec)

{
 const unsigned int od = mat->diagconf.size();
 const unsigned int dim = mat->dim;
 //Diagonal wise
 // We need to store a complete vector full of compensation values
 // and this is actually FASTER than the linewise algo
 // that we could employ alternatively.
 // TODO: Find heuristics to determine what's better
 // (current suspicion: for rather sparse matrices, the lnw is better,
 // for rather full matrices, the compensation vector will work best)
 // diag
 Vector<T> corr((T)0, end-start);
 do_vec_vec_mul(end-start, &res->setval(start),
1844 &mat->diag[start], &vec->getcref(start));
 // offdiag
 for (unsigned j = 0; j < od; ++j) {
 // s is the offset of this diagonal (1 <= s <= maxoff)
1848 const unsigned s = mat->diagconf.get(j);
1849 const unsigned iend = MIN(end,dim-s);
// NOTE(review): the int casts below rely on unsigned wrap-around of end-s /
// start-s converting to a negative int so that MAX clamps to 0 — confirm
// this is intentional before touching it.
1850 const unsigned iend2 = MAX(0,(int)(end-s));
1851 const unsigned istart2 = MAX((int)(start-s),0);
1852 //do_add_vec_vec_mul(iend-start, &res->setval(start),
1853 // mat->adiag.get(s)+start, &vec->getcref(start+s));
// Upper diagonal: res[i] += adiag[i] * vec[i+s], with compensation.
1854 for (unsigned off = start; off < iend; ++off) {
1855 const REGISTER T y = *(mat->adiag.get(s)+off) * vec->get(s+off);
1856 const REGISTER T t = res->get(off) + y;
1857 corr.setval(off-start) += (t - res->get(off)) - y;
1858 res->setval(off) = t;
1859 }
1860 //do_add_vec_vec_mul(iend2-istart2, &res->setval(istart2+s),
1861 // mat->bdiag.get(s)+istart2, &vec->getcref(istart2));
// Lower diagonal: res[i+s] += bdiag[i] * vec[i], with compensation.
1862 for (unsigned off = istart2; off < iend2; ++off) {
1863 const REGISTER T y = *(mat->bdiag.get(s)+off) * vec->get(off);
1864 const REGISTER T t = res->get(s+off) + y;
1865 corr.setval(s+off-start) += (t - res->get(s+off)) - y;
1866 res->setval(s+off) = t;
1867 }
 }
 /* Apply correction */
 for (unsigned off = start; off < end; ++off)
1871 res->setval(off) -= corr.get(off-start);
}
1873
// Band matrix-vector multiply dispatcher for rows [start,end).
// With exact (compensated) summation requested, it picks the row-wise
// routine for few off-diagonals (od < 8) and the diagonal-wise routine with
// a correction vector otherwise.  Without exact summation it runs the
// diagonal-wise scheme directly via the streaming do_vec_vec_mul /
// do_add_vec_vec_mul kernels.
template <typename T>
inline void do_bdmat_vec_mult(const unsigned start, const unsigned end,
1876 TVector<T> *res, const BdMatrix<T> *mat,
1877 const Vector<T> *vec)

{
 const unsigned int dim = mat->dim;
 const unsigned int od = mat->diagconf.size();
 // Diagonal wise
 // TODO: Find heuristics to determine what's better
 // (current suspicion: for rather sparse matrices, the lnw is better,
 // for rather full matrices, the compensation vector will work best)
 if (do_exactsum2()) {
1887 if (od < 8)
1888 do_bdmat_vec_mult_lnw(start, end, res, mat, vec);
1889 else
1890 do_bdmat_vec_mult_diagw_exact(start, end, res, mat, vec);
1891 return;
1892 }
 // diag
 do_vec_vec_mul(end-start, &res->setval(start),
1895 &mat->diag[start], &vec->getcref(start));
 // offdiag
 for (unsigned j = 0; j < od; ++j) {
 // s is the offset of this diagonal (1 <= s <= maxoff)
1899 const unsigned s = mat->diagconf.get(j);
1900 const unsigned iend = MIN(end,dim-s);
1901 const unsigned iend2 = MAX(0,(int)(end-s));
1902 const unsigned istart2 = MAX((int)(start-s),0);
// Upper diagonal contribution: res[start..] += adiag .* vec[start+s..]
1903 if (LIKELY(iend > start))
1904 do_add_vec_vec_mul(iend-start, &res->setval(start),
1905 mat->adiag.get(s)+start, &vec->getcref(start+s));
// Lower diagonal contribution: res[istart2+s..] += bdiag .* vec[istart2..]
1906 if (LIKELY(iend2 > istart2))
1907 do_add_vec_vec_mul(iend2-istart2, &res->setval(istart2+s),
1908 mat->bdiag.get(s)+istart2, &vec->getcref(istart2));
 }
}
1911
// Select the cost model matching the compiled multiply variant; the
// transposed multiply has the same access pattern, hence the same cost.
#if defined(BDMATVEC_LNWISE)
# define COST_BDMATVEC(d,o,m) COST_BDMATVEC_LN(d,o,m)
#elif defined(BDMATVEC_LNWISE_OPT)
# define COST_BDMATVEC(d,o,m) COST_BDMATVEC_LNO(d,o,m)
#elif defined(BDMATVEC_DIAGWISE)
# define COST_BDMATVEC(d,o,m) COST_BDMATVEC_DIAG(d,o,m)
#endif

#define COST_BDMATVEC_TRANS(d,o,m) COST_BDMATVEC(d,o,m)
// Transposed band matrix-vector multiply, plain row-wise variant:
// res = mat^T * vec over rows [start,end).  Identical structure to
// do_bdmat_vec_mult_lnw, except the roles of the stored diagonals swap:
// for row i of the transpose, adiag supplies the element at column i-s and
// bdiag the one at column i+s.  The do_exactsum2() branch carries a
// Kahan-style compensation 'comp' (order-sensitive; do not reassociate).
template <typename T>
inline void do_bdmat_vec_transmult_lnw(const unsigned start, const unsigned end,
1923 TVector<T> *res, const BdMatrix<T> *mat,
1924 const Vector<T> *vec)
{
 const unsigned od = mat->diagconf.size();
 const unsigned int dim = mat->dim;
 if (do_exactsum2()) {
1929 for (unsigned i = start; i < end; ++i) {
1930 //diagonal
1931 REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
1932 REGISTER T comp(0.0);
1933 //off-diagonals
1934 for (unsigned int j = 0; j < od; ++j) {
1935 REGISTER const unsigned s = mat->diagconf.get(j);
1936 if (LIKELY(s <= i)) {
1937 const T y = mat->adiag.get(s)[i-s] * vec->get(i-s);
1938 const T t = val + y;
1939 comp += (t-val) - y;
1940 val = t;
1941 }
1942 if (LIKELY(i+s < dim)) {
1943 const T y = mat->bdiag.get(s)[i] * vec->get(i+s);
1944 const T t = val + y;
1945 comp += (t-val) - y;
1946 val = t;
1947 }
1948 }
1949 res->set(val-comp, i);
1950 }
1951 } else {
1952 for (unsigned i = start; i < end; ++i) {
1953 //diagonal
1954 REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
1955 //off-diagonals
1956 for (unsigned int j = 0; j < od; ++j) {
1957 REGISTER const unsigned s = mat->diagconf.get(j);
1958 if (LIKELY(s <= i))
1959 val += mat->adiag.get(s)[i-s] * vec->get(i-s);
1960 if (LIKELY(i+s < dim))
1961 val += mat->bdiag.get(s)[i] * vec->get(i+s);
1962 }
1963 res->set(val, i);
1964 }
1965 }
}
1967
1968template <typename T>
1969inline void do_bdmat_vec_transmult_lnw_opt(const unsigned start, const unsigned end,
1970 TVector<T> *res, const BdMatrix<T> *mat,
1971 const Vector<T> *vec)
1972{
1973 const unsigned od = mat->diagconf.size();
1974 const unsigned int dim = mat->dim;
1975 // Zeilenweise, optimiert
1976 //const unsigned dconf0 = mat->diagconf.get(0);
1977 unsigned int i;
1978 const unsigned int maxoff = mat->maxoff;
1979 /* A very good compiler would do this for us */
1980 unsigned iend = MIN(maxoff, mat->dim-maxoff);
1981 iend = MIN(end, iend);
1982 MAY_PREFETCH_R(&mat->diagconf.getcref(0), 2);
1983 //MAY_PREFETCH_R(&mat->adiag.getcref(0), 2);
1984 //MAY_PREFETCH_R(&mat->bdiag.getcref(0), 2);
1985#ifdef USE_PREFETCH
1986 for (unsigned int _j = 0; _j < od; ++_j) {
1987 const unsigned s = mat->diagconf.get(_j);
1988 PREFETCH_R(&mat->adiag.getcref(s), 2);
1989 PREFETCH_R(&mat->bdiag.getcref(s), 2);
1990 PREFETCH_R(mat->adiag.get(s), 2);
1991 PREFETCH_R(mat->bdiag.get(s), 2);
1992 }
1993#endif
1994 MAY_PREFETCH_R(mat->diag, 2);
1995 MAY_PREFETCH_R(&vec->getcref(0), 2);
1996 MAY_PREFETCH_W(&res->setval(0), 3);
1997 /* First loop: Need to check adiag, as i-s could be smaller than 0
1998 * bdiag safe, as i < N-maxoff and s <= maxoff => i+s < N
1999 */
2000 if (do_exactsum2()) {
2001 for (i = start; i < iend; ++i) {
2002 // diag
2003 REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2004 REGISTER T comp(0.0);
2005 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2006 MAY_PREFETCH_R(mat->diag+i+1, 2);
2007 // offdiag
2008 for (unsigned int j = 0; j < od; ++j) {
2009 REGISTER const unsigned int s = mat->diagconf.get(j);
2010 if (s <= i) {
2011 const T y = mat->adiag.get(s)[i-s] * vec->get(i-s);
2012 const T t = val + y;
2013 MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2014 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2015 comp += (t-val) - y;
2016 val = t;
2017 }
2018 /*if (s < mat->dim - i)*/
2019 const T y = mat->bdiag.get(s)[i] * vec->get(i+s);
2020 const T t = val + y;
2021 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2022 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2023 comp += (t-val) - y;
2024 val = t;
2025 }
2026 res->set(val-comp, i);
2027 MAY_PREFETCH_W(&res->setval(i+1), 3);
2028 }
2029 iend = MIN(end, dim - maxoff);
2030 if (i < iend) {
2031 /* Second loop, no checks necessary, 2*maxoff < N */
2032 // TODO: Exact summation
2033 for (; i < iend; ++i) {
2034 // diag
2035 REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2036 REGISTER T comp(0.0);
2037 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2038 MAY_PREFETCH_R(mat->diag+i+1, 2);
2039 // offdiag, no checking necessary
2040 for (unsigned int j = 0; j < od; ++j) {
2041 REGISTER const unsigned int s = mat->diagconf.get(j);
2042 T y = mat->adiag.get(s)[i-s] * vec->get(i-s);
2043 T t = val + y;
2044 MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2045 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2046 comp += (t-val) - y;
2047 val = t;
2048 y = mat->bdiag.get(s)[i] * vec->get(i+s);
2049 t = val + y;
2050 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2051 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2052 comp += (t-val) - y;
2053 val = t;
2054 }
2055 res->set(val-comp, i);
2056 MAY_PREFETCH_W(&res->setval(i+1), 3);
2057 }
2058 } else {
2059 iend = MIN(end, maxoff);
2060 /* Second loop, both checks necessary */
2061 // TODO: Exact summation
2062 for (; i < iend; ++i) {
2063 // diag
2064 REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2065 REGISTER T comp(0.0);
2066 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2067 MAY_PREFETCH_R(mat->diag+i+1, 2);
2068 // offdiag, no checking necessary
2069 for (unsigned int j = 0; j < od; ++j) {
2070 REGISTER const unsigned int s = mat->diagconf.get(j);
2071 if (s <= i) {
2072 const T y = mat->adiag.get(s)[i-s] * vec->get(i-s);
2073 const T t = val + y;
2074 MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2075 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2076 comp += (t-val) - y;
2077 val = t;
2078 }
2079 if (s < (signed)dim - i) {
2080 const T y = mat->bdiag.get(s)[i] * vec->get(i+s);
2081 const T t = val + y;
2082 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2083 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2084 comp += (t-val) - y;
2085 val = t;
2086 }
2087 }
2088 res->set(val-comp, i);
2089 MAY_PREFETCH_W(&res->setval(i+1), 3);
2090 }
2091 }
2092 /* Third loop, adiag safe */
2093 // TODO: Exact summation
2094 for (; i < end; ++i) {
2095 // diag
2096 REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2097 REGISTER T comp(0.0);
2098 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2099 MAY_PREFETCH_R(mat->diag+i+1, 2);
2100 // offdiag
2101 for (unsigned int j = 0; j < od; ++j) {
2102 REGISTER const unsigned int s = mat->diagconf.get(j);
2103 /* if (s <= i) */
2104 T y = mat->adiag.get(s)[i-s] * vec->get(i-s);
2105 T t = val + y;
2106 MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2107 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2108 comp += (t-val) - y;
2109 val = t;
2110 if (s < (signed)dim - i) {
2111 y = mat->bdiag.get(s)[i] * vec->get(i+s);
2112 t = val + y;
2113 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2114 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2115 comp += (t-val) - y;
2116 val = t;
2117 }
2118 }
2119 res->set(val-comp, i);
2120 MAY_PREFETCH_W(&res->setval(i+1), 3);
2121 }
2122 } else {
2123 for (i = start; i < iend; ++i) {
2124 // diag
2125 REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2126 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2127 MAY_PREFETCH_R(mat->diag+i+1, 2);
2128 // offdiag
2129 for (unsigned int j = 0; j < od; ++j) {
2130 REGISTER const unsigned int s = mat->diagconf.get(j);
2131 if (s <= i) {
2132 val += mat->adiag.get(s)[i-s] * vec->get(i-s);
2133 MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2134 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2135 }
2136 /*if (s < mat->dim - i)*/
2137 val += mat->bdiag.get(s)[i] * vec->get(i+s);
2138 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2139 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2140 }
2141 res->set(val, i);
2142 MAY_PREFETCH_W(&res->setval(i+1), 3);
2143 }
2144 iend = MIN(end, dim - maxoff);
2145 if (i < iend) {
2146 /* Second loop, no checks necessary, 2*maxoff < N */
2147 // TODO: Exact summation
2148 for (; i < iend; ++i) {
2149 // diag
2150 REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2151 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2152 MAY_PREFETCH_R(mat->diag+i+1, 2);
2153 // offdiag, no checking necessary
2154 for (unsigned int j = 0; j < od; ++j) {
2155 REGISTER const unsigned int s = mat->diagconf.get(j);
2156 val += mat->adiag.get(s)[i-s] * vec->get(i-s);
2157 MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2158 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2159 val += mat->bdiag.get(s)[i] * vec->get(i+s);
2160 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2161 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2162 }
2163 res->set(val, i);
2164 MAY_PREFETCH_W(&res->setval(i+1), 3);
2165 }
2166 } else {
2167 iend = MIN(end, maxoff);
2168 /* Second loop, both checks necessary */
2169 // TODO: Exact summation
2170 for (; i < iend; ++i) {
2171 // diag
2172 REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2173 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2174 MAY_PREFETCH_R(mat->diag+i+1, 2);
2175 // offdiag, no checking necessary
2176 for (unsigned int j = 0; j < od; ++j) {
2177 REGISTER const unsigned int s = mat->diagconf.get(j);
2178 if (s <= i) {
2179 val += mat->adiag.get(s)[i-s] * vec->get(i-s);
2180 MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2181 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2182 }
2183 if (s < (signed)dim - i) {
2184 val += mat->bdiag.get(s)[i] * vec->get(i+s);
2185 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2186 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2187 }
2188 }
2189 res->set(val, i);
2190 MAY_PREFETCH_W(&res->setval(i+1), 3);
2191 }
2192 }
2193 /* Third loop, adiag safe */
2194 // TODO: Exact summation
2195 for (; i < end; ++i) {
2196 // diag
2197 REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2198 MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2199 MAY_PREFETCH_R(mat->diag+i+1, 2);
2200 // offdiag
2201 for (unsigned int j = 0; j < od; ++j) {
2202 REGISTER const unsigned int s = mat->diagconf.get(j);
2203 /* if (s <= i) */
2204 val += mat->adiag.get(s)[i-s] * vec->get(i-s);
2205 MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2206 MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2207 if (s < (signed)dim - i) {
2208 val += mat->bdiag.get(s)[i] * vec->get(i+s);
2209 MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2210 MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2211 }
2212 }
2213 res->set(val, i);
2214 MAY_PREFETCH_W(&res->setval(i+1), 3);
2215 }
2216 }
2217}
2218
/// Transposed band-matrix * vector product, diagonal-wise, with
/// compensated ("exact") summation a la Kahan/Neumaier.
/// Computes res[start..end) = (A^T * vec)[start..end) for the band matrix
/// *mat; the rounding error of every addition is accumulated per target
/// row in a correction vector that is subtracted once at the end.
/// @param start  first row index of the slice to compute
/// @param end    one past the last row index of the slice
/// @param res    output vector (only [start,end) is written)
/// @param mat    band matrix A; adiag = upper, bdiag = lower diagonals
/// @param vec    input vector
template <typename T>
inline void do_bdmat_vec_transmult_diagw_exact(const unsigned start, const unsigned end,
				TVector<T> *res, const BdMatrix<T> *mat,
				const Vector<T> *vec)
{
	const unsigned int dim = mat->dim;
	const unsigned od = mat->diagconf.size();
	// diagonal wise
	// We need to store a vector of correction values ...
	Vector<T> corr((T)0, end-start);
	// diag
	do_vec_vec_mul(end-start, &res->setval(start),
		       &mat->diag[start], &vec->getcref(start));
	// offdiag
	for (unsigned j = 0; j < od; ++j) {
		// s is the index of the diagonal (1<=s<=maxoff)
		const int s = mat->diagconf.get(j);
		const int iend = MIN(end,dim-s);
		const int iend2 = end-s;
		// NOTE(review): start-s wraps for start < s (unsigned math);
		// the (int) cast relies on two's-complement conversion to
		// recover the negative value before clamping to 0.
		const int istart2 = MAX((int)(start-s),0);
		//do_add_vec_vec_mul(iend2-istart2, &res->setval(istart2+s),
		//		   mat->adiag.get(s)+istart2, &vec->getcref(istart2));
		// For A^T the upper diagonal (adiag) of A feeds rows off+s.
		// Compensated step: y = product, t = new sum,
		// (t - old) - y recovers the rounding error of the add.
		for (int off = istart2; off < iend2; ++off) {
			const REGISTER T y = *(mat->adiag.get(s)+off) * vec->get(off);
			const REGISTER T t = res->get(off+s) + y;
			corr.setval(off+s-start) += (t - res->get(off+s)) - y;
			res->setval(off+s) = t;
		}
		//do_add_vec_vec_mul(iend-start, &res->setval(start),
		//		   mat->bdiag.get(s)+start, &vec->getcref(start+s));
		// For A^T the lower diagonal (bdiag) of A feeds rows off.
		for (int off = start; off < iend; ++off) {
			const REGISTER T y = *(mat->bdiag.get(s)+off) * vec->get(off+s);
			const REGISTER T t = res->get(off) + y;
			corr.setval(off-start) += (t - res->get(off)) - y;
			res->setval(off) = t;
		}
	}
	// Apply correction
	for (unsigned off = start; off < end; ++off)
		res->setval(off) -= corr(off-start);
}
2260
2261template <typename T>
2262inline void do_bdmat_vec_transmult(const unsigned start, const unsigned end,
2263 TVector<T> *res, const BdMatrix<T> *mat,
2264 const Vector<T> *vec)
2265{
2266 const unsigned int dim = mat->dim;
2267 const unsigned od = mat->diagconf.size();
2268 if (do_exactsum2()) {
2269 //do_bdmat_vec_transmult_lnw(start, end, res, mat, vec);
2270 do_bdmat_vec_transmult_diagw_exact(start, end, res, mat, vec);
2271 return;
2272 }
2273 // diagonal wise
2274 // diag
2275 do_vec_vec_mul(end-start, &res->setval(start),
2276 &mat->diag[start], &vec->getcref(start));
2277 // offdiag
2278 for (unsigned j = 0; j < od; ++j) {
2279 // s ist die Nummer der Diagonalen (1<=s<=maxoff)
2280 const int s = mat->diagconf.get(j);
2281 const int iend = MIN(end,dim-s);
2282 const int iend2 = end-s;
2283 const int istart2 = MAX((int)(start-s),0);
2284 do_add_vec_vec_mul(iend2-istart2, &res->setval(istart2+s),
2285 mat->adiag.get(s)+istart2, &vec->getcref(istart2));
2286 do_add_vec_vec_mul(iend-start, &res->setval(start),
2287 mat->bdiag.get(s)+start, &vec->getcref(start+s));
2288 }
2289}
2290
#define COST_BDMATVEC_DOT(d,o,m) COST_BDMATVEC(d,o,m)
/// "Dot" (conjugated) band-matrix * vector product:
/// res[start..end) = (conj(A) * vec)[start..end); every matrix element is
/// passed through CPLX__ conj(), so for real T this equals the plain
/// product. Row i reads its lower part from bdiag and its upper part from
/// adiag (plain row-wise access), i.e. the opposite adiag/bdiag pairing
/// of the transposed product.
/// Compile-time variants: BDMATVEC_LNWISE (simple row-wise, optional
/// compensated summation), BDMATVEC_LNWISE_OPT (row-wise, split into
/// three ranges so the bulk of the rows runs without boundary checks),
/// BDMATVEC_DIAGWISE (diagonal-wise via vector kernels).
template <typename T>
inline void do_bdmat_vec_dotmult(const unsigned start, const unsigned end,
				TVector<T> *res, const BdMatrix<T> *mat,
				const Vector<T> *vec)
{
	const unsigned int od = mat->diagconf.size();
	const unsigned int dim = mat->dim;
#ifdef BDMATVEC_LNWISE
	// Row-wise, straightforward
	if (do_exactsum2()) {
		// Neumaier-compensated summation: comp collects the rounding
		// error of every addition and is subtracted at the end.
		for (unsigned i = start; i < end; ++i) {
			// diag
			REGISTER T val ALIGN(MIN_ALIGN) = CPLX__ conj(mat->diag[i]) * vec->get(i);
			REGISTER T comp(0.0);
			// offdiag
			for (unsigned j = 0; j < od; ++j) {
				REGISTER const unsigned s = mat->diagconf.get(j);
				if (LIKELY(s <= i)) {
					const T y = CPLX__ conj(mat->bdiag.get(s)[i-s]) * vec->get(i-s);
					const T t = val + y;
					comp += (t-val) - y;
					val = t;
				}
				if (LIKELY(i+s < dim)) {
					const T y = CPLX__ conj(mat->adiag.get(s)[i]) * vec->get(i+s);
					const T t = val + y;
					comp += (t-val) - y;
					val = t;
				}
			}
			res->set(val-comp, i);
		}
	} else {
		for (unsigned i = start; i < end; ++i) {
			// diag
			REGISTER T val ALIGN(MIN_ALIGN) = CPLX__ conj(mat->diag[i]) * vec->get(i);
			// offdiag
			for (unsigned j = 0; j < od; ++j) {
				REGISTER const unsigned s = mat->diagconf.get(j);
				if (LIKELY(s <= i))
					val += CPLX__ conj(mat->bdiag.get(s)[i-s]) * vec->get(i-s);
				if (LIKELY(i+s < dim))
					val += CPLX__ conj(mat->adiag.get(s)[i]) * vec->get(i+s);
			}
			res->set(val, i);
		}
	}
#elif defined(BDMATVEC_LNWISE_OPT)
	// Row-wise, optimized
	unsigned int i;
	//const unsigned dconf0 = mat->diagconf.get(0);
	const unsigned int maxoff = mat->maxoff;
	/* A very good compiler would do this for us */
	unsigned iend = MIN(maxoff, dim-maxoff);
	iend = MIN(iend, end);
	MAY_PREFETCH_R(&mat->diagconf.getcref(0), 2);
	//MAY_PREFETCH_R(&mat->adiag(0), 3);
	//MAY_PREFETCH_R(&mat->bdiag(0), 3);
#ifdef USE_PREFETCH
	// Warm the caches for the diagonal pointer arrays and their data.
	for (unsigned int _j = 0; _j < od; ++_j) {
		const unsigned s = mat->diagconf.get(_j);
		PREFETCH_R(&mat->adiag.getcref(s), 2);
		PREFETCH_R(&mat->bdiag.getcref(s), 2);
		PREFETCH_R(mat->adiag.get(s), 2);
		PREFETCH_R(mat->bdiag.get(s), 2);
	}
#endif
	MAY_PREFETCH_R(mat->diag, 2);
	MAY_PREFETCH_R(&vec->getcref(0), 2);
	MAY_PREFETCH_W(&res->setval(0), 3);
	/* First loop: Need to check bdiag, as i-s = i-(i-j) = j could be negative
	 * adiag safe, as i < iend <=> i < N-maxoff and s <= maxoff => i+s < N
	 */
	// TODO: Exact summation
	// NOTE: Not implemented here, as dotMult is not even used by the solvers ...
	for (i = start; i < iend; ++i) {
		// diag
		REGISTER T val ALIGN(MIN_ALIGN2) = CPLX__ conj(mat->diag[i]) * vec->get(i);
		MAY_PREFETCH_R(&vec->getcref(i+1), 2);
		MAY_PREFETCH_R(mat->diag+i+1, 2);
		// offdiag
		for (unsigned int j = 0; j < od; ++j) {
			REGISTER const unsigned int s = mat->diagconf.get(j);
			if (s <= i) {
				val += CPLX__ conj(mat->bdiag.get(s)[i-s]) * vec->get(i-s);
				MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
				MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
			}
			// s <= maxoff; i < N-maxoff => s+i < N
			/*if (s+i < dim)*/
			val += CPLX__ conj(mat->adiag.get(s)[i]) * vec->get(i+s);
			MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
			MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
		}
		res->set(val, i);
		MAY_PREFETCH_W(&res->setval(i+1), 3);
	}
	iend = MIN(end, dim - maxoff);
	/* i = MAX(start,MIN(maxoff,dim-maxoff,end)) */
	if (i < iend) {
		/* Second loop: No checks needed; only executed if 2*maxoff <= mat->dim */
		// TODO: Exact summation
		for (; i < iend; ++i) {
			// diag
			REGISTER T val ALIGN(MIN_ALIGN2) = CPLX__ conj(mat->diag[i]) * vec->get(i);
			MAY_PREFETCH_R(&vec->getcref(i+1), 2);
			MAY_PREFETCH_R(mat->diag+i+1, 2);
			// offdiag, no checking necessary
			for (unsigned int j = 0; j < od; ++j) {
				REGISTER const unsigned int s = mat->diagconf.get(j);
				val += CPLX__ conj(mat->bdiag.get(s)[i-s]) * vec->get(i-s);
				MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
				MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
				val += CPLX__ conj(mat->adiag.get(s)[i]) * vec->get(i+s);
				MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
				MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
			}
			res->set(val, i);
			MAY_PREFETCH_W(&res->setval(i+1), 3);
		}
	} else {
		iend = MIN(maxoff, end);
		/* Second loop: Both checks needed; only executed if 2*maxoff > mat->dim */
		// TODO: Exact summation
		for (; i < iend; ++i) {
			// diag
			REGISTER T val ALIGN(MIN_ALIGN2) = CPLX__ conj(mat->diag[i]) * vec->get(i);
			MAY_PREFETCH_R(&vec->getcref(i+1), 2);
			MAY_PREFETCH_R(mat->diag+i+1, 2);
			// offdiag, no checking necessary
			for (unsigned int j = 0; j < od; ++j) {
				REGISTER const unsigned int s = mat->diagconf.get(j);
				if (s <= i) {
					val += CPLX__ conj(mat->bdiag.get(s)[i-s]) * vec->get(i-s);
					MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
					MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
				}
				if (s < (signed)dim - i) {
					val += CPLX__ conj(mat->adiag.get(s)[i]) * vec->get(i+s);
					MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
					MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
				}
			}
			res->set(val, i);
			MAY_PREFETCH_W(&res->setval(i+1), 3);
		}
	}
	/* Third loop: Need to check to adiag, as i+s = i+(j-i) = j could be >= N
	 * bdiag safe, as s <= maxoff and i >= maxoff => i-s >= 0
	 */
	// TODO: Exact summation
	for (; i < end; ++i) {
		// diag
		REGISTER T val ALIGN(MIN_ALIGN2) = CPLX__ conj(mat->diag[i]) * vec->get(i);
		MAY_PREFETCH_R(&vec->getcref(i+1), 2);
		MAY_PREFETCH_R(mat->diag+i+1, 2);
		// offdiag
		for (unsigned int j = 0; j < od; ++j) {
			REGISTER const unsigned int s = mat->diagconf.get(j);
			/* if (s <= i) */
			val += CPLX__ conj(mat->bdiag.get(s)[i-s]) * vec->get(i-s);
			MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
			MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
			if (s < (signed)dim - i) {
				val += CPLX__ conj(mat->adiag.get(s)[i]) * vec->get(i+s);
				MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
				MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
			}
		}
		res->set(val, i);
		MAY_PREFETCH_W(&res->setval(i+1), 3);
	}
#elif defined(BDMATVEC_DIAGWISE)
	// Diagonal-wise
	// FIXME: I see no way we could implement exact summation here ...
	// diag
	do_vec_vec_cmul(end-start, &res->setval(start),
			&mat->diag[start], &vec->getcref(start));
	// offdiag
	for (unsigned j = 0; j < od; j++) {
		// s is the index of the diagonal (1<=s<=maxoff)
		const int s = mat->diagconf.get(j);
		const int iend = MIN(end,dim-s);
		const int iend2 = end-s;
		const int istart2 = MAX((int)(start-s),0);
		do_add_vec_vec_cmul(iend-start, &res->setval(start),
				    mat->adiag.get(s)+start, &vec->getcref(start+s));
		do_add_vec_vec_cmul(iend2-istart2, &res->setval(istart2+s),
				    mat->bdiag.get(s)+istart2, &vec->getcref(istart2));
	}
#endif
}
2484
2485#if defined(SMP) && !defined(NOSMP_BDMATVEC)
2486
2487INST(template <typename T> class BdMatrix friend void job_bdmat_vec_mult(struct thr_ctrl*);)
2488template <typename T>
2490{
2492 (TVector<T>*)(tc->t_par[0]), (const BdMatrix<T>*)(tc->t_par[1]),
2493 (const Vector<T>*)(tc->t_par[2]));
2494}
2495
2496INST(template <typename T> class BdMatrix friend void job_bdmat_vec_transmult(struct thr_ctrl*);)
2497template <typename T>
2499{
2501 (TVector<T>*)(tc->t_par[0]), (const BdMatrix<T>*)(tc->t_par[1]),
2502 (const Vector<T>*)(tc->t_par[2]));
2503}
2504
2505
2506 /* TODO: Adjust dynamically (or at least depending on OPT target). */
2507# if !defined(SMP_BDMATSLICE)
2508# define SMP_BDMATSLICE 32768
2509# endif
2510# define SMP_BDMATSLICE2 (SMP_BDMATSLICE/sizeof(T))
2511
/// Band-matrix * vector product, SMP version: the row range [0,dim) is
/// sliced via slice_offset() and distributed over the worker threads;
/// the first slice is computed by the calling thread itself, then all
/// workers are joined.
/// NOTE(review): two hyperlinked lines were lost in the doc extraction:
/// the method signature (before the opening brace) and the
/// thread_start() call head (before the orphaned "st, en, &tv, ..."
/// argument line). The surviving tokens are left untouched.
template <typename T>
{
	TVector<T> tv(dim);
	// heuristic: at most one thread per SMP_BDMATSLICE2 elements
	const unsigned n_thr = threads_avail(dim / SMP_BDMATSLICE2);
	update_n_thr(n_thr);
	BCHK(vec.size() != dim, BdMatrixErr, Multiplication BdMatrix with Vector: wrong sizes, 0, tv);
	if (LIKELY(n_thr < 2))
		do_bdmat_vec_mult<T>(0UL, dim, &tv, this, &vec);
	else {
		PREFETCH_R_MANY(this->diag, 2);
		PREFETCH_R_MANY(vec.vec, 2);
		PREFETCH_W_MANY(tv.vec, 3);
		const unsigned long first = slice_offset(1, n_thr, dim, (T*)0);
		unsigned long st, en = first;
		for (unsigned t = 0; t < n_thr-1; ++t) {
			st = en; en = slice_offset(t+2, n_thr, dim, (T*)0);
				st, en, &tv, this, &vec, (void*)0);
		}
		// first slice is done by the main thread while workers run
		do_bdmat_vec_mult<T>(0UL, first, &tv, this, &vec);
		for (unsigned s = 0; s < n_thr-1; ++s)
			thread_wait(s);
	}
	return tv;
}
2538
2539// transpose-vector multiplication
2540template <typename T>
2542{
2543 TVector<T> tv(dim);
2544 const unsigned n_thr = threads_avail(dim / SMP_BDMATSLICE2);
2545 update_n_thr(n_thr);
2546 BCHK(vec.size() != dim, BdMatrixErr, Multiplication BdMatrix with Vector: wrong sizes, 0, tv);
2547 if (LIKELY(n_thr < 2 ))
2548 do_bdmat_vec_transmult<T> (0UL, dim, &tv, this, &vec);
2549 else {
2550 PREFETCH_R_MANY(this->diag, 2);
2551 PREFETCH_R_MANY(vec.vec, 2);
2552 PREFETCH_W_MANY(tv.vec, 3);
2553 const unsigned long first = slice_offset(1, n_thr, dim, (T*)0);
2554 unsigned long st, en = first;
2555 for (unsigned t = 0; t < n_thr-1; ++t) {
2556 st = en; en = slice_offset(t+2, n_thr, dim, (T*)0);
2558 st, en, &tv, this, &vec, (void*)0);
2559 }
2560 do_bdmat_vec_transmult<T>(0UL, first, &tv, this, &vec);
2561 for (unsigned s = 0; s < n_thr-1; ++s)
2562 thread_wait(s);
2563 }
2564 return tv;
2565}
2566
2567#else /* SMP */
2568
/// Band-matrix * vector product, single-threaded fallback (no SMP):
/// simply runs the per-slice worker over the whole row range.
/// NOTE(review): the method signature line was dropped by the doc
/// extraction (hyperlinked line); the body is left untouched.
template <typename T>
{
	TVector<T> tv(dim);
	BCHK(vec.size() != dim, BdMatrixErr, Multiplication BdMatrix with Vector: wrong sizes, 0, tv);
	do_bdmat_vec_mult<T>(0, dim, &tv, this, &vec);
	return tv;
}
2577
2578/* bicg and qmr solvers use this */
2579template <typename T>
2581{
2582 TVector<T> tv(dim);
2583 BCHK(vec.size() != dim, BdMatrixErr, Multiplication BdMatrix with Vector: wrong sizes, 0, tv);
2584 do_bdmat_vec_transmult<T>(0, dim, &tv, this, &vec);
2585 return tv;
2586}
2587
2588#endif /* SMP */
2589
2590/* This is not currently used by anyone ... */
2591template <typename T>
2593{
2594 TVector<T> tv(dim);
2595 BCHK(vec.size() != dim, BdMatrixErr, Dot Multiplication BdMatrix with Vector: wrong sizes, 0, tv);
2596 do_bdmat_vec_dotmult<T>(0, dim, &tv, this, &vec);
2597 return tv;
2598}
2599
2600
2601
2602//template <typename T>
2603//TVector<T> BdMatrix<T>::transMult (const TVector<T>& tvec) const;
2604//template <typename T>
2605//TVector<T> BdMatrix<T>::transMult (const TSVector<T>& tsvec) const;
2606
2607
2608template <typename T>
2610{
2611 if (LIKELY(ma.dim != dim))
2612 return false;
2613 if (LIKELY(dim == 0 || ((diag == ma.diag) && (adiag == ma.adiag) && (bdiag == ma.bdiag))))
2614 return true;
2615 if (TBCICOMP(ma.diag, diag, T, dim))
2616 return false;
2617 unsigned i; // MSC++ annoyances
2618 for (i = 0; i < diagconf.size(); ++i) {
2619 const unsigned di = diagconf.get(i);
2620 if (ma.adiag.get(di)) {
2621 if (TBCICOMP(ma.adiag.get(di), adiag.get(di), T, (dim-di))) return false;
2622 if (TBCICOMP(ma.bdiag.get(di), bdiag.get(di), T, (dim-di))) return false;
2623 } else {
2624 for (unsigned j = 0; j < dim-di; ++j)
2625 if (MATH__ fabs(adiag.get(di)[j]) >= BD_MINVAL || MATH__ fabs(bdiag(di)[j]) >= BD_MINVAL)
2626 return false;
2627 }
2628 }
2629 if (LIKELY(diagconf == ma.diagconf))
2630 return true;
2631 for (i = 0; i < ma.diagconf.size(); ++i) {
2632 const unsigned di = ma.diagconf.get(i);
2633 if (!diagconf.contains (di)) {
2634 for (unsigned j = 0; j < dim-di; ++j)
2635 if (UNLIKELY(MATH__ fabs(ma.adiag.get(di)[j]) >= BD_MINVAL || MATH__ fabs(ma.bdiag.get(di)[j]) >= BD_MINVAL))
2636 return false;
2637 }
2638 }
2639 return true;
2640}
2641
2642template <typename T>
2644{
2645 BCHK(dim != v.size(), BdMatrixErr, multiply rowwise with wrong size, v.size(), *this);
2646 // diag
2647 for (unsigned j = 0; j < dim; ++j)
2648 diag[j] *= v.get(j);
2649 // offdiags
2650 for (unsigned od = 0; od < diagconf.size(); ++od) {
2651 const unsigned off = diagconf.get(od);
2652 for (unsigned i = 0; i < (dim-off); ++i)
2653 adiag.get(off)[i] *= v.get(i);
2654 for (unsigned k = 0; k < (dim-off); ++k)
2655 bdiag.get(off)[k] *= v.get(k+off);
2656 }
2657 return *this;
2658}
2659
2660template <typename T>
2662{
2663 BCHK(dim != v.size(), BdMatrixErr, divide rowwise with wrong size, v.size(), *this);
2664 // diag
2665 for (unsigned j = 0; j < dim; ++j)
2666 diag[j] /= v.get(j);
2667 // offdiags
2668 for (unsigned od = 0; od < diagconf.size(); ++od) {
2669 const unsigned off = diagconf.get(od);
2670 for (unsigned i = 0; i < (dim-off); ++i)
2671 adiag.get(off)[i] /= v.get(i);
2672 for (unsigned k = 0; k < (dim-off); ++k)
2673 bdiag.get(off)[k] /= v.get(k+off);
2674 }
2675 return *this;
2676}
2677
2678template <typename T>
2679BdMatrix<T>& BdMatrix<T>::mult_row(const T& f, const unsigned i)
2680{
2681 BCHK(i>dim, BdMatrixErr, mult_row index out of range, i, *this);
2682 diag[i] *= f;
2683 for (unsigned od = 0; od < diagconf.size(); ++od) {
2684 const unsigned off = diagconf.get(od);
2685 if (i < dim-off)
2686 adiag.get(off)[i] *= f;
2687 if (i >= off)
2688 bdiag.get(off)[i-off] *= f;
2689 }
2690 return *this;
2691}
2692
2693// Don't do it with integer types
2694template <typename T>
2695BdMatrix<T>& BdMatrix<T>::div_row(const T& d, const unsigned i)
2696{
2697 BCHK(i>dim, BdMatrixErr, div_row index out of range, i, *this);
2698 BCHK(d == (T)0, BdMatrixErr, div_row by zero, i, *this);
2699 T f = (T)1.0/d;
2700 diag[i] *= f;
2701 for (unsigned od = 0; od < diagconf.size(); ++od) {
2702 const unsigned off = diagconf.get(od);
2703 if (i < dim-off)
2704 adiag.get(off)[i] *= f;
2705 if (i >= off)
2706 bdiag.get(off)[i-off] *= f;
2707 }
2708 return *this;
2709}
2710
2711template <typename T>
2712STD__ ostream& operator << (STD__ ostream& ostr, const BdMatrix<T>& mat)
2713{
2714 //if (!mat.outset)
2715 // mat.setoutopts();
2716 //ostr << '(';
2717 for (unsigned i = 0; i < mat.dim; ++i) {
2718 for (unsigned j = 0; j < mat.dim; ++j) {
2719 if (UNLIKELY(mat.outset)) {
2720 ostr.width(mat.outw);
2721 ostr.precision(mat.outp);
2722 ostr.fill(mat.outf);
2723 }
2724 if (i == j || mat.diagconf.contains(CSTD__ abs((int)i-(int)j)))
2725 ostr << mat.get(i, j) << ' ';
2726 else
2727 ostr << (T)0 << ' ';
2728 }
2729 //if (i == (mat.dim - 1))
2730 // ostr << ')' << STD__ endl;
2731 //else
2732 ostr << "\n";
2733 }
2734 EXPCHK(mat.dummy!=(T)0, BdMatrixErr, dummy not zero, 0, ostr);
2735 return ostr;
2736}
2737
2738template <typename T>
2739STD__ istream& operator >> (STD__ istream& istr, BdMatrix<T>& mat)
2740{
2741 T tmp;
2742 for (unsigned i = 0; i < mat.dim; ++i) {
2743 for (unsigned j = 0; j < mat.dim; ++j) {
2744 istr >> tmp;
2745 if (i == j || mat.diagconf.contains(CSTD__ abs((int)i-(int)j)))
2746 mat.setval(tmp, i, j);
2747 else
2748 if (tmp != (T)0)
2749 STD__ cerr << "Element (" << i << "," << j
2750 << ") should be zero!" << STD__ endl;
2751 }
2752 }
2753 return istr;
2754}
2755
/// Expand the band storage into a dense TMatrix<T> copy: main diagonal
/// from diag, upper diagonals from adiag (element k -> (k, k+di)), lower
/// diagonals from bdiag (element k -> (k+di, k)); everything else zero.
/// NOTE(review): the method signature line was dropped by the doc
/// extraction; judging from the body it returns a TMatrix<T>.
template <typename T>
{
	const unsigned _DIM = dim;
	TMatrix<T> tm ((T)0, _DIM, _DIM);
	// diag
	for (unsigned i = 0; i < _DIM; ++i)
		tm.setval(diag[i], i, i);
	// offdiags
	for (unsigned j = 0; j < diagconf.size(); ++j) {
		const unsigned di = diagconf.get(j);
		for (unsigned k = 0; k < _DIM-di; ++k) {
			tm.setval(adiag.get(di)[k], k, k+di);
			tm.setval(bdiag.get(di)[k], k+di, k);
		}
	}
	return tm;
}
2774
#if 0
/// Disabled legacy variant: expand the band storage into a dense
/// Matrix<T> (same layout as the TMatrix<T> conversion). Kept for
/// reference only; never compiled.
/// NOTE(review): the signature line was dropped by the doc extraction.
template <typename T>
{
	Matrix<T> tm ((T)0, dim, dim); unsigned i;
	// diag
	for (i = 0; i < dim; ++i)
		tm.setval(diag[i], i, i);
	// offdiags
	for (unsigned j = 0; j < diagconf.size(); ++j)
		for (unsigned k = 0; k < dim-(i=diagconf.get(j)); ++k) {
			tm.setval(adiag.get(i)[k], k, k+i);
			tm.setval(bdiag.get(i)[k], k+i, k);
		}
	return tm;
}
#endif
2792
#if 0
/// Disabled: matrix inverse via the adjugate (cofactor) formula --
/// one subdeterminant per element, throwing (or reporting) on a
/// (numerically) singular matrix. Kept for reference only; never
/// compiled.
/// NOTE(review): the signature line and the declarations of inv/ri/ci
/// were dropped by the doc extraction (hyperlinked lines).
template <typename T>
{
	T d = det();
	//STD__ cout << "Det: " << d << STD__ endl;
	if (MATH__ fabs(d) < BD_MINVAL) {
#ifndef TBCI_DISABLE_EXCEPT
		throw (BdMatrixErr("Trying to invert singular Matrix!"));
#else
		STD__ cerr << "Trying to invert singular Matrix!" << STD__ endl;
		return inv;
#endif
	}
	d = 1/d;
	// adjugate: inv[j][i] = (-1)^(i+j) * subdet(i,j) / det
	for (unsigned i = 0; i < dim; i++)
		for (unsigned j = 0; j < dim; j++) {
			ri(0) = i; ci(0) = j;
			inv.setval(subdet(ri, ci) * ((i+j)%2? -d: d), j, i);
		}
	return inv;
}
#endif /* 0 */
2818
2819#if defined(SMP) && defined(HAVE_LIBNUMA)
2820template <typename T>
2821int numa_opt_bdmat_helper(const BdMatrix<T>& bm, const BVector<unsigned>& cfg,
2822 const int node, bool fault_in,
2823 const unsigned first, const unsigned last)
2824{
2825 int moved = 0;
2826 const unsigned int sz = bm.rows();
2827 /* Diag */
2828 const T* diag = &(((BdMatrix<T>&)bm).setval(0,0));
2829 moved = do_numa_move_pages(node, fault_in,
2830 (unsigned long)(diag+first),
2831 (unsigned long)(diag+last));
2832 /* Now do off-diags */
2833 for (unsigned i = 0; i < cfg.size(); ++i) {
2834 const unsigned off = cfg.get(i);
2835 const unsigned dln = sz - off;
2836 const T* adiag = &(((BdMatrix<T>&)bm).setval(0,off));
2837 const T* bdiag = &(((BdMatrix<T>&)bm).setval(off,0));
2838 if (first < dln)
2839 moved += do_numa_move_pages(node, fault_in,
2840 (unsigned long)(adiag+first),
2841 (unsigned long)(adiag+MIN(dln,last)));
2842 if (last > off)
2843 moved += do_numa_move_pages(node, fault_in,
2844 (unsigned long)(bdiag+MAX(0,(signed)(first-off))),
2845 (unsigned long)(bdiag+last-off));
2846 }
2847 return moved;
2848}
2849
2850template <typename T>
2851void numa_opt_bdmat_job(struct thr_ctrl *tc)
2852{
2853 const BdMatrix<T>& bm(*(const BdMatrix<T>*)tc->t_par[2]);
2854 const BVector<unsigned>& bmcfg(*(const BVector<unsigned>*)tc->t_par[3]);
2855 /* Stick with convention that size, off are node, fault_in resp. */
2856 const int node = tc->t_size;
2857 const bool fault_in = tc->t_off;
2858 /* start and end are params 0 and 1 */
2859 const unsigned st = (unsigned long)tc->t_par[0];
2860 const unsigned en = (unsigned long)tc->t_par[1];
2861 tc->t_res_l = numa_opt_bdmat_helper(bm, bmcfg, node, fault_in, st, en);
2862}
2863
/// Migrate the matrix' memory pages to the NUMA nodes of the threads
/// that will later work on the corresponding row slices (the same
/// slicing as the SMP matrix-vector product), optionally pre-faulting
/// the pages.
/// @param bm        matrix to optimize
/// @param fault_in  also touch the pages to fault them in
/// @return          number of pages moved (0 when single-threaded)
template <typename T>
int numa_optimize(const BdMatrix<T>& bm, bool fault_in)
{
	if (!numa_avail && !fault_in)
		return 0;
	/* use some heuristic to decide for the num of threads */
	const unsigned sz = bm.rows();
	const unsigned n_thr = threads_avail (sz / SMP_BDMATSLICE2);
	//update_n_thr(n_thr);

	BVector<unsigned> bmcfg(bm.getcfg());
	/* OK, so how do we distribute a BandMatrix to threads?
	 * 1. We do it by row, reflecting that this is how we later
	 *    use it in BdMat - Vector multiplication
	 * 2. A BdMat consists of several vectors, one for the diagonal
	 *    and a variable number of others for the off-diagonals.
	 *    All these vectors need to be split to threads.
	 * 3. Problem: Can we even hope to come close to page boundaries?
	 */
	if (LIKELY(n_thr < 2)) {
		// single-threaded: nothing to migrate, optionally fault in
		if (fault_in)
			numa_opt_bdmat_helper(bm, bmcfg,
					      main_numa_node, fault_in,
					      0, sz);
		return 0;
	} else {
		const unsigned first = slice_offset(1, n_thr, sz, (T*)0);
		unsigned long st, en = first;
		/* Start threads */
		unsigned long res = 0;
		for (unsigned t = 0; t < n_thr-1; ++t) {
			st = en; en = slice_offset(t+2, n_thr, sz, (T*)0);
			// each worker migrates its own slice to its own node
			thread_start_off (t, (thr_job_t)numa_opt_bdmat_job<T>,
					  threads[t].numa_node, fault_in,
					  st, en, &bm, &bmcfg, (void*)0);
		}
		/* The first slice is handled by the main thread */
		res = numa_opt_bdmat_helper(bm, bmcfg,
					    main_numa_node, fault_in, 0, first);
		/* Wait for the end */
		for (unsigned t = 0; t < n_thr-1; ++t) {
			job_output out;
			thread_wait (t, &out);
			res += out.t_res_l;
		}
		//fprintf(stderr, "NUMA Optimize Matrix %p: %li pages moved\n", m.getrowptr(0), res);
		return res;
	}
}
2913#else
/// Fallback when built without SMP/libnuma support: there is nothing to
/// migrate, so report zero moved pages.
/// @param bm        matrix (ignored here)
/// @param fault_in  pre-fault request (currently ignored, see FIXME)
/// @return          always 0
template <typename T>
int numa_optimize(const BdMatrix<T>& bm, bool fault_in)
{
	/* FIXME: We should emulate fault_in here, no? */
	return 0;
}
2920#endif
2921
2922
2923
2925
2926#endif /* TBCI_BAND_MATRIX_H */
const Vector< T > const Vector< T > const Vector< T > & p
Definition LM_fit.h:98
long int Vector< T > & index
Definition LM_fit.h:69
int i
Definition LM_fit.h:71
doublereal y
Definition TOMS_707.C:27
void do_bdmat_vec_transmult_lnw_opt(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
#define INTDIVEQ(T)
#define BDMATDBG(x)
Definition band_matrix.h:32
STD__ istream & operator>>(STD__ istream &istr, BdMatrix< T > &mat)
#define SFORALL_T(op)
#define SMP_BDMATSLICE2
void do_bdmat_vec_mult_lnw_opt(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
int numa_optimize(const BdMatrix< T > &bm, bool fault_in)
void do_bdmat_vec_dotmult(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
void do_bdmat_vec_mult(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
#define SFORALL_S(op)
BdMatrix< T > transpose(BdMatrix< T > &mat)
#define STDDEF_SS(op)
#define STDDEF_ST(op)
#define MAY_PREFETCH_R(adr, loc)
void do_bdmat_vec_mult_diagw_exact(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
void do_bdmat_vec_transmult(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
void do_bdmat_vec_transmult_diagw_exact(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
#define MAY_PREFETCH_W(adr, loc)
void do_bdmat_vec_transmult_lnw(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
void job_bdmat_vec_transmult(struct thr_ctrl *tc)
void job_bdmat_vec_mult(struct thr_ctrl *tc)
void do_bdmat_vec_mult_lnw(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
STD__ ostream & operator<<(STD__ ostream &ostr, const BdMatrix< T > &mat)
#define BD_MINVAL
Definition band_matrix.h:21
#define CSTD__
Definition basics.h:340
#define STD__
Definition basics.h:338
#define HOT
Definition basics.h:495
#define BCHK(cond, exc, txt, ind, rtval)
Definition basics.h:575
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition basics.h:100
#define MIN(a, b)
Definition basics.h:655
#define MIN_ALIGN2
Definition basics.h:424
#define FRIEND_TBCI2__
Definition basics.h:335
#define NAMESPACE_END
Definition basics.h:323
#define TBCICOMP(n, o, t, s)
Definition basics.h:981
#define INST(x)
Definition basics.h:238
#define EXPCHK(cond, exc, txt, ind, rtval)
Definition basics.h:630
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition basics.h:748
#define NAMESPACE_TBCI
Definition basics.h:317
#define PREFETCH_W_MANY(addr, loc)
Definition basics.h:765
#define BCHKNR(cond, exc, txt, ind)
Definition basics.h:586
#define UNLIKELY(expr)
Definition basics.h:101
#define REGISTER
Definition basics.h:108
#define MATH__
Definition basics.h:339
#define NOINST
Definition basics.h:244
#define TBCIFILL(n, v, t, s)
Definition basics.h:912
#define ALIGN(x)
Definition basics.h:444
#define TBCICOPY(n, o, t, s)
Definition basics.h:895
#define PREFETCH_R_MANY(addr, loc)
Definition basics.h:764
#define COLD
Definition basics.h:496
#define MIN_ALIGN
Definition basics.h:421
#define MAX(a, b)
Definition basics.h:656
#define FGD
Definition basics.h:144
#define T
Definition bdmatlib.cc:20
provides basic Vector functionality but arithmetic operators (+=, - , *, /...).
Definition bvector.h:68
unsigned long size() const HOT
Definition bvector.h:144
bool contains(const T &, unsigned long *=0) const
Definition bvector.h:580
BVector< T > & resize(const BVector< T > &)
Actually it's a resize and copy (some people would expect the assignment op to do this).
Definition bvector.h:361
T * vec
Definition bvector.h:73
BVector< T > & remove(const unsigned long)
Definition bvector.h:435
const T & getcref(const unsigned long idx) const
Definition bvector.h:129
tbci_traits< T >::const_refval_type get(const unsigned long idx) const HOT
Definition bvector.h:132
T & set(const unsigned long idx) HOT
Definition bvector.h:134
BVector< T > & append(const T &)
performs poorly
Definition bvector.h:412
exception class
Definition band_matrix.h:47
BdMatrixErr(const char *t, const long i=0)
Definition band_matrix.h:51
BdMatrixErr(const BdMatrixErr &be)
Definition band_matrix.h:53
virtual ~BdMatrixErr()
Definition band_matrix.h:55
The class BdMatrix is an implementation to store and do operations on sparse Matrices with a band str...
tbci_traits< T >::const_refval_type get(const unsigned int, const unsigned int) const HOT
BVector< T * > adiag
BdMatrix< T > & clear()
bool operator==(const BdMatrix< T > &) const
friend void FRIEND_TBCI2__ do_bdmat_vec_mult FGDT(const unsigned, const unsigned, TVector< T > *, const BdMatrix< T > *, const Vector< T > *)
TVector< T > get_col(const unsigned int) const
Construct Vector from column.
BdMatrix< T > & setunit(const T &=1)
BdMatrix< T > & expand(unsigned=0)
const BVector< unsigned int > & getcfg() const
ro access to config BVector
unsigned long arsz
TVector< T > transMult(const Vector< T > &) const HOT
Transposed Matrix-Vector multiplication.
BdMatrix< T > & removediag(const unsigned)
friend void FRIEND_TBCI2__ do_bdmat_vec_transmult FGDT(const unsigned, const unsigned, TVector< T > *, const BdMatrix< T > *, const Vector< T > *)
BdMatrix< T > & reconfig(const BVector< unsigned int > &) COLD
BdMatrix< T > & operator-=(const BdMatrix< T > &)
BdMatrix< T > inverse() const
BdMatrix< T > operator/(const T &) const
BVector< T * > bdiag
pointers to upper and lower diagonals
unsigned int size() const
size: incompatibility to Matrix (!)
BdMatrix< T > & div_rows(const Vector< T > &)
friend void FRIEND_TBCI2__ do_bdmat_vec_transmult_diagw_exact FGDT(const unsigned, const unsigned, TVector< T > *, const BdMatrix< T > *, const Vector< T > *)
friend void FRIEND_TBCI2__ do_bdmat_vec_mult_diagw_exact FGDT(const unsigned, const unsigned, TVector< T > *, const BdMatrix< T > *, const Vector< T > *)
void do_copy(const BdMatrix< T > &)
T * check(const unsigned r, const unsigned c) const HOT
T & operator()(const unsigned int, const unsigned int) HOT
rw member access
T trace() const
BdMatrix< T > & operator+=(const BdMatrix< T > &)
friend void FRIEND_TBCI2__ do_bdmat_vec_mult_lnw_opt FGDT(const unsigned, const unsigned, TVector< T > *, const BdMatrix< T > *, const Vector< T > *)
T * check_internal(const unsigned r, const unsigned c) const HOT
bool operator!=(const BdMatrix< T > &ma) const
BdMatrix< T > & transpose()
transpose() does change the object!
unsigned int getmaxoff() const
max distance from main diagonal
BdMatrix< T > operator-() const
unsigned int maxoff
size, max offset from diag
friend void FRIEND_TBCI2__ do_bdmat_vec_transmult_lnw_opt FGDT(const unsigned, const unsigned, TVector< T > *, const BdMatrix< T > *, const Vector< T > *)
BdMatrix(const BdMatrix< U > &bm)
BdMatrix< T > & operator*=(const T &)
BdMatrix< T > & setval(const T &v, const unsigned int r, const unsigned int c)
autoinsert, sparse class compatible
void free_diags(const unsigned)
Implementation alternative: All memory is allocated in one big chunk (default).
TVector< T > dotMult(const Vector< T > &) const
TVector< T > get_row(const unsigned int) const
Construct Vector from row.
BdMatrix< T > & set_row(const Vector< T > &val, const unsigned, const unsigned=0)
Overwrite row with Vector.
BdMatrix< T > & adddiag(const unsigned)
friend void FRIEND_TBCI2__ do_bdmat_vec_mult_lnw FGDT(const unsigned, const unsigned, TVector< T > *, const BdMatrix< T > *, const Vector< T > *)
BdMatrix< T > & operator=(const BdMatrix< T > &)
The assignment operators in TBCI are NOT resizing.
unsigned int rows() const
number of rows
BdMatrix< T > & operator/=(const T &)
unsigned int dim
BdMatrix< T > & autoinsert(const T &, const unsigned, const unsigned)
BdMatrix< T > operator+(const BdMatrix< T > &) const
BdMatrix< T > operator*(const BdMatrix< T > &) const
void destroy()
destroy object explicitly
friend void FRIEND_TBCI2__ do_bdmat_vec_dotmult FGDT(const unsigned, const unsigned, TVector< T > *, const BdMatrix< T > *, const Vector< T > *)
BdMatrix< T > & resize(const unsigned int)
unsigned int noelem() const
Number of elements that are actually stored.
friend NOINST int lu_decomp FGDT(BdMatrix< T > &) HOT
void constructor(const unsigned int, const bool=true)
BdMatrix< T > & div_row(const T &, const unsigned)
static const char * mat_info()
BdMatrix< T > & mult_rows(const Vector< T > &)
Linewise multiplication.
BdMatrix< T > & mult_row(const T &, const unsigned)
BdMatrix< T > & fill(const T &=0)
BdMatrix< T > & resize(const BdMatrix< T > &bdm)
Resizing assignment.
friend void FRIEND_TBCI2__ do_bdmat_vec_transmult_lnw FGDT(const unsigned, const unsigned, TVector< T > *, const BdMatrix< T > *, const Vector< T > *)
const BdMatrix< T > & setoutopts(int w=5, int p=4, char f=' ') const
friend NOINST void gaussj FGDT(const BdMatrix< T > &, const Matrix< T > &)
friend class BdMatrix
BVector< unsigned > diagconf
configuration
unsigned int columns() const
number of columns
T aligned_value_type TALIGN(MIN_ALIGN2)
~BdMatrix() THROWEXCEPT
NumErr()
Definition except.h:65
unsigned int rows() const
number of rows
Definition matrix.h:317
unsigned int columns() const
number of columns
Definition matrix.h:315
T & setval(const T &val, const unsigned int r, const unsigned int c)
Definition matrix.h:281
Temporary Base Class Idiom: Class TVector is used for temporary variables.
Definition vector.h:73
tbci_traits< T >::const_refval_type get(const unsigned long i) const
Definition vector.h:190
const T & getcref(const unsigned long i) const
Definition vector.h:188
unsigned long size() const
Definition vector.h:104
T & set(const T &val, const unsigned long i) const
Definition vector.h:194
T & setval(const unsigned long i) const
Definition vector.h:192
double fabs(const TBCI__ cplx< T > &c)
Definition cplx.h:746
NAMESPACE_END NAMESPACE_CPLX TBCI__ cplx< T > conj(const TBCI__ cplx< T > &c)
Definition cplx.h:663
#define THROWEXCEPT
Definition except.h:114
#define abs(x)
Definition f2c.h:178
#define min(a, b)
Definition f2c.h:180
return c
Definition f_matrix.h:760
#define _DIM
Definition f_matrix.h:39
NAMESPACE_TBCI char gaussj(Matrix< T > &a, Matrix< T > &b)
F_TMatrix< double > inv(const F_Matrix< double > &A, int overwriteA=0)
Definition lapack.cpp:181
#define TBCIDELETE(t, v, sz)
#define REALLOC(v, os, t, s)
#define NEW(t, s)
int lu_decomp(Matrix< T > &) HOT
LU decomposes the TBCI::Matrix mat.
Definition lu_solver.h:94
const unsigned end
const unsigned TMatrix< T > * res
const unsigned ac
void thread_start_off(const int thr_no, thr_job_t job, const unsigned long off, const unsigned long sz,...)
Definition smp.cc:979
void thread_wait(const int thr_no, struct job_output *out)
Definition smp.cc:997
int main_numa_node
Definition smp.cc:110
struct thr_struct * threads
Definition smp.cc:106
int numa_avail
Definition smp.cc:105
void(* thr_job_t)(struct thr_ctrl *)
Before the double inclusion guard on purpose!
Definition smp.h:126
#define threads_avail(x)
Definition smp.h:322
int numa_node
Definition smp.h:5
long t_res_l
Definition smp.h:148
long t_res_l
Definition smp.h:179
unsigned long t_off
Definition smp.h:172
unsigned long t_size
Definition smp.h:171
void * t_par[6]
Definition smp.h:173
unsigned int do_exactsum2()
Definition tbci_param.h:151