TBCI Numerical high-performance C++ Library 2.8.0
band_matrix.h
Go to the documentation of this file.
1 
6 // Last change: 98/11/03 KG
7 /* $Id: band_matrix.h,v 1.64.2.119 2022/11/03 17:28:10 garloff Exp $ */
8 
9 #ifndef TBCI_BAND_MATRIX_H
10 #define TBCI_BAND_MATRIX_H
11 
12 //#define BDMAT_NEW_ALLOC
13 
14 #include "tbci/basics.h"
15 #include "tbci/vector.h"
16 #include "tbci/matrix.h"
17 
18 // Important: Values smaller than this will be considered ZERO
19 // The default is reasonable for doubles which are of order 1
20 #ifndef BD_MINVAL
21 # define BD_MINVAL 1e-16
22 #endif
23 
25 #ifndef BDMAT_NEW_ALLOC
26 # define BDMAT_ONE_BLOCK
27 #endif
28 
29 #ifdef BDMAT_MEM_TRACE
30 # define BDMATDBG(x) x
31 #else
32 # define BDMATDBG(x)
33 #endif
34 
35 // Avoid -fguiding-decls
36 #if !defined(NO_GD) && !defined(AUTO_DECL)
37 # include "tbci/band_matrix_gd.h"
38 #endif
39 
41 
42 #ifndef TBCI_DISABLE_EXCEPT
43 //# include "except.h" is in basics.h
44 
// Exception type thrown by BdMatrix operations; thin wrapper around NumErr.
// NOTE(review): this Doxygen listing collapsed original lines 49 and 53 —
// presumably the default-constructor signature and a copy-constructor
// signature (BdMatrixErr(const BdMatrixErr& be)) — confirm against the
// real header before editing.
46 class BdMatrixErr : public NumErr
47 {
48  public:
50  : NumErr("Error in var conf Band_Matrix class") {}
51  BdMatrixErr(const char* t, const long i = 0)
52  : NumErr(t, i) {}
54  : NumErr(be) {}
55  virtual ~BdMatrixErr() throw() {}
56 };
57 #endif /* TBCI_DISABLE_EXCEPT */
58 
59 #ifdef PRAGMA_I
60 # pragma interface "band_matrix.h"
61 #endif
62 
63 // Forward - declaration
64 template <typename T> class ILU0_BdMatrixPreconditioner;
65 template <typename T> class DILU_BdMatrixPreconditioner;
66 
// Band matrix with a variable diagonal configuration: the main diagonal is
// stored in `diag`, and each configured off-diagonal offset di (listed in
// `diagconf`) has an upper band (adiag) and a lower band (bdiag), each of
// length dim-di. The matrix is always square (dim x dim).
102 template <typename T>
103 class BdMatrix : public Matrix_Sig<T>
104 {
107  protected:
108 
// dim: matrix order; maxoff: largest configured off-diagonal offset.
109  unsigned int dim, maxoff;
// arsz: total number of stored elements (diagonal + all bands).
110  unsigned long arsz;
// NOTE(review): collapsed lines 111-112 presumably declare the band-pointer
// tables and the configuration vector (e.g. BVector<T*> adiag, bdiag;
// BVector<unsigned int> diagconf;) — members used throughout below; confirm.
113  T* diag;
114  //BVector<unsigned int> bestrow;
115 
116  //protected constructor
117  void constructor(const unsigned int, const bool = true);
// check(): bounds-checked element lookup; returns 0 if (r,c) is outside
// the stored band structure. check_internal() skips the index checks.
118  T* check(const unsigned r, const unsigned c) const HOT;
119  T* check_internal(const unsigned r, const unsigned c) const HOT;
120  inline void free_diags(const unsigned);
121  inline void do_copy(const BdMatrix<T>&);
122 
123  private:
// dummy: writable zero returned by element access for positions outside
// the stored bands (mutable so const accessors can reset it).
124  mutable T dummy ALIGN(MIN_ALIGN);
// Output formatting options (width/precision/fill) set by setoutopts().
125  mutable int outw, outp, outf;
126  mutable bool outset;
127 
128  inline bool search2(unsigned int&, const unsigned int, const unsigned int) HOT;
129 
130  public:
131 
132  typedef T value_type;
133  typedef T element_type;
134  typedef T aligned_value_type TALIGN(MIN_ALIGN2);
135  // constructors and destructors
136 
138  explicit BdMatrix(const unsigned int = 0);
140  explicit BdMatrix(const unsigned int, const unsigned int);
142  explicit BdMatrix(const unsigned int, const BVector<unsigned int>&);
144  BdMatrix(const BdMatrix<T>&);
146  BdMatrix(const T&, const unsigned int);
147  BdMatrix(const T&, const unsigned int, const BVector<unsigned int>&);
149  BdMatrix(const T&, const unsigned int, unsigned int);
151  BdMatrix(const Matrix<T>&);
153 
154 
// Release all storage and reset to an empty (0x0) matrix.
// NOTE(review): collapsed line 158 presumably calls free_diags()/deallocates
// `diag` before the resets below — confirm; as listed this would leak.
156  void destroy()
157  {
159  dim = 0; arsz = 0;
160  adiag.resize(0); bdiag.resize(0); diagconf.resize(0);
161  }
162 
163  // allow instantiation (Matrix_Sig)
164  /*virtual*/ static const char* mat_info()
165  { return "BdMatrix"; }
166 
167 #ifndef HAVE_PROMOTION_BUG
168  // Promotion (only explicit)
169 # ifndef HAVE_GCC295_TMPLFRNDCLS_BUG
170  template <typename U> friend class BdMatrix;
// Explicit element-wise conversion from BdMatrix<U>; copies diagonal,
// both band sets, and (if set) the output-format options.
171  template <typename U> explicit BdMatrix(const BdMatrix<U>& bm)
172  : diagconf(bm.diagconf), adiag(bm.dim), bdiag(bm.dim)
173  {
174  constructor(bm.dim);
175  for (unsigned k=0; k<dim; ++k)
176  diag[k] = bm.diag[k];
177  for (unsigned i=0; i<diagconf.size(); ++i) {
178  const unsigned di = diagconf.get(i);
179  for (unsigned j=0; j<dim-di; ++j) {
180  adiag.get(di)[j] = bm.adiag.get(di)[j];
181  bdiag.get(di)[j] = bm.bdiag.get(di)[j];
182  }
183  }
184  if (bm.outset) {
185  outw = bm.outw; outp = bm.outp;
186  outf = bm.outf; outset = true;
187  }
188  }
189 # endif
190 #endif
191 
193  operator TMatrix<T>() const;
194  //operator Matrix<T>() const;
195 
// Element access: non-const returns a reference into the bands, or a
// reference to the shared zero `dummy` when (r,c) is outside the bands.
197  T& operator() (const unsigned int, const unsigned int) HOT;
199  typename tbci_traits<T>::const_refval_type
200  operator() (const unsigned int, const unsigned int) const HOT;
201  typename tbci_traits<T>::const_refval_type
202  get(const unsigned int, const unsigned int) const HOT;
203 
// setval(v,r,c): value set with automatic diagonal insertion.
205  BdMatrix<T>& setval(const T& v, const unsigned int r,
206  const unsigned int c)
207  { return autoinsert(v,r,c); }
208  T& setval(const unsigned int r, const unsigned int c);
209  //const T& getcref(const unsigned r, const unsigned c) const
210  //{ return (*this)(r,c); }
211 
212  //T val(const unsigned int i, const unsigned int j) const
213  // { return this->operator() (i, j); }
214 
215  // Basic operations
218 
219  BdMatrix<T>& resize(const unsigned int);
220  BdMatrix<T>& resize(const T&, const unsigned int);
221  BdMatrix<T>& resize(const unsigned int, const BVector<unsigned int>&);
222  BdMatrix<T>& resize(const T&, const unsigned int, const BVector<unsigned int>&);
// NOTE(review): collapsed line 224 presumably is the signature of a
// copy()-style member (e.g. BdMatrix<T>& copy(const BdMatrix<T>& bdm))
// whose body follows on the next line — confirm against the real header.
225  { resize(bdm.dim, bdm.diagconf); return *this = bdm; }
226 
228  BdMatrix<T>& setunit(const T& = 1); //unity Matrix
229  BdMatrix<T>& fill(const T& = 0);
230  BdMatrix<T>& clear();
231  BdMatrix<T>& adddiag(const unsigned);
232  BdMatrix<T>& removediag(const unsigned);
233  BdMatrix<T>& autoinsert(const T&, const unsigned, const unsigned);
234  BdMatrix<T>& expand(unsigned = 0);
236  BdMatrix<T>& set_row(const Vector<T>& val, const unsigned, const unsigned = 0);
238  TVector<T> get_row(const unsigned int) const;
240  TVector<T> get_col(const unsigned int) const;
241 
243  inline unsigned int size() const { return dim; }
245  inline unsigned int noelem() const { return arsz; }
247  inline unsigned int rows() const { return dim; }
249  inline unsigned int columns() const { return dim; }
250 
252  const BVector<unsigned int>& getcfg() const { return diagconf; }
254  unsigned int getmaxoff() const { return maxoff; }
255 
// Sum of the main diagonal.
256  T trace() const
257  {
258  T r ALIGN(MIN_ALIGN) = 0;
259  for (REGISTER unsigned i = 0; i < dim; ++i)
260  r += diag[i];
261  return r;
262  }
263  //T sdet(unsigned int, BVector<unsigned int>&, BVector<unsigned int>&);
264  //T subdet(BVector<unsigned int>, BVector<unsigned int>);
265  //T det() { return subdet(BVector<unsigned>(0), BVector<unsigned>(0)); }
266 
269  BdMatrix<T>& operator = (const T&);
272  //BdMatrix<T>& operator *= (BdMatrix<T>&);
273  BdMatrix<T>& operator *= (const T&);
274  BdMatrix<T>& operator /= (const T&);
275 
276  // Don't need friendship here. Good!
277 #ifdef BDMAT_TRANS_FRIENDS
279 #endif
280 
281  bool operator == (const BdMatrix<T>&) const;
282  bool operator != (const BdMatrix<T>& ma) const { return !(*this == ma); }
283 
284  BdMatrix<T> operator + (const BdMatrix<T>&) const;
285  BdMatrix<T> operator - (const BdMatrix<T>&) const;
286  BdMatrix<T> operator * (const BdMatrix<T>&) const;
287  BdMatrix<T> operator * (const T&) const;
288  // Friendship also not required
289 #ifdef BDMAT_OP_FRIENDS
290  friend NOINST BdMatrix<T> operator * FGD (const T&, const BdMatrix<T>&);
291 #endif
292 
293  BdMatrix<T> operator / (const T&) const;
294  BdMatrix<T> operator - () const;
295 
296  /*
297  * friend Vector<T> rvec FGD (const BdMatrix<T>&);
298  * friend Vector<T> cvec FGD (const BdMatrix<T>&);
299  */
300 
302  TVector<T> operator * (const Vector<T>&) const HOT;
304  TVector<T> transMult(const Vector<T>&) const HOT;
305  //TVector<T> transMult(const TVector<T>&) const;
306  //TVector<T> transMult(const TSVector<T>&) const;
307  TVector<T> dotMult(const Vector<T>&) const;
308 
309  TMatrix<T> operator * (const Matrix<T>&) const;
310 
312  BdMatrix<T>& mult_rows(const Vector<T>&);
313  BdMatrix<T>& div_rows(const Vector<T>&);
314  BdMatrix<T>& mult_row(const T&, const unsigned);
315  BdMatrix<T>& div_row(const T&, const unsigned);
316 
317  // I/O operations
318 
319  friend STD__ istream& operator >> FGD (STD__ istream&, BdMatrix<T>&);
320  friend STD__ ostream& operator << FGD (STD__ ostream&, const BdMatrix<T>&);
321  const BdMatrix<T>& setoutopts(int w = 5, int p = 4, char f= ' ') const
322  { outw = w; outp = p; outf = f; outset = true; return *this; }
323 
324  // Solver
325  friend NOINST void /* FRIEND_TBCI2__ */ gaussj FGDT (const BdMatrix<T>&,
326  const Matrix<T>&);
327  friend NOINST int /* FRIEND_TBCI2__ */ lu_decomp FGDT (BdMatrix<T>&) HOT;
328 
329  // Inversion by determinants
330  BdMatrix<T> inverse() const;
331 
332  // Friends for bdmat-vec mult
333  friend void FRIEND_TBCI2__ do_bdmat_vec_mult_lnw FGDT (const unsigned, const unsigned,
334  TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
335  friend void FRIEND_TBCI2__ do_bdmat_vec_mult_lnw_opt FGDT (const unsigned, const unsigned,
336  TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
337  friend void FRIEND_TBCI2__ do_bdmat_vec_mult_diagw_exact FGDT (const unsigned, const unsigned,
338  TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
339  friend void FRIEND_TBCI2__ do_bdmat_vec_mult FGDT (const unsigned, const unsigned,
340  TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
341  friend void FRIEND_TBCI2__ do_bdmat_vec_transmult_lnw FGDT (const unsigned, const unsigned,
342  TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
343  friend void FRIEND_TBCI2__ do_bdmat_vec_transmult_lnw_opt FGDT (const unsigned, const unsigned,
344  TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
345  friend void FRIEND_TBCI2__ do_bdmat_vec_transmult_diagw_exact FGDT (const unsigned, const unsigned,
346  TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
347  friend void FRIEND_TBCI2__ do_bdmat_vec_transmult FGDT (const unsigned, const unsigned,
348  TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
349  friend void FRIEND_TBCI2__ do_bdmat_vec_dotmult FGDT (const unsigned, const unsigned,
350  TVector<T> *, const BdMatrix<T> *, const Vector<T> *);
351 };
352 
353 
355 #ifdef BDMAT_ONE_BLOCK
356 
// ONE_BLOCK layout: the whole matrix lives in a single allocation rooted at
// `diag`, so freeing means deleting that one block of `arsz` elements.
// The parameter is unused here (kept for interface parity with the
// per-diagonal variant below).
357 template <typename T>
358 inline void BdMatrix<T>::free_diags(const unsigned dummy)
359 {
360  BDMATDBG(STD__ cerr << "BdMat free " << diag << " (" << dim << ")" << STD__ endl);
361  if (dim)
362  TBCIDELETE(T, diag, arsz);
363 }
364 
// ONE_BLOCK copy: bulk-copy the single storage block, then rebase the band
// pointers by the offset each source pointer had from the source's `diag`.
// The (long) pointer arithmetic preserves swapped adiag/bdiag layouts
// (e.g. after transpose) instead of assuming the standard setup.
365 template <typename T>
366 inline void BdMatrix<T>::do_copy(const BdMatrix<T>& bdm)
367 {
368  TBCICOPY(diag, bdm.diag, T, arsz);
369  // In case standard setup has been changed, e.g. by transpose
370  for (unsigned i = 0; i < diagconf.size(); ++i) {
371  const unsigned di = diagconf.get(i);
372  adiag.set(di) = (T*)((long)diag + (long)bdm.adiag.get(di) - (long)bdm.diag);
373  bdiag.set(di) = (T*)((long)diag + (long)bdm.bdiag.get(di) - (long)bdm.diag);
374  }
375 }
376 
// ONE_BLOCK allocator: computes the total element count (dim for the main
// diagonal plus 2*(dim-di) per configured offset di), allocates one block,
// and carves adiag/bdiag pointers out of it. Invalid offsets in diagconf
// are reported and skipped (their pointers stay NULL). On allocation
// failure the matrix is reset to empty. `delout` controls whether the
// output-format flag is cleared (false when called from resize paths that
// keep the old options).
377 template <typename T>
378 void BdMatrix<T>::constructor(const unsigned int d, const bool delout)
379 {
380  dim = d; maxoff = 0;
381  //maxoff = findmax(diagconf);
382  BCHKNR(dim > 0 && maxoff >= dim, BdMatrixErr, Wrong config Vector, maxoff);
383  adiag.resize((T*)0, d); bdiag.resize((T*)0, d); //initialization with NULL Vectors
384  unsigned long sz = dim; int t;
385  const unsigned long dcs = diagconf.size();
386  for (REGISTER unsigned j = 0; j < dcs; ++j)
387  if ((t = ((signed)dim - (signed)diagconf.get(j))) > 0)
388  sz += 2 * t;
389  if (UNLIKELY(sz)) {
390  diag = NEW(T, sz);
391  if (LIKELY(diag)) {
// sz is reused below as a running offset into the block.
392  adiag.set(0) = diag; bdiag.set(0) = diag; sz = dim;
393  for (unsigned i = 0; i < dcs; ++i) {
394  const unsigned int di = diagconf.get(i);
395  int s = (signed)dim - (signed)di;
396  if (UNLIKELY(s <= 0 || s >= (signed)dim)) {
397  STD__ cerr << "Wrong config. Vector supplied!" << STD__ endl;
398  s = 0;
399  } else {
400  adiag.set(di) = diag+sz; sz += s;
401  bdiag.set(di) = diag+sz; sz += s;
402  }
403  if (LIKELY(di > maxoff))
404  maxoff = di;
405  }
406  } else { // alloc failure
407  adiag.resize(0); bdiag.resize(0); dim = 0;
408  STD__ cerr << "BdMat ctor: malloc failed to get " << sz << " bytes!" << STD__ endl;
409  BCHKNR(!diag, BdMatrixErr, memory alloc failure, sz);
410  sz = 0;
411  }
412  }
413  else
414  diag = (T*)0;
415 
416  arsz = sz; dummy = (T)0;
417  BDMATDBG(STD__ cerr << "BdMat alloc " << diag << " (" << sz << ")" << STD__ endl);
418  if (delout)
419  outset = false;
420 }
421 
422 #else /* ! BDMAT_ONE_BLOCK */
423 
424 /* Free diags until diagconf(max)-1 */
/* Free diags until diagconf(max)-1 */
// Per-diagonal layout: each offset owns one 2*(dim-di) allocation holding
// both its upper and lower band; `u` limits how many diagconf entries to
// free (used for partial cleanup after a mid-construction alloc failure).
// MIN(adiag,bdiag) recovers the allocation base even if transpose() swapped
// the two pointers.
425 template <typename T>
426 inline void BdMatrix<T>::free_diags(const unsigned u)
427 {
428  for (int i = u-1; i >= 0; --i) {
429  // The diags could have been swapped by transpose ()
430  REGISTER const unsigned di = diagconf.get(i);
431  REGISTER T* ptr = MIN(adiag.get(di), bdiag.get(di));
432  BDMATDBG(STD__ cerr << "BdMat free " << ptr << STD__ endl);
433  TBCIDELETE(T, ptr, 2*(dim-di));
434  }
435  BDMATDBG(STD__ cerr << "BdMat free " << diag << " (" << dim << ")" << STD__ endl);
436  if (LIKELY(dim))
437  TBCIDELETE(T, diag, dim);
438 }
439 
// Per-diagonal copy: the main diagonal and each band are separate
// allocations, so each is copied individually (no pointer rebasing needed).
440 template <typename T>
441 inline void BdMatrix<T>::do_copy(const BdMatrix<T>& bdm)
442 {
443  TBCICOPY(diag, bdm.diag, T, dim);
444  for (unsigned i = 0; i < diagconf.size(); ++i) {
445  REGISTER const unsigned di = diagconf.get(i);
446  TBCICOPY(adiag.get(di), bdm.adiag.get(di), T, (dim-di));
447  TBCICOPY(bdiag.get(di), bdm.bdiag.get(di), T, (dim-di));
448  }
449 }
450 
// Per-diagonal allocator: one NEW for the main diagonal, then one NEW of
// 2*s elements per valid offset (upper band first, lower band at ptr+s).
// Invalid offsets are removed from diagconf (salvage). On any allocation
// failure, everything allocated so far is released via free_diags(i) and
// the matrix is left empty.
451 template <typename T>
452 void BdMatrix<T>::constructor(const unsigned int d, const bool delout)
453 {
454  unsigned long sz = 0; unsigned i = 0;
455  dim = d; maxoff = 0;
456  //maxoff = findmax(diagconf);
457  BCHKNR (dim > 0 && maxoff >= dim, BdMatrixErr, Wrong config Vector, maxoff);
458  //initialization with NULL Vectors
459  adiag.resize((T*)0, d);
460  bdiag.resize((T*)0, d);
461 
462  if (LIKELY(!dim))
463  diag = (T*)0;
464  else {
465  diag = NEW(T,dim);
466  if (UNLIKELY(!diag))
467  goto memallocfail;
468  sz += dim;
469  adiag.set(0) = diag; bdiag.set(0) = diag; // paranoia
470  }
471 
// `i` is only advanced on success so free_diags(i) knows what to undo.
472  while (i < diagconf.size()) {
473  const unsigned di = diagconf.get(i);
474  int s = (signed)dim - (signed)di;
475  if (UNLIKELY(s <= 0 || s >= (signed)dim)) {
476  BCHKNR(true, BdMatrixErr, Illegal Config BVector, di);
477  // Try to salvage
478  diagconf.remove(i);
479  } else {
480  T* ptr = NEW(T, 2*s);
481  if (UNLIKELY(!ptr))
482  goto memallocfail;
483  adiag.set(di) = ptr;
484  bdiag.set(di) = ptr+s;
485  if (LIKELY(di > maxoff))
486  maxoff = di;
487  sz += 2*s; ++i;
488  }
489  }
490  arsz = sz; dummy = (T)0;
491  if (delout)
492  outset = false;
493  BDMATDBG(STD__ cerr << "BdMat alloc " << diag << " (" << sz << ")" << STD__ endl);
494  return;
495  memallocfail:
496  free_diags(i);
497  adiag.resize(0); bdiag.resize(0); dim = 0; //sz = 0;
498  STD__ cerr << "BdMat ctor: mem alloc (" << sz << ") failed!" << STD__ endl;
499  BCHKNR(1, BdMatrixErr, mem alloc failure, sz);
500  return;
501 }
502 
503 #endif /* BDMAT_ONE_BLOCK */
504 
// Default/size constructor: tridiagonal shape (one off-diagonal at offset 1)
// unless d <= 1, in which case only the main diagonal is configured.
505 template <typename T>
506 inline BdMatrix<T>::BdMatrix(const unsigned int d)
507 : diagconf(1, 1), adiag(d), bdiag(d)
508 {
509  if (UNLIKELY(d <= 1))
510  diagconf.resize(0); // default: Tridiagonal shape
511  constructor(d); clear();
512 }
513 
// (rows, cols) constructor for interface parity with dense matrices;
// rejects non-square shapes, otherwise identical to BdMatrix(d).
514 template <typename T>
515 inline BdMatrix<T>::BdMatrix(const unsigned int d, const unsigned int e)
516 : diagconf(1, 1), adiag(d), bdiag(d)
517 {
518  BCHKNR(e != d, BdMatrixErr, "Non square BdMatrix shape impossible!", e);
519  if (UNLIKELY(d <= 1))
520  diagconf.resize(0); // default: Tridiagonal shape
521  constructor(d); clear();
522 }
523 
// Constructor with an explicit off-diagonal configuration vector.
524 template <typename T>
525 inline BdMatrix<T>::BdMatrix(const unsigned int d, const BVector<unsigned int>& conf)
526 : diagconf(conf), adiag(d), bdiag(d)
527 {
528  constructor(d); clear();
529 }
530 
531 
// Copy constructor: clones configuration, storage, and (if set) output
// options. NOTE(review): the listing collapsed original line 533, which is
// presumably the signature `inline BdMatrix<T>::BdMatrix(const BdMatrix<T>&
// mat)` — confirm against the real header.
532 template <typename T>
534 : diagconf(mat.diagconf), adiag(mat.dim), bdiag(mat.dim)
535 {
536  constructor(mat.dim); do_copy(mat);
537 
538  if (UNLIKELY(mat.outset)) {
539  outw = mat.outw; outp = mat.outp;
540  outf = mat.outf; outset = true;
541  }
542 }
543 
// Diagonal-value constructor: val on the main diagonal, zero elsewhere
// (setunit zeroes the bands); tridiagonal shape unless d <= 1.
544 template <typename T>
545 inline BdMatrix<T>::BdMatrix(const T& val, const unsigned int d)
546 : diagconf(1, 1), adiag(d), bdiag(d)
547 {
548  if (UNLIKELY(d <= 1))
549  diagconf.resize(0); // default: Tridiagonal shape
550  constructor(d);
551  setunit(val);
552 }
553 
// Value constructor with bandwidth `mo`: configures all offsets 1..mo
// (clamped to d-1), val on the diagonal, zero bands.
554 template <typename T>
555 inline BdMatrix<T>::BdMatrix(const T& val, const unsigned int d, unsigned int mo)
556 : diagconf(mo), adiag(d), bdiag(d)
557 {
558  if (mo >= d)
559  mo = d - 1;
560  diagconf.resize(mo);
561  for (REGISTER unsigned int t = 0; t < mo; ++t)
562  diagconf.set(t) = t+1;
563 
564  constructor(d);
565  setunit(val);
566 }
567 
568 
// Value constructor with an explicit configuration vector: val on the
// diagonal, zero bands.
569 template <typename T>
570 inline BdMatrix<T>::BdMatrix(const T& val, const unsigned int d,
571  const BVector<unsigned int>& conf)
572 : diagconf(conf), adiag(d), bdiag(d)
573 {
574  constructor(d);
575  setunit(val);
576 }
577 
578 
// Conversion from a dense square Matrix<T>: scans each off-diagonal pair
// (upper offset l and lower offset l) and records offset l in diagconf if
// any element on either side is nonzero, then copies the diagonal and the
// configured bands. NOTE(review): collapsed line 580 is presumably the
// signature `BdMatrix<T>::BdMatrix(const Matrix<T>& mat)` — confirm.
579 template <typename T>
581 : diagconf(0, 0), adiag(mat.rows()), bdiag(mat.columns())
582 {
583  BCHKNR(mat.columns() != mat.rows(), BdMatrixErr, Only quadratic matrices, 0);
584 
585  //set up conf Vector
586  unsigned int ds = 0;
587  for (unsigned int l = 1; l < mat.columns(); ++l) {
588  for (unsigned REGISTER j = 0;
589  (signed)j < ((signed)(mat.rows() - l)); ++j) {
590  if (mat(j, j+l) != T(0) || mat(j+l, j) != T(0)) {
591  diagconf.resize(++ds);
592  diagconf.set(ds - 1) = l;
593  break;
594  }
595  }
596  }
597 
598  //std::cout << diagconf << std::endl;
599  constructor(mat.columns());
600 
601  //copy diagonal
602  for (REGISTER unsigned int k = 0; k < dim; ++k)
603  diag[k] = mat(k, k);
604  //copy configured offdiags
605  for (unsigned int i = 0; i < diagconf.size(); ++i) {
606  const unsigned int dci = diagconf.get(i);
607  for (REGISTER unsigned int j = 0; j < dim - dci; ++j) {
608  adiag.get(dci)[j] = mat(j, j + dci);
609  bdiag.get(dci)[j] = mat(dci + j, j);
610  }
611  }
612 }
613 
// Destructor. The BCHKNR asserts that `dummy` is still zero — a nonzero
// value means some caller wrote through the out-of-band dummy reference.
// NOTE(review): collapsed lines 615 and 618 are presumably the signature
// `BdMatrix<T>::~BdMatrix()` and a `free_diags(...)` call — confirm.
614 template <typename T>
616 {
617  BCHKNR(dummy!=(T)0, BdMatrixErr, dummy not zero, 0);
619 }
620 
621 
// Linear search: does |j - i| match any configured offset? On success,
// stores the diagconf INDEX (not the offset) in `cfg` and returns true.
622 template <typename T>
623 inline bool BdMatrix<T>::search2(unsigned int& cfg, const unsigned int i,
624  const unsigned int j)
625 {
626  for (REGISTER int t = 0; (unsigned)t < diagconf.size(); ++t) {
627  if (UNLIKELY(CSTD__ abs((int)(j - i)) == (int)diagconf.get(t))) {
628  cfg = t;
629  return true;
630  }
631  }
632  return false;
633 }
634 
// Map (i,j) to the address of the stored element, or NULL if the position
// lies outside the configured bands. df > 0 (i > j) is the lower band
// (bdiag, indexed by column j); df < 0 is the upper band (adiag, indexed by
// row i). No bounds checking — callers go through check() for that.
635 template <typename T>
636 inline T* BdMatrix<T>::check_internal(const unsigned i, const unsigned j) const
637 {
638  const int df = i-j;
639  if (LIKELY(!df))
640  return diag+i;
641  if (CSTD__ abs(df) > (int)maxoff)
642  return (T*)0;
643 
644  T* ptr;
645  if (df > 0) {
646  if (LIKELY(ptr = bdiag.get( df)))
647  return ptr+j;
648  else
649  return (T*)0;
650  } else {
651  if (LIKELY(ptr = adiag.get(-df)))
652  return ptr+i;
653  else
654  return (T*)0;
655  }
656 }
657 
// Bounds-checked wrapper around check_internal(): rejects out-of-range
// row/column indices (raising BdMatrixErr via EXPCHK where enabled).
658 template <typename T>
659 inline T* BdMatrix<T>::check(const unsigned i, const unsigned j) const
660 {
661  EXPCHK(i>=dim, BdMatrixErr, Invalid row index, i, (T*)0);
662  EXPCHK(j>=dim, BdMatrixErr, Invalid col index, j, (T*)0);
663  return check_internal(i, j);
664 }
665 
666 
// Mutable element access. For positions outside the stored bands this
// returns a reference to the shared `dummy` (reset to zero each time);
// writing a nonzero value through it is an error the destructor detects.
// Use setval()/autoinsert() to write outside the current band structure.
667 template <typename T>
668 inline T& BdMatrix<T>::operator() (const unsigned int r, const unsigned int c)
669 {
670  REGISTER T* pt = check(r, c);
671  if (LIKELY(pt))
672  return *pt;
673  else {
674  //EXPCHK(dummy != (T)0, BdMatrixErr, dummy not zero, 0, (dummy=(T)0));
675  return (dummy = (T)0);
676  }
677 }
678 
// Read-only element access with bounds checks; returns zero (via `dummy`)
// for positions outside the stored bands.
679 template <typename T>
680 inline typename tbci_traits<T>::const_refval_type
681 BdMatrix<T>::operator() (const unsigned int r, const unsigned int c) const
682 {
683  REGISTER T* pt = check(r, c);
684  if (LIKELY(pt))
685  return *pt;
686  else {
687  //EXPCHK(dummy != (T)0, BdMatrixErr, dummy not zero, 0, (dummy=(T)0));
688  return (dummy = (T)0);
689  }
690 }
691 
// Fast read access: like the const operator() but via check_internal(),
// i.e. WITHOUT row/column bounds checking — the caller must pass valid
// indices.
692 template <typename T>
693 inline typename tbci_traits<T>::const_refval_type
694 BdMatrix<T>::get(const unsigned int r, const unsigned int c) const
695 {
696  REGISTER T* pt = check_internal(r, c);
697  if (LIKELY(pt))
698  return *pt;
699  else {
700  //EXPCHK(dummy != (T)0, BdMatrixErr, dummy not zero, 0, (dummy=(T)0));
701  return (dummy = (T)0);
702  }
703 }
704 
705 
// Writable access with auto-insertion: if (r,c) is outside the current
// band structure, the needed off-diagonal is added first (adddiag), then
// the element reference is returned. On internal failure, a reference to
// `dummy` is returned via the BCHK fallback.
706 template <typename T>
707 T& BdMatrix<T>::setval(const unsigned int r, const unsigned int c)
708 {
709  //Auto-insert
710  REGISTER T* pt = check(r, c);
711  if (LIKELY(pt))
712  return *pt;
713  else {
714  adddiag(CSTD__ abs((int)(r-c)));
715  pt = check(r, c);
716  BCHK(!pt, BdMatrixErr, Internal autoinsert error, 0, dummy=(T)0);
717  //*pt = (T)0;
718  }
719  return *pt;
720 }
721 
722 
723 #ifdef BDMAT_ONE_BLOCK
// ONE_BLOCK setunit: val on the main diagonal, zero on all bands. The bands
// follow the diagonal contiguously in the single block, so one fill of the
// remaining (arsz - dim) elements clears them all.
724 template <typename T>
725 inline BdMatrix<T>& BdMatrix<T>::setunit(const T& val)
726 {
727  TBCIFILL(diag, val, T, dim);
728  if (arsz > dim)
729  TBCIFILL((diag+dim), (T)0, T, (arsz-dim));
730  dummy = (T)0;
731  return *this;
732 }
733 #else
// Per-diagonal setunit: val on the main diagonal, then zero each band
// allocation individually.
734 template <typename T>
735 inline BdMatrix<T>& BdMatrix<T>::setunit(const T& val)
736 {
737  TBCIFILL(diag, val, T, dim);
738  for (unsigned i = 0; i < diagconf.size(); ++i) {
739  const REGISTER unsigned di = diagconf.get(i);
740  TBCIFILL(adiag.get(di), (T)0, T, (dim-di));
741  TBCIFILL(bdiag.get(di), (T)0, T, (dim-di));
742  }
743  dummy = (T)0;
744  return *this;
745 }
746 #endif
747 
// Resize to nd x nd, preserving the overlapping MIN(nd, od) prefix of the
// diagonal and of every band; grown regions are zero-filled. The old
// storage is saved, a fresh layout is built via constructor(nd, false)
// (keeping output options), data is copied over, and the old storage is
// released according to the active allocation strategy.
748 template <typename T>
749 BdMatrix<T>& BdMatrix<T>::resize(const unsigned int nd)
750 {
751  if (nd == dim)
752  return *this;
753 
754  T* oldd(diag); unsigned int od = dim;
755 #ifdef BDMAT_ONE_BLOCK
756  unsigned int oldar = arsz;
757 #endif
758  BVector<T*> oldad(adiag), oldbd(bdiag);
759  BVector<unsigned> oldconf(diagconf);
760 
761  //compatibility with constructor
762  if (od <= 1 && nd > 1)
763  diagconf.resize(1, 1);
764  constructor(nd, false); //dim = nd;
765  int min = MIN(nd, od);
766 
767  if (min) {
768  //copy diagonal
769  TBCICOPY(diag, oldd, T, min);
770  if (nd > od)
771  TBCIFILL((diag+min), (T)0, T, (nd-dim));
772  //copy offdiags
773  for (unsigned int t = 0; t < diagconf.size(); ++t) {
774  const REGISTER int s = diagconf.get(t);
775  if (min - s > 0) {
776  TBCICOPY(adiag.get(s), oldad.get(s), T, (min-s));
777  TBCICOPY(bdiag.get(s), oldbd.get(s), T, (min-s));
778  if (nd > od) {
779  TBCIFILL((adiag.get(s)+(min-s)), (T)0, T, (nd-dim));
780  TBCIFILL((bdiag.get(s)+(min-s)), (T)0, T, (nd-dim));
781  }
782  }
783  }
784  }
785 
786 #ifndef BDMAT_ONE_BLOCK
787  for (unsigned i = 0; i < oldconf.size(); ++i)
788  TBCIDELETE(T, MIN(oldad.get(oldconf.get(i)), oldbd.get(oldconf.get(i))),
789  2*(od-oldconf.get(i)));
790  if (od)
791  TBCIDELETE(T, oldd, od);
792 #else
793  if (oldar)
794  TBCIDELETE(T, oldd, oldar);
795 #endif
796  return *this;
797 }
798 
799 
// Resize to nd x nd WITHOUT preserving data, then set val on the diagonal
// and zero on the bands (setunit). NOTE(review): collapsed line 805 —
// presumably the release of the old storage (e.g. `destroy();` or
// `free_diags(...)`) guarded by the `if (dim)` above — confirm; as listed
// the old allocation would leak.
800 template <typename T>
801 BdMatrix<T>& BdMatrix<T>::resize(const T& val, const unsigned int nd)
802 {
803  if (nd != dim) {
804  if (dim)
806  //compatibility with constructor
807  if (dim <= 1 && nd > 1)
808  diagconf.resize(1, 1);
809  constructor(nd, false);
810  }
811  setunit(val);
812  return *this;
813 }
814 
815 
816 //Warning: This resize() will NOT preserve data
816 //Warning: This resize() will NOT preserve data
// NOTE(review): collapsed line 823 — presumably the release of the old
// storage guarded by `if (dim)` (e.g. `destroy();`) — confirm against the
// real header.
817 template <typename T>
818 BdMatrix<T>& BdMatrix<T>::resize(const unsigned int nd, const BVector<unsigned int>& cfg)
819 {
820  if (nd == dim && cfg == diagconf)
821  return *this;
822  if (dim)
824  diagconf.copy(cfg);
825  constructor(nd, false);
826  clear();
827  return *this;
828 }
829 
830 
// Resize with new configuration (data NOT preserved), then place val on
// the main diagonal; resize(nd, cfg) already cleared the bands.
831 template <typename T>
832 BdMatrix<T>& BdMatrix<T>::resize(const T& val, const unsigned int nd,
833  const BVector<unsigned int>& cfg)
834 {
835  resize(nd, cfg);
836  //setunit(val);
837  //clear(); is called by resize(nd, cfg);
838  for (unsigned i = 0; i < dim; ++i)
839  diag[i] = val;
840  return *this;
841 }
842 
843 
844 //reconfigures a BdMatrix and preserves its contents
845 template <typename T>
847 {
848  if (dim == 0 || cfg == diagconf)
849  return *this;
850  T* oldd = diag;
851 #ifdef BDMAT_ONE_BLOCK
852  unsigned int oldar = arsz;
853 #else
854  unsigned int olddim = dim;
855 #endif
856  BVector<T*> oldad(adiag), oldbd(bdiag);
857  BVector<unsigned> oldconf(diagconf);
858 
859  diagconf.resize(cfg); //diagconf = cfg;
860  constructor(dim, false);
861  //delete junk
862  clear();
863  //copy elements: diagonal
864  TBCICOPY(diag, oldd, T, dim);
865  // offdiag
866  for (REGISTER unsigned t = 0; t < diagconf.size(); ++t) {
867  const unsigned int dct = diagconf.get(t);
868  if (oldad(dct)) {
869  TBCICOPY(adiag.get(dct), oldad.get(dct), T, (dim-dct));
870  TBCICOPY(bdiag.get(dct), oldbd.get(dct), T, (dim-dct));
871  }
872  }
873 #ifndef BDMAT_ONE_BLOCK
874  for (unsigned i = 0; i < oldconf.size(); ++i)
875  TBCIDELETE(T, MIN(oldad.get(oldconf.get(i)), oldbd.get(oldconf.get(i))),
876  2*(olddim-oldconf.get(i)));
877  if (oldd)
878  TBCIDELETE(T, oldd, olddim);
879 #else
880  if (oldar)
881  TBCIDELETE(T, oldd, oldar);
882 #endif
883  return *this;
884 }
885 
886 #ifdef BDMAT_ONE_BLOCK
// ONE_BLOCK adddiag: grow the single allocation by 2*(dim-ndg) elements via
// REALLOC and append the new band pair at the end. Because realloc can move
// the block, ALL existing band pointers are rebased by the byte difference
// between the old and new base address (the integer-cast arithmetic exists
// because `new` gives no alignment guarantee for T-unit pointer diffs —
// see the original comment below).
887 template <typename T>
888 BdMatrix<T>& BdMatrix<T>::adddiag(const unsigned ndg)
889 {
890  const int newsz = dim - ndg; /* no of elements in new diagonal */
891  BCHK(newsz <= 0, BdMatrixErr, Diag out of range, ndg, *this);
892  // Diag already stored ?
893  if (LIKELY(diagconf.contains(ndg)))
894  return *this;
895  // Save old position (for diff)
896  T* olddiag = diag;
897  REALLOC(diag, arsz, T, (arsz + 2 * newsz));
898  if (UNLIKELY(!diag)) {
899  diag = olddiag;
900  STD__ cerr << "Allocation error" << STD__ endl;
901  return *this;
902  }
903  BDMATDBG(STD__ cerr << "BdMat realloc " << diag << " ("
904  << arsz+2*newsz << ")" << STD__ endl);
905  // Delete new elements
906  // Calculate new pointers
907  /* difference in units of sizeof(T) could fail and actually did, as NO
908  * alignment is guaranteed by new ! Ughh ! Hard to find bug ! KG, 98/09/04
909  * (Note: Stroustrup (3rd ed.) does not say anything about alignment for
910  * operator new, so I can't tell whether this is a bug or not.)
911  * maybe overloading of the new operator would be a good idea to ensure
912  * alignment */
913  long diff = (long)diag - (long)olddiag; //long is better for 64 bit platforms ..
914  if (LIKELY(diff)) {
915  //STD__ cout << "DEBUG: diag " << olddiag << " -> " << diag << " , ad0 " << adiag(0) << " -> " << adiag(0) + diff << STD__ endl;
916  /* ANSI C++ is really stupid about forbidding pointer arithmetic,
917  * never produced such ugly code before! KG, 98/09/04 */
918  adiag.set(0) = (T*)((long)adiag.get(0) + diff);
919  bdiag.set(0) = (T*)((long)bdiag.get(0) + diff);
920  const unsigned dcs = diagconf.size();
921  for (unsigned i = 0; i < dcs; ++i) {
922  const unsigned di = diagconf.get(i);
923  adiag.set(di) = (T*)((long)adiag.get(di) + diff);
924  bdiag.set(di) = (T*)((long)bdiag.get(di) + diff);
925  }
926  //STD__ cout << " -> " << adiag(0) << STD__ endl;
927  }
928  // Paranoia
929  BCHKNR(adiag.get(0) != diag || bdiag.get(0) != diag, BdMatrixErr, adddiag: mismatch while doing pointer arithmetic, diag - adiag.get(0));
930  // Set pointers for new diagonals
931  diagconf.append(ndg);
932  adiag.set(ndg) = diag+arsz; bdiag.set(ndg) = diag+arsz+newsz;
933  TBCIFILL((diag+arsz), (T)0, T, (unsigned)(2*newsz));
934  // New sizes ...
935  if (ndg > maxoff)
936  maxoff = ndg;
937  arsz += 2 * newsz;
938  //STD__ cout << "DEBUG: diagconf, diag, arsz: " << diagconf << " - " << diag << " - "<< arsz << "\n adiag: " << adiag << "\n bdiag: " << bdiag << STD__ endl;
939  return *this;
940 }
941 
942 #else /* BDMAT_ONE_BLOCK */
943 
// Per-diagonal adddiag: simply NEW one 2*(dim-ndg) block for the new band
// pair, zero it, and register the offset — no pointer rebasing needed.
944 template <typename T>
945 BdMatrix<T>& BdMatrix<T>::adddiag(const unsigned ndg)
946 {
947  const int newsz = dim - ndg; /* no of elements in new diagonal */
948  BCHK(newsz <= 0, BdMatrixErr, Diag out of range, ndg, *this);
949  // Diag already stored ?
950  if (LIKELY(diagconf.contains(ndg)))
951  return *this;
952  T* ptr = NEW(T, 2*newsz);
953  if (UNLIKELY(!ptr)) {
954  BCHK(1, BdMatrixErr, Allocation failed in adddiag, 2*newsz*sizeof(T), *this);
955  STD__ cerr << "BdMat adddiag allocation failed!" << STD__ endl;
956  return *this;
957  }
958  adiag.set(ndg) = ptr; bdiag.set(ndg) = ptr+newsz;
959  TBCIFILL(ptr, (T)0, T, (2*newsz));
960  diagconf.append(ndg);
961  // New sizes ...
962  if (ndg > maxoff)
963  maxoff = ndg;
964  arsz += 2 * newsz;
965  BDMATDBG(STD__ cerr << "BdMat adddiag " << ptr << " (" << arsz << ")" << STD__ endl);
966  return *this;
967 }
968 
969 #endif /* BDMAT_ONE_BLOCK */
970 
971 // Only really functional for new (not ONE_BLOCK) mem alloc strategy
971 // Only really functional for new (not ONE_BLOCK) mem alloc strategy
// ONE_BLOCK build: the band cannot be deallocated, so it is merely zeroed.
// Per-diagonal build: the band pair is freed, unregistered from diagconf,
// and maxoff is recomputed if the removed offset was the largest.
// NOTE(review): collapsed line 973 is presumably the signature
// `BdMatrix<T>& BdMatrix<T>::removediag(const unsigned dg)` — confirm.
972 template <typename T>
974 {
975  if (dg > maxoff)
976  return *this;
977 #ifdef BDMAT_ONE_BLOCK
978  TBCIFILL(adiag.get(dg), (T)0, T, (dim-dg));
979  TBCIFILL(bdiag.get(dg), (T)0, T, (dim-dg));
980 #else
981  if (adiag.get(dg)) {
982  BDMATDBG(STD__ cerr << "BdMat rmvdiag "
983  << MIN(adiag.get(dg), bdiag.get(dg)) << STD__ endl);
984  TBCIDELETE(T, MIN(adiag.get(dg),bdiag.get(dg)), 2*(dim-dg));
985  adiag.set(dg) = (T*)0; bdiag.set(dg) = (T*)0;
986  unsigned ix; diagconf.contains(dg, &ix);
987  diagconf.remove(ix); arsz -= 2*(dim-dg);
988  if (maxoff == dg)
989  maxoff = diagconf.max();
990  }
991 #endif
992  return *this;
993 }
994 
// Write val at (r,c), adding the required off-diagonal on demand — but only
// if |val| >= BD_MINVAL: effectively-zero values outside the current bands
// are silently dropped instead of triggering an allocation.
995 template <typename T>
996 BdMatrix<T>& BdMatrix<T>::autoinsert(const T& val, const unsigned r, const unsigned c)
997 {
998  REGISTER T* pt = check(r, c);
999  if (LIKELY(pt)) {
1000  *pt = val;
1001  return *this;
1002  } else {
1003  if (LIKELY(MATH__ fabs(val) >= BD_MINVAL)) {
1004  adddiag(CSTD__ abs((int)(r-c)));
1005  pt = check(r, c);
1006  BCHK(!pt, BdMatrixErr, Internal autoinsert error, 0, *this);
1007  *pt = val;
1008  }
1009  }
1010  return *this;
1011 }
1012 
// Write `val` into row r starting at column s, via autoinsert (so new
// diagonals are created as needed and near-zero values are skipped).
1013 template <typename T>
1014 BdMatrix<T>& BdMatrix<T>::set_row(const Vector<T>& val, const unsigned r, const unsigned s)
1015 {
1016  for (unsigned i=0; i<val.size(); ++i)
1017  autoinsert(val(i), r, i+s);
1018  return *this;
1019 }
1020 
// Extract row r as a dense zero-initialized vector: the diagonal element,
// plus for each configured offset di the upper-band element at column r+di
// (if in range) and the lower-band element at column r-di (if r >= di).
1021 template <typename T>
1022 TVector<T> BdMatrix<T>::get_row(const unsigned int r) const
1023 {
1024  TVector<T> ret(0, dim);
1025  BCHK(r >= dim, BdMatrixErr, get_row idx out of bounds, r, ret);
1026  ret.set(diag[r], r);
1027  for (unsigned i = 0; i < diagconf.size(); ++i) {
1028  const REGISTER unsigned di = diagconf.get(i);
1029  if (LIKELY(di < dim-r))
1030  ret.set(adiag.get(di)[r] , r+di);
1031  if (LIKELY(r >= di))
1032  ret.set(bdiag.get(di)[r-di], r-di);
1033  }
1034  return ret;
1035 }
1036 
// Extract column c as a dense zero-initialized vector — the mirror image of
// get_row(): lower band (bdiag) supplies rows below the diagonal, upper
// band (adiag) rows above it.
1037 template <typename T>
1038 TVector<T> BdMatrix<T>::get_col(const unsigned int c) const
1039 {
1040  TVector<T> ret(0, dim);
1041  BCHK(c >= dim, BdMatrixErr, get_col idx out of bounds, c, ret);
1042  ret.set(diag[c], c);
1043  for (unsigned i = 0; i < diagconf.size(); i++) {
1044  const REGISTER unsigned di = diagconf.get(i);
1045  if (LIKELY(di < dim-c))
1046  ret.set(bdiag.get(di)[c] , c+di);
1047  if (LIKELY(c >= di))
1048  ret.set(adiag.get(di)[c-di], c-di);
1049  }
1050  return ret;
1051 }
1052 
1053 #ifdef BDMAT_ONE_BLOCK
// ONE_BLOCK fill: set every stored element (diagonal AND bands) to val in
// a single pass over the contiguous block.
1054 template <typename T>
1055 inline BdMatrix<T>& BdMatrix<T>::fill(const T& val)
1056 {
1057  TBCIFILL(diag, val, T, arsz);
1058  return *this;
1059 }
1060 
// ONE_BLOCK clear: zero the whole contiguous block. NOTE(review):
// collapsed line 1062 is presumably the signature
// `inline BdMatrix<T>& BdMatrix<T>::clear()` — confirm.
1061 template <typename T>
1063 {
1064  TBCIFILL(diag, (T)0, T, arsz);
1065  return *this;
1066 }
1067 
1068 #else
1069 
// Per-diagonal fill: set the diagonal and each separately-allocated band
// to val.
1070 template <typename T>
1071 inline BdMatrix<T>& BdMatrix<T>::fill(const T& val)
1072 {
1073  TBCIFILL(diag, val, T, dim);
1074  for (unsigned i = 0; i < diagconf.size(); ++i) {
1075  const REGISTER unsigned di = diagconf.get(i);
1076  TBCIFILL(adiag.get(di), val, T, (dim-di));
1077  TBCIFILL(bdiag.get(di), val, T, (dim-di));
1078  }
1079  return *this;
1080 }
1081 
// Per-diagonal clear: zero the diagonal and each band. NOTE(review):
// collapsed line 1083 is presumably the signature
// `inline BdMatrix<T>& BdMatrix<T>::clear()` — confirm.
1082 template <typename T>
1084 {
1085  TBCIFILL(diag, (T)0, T, dim);
1086  for (unsigned i = 0; i < diagconf.size(); ++i) {
1087  const REGISTER unsigned di = diagconf.get(i);
1088  TBCIFILL(adiag.get(di), (T)0, T, (dim-di));
1089  TBCIFILL(bdiag.get(di), (T)0, T, (dim-di));
1090  }
1091  return *this;
1092 }
1093 #endif
1094 
1095 // expand to outermost diagonal (or m, if given)
1095 // expand to outermost diagonal (or m, if given)
// Ensure every offset 1..m is configured (full bandwidth m): ONE_BLOCK
// builds a complete new configuration and reconfig()s; the per-diagonal
// build just adddiag()s each missing offset. The early return fires when
// the storage already matches the full-band element count. NOTE(review):
// collapsed line 1097 is presumably the signature
// `BdMatrix<T>& BdMatrix<T>::expand(unsigned m)` — confirm.
1096 template <typename T>
1098 {
1099  if (UNLIKELY(m == 0))
1100  m = maxoff;
1101  BCHK(m>dim, BdMatrixErr, Wrong expansion size, m, *this);
1102  if (LIKELY(maxoff == m && arsz == (dim + 2*dim*m - m*(m+1))))
1103  return *this;
1104 #ifdef BDMAT_ONE_BLOCK
1105  BVector<unsigned> newconf(m);
1106  for (unsigned i = 0; i < m; ++i)
1107  newconf.set(i) = i+1;
1108  return reconfig(newconf);
1109 #else
1110  for (unsigned i=1; i<=m; ++i)
1111  adddiag(i);
1112  return *this;
1113 #endif
1114 }
1115 
1116 
1117 //#warning "Calculation of (sub)determinants not yet implemented!"
1118 #if 0
template <typename T>
// Recursive subdeterminant by Laplace expansion along a precomputed "best"
// (sparsest) row. DISABLED CODE (#if 0): kept for reference only.
T BdMatrix<T>::sdet(unsigned int level, BVector<unsigned int>& arow,
    BVector<unsigned int>& acol)
{
    T el;
    int sz = arow.size();
    //if (sz < 1)
    // STD__ cerr << "ERROR!" << STD__ endl;
    // Base cases: 1x1 and 2x2 subdeterminants computed directly.
    if (sz == 1)
        return this->operator() (arow(0), acol(0));
    if (sz == 2) {
        el = (this->operator() (arow(0), acol(0)) \
            * this->operator() (arow(1), acol(1)) \
            - this->operator() (arow(0), acol(1)) \
            * this->operator() (arow(1), acol(0)));
        return el;
    }
    T tmp = 0;
    //if (sz>=20) STD__ cout << sz << " ";
    // NOTE(review): a line was lost in the Doxygen extraction here —
    // presumably the declarations of the reduced index vectors 'ar' and
    // 'ac' used below. Confirm against the original source.
    int r = bestrow(level++); //row to expand
    // NOTE(review): 'mul' is only assigned inside the loop when arow(i)==r;
    // if r is not contained in arow it is used uninitialized at the return.
    int rw = 0; int mul;
    //setup rows
    for (REGISTER int i = 0; i < sz; i++) {
        if (arow(i) == r) {
            mul = (i%2? -1: 1); rw--;
        } else
            ar(rw) = arow(i);
        rw++;
    }
    for (REGISTER int j = 0; j < sz; j++) {
        el = this->operator() (r, acol(j));
        // Skip (near-)zero pivots: no contribution to the expansion.
        if (MATH__ fabs(el) < BD_MINVAL)
            continue;
        //setup cols if necessary
#if 0
        int cl = 0;
        for (REGISTER int t = 0; t < sz; t++) {
            if (t == j)
                cl--;
            else
                ac(cl) = acol(t);
            cl++;
        }
#else
        // Faster: copy the column index list around position j in two blocks.
        TBCICOPY(&(ac(0)), &(acol(0)), int, j);
        TBCICOPY(&(ac(j)), &(acol(j+1)), int, (sz-j-1));
#endif

        tmp += (j%2? -1: 1) * sdet(level, ar, ac) * el;
    }
    return tmp * mul;
}
1172 
template <typename T>
// NOTE(review): the function declarator line was lost in the Doxygen
// extraction; from the body this is the determinant entry point taking the
// row/column index sets 'ri'/'ci' to ignore. DISABLED CODE (#if 0); it also
// references 'bestrow', which is commented out in the class declaration,
// so it cannot compile as-is.
{
    // Row and column ignore-sets must pair up one-to-one.
    if (ri.size() != ci.size()) {
#ifndef TBCI_DISABLE_EXCEPT
        throw (BdMatrixErr("Unequal # of lines and rows to ignore!"));
#else
        STD__ cerr << "Unequal # of lines and rows to ignore!" << STD__ endl;
        return 0;
#endif
    }

    BVector<unsigned int> actrow(dim - ri.size());
    BVector<unsigned int> actcol(dim - ci.size());
    BVector<unsigned int> rowfill(dim);
    bestrow.resize(dim);

    // Build the lists of active (non-ignored) row and column indices.
    // NOTE(review): signed/unsigned comparisons against 'dim' below would
    // warn; harmless while the code stays disabled.
    unsigned int rw = 0; unsigned int cl = 0;
    for (REGISTER int j = 0; j < dim; j++) {
        if (ri.contains (j))
            rw--;
        else
            actrow(rw) = j;
        if (ci.contains(j))
            cl--;
        else
            actcol(cl) = j;
        rw++; cl++;
    }

    //STD__ cout << actrow << STD__ endl; STD__ cout << actcol << STD__ endl;

    // (pseudo-) count # of non-zero elements of rows
    matmem<T>* el = offdiag.setfirst(); int elemi = (el? el->row: -1);

    for (int i = 0; i < dim; i++) {
        // Ignored rows are pushed to the end of the sort below.
        if (!(actrow.contains(i))) {
            rowfill(i) = dim + 1; continue;
        }

        // Structural fill of row i: diagonal, first off-diagonals, ...
        int fill = 0;
        if (actcol.contains(i))
            fill++;
        if (actcol.contains(i+1) && i+1 < dim)
            fill++;
        if (actcol.contains(i-1) && i-1 >= 0)
            fill++;

        // ... the configured band diagonals ...
        for (REGISTER int s = 0; s < diagconf.size(); s++) {
            if (actcol.contains(i+diagconf.get(s)) && i+diagconf.get(s) < dim)
                fill++;
            if (actcol.contains(i-diagconf.get(s)) && i-diagconf.get(s) >= 0)
                fill++;
        }

        // ... and any extra sparse off-band elements stored in 'offdiag'.
        while (el && elemi == i) {
            if (actcol.contains(el->col))
                fill++;
            el = offdiag.setnext(); elemi = (el? el->row: -1);
        }

        rowfill(i) = fill;
    }
    //STD__ cout << "rowfill:" << rowfill << STD__ endl;
    // Selection-sort rows by fill count so sdet() expands sparse rows first.
    for (int k = 0; k < dim; k++) {
        int min = dim + 2; int minr = -1;
        for (int s = 0; s < dim; s++)
            if (rowfill(s) < min) {
                min = rowfill(s); minr = s;
            }
        bestrow(k) = minr;
        if (minr >= 0)
            rowfill(minr) = dim + 1;
    }
    return sdet(0, actrow, actcol);
}
1250 #endif /* 0 */
1251 
1252 
template <typename T>
// NOTE(review): the function declarator line was lost in the Doxygen
// extraction; from the body this is the copy-assignment operator
// (presumably BdMatrix<T>& BdMatrix<T>::operator=(const BdMatrix<T>& mat)).
{
    BCHK(mat.dim != dim, BdMatrixErr, Assignment from wrong size Matrix, mat.dim, *this);
    // Diagonal configurations differ: adopt the source's configuration and
    // reallocate our storage before copying element data.
    if (diagconf != mat.diagconf) {
#ifdef BOUNDCHECK
        STD__ cout << "old:" << mat.diagconf << "\nnew:" << diagconf << STD__ endl;
        STD__ cout << "Warning: Automatic reconfiguring BdMatrix in assignment!" << STD__ endl;
#endif
        // NOTE(review): one more line was lost here in the extraction
        // (likely a call freeing the old diagonal storage) — confirm.
        diagconf.resize(mat.diagconf.size());
        diagconf = mat.diagconf;
        constructor(dim);
    }

    dummy = mat.dummy;
    do_copy(mat);
    return *this;
}
1272 
1273 
1274 //writes value to diagonal and deletes off-diags
template <typename T>
// NOTE(review): declarator line lost in the Doxygen extraction; per the
// comment above this assigns 'val' to the whole diagonal and clears the
// off-diagonals by delegating to setunit() (presumably
// BdMatrix<T>& BdMatrix<T>::operator=(const T& val)).
{
    setunit(val); return *this;
}
1280 
1281 
1282 // Fully flexible now!
// Generates elementwise compound-assignment operators (e.g. += / -=) taking
// another BdMatrix. First pass: apply 'op' on the diagonal and on every
// off-diagonal the two matrices share (skipping offsets 'mat' lacks).
// Second pass (only when the configurations differ): add each diagonal that
// only 'mat' has via adddiag() and apply 'op' to it.
// NOTE(review): the BCHK message says "Adding" regardless of which operator
// is instantiated. The instantiation lines that followed this macro
// (presumably SFORALL_S(+=) and SFORALL_S(-=)) were lost in the Doxygen
// extraction. '//' comments cannot be added inside the macro body because
// they would swallow the line-continuation backslashes.
#define SFORALL_S(op) \
template <typename T> \
BdMatrix<T>& BdMatrix<T>::operator op (const BdMatrix<T>& mat) \
{ \
    BCHK(mat.dim != dim, BdMatrixErr, Adding wrong size BdMatrix, mat.dim, *this); \
    unsigned i, j; \
    for (i = 0; i < dim; ++i) \
        diag[i] op mat.diag[i]; \
    for (j = 0; j < diagconf.size(); ++j) { \
        const REGISTER unsigned di = diagconf.get(j); \
        if (LIKELY(mat.adiag.get(di))) \
            for (i = 0; i < (dim-di); ++i) { \
                adiag.set(di)[i] op mat.adiag.get(di)[i]; \
                bdiag.set(di)[i] op mat.bdiag.get(di)[i]; \
            } \
    } \
    if (diagconf == mat.diagconf) \
        return *this; \
    for (j = 0; j < mat.diagconf.size(); ++j) { \
        const REGISTER unsigned di = mat.diagconf.get(j); \
        if (!diagconf.contains(di)) { \
            adddiag(di); \
            for (i = 0; i < (dim-di); ++i) { \
                adiag.set(di)[i] op mat.adiag.get(di)[i]; \
                bdiag.set(di)[i] op mat.bdiag.get(di)[i]; \
            } \
        } \
    } \
    return *this; \
}
1313 
1316 
1317 
// Generates scalar compound-assignment operators (e.g. *=) that apply 'op'
// with a scalar to the diagonal and to every stored off-diagonal pair.
// NOTE(review): the instantiation line that followed (presumably
// SFORALL_T(*=), matching the commented-out += / -= below) was lost in the
// Doxygen extraction.
#define SFORALL_T(op) \
template <typename T> \
BdMatrix<T>& BdMatrix<T>::operator op (const T& mult) \
{ \
    for (REGISTER unsigned int i = 0; i < dim; ++i) \
        diag[i] op mult; \
    for (unsigned j = 0; j < diagconf.size(); ++j) { \
        const REGISTER unsigned di = diagconf.get(j); \
        for (unsigned k = 0; k < (dim-di); ++k) { \
            adiag.set(di)[k] op mult; \
            bdiag.set(di)[k] op mult; \
        } \
    } \
    return *this; \
}
1333 
1335 //SFORALL_T(+=)
1336 //SFORALL_T(-=)
1337 
1338 //The inline is necessary to prevent linker errors
//The inline is necessary to prevent linker errors
template <typename T>
inline BdMatrix<T>& BdMatrix<T>::operator /= (const T& div)
{
    BCHK((div==(T)0), BdMatrixErr, "Divide BdMatrix by zero", 0, *this);
    // Compute the reciprocal once and multiply: one division instead of one
    // per element. Wrong for integer T (1/div truncates) — hence the
    // INTDIVEQ specializations below.
    REGISTER T mult ALIGN(MIN_ALIGN) = (T)1 / div;
    for (REGISTER unsigned int i = 0; i < dim; ++i)
        diag[i] *= mult;
    for (unsigned j = 0; j < diagconf.size(); ++j) {
        const REGISTER unsigned di = diagconf.get(j);
        for (unsigned k = 0; k < (dim-di); ++k) {
            adiag.set(di)[k] *= mult;
            bdiag.set(di)[k] *= mult;
        }
    }
    return *this;
}
1355 
// Full specialization of operator/= for integer types: divides each element
// directly instead of multiplying by the (truncated-to-zero) reciprocal
// used by the generic floating-point version above.
#define INTDIVEQ(T) \
template <> \
inline BdMatrix < T >& BdMatrix< T >::operator /= (const T & div) \
{ \
    BCHK((div==( T )0), BdMatrixErr, "Divide BdMatrix by zero", 0, *this); \
    for (REGISTER unsigned i = 0; i < dim; ++i) \
        diag[i] /= div; \
    for (unsigned j = 0; j < diagconf.size(); ++j) { \
        const REGISTER unsigned di = diagconf.get(j); \
        for (unsigned k = 0; k < (dim-di); ++k) { \
            adiag.set(di)[k] /= div; \
            bdiag.set(di)[k] /= div; \
        } \
    } \
    return *this; \
}

// NOTE(review): an instantiation line was lost in the Doxygen extraction
// here — presumably INTDIVEQ(int).
INTDIVEQ(unsigned int)
1375 
1376 template <typename T>
1377 inline BdMatrix<T>& BdMatrix<T>::transpose()
1378 {
1379 #if 0 //def BDMAT_ONE_BLOCK
1380  // A swap_vectors implementation would be faster!
1381  for (REGISTER unsigned int t = 0; t < diagconf.size(); ++t) {
1382  const unsigned int s = diagconf.get(t);
1383  T* tmp = adiag.get(s); adiag.set(s) = bdiag.get(s); bdiag.set(s) = tmp;
1384  }
1385 #else
1386  adiag.swap(bdiag);
1387 #endif
1388  return *this;
1389 }
1390 
INST(template <typename T> class BdMatrix friend BdMatrix<T> transpose(BdMatrix<T>&);)
template <typename T>
// NOTE(review): the declarator line was lost in the Doxygen extraction;
// per the INST line above this is the free function
// BdMatrix<T> transpose(BdMatrix<T>& mat), which returns a transposed copy
// (the copy is transposed in place, the argument is left untouched).
{
    return (BdMatrix<T>(mat)).transpose();
}
1397 
// Generates the binary matrix-scalar operators (e.g. * and /) as
// copy-then-compound-assign: copies *this and applies op= with the scalar.
// NOTE(review): the instantiation lines that followed (presumably
// STDDEF_ST(*) and STDDEF_ST(/), matching the commented-out + and - below)
// were lost in the Doxygen extraction.
#define STDDEF_ST(op) \
template <typename T> \
inline BdMatrix<T> BdMatrix<T>::operator op (const T& val) const \
{ \
    return (BdMatrix<T> (*this)) op##= val; \
}
1404 
1407 //STDDEF_ST(+)
1408 //STDDEF_ST(-)
1409 
// Generates the binary matrix-matrix operators as copy-then-compound-assign
// on top of the SFORALL_S-generated op= operators.
// NOTE(review): the instantiation lines that followed (presumably
// STDDEF_SS(+) and STDDEF_SS(-)) were lost in the Doxygen extraction.
#define STDDEF_SS(op) \
template <typename T> \
BdMatrix<T> BdMatrix<T>::operator op (const BdMatrix<T>& m2) const \
{ \
    return (BdMatrix<T> (*this)) op##= m2; \
}
1416 
1419 
1420 
template <typename T>
// Matrix-matrix product. The product of two band matrices with bandwidths
// maxoff and mat2.maxoff is again a band matrix with bandwidth
// maxoff + mat2.maxoff, so the result is allocated with that band up front.
BdMatrix<T> BdMatrix<T>::operator * (const BdMatrix<T>& mat2) const
{
    const unsigned int newoff = maxoff + mat2.maxoff;
    //const unsigned int moff = MAX(maxoff, mat2.maxoff);
    BdMatrix<T> prd(0, dim, newoff);
    BCHK(dim != mat2.dim, BdMatrixErr, Multiplying diff size matrices, 0, prd);

    for (int i = 0; i < (signed)dim; ++i) {
        /* A really good compiler would do this for us */
        // Column range of row i in the product, and the k-range over which
        // this->get(i, k) can be non-zero (clipped to [0, dim)).
        const int jend = MIN(dim, i+newoff+1);
        const int kstart = MAX(0, i-(signed)maxoff);
        const int kend = MIN(dim, i+maxoff+1);
        /* TODO: Exact summing */
        for (int j = MAX(0,i-(signed)newoff); j < jend; ++j) {
            REGISTER T el ALIGN(MIN_ALIGN) = T(0);
            for (REGISTER int k = kstart; k < kend; ++k)
                el += this->get(i, k) * mat2.get(k, j);
            //prd.setval(i, j, el);
            // Only store non-zero results to keep the product sparse.
            if (el != T(0))
                prd(i, j) = el; /* autoinsert ? */
        }
    }
    return prd;
}
1446 
1447 INST(template <typename T> class BdMatrix friend BdMatrix<T> operator * (const T&, const BdMatrix<T>&);)
1448 template <typename T>
1449 inline BdMatrix<T> operator * (const T& v, const BdMatrix<T>& m)
1450 { return m*v; }
1451 // Assuming commutativity here!
1452 
1453 #if 0
template <typename T>
// NOTE(review): DISABLED CODE (#if 0). The declarator line was lost in the
// Doxygen extraction (from the body, presumably
// TMatrix<T> BdMatrix<T>::operator*(const TMatrix<T>& mat) const).
// Also note 'return tm' below is missing its semicolon — this would not
// compile if re-enabled.
{
    TMatrix<T> tm (mat.rows(), mat.columns());
    if (UNLIKELY(tm.rows() != dim)) {
#ifndef TBCI_DISABLE_EXCEPT
        throw (BdMatrixErr("Multiplying Matrix with Matrix: wrong sizes!"));
#else
        STD__ cerr << "Multiplying Matrix with Matrix: wrong sizes!" << STD__ endl;
        return tm;
#endif
    }

    // Naive triple loop over full rows/columns — ignores the band structure.
    T val ALIGN(MIN_ALIGN);
    for (int c = 0; c < tm.columns(); c++)
        for (int i = 0; i < mat.rows(); i++) {
            val = 0;
            for (REGISTER int j=0; j < dim; j++)
                val += this->operator()(i, j) * mat(j, c);
            tm(i, c) = val;
        }
    return tm
}
1477 #endif /* 0 */
1478 
1479 #ifdef USE_PREFETCH
1480 # define MAY_PREFETCH_R(adr,loc) PREFETCH_R(adr,loc)
1481 # define MAY_PREFETCH_W(adr,loc) PREFETCH_W(adr,loc)
1482 #else
1483 # define MAY_PREFETCH_R(adr,loc) do {} while (0)
1484 # define MAY_PREFETCH_W(adr,loc) do {} while (0)
1485 #endif
1486 
1487 
template <typename T>
// NOTE(review): the declarator line was lost in the Doxygen extraction;
// from the body this is unary negation (presumably
// BdMatrix<T> BdMatrix<T>::operator-() const), returning a new matrix with
// every stored element negated.
{
    // Result shares this matrix's size and diagonal configuration.
    BdMatrix<T> neg(dim, diagconf);
    //diagonal
    for (REGISTER unsigned int j = 0; j < dim; ++j)
        neg.diag[j] = -diag[j];
    //offdiag
    for (unsigned int t = 0; t < diagconf.size(); ++t) {
        const unsigned int dc = diagconf.get(t);
        for (REGISTER unsigned int i = 0; i < (dim - dc); ++i) {
            neg.adiag.set(dc)[i] = -adiag.get(dc)[i];
            neg.bdiag.set(dc)[i] = -bdiag.get(dc)[i];
        }
    }
    return neg;
}
1505 
/* Default is the optimized linewise multiply */
1507 #if !defined(BDMATVEC_LNWISE) && !defined(BDMATVEC_LNWISE_OPT) && !defined(BDMATVEC_DIAGWISE)
1508 # define BDMATVEC_LNWISE_OPT
1509 #endif
1510 
1511 /* Cost of these ops w/ dim dxd, o offdiags and a maxoffset of m */
1512 #define COST_BDMATVEC_LN(d,o,m) (d*(COST_MULT+2*COST_UNIT_LOAD+COST_UNIT_STORE+COST_LOOP\
1513  +o*(COST_LOOP+2*COST_BRANCH+COST_UNIT_LOAD))+(2*d-m)*o*(3*COST_NU_LOAD+COST_ADD+COST_MULT))
1514 #define COST_BDMATVEC_LNO(d,o,m) (d*(COST_MULT+2*COST_UNIT_LOAD+COST_UNIT_STORE+COST_LOOP\
1515  +o*(COST_LOOP+COST_UNIT_LOAD))+(2*d-m)*o*(3*COST_NU_LOAD+COST_ADD+COST_MULT+COST_BRANCH/2))
1516 #define COST_BDMATVEC_DIAG(d,o,m) (d*(2*COST_UNIT_LOAD+COST_MULT+COST_UNIT_STORE+COST_LOOP)\
1517  +o*(6*COST_NU_LOAD+COST_LOOP+3*COST_UNIT_LOAD+2*COST_NU_STORE\
1518  +(2*d-m)*(3*COST_UNIT_LOAD+COST_LOOP+COST_MULT+COST_ADD+COST_UNIT_STORE)))
1519 
template <typename T>
// Band-matrix times vector, simple row-wise kernel: res[i] = sum over the
// diagonal and all stored off-diagonals of row i. Computes rows
// [start, end). When do_exactsum2() is enabled, uses Kahan-compensated
// summation (the compensation 'comp' collects the rounding error of each
// addition and is subtracted at the end).
inline void do_bdmat_vec_mult_lnw(const unsigned start, const unsigned end,
    TVector<T> *res, const BdMatrix<T> *mat,
    const Vector<T> *vec)

{
    const unsigned int od = mat->diagconf.size();
    const unsigned int dim = mat->dim;
    // Row-wise, straightforward
    // TODO: Exact summation
    if (do_exactsum2()) {
        for (unsigned i = start; i < end; ++i) {
            // diag
            REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
            REGISTER T comp(0.0);
            // offdiag: bdiag holds the lower band (column i-s), adiag the
            // upper band (column i+s); both guarded against running off the
            // matrix edges.
            for (unsigned j = 0; j < od; ++j) {
                REGISTER const unsigned s = mat->diagconf.get(j);
                if (LIKELY(s <= i)) {
                    const REGISTER T y = mat->bdiag.get(s)[i-s] * vec->get(i-s);
                    const REGISTER T t = val+y;
                    comp += (t-val) - y;
                    val = t;
                }
                if (LIKELY(i+s < dim)) {
                    const REGISTER T y = mat->adiag.get(s)[i] * vec->get(i+s);
                    const REGISTER T t = val+y;
                    comp += (t-val) - y;
                    val = t;
                }
            }
            res->set(val-comp, i);
        }
    } else {
        // Plain (uncompensated) accumulation.
        for (unsigned i = start; i < end; ++i) {
            // diag
            REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
            // offdiag
            for (unsigned j = 0; j < od; ++j) {
                REGISTER const unsigned s = mat->diagconf.get(j);
                if (LIKELY(s <= i))
                    val += mat->bdiag.get(s)[i-s] * vec->get(i-s);
                if (LIKELY(i+s < dim))
                    val += mat->adiag.get(s)[i] * vec->get(i+s);
            }
            res->set(val, i);
        }
    }
}
1569 
template <typename T>
// Band-matrix times vector, optimized row-wise kernel for rows
// [start, end). The row range is split into three regions so that the
// boundary checks (i-s >= 0 for the lower band, i+s < dim for the upper
// band) are only performed where they can actually fail:
//   1) i < maxoff:            lower-band check only
//   2) maxoff <= i < dim-maxoff: no checks (exists only if 2*maxoff <= dim;
//      otherwise a both-checks middle loop runs instead)
//   3) i >= dim-maxoff:       upper-band check only
// Both an exact (Kahan-compensated) and a plain accumulation variant are
// provided, selected at runtime via do_exactsum2(). The MAY_PREFETCH_*
// macros are no-ops unless USE_PREFETCH is defined.
inline void do_bdmat_vec_mult_lnw_opt(const unsigned start, const unsigned end,
    TVector<T> *res, const BdMatrix<T> *mat,
    const Vector<T> *vec)

{
    const unsigned int od = mat->diagconf.size();
    const unsigned int dim = mat->dim;
    // Row-wise, optimized
    unsigned int i;
    //const unsigned dconf0 = mat->diagconf.get(0);
    const unsigned int maxoff = mat->maxoff;
    /* A very good compiler would do this for us */
    unsigned iend = MIN(maxoff, dim-maxoff);
    iend = MIN(iend, end);
    MAY_PREFETCH_R(&mat->diagconf.getcref(0), 2);
    //MAY_PREFETCH_R(&mat->adiag.get(0), 3);
    //MAY_PREFETCH_R(&mat->bdiag.get(0), 3);
#ifdef USE_PREFETCH
    // Warm the caches with the per-diagonal pointer tables and data.
    for (unsigned int _j = 0; _j < od; ++_j) {
        const unsigned s = mat->diagconf.get(_j);
        PREFETCH_R(&mat->adiag.getcref(s), 2);
        PREFETCH_R(&mat->bdiag.getcref(s), 2);
        PREFETCH_R(mat->adiag.get(s), 2);
        PREFETCH_R(mat->bdiag.get(s), 2);
    }
#endif
    MAY_PREFETCH_R(mat->diag, 2);
    MAY_PREFETCH_R(&vec->getcref(0), 2);
    MAY_PREFETCH_W(&res->setval(0), 3);
    if (do_exactsum2()) {
        /* First loop: Need to check bdiag, as i-s = i-(i-j) = j could be negative
         * adiag safe, as i < iend <=> i < N-maxoff and s <= maxoff => i+s < N
         */
        for (i = start; i < iend; ++i) {
            // diag
            REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
            REGISTER T comp(0.0);
            MAY_PREFETCH_R(&vec->getcref(i+1), 2);
            MAY_PREFETCH_R(mat->diag+i+1, 2);
            // offdiag
            for (unsigned int j = 0; j < od; ++j) {
                REGISTER const unsigned int s = mat->diagconf.get(j);
                if (s <= i) {
                    const T y = mat->bdiag.get(s)[i-s] * vec->get(i-s);
                    const T t = val + y;
                    MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
                    MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
                    comp += (t-val) - y;
                    val = t;
                }
                // s <= maxoff; i < N-maxoff => s+i < N
                /*if (s+i < dim)*/
                const T y = mat->adiag.get(s)[i] * vec->get(i+s);
                const T t = val + y;
                MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
                MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
                comp += (t-val) - y;
                val = t;
            }
            res->set(val-comp, i);
            MAY_PREFETCH_W(&res->setval(i+1), 3);
        }
        iend = MIN(end, dim - maxoff);
        if (i < iend) {
            /* Second loop: No checks needed; only executed if 2*maxoff <= mat->dim */
            for (; i < iend; ++i) {
                // diag
                REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
                REGISTER T comp(0.0);
                MAY_PREFETCH_R(&vec->getcref(i+1), 2);
                MAY_PREFETCH_R(mat->diag+i+1, 2);
                // offdiag, no checking necessary
                for (unsigned int j = 0; j < od; ++j) {
                    REGISTER const unsigned int s = mat->diagconf.get(j);
                    T y = mat->bdiag.get(s)[i-s] * vec->get(i-s);
                    T t = val + y;
                    MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
                    MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
                    comp += (t-val) - y;
                    val = t;
                    y = mat->adiag.get(s)[i] * vec->get(i+s);
                    t = val + y;
                    MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
                    MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
                    comp += (t-val) - y;
                    val = t;
                }
                res->set(val-comp, i);
                MAY_PREFETCH_W(&res->setval(i+1), 3);
            }
        } else {
            iend = MIN(maxoff, end);
            /* Second loop: Both checks needed; only executed if 2*maxoff > mat->dim */
            // TODO: Exact summation
            for (; i < iend; ++i) {
                // diag
                REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
                REGISTER T comp(0.0);
                MAY_PREFETCH_R(&vec->getcref(i+1), 2);
                MAY_PREFETCH_R(mat->diag+i+1, 2);
                // offdiag, both boundary checks required here
                for (unsigned int j = 0; j < od; ++j) {
                    REGISTER const unsigned int s = mat->diagconf.get(j);
                    if (s <= i) {
                        const T y = mat->bdiag.get(s)[i-s] * vec->get(i-s);
                        const T t = val + y;
                        MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
                        MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
                        comp += (t-val) - y;
                        val = t;
                    }
                    if (s < (signed)dim - i) {
                        const T y = mat->adiag.get(s)[i] * vec->get(i+s);
                        const T t = val + y;
                        MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
                        MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
                        comp += (t-val) - y;
                        val = t;
                    }
                }
                res->set(val-comp, i);
                MAY_PREFETCH_W(&res->setval(i+1), 3);
            }
        }
        /* Third loop: Need to check adiag, as i+s = i+(j-i) = j could be >= N
         * bdiag safe, as s <= maxoff and i >= maxoff => i-s >= 0
         */
        for (; i < end; ++i) {
            // diag
            REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
            REGISTER T comp(0.0);
            MAY_PREFETCH_R(&vec->getcref(i+1), 2);
            MAY_PREFETCH_R(mat->diag+i+1, 2);
            // offdiag
            for (unsigned int j = 0; j < od; ++j) {
                REGISTER const unsigned int s = mat->diagconf.get(j);
                /* if (s <= i) */
                T y = mat->bdiag.get(s)[i-s] * vec->get(i-s);
                T t = val + y;
                MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
                MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
                comp += (t-val) - y;
                val = t;
                if (s < (signed)dim - i) {
                    y = mat->adiag.get(s)[i] * vec->get(i+s);
                    t = val + y;
                    MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
                    MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
                    comp += (t-val) - y;
                    val = t;
                }
            }
            res->set(val-comp, i);
            MAY_PREFETCH_W(&res->setval(i+1), 3);
        }
    } else { /* Not exactsum2 */
        /* First loop: Need to check bdiag, as i-s = i-(i-j) = j could be negative
         * adiag safe, as i < iend <=> i < N-maxoff and s <= maxoff => i+s < N
         */
        for (i = start; i < iend; ++i) {
            // diag
            REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
            MAY_PREFETCH_R(&vec->getcref(i+1), 2);
            MAY_PREFETCH_R(mat->diag+i+1, 2);
            // offdiag
            for (unsigned int j = 0; j < od; ++j) {
                REGISTER const unsigned int s = mat->diagconf.get(j);
                if (s <= i) {
                    val += mat->bdiag.get(s)[i-s] * vec->get(i-s);
                    MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
                    MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
                }
                // s <= maxoff; i < N-maxoff => s+i < N
                /*if (s+i < dim)*/
                val += mat->adiag.get(s)[i] * vec->get(i+s);
                MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
                MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
            }
            res->set(val, i);
            MAY_PREFETCH_W(&res->setval(i+1), 3);
        }
        iend = MIN(end, dim - maxoff);
        if (i < iend) {
            /* Second loop: No checks needed; only executed if 2*maxoff <= mat->dim */
            for (; i < iend; ++i) {
                // diag
                REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
                MAY_PREFETCH_R(&vec->getcref(i+1), 2);
                MAY_PREFETCH_R(mat->diag+i+1, 2);
                // offdiag, no checking necessary
                for (unsigned int j = 0; j < od; ++j) {
                    REGISTER const unsigned int s = mat->diagconf.get(j);
                    val += mat->bdiag.get(s)[i-s] * vec->get(i-s);
                    MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
                    MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
                    val += mat->adiag.get(s)[i] * vec->get(i+s);
                    MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
                    MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
                }
                res->set(val, i);
                MAY_PREFETCH_W(&res->setval(i+1), 3);
            }
        } else {
            iend = MIN(maxoff, end);
            /* Second loop: Both checks needed; only executed if 2*maxoff > mat->dim */
            for (; i < iend; ++i) {
                // diag
                REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
                MAY_PREFETCH_R(&vec->getcref(i+1), 2);
                MAY_PREFETCH_R(mat->diag+i+1, 2);
                // offdiag, both boundary checks required here
                for (unsigned int j = 0; j < od; ++j) {
                    REGISTER const unsigned int s = mat->diagconf.get(j);
                    if (s <= i) {
                        val += mat->bdiag.get(s)[i-s] * vec->get(i-s);
                        MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
                        MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
                    }
                    if (s < (signed)dim - i) {
                        val += mat->adiag.get(s)[i] * vec->get(i+s);
                        MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
                        MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
                    }
                }
                res->set(val, i);
                MAY_PREFETCH_W(&res->setval(i+1), 3);
            }
        }
        /* Third loop: Need to check adiag, as i+s = i+(j-i) = j could be >= N
         * bdiag safe, as s <= maxoff and i >= maxoff => i-s >= 0
         */
        for (; i < end; ++i) {
            // diag
            REGISTER T val ALIGN(MIN_ALIGN2) = mat->diag[i] * vec->get(i);
            MAY_PREFETCH_R(&vec->getcref(i+1), 2);
            MAY_PREFETCH_R(mat->diag+i+1, 2);
            // offdiag
            for (unsigned int j = 0; j < od; ++j) {
                REGISTER const unsigned int s = mat->diagconf.get(j);
                /* if (s <= i) */
                val += mat->bdiag.get(s)[i-s] * vec->get(i-s);
                MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
                MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
                if (s < (signed)dim - i) {
                    val += mat->adiag.get(s)[i] * vec->get(i+s);
                    MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
                    MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
                }
            }
            res->set(val, i);
            MAY_PREFETCH_W(&res->setval(i+1), 3);
        }
    }
}
1825 
template <typename T>
// Band-matrix times vector, diagonal-wise kernel with compensated
// summation for rows [start, end). Accumulates one diagonal at a time into
// 'res' while collecting the per-row rounding errors in the 'corr' vector
// (one compensation slot per output row), applied at the end.
inline void do_bdmat_vec_mult_diagw_exact(const unsigned start, const unsigned end,
    TVector<T> *res, const BdMatrix<T> *mat,
    const Vector<T> *vec)

{
    const unsigned int od = mat->diagconf.size();
    const unsigned int dim = mat->dim;
    //Diagonal wise
    // We need to store a complete vector full of compensation values
    // and this is actually FASTER than the linewise algo
    // that we could employ alternatively.
    // TODO: Find heuristics to determine what's better
    // (current suspicion: for rather sparse matrices, the lnw is better,
    // for rather full matrices, the compensation vector will work best)
    // diag
    Vector<T> corr((T)0, end-start);
    do_vec_vec_mul(end-start, &res->setval(start),
        &mat->diag[start], &vec->getcref(start));
    // offdiag
    for (unsigned j = 0; j < od; ++j) {
        // s is the offset of this diagonal (1<=s<=maxoff)
        const unsigned s = mat->diagconf.get(j);
        // Row ranges touched by the upper (adiag: rows start..iend) and
        // lower (bdiag: rows istart2+s..iend2+s) diagonals; the casts to
        // int clamp the unsigned-underflow cases to 0.
        const unsigned iend = MIN(end,dim-s);
        const unsigned iend2 = MAX(0,(int)(end-s));
        const unsigned istart2 = MAX((int)(start-s),0);
        //do_add_vec_vec_mul(iend-start, &res->setval(start),
        // mat->adiag.get(s)+start, &vec->getcref(start+s));
        for (unsigned off = start; off < iend; ++off) {
            const REGISTER T y = *(mat->adiag.get(s)+off) * vec->get(s+off);
            const REGISTER T t = res->get(off) + y;
            corr.setval(off-start) += (t - res->get(off)) - y;
            res->setval(off) = t;
        }
        //do_add_vec_vec_mul(iend2-istart2, &res->setval(istart2+s),
        // mat->bdiag.get(s)+istart2, &vec->getcref(istart2));
        for (unsigned off = istart2; off < iend2; ++off) {
            const REGISTER T y = *(mat->bdiag.get(s)+off) * vec->get(off);
            const REGISTER T t = res->get(s+off) + y;
            corr.setval(s+off-start) += (t - res->get(s+off)) - y;
            res->setval(s+off) = t;
        }
    }
    /* Apply correction */
    for (unsigned off = start; off < end; ++off)
        res->setval(off) -= corr.get(off-start);
}
1873 
template <typename T>
// Band-matrix times vector dispatcher for rows [start, end).
// With exact summation enabled it picks the row-wise compensated kernel for
// narrow bands (< 8 off-diagonals) and the diagonal-wise compensated kernel
// otherwise. Without exact summation it runs the plain diagonal-wise
// algorithm built from the do_(add_)vec_vec_mul BLAS-1 style helpers.
inline void do_bdmat_vec_mult(const unsigned start, const unsigned end,
    TVector<T> *res, const BdMatrix<T> *mat,
    const Vector<T> *vec)

{
    const unsigned int dim = mat->dim;
    const unsigned int od = mat->diagconf.size();
    // Diagonal wise
    // TODO: Find heuristics to determine what's better
    // (current suspicion: for rather sparse matrices, the lnw is better,
    // for rather full matrices, the compensation vector will work best)
    if (do_exactsum2()) {
        if (od < 8)
            do_bdmat_vec_mult_lnw(start, end, res, mat, vec);
        else
            do_bdmat_vec_mult_diagw_exact(start, end, res, mat, vec);
        return;
    }
    // diag
    do_vec_vec_mul(end-start, &res->setval(start),
        &mat->diag[start], &vec->getcref(start));
    // offdiag
    for (unsigned j = 0; j < od; ++j) {
        // s is the offset of this diagonal (1<=s<=maxoff)
        const unsigned s = mat->diagconf.get(j);
        // Clip each diagonal's contribution to the requested row window;
        // the int casts clamp unsigned underflow to 0.
        const unsigned iend = MIN(end,dim-s);
        const unsigned iend2 = MAX(0,(int)(end-s));
        const unsigned istart2 = MAX((int)(start-s),0);
        if (LIKELY(iend > start))
            do_add_vec_vec_mul(iend-start, &res->setval(start),
                mat->adiag.get(s)+start, &vec->getcref(start+s));
        if (LIKELY(iend2 > istart2))
            do_add_vec_vec_mul(iend2-istart2, &res->setval(istart2+s),
                mat->bdiag.get(s)+istart2, &vec->getcref(istart2));
    }
}
1911 
1912 #if defined(BDMATVEC_LNWISE)
1913 # define COST_BDMATVEC(d,o,m) COST_BDMATVEC_LN(d,o,m)
1914 #elif defined(BDMATVEC_LNWISE_OPT)
1915 # define COST_BDMATVEC(d,o,m) COST_BDMATVEC_LNO(d,o,m)
1916 #elif defined(BDMATVEC_DIAGWISE)
1917 # define COST_BDMATVEC(d,o,m) COST_BDMATVEC_DIAG(d,o,m)
1918 #endif
1919 
1920 #define COST_BDMATVEC_TRANS(d,o,m) COST_BDMATVEC(d,o,m)
template <typename T>
// Transposed band-matrix times vector, simple row-wise kernel for rows
// [start, end): identical to do_bdmat_vec_mult_lnw except that the roles of
// adiag (upper band) and bdiag (lower band) are exchanged, which realizes
// the transpose without touching the matrix storage. Kahan-compensated
// accumulation when do_exactsum2() is enabled.
inline void do_bdmat_vec_transmult_lnw(const unsigned start, const unsigned end,
    TVector<T> *res, const BdMatrix<T> *mat,
    const Vector<T> *vec)
{
    const unsigned od = mat->diagconf.size();
    const unsigned int dim = mat->dim;
    if (do_exactsum2()) {
        for (unsigned i = start; i < end; ++i) {
            //diagonal
            REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
            REGISTER T comp(0.0);
            //off-diagonals (note: adiag/bdiag swapped w.r.t. the
            //non-transposed kernel)
            for (unsigned int j = 0; j < od; ++j) {
                REGISTER const unsigned s = mat->diagconf.get(j);
                if (LIKELY(s <= i)) {
                    const T y = mat->adiag.get(s)[i-s] * vec->get(i-s);
                    const T t = val + y;
                    comp += (t-val) - y;
                    val = t;
                }
                if (LIKELY(i+s < dim)) {
                    const T y = mat->bdiag.get(s)[i] * vec->get(i+s);
                    const T t = val + y;
                    comp += (t-val) - y;
                    val = t;
                }
            }
            res->set(val-comp, i);
        }
    } else {
        // Plain (uncompensated) accumulation.
        for (unsigned i = start; i < end; ++i) {
            //diagonal
            REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
            //off-diagonals
            for (unsigned int j = 0; j < od; ++j) {
                REGISTER const unsigned s = mat->diagconf.get(j);
                if (LIKELY(s <= i))
                    val += mat->adiag.get(s)[i-s] * vec->get(i-s);
                if (LIKELY(i+s < dim))
                    val += mat->bdiag.get(s)[i] * vec->get(i+s);
            }
            res->set(val, i);
        }
    }
}
1967 
1968 template <typename T>
1969 inline void do_bdmat_vec_transmult_lnw_opt(const unsigned start, const unsigned end,
1970  TVector<T> *res, const BdMatrix<T> *mat,
1971  const Vector<T> *vec)
1972 {
1973  const unsigned od = mat->diagconf.size();
1974  const unsigned int dim = mat->dim;
1975  // Zeilenweise, optimiert
1976  //const unsigned dconf0 = mat->diagconf.get(0);
1977  unsigned int i;
1978  const unsigned int maxoff = mat->maxoff;
1979  /* A very good compiler would do this for us */
1980  unsigned iend = MIN(maxoff, mat->dim-maxoff);
1981  iend = MIN(end, iend);
1982  MAY_PREFETCH_R(&mat->diagconf.getcref(0), 2);
1983  //MAY_PREFETCH_R(&mat->adiag.getcref(0), 2);
1984  //MAY_PREFETCH_R(&mat->bdiag.getcref(0), 2);
1985 #ifdef USE_PREFETCH
1986  for (unsigned int _j = 0; _j < od; ++_j) {
1987  const unsigned s = mat->diagconf.get(_j);
1988  PREFETCH_R(&mat->adiag.getcref(s), 2);
1989  PREFETCH_R(&mat->bdiag.getcref(s), 2);
1990  PREFETCH_R(mat->adiag.get(s), 2);
1991  PREFETCH_R(mat->bdiag.get(s), 2);
1992  }
1993 #endif
1994  MAY_PREFETCH_R(mat->diag, 2);
1995  MAY_PREFETCH_R(&vec->getcref(0), 2);
1996  MAY_PREFETCH_W(&res->setval(0), 3);
1997  /* First loop: Need to check adiag, as i-s could be smaller than 0
1998  * bdiag safe, as i < N-maxoff and s <= maxoff => i+s < N
1999  */
2000  if (do_exactsum2()) {
2001  for (i = start; i < iend; ++i) {
2002  // diag
2003  REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2004  REGISTER T comp(0.0);
2005  MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2006  MAY_PREFETCH_R(mat->diag+i+1, 2);
2007  // offdiag
2008  for (unsigned int j = 0; j < od; ++j) {
2009  REGISTER const unsigned int s = mat->diagconf.get(j);
2010  if (s <= i) {
2011  const T y = mat->adiag.get(s)[i-s] * vec->get(i-s);
2012  const T t = val + y;
2013  MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2014  MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2015  comp += (t-val) - y;
2016  val = t;
2017  }
2018  /*if (s < mat->dim - i)*/
2019  const T y = mat->bdiag.get(s)[i] * vec->get(i+s);
2020  const T t = val + y;
2021  MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2022  MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2023  comp += (t-val) - y;
2024  val = t;
2025  }
2026  res->set(val-comp, i);
2027  MAY_PREFETCH_W(&res->setval(i+1), 3);
2028  }
2029  iend = MIN(end, dim - maxoff);
2030  if (i < iend) {
2031  /* Second loop, no checks necessary, 2*maxoff < N */
2032  // TODO: Exact summation
2033  for (; i < iend; ++i) {
2034  // diag
2035  REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2036  REGISTER T comp(0.0);
2037  MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2038  MAY_PREFETCH_R(mat->diag+i+1, 2);
2039  // offdiag, no checking necessary
2040  for (unsigned int j = 0; j < od; ++j) {
2041  REGISTER const unsigned int s = mat->diagconf.get(j);
2042  T y = mat->adiag.get(s)[i-s] * vec->get(i-s);
2043  T t = val + y;
2044  MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2045  MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2046  comp += (t-val) - y;
2047  val = t;
2048  y = mat->bdiag.get(s)[i] * vec->get(i+s);
2049  t = val + y;
2050  MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2051  MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2052  comp += (t-val) - y;
2053  val = t;
2054  }
2055  res->set(val-comp, i);
2056  MAY_PREFETCH_W(&res->setval(i+1), 3);
2057  }
2058  } else {
2059  iend = MIN(end, maxoff);
2060  /* Second loop, both checks necessary */
2061  // TODO: Exact summation
2062  for (; i < iend; ++i) {
2063  // diag
2064  REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2065  REGISTER T comp(0.0);
2066  MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2067  MAY_PREFETCH_R(mat->diag+i+1, 2);
2068  // offdiag, no checking necessary
2069  for (unsigned int j = 0; j < od; ++j) {
2070  REGISTER const unsigned int s = mat->diagconf.get(j);
2071  if (s <= i) {
2072  const T y = mat->adiag.get(s)[i-s] * vec->get(i-s);
2073  const T t = val + y;
2074  MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2075  MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2076  comp += (t-val) - y;
2077  val = t;
2078  }
2079  if (s < (signed)dim - i) {
2080  const T y = mat->bdiag.get(s)[i] * vec->get(i+s);
2081  const T t = val + y;
2082  MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2083  MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2084  comp += (t-val) - y;
2085  val = t;
2086  }
2087  }
2088  res->set(val-comp, i);
2089  MAY_PREFETCH_W(&res->setval(i+1), 3);
2090  }
2091  }
2092  /* Third loop, adiag safe */
2093  // TODO: Exact summation
2094  for (; i < end; ++i) {
2095  // diag
2096  REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2097  REGISTER T comp(0.0);
2098  MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2099  MAY_PREFETCH_R(mat->diag+i+1, 2);
2100  // offdiag
2101  for (unsigned int j = 0; j < od; ++j) {
2102  REGISTER const unsigned int s = mat->diagconf.get(j);
2103  /* if (s <= i) */
2104  T y = mat->adiag.get(s)[i-s] * vec->get(i-s);
2105  T t = val + y;
2106  MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2107  MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2108  comp += (t-val) - y;
2109  val = t;
2110  if (s < (signed)dim - i) {
2111  y = mat->bdiag.get(s)[i] * vec->get(i+s);
2112  t = val + y;
2113  MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2114  MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2115  comp += (t-val) - y;
2116  val = t;
2117  }
2118  }
2119  res->set(val-comp, i);
2120  MAY_PREFETCH_W(&res->setval(i+1), 3);
2121  }
2122  } else {
2123  for (i = start; i < iend; ++i) {
2124  // diag
2125  REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2126  MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2127  MAY_PREFETCH_R(mat->diag+i+1, 2);
2128  // offdiag
2129  for (unsigned int j = 0; j < od; ++j) {
2130  REGISTER const unsigned int s = mat->diagconf.get(j);
2131  if (s <= i) {
2132  val += mat->adiag.get(s)[i-s] * vec->get(i-s);
2133  MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2134  MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2135  }
2136  /*if (s < mat->dim - i)*/
2137  val += mat->bdiag.get(s)[i] * vec->get(i+s);
2138  MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2139  MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2140  }
2141  res->set(val, i);
2142  MAY_PREFETCH_W(&res->setval(i+1), 3);
2143  }
2144  iend = MIN(end, dim - maxoff);
2145  if (i < iend) {
2146  /* Second loop, no checks necessary, 2*maxoff < N */
2147  // TODO: Exact summation
2148  for (; i < iend; ++i) {
2149  // diag
2150  REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2151  MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2152  MAY_PREFETCH_R(mat->diag+i+1, 2);
2153  // offdiag, no checking necessary
2154  for (unsigned int j = 0; j < od; ++j) {
2155  REGISTER const unsigned int s = mat->diagconf.get(j);
2156  val += mat->adiag.get(s)[i-s] * vec->get(i-s);
2157  MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2158  MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2159  val += mat->bdiag.get(s)[i] * vec->get(i+s);
2160  MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2161  MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2162  }
2163  res->set(val, i);
2164  MAY_PREFETCH_W(&res->setval(i+1), 3);
2165  }
2166  } else {
2167  iend = MIN(end, maxoff);
2168  /* Second loop, both checks necessary */
2169  // TODO: Exact summation
2170  for (; i < iend; ++i) {
2171  // diag
2172  REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2173  MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2174  MAY_PREFETCH_R(mat->diag+i+1, 2);
2175  // offdiag, no checking necessary
2176  for (unsigned int j = 0; j < od; ++j) {
2177  REGISTER const unsigned int s = mat->diagconf.get(j);
2178  if (s <= i) {
2179  val += mat->adiag.get(s)[i-s] * vec->get(i-s);
2180  MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2181  MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2182  }
2183  if (s < (signed)dim - i) {
2184  val += mat->bdiag.get(s)[i] * vec->get(i+s);
2185  MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2186  MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2187  }
2188  }
2189  res->set(val, i);
2190  MAY_PREFETCH_W(&res->setval(i+1), 3);
2191  }
2192  }
2193  /* Third loop, adiag safe */
2194  // TODO: Exact summation
2195  for (; i < end; ++i) {
2196  // diag
2197  REGISTER T val ALIGN(MIN_ALIGN) = mat->diag[i] * vec->get(i);
2198  MAY_PREFETCH_R(&vec->getcref(i+1), 2);
2199  MAY_PREFETCH_R(mat->diag+i+1, 2);
2200  // offdiag
2201  for (unsigned int j = 0; j < od; ++j) {
2202  REGISTER const unsigned int s = mat->diagconf.get(j);
2203  /* if (s <= i) */
2204  val += mat->adiag.get(s)[i-s] * vec->get(i-s);
2205  MAY_PREFETCH_R(mat->adiag.get(s)+i+1-s, 2);
2206  MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
2207  if (s < (signed)dim - i) {
2208  val += mat->bdiag.get(s)[i] * vec->get(i+s);
2209  MAY_PREFETCH_R(mat->bdiag.get(s)+i+1, 2);
2210  MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
2211  }
2212  }
2213  res->set(val, i);
2214  MAY_PREFETCH_W(&res->setval(i+1), 3);
2215  }
2216  }
2217 }
2218 
/* Transposed band-matrix * vector on rows [start, end), computed diagonal
 * by diagonal with Kahan-style compensated ("exact") summation:
 *   res[i] = sum_k mat(k,i) * vec[k]
 * Because several diagonals contribute to each result row, the per-row
 * compensation terms are collected in a separate vector and applied in
 * one final pass. */
template <typename T>
inline void do_bdmat_vec_transmult_diagw_exact(const unsigned start, const unsigned end,
				TVector<T> *res, const BdMatrix<T> *mat,
				const Vector<T> *vec)
{
	const unsigned int dim = mat->dim;
	const unsigned od = mat->diagconf.size();
	// diagonal wise
	// We need to store a vector of correction values ...
	Vector<T> corr((T)0, end-start);
	// diag: res[start..end) = diag .* vec
	do_vec_vec_mul(end-start, &res->setval(start),
			&mat->diag[start], &vec->getcref(start));
	// offdiag
	for (unsigned j = 0; j < od; ++j) {
		// s is the number of the diagonal (1<=s<=maxoff)
		const int s = mat->diagconf.get(j);
		const int iend = MIN(end,dim-s);
		const int iend2 = end-s;
		const int istart2 = MAX((int)(start-s),0);
		/* Upper diagonals of mat feed rows below the main diagonal of
		 * the transpose: res[off+s] += adiag[off] * vec[off]. */
		//do_add_vec_vec_mul(iend2-istart2, &res->setval(istart2+s),
		//		mat->adiag.get(s)+istart2, &vec->getcref(istart2));
		for (int off = istart2; off < iend2; ++off) {
			const REGISTER T y = *(mat->adiag.get(s)+off) * vec->get(off);
			const REGISTER T t = res->get(off+s) + y;
			// Kahan step: capture the low-order bits lost in the add
			corr.setval(off+s-start) += (t - res->get(off+s)) - y;
			res->setval(off+s) = t;
		}
		/* Lower diagonals feed rows above: res[off] += bdiag[off] * vec[off+s]. */
		//do_add_vec_vec_mul(iend-start, &res->setval(start),
		//		mat->bdiag.get(s)+start, &vec->getcref(start+s));
		for (int off = start; off < iend; ++off) {
			const REGISTER T y = *(mat->bdiag.get(s)+off) * vec->get(off+s);
			const REGISTER T t = res->get(off) + y;
			corr.setval(off-start) += (t - res->get(off)) - y;
			res->setval(off) = t;
		}
	}
	// Apply correction
	for (unsigned off = start; off < end; ++off)
		res->setval(off) -= corr(off-start);
}
2260 
2261 template <typename T>
2262 inline void do_bdmat_vec_transmult(const unsigned start, const unsigned end,
2263  TVector<T> *res, const BdMatrix<T> *mat,
2264  const Vector<T> *vec)
2265 {
2266  const unsigned int dim = mat->dim;
2267  const unsigned od = mat->diagconf.size();
2268  if (do_exactsum2()) {
2269  //do_bdmat_vec_transmult_lnw(start, end, res, mat, vec);
2270  do_bdmat_vec_transmult_diagw_exact(start, end, res, mat, vec);
2271  return;
2272  }
2273  // diagonal wise
2274  // diag
2275  do_vec_vec_mul(end-start, &res->setval(start),
2276  &mat->diag[start], &vec->getcref(start));
2277  // offdiag
2278  for (unsigned j = 0; j < od; ++j) {
2279  // s ist die Nummer der Diagonalen (1<=s<=maxoff)
2280  const int s = mat->diagconf.get(j);
2281  const int iend = MIN(end,dim-s);
2282  const int iend2 = end-s;
2283  const int istart2 = MAX((int)(start-s),0);
2284  do_add_vec_vec_mul(iend2-istart2, &res->setval(istart2+s),
2285  mat->adiag.get(s)+istart2, &vec->getcref(istart2));
2286  do_add_vec_vec_mul(iend-start, &res->setval(start),
2287  mat->bdiag.get(s)+start, &vec->getcref(start+s));
2288  }
2289 }
2290 
2291 #define COST_BDMATVEC_DOT(d,o,m) COST_BDMATVEC(d,o,m)
/* Dot (conjugated / Hermitian-transpose) band-matrix * vector on rows
 * [start, end):  res[i] = sum_k conj(mat(k,i)) * vec[k].
 * Relative to the plain product the roles of adiag/bdiag are swapped and
 * every matrix element is conjugated (CPLX__ conj is a no-op for real T).
 * Three compile-time variants: row-wise straightforward, row-wise
 * optimized with prefetching, and diagonal-wise via the vector kernels. */
template <typename T>
inline void do_bdmat_vec_dotmult(const unsigned start, const unsigned end,
				TVector<T> *res, const BdMatrix<T> *mat,
				const Vector<T> *vec)
{
	const unsigned int od = mat->diagconf.size();
	const unsigned int dim = mat->dim;
#ifdef BDMATVEC_LNWISE
	// Row-wise, straightforward
	if (do_exactsum2()) {
		/* Kahan-compensated row sums */
		for (unsigned i = start; i < end; ++i) {
			// diag
			REGISTER T val ALIGN(MIN_ALIGN) = CPLX__ conj(mat->diag[i]) * vec->get(i);
			REGISTER T comp(0.0);
			// offdiag
			for (unsigned j = 0; j < od; ++j) {
				REGISTER const unsigned s = mat->diagconf.get(j);
				if (LIKELY(s <= i)) {
					const T y = CPLX__ conj(mat->bdiag.get(s)[i-s]) * vec->get(i-s);
					const T t = val + y;
					comp += (t-val) - y;
					val = t;
				}
				if (LIKELY(i+s < dim)) {
					const T y = CPLX__ conj(mat->adiag.get(s)[i]) * vec->get(i+s);
					const T t = val + y;
					comp += (t-val) - y;
					val = t;
				}
			}
			res->set(val-comp, i);
		}
	} else {
		for (unsigned i = start; i < end; ++i) {
			// diag
			REGISTER T val ALIGN(MIN_ALIGN) = CPLX__ conj(mat->diag[i]) * vec->get(i);
			// offdiag
			for (unsigned j = 0; j < od; ++j) {
				REGISTER const unsigned s = mat->diagconf.get(j);
				if (LIKELY(s <= i))
					val += CPLX__ conj(mat->bdiag.get(s)[i-s]) * vec->get(i-s);
				if (LIKELY(i+s < dim))
					val += CPLX__ conj(mat->adiag.get(s)[i]) * vec->get(i+s);
			}
			res->set(val, i);
		}
	}
#elif defined(BDMATVEC_LNWISE_OPT)
	// Row-wise, optimized
	unsigned int i;
	//const unsigned dconf0 = mat->diagconf.get(0);
	const unsigned int maxoff = mat->maxoff;
	/* A very good compiler would do this for us */
	unsigned iend = MIN(maxoff, dim-maxoff);
	iend = MIN(iend, end);
	MAY_PREFETCH_R(&mat->diagconf.getcref(0), 2);
	//MAY_PREFETCH_R(&mat->adiag(0), 3);
	//MAY_PREFETCH_R(&mat->bdiag(0), 3);
#ifdef USE_PREFETCH
	/* Warm up the diagonal pointer tables and diagonal start addresses */
	for (unsigned int _j = 0; _j < od; ++_j) {
		const unsigned s = mat->diagconf.get(_j);
		PREFETCH_R(&mat->adiag.getcref(s), 2);
		PREFETCH_R(&mat->bdiag.getcref(s), 2);
		PREFETCH_R(mat->adiag.get(s), 2);
		PREFETCH_R(mat->bdiag.get(s), 2);
	}
#endif
	MAY_PREFETCH_R(mat->diag, 2);
	MAY_PREFETCH_R(&vec->getcref(0), 2);
	MAY_PREFETCH_W(&res->setval(0), 3);
	/* First loop: Need to check bdiag, as i-s = i-(i-j) = j could be negative
	 * adiag safe, as i < iend <=> i < N-maxoff and s <= maxoff => i+s < N
	 */
	// TODO: Exact summation
	// NOTE: Not implemented here, as dotMult is not even used by the solvers ...
	for (i = start; i < iend; ++i) {
		// diag
		REGISTER T val ALIGN(MIN_ALIGN2) = CPLX__ conj(mat->diag[i]) * vec->get(i);
		MAY_PREFETCH_R(&vec->getcref(i+1), 2);
		MAY_PREFETCH_R(mat->diag+i+1, 2);
		// offdiag
		for (unsigned int j = 0; j < od; ++j) {
			REGISTER const unsigned int s = mat->diagconf.get(j);
			if (s <= i) {
				val += CPLX__ conj(mat->bdiag.get(s)[i-s]) * vec->get(i-s);
				MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
				MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
			}
			// s <= maxoff; i < N-maxoff => s+i < N
			/*if (s+i < dim)*/
			val += CPLX__ conj(mat->adiag.get(s)[i]) * vec->get(i+s);
			MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
			MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
		}
		res->set(val, i);
		MAY_PREFETCH_W(&res->setval(i+1), 3);
	}
	iend = MIN(end, dim - maxoff);
	/* i = MAX(start,MIN(maxoff,dim-maxoff,end)) */
	if (i < iend) {
		/* Second loop: No checks needed; only executed if 2*maxoff <= mat->dim */
		// TODO: Exact summation
		for (; i < iend; ++i) {
			// diag
			REGISTER T val ALIGN(MIN_ALIGN2) = CPLX__ conj(mat->diag[i]) * vec->get(i);
			MAY_PREFETCH_R(&vec->getcref(i+1), 2);
			MAY_PREFETCH_R(mat->diag+i+1, 2);
			// offdiag, no checking necessary
			for (unsigned int j = 0; j < od; ++j) {
				REGISTER const unsigned int s = mat->diagconf.get(j);
				val += CPLX__ conj(mat->bdiag.get(s)[i-s]) * vec->get(i-s);
				MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
				MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
				val += CPLX__ conj(mat->adiag.get(s)[i]) * vec->get(i+s);
				MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
				MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
			}
			res->set(val, i);
			MAY_PREFETCH_W(&res->setval(i+1), 3);
		}
	} else {
		iend = MIN(maxoff, end);
		/* Second loop: Both checks needed; only executed if 2*maxoff > mat->dim */
		// TODO: Exact summation
		for (; i < iend; ++i) {
			// diag
			REGISTER T val ALIGN(MIN_ALIGN2) = CPLX__ conj(mat->diag[i]) * vec->get(i);
			MAY_PREFETCH_R(&vec->getcref(i+1), 2);
			MAY_PREFETCH_R(mat->diag+i+1, 2);
			// offdiag, both range checks needed here
			for (unsigned int j = 0; j < od; ++j) {
				REGISTER const unsigned int s = mat->diagconf.get(j);
				if (s <= i) {
					val += CPLX__ conj(mat->bdiag.get(s)[i-s]) * vec->get(i-s);
					MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
					MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
				}
				if (s < (signed)dim - i) {
					val += CPLX__ conj(mat->adiag.get(s)[i]) * vec->get(i+s);
					MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
					MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
				}
			}
			res->set(val, i);
			MAY_PREFETCH_W(&res->setval(i+1), 3);
		}
	}
	/* Third loop: Need to check to adiag, as i+s = i+(j-i) = j could be >= N
	 * bdiag safe, as s <= maxoff and i >= maxoff => i-s >= 0
	 */
	// TODO: Exact summation
	for (; i < end; ++i) {
		// diag
		REGISTER T val ALIGN(MIN_ALIGN2) = CPLX__ conj(mat->diag[i]) * vec->get(i);
		MAY_PREFETCH_R(&vec->getcref(i+1), 2);
		MAY_PREFETCH_R(mat->diag+i+1, 2);
		// offdiag
		for (unsigned int j = 0; j < od; ++j) {
			REGISTER const unsigned int s = mat->diagconf.get(j);
			/* if (s <= i) */
			val += CPLX__ conj(mat->bdiag.get(s)[i-s]) * vec->get(i-s);
			MAY_PREFETCH_R(mat->bdiag.get(s)+i+1-s,2);
			MAY_PREFETCH_R(&vec->getcref(i+1-s), 2);
			if (s < (signed)dim - i) {
				val += CPLX__ conj(mat->adiag.get(s)[i]) * vec->get(i+s);
				MAY_PREFETCH_R(mat->adiag.get(s)+i+1,2);
				MAY_PREFETCH_R(&vec->getcref(i+1+s), 2);
			}
		}
		res->set(val, i);
		MAY_PREFETCH_W(&res->setval(i+1), 3);
	}
#elif defined(BDMATVEC_DIAGWISE)
	// Diagonal-wise
	// FIXME: I see no way we could implement exact summation here ...
	// diag
	do_vec_vec_cmul(end-start, &res->setval(start),
			&mat->diag[start], &vec->getcref(start));
	// offdiag
	for (unsigned j = 0; j < od; j++) {
		// s is the number of the diagonal (1<=s<=maxoff)
		const int s = mat->diagconf.get(j);
		const int iend = MIN(end,dim-s);
		const int iend2 = end-s;
		const int istart2 = MAX((int)(start-s),0);
		do_add_vec_vec_cmul(iend-start, &res->setval(start),
				mat->adiag.get(s)+start, &vec->getcref(start+s));
		do_add_vec_vec_cmul(iend2-istart2, &res->setval(istart2+s),
				mat->bdiag.get(s)+istart2, &vec->getcref(istart2));
	}
#endif
}
2484 
2485 #if defined(SMP) && !defined(NOSMP_BDMATVEC)
2486 
2487 INST(template <typename T> class BdMatrix friend void job_bdmat_vec_mult(struct thr_ctrl*);)
2488 template <typename T>
2490 {
2491  do_bdmat_vec_mult(tc->t_off, tc->t_size,
2492  (TVector<T>*)(tc->t_par[0]), (const BdMatrix<T>*)(tc->t_par[1]),
2493  (const Vector<T>*)(tc->t_par[2]));
2494 }
2495 
2496 INST(template <typename T> class BdMatrix friend void job_bdmat_vec_transmult(struct thr_ctrl*);)
2497 template <typename T>
2499 {
2501  (TVector<T>*)(tc->t_par[0]), (const BdMatrix<T>*)(tc->t_par[1]),
2502  (const Vector<T>*)(tc->t_par[2]));
2503 }
2504 
2505 
2506  /* TODO: Adjust dynamically (or at least depending on OPT target). */
2507 # if !defined(SMP_BDMATSLICE)
2508 # define SMP_BDMATSLICE 32768
2509 # endif
2510 # define SMP_BDMATSLICE2 (SMP_BDMATSLICE/sizeof(T))
2511 
2512 template <typename T>
2514 {
2515  TVector<T> tv(dim);
2516  const unsigned n_thr = threads_avail(dim / SMP_BDMATSLICE2);
2517  update_n_thr(n_thr);
2518  BCHK(vec.size() != dim, BdMatrixErr, Multiplication BdMatrix with Vector: wrong sizes, 0, tv);
2519  if (LIKELY(n_thr < 2))
2520  do_bdmat_vec_mult<T>(0UL, dim, &tv, this, &vec);
2521  else {
2522  PREFETCH_R_MANY(this->diag, 2);
2523  PREFETCH_R_MANY(vec.vec, 2);
2524  PREFETCH_W_MANY(tv.vec, 3);
2525  const unsigned long first = slice_offset(1, n_thr, dim, (T*)0);
2526  unsigned long st, en = first;
2527  for (unsigned t = 0; t < n_thr-1; ++t) {
2528  st = en; en = slice_offset(t+2, n_thr, dim, (T*)0);
2529  thread_start_off(t, (thr_job_t)job_bdmat_vec_mult<T>,
2530  st, en, &tv, this, &vec, (void*)0);
2531  }
2532  do_bdmat_vec_mult<T>(0UL, first, &tv, this, &vec);
2533  for (unsigned s = 0; s < n_thr-1; ++s)
2534  thread_wait(s);
2535  }
2536  return tv;
2537 }
2538 
2539 // transpose-vector multiplication
2540 template <typename T>
2542 {
2543  TVector<T> tv(dim);
2544  const unsigned n_thr = threads_avail(dim / SMP_BDMATSLICE2);
2545  update_n_thr(n_thr);
2546  BCHK(vec.size() != dim, BdMatrixErr, Multiplication BdMatrix with Vector: wrong sizes, 0, tv);
2547  if (LIKELY(n_thr < 2 ))
2548  do_bdmat_vec_transmult<T> (0UL, dim, &tv, this, &vec);
2549  else {
2550  PREFETCH_R_MANY(this->diag, 2);
2551  PREFETCH_R_MANY(vec.vec, 2);
2552  PREFETCH_W_MANY(tv.vec, 3);
2553  const unsigned long first = slice_offset(1, n_thr, dim, (T*)0);
2554  unsigned long st, en = first;
2555  for (unsigned t = 0; t < n_thr-1; ++t) {
2556  st = en; en = slice_offset(t+2, n_thr, dim, (T*)0);
2557  thread_start_off(t, (thr_job_t)job_bdmat_vec_transmult<T>,
2558  st, en, &tv, this, &vec, (void*)0);
2559  }
2560  do_bdmat_vec_transmult<T>(0UL, first, &tv, this, &vec);
2561  for (unsigned s = 0; s < n_thr-1; ++s)
2562  thread_wait(s);
2563  }
2564  return tv;
2565 }
2566 
2567 #else /* SMP */
2568 
2569 template <typename T>
2571 {
2572  TVector<T> tv(dim);
2573  BCHK(vec.size() != dim, BdMatrixErr, Multiplication BdMatrix with Vector: wrong sizes, 0, tv);
2574  do_bdmat_vec_mult<T>(0, dim, &tv, this, &vec);
2575  return tv;
2576 }
2577 
2578 /* bicg and qmr solvers use this */
2579 template <typename T>
2580 TVector<T> BdMatrix<T>::transMult (const Vector<T>& vec) const
2581 {
2582  TVector<T> tv(dim);
2583  BCHK(vec.size() != dim, BdMatrixErr, Multiplication BdMatrix with Vector: wrong sizes, 0, tv);
2584  do_bdmat_vec_transmult<T>(0, dim, &tv, this, &vec);
2585  return tv;
2586 }
2587 
2588 #endif /* SMP */
2589 
2590 /* This is not currently used by anyone ... */
2591 template <typename T>
2593 {
2594  TVector<T> tv(dim);
2595  BCHK(vec.size() != dim, BdMatrixErr, Dot Multiplication BdMatrix with Vector: wrong sizes, 0, tv);
2596  do_bdmat_vec_dotmult<T>(0, dim, &tv, this, &vec);
2597  return tv;
2598 }
2599 
2600 
2601 
2602 //template <typename T>
2603 //TVector<T> BdMatrix<T>::transMult (const TVector<T>& tvec) const;
2604 //template <typename T>
2605 //TVector<T> BdMatrix<T>::transMult (const TSVector<T>& tsvec) const;
2606 
2607 
2608 template <typename T>
2610 {
2611  if (LIKELY(ma.dim != dim))
2612  return false;
2613  if (LIKELY(dim == 0 || ((diag == ma.diag) && (adiag == ma.adiag) && (bdiag == ma.bdiag))))
2614  return true;
2615  if (TBCICOMP(ma.diag, diag, T, dim))
2616  return false;
2617  unsigned i; // MSC++ annoyances
2618  for (i = 0; i < diagconf.size(); ++i) {
2619  const unsigned di = diagconf.get(i);
2620  if (ma.adiag.get(di)) {
2621  if (TBCICOMP(ma.adiag.get(di), adiag.get(di), T, (dim-di))) return false;
2622  if (TBCICOMP(ma.bdiag.get(di), bdiag.get(di), T, (dim-di))) return false;
2623  } else {
2624  for (unsigned j = 0; j < dim-di; ++j)
2625  if (MATH__ fabs(adiag.get(di)[j]) >= BD_MINVAL || MATH__ fabs(bdiag(di)[j]) >= BD_MINVAL)
2626  return false;
2627  }
2628  }
2629  if (LIKELY(diagconf == ma.diagconf))
2630  return true;
2631  for (i = 0; i < ma.diagconf.size(); ++i) {
2632  const unsigned di = ma.diagconf.get(i);
2633  if (!diagconf.contains (di)) {
2634  for (unsigned j = 0; j < dim-di; ++j)
2635  if (UNLIKELY(MATH__ fabs(ma.adiag.get(di)[j]) >= BD_MINVAL || MATH__ fabs(ma.bdiag.get(di)[j]) >= BD_MINVAL))
2636  return false;
2637  }
2638  }
2639  return true;
2640 }
2641 
/* Scale each row r of the matrix by v[r] (row-wise multiply), returning
 * *this.  NOTE(review): the signature line was lost in extraction; from the
 * body it takes a const Vector<T>& v of size dim -- confirm the exact
 * member name against the class declaration. */
template <typename T>
{
	BCHK(dim != v.size(), BdMatrixErr, multiply rowwise with wrong size, v.size(), *this);
	// diag
	for (unsigned j = 0; j < dim; ++j)
		diag[j] *= v.get(j);
	// offdiags
	for (unsigned od = 0; od < diagconf.size(); ++od) {
		const unsigned off = diagconf.get(od);
		// adiag element i lies in row i ...
		for (unsigned i = 0; i < (dim-off); ++i)
			adiag.get(off)[i] *= v.get(i);
		// ... while bdiag element k lies in row k+off
		for (unsigned k = 0; k < (dim-off); ++k)
			bdiag.get(off)[k] *= v.get(k+off);
	}
	return *this;
}
2659 
/* Divide each row r of the matrix by v[r] (row-wise divide), returning
 * *this.  No check for zero entries in v is performed.
 * NOTE(review): the signature line was lost in extraction; from the body
 * it takes a const Vector<T>& v of size dim -- confirm the exact member
 * name against the class declaration. */
template <typename T>
{
	BCHK(dim != v.size(), BdMatrixErr, divide rowwise with wrong size, v.size(), *this);
	// diag
	for (unsigned j = 0; j < dim; ++j)
		diag[j] /= v.get(j);
	// offdiags
	for (unsigned od = 0; od < diagconf.size(); ++od) {
		const unsigned off = diagconf.get(od);
		// adiag element i lies in row i ...
		for (unsigned i = 0; i < (dim-off); ++i)
			adiag.get(off)[i] /= v.get(i);
		// ... while bdiag element k lies in row k+off
		for (unsigned k = 0; k < (dim-off); ++k)
			bdiag.get(off)[k] /= v.get(k+off);
	}
	return *this;
}
2677 
2678 template <typename T>
2679 BdMatrix<T>& BdMatrix<T>::mult_row(const T& f, const unsigned i)
2680 {
2681  BCHK(i>dim, BdMatrixErr, mult_row index out of range, i, *this);
2682  diag[i] *= f;
2683  for (unsigned od = 0; od < diagconf.size(); ++od) {
2684  const unsigned off = diagconf.get(od);
2685  if (i < dim-off)
2686  adiag.get(off)[i] *= f;
2687  if (i >= off)
2688  bdiag.get(off)[i-off] *= f;
2689  }
2690  return *this;
2691 }
2692 
2693 // Don't do it with integer types
2694 template <typename T>
2695 BdMatrix<T>& BdMatrix<T>::div_row(const T& d, const unsigned i)
2696 {
2697  BCHK(i>dim, BdMatrixErr, div_row index out of range, i, *this);
2698  BCHK(d == (T)0, BdMatrixErr, div_row by zero, i, *this);
2699  T f = (T)1.0/d;
2700  diag[i] *= f;
2701  for (unsigned od = 0; od < diagconf.size(); ++od) {
2702  const unsigned off = diagconf.get(od);
2703  if (i < dim-off)
2704  adiag.get(off)[i] *= f;
2705  if (i >= off)
2706  bdiag.get(off)[i-off] *= f;
2707  }
2708  return *this;
2709 }
2710 
2711 template <typename T>
2712 STD__ ostream& operator << (STD__ ostream& ostr, const BdMatrix<T>& mat)
2713 {
2714  //if (!mat.outset)
2715  // mat.setoutopts();
2716  //ostr << '(';
2717  for (unsigned i = 0; i < mat.dim; ++i) {
2718  for (unsigned j = 0; j < mat.dim; ++j) {
2719  if (UNLIKELY(mat.outset)) {
2720  ostr.width(mat.outw);
2721  ostr.precision(mat.outp);
2722  ostr.fill(mat.outf);
2723  }
2724  if (i == j || mat.diagconf.contains(CSTD__ abs((int)i-(int)j)))
2725  ostr << mat.get(i, j) << ' ';
2726  else
2727  ostr << (T)0 << ' ';
2728  }
2729  //if (i == (mat.dim - 1))
2730  // ostr << ')' << STD__ endl;
2731  //else
2732  ostr << "\n";
2733  }
2734  EXPCHK(mat.dummy!=(T)0, BdMatrixErr, dummy not zero, 0, ostr);
2735  return ostr;
2736 }
2737 
2738 template <typename T>
2739 STD__ istream& operator >> (STD__ istream& istr, BdMatrix<T>& mat)
2740 {
2741  T tmp;
2742  for (unsigned i = 0; i < mat.dim; ++i) {
2743  for (unsigned j = 0; j < mat.dim; ++j) {
2744  istr >> tmp;
2745  if (i == j || mat.diagconf.contains(CSTD__ abs((int)i-(int)j)))
2746  mat.setval(tmp, i, j);
2747  else
2748  if (tmp != (T)0)
2749  STD__ cerr << "Element (" << i << "," << j
2750  << ") should be zero!" << STD__ endl;
2751  }
2752  }
2753  return istr;
2754 }
2755 
/* Expand the band storage into a dense matrix with explicit zeros:
 * adiag entries go above the main diagonal, bdiag entries below.
 * NOTE(review): the signature line was lost in extraction; the body
 * constructs and returns a TMatrix<T> of size dim x dim -- confirm the
 * member name against the class declaration. */
template <typename T>
{
	const unsigned _DIM = dim;
	TMatrix<T> tm ((T)0, _DIM, _DIM);
	// diag
	for (unsigned i = 0; i < _DIM; ++i)
		tm.setval(diag[i], i, i);
	// offdiags
	for (unsigned j = 0; j < diagconf.size(); ++j) {
		const unsigned di = diagconf.get(j);
		for (unsigned k = 0; k < _DIM-di; ++k) {
			tm.setval(adiag.get(di)[k], k, k+di);
			tm.setval(bdiag.get(di)[k], k+di, k);
		}
	}
	return tm;
}
2774 
#if 0
/* Disabled: older dense-Matrix conversion variant, kept for reference only
 * (its signature line was also lost in extraction). */
template <typename T>
{
	Matrix<T> tm ((T)0, dim, dim); unsigned i;
	// diag
	for (i = 0; i < dim; ++i)
		tm.setval(diag[i], i, i);
	// offdiags
	for (unsigned j = 0; j < diagconf.size(); ++j)
		for (unsigned k = 0; k < dim-(i=diagconf.get(j)); ++k) {
			tm.setval(adiag.get(i)[k], k, k+i);
			tm.setval(bdiag.get(i)[k], k+i, k);
		}
	return tm;
}
#endif
2792 
#if 0
/* Disabled: naive determinant/cofactor matrix inversion, kept for
 * reference only.  NOTE(review): the signature line and one declaration
 * line (presumably the ri/ci index vectors) were lost in extraction. */
template <typename T>
{
	BdMatrix<T> inv(dim);
	T d = det();
	//STD__ cout << "Det: " << d << STD__ endl;
	// A (numerically) zero determinant means the matrix is singular.
	if (MATH__ fabs(d) < BD_MINVAL) {
#ifndef TBCI_DISABLE_EXCEPT
		throw (BdMatrixErr("Trying to invert singular Matrix!"));
#else
		STD__ cerr << "Trying to invert singular Matrix!" << STD__ endl;
		return inv;
#endif
	}
	d = 1/d;
	// Cofactor expansion: inv(j,i) = +/- subdet(i,j) / det
	for (unsigned i = 0; i < dim; i++)
		for (unsigned j = 0; j < dim; j++) {
			ri(0) = i; ci(0) = j;
			inv.setval(subdet(ri, ci) * ((i+j)%2? -d: d), j, i);
		}
	return inv;
}
#endif /* 0 */
2818 
2819 #if defined(SMP) && defined(HAVE_LIBNUMA)
/* Move (or fault in) the memory pages backing the row slice [first, last)
 * of a BdMatrix to the given NUMA node.  Covers the main diagonal plus all
 * configured off-diagonals; returns the number of pages moved. */
template <typename T>
int numa_opt_bdmat_helper(const BdMatrix<T>& bm, const BVector<unsigned>& cfg,
		const int node, bool fault_in,
		const unsigned first, const unsigned last)
{
	int moved = 0;
	const unsigned int sz = bm.rows();
	/* Diag */
	// Cast away constness: setval() is the accessor that yields a raw
	// element address; no element is actually modified here.
	const T* diag = &(((BdMatrix<T>&)bm).setval(0,0));
	moved = do_numa_move_pages(node, fault_in,
			(unsigned long)(diag+first),
			(unsigned long)(diag+last));
	/* Now do off-diags */
	for (unsigned i = 0; i < cfg.size(); ++i) {
		const unsigned off = cfg.get(i);
		const unsigned dln = sz - off;	// number of elements on this diagonal
		const T* adiag = &(((BdMatrix<T>&)bm).setval(0,off));
		const T* bdiag = &(((BdMatrix<T>&)bm).setval(off,0));
		if (first < dln)
			moved += do_numa_move_pages(node, fault_in,
					(unsigned long)(adiag+first),
					(unsigned long)(adiag+MIN(dln,last)));
		// NOTE(review): (signed)(first-off) relies on the unsigned
		// difference wrapping and being reinterpreted as a negative int;
		// works on the supported targets, but MAX((int)first-(int)off, 0)
		// would be cleaner -- confirm before touching.
		if (last > off)
			moved += do_numa_move_pages(node, fault_in,
					(unsigned long)(bdiag+MAX(0,(signed)(first-off))),
					(unsigned long)(bdiag+last-off));
	}
	return moved;
}
2849 
2850 template <typename T>
2851 void numa_opt_bdmat_job(struct thr_ctrl *tc)
2852 {
2853  const BdMatrix<T>& bm(*(const BdMatrix<T>*)tc->t_par[2]);
2854  const BVector<unsigned>& bmcfg(*(const BVector<unsigned>*)tc->t_par[3]);
2855  /* Stick with convention that size, off are node, fault_in resp. */
2856  const int node = tc->t_size;
2857  const bool fault_in = tc->t_off;
2858  /* start and end are params 0 and 1 */
2859  const unsigned st = (unsigned long)tc->t_par[0];
2860  const unsigned en = (unsigned long)tc->t_par[1];
2861  tc->t_res_l = numa_opt_bdmat_helper(bm, bmcfg, node, fault_in, st, en);
2862 }
2863 
/* Distribute the pages of a BdMatrix across the NUMA nodes of the worker
 * threads, sliced by row to match the split later used by Mult().  With
 * fault_in set, pages are touched even when libnuma is unavailable.
 * Returns the number of pages moved (0 in the single-threaded case). */
template <typename T>
int numa_optimize(const BdMatrix<T>& bm, bool fault_in)
{
	if (!numa_avail && !fault_in)
		return 0;
	/* use some heuristic to decide for the num of threads */
	const unsigned sz = bm.rows();
	const unsigned n_thr = threads_avail (sz / SMP_BDMATSLICE2);
	//update_n_thr(n_thr);

	BVector<unsigned> bmcfg(bm.getcfg());
	/* OK, so how do we distribute a BandMatrix to threads?
	 * 1. We do it by row, reflecting that this is how we later
	 *    use it in BdMat - Vector multiplication
	 * 2. A BdMat consists of several vectors, one for the diagonal
	 *    and a variable number of others for the off-diagonals.
	 *    All these vectors need to be split to threads.
	 * 3. Problem: Can we even hope to come close to page boundaries?
	 */
	if (LIKELY(n_thr < 2)) {
		// Single-threaded: at most fault the pages in on the main node.
		if (fault_in)
			numa_opt_bdmat_helper(bm, bmcfg,
					main_numa_node, fault_in,
					0, sz);
		return 0;
	} else {
		const unsigned first = slice_offset(1, n_thr, sz, (T*)0);
		unsigned long st, en = first;
		/* Start threads */
		unsigned long res = 0;
		for (unsigned t = 0; t < n_thr-1; ++t) {
			st = en; en = slice_offset(t+2, n_thr, sz, (T*)0);
			thread_start_off (t, (thr_job_t)numa_opt_bdmat_job<T>,
					threads[t].numa_node, fault_in,
					st, en, &bm, &bmcfg, (void*)0);
		}
		/* The first slice is handled by the main thread */
		res = numa_opt_bdmat_helper(bm, bmcfg,
				main_numa_node, fault_in, 0, first);
		/* Wait for the end */
		for (unsigned t = 0; t < n_thr-1; ++t) {
			job_output out;
			thread_wait (t, &out);
			res += out.t_res_l;
		}
		//fprintf(stderr, "NUMA Optimize Matrix %p: %li pages moved\n", m.getrowptr(0), res);
		return res;
	}
}
2913 #else
2914 template <typename T>
2915 int numa_optimize(const BdMatrix<T>& bm, bool fault_in)
2916 {
2917  /* FIXME: We should emulate fault_in here, no? */
2918  return 0;
2919 }
2920 #endif
2921 
2922 
2923 
2925 
2926 #endif /* TBCI_BAND_MATRIX_H */
#define TBCICOPY(n, o, t, s)
Definition: basics.h:894
TVector< T > get_col(const unsigned int) const
Construct Vector from column.
Definition: band_matrix.h:1038
void do_copy(const BdMatrix< T > &)
Definition: band_matrix.h:366
BdMatrix< T > transpose(BdMatrix< T > &mat)
Definition: band_matrix.h:1393
void * t_par[6]
Definition: smp.h:173
#define BCHKNR(cond, exc, txt, ind)
Definition: basics.h:586
#define MIN_ALIGN
Definition: basics.h:421
#define ALIGN(x)
Definition: basics.h:444
T & setval(const unsigned long i) const
Definition: vector.h:192
long t_res_l
Definition: smp.h:179
BVector< T > & swap(BVector< T > &v)
Definition: bvector.h:451
const Vector< T > const Vector< T > const Vector< T > & p
Definition: LM_fit.h:97
#define _DIM
Definition: f_matrix.h:39
BdMatrix< T > & expand(unsigned=0)
Definition: band_matrix.h:1097
void job_bdmat_vec_transmult(struct thr_ctrl *tc)
Definition: band_matrix.h:2498
BdMatrix< T > operator/(const T &) const
Definition: band_matrix.h:1405
const T & getcref(const unsigned long idx) const
Definition: bvector.h:129
The class BdMatrix is an implementation to store and do operations on sparse Matrices with a band str...
Definition: band_matrix.h:103
int numa_avail
Definition: smp.cc:105
STD__ istream & operator>>(STD__ istream &istr, BdMatrix< T > &mat)
Definition: band_matrix.h:2739
BdMatrix< T > & operator+=(const BdMatrix< T > &)
Definition: band_matrix.h:1314
void free_diags(const unsigned)
Implementation alternative: All memory is allocated in one big chunk (default)
Definition: band_matrix.h:358
const unsigned end
void do_bdmat_vec_mult_lnw_opt(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
Definition: band_matrix.h:1571
#define REGISTER
Definition: basics.h:108
return c
Definition: f_matrix.h:760
double fabs(const int a)
Definition: basics.h:1215
const BVector< unsigned int > & getcfg() const
ro access to config BVector
Definition: band_matrix.h:252
#define SFORALL_S(op)
Definition: band_matrix.h:1283
#define MIN(a, b)
Definition: basics.h:655
F_TMatrix< double > inv(const F_Matrix< double > &A, int overwriteA=0)
Definition: lapack.cpp:181
const unsigned ac
#define NAMESPACE_TBCI
Definition: basics.h:317
unsigned int dim
Definition: band_matrix.h:109
#define MAY_PREFETCH_R(adr, loc)
Definition: band_matrix.h:1483
BdMatrix< T > & adddiag(const unsigned)
Definition: band_matrix.h:888
unsigned long size() const HOT
Definition: bvector.h:144
BdMatrix< T > & reconfig(const BVector< unsigned int > &) COLD
Definition: band_matrix.h:846
exception base class for the TBCI NumLib
Definition: except.h:58
friend NOINST void gaussj FGDT(const BdMatrix< T > &, const Matrix< T > &)
NAMESPACE_TBCI char gaussj(Matrix< T > &a, Matrix< T > &b)
Definition: gauss_jordan.h:46
#define SFORALL_T(op)
Definition: band_matrix.h:1318
Common interface definition (signature) for all Matrices.
Definition: matrix_sig.h:45
tbci_traits< T >::const_refval_type get(const unsigned int, const unsigned int) const HOT
Definition: band_matrix.h:694
tbci_traits< T >::const_refval_type get(const unsigned long i) const
Definition: vector.h:190
#define MIN_ALIGN2
Definition: basics.h:424
bool operator==(const BdMatrix< T > &) const
Definition: band_matrix.h:2609
unsigned int getmaxoff() const
max distance from main diagonal
Definition: band_matrix.h:254
unsigned int columns() const
number of columns
Definition: band_matrix.h:249
BdMatrix< T > & div_row(const T &, const unsigned)
Definition: band_matrix.h:2695
BVector< T * > bdiag
pointers to upper and lower diagonals
Definition: band_matrix.h:112
#define BCHK(cond, exc, txt, ind, rtval)
Definition: basics.h:575
T * check(const unsigned r, const unsigned c) const HOT
Definition: band_matrix.h:659
#define THROWEXCEPT
Definition: except.h:114
long t_res_l
Definition: smp.h:148
#define TBCIFILL(n, v, t, s)
Definition: basics.h:910
#define UNLIKELY(expr)
Definition: basics.h:101
Definition: smp.h:168
#define REALLOC(v, os, t, s)
Definition: malloc_cache.h:636
void(* thr_job_t)(struct thr_ctrl *)
Before the double inclusion guard on purpose!
Definition: smp.h:126
#define FRIEND_TBCI2__
Definition: basics.h:335
#define STDDEF_ST(op)
Definition: band_matrix.h:1398
BdMatrix< T > & set_row(const Vector< T > &val, const unsigned, const unsigned=0)
Overwrite row with Vector.
Definition: band_matrix.h:1014
BdMatrix< T > & operator=(const BdMatrix< T > &)
The assignment operators in TBCI are NOT resizing.
Definition: band_matrix.h:1254
T & set(const T &val, const unsigned long i) const
Definition: vector.h:194
exception class
Definition: band_matrix.h:46
T & setval(const T &val, const unsigned int r, const unsigned int c)
Definition: matrix.h:281
T aligned_value_type TALIGN(MIN_ALIGN2)
Definition: band_matrix.h:134
#define NEW(t, s)
Definition: malloc_cache.h:633
tbci_traits< T >::const_refval_type get(const unsigned long idx) const HOT
Definition: bvector.h:132
#define BD_MINVAL
Definition: band_matrix.h:21
for(REGISTER T *p1=c.vec,*p2=b.vec;p1< c.endvec;p1++, p2++)*p1
unsigned int size() const
size: incompatibility to Matrix (!)
Definition: band_matrix.h:243
#define INTDIVEQ(T)
Definition: band_matrix.h:1356
struct thr_struct * threads
Definition: smp.cc:106
#define PREFETCH_R_MANY(addr, loc)
Definition: basics.h:764
void do_bdmat_vec_mult_lnw(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
Definition: band_matrix.h:1521
int lu_decomp(Matrix< T > &) HOT
LU decomposes the TBCI::Matrix mat.
Definition: lu_solver.h:94
#define CSTD__
Definition: basics.h:340
unsigned long t_size
Definition: smp.h:171
void do_bdmat_vec_mult(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
Definition: band_matrix.h:1875
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
Matrix< T > & fill(const T &v=(T) 0)
Definition: matrix.h:1655
unsigned int columns() const
number of columns
Definition: matrix.h:315
bool contains(const T &, unsigned long *=0) const
Definition: bvector.h:580
TVector< T > get_row(const unsigned int) const
Construct Vector from row.
Definition: band_matrix.h:1022
BdMatrix< T > & fill(const T &=0)
Definition: band_matrix.h:1055
void do_bdmat_vec_transmult_diagw_exact(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
Definition: band_matrix.h:2220
void constructor(const unsigned int, const bool=true)
Definition: band_matrix.h:378
BdMatrix< T > operator-() const
Definition: band_matrix.h:1489
void destroy()
destroy object explicitly
Definition: band_matrix.h:156
#define STDDEF_SS(op)
Definition: band_matrix.h:1410
~BdMatrix() THROWEXCEPT
Definition: band_matrix.h:615
long int Vector< T > & index
Definition: LM_fit.h:69
BdMatrix(const BdMatrix< U > &bm)
Definition: band_matrix.h:171
BdMatrix< T > & transpose()
transpose() does change the object!
Definition: band_matrix.h:1377
T & operator()(const unsigned int, const unsigned int) HOT
rw member access
Definition: band_matrix.h:668
BVector< T > & append(const T &)
performs poorly
Definition: bvector.h:412
BdMatrix< T > operator*(const BdMatrix< T > &) const
Definition: band_matrix.h:1422
void do_bdmat_vec_mult_diagw_exact(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
Definition: band_matrix.h:1827
unsigned int noelem() const
Number of elements that are actually stored.
Definition: band_matrix.h:245
BdMatrix< T > & div_rows(const Vector< T > &)
Definition: band_matrix.h:2661
BVector< T > & remove(const unsigned long)
Definition: bvector.h:435
#define PREFETCH_W_MANY(addr, loc)
Definition: basics.h:765
BdMatrix< T > operator+(const BdMatrix< T > &) const
Definition: band_matrix.h:1417
int conj(const int arg)
conj for elementary types
Definition: basics.h:1055
BdMatrixErr(const char *t, const long i=0)
Definition: band_matrix.h:51
BdMatrix< T > inverse() const
T & set(const unsigned long idx) HOT
Definition: bvector.h:134
unsigned int rows() const
number of rows
Definition: band_matrix.h:247
void do_bdmat_vec_transmult_lnw_opt(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
Definition: band_matrix.h:1969
unsigned int maxoff
size, max offset from diag
Definition: band_matrix.h:109
const T & getcref(const unsigned long i) const
Definition: vector.h:188
void do_bdmat_vec_dotmult(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
Definition: band_matrix.h:2293
unsigned long size() const
Definition: vector.h:104
BVector< unsigned > diagconf
configuration
Definition: band_matrix.h:111
#define FGD
Definition: basics.h:144
Definition: bvector.h:49
virtual ~BdMatrixErr()
Definition: band_matrix.h:55
void job_bdmat_vec_mult(struct thr_ctrl *tc)
Definition: band_matrix.h:2489
#define CPLX__
Definition: basics.h:341
#define INST(x)
Definition: basics.h:238
BdMatrix< T > & autoinsert(const T &, const unsigned, const unsigned)
Definition: band_matrix.h:996
#define EXPCHK(cond, exc, txt, ind, rtval)
Definition: basics.h:630
static const char * mat_info()
Definition: band_matrix.h:164
TVector< T > transMult(const Vector< T > &) const HOT
Transposed Matrix-Vector multiplication.
Definition: band_matrix.h:2541
int i
Definition: LM_fit.h:71
BVector< T > & resize(const BVector< T > &)
Actually it's a resize and copy (some people would expect the assignment op to do this) ...
Definition: bvector.h:361
#define MAY_PREFETCH_W(adr, loc)
Definition: band_matrix.h:1484
unsigned long dim
Definition: matrix.h:115
BdMatrix< T > & removediag(const unsigned)
Definition: band_matrix.h:973
friend class BdMatrix
Definition: band_matrix.h:170
#define STD__
Definition: basics.h:338
#define TBCIDELETE(t, v, sz)
Definition: malloc_cache.h:634
T element_type
Definition: band_matrix.h:133
#define threads_avail(x)
Definition: smp.h:322
void do_bdmat_vec_transmult_lnw(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
Definition: band_matrix.h:1922
unsigned long t_off
Definition: smp.h:172
Temporary Base Class Idiom: Class TVector is used for temporary variables.
Definition: bvector.h:52
void thread_wait(const int thr_no, struct job_output *out)
Definition: smp.cc:997
#define abs(x)
Definition: f2c.h:178
T trace() const
Definition: band_matrix.h:256
#define MAX(a, b)
Definition: basics.h:656
#define NAMESPACE_END
Definition: basics.h:323
BdMatrix< T > & operator*=(const T &)
Definition: band_matrix.h:1334
BdMatrix< T > & mult_row(const T &, const unsigned)
Definition: band_matrix.h:2679
#define SMP_BDMATSLICE2
Definition: band_matrix.h:2510
BVector< T > & copy(const BVector< T > &bv)
copy does a resize, if necessary
Definition: bvector.h:484
BdMatrix< T > & resize(const BdMatrix< T > &bdm)
Resizing assignment.
Definition: band_matrix.h:224
Definition: bvector.h:54
const Vector< T > Vector< T > Vector< T > Vector< T > & y
Definition: LM_fit.h:172
BdMatrix< T > & operator/=(const T &)
Definition: band_matrix.h:1340
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
int numa_node
Definition: smp.h:133
T * vec
Definition: bvector.h:73
#define BDMATDBG(x)
Definition: band_matrix.h:32
#define T
Definition: bdmatlib.cc:20
#define TBCICOMP(n, o, t, s)
Definition: basics.h:979
#define HOT
Definition: basics.h:495
int main_numa_node
Definition: smp.cc:110
unsigned int do_exactsum2()
Definition: tbci_param.h:151
BdMatrix< T > & resize(const unsigned int)
Definition: band_matrix.h:749
int numa_optimize(const BdMatrix< T > &bm, bool fault_in)
Definition: band_matrix.h:2915
TVector< T > dotMult(const Vector< T > &) const
Definition: band_matrix.h:2592
BdMatrix< T > & mult_rows(const Vector< T > &)
Linewise multiplication.
Definition: band_matrix.h:2643
BdMatrix< T > & setval(const T &v, const unsigned int r, const unsigned int c)
autoinsert, sparse class compatible
Definition: band_matrix.h:205
T * check_internal(const unsigned r, const unsigned c) const HOT
Definition: band_matrix.h:636
#define COLD
Definition: basics.h:496
void thread_start_off(const int thr_no, thr_job_t job, const unsigned long off, const unsigned long sz,...)
Definition: smp.cc:979
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int int char v
< find minimum of func on grid with resolution res
Definition: LM_fit.h:205
BVector< T * > adiag
Definition: band_matrix.h:112
#define min(a, b)
Definition: f2c.h:180
const BdMatrix< T > & setoutopts(int w=5, int p=4, char f= ' ') const
Definition: band_matrix.h:321
#define MATH__
Definition: basics.h:339
void do_bdmat_vec_transmult(const unsigned start, const unsigned end, TVector< T > *res, const BdMatrix< T > *mat, const Vector< T > *vec)
Definition: band_matrix.h:2262
BdMatrixErr(const BdMatrixErr &be)
Definition: band_matrix.h:53
BdMatrix< T > & operator-=(const BdMatrix< T > &)
Definition: band_matrix.h:1315
#define LIKELY(expr)
branch prediction note that we sometimes on purpose mark the unlikely possibility likely and vice ver...
Definition: basics.h:100
tbci_traits< T >::const_refval_type get(const unsigned r, const unsigned c) const
get, set and getcref are used internally and not for public consumption
Definition: matrix.h:288
unsigned long arsz
of saved elems,
Definition: band_matrix.h:110
unsigned int rows() const
number of rows
Definition: matrix.h:317
BdMatrix< T > & setunit(const T &=1)
Definition: band_matrix.h:725
bool operator!=(const BdMatrix< T > &ma) const
Definition: band_matrix.h:282
BdMatrix< T > & clear()
Definition: band_matrix.h:1062