TBCI Numerical high perf. C++ Library  2.8.0
bd_lu_solver.h
Go to the documentation of this file.
1 
8 /* Naming conventions:
9  * LU_xxxx operates on an already LU decomposed spmat
10  * whereas lu_xxxx does decompose it and then does
11  * the requested action (eg. determinant calculation ...)
12  * THE lu_xxxx VARIANTS THEREFORE CHANGE THE MATRIX PASSED!
13  */
14 
15 /* Written 23.06.97 by KG
16  * Last changed: 17.12.97 KG
17  * $Id: bd_lu_solver.h,v 1.27.2.20 2022/11/03 17:28:12 garloff Exp $
18  */
19 
20 #ifndef TBCI_SOLVER_BD_LU_SOLVER_H
21 #define TBCI_SOLVER_BD_LU_SOLVER_H
22 
23 #include "tbci/vector.h"
24 #include "tbci/band_matrix.h"
25 #include "tbci/matrix.h"
26 
27 #ifdef PRAGMA_I
28 # pragma interface "bd_lu_solver.h"
29 #endif
30 
31 /* When to warn because of diagonal zeroes */
32 #ifndef BDMLU_MINVAL
33 # define BDMLU_MINVAL 1e-16
34 #endif
35 
36 /* When to warn because of not converged solution */
37 #ifndef BDMLU_RES_WARN
38 # define BDMLU_RES_WARN 1e-11
39 #endif
40 
42 
53 INST(template <typename T> class BdMatrix friend int lu_decomp (BdMatrix<T>&);)
54 
55 #if 1 || defined(USE_PLAIN_VEC_KERNELS) || (defined(UNROLL_DEPTH) && UNROLL_DEPTH == 1)
56 template <typename T>
58 {
59  const unsigned matr = mat.rows();
60  //BdMatrices are always quadratic in shape: Check not necessary
61 
62  //check that LU decomp fits in mat
63  mat.expand();
64 
65  //const unsigned od = mat.diagconf.size();
66  const unsigned maxoff = mat.getmaxoff();
67 
68  // TODO: The mem access pattern might be optimized
69  for (unsigned j = 0; j < matr; ++j) {
70  unsigned i = MAX((signed)0, (signed)j - (signed)maxoff);
71  const unsigned imax = MIN(j,maxoff);
72  ALIGN2(REGISTER T tmp, MIN_ALIGN2); tmp = 0;
73  for (; i <= imax; ++i) {
74  tmp = mat.get(i, j);
75  for (REGISTER unsigned k = 0; k < i; ++k)
76  tmp -= mat.get(i, k) * mat.get(k, j);
77  mat.setval(tmp, i, j);
78  }
79  for (; i <= j; ++i) {
80  tmp = mat(i, j);
81  for (REGISTER unsigned k = j - maxoff; k < i; ++k)
82  tmp -= mat.get(i, k) * mat.get(k, j);
83  mat.setval(tmp, i, j);
84  }
85  BCHK(MATH__ fabs(tmp) == 0, BdMatrixErr, Diagonal zero encountered, j, -1);
86  T invdiag = T(1.0)/tmp;
87  //No pivoting for the first shot (OK for most FD problems)
88  // Diagonal zeros are lethal (!)
89  //#warning "We don't do pivoting, so don't supply non diagonal-dominant matrices!"
90  unsigned bound = MIN(matr, j + mat.getmaxoff() + 1);
91  for (; i < bound; ++i) {
92  tmp = mat.get(i, j);
93  for (REGISTER unsigned int k = MAX((signed)0, (signed)j - (signed)maxoff); k < j; ++k)
94  tmp -= mat.get(i, k) * mat.get(k, j);
95  // BCHK(MATH__ fabs (mat.get(j, j)) < BDMLU_MINVAL, BdMatrixErr, Diagonal zero encountered, j, -1);
96 # ifdef ERRCHECK
97  if (UNLIKELY (MATH__ fabs(tmp * invdiag) > 1/BDMLU_MINVAL))
98  STD__ cerr << "\n BdMatrixWarning: BdMatrix badly conditioned, continuing ...\n" << STD__ flush;
99 # endif
100  mat.setval(tmp * invdiag, i, j);
101  }
102  }
103  return 0;
104 }
105 
106 #else
107 
108 template <typename T>
109 int lu_decomp (BdMatrix<T>& mat)
110 {
111  const unsigned matr = mat.rows();
112  //BdMatrices are always quadratic in shape: Check not necessary
113 
114  //check that LU decomp fits in mat
115  mat.expand();
116 
117  //const unsigned od = mat.diagconf.size();
118  const unsigned maxoff = mat.getmaxoff();
119 
120  // TODO: The mem access pattern might be optimized
121  for (unsigned j = 0; j < matr; ++j) {
122  unsigned i = MAX((signed)0, (signed)j - (signed)maxoff);
123  PREFETCH_R(&mat.get(i, 0), 3);
124  PREFETCH_R(&mat.get(i, j), 3);
125  const unsigned imax = MIN(j,maxoff);
126  PREFETCH_R(&mat.get(0, j), 2);
127  PREFETCH_R(&mat.get(1, j), 2);
128  ALIGN2(REGISTER T tmp, MIN_ALIGN2);
129  for (; i <= imax; ++i) {
130  tmp = mat.get(i, j);
131  PREFETCH_R(&mat.get(i, 0), 3);
132  PREFETCH_R(&mat.get(i, 1), 3);
133  for (REGISTER unsigned k = 0; k < i; ++k) {
134  tmp -= mat.get(i, k) * mat.get(k, j);
135  PREFETCH_R(&mat.get(i, k+2),3);
136  PREFETCH_R(&mat.get(k+1, j), 2);
137  }
138  mat.setval(tmp, i, j);
139  PREFETCH_W(&mat(i+1,j),3);
140  }
141  for (; i <= j; ++i) {
142  tmp = mat.get(i, j);
143  PREFETCH_R(&mat.get(i, j-maxoff), 3);
144  PREFETCH_R(&mat.get(i, j-maxoff+1), 3);
145  REGISTER unsigned k;
146  for (k = j - maxoff; k < (signed)i-(signed)MAX(8,4*EL_PER_CL(T)); k+=4) {
147  tmp -= mat.get(i, k) * mat.get(k, j);
148  PREFETCH_R(&mat.get(i,k+4), 3);
149  PREFETCH_R(&mat.get(i,k+5), 3);
150  tmp -= mat.get(i, k+1) * mat.get(k+1, j);
151  PREFETCH_R(&mat.get(i,k+6), 3);
152  PREFETCH_R(&mat.get(i,k+7), 3);
153  tmp -= mat.get(i, k+2) * mat.get(k+2, j);
154  PREFETCH_R(&mat.get(k+4,j), 3);
155  PREFETCH_R(&mat.get(k+5,j), 3);
156  tmp -= mat.get(i, k+3) * mat.get(k+3, j);
157  PREFETCH_R(&mat.get(k+6,j), 3);
158  PREFETCH_R(&mat.get(k+7,j), 3);
159  }
160  for (; k < i; ++k)
161  tmp -= mat.get(i, k) * mat.get(k, j);
162  mat.setval(tmp, i, j);
163  }
164  BCHK(MATH__ fabs (tmp) == 0, BdMatrixErr, Diagonal zero encountered, j, -1);
165  T invdiag = T(1.0)/tmp;
166  //No pivoting for the first shot (OK for most FD problems)
167  // Diagonal zeros are lethal (!)
168  //#warning "We don't do pivoting, so don't supply non diagonal-dominant matrices!"
169  unsigned bound = MIN(matr, j + mat.getmaxoff() + 1);
170  for (; i < bound; ++i) {
171  const unsigned kstart = MAX((signed)0, (signed)j - (signed)maxoff);
172  tmp = mat.get(i, j);
173  PREFETCH_R(&mat.get(i, kstart), 3);
174  PREFETCH_R(&mat.get(i, kstart+1), 3);
175  REGISTER unsigned int k;
176  for (k = kstart; (signed)k < (signed)j-(signed)MAX(8,4*EL_PER_CL(T)); k+=4) {
177  tmp -= mat.get(i, k) * mat.get(k, j);
178  PREFETCH_R(&mat.get(i,k+4), 3);
179  PREFETCH_R(&mat.get(i,k+5), 3);
180  tmp -= mat.get(i, k+1) * mat.get(k+1, j);
181  PREFETCH_R(&mat.get(i,k+6), 3);
182  PREFETCH_R(&mat.get(i,k+7), 3);
183  tmp -= mat.get(i, k+2) * mat.get(k+2, j);
184  PREFETCH_R(&mat.get(k+4,j), 3);
185  PREFETCH_R(&mat.get(k+5,j), 3);
186  tmp -= mat.get(i, k+3) * mat.get(k+3, j);
187  PREFETCH_R(&mat.get(k+6,j), 3);
188  PREFETCH_R(&mat.get(k+7,j), 3);
189  }
190  for (; k < j; ++k)
191  tmp -= mat.get(i, k) * mat.get(k, j);
192  // BCHK(MATH__ fabs (mat.get(j, j)) < BDMLU_MINVAL, BdMatrixErr, Diagonal zero encountered, j, -1);
193 # ifdef ERRCHECK
194  if (UNLIKELY (MATH__ fabs(tmp * invdiag) > 1/BDMLU_MINVAL))
195  STD__ cerr << "\n BdMatrixWarning: BdMatrix badly conditioned, continuing ...\n" << STD__ flush;
196 # endif
197  mat.setval(tmp * invdiag, i, j);
198  }
199  }
200  return 0;
201 }
202 #endif
203 
204 
205 INST(template <typename T> class BdMatrix friend TVector<T> LU_fwd_subst(const BdMatrix<T>&, const Vector<T>&);)
206 template <typename T>
208 {
209  const unsigned ysize = y.size();
210  const unsigned maxoff = lu.getmaxoff();
211  TVector<T> x(ysize);
212  BCHK(ysize != lu.rows(), BdMatrixErr, LU fwd subst size error, ysize, x);
213  for (unsigned i = 0; i < ysize; ++i) {
214  ALIGN2(REGISTER T tmp, MIN_ALIGN2) = y(i);
215  for (REGISTER unsigned j = MAX((signed)0, (signed)i - (signed)maxoff); j < i; ++j)
216  tmp -= x.get(j) * lu.get(i, j);
217  x.set(tmp, i);
218  }
219  return x;
220 }
221 
222 INST(template <typename T> class BdMatrix friend TVector<T> LU_bkw_subst(const BdMatrix<T>&, const Vector<T>&);)
223 template <typename T>
225 {
226  const unsigned ysize = y.size();
227  const unsigned maxoff = lu.getmaxoff();
228  TVector<T> x(ysize);
229  BCHK(ysize != lu.rows(), BdMatrixErr, LU bkw subst size error, ysize, x);
230  for (int i = ysize - 1; (signed)i >= 0; --i) {
231  ALIGN2(REGISTER T tmp, MIN_ALIGN2) = y(i);
232  for (REGISTER int j = MIN(ysize - 1, i + maxoff); j > i; --j)
233  tmp -= x.get(j) * lu.get(i, j);
234  x.set(tmp / lu.get(i, i), i);
235  }
236  return x;
237 }
238 
244 INST(template <typename T> class BdMatrix friend TVector<T> LU_solve(const BdMatrix<T>&, const Vector<T>&);)
245 template <typename T>
246 inline TVector<T> LU_solve(const BdMatrix<T>& lu, const Vector<T>& b)
247 {
248  Vector<T> y (LU_fwd_subst(lu, b)); //STD__ cout << y << STD__ endl;
249  return (LU_bkw_subst(lu, y));
250 }
251 
258 INST(template <typename T> class BdMatrix friend TVector<T> lu_solve(BdMatrix<T>&, const Vector<T>&);)
259 template <typename T>
261 {
262 #ifdef ERRCHECK
263  BdMatrix<T> m_backup (mat);
264  double res;
265 #endif
266  lu_decomp (mat);
267  TVector<T> x (LU_solve (mat, b));
268 #ifdef ERRCHECK
269  // No return!, Andreas
270  Vector<T> xx(b.size()); xx.copy (x);
271  if (UNLIKELY((res = MATH__ fabs(m_backup * xx - b)/MATH__ fabs(b)) > BDMLU_RES_WARN))
272  STD__ cerr << "bd_lu_solver: Inaccurate solution, rel. residuum= " << res << STD__ endl;
273 #endif
274  return x;
275 }
276 
277 INST(template <typename T> class BdMatrix friend TMatrix<T> LU_solve(const BdMatrix<T>&, const Matrix<T>&);)
278 template <typename T>
280 {
281  TMatrix<T> x(b.rows(), b.columns());
282  Vector<T> y(b.rows()), w(b.rows());
283  const unsigned bc = b.columns();
284  for (unsigned int i = 0; i < bc; ++i) {
285  w = b.get_col(i);
286  y = LU_fwd_subst(lu, w);
287  x.set_col(LU_bkw_subst(lu, y), i);
288  }
289  return x;
290 }
291 
292 
293 INST(template <typename T> class BdMatrix friend TMatrix<T> lu_solve(BdMatrix<T>&, const Matrix<T>&);)
294 template <typename T>
295 inline TMatrix<T> lu_solve(BdMatrix<T>& mat, const Matrix<T>& b)
296 {
297  lu_decomp(mat);
298  return LU_solve(mat, b);
299 }
300 
301 #if 0
302 /* These may be needed for expm */
303 INST(template <typename T> class BdMatrix friend TMatrix<T> LU_solve (const BdMatrix<T>&, const BdMatrix<T>&);)
304 template <typename T>
305 inline TMatrix<T> LU_solve (const BdMatrix<T>& lu, const BdMatrix<T>& b)
306 { return LU_solve (lu, Matrix<T>(b)); }
307 
308 INST(template <typename T> class BdMatrix friend TMatrix<T> lu_solve (BdMatrix<T>&, const BdMatrix<T>&);)
309 template <typename T>
310 inline TMatrix<T> lu_solve (BdMatrix<T>& mat, const BdMatrix<T>& b)
311 { lu_decomp (mat); return LU_solve (mat, b); }
312 #endif
313 
315 INST(template <typename T> class BdMatrix friend T LU_det(const BdMatrix<T>&);)
316 template <typename T>
317 T LU_det(const BdMatrix<T>& lu)
318 {
319  ALIGN2(REGISTER T tmp, MIN_ALIGN2) = lu.get(0, 0);
320  const unsigned lr = lu.rows();
321  for (unsigned i = 1; i < lr; ++i)
322  tmp *= lu.get(i, i);
323  return tmp;
324 }
325 
327 INST(template <typename T> class BdMatrix friend T lu_det(BdMatrix<T>&);)
328 template <typename T>
329 inline T lu_det(BdMatrix<T>& mat)
330 {
331  lu_decomp(mat);
332  return LU_det(mat);
333 }
334 
336 INST(template <typename T> class BdMatrix friend TMatrix<T> LU_invert(const BdMatrix<T>&);)
337 template <typename T>
339 {
340  TMatrix<T> inv ((T)0, lu.columns(), lu.rows());
341  const unsigned lr = lu.rows();
342  for (unsigned int j = 0; j < lr; ++j) {
343  Vector<T> v ((T)0, lr); /* lu.columns()); */
344  v(j) = (T)1;
345  Vector<T> y (LU_solve(lu, v));
346  inv.set_col(y, j);
347  }
348  return inv;
349 }
350 
351 
353 INST(template <typename T> class BdMatrix friend TMatrix<T> lu_invert(BdMatrix<T>&);)
354 template <typename T>
356 {
357  lu_decomp(mat);
358  return LU_invert(mat);
359 }
360 
362 
363 #endif /* TBCI_SOLVER_BD_LU_SOLVER_H */
const Vector< T > const Vector< T > & x
Definition: LM_fit.h:97
BdMatrix< T > & expand(unsigned=0)
Definition: band_matrix.h:1097
TVector< T > LU_fwd_subst(const BdMatrix< T > &lu, const Vector< T > &y)
Definition: bd_lu_solver.h:207
The class BdMatrix is an implementation to store and do operations on sparse Matrices with a band str...
Definition: band_matrix.h:103
#define REGISTER
Definition: basics.h:108
double fabs(const int a)
Definition: basics.h:1215
TVector< T > get_col(const unsigned int) const
Column vector.
Definition: matrix.h:627
#define MIN(a, b)
Definition: basics.h:655
F_TMatrix< double > inv(const F_Matrix< double > &A, int overwriteA=0)
Definition: lapack.cpp:181
#define NAMESPACE_TBCI
Definition: basics.h:317
F_TMatrix< double > lu_solve(const F_Matrix< double > &A, const F_Matrix< double > &B, int overwriteA=0)
Definition: lapack.cpp:156
tbci_traits< T >::const_refval_type get(const unsigned int, const unsigned int) const HOT
Definition: band_matrix.h:694
tbci_traits< T >::const_refval_type get(const unsigned long i) const
Definition: vector.h:190
#define MIN_ALIGN2
Definition: basics.h:424
unsigned int getmaxoff() const
max distance from main diagonal
Definition: band_matrix.h:254
#define EL_PER_CL(T)
Definition: perf_opt.h:172
unsigned int columns() const
number of columns
Definition: band_matrix.h:249
#define BCHK(cond, exc, txt, ind, rtval)
Definition: basics.h:575
#define UNLIKELY(expr)
Definition: basics.h:101
TVector< T > LU_bkw_subst(const BdMatrix< T > &lu, const Vector< T > &y)
Definition: bd_lu_solver.h:224
TMatrix< T > LU_invert(const BdMatrix< T > &lu)
return the inverse Matrix for an already LU decomposed BdMatrix
Definition: bd_lu_solver.h:338
T & set(const T &val, const unsigned long i) const
Definition: vector.h:194
exception class
Definition: band_matrix.h:46
int lu_decomp(Matrix< T > &) HOT
LU decomposes the TBCI::Matrix mat.
Definition: lu_solver.h:94
#define PREFETCH_R(addr, loc)
In case gcc does not yet support __builtin_prefetch(), we have handcoded assembly with gcc for a few ...
Definition: basics.h:748
unsigned int columns() const
number of columns
Definition: matrix.h:315
#define UNROLL_DEPTH
How many iters per loop (unrolling) Trade code bloat against speed.
Definition: perf_opt.h:146
T LU_det(const BdMatrix< T > &lu)
calculates determinant of an already LU decomposed BdMatrix
Definition: bd_lu_solver.h:317
F_TMatrix< T > b
Definition: f_matrix.h:736
#define USE_PLAIN_VEC_KERNELS
file perf_opt.h Default settings for optimum performance on different architectures and compilers...
Definition: perf_opt.h:114
#define BDMLU_MINVAL
Definition: bd_lu_solver.h:33
unsigned int rows() const
number of rows
Definition: band_matrix.h:247
unsigned long size() const
Definition: vector.h:104
Definition: bvector.h:49
#define ALIGN2(v, x)
Definition: basics.h:443
#define PREFETCH_W(addr, loc)
Definition: basics.h:749
#define INST(x)
Definition: basics.h:238
#define BDMLU_RES_WARN
Definition: bd_lu_solver.h:38
int i
Definition: LM_fit.h:71
#define STD__
Definition: basics.h:338
Temporary Base Class Idiom: Class TVector is used for temporary variables.
Definition: bvector.h:52
#define MAX(a, b)
Definition: basics.h:656
#define NAMESPACE_END
Definition: basics.h:323
BVector< T > & copy(const BVector< T > &bv)
copy does a resize, if necessary
Definition: bvector.h:484
Definition: bvector.h:54
TVector< T > LU_solve(const BdMatrix< T > &lu, const Vector< T > &b)
Solve the equation Ax = b where A IS already LU decomposed.
Definition: bd_lu_solver.h:246
const Vector< T > Vector< T > Vector< T > Vector< T > & y
Definition: LM_fit.h:172
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int res
Definition: LM_fit.h:199
#define T
Definition: bdmatlib.cc:20
BdMatrix< T > & setval(const T &v, const unsigned int r, const unsigned int c)
autoinsert, sparse class compatible
Definition: band_matrix.h:205
const Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > Vector< T > long int int char v
&lt; find minimun of func on grid with resolution res
Definition: LM_fit.h:205
TMatrix< T > lu_invert(BdMatrix< T > &mat)
return the inverse Matrix for a BdMatrix by doing an LU decomposition
Definition: bd_lu_solver.h:355
#define MATH__
Definition: basics.h:339
unsigned int rows() const
number of rows
Definition: matrix.h:317
T lu_det(BdMatrix< T > &mat)
calculates the determinant of a BdMatrix by doing an LU decomposition
Definition: bd_lu_solver.h:329