QUDA  v1.1.0
A library for QCD on GPUs
inv_ca_gcr.cpp
Go to the documentation of this file.
1 #include <invert_quda.h>
2 #include <blas_quda.h>
3 #include <eigen_helper.h>
4 
5 namespace quda {
6 
7  CAGCR::CAGCR(const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon,
8  const DiracMatrix &matEig, SolverParam &param, TimeProfile &profile) :
9  Solver(mat, matSloppy, matPrecon, matEig, param, profile),
10  matMdagM(matEig.Expose()),
11  init(false),
12  use_source(param.preserve_source == QUDA_PRESERVE_SOURCE_NO && param.precision == param.precision_sloppy
13  && param.use_init_guess == QUDA_USE_INIT_GUESS_NO && !param.deflate),
14  basis(param.ca_basis),
15  alpha(nullptr),
16  rp(nullptr),
17  tmpp(nullptr),
18  tmp_sloppy(nullptr)
19  {
20  }
21 
24 
25  if (init) {
26  if (alpha) delete []alpha;
27  if (basis == QUDA_POWER_BASIS) {
28  for (int i=0; i<param.Nkrylov+1; i++) if (i>0 || !use_source) delete p[i];
29  } else {
30  for (int i=0; i<param.Nkrylov; i++) if (i>0 || !use_source) delete p[i];
31  for (int i=0; i<param.Nkrylov; i++) delete q[i];
32  }
33  if (tmp_sloppy) delete tmp_sloppy;
34  if (tmpp) delete tmpp;
35  if (rp) delete rp;
36  }
37 
39 
41  }
42 
43  void CAGCR::create(ColorSpinorField &b)
44  {
45  if (!init) {
46  if (!param.is_preconditioner) {
47  blas::flops = 0;
48  profile.TPSTART(QUDA_PROFILE_INIT);
49  }
50 
51  alpha = new Complex[param.Nkrylov];
52 
53  bool mixed = param.precision != param.precision_sloppy;
54 
55  ColorSpinorParam csParam(b);
57 
58  // Source needs to be preserved if we're computing the true residual
59  rp = (mixed && !use_source) ? ColorSpinorField::Create(csParam) : nullptr;
61 
62  // now allocate sloppy fields
63  csParam.setPrecision(param.precision_sloppy);
64 
65  if (basis != QUDA_POWER_BASIS) {
66  warningQuda("CA-GCR does not support any basis besides QUDA_POWER_BASIS. Switching to QUDA_POWER_BASIS...\n");
67  basis = QUDA_POWER_BASIS;
68  }
69 
70  if (basis == QUDA_POWER_BASIS) {
71  // in power basis q[k] = p[k+1], so we don't need a separate q array
72  p.resize(param.Nkrylov+1);
73  q.resize(param.Nkrylov);
74  for (int i=0; i<param.Nkrylov+1; i++) {
75  p[i] = (i==0 && use_source) ? &b : ColorSpinorField::Create(csParam);
76  if (i>0) q[i-1] = p[i];
77  }
78  } else {
79  p.resize(param.Nkrylov);
80  q.resize(param.Nkrylov);
81  for (int i=0; i<param.Nkrylov; i++) {
82  p[i] = (i==0 && use_source) ? &b : ColorSpinorField::Create(csParam);
84  }
85  }
86 
87  //sloppy temporary for mat-vec
88  tmp_sloppy = mixed ? tmpp->CreateAlias(csParam) : nullptr;
89 
91 
92  init = true;
93  } // init
94  }
95 
96  void CAGCR::solve(Complex *psi_, std::vector<ColorSpinorField*> &q, ColorSpinorField &b)
97  {
98  typedef Matrix<Complex, Dynamic, Dynamic> matrix;
99  typedef Matrix<Complex, Dynamic, 1> vector;
100 
101  const int N = q.size();
102  vector phi(N), psi(N);
103  matrix A(N,N);
104 
105 #if 1
106  // only a single reduction but requires using the full dot product
107  // compute rhs vector phi = Q* b = (q_i, b)
108  std::vector<ColorSpinorField*> Q;
109  for (int i=0; i<N; i++) Q.push_back(q[i]);
110  Q.push_back(&b);
111 
112  // Construct the matrix Q* Q = (A P)* (A P) = (q_i, q_j) = (A p_i, A p_j)
113  Complex *A_ = new Complex[N*(N+1)];
114  blas::cDotProduct(A_, q, Q);
115  for (int i=0; i<N; i++) {
116  phi(i) = A_[i*(N+1)+N];
117  for (int j=0; j<N; j++) {
118  A(i,j) = A_[i*(N+1)+j];
119  }
120  }
121  delete[] A_;
122 #else
123  // two reductions but uses the Hermitian block dot product
124  // compute rhs vector phi = Q* b = (q_i, b)
125  std::vector<ColorSpinorField*> B;
126  B.push_back(&b);
127  Complex *phi_ = new Complex[N];
128  blas::cDotProduct(phi_,q, B);
129  for (int i=0; i<N; i++) phi(i) = phi_[i];
130  delete phi_;
131 
132  // Construct the matrix Q* Q = (A P)* (A P) = (q_i, q_j) = (A p_i, A p_j)
133  Complex *A_ = new Complex[N*N];
134  blas::hDotProduct(A_, q, q);
135  for (int i=0; i<N; i++)
136  for (int j=0; j<N; j++)
137  A(i,j) = A_[i*N+j];
138  delete[] A_;
139 #endif
140 
141  if (!param.is_preconditioner) {
144  profile.TPSTART(QUDA_PROFILE_EIGEN);
145  }
146 
147  // use Cholesky LDL since this seems plenty stable
148  LDLT<matrix> cholesky(A);
149  psi = cholesky.solve(phi);
150 
151  for (int i=0; i<N; i++) psi_[i] = psi(i);
152 
153  if (!param.is_preconditioner) {
154  profile.TPSTOP(QUDA_PROFILE_EIGEN);
156  profile.TPSTART(QUDA_PROFILE_COMPUTE);
157  }
158 
159  }
160 
161  /*
162  The main CA-GCR algorithm, which consists of three main steps:
163  1. Build basis vectors q_k = A p_k for k = 1..Nkrlylov
164  2. Minimize the residual in this basis
165  3. Update solution and residual vectors
166  4. (Optional) restart if convergence or maxiter not reached
167  */
169  {
170  const int n_krylov = param.Nkrylov;
171 
172  if (checkPrecision(x,b) != param.precision) errorQuda("Precision mismatch %d %d", checkPrecision(x,b), param.precision);
173  if (param.return_residual && param.preserve_source == QUDA_PRESERVE_SOURCE_YES) errorQuda("Cannot preserve source and return the residual");
174 
175  if (param.maxiter == 0 || n_krylov == 0) {
177  return;
178  }
179 
180  create(b);
181 
182  ColorSpinorField &r = rp ? *rp : *p[0];
183  ColorSpinorField &tmp = *tmpp;
184  ColorSpinorField &tmpSloppy = tmp_sloppy ? *tmp_sloppy : tmp;
185 
187 
188  // compute b2, but only if we need to
189  bool fixed_iteration = param.sloppy_converge && n_krylov == param.maxiter && !param.compute_true_res;
190  double b2 = !fixed_iteration ? blas::norm2(b) : 1.0;
191  double r2 = 0.0; // if zero source then we will exit immediately doing no work
192 
193  if (param.deflate) {
194  // Construct the eigensolver and deflation space if requested.
196  constructDeflationSpace(b, matMdagM);
197  } else {
198  // Use Arnoldi to inspect the space only and turn off deflation
200  param.deflate = false;
201  }
202  if (deflate_compute) {
203  // compute the deflation space.
205  (*eig_solve)(evecs, evals);
206  if (param.deflate) {
207  // double the size of the Krylov space
209  // populate extra memory with L/R singular vectors
210  eig_solve->computeSVD(matMdagM, evecs, evals);
211  }
213  deflate_compute = false;
214  }
215  if (recompute_evals) {
216  eig_solve->computeEvals(matMdagM, evecs, evals);
217  eig_solve->computeSVD(matMdagM, evecs, evals);
218  recompute_evals = false;
219  }
220  }
221 
222  // compute intitial residual depending on whether we have an initial guess or not
224  mat(r, x, tmp);
225  //r = b - Ax0
226  if (!fixed_iteration) {
227  r2 = blas::xmyNorm(b, r);
228  } else {
229  blas::xpay(b, -1.0, r);
230  r2 = b2; // dummy setting
231  }
232  } else {
233  r2 = b2;
234  blas::copy(r, b);
235  blas::zero(x);
236  }
237 
238  if (param.deflate && param.maxiter > 1) {
239  // Deflate: Hardcoded to SVD. If maxiter == 1, this is a dummy solve
240  eig_solve->deflateSVD(x, r, evecs, evals, true);
241 
242  // Compute r_defl = RHS - A * LHS
243  mat(r, x, tmp);
244  if (!fixed_iteration) {
245  r2 = blas::xmyNorm(b, r);
246  } else {
247  blas::xpay(b, -1.0, r);
248  r2 = b2; // dummy setting
249  }
250  }
251 
252  // Check to see that we're not trying to invert on a zero-field source
253  if (b2 == 0) {
255  warningQuda("inverting on zero-field source\n");
256  x = b;
257  param.true_res = 0.0;
258  param.true_res_hq = 0.0;
259  return;
260  } else {
261  b2 = r2;
262  }
263  }
264 
265  double stop = !fixed_iteration ? stopping(param.tol, b2, param.residual_type) : 0.0; // stopping condition of solver
266 
267  const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;
268 
269  // this parameter determines how many consective reliable update
270  // reisudal increases we tolerate before terminating the solver,
271  // i.e., how long do we want to keep trying to converge
272  const int maxResIncrease = param.max_res_increase; // check if we reached the limit of our tolerance
273  const int maxResIncreaseTotal = param.max_res_increase_total;
274 
275  double heavy_quark_res = 0.0; // heavy quark residual
276  if(use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x,r).z);
277 
278  int resIncrease = 0;
279  int resIncreaseTotal = 0;
280 
281  if (!param.is_preconditioner) {
282  blas::flops = 0;
284  profile.TPSTART(QUDA_PROFILE_COMPUTE);
285  }
286  int total_iter = 0;
287  int restart = 0;
288  double r2_old = r2;
289  double maxr_deflate = sqrt(r2);
290  bool l2_converge = false;
291 
292  blas::copy(*p[0], r); // no op if uni-precision
293 
294  PrintStats("CA-GCR", total_iter, r2, b2, heavy_quark_res);
295  while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) && total_iter < param.maxiter) {
296 
297  // build up a space of size n_krylov
298  for (int k = 0; k < n_krylov; k++) {
299  matSloppy(*q[k], *p[k], tmpSloppy);
300  if (k < n_krylov - 1 && basis != QUDA_POWER_BASIS) blas::copy(*p[k + 1], *q[k]);
301  }
302 
303  solve(alpha, q, *p[0]);
304 
305  // update the solution vector
306  std::vector<ColorSpinorField*> X;
307  X.push_back(&x);
308  // need to make sure P is only length n_krylov
309  std::vector<ColorSpinorField*> P;
310  for (int i = 0; i < n_krylov; i++) P.push_back(p[i]);
311  blas::caxpy(alpha, P, X);
312 
313  // no need to compute residual vector if not returning
314  // residual vector and only doing a single fixed iteration
315  if (!fixed_iteration || param.return_residual) {
316  // update the residual vector
317  std::vector<ColorSpinorField*> R;
318  R.push_back(&r);
319  for (int i = 0; i < n_krylov; i++) alpha[i] = -alpha[i];
320  blas::caxpy(alpha, q, R);
321  }
322 
323  total_iter += n_krylov;
324  if ( !fixed_iteration || getVerbosity() >= QUDA_DEBUG_VERBOSE) {
325  // only compute the residual norm if we need to
326  r2 = blas::norm2(r);
327  }
328 
329  PrintStats("CA-GCR", total_iter, r2, b2, heavy_quark_res);
330 
331  // update since n_krylov or maxiter reached, converged or reliable update required
332  // note that the heavy quark residual will by definition only be checked every n_krylov steps
333  if (total_iter>=param.maxiter || (r2 < stop && !l2_converge) || sqrt(r2/r2_old) < param.delta) {
334 
335  if ( (r2 < stop || total_iter>=param.maxiter) && param.sloppy_converge) break;
336  mat(r, x, tmp);
337  r2 = blas::xmyNorm(b, r);
338 
339  if (param.deflate && sqrt(r2) < maxr_deflate * param.tol_restart) {
340  // Deflate and accumulate to solution vector
341  eig_solve->deflateSVD(x, r, evecs, evals, true);
342 
343  // Compute r_defl = RHS - A * LHS
344  mat(r, x, tmp);
345  r2 = blas::xmyNorm(b, r);
346 
347  maxr_deflate = sqrt(r2);
348  }
349 
350  if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r).z);
351 
352  // break-out check if we have reached the limit of the precision
353  if (r2 > r2_old) {
354  resIncrease++;
355  resIncreaseTotal++;
356  warningQuda("CA-GCR: new reliable residual norm %e is greater than previous reliable residual norm %e (total #inc %i)",
357  sqrt(r2), sqrt(r2_old), resIncreaseTotal);
358  if (resIncrease > maxResIncrease or resIncreaseTotal > maxResIncreaseTotal) {
359  warningQuda("CA-GCR: solver exiting due to too many true residual norm increases");
360  break;
361  }
362  } else {
363  resIncrease = 0;
364  }
365 
366  r2_old = r2;
367  }
368 
369  // No matter what, if we haven't converged, we do a restart.
370  if ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) ) {
371  restart++; // restarting if residual is still too great
372 
373  PrintStats("CA-GCR (restart)", restart, r2, b2, heavy_quark_res);
374  blas::copy(*p[0],r); // no-op if uni-precision
375 
376  r2_old = r2;
377 
378  // prevent ending the Krylov space prematurely if other convergence criteria not met
379  if (r2 < stop) l2_converge = true;
380  }
381 
382  }
383 
384  if (total_iter>param.maxiter && getVerbosity() >= QUDA_SUMMARIZE)
385  warningQuda("Exceeded maximum iterations %d", param.maxiter);
386 
387  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("CA-GCR: number of restarts = %d\n", restart);
388 
389  if (param.compute_true_res) {
390  // Calculate the true residual
391  mat(r, x, tmp);
392  double true_res = blas::xmyNorm(b, r);
393  param.true_res = sqrt(true_res / b2);
395  if (param.return_residual) blas::copy(b, r);
396  } else {
397  if (param.return_residual) blas::copy(b, r);
398  }
399 
400  if (!param.is_preconditioner) {
401  qudaDeviceSynchronize(); // ensure solver is complete before ending timing
405 
406  // store flops and reset counters
407  double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matMdagM.flops()) * 1e-9;
408 
409  param.gflops += gflops;
410  param.iter += total_iter;
411 
412  // reset the flops counters
413  blas::flops = 0;
414  mat.flops();
415  matSloppy.flops();
416  matPrecon.flops();
417  matMdagM.flops();
418 
420  }
421 
422  PrintSummary("CA-GCR", total_iter, r2, b2, stop, param.tol_hq);
423  }
424 
425 } // namespace quda
CAGCR(const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon, const DiracMatrix &matEig, SolverParam &param, TimeProfile &profile)
Definition: inv_ca_gcr.cpp:7
void operator()(ColorSpinorField &out, ColorSpinorField &in)
Definition: inv_ca_gcr.cpp:168
virtual ~CAGCR()
Definition: inv_ca_gcr.cpp:22
ColorSpinorField * CreateAlias(const ColorSpinorParam &param)
Create a field that aliases this field's storage. The alias field can use a different precision than ...
static ColorSpinorField * Create(const ColorSpinorParam &param)
unsigned long long flops() const
Definition: dirac_quda.h:1909
void deflateSVD(std::vector< ColorSpinorField * > &sol, const std::vector< ColorSpinorField * > &vec, const std::vector< ColorSpinorField * > &evecs, const std::vector< Complex > &evals, bool accumulate=false) const
Deflate a set of source vectors with a set of left and right singular vectors.
void computeEvals(const DiracMatrix &mat, std::vector< ColorSpinorField * > &evecs, std::vector< Complex > &evals, int size)
Compute eigenvalues and their residiua.
void computeSVD(const DiracMatrix &mat, std::vector< ColorSpinorField * > &evecs, std::vector< Complex > &evals)
Computes Left/Right SVD from pre computed Right/Left.
bool deflate_compute
Definition: invert_quda.h:475
TimeProfile & profile
Definition: invert_quda.h:471
const DiracMatrix & mat
Definition: invert_quda.h:465
bool convergence(double r2, double hq2, double r2_tol, double hq_tol)
Definition: solver.cpp:328
bool recompute_evals
Definition: invert_quda.h:476
std::vector< ColorSpinorField * > evecs
Definition: invert_quda.h:477
void destroyDeflationSpace()
Destroy the allocated deflation space.
Definition: solver.cpp:229
void PrintSummary(const char *name, int k, double r2, double b2, double r2_tol, double hq_tol)
Prints out the summary of the solver convergence (requires a verbosity of QUDA_SUMMARIZE)....
Definition: solver.cpp:386
SolverParam & param
Definition: invert_quda.h:470
static double stopping(double tol, double b2, QudaResidualType residual_type)
Set the solver L2 stopping condition.
Definition: solver.cpp:311
void extendSVDDeflationSpace()
Extends the deflation space to twice its size for SVD deflation.
Definition: solver.cpp:287
std::vector< Complex > evals
Definition: invert_quda.h:478
EigenSolver * eig_solve
Definition: invert_quda.h:473
void PrintStats(const char *name, int k, double r2, double b2, double hq2)
Prints out the running statistics of the solver (requires a verbosity of QUDA_VERBOSE)
Definition: solver.cpp:373
void constructDeflationSpace(const ColorSpinorField &meta, const DiracMatrix &mat)
Constructs the deflation space and eigensolver.
Definition: solver.cpp:168
const DiracMatrix & matPrecon
Definition: invert_quda.h:467
const DiracMatrix & matSloppy
Definition: invert_quda.h:466
double Last(QudaProfileType idx)
Definition: timer.h:254
QudaCABasis ca_basis
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:34
@ QUDA_POWER_BASIS
Definition: enum_quda.h:201
@ QUDA_USE_INIT_GUESS_NO
Definition: enum_quda.h:429
@ QUDA_USE_INIT_GUESS_YES
Definition: enum_quda.h:430
@ QUDA_DEBUG_VERBOSE
Definition: enum_quda.h:268
@ QUDA_SUMMARIZE
Definition: enum_quda.h:266
@ QUDA_VERBOSE
Definition: enum_quda.h:267
@ QUDA_HEAVY_QUARK_RESIDUAL
Definition: enum_quda.h:195
@ QUDA_EIG_BLK_TR_LANCZOS
Definition: enum_quda.h:138
@ QUDA_EIG_TR_LANCZOS
Definition: enum_quda.h:137
@ QUDA_PRESERVE_SOURCE_NO
Definition: enum_quda.h:238
@ QUDA_PRESERVE_SOURCE_YES
Definition: enum_quda.h:239
@ QUDA_NULL_FIELD_CREATE
Definition: enum_quda.h:360
@ QUDA_COMPUTE_NULL_VECTOR_NO
Definition: enum_quda.h:441
#define checkPrecision(...)
void init()
Create the BLAS context.
double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r)
double xmyNorm(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:79
unsigned long long flops
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
Definition: blas_quda.h:45
void hDotProduct(Complex *result, std::vector< ColorSpinorField * > &a, std::vector< ColorSpinorField * > &b)
Computes the matrix of inner products between the vector set a and the vector set b....
void zero(ColorSpinorField &a)
double norm2(const ColorSpinorField &a)
void caxpy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: blas_quda.h:24
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
void stop()
Stop profiling.
Definition: device.cpp:228
std::complex< double > Complex
Definition: quda_internal.h:86
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
@ QUDA_PROFILE_INIT
Definition: timer.h:106
@ QUDA_PROFILE_EPILOGUE
Definition: timer.h:110
@ QUDA_PROFILE_COMPUTE
Definition: timer.h:108
@ QUDA_PROFILE_FREE
Definition: timer.h:111
@ QUDA_PROFILE_PREAMBLE
Definition: timer.h:107
@ QUDA_PROFILE_EIGEN
Definition: timer.h:114
ColorSpinorParam csParam
Definition: pack_test.cpp:25
QudaGaugeParam param
Definition: pack_test.cpp:18
#define qudaDeviceSynchronize()
Definition: quda_api.h:250
QudaEigType eig_type
Definition: quda.h:416
QudaPreserveSource preserve_source
Definition: invert_quda.h:151
QudaPrecision precision
Definition: invert_quda.h:136
QudaComputeNullVector compute_null_vector
Definition: invert_quda.h:61
bool is_preconditioner
verbosity to use for preconditioner
Definition: invert_quda.h:238
int max_res_increase_total
Definition: invert_quda.h:90
QudaResidualType residual_type
Definition: invert_quda.h:49
QudaPrecision precision_sloppy
Definition: invert_quda.h:139
QudaEigParam eig_param
Definition: invert_quda.h:55
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:58
#define printfQuda(...)
Definition: util_quda.h:114
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define warningQuda(...)
Definition: util_quda.h:132
#define errorQuda(...)
Definition: util_quda.h:120