QUDA  v1.1.0
A library for QCD on GPUs
inv_ca_cg.cpp
Go to the documentation of this file.
1 #include <invert_quda.h>
2 #include <blas_quda.h>
3 #include <eigen_helper.h>
4 
13 namespace quda {
14 
15  CACG::CACG(const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon,
16  const DiracMatrix &matEig, SolverParam &param, TimeProfile &profile) :
17  Solver(mat, matSloppy, matPrecon, matEig, param, profile),
18  init(false),
19  lambda_init(false),
20  basis(param.ca_basis),
21  Q_AQandg(nullptr),
22  Q_AS(nullptr),
23  alpha(nullptr),
24  beta(nullptr),
25  rp(nullptr),
26  tmpp(nullptr),
27  tmpp2(nullptr),
28  tmp_sloppy(nullptr),
29  tmp_sloppy2(nullptr)
30  {
31  }
32 
35 
36  if (init) {
37  if (Q_AQandg) delete []Q_AQandg;
38  if (Q_AS) delete []Q_AS;
39  if (alpha) delete []alpha;
40  if (beta) delete []beta;
41 
42  bool use_source = false; // needed for explicit residual recompute
43  // (param.preserve_source == QUDA_PRESERVE_SOURCE_NO &&
44  // param.precision == param.precision_sloppy &&
45  // param.use_init_guess == QUDA_USE_INIT_GUESS_NO);
46  if (basis == QUDA_POWER_BASIS) {
47  for (int i=0; i<param.Nkrylov+1; i++) if (i>0 || !use_source) delete S[i];
48  } else {
49  for (int i=0; i<param.Nkrylov; i++) if (i>0 || !use_source) delete S[i];
50  for (int i=0; i<param.Nkrylov; i++) delete AS[i];
51  }
52  for (int i=0; i<param.Nkrylov; i++) delete Q[i];
53  for (int i=0; i<param.Nkrylov; i++) delete Qtmp[i];
54  for (int i=0; i<param.Nkrylov; i++) delete AQ[i];
55 
56  if (tmp_sloppy) delete tmp_sloppy;
57  if (tmp_sloppy2) delete tmp_sloppy2;
58  if (tmpp) delete tmpp;
59  if (tmpp2) delete tmpp2;
60  if (rp) delete rp;
61  }
62 
64 
66  }
67 
68  CACGNE::CACGNE(const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon,
69  const DiracMatrix &matEig, SolverParam &param, TimeProfile &profile) :
70  CACG(mmdag, mmdagSloppy, mmdagPrecon, mmdagEig, param, profile),
71  mmdag(mat.Expose()),
72  mmdagSloppy(matSloppy.Expose()),
73  mmdagPrecon(matPrecon.Expose()),
74  mmdagEig(matEig.Expose()),
75  xp(nullptr),
76  yp(nullptr),
77  init(false)
78  {
79  }
80 
82  if ( init ) {
83  if (xp) delete xp;
84  if (yp) delete yp;
85  init = false;
86  }
87  }
88 
89  // CACGNE: M Mdag y = b is solved; x = Mdag y is returned as solution.
91  if (param.maxiter == 0 || param.Nsteps == 0) {
93  return;
94  }
95 
96  const int iter0 = param.iter;
97 
98  if (!init) {
104  init = true;
105  }
106 
107  double b2 = blas::norm2(b);
108 
110 
111  // compute initial residual
112  mmdag.Expose()->M(*xp,x);
113  double r2 = blas::xmyNorm(b,*xp);
114  if (b2 == 0.0) b2 = r2;
115 
116  // compute solution to residual equation
117  CACG::operator()(*yp,*xp);
118 
119  mmdag.Expose()->Mdag(*xp,*yp);
120 
121  // compute full solution
122  blas::xpy(*xp, x);
123 
124  } else {
125 
126  CACG::operator()(*yp,b);
127  mmdag.Expose()->Mdag(x,*yp);
128 
129  }
130 
131  // future optimization: with preserve_source == QUDA_PRESERVE_SOURCE_NO; b is already
132  // expected to be the CG residual which matches the CGNE residual
133  // (but only with zero initial guess). at the moment, CG does not respect this convention
135 
136  // compute the true residual
137  mmdag.Expose()->M(*xp, x);
138 
141  blas::axpby(-1.0, A, 1.0, B);
142 
143  double r2;
145  double3 h3 = blas::HeavyQuarkResidualNorm(x, B);
146  r2 = h3.y;
147  param.true_res_hq = sqrt(h3.z);
148  } else {
149  r2 = blas::norm2(B);
150  }
151  param.true_res = sqrt(r2 / b2);
152 
153  PrintSummary("CA-CGNE", param.iter - iter0, r2, b2, stopping(param.tol, b2, param.residual_type), param.tol_hq);
154  }
155 
156  }
157 
158  CACGNR::CACGNR(const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon,
159  const DiracMatrix &matEig, SolverParam &param, TimeProfile &profile) :
160  CACG(mdagm, mdagmSloppy, mdagmPrecon, mdagmEig, param, profile),
161  mdagm(mat.Expose()),
162  mdagmSloppy(matSloppy.Expose()),
163  mdagmPrecon(matPrecon.Expose()),
164  mdagmEig(matEig.Expose()),
165  bp(nullptr),
166  init(false)
167  {
168  }
169 
171  if ( init ) {
172  if (bp) delete bp;
173  init = false;
174  }
175  }
176 
177  // CACGNR: Mdag M x = Mdag b is solved.
179  if (param.maxiter == 0 || param.Nsteps == 0) {
181  return;
182  }
183 
184  const int iter0 = param.iter;
185 
186  if (!init) {
190 
191  init = true;
192  }
193 
194  double b2 = blas::norm2(b);
195  if (b2 == 0.0) { // compute initial residual vector
196  mdagm.Expose()->M(*bp,x);
197  b2 = blas::norm2(*bp);
198  }
199 
200  mdagm.Expose()->Mdag(*bp,b);
201  CACG::operator()(x,*bp);
202 
204 
205  // compute the true residual
206  mdagm.Expose()->M(*bp, x);
207 
210  blas::axpby(-1.0, A, 1.0, B);
211 
212  double r2;
214  double3 h3 = blas::HeavyQuarkResidualNorm(x, B);
215  r2 = h3.y;
216  param.true_res_hq = sqrt(h3.z);
217  } else {
218  r2 = blas::norm2(B);
219  }
220  param.true_res = sqrt(r2 / b2);
221  PrintSummary("CA-CGNR", param.iter - iter0, r2, b2, stopping(param.tol, b2, param.residual_type), param.tol_hq);
222 
224  mdagm.Expose()->M(*bp, x);
225  blas::axpby(-1.0, *bp, 1.0, b);
226  }
227 
228  }
229 
230  void CACG::create(ColorSpinorField &b)
231  {
232  if (!init) {
233  if (!param.is_preconditioner) {
234  blas::flops = 0;
235  profile.TPSTART(QUDA_PROFILE_INIT);
236  }
237 
238  Q_AQandg = new Complex[param.Nkrylov*(param.Nkrylov+1)];
239  Q_AS = new Complex[param.Nkrylov*param.Nkrylov];
240  alpha = new Complex[param.Nkrylov];
241  beta = new Complex[param.Nkrylov*param.Nkrylov];
242 
243  bool mixed = param.precision != param.precision_sloppy;
244  bool use_source = false; // need to preserve source for residual computation
245  //(param.preserve_source == QUDA_PRESERVE_SOURCE_NO && !mixed &&
246  // param.use_init_guess == QUDA_USE_INIT_GUESS_NO);
247 
248  ColorSpinorParam csParam(b);
250 
251  // Source needs to be preserved if we're computing the true residual
252  rp = (mixed && !use_source) ? ColorSpinorField::Create(csParam) : nullptr;
255 
256  // now allocate sloppy fields
257  csParam.setPrecision(param.precision_sloppy);
258 
259  if (basis == QUDA_POWER_BASIS) {
260  // in power basis AS[k] = S[k+1], so we don't need a separate array
261  S.resize(param.Nkrylov+1);
262  AS.resize(param.Nkrylov);
263  Q.resize(param.Nkrylov);
264  AQ.resize(param.Nkrylov);
265  Qtmp.resize(param.Nkrylov); // for pointer swap
266  for (int i=0; i<param.Nkrylov+1; i++) {
267  S[i] = (i==0 && use_source) ? &b : ColorSpinorField::Create(csParam);
268  if (i>0) AS[i-1] = S[i];
269  }
270  } else {
271  S.resize(param.Nkrylov);
272  AS.resize(param.Nkrylov);
273  Q.resize(param.Nkrylov);
274  AQ.resize(param.Nkrylov);
275  Qtmp.resize(param.Nkrylov);
276  for (int i=0; i<param.Nkrylov; i++) {
277  S[i] = (i==0 && use_source) ? &b : ColorSpinorField::Create(csParam);
279  }
280  }
281 
282  for (int i=0; i<param.Nkrylov; i++) Q[i] = ColorSpinorField::Create(csParam);
283  for (int i=0; i<param.Nkrylov; i++) Qtmp[i] = ColorSpinorField::Create(csParam);
284  for (int i=0; i<param.Nkrylov; i++) AQ[i] = ColorSpinorField::Create(csParam);
285 
286  //sloppy temporary for mat-vec
287  tmp_sloppy = mixed ? ColorSpinorField::Create(csParam) : nullptr;
288  tmp_sloppy2 = mixed ? ColorSpinorField::Create(csParam) : nullptr;
289 
291 
292  init = true;
293  } // init
294  }
295 
296  // template!
297  template <int N>
298  void compute_alpha_N(Complex* Q_AQandg, Complex* alpha)
299  {
300  typedef Matrix<Complex, N, N, RowMajor> matrix;
302 
303  matrix matQ_AQ(N,N);
304  vector vecg(N);
305  for (int i=0; i<N; i++) {
306  vecg(i) = Q_AQandg[i*(N+1)+N];
307  for (int j=0; j<N; j++) {
308  matQ_AQ(i,j) = Q_AQandg[i*(N+1)+j];
309  }
310  }
311  Map<vector> vecalpha(alpha,N);
312 
313  vecalpha = matQ_AQ.fullPivLu().solve(vecg);
314 
315  //JacobiSVD<matrix> svd(A, ComputeThinU | ComputeThinV);
316  //psi = svd.solve(phi);
317  }
318 
319  void CACG::compute_alpha()
320  {
321  if (!param.is_preconditioner) {
324  profile.TPSTART(QUDA_PROFILE_EIGEN);
325  }
326 
327  const int N = Q.size();
328  switch (N) {
329 #if 0 // since CA-CG is not used anywhere at the moment, no point paying for this compilation cost
330  case 1: compute_alpha_N<1>(Q_AQandg, alpha); break;
331  case 2: compute_alpha_N<2>(Q_AQandg, alpha); break;
332  case 3: compute_alpha_N<3>(Q_AQandg, alpha); break;
333  case 4: compute_alpha_N<4>(Q_AQandg, alpha); break;
334  case 5: compute_alpha_N<5>(Q_AQandg, alpha); break;
335  case 6: compute_alpha_N<6>(Q_AQandg, alpha); break;
336  case 7: compute_alpha_N<7>(Q_AQandg, alpha); break;
337  case 8: compute_alpha_N<8>(Q_AQandg, alpha); break;
338  case 9: compute_alpha_N<9>(Q_AQandg, alpha); break;
339  case 10: compute_alpha_N<10>(Q_AQandg, alpha); break;
340  case 11: compute_alpha_N<11>(Q_AQandg, alpha); break;
341  case 12: compute_alpha_N<12>(Q_AQandg, alpha); break;
342 #endif
343  default: // failsafe
345  typedef Matrix<Complex, Dynamic, 1> vector;
346 
347  const int N = Q.size();
348  matrix matQ_AQ(N, N);
349  vector vecg(N);
350  for (int i = 0; i < N; i++) {
351  vecg(i) = Q_AQandg[i * (N + 1) + N];
352  for (int j = 0; j < N; j++) { matQ_AQ(i, j) = Q_AQandg[i * (N + 1) + j]; }
353  }
354  Map<vector> vecalpha(alpha, N);
355 
356  vecalpha = matQ_AQ.fullPivLu().solve(vecg);
357 
358  // JacobiSVD<matrix> svd(A, ComputeThinU | ComputeThinV);
359  // psi = svd.solve(phi);
360  break;
361  }
362 
363  if (!param.is_preconditioner) {
364  profile.TPSTOP(QUDA_PROFILE_EIGEN);
366  profile.TPSTART(QUDA_PROFILE_COMPUTE);
367  }
368  }
369 
370  // template!
371  template <int N>
372  void compute_beta_N(Complex* Q_AQandg, Complex* Q_AS, Complex* beta)
373  {
374  typedef Matrix<Complex, N, N, RowMajor> matrix;
375 
376  matrix matQ_AQ(N,N);
377  for (int i=0; i<N; i++) {
378  for (int j=0; j<N; j++) {
379  matQ_AQ(i,j) = Q_AQandg[i*(N+1)+j];
380  }
381  }
382  Map<matrix> matQ_AS(Q_AS,N,N), matbeta(beta,N,N);
383 
384  matQ_AQ = -matQ_AQ;
385  matbeta = matQ_AQ.fullPivLu().solve(matQ_AS);
386 
387  //JacobiSVD<matrix> svd(A, ComputeThinU | ComputeThinV);
388  //psi = svd.solve(phi);
389  }
390 
391  void CACG::compute_beta()
392  {
393  if (!param.is_preconditioner) {
396  profile.TPSTART(QUDA_PROFILE_EIGEN);
397  }
398 
399  const int N = Q.size();
400  switch (N) {
401 #if 0 // since CA-CG is not used anywhere at the moment, no point paying for this compilation cost
402  case 1: compute_beta_N<1>(Q_AQandg, Q_AS, beta); break;
403  case 2: compute_beta_N<2>(Q_AQandg, Q_AS, beta); break;
404  case 3: compute_beta_N<3>(Q_AQandg, Q_AS, beta); break;
405  case 4: compute_beta_N<4>(Q_AQandg, Q_AS, beta); break;
406  case 5: compute_beta_N<5>(Q_AQandg, Q_AS, beta); break;
407  case 6: compute_beta_N<6>(Q_AQandg, Q_AS, beta); break;
408  case 7: compute_beta_N<7>(Q_AQandg, Q_AS, beta); break;
409  case 8: compute_beta_N<8>(Q_AQandg, Q_AS, beta); break;
410  case 9: compute_beta_N<9>(Q_AQandg, Q_AS, beta); break;
411  case 10: compute_beta_N<10>(Q_AQandg, Q_AS, beta); break;
412  case 11: compute_beta_N<11>(Q_AQandg, Q_AS, beta); break;
413  case 12: compute_beta_N<12>(Q_AQandg, Q_AS, beta); break;
414 #endif
415  default: // failsafe
417 
418  const int N = Q.size();
419  matrix matQ_AQ(N, N);
420  for (int i = 0; i < N; i++) {
421  for (int j = 0; j < N; j++) { matQ_AQ(i, j) = Q_AQandg[i * (N + 1) + j]; }
422  }
423  Map<matrix> matQ_AS(Q_AS, N, N), matbeta(beta, N, N);
424 
425  matQ_AQ = -matQ_AQ;
426  matbeta = matQ_AQ.fullPivLu().solve(matQ_AS);
427 
428  // JacobiSVD<matrix> svd(A, ComputeThinU | ComputeThinV);
429  // psi = svd.solve(phi);
430  }
431 
432  if (!param.is_preconditioner) {
433  profile.TPSTOP(QUDA_PROFILE_EIGEN);
435  profile.TPSTART(QUDA_PROFILE_COMPUTE);
436  }
437  }
438 
439  // Code to check for reliable updates
440  int CACG::reliable(double &rNorm, double &maxrr, int &rUpdate, const double &r2, const double &delta) {
441  // reliable updates
442  rNorm = sqrt(r2);
443  if (rNorm > maxrr) maxrr = rNorm;
444 
445  int updateR = (rNorm < delta*maxrr) ? 1 : 0;
446 
447  if (updateR) {
448  rUpdate++;
449  rNorm = sqrt(r2);
450  maxrr = rNorm;
451  }
452 
453  //printfQuda("Reliable triggered: %d %e\n", updateR, rNorm);
454 
455  return updateR;
456  }
457 
458  /*
459  The main CA-CG algorithm, which consists of three main steps:
460  1. Build basis vectors q_k = A p_k for k = 1..Nkrlylov
461  2. Steepest descent minmization of the residual in this basis
462  3. Update solution and residual vectors
463  */
465  {
466  const int n_krylov = param.Nkrylov;
467 
468  if (checkPrecision(x,b) != param.precision) errorQuda("Precision mismatch %d %d", checkPrecision(x,b), param.precision);
469  if (param.return_residual && param.preserve_source == QUDA_PRESERVE_SOURCE_YES) errorQuda("Cannot preserve source and return the residual");
470 
471  if (param.maxiter == 0 || n_krylov == 0) {
473  return;
474  }
475 
476  create(b);
477 
478  ColorSpinorField &r_ = rp ? *rp : *S[0];
479  ColorSpinorField &tmp = *tmpp;
480  ColorSpinorField &tmp2 = *tmpp2;
481  ColorSpinorField &tmpSloppy = tmp_sloppy ? *tmp_sloppy : tmp;
482  ColorSpinorField &tmpSloppy2 = tmp_sloppy2 ? *tmp_sloppy2 : tmp2;
483 
485 
486  // compute b2, but only if we need to
487  bool fixed_iteration = param.sloppy_converge && n_krylov == param.maxiter && !param.compute_true_res;
488  double b2 = !fixed_iteration ? blas::norm2(b) : 1.0;
489  double r2 = 0.0; // if zero source then we will exit immediately doing no work
490 
491  if (param.deflate) {
492  // Construct the eigensolver and deflation space.
494  if (deflate_compute) {
495  // compute the deflation space.
497  (*eig_solve)(evecs, evals);
499  deflate_compute = false;
500  }
501  if (recompute_evals) {
503  recompute_evals = false;
504  }
505  }
506 
507  // compute intitial residual depending on whether we have an initial guess or not
509  mat(r_, x, tmp, tmp2);
510  //r = b - Ax0
511  if (!fixed_iteration) {
512  r2 = blas::xmyNorm(b, r_);
513  } else {
514  blas::xpay(b, -1.0, r_);
515  r2 = b2; // dummy setting
516  }
517  } else {
518  r2 = b2;
519  blas::copy(r_, b);
520  blas::zero(x);
521  }
522 
523  if (param.deflate && param.maxiter > 1) {
524  // Deflate and add solution to accumulator
525  eig_solve->deflate(x, r_, evecs, evals, true);
526 
527  mat(r_, x, tmp, tmp2);
528  if (!fixed_iteration) {
529  r2 = blas::xmyNorm(b, r_);
530  } else {
531  blas::xpay(b, -1.0, r_);
532  r2 = b2; // dummy setting
533  }
534  }
535 
536  // Use power iterations to approx lambda_max
537  auto &lambda_min = param.ca_lambda_min;
538  auto &lambda_max = param.ca_lambda_max;
539 
540  if (basis == QUDA_CHEBYSHEV_BASIS && lambda_max < lambda_min && !lambda_init) {
542 
543  *Q[0] = r_; // do power iterations on this
544  // Do 100 iterations, normalize every 10.
545  for (int i = 0; i < 10; i++) {
546  double tmpnrm = blas::norm2(*Q[0]);
547  blas::ax(1.0/sqrt(tmpnrm), *Q[0]);
548  for (int j = 0; j < 10; j++) {
549  matSloppy(*AQ[0], *Q[0], tmpSloppy, tmpSloppy2);
550  if (j == 0 && getVerbosity() >= QUDA_VERBOSE) {
551  printfQuda("Current Rayleigh Quotient step %d is %e\n", i*10+1, sqrt(blas::norm2(*AQ[0])));
552  }
553  std::swap(AQ[0], Q[0]);
554  }
555  }
556  // Get Rayleigh quotient
557  double tmpnrm = blas::norm2(*Q[0]);
558  blas::ax(1.0/sqrt(tmpnrm), *Q[0]);
559  matSloppy(*AQ[0], *Q[0], tmpSloppy, tmpSloppy2);
560  lambda_max = 1.1*(sqrt(blas::norm2(*AQ[0])));
561  if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("CA-CG Approximate lambda max = 1.1 x %e\n", lambda_max/1.1);
562  lambda_init = true;
563 
565  }
566 
567  // Check to see that we're not trying to invert on a zero-field source
568  if (b2 == 0) {
570  warningQuda("inverting on zero-field source\n");
571  x = b;
572  param.true_res = 0.0;
573  param.true_res_hq = 0.0;
574  return;
575  } else {
576  b2 = r2;
577  }
578  }
579 
580  double stop = !fixed_iteration ? stopping(param.tol, b2, param.residual_type) : 0.0; // stopping condition of solver
581 
582  const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;
583 
584  // this parameter determines how many consective reliable update
585  // reisudal increases we tolerate before terminating the solver,
586  // i.e., how long do we want to keep trying to converge
587  const int maxResIncrease = param.max_res_increase; // check if we reached the limit of our tolerance
588  const int maxResIncreaseTotal = param.max_res_increase_total;
589 
590  double heavy_quark_res = 0.0; // heavy quark residual
591  if(use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x,r_).z);
592 
593  int resIncrease = 0;
594  int resIncreaseTotal = 0;
595 
596  if (!param.is_preconditioner) {
597  blas::flops = 0;
599  profile.TPSTART(QUDA_PROFILE_COMPUTE);
600  }
601  int total_iter = 0;
602  double r2_old = r2;
603  bool l2_converge = false;
604 
605  // Various variables related to reliable updates.
606  int rUpdate = 0; // count reliable updates.
607  double delta = param.delta; // delta for reliable updates.
608  double rNorm = sqrt(r2); // The current residual norm.
609  double maxrr = rNorm; // The maximum residual norm since the last reliable update.
610  double maxr_deflate = rNorm; // The maximum residual since the last deflation
611 
612  // Factors which map linear operator onto [-1,1]
613  double m_map = 2./(lambda_max-lambda_min);
614  double b_map = -(lambda_max+lambda_min)/(lambda_max-lambda_min);
615 
616  blas::copy(*S[0], r_); // no op if uni-precision
617 
618  PrintStats("CA-CG", total_iter, r2, b2, heavy_quark_res);
619  while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) && total_iter < param.maxiter) {
620 
621  // build up a space of size n_krylov
622  if (basis == QUDA_POWER_BASIS) {
623  for (int k = 0; k < n_krylov; k++) { matSloppy(*AS[k], *S[k], tmpSloppy, tmpSloppy2); }
624  } else { // chebyshev basis
625 
626  matSloppy(*AS[0], *S[0], tmpSloppy, tmpSloppy2);
627 
628  if (n_krylov > 1) {
629  // S_1 = m AS_0 + b S_0
630  Complex facs1[] = { m_map, b_map };
631  std::vector<ColorSpinorField*> recur1{AS[0],S[0]};
632  std::vector<ColorSpinorField*> S1{S[1]};
633  blas::zero(*S[1]);
634  blas::caxpy(facs1,recur1,S1);
635  matSloppy(*AS[1], *S[1], tmpSloppy, tmpSloppy2);
636 
637  // Enter recursion relation
638  if (n_krylov > 2) {
639  // S_k = 2 m AS_{k-1} + 2 b S_{k-1} - S_{k-2}
640  Complex factors[] = { 2.*m_map, 2.*b_map, -1 };
641  for (int k = 2; k < n_krylov; k++) {
642  std::vector<ColorSpinorField*> recur2{AS[k-1],S[k-1],S[k-2]};
643  std::vector<ColorSpinorField*> Sk{S[k]};
644  blas::zero(*S[k]);
645  blas::caxpy(factors, recur2, Sk);
646  matSloppy(*AS[k], *S[k], tmpSloppy, tmpSloppy2);
647  }
648  }
649  }
650  }
651 
652  // first iteration, copy S and AS into Q and AQ
653  if (total_iter == 0) {
654  // first iteration Q = S
655  for (int i = 0; i < n_krylov; i++) *Q[i] = *S[i];
656  for (int i = 0; i < n_krylov; i++) *AQ[i] = *AS[i];
657 
658  } else {
659 
660 
661  // Compute the beta coefficients for updating Q, AQ
662  // 1. compute matrix Q_AS = -Q^\dagger AS
663  // 2. Solve Q_AQ beta = Q_AS
664  std::vector<ColorSpinorField*> R;
665  for (int i = 0; i < n_krylov; i++) R.push_back(S[i]);
666  blas::cDotProduct(Q_AS, AQ, R);
667  for (int i = 0; i < param.Nkrylov*param.Nkrylov; i++) { Q_AS[i] = real(Q_AS[i]); }
668 
669  compute_beta();
670 
671  // update direction vectors
672  blas::caxpyz(beta, Q, R, Qtmp);
673  for (int i = 0; i < n_krylov; i++) std::swap(Q[i], Qtmp[i]);
674 
675  blas::caxpyz(beta, AQ, AS, Qtmp);
676  for (int i = 0; i < n_krylov; i++) std::swap(AQ[i], Qtmp[i]);
677  }
678 
679  // compute the alpha coefficients
680  // 1. Compute Q_AQ = Q^\dagger AQ and g = Q^dagger r = Q^dagger S[0]
681  // 2. Solve Q_AQ alpha = g
682  {
683  std::vector<ColorSpinorField*> Q2;
684  for (int i = 0; i < n_krylov; i++) Q2.push_back(AQ[i]);
685  Q2.push_back(S[0]);
686  blas::cDotProduct(Q_AQandg, Q, Q2);
687 
688  for (int i = 0; i < param.Nkrylov*(param.Nkrylov+1); i++) { Q_AQandg[i] = real(Q_AQandg[i]); }
689 
690  compute_alpha();
691  }
692 
693  // update the solution vector
694  std::vector<ColorSpinorField*> X;
695  X.push_back(&x);
696  blas::caxpy(alpha, Q, X);
697 
698  // no need to compute residual vector if only doing a single fixed iteration
699  // perhaps we could save the mat-vec here if we compute "Ap"
700  // vectors when we update p?
701  if (!fixed_iteration) {
702  for (int i = 0; i < param.Nkrylov; i++) { alpha[i] = -alpha[i]; }
703  std::vector<ColorSpinorField*> S0{S[0]};
704 
705  // Can we fuse these? We don't need this reduce in all cases...
706  blas::caxpy(alpha, AQ, S0);
707  //if (getVerbosity() >= QUDA_VERBOSE) r2 = blas::norm2(*S[0]);
708  /*else*/ r2 = real(Q_AQandg[param.Nkrylov]); // actually the old r2... so we do one more iter than needed...
709 
710  }
711 
712  // NOTE: Because we always carry around the residual from an iteration before, we
713  // "lie" about which iteration we're on so the printed residual matches with the
714  // printed iteration number.
715  if (total_iter > 0) {
716  PrintStats("CA-CG", total_iter, r2, b2, heavy_quark_res);
717  }
718 
719  total_iter += n_krylov;
720 
721  // update since n_krylov or maxiter reached, converged or reliable update required
722  // note that the heavy quark residual will by definition only be checked every n_krylov steps
723  // Note: this won't reliable update when the norm _increases_.
724  if (total_iter>=param.maxiter || (r2 < stop && !l2_converge) || reliable(rNorm, maxrr, rUpdate, r2, delta)) {
725  if ( (r2 < stop || total_iter>=param.maxiter) && param.sloppy_converge) break;
726  mat(r_, x, tmp, tmp2);
727  r2 = blas::xmyNorm(b, r_);
728 
729  if (param.deflate && sqrt(r2) < maxr_deflate * param.tol_restart) {
730  // Deflate and add solution to accumulator
731  eig_solve->deflate(x, r_, evecs, evals, true);
732 
733  // Compute r_defl = RHS - A * LHS
734  mat(r_, x, tmp, tmp2);
735  r2 = blas::xmyNorm(b, r_);
736 
737  maxr_deflate = sqrt(r2);
738  }
739 
740  blas::copy(*S[0], r_);
741 
742  if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r_).z);
743 
744  // break-out check if we have reached the limit of the precision
745  if (r2 > r2_old) {
746  resIncrease++;
747  resIncreaseTotal++;
748  warningQuda("CA-CG: new reliable residual norm %e is greater than previous reliable residual norm %e (total #inc %i)",
749  sqrt(r2), sqrt(r2_old), resIncreaseTotal);
750  if (resIncrease > maxResIncrease or resIncreaseTotal > maxResIncreaseTotal) {
751  warningQuda("CA-CG: solver exiting due to too many true residual norm increases");
752  break;
753  }
754  } else {
755  resIncrease = 0;
756  }
757 
758  r2_old = r2;
759  }
760 
761  }
762 
763  if (total_iter>param.maxiter && getVerbosity() >= QUDA_SUMMARIZE)
764  warningQuda("Exceeded maximum iterations %d", param.maxiter);
765 
766  // Print number of reliable updates.
767  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("%s: Reliable updates = %d\n", "CA-CG", rUpdate);
768 
769  if (param.compute_true_res) {
770  // Calculate the true residual
771  mat(r_, x, tmp, tmp2);
772  double true_res = blas::xmyNorm(b, r_);
773  param.true_res = sqrt(true_res / b2);
775  if (param.return_residual) blas::copy(b, r_);
776  } else {
777  if (param.return_residual) blas::copy(b, *Q[0]);
778  }
779 
780  if (!param.is_preconditioner) {
781  qudaDeviceSynchronize(); // ensure solver is complete before ending timing
785 
786  // store flops and reset counters
787  double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matEig.flops()) * 1e-9;
788 
789  param.gflops += gflops;
790  param.iter += total_iter;
791 
792  // reset the flops counters
793  blas::flops = 0;
794  mat.flops();
795  matSloppy.flops();
796  matPrecon.flops();
797  matEig.flops();
798 
800  }
801 
802  PrintSummary("CA-CG", total_iter, r2, b2, stop, param.tol_hq);
803  }
804 
805 } // namespace quda
Communication-avoiding CG solver. This solver does un-preconditioned CG, running in steps of n_krylov...
Definition: invert_quda.h:989
virtual ~CACG()
Definition: inv_ca_cg.cpp:33
CACG(const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon, const DiracMatrix &matEig, SolverParam &param, TimeProfile &profile)
Definition: inv_ca_cg.cpp:15
void operator()(ColorSpinorField &out, ColorSpinorField &in)
Definition: inv_ca_cg.cpp:464
void operator()(ColorSpinorField &out, ColorSpinorField &in)
Definition: inv_ca_cg.cpp:90
CACGNE(const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon, const DiracMatrix &matEig, SolverParam &param, TimeProfile &profile)
Definition: inv_ca_cg.cpp:68
virtual ~CACGNE()
Definition: inv_ca_cg.cpp:81
virtual ~CACGNR()
Definition: inv_ca_cg.cpp:170
void operator()(ColorSpinorField &out, ColorSpinorField &in)
Definition: inv_ca_cg.cpp:178
CACGNR(const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon, const DiracMatrix &matEig, SolverParam &param, TimeProfile &profile)
Definition: inv_ca_cg.cpp:158
static ColorSpinorField * Create(const ColorSpinorParam &param)
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const =0
Apply M for the dirac op. E.g. the Schur Complement operator.
void Mdag(ColorSpinorField &out, const ColorSpinorField &in) const
Apply Mdag (daggered operator of M.
Definition: dirac.cpp:92
const Dirac * Expose() const
Definition: dirac_quda.h:1964
unsigned long long flops() const
Definition: dirac_quda.h:1909
void deflate(std::vector< ColorSpinorField * > &sol, const std::vector< ColorSpinorField * > &src, const std::vector< ColorSpinorField * > &evecs, const std::vector< Complex > &evals, bool accumulate=false) const
Deflate a set of source vectors with a given eigenspace.
void computeEvals(const DiracMatrix &mat, std::vector< ColorSpinorField * > &evecs, std::vector< Complex > &evals, int size)
Compute eigenvalues and their residiua.
bool deflate_compute
Definition: invert_quda.h:475
TimeProfile & profile
Definition: invert_quda.h:471
const DiracMatrix & mat
Definition: invert_quda.h:465
bool convergence(double r2, double hq2, double r2_tol, double hq_tol)
Definition: solver.cpp:328
bool recompute_evals
Definition: invert_quda.h:476
std::vector< ColorSpinorField * > evecs
Definition: invert_quda.h:477
void destroyDeflationSpace()
Destroy the allocated deflation space.
Definition: solver.cpp:229
void PrintSummary(const char *name, int k, double r2, double b2, double r2_tol, double hq_tol)
Prints out the summary of the solver convergence (requires a verbosity of QUDA_SUMMARIZE)....
Definition: solver.cpp:386
const DiracMatrix & matEig
Definition: invert_quda.h:468
SolverParam & param
Definition: invert_quda.h:470
static double stopping(double tol, double b2, QudaResidualType residual_type)
Set the solver L2 stopping condition.
Definition: solver.cpp:311
std::vector< Complex > evals
Definition: invert_quda.h:478
EigenSolver * eig_solve
Definition: invert_quda.h:473
void PrintStats(const char *name, int k, double r2, double b2, double hq2)
Prints out the running statistics of the solver (requires a verbosity of QUDA_VERBOSE)
Definition: solver.cpp:373
void constructDeflationSpace(const ColorSpinorField &meta, const DiracMatrix &mat)
Constructs the deflation space and eigensolver.
Definition: solver.cpp:168
const DiracMatrix & matPrecon
Definition: invert_quda.h:467
const DiracMatrix & matSloppy
Definition: invert_quda.h:466
double Last(QudaProfileType idx)
Definition: timer.h:254
QudaCABasis ca_basis
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:34
@ QUDA_CHEBYSHEV_BASIS
Definition: enum_quda.h:202
@ QUDA_POWER_BASIS
Definition: enum_quda.h:201
@ QUDA_USE_INIT_GUESS_NO
Definition: enum_quda.h:429
@ QUDA_USE_INIT_GUESS_YES
Definition: enum_quda.h:430
@ QUDA_SUMMARIZE
Definition: enum_quda.h:266
@ QUDA_VERBOSE
Definition: enum_quda.h:267
@ QUDA_HEAVY_QUARK_RESIDUAL
Definition: enum_quda.h:195
@ QUDA_PRESERVE_SOURCE_NO
Definition: enum_quda.h:238
@ QUDA_PRESERVE_SOURCE_YES
Definition: enum_quda.h:239
@ QUDA_ZERO_FIELD_CREATE
Definition: enum_quda.h:361
@ QUDA_NULL_FIELD_CREATE
Definition: enum_quda.h:360
@ QUDA_COMPUTE_NULL_VECTOR_NO
Definition: enum_quda.h:441
#define checkPrecision(...)
void init()
Create the BLAS context.
double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r)
void caxpyz(const Complex *a, std::vector< ColorSpinorField * > &x, std::vector< ColorSpinorField * > &y, std::vector< ColorSpinorField * > &z)
Compute the block "caxpyz" with over the set of ColorSpinorFields. E.g., it computes.
double xmyNorm(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:79
unsigned long long flops
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
Definition: blas_quda.h:45
void ax(double a, ColorSpinorField &x)
void zero(ColorSpinorField &a)
double norm2(const ColorSpinorField &a)
void caxpy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
void xpy(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:41
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: blas_quda.h:24
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
void axpby(double a, ColorSpinorField &x, double b, ColorSpinorField &y)
Definition: blas_quda.h:44
void stop()
Stop profiling.
Definition: device.cpp:228
std::complex< double > Complex
Definition: quda_internal.h:86
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
@ QUDA_PROFILE_INIT
Definition: timer.h:106
@ QUDA_PROFILE_EPILOGUE
Definition: timer.h:110
@ QUDA_PROFILE_COMPUTE
Definition: timer.h:108
@ QUDA_PROFILE_FREE
Definition: timer.h:111
@ QUDA_PROFILE_PREAMBLE
Definition: timer.h:107
@ QUDA_PROFILE_EIGEN
Definition: timer.h:114
void compute_alpha_N(Complex *Q_AQandg, Complex *alpha)
Definition: inv_ca_cg.cpp:298
void compute_beta_N(Complex *Q_AQandg, Complex *Q_AS, Complex *beta)
Definition: inv_ca_cg.cpp:372
ColorSpinorParam csParam
Definition: pack_test.cpp:25
QudaGaugeParam param
Definition: pack_test.cpp:18
void updateR()
update the radius for halos.
#define qudaDeviceSynchronize()
Definition: quda_api.h:250
QudaPreserveSource preserve_source
Definition: invert_quda.h:151
QudaPrecision precision
Definition: invert_quda.h:136
QudaComputeNullVector compute_null_vector
Definition: invert_quda.h:61
bool is_preconditioner
verbosity to use for preconditioner
Definition: invert_quda.h:238
int max_res_increase_total
Definition: invert_quda.h:90
QudaResidualType residual_type
Definition: invert_quda.h:49
QudaPrecision precision_sloppy
Definition: invert_quda.h:139
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:58
DEVICEHOST void swap(Real &a, Real &b)
Definition: svd_quda.h:134
#define printfQuda(...)
Definition: util_quda.h:114
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define warningQuda(...)
Definition: util_quda.h:132
#define errorQuda(...)
Definition: util_quda.h:120