QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
inv_ca_cg.cpp
Go to the documentation of this file.
1 #include <invert_quda.h>
2 #include <blas_quda.h>
3 #include <Eigen/Dense>
4 
13 namespace quda {
14 
16  : Solver(param, profile), mat(mat), matSloppy(matSloppy), init(false),
17  lambda_init(false), basis(param.ca_basis),
18  Q_AQandg(nullptr), Q_AS(nullptr), alpha(nullptr), beta(nullptr), rp(nullptr),
19  tmpp(nullptr), tmpp2(nullptr), tmp_sloppy(nullptr), tmp_sloppy2(nullptr) { }
20 
23 
24  if (init) {
25  if (Q_AQandg) delete []Q_AQandg;
26  if (Q_AS) delete []Q_AS;
27  if (alpha) delete []alpha;
28  if (beta) delete []beta;
29 
30  bool use_source = false; // needed for explicit residual recompute
31  // (param.preserve_source == QUDA_PRESERVE_SOURCE_NO &&
32  // param.precision == param.precision_sloppy &&
33  // param.use_init_guess == QUDA_USE_INIT_GUESS_NO);
34  if (basis == QUDA_POWER_BASIS) {
35  for (int i=0; i<param.Nkrylov+1; i++) if (i>0 || !use_source) delete S[i];
36  } else {
37  for (int i=0; i<param.Nkrylov; i++) if (i>0 || !use_source) delete S[i];
38  for (int i=0; i<param.Nkrylov; i++) delete AS[i];
39  }
40  for (int i=0; i<param.Nkrylov; i++) delete Q[i];
41  for (int i=0; i<param.Nkrylov; i++) delete Qtmp[i];
42  for (int i=0; i<param.Nkrylov; i++) delete AQ[i];
43 
44  if (tmp_sloppy) delete tmp_sloppy;
45  if (tmp_sloppy2) delete tmp_sloppy2;
46  if (tmpp) delete tmpp;
47  if (tmpp2) delete tmpp2;
48  if (rp) delete rp;
49  }
50 
51  if (deflate_init) {
52  for (auto veci : param.evecs)
53  if (veci) delete veci;
54  delete defl_tmp1[0];
55  delete defl_tmp2[0];
56  }
57 
59  }
60 
62  CACG(mmdag, mmdagSloppy, param, profile), mmdag(mat.Expose()), mmdagSloppy(matSloppy.Expose()),
63  xp(nullptr), yp(nullptr), init(false) {
64  }
65 
67  if ( init ) {
68  if (xp) delete xp;
69  if (yp) delete yp;
70  init = false;
71  }
72  }
73 
74  // CACGNE: M Mdag y = b is solved; x = Mdag y is returned as solution.
76  if (param.maxiter == 0 || param.Nsteps == 0) {
78  return;
79  }
80 
81  const int iter0 = param.iter;
82 
83  if (!init) {
86  xp = ColorSpinorField::Create(x, csParam);
88  yp = ColorSpinorField::Create(x, csParam);
89  init = true;
90  }
91 
92  double b2 = blas::norm2(b);
93 
95 
96  // compute initial residual
97  mmdag.Expose()->M(*xp,x);
98  double r2 = blas::xmyNorm(b,*xp);
99  if (b2 == 0.0) b2 = r2;
100 
101  // compute solution to residual equation
103 
104  mmdag.Expose()->Mdag(*xp,*yp);
105 
106  // compute full solution
107  blas::xpy(*xp, x);
108 
109  } else {
110 
111  CACG::operator()(*yp,b);
112  mmdag.Expose()->Mdag(x,*yp);
113 
114  }
115 
116  // future optimization: with preserve_source == QUDA_PRESERVE_SOURCE_NO; b is already
117  // expected to be the CG residual which matches the CGNE residual
118  // (but only with zero initial guess). at the moment, CG does not respect this convention
120 
121  // compute the true residual
122  mmdag.Expose()->M(*xp, x);
123 
126  blas::axpby(-1.0, A, 1.0, B);
127 
128  double r2;
130  double3 h3 = blas::HeavyQuarkResidualNorm(x, B);
131  r2 = h3.y;
132  param.true_res_hq = sqrt(h3.z);
133  } else {
134  r2 = blas::norm2(B);
135  }
136  param.true_res = sqrt(r2 / b2);
137 
138  PrintSummary("CA-CGNE", param.iter - iter0, r2, b2, stopping(param.tol, b2, param.residual_type), param.tol_hq);
139  }
140 
141  }
142 
144  CACG(mdagm, mdagmSloppy, param, profile), mdagm(mat.Expose()), mdagmSloppy(matSloppy.Expose()),
145  bp(nullptr), init(false) {
146  }
147 
149  if ( init ) {
150  if (bp) delete bp;
151  init = false;
152  }
153  }
154 
155  // CACGNR: Mdag M x = Mdag b is solved.
157  if (param.maxiter == 0 || param.Nsteps == 0) {
159  return;
160  }
161 
162  const int iter0 = param.iter;
163 
164  if (!init) {
166  csParam.create = QUDA_ZERO_FIELD_CREATE;
167  bp = ColorSpinorField::Create(csParam);
168 
169  init = true;
170  }
171 
172  double b2 = blas::norm2(b);
173  if (b2 == 0.0) { // compute initial residual vector
174  mdagm.Expose()->M(*bp,x);
175  b2 = blas::norm2(*bp);
176  }
177 
178  mdagm.Expose()->Mdag(*bp,b);
179  CACG::operator()(x,*bp);
180 
182 
183  // compute the true residual
184  mdagm.Expose()->M(*bp, x);
185 
188  blas::axpby(-1.0, A, 1.0, B);
189 
190  double r2;
192  double3 h3 = blas::HeavyQuarkResidualNorm(x, B);
193  r2 = h3.y;
194  param.true_res_hq = sqrt(h3.z);
195  } else {
196  r2 = blas::norm2(B);
197  }
198  param.true_res = sqrt(r2 / b2);
199  PrintSummary("CA-CGNR", param.iter - iter0, r2, b2, stopping(param.tol, b2, param.residual_type), param.tol_hq);
200 
202  mdagm.Expose()->M(*bp, x);
203  blas::axpby(-1.0, *bp, 1.0, b);
204  }
205 
206  }
207 
209  {
210  if (!init) {
211  if (!param.is_preconditioner) {
212  blas::flops = 0;
213  profile.TPSTART(QUDA_PROFILE_INIT);
214  }
215 
218  alpha = new Complex[param.Nkrylov];
220 
221  bool mixed = param.precision != param.precision_sloppy;
222  bool use_source = false; // need to preserve source for residual computation
223  //(param.preserve_source == QUDA_PRESERVE_SOURCE_NO && !mixed &&
224  // param.use_init_guess == QUDA_USE_INIT_GUESS_NO);
225 
227  csParam.create = QUDA_NULL_FIELD_CREATE;
228 
229  // Source needs to be preserved if we're computing the true residual
230  rp = (mixed && !use_source) ? ColorSpinorField::Create(csParam) : nullptr;
231  tmpp = ColorSpinorField::Create(csParam);
232  tmpp2 = ColorSpinorField::Create(csParam);
233 
234  // now allocate sloppy fields
236 
237  if (basis == QUDA_POWER_BASIS) {
238  // in power basis AS[k] = S[k+1], so we don't need a separate array
239  S.resize(param.Nkrylov+1);
240  AS.resize(param.Nkrylov);
241  Q.resize(param.Nkrylov);
242  AQ.resize(param.Nkrylov);
243  Qtmp.resize(param.Nkrylov); // for pointer swap
244  for (int i=0; i<param.Nkrylov+1; i++) {
245  S[i] = (i==0 && use_source) ? &b : ColorSpinorField::Create(csParam);
246  if (i>0) AS[i-1] = S[i];
247  }
248  } else {
249  S.resize(param.Nkrylov);
250  AS.resize(param.Nkrylov);
251  Q.resize(param.Nkrylov);
252  AQ.resize(param.Nkrylov);
253  Qtmp.resize(param.Nkrylov);
254  for (int i=0; i<param.Nkrylov; i++) {
255  S[i] = (i==0 && use_source) ? &b : ColorSpinorField::Create(csParam);
256  AS[i] = ColorSpinorField::Create(csParam);
257  }
258  }
259 
260  for (int i=0; i<param.Nkrylov; i++) Q[i] = ColorSpinorField::Create(csParam);
261  for (int i=0; i<param.Nkrylov; i++) Qtmp[i] = ColorSpinorField::Create(csParam);
262  for (int i=0; i<param.Nkrylov; i++) AQ[i] = ColorSpinorField::Create(csParam);
263 
264  // Once the CACG operator is called, we are able to construct an appropriate
265  // Krylov space for deflation
266  if (param.deflate && !deflate_init) { constructDeflationSpace(b, mat, false); }
267 
268  //sloppy temporary for mat-vec
269  tmp_sloppy = mixed ? ColorSpinorField::Create(csParam) : nullptr;
270  tmp_sloppy2 = mixed ? ColorSpinorField::Create(csParam) : nullptr;
271 
273 
274  init = true;
275  } // init
276  }
277 
278  // template!
279  template <int N>
281  {
282  using namespace Eigen;
283  typedef Matrix<Complex, N, N, RowMajor> matrix;
284  typedef Matrix<Complex, N, 1> vector;
285 
286  matrix matQ_AQ(N,N);
287  vector vecg(N);
288  for (int i=0; i<N; i++) {
289  vecg(i) = Q_AQandg[i*(N+1)+N];
290  for (int j=0; j<N; j++) {
291  matQ_AQ(i,j) = Q_AQandg[i*(N+1)+j];
292  }
293  }
294  Map<vector> vecalpha(alpha,N);
295 
296  vecalpha = matQ_AQ.fullPivLu().solve(vecg);
297 
298  //JacobiSVD<matrix> svd(A, ComputeThinU | ComputeThinV);
299  //psi = svd.solve(phi);
300  }
301 
303  {
304  if (!param.is_preconditioner) {
307  profile.TPSTART(QUDA_PROFILE_EIGEN);
308  }
309 
310  const int N = Q.size();
311  switch (N) {
312  case 1: compute_alpha_N<1>(Q_AQandg, alpha); break;
313  case 2: compute_alpha_N<2>(Q_AQandg, alpha); break;
314  case 3: compute_alpha_N<3>(Q_AQandg, alpha); break;
315  case 4: compute_alpha_N<4>(Q_AQandg, alpha); break;
316  case 5: compute_alpha_N<5>(Q_AQandg, alpha); break;
317  case 6: compute_alpha_N<6>(Q_AQandg, alpha); break;
318  case 7: compute_alpha_N<7>(Q_AQandg, alpha); break;
319  case 8: compute_alpha_N<8>(Q_AQandg, alpha); break;
320  case 9: compute_alpha_N<9>(Q_AQandg, alpha); break;
321  case 10: compute_alpha_N<10>(Q_AQandg, alpha); break;
322  case 11: compute_alpha_N<11>(Q_AQandg, alpha); break;
323  case 12: compute_alpha_N<12>(Q_AQandg, alpha); break;
324  default: // failsafe
325  using namespace Eigen;
327  typedef Matrix<Complex, Dynamic, 1> vector;
328 
329  const int N = Q.size();
330  matrix matQ_AQ(N,N);
331  vector vecg(N);
332  for (int i=0; i<N; i++) {
333  vecg(i) = Q_AQandg[i*(N+1)+N];
334  for (int j=0; j<N; j++) {
335  matQ_AQ(i,j) = Q_AQandg[i*(N+1)+j];
336  }
337  }
338  Map<vector> vecalpha(alpha,N);
339 
340  vecalpha = matQ_AQ.fullPivLu().solve(vecg);
341 
342  //JacobiSVD<matrix> svd(A, ComputeThinU | ComputeThinV);
343  //psi = svd.solve(phi);
344  break;
345  }
346 
347  if (!param.is_preconditioner) {
348  profile.TPSTOP(QUDA_PROFILE_EIGEN);
350  profile.TPSTART(QUDA_PROFILE_COMPUTE);
351  }
352  }
353 
354  // template!
355  template <int N>
357  {
358  using namespace Eigen;
359  typedef Matrix<Complex, N, N, RowMajor> matrix;
360 
361  matrix matQ_AQ(N,N);
362  for (int i=0; i<N; i++) {
363  for (int j=0; j<N; j++) {
364  matQ_AQ(i,j) = Q_AQandg[i*(N+1)+j];
365  }
366  }
367  Map<matrix> matQ_AS(Q_AS,N,N), matbeta(beta,N,N);
368 
369  matQ_AQ = -matQ_AQ;
370  matbeta = matQ_AQ.fullPivLu().solve(matQ_AS);
371 
372  //JacobiSVD<matrix> svd(A, ComputeThinU | ComputeThinV);
373  //psi = svd.solve(phi);
374  }
375 
377  {
378  if (!param.is_preconditioner) {
381  profile.TPSTART(QUDA_PROFILE_EIGEN);
382  }
383 
384  const int N = Q.size();
385  switch (N) {
386  case 1: compute_beta_N<1>(Q_AQandg, Q_AS, beta); break;
387  case 2: compute_beta_N<2>(Q_AQandg, Q_AS, beta); break;
388  case 3: compute_beta_N<3>(Q_AQandg, Q_AS, beta); break;
389  case 4: compute_beta_N<4>(Q_AQandg, Q_AS, beta); break;
390  case 5: compute_beta_N<5>(Q_AQandg, Q_AS, beta); break;
391  case 6: compute_beta_N<6>(Q_AQandg, Q_AS, beta); break;
392  case 7: compute_beta_N<7>(Q_AQandg, Q_AS, beta); break;
393  case 8: compute_beta_N<8>(Q_AQandg, Q_AS, beta); break;
394  case 9: compute_beta_N<9>(Q_AQandg, Q_AS, beta); break;
395  case 10: compute_beta_N<10>(Q_AQandg, Q_AS, beta); break;
396  case 11: compute_beta_N<11>(Q_AQandg, Q_AS, beta); break;
397  case 12: compute_beta_N<12>(Q_AQandg, Q_AS, beta); break;
398  default: // failsafe
399  using namespace Eigen;
401 
402  const int N = Q.size();
403  matrix matQ_AQ(N,N);
404  for (int i=0; i<N; i++) {
405  for (int j=0; j<N; j++) {
406  matQ_AQ(i,j) = Q_AQandg[i*(N+1)+j];
407  }
408  }
409  Map<matrix> matQ_AS(Q_AS,N,N), matbeta(beta,N,N);
410 
411  matQ_AQ = -matQ_AQ;
412  matbeta = matQ_AQ.fullPivLu().solve(matQ_AS);
413 
414  //JacobiSVD<matrix> svd(A, ComputeThinU | ComputeThinV);
415  //psi = svd.solve(phi);
416  }
417 
418  if (!param.is_preconditioner) {
419  profile.TPSTOP(QUDA_PROFILE_EIGEN);
421  profile.TPSTART(QUDA_PROFILE_COMPUTE);
422  }
423  }
424 
425  // Code to check for reliable updates
426  int CACG::reliable(double &rNorm, double &maxrr, int &rUpdate, const double &r2, const double &delta) {
427  // reliable updates
428  rNorm = sqrt(r2);
429  if (rNorm > maxrr) maxrr = rNorm;
430 
431  int updateR = (rNorm < delta*maxrr) ? 1 : 0;
432 
433  if (updateR) {
434  rUpdate++;
435  rNorm = sqrt(r2);
436  maxrr = rNorm;
437  }
438 
439  //printfQuda("Reliable triggered: %d %e\n", updateR, rNorm);
440 
441  return updateR;
442  }
443 
444  /*
445  The main CA-CG algorithm, which consists of three main steps:
446  1. Build basis vectors q_k = A p_k for k = 1..Nkrlylov
447  2. Steepest descent minmization of the residual in this basis
448  3. Update solution and residual vectors
449  */
451  {
452  const int nKrylov = param.Nkrylov;
453 
454  if (checkPrecision(x,b) != param.precision) errorQuda("Precision mismatch %d %d", checkPrecision(x,b), param.precision);
455  if (param.return_residual && param.preserve_source == QUDA_PRESERVE_SOURCE_YES) errorQuda("Cannot preserve source and return the residual");
456 
457  if (param.maxiter == 0 || nKrylov == 0) {
459  return;
460  }
461 
462  create(b);
463 
464  ColorSpinorField &r_ = rp ? *rp : *S[0];
467  ColorSpinorField &tmpSloppy = tmp_sloppy ? *tmp_sloppy : tmp;
468  ColorSpinorField &tmpSloppy2 = tmp_sloppy2 ? *tmp_sloppy2 : tmp2;
469 
471 
472  // compute b2, but only if we need to
473  bool fixed_iteration = param.sloppy_converge && nKrylov==param.maxiter && !param.compute_true_res;
474  double b2 = !fixed_iteration ? blas::norm2(b) : 1.0;
475  double r2 = 0.0; // if zero source then we will exit immediately doing no work
476 
477  // compute intitial residual depending on whether we have an initial guess or not
479  mat(r_, x, tmp, tmp2);
480  //r = b - Ax0
481  if (!fixed_iteration) {
482  r2 = blas::xmyNorm(b, r_);
483  } else {
484  blas::xpay(b, -1.0, r_);
485  r2 = b2; // dummy setting
486  }
487  } else {
488  r2 = b2;
489  blas::copy(r_, b);
490  blas::zero(x);
491  }
492 
493  if (param.deflate == true) {
494  std::vector<ColorSpinorField *> rhs;
495  // Use residual from supplied guess r, or original
496  // rhs b. use `x` as a temp.
497  blas::copy(*defl_tmp2[0], r_);
498  rhs.push_back(defl_tmp2[0]);
499 
500  // Deflate
502 
503  // Compute r_defl = RHS - A * LHS
504  mat(r_, *defl_tmp1[0], tmp, tmp2);
505  r2 = blas::xmyNorm(*rhs[0], r_);
506 
507  // defl_tmp1 must be added to the solution at the end
508  blas::axpy(1.0, *defl_tmp1[0], x);
509  }
510 
511  // Use power iterations to approx lambda_max
512  auto &lambda_min = param.ca_lambda_min;
513  auto &lambda_max = param.ca_lambda_max;
514 
515  if (basis == QUDA_CHEBYSHEV_BASIS && lambda_max < lambda_min && !lambda_init) {
517 
518  *Q[0] = r_; // do power iterations on this
519  // Do 100 iterations, normalize every 10.
520  for (int i = 0; i < 10; i++) {
521  double tmpnrm = blas::norm2(*Q[0]);
522  blas::ax(1.0/sqrt(tmpnrm), *Q[0]);
523  for (int j = 0; j < 10; j++) {
524  matSloppy(*AQ[0], *Q[0], tmpSloppy, tmpSloppy2);
525  if (j == 0 && getVerbosity() >= QUDA_VERBOSE) {
526  printfQuda("Current Rayleigh Quotient step %d is %e\n", i*10+1, sqrt(blas::norm2(*AQ[0])));
527  }
528  std::swap(AQ[0], Q[0]);
529  }
530  }
531  // Get Rayleigh quotient
532  double tmpnrm = blas::norm2(*Q[0]);
533  blas::ax(1.0/sqrt(tmpnrm), *Q[0]);
534  matSloppy(*AQ[0], *Q[0], tmpSloppy, tmpSloppy2);
535  lambda_max = 1.1*(sqrt(blas::norm2(*AQ[0])));
536  if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("CA-CG Approximate lambda max = 1.1 x %e\n", lambda_max/1.1);
537  lambda_init = true;
538 
540  }
541 
542  // Check to see that we're not trying to invert on a zero-field source
543  if (b2 == 0) {
545  warningQuda("inverting on zero-field source\n");
546  x = b;
547  param.true_res = 0.0;
548  param.true_res_hq = 0.0;
549  return;
550  } else {
551  b2 = r2;
552  }
553  }
554 
555  double stop = !fixed_iteration ? stopping(param.tol, b2, param.residual_type) : 0.0; // stopping condition of solver
556 
557  const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;
558 
559  // this parameter determines how many consective reliable update
560  // reisudal increases we tolerate before terminating the solver,
561  // i.e., how long do we want to keep trying to converge
562  const int maxResIncrease = param.max_res_increase; // check if we reached the limit of our tolerance
563  const int maxResIncreaseTotal = param.max_res_increase_total;
564 
565  double heavy_quark_res = 0.0; // heavy quark residual
566  if(use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x,r_).z);
567 
568  int resIncrease = 0;
569  int resIncreaseTotal = 0;
570 
571  if (!param.is_preconditioner) {
572  blas::flops = 0;
574  profile.TPSTART(QUDA_PROFILE_COMPUTE);
575  }
576  int total_iter = 0;
577  double r2_old = r2;
578  bool l2_converge = false;
579 
580  // Various variables related to reliable updates.
581  int rUpdate = 0; // count reliable updates.
582  double delta = param.delta; // delta for reliable updates.
583  double rNorm = sqrt(r2); // The current residual norm.
584  double maxrr = rNorm; // The maximum residual norm since the last reliable update.
585 
586  // Factors which map linear operator onto [-1,1]
587  double m_map = 2./(lambda_max-lambda_min);
588  double b_map = -(lambda_max+lambda_min)/(lambda_max-lambda_min);
589 
590  blas::copy(*S[0], r_); // no op if uni-precision
591 
592  PrintStats("CA-CG", total_iter, r2, b2, heavy_quark_res);
593  while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) && total_iter < param.maxiter) {
594 
595  // build up a space of size nKrylov
596  if (basis == QUDA_POWER_BASIS) {
597  for (int k=0; k<nKrylov; k++) {
598  matSloppy(*AS[k], *S[k], tmpSloppy, tmpSloppy2);
599  }
600  } else { // chebyshev basis
601 
602  matSloppy(*AS[0], *S[0], tmpSloppy, tmpSloppy2);
603 
604  if (nKrylov > 1) {
605  // S_1 = m AS_0 + b S_0
606  Complex facs1[] = { m_map, b_map };
607  std::vector<ColorSpinorField*> recur1{AS[0],S[0]};
608  std::vector<ColorSpinorField*> S1{S[1]};
609  blas::zero(*S[1]);
610  blas::caxpy(facs1,recur1,S1);
611  matSloppy(*AS[1], *S[1], tmpSloppy, tmpSloppy2);
612 
613  // Enter recursion relation
614  if (nKrylov > 2) {
615  // S_k = 2 m AS_{k-1} + 2 b S_{k-1} - S_{k-2}
616  Complex factors[] = { 2.*m_map, 2.*b_map, -1 };
617  for (int k = 2; k < nKrylov; k++) {
618  std::vector<ColorSpinorField*> recur2{AS[k-1],S[k-1],S[k-2]};
619  std::vector<ColorSpinorField*> Sk{S[k]};
620  blas::zero(*S[k]);
621  blas::caxpy(factors, recur2, Sk);
622  matSloppy(*AS[k], *S[k], tmpSloppy, tmpSloppy2);
623  }
624  }
625  }
626 
627  }
628 
629  // first iteration, copy S and AS into Q and AQ
630  if (total_iter == 0) {
631  // first iteration Q = S
632  for (int i=0; i<nKrylov; i++) *Q[i] = *S[i];
633  for (int i=0; i<nKrylov; i++) *AQ[i] = *AS[i];
634 
635  } else {
636 
637 
638  // Compute the beta coefficients for updating Q, AQ
639  // 1. compute matrix Q_AS = -Q^\dagger AS
640  // 2. Solve Q_AQ beta = Q_AS
641  std::vector<ColorSpinorField*> R;
642  for (int i=0; i < nKrylov; i++) R.push_back(S[i]);
644  for (int i = 0; i < param.Nkrylov*param.Nkrylov; i++) { Q_AS[i] = real(Q_AS[i]); }
645 
646  compute_beta();
647 
648  // update direction vectors
649  blas::caxpyz(beta, Q, R, Qtmp);
650  for (int i=0; i<nKrylov; i++) std::swap(Q[i],Qtmp[i]);
651 
652  blas::caxpyz(beta, AQ, AS, Qtmp);
653  for (int i=0; i<nKrylov; i++) std::swap(AQ[i],Qtmp[i]);
654  }
655 
656  // compute the alpha coefficients
657  // 1. Compute Q_AQ = Q^\dagger AQ and g = Q^dagger r = Q^dagger S[0]
658  // 2. Solve Q_AQ alpha = g
659  {
660  std::vector<ColorSpinorField*> Q2;
661  for (int i=0; i<nKrylov; i++) Q2.push_back(AQ[i]);
662  Q2.push_back(S[0]);
664 
665  for (int i = 0; i < param.Nkrylov*(param.Nkrylov+1); i++) { Q_AQandg[i] = real(Q_AQandg[i]); }
666 
667  compute_alpha();
668  }
669 
670  // update the solution vector
671  std::vector<ColorSpinorField*> X;
672  X.push_back(&x);
673  blas::caxpy(alpha, Q, X);
674 
675  // no need to compute residual vector if only doing a single fixed iteration
676  // perhaps we could save the mat-vec here if we compute "Ap"
677  // vectors when we update p?
678  if (!fixed_iteration) {
679  for (int i = 0; i < param.Nkrylov; i++) { alpha[i] = -alpha[i]; }
680  std::vector<ColorSpinorField*> S0{S[0]};
681 
682  // Can we fuse these? We don't need this reduce in all cases...
683  blas::caxpy(alpha, AQ, S0);
684  //if (getVerbosity() >= QUDA_VERBOSE) r2 = blas::norm2(*S[0]);
685  /*else*/ r2 = real(Q_AQandg[param.Nkrylov]); // actually the old r2... so we do one more iter than needed...
686 
687  }
688 
689  // NOTE: Because we always carry around the residual from an iteration before, we
690  // "lie" about which iteration we're on so the printed residual matches with the
691  // printed iteration number.
692  if (total_iter > 0) {
693  PrintStats("CA-CG", total_iter, r2, b2, heavy_quark_res);
694  }
695 
696  total_iter+=nKrylov;
697 
698 
699  // update since nKrylov or maxiter reached, converged or reliable update required
700  // note that the heavy quark residual will by definition only be checked every nKrylov steps
701  // Note: this won't reliable update when the norm _increases_.
702  if (total_iter>=param.maxiter || (r2 < stop && !l2_converge) || reliable(rNorm, maxrr, rUpdate, r2, delta)) {
703  if ( (r2 < stop || total_iter>=param.maxiter) && param.sloppy_converge) break;
704  mat(r_, x, tmp, tmp2);
705  r2 = blas::xmyNorm(b, r_);
706  blas::copy(*S[0], r_);
707 
708  if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r_).z);
709 
710  // break-out check if we have reached the limit of the precision
711  if (r2 > r2_old) {
712  resIncrease++;
713  resIncreaseTotal++;
714  warningQuda("CA-CG: new reliable residual norm %e is greater than previous reliable residual norm %e (total #inc %i)",
715  sqrt(r2), sqrt(r2_old), resIncreaseTotal);
716  if (resIncrease > maxResIncrease or resIncreaseTotal > maxResIncreaseTotal) {
717  warningQuda("CA-CG: solver exiting due to too many true residual norm increases");
718  break;
719  }
720  } else {
721  resIncrease = 0;
722  }
723 
724  r2_old = r2;
725  }
726 
727  }
728 
729  if (total_iter>param.maxiter && getVerbosity() >= QUDA_SUMMARIZE)
730  warningQuda("Exceeded maximum iterations %d", param.maxiter);
731 
732  // Print number of reliable updates.
733  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("%s: Reliable updates = %d\n", "CA-CG", rUpdate);
734 
735  if (param.compute_true_res) {
736  // Calculate the true residual
737  mat(r_, x, tmp, tmp2);
738  double true_res = blas::xmyNorm(b, r_);
739  param.true_res = sqrt(true_res / b2);
741  if (param.return_residual) blas::copy(b, r_);
742  } else {
743  if (param.return_residual) blas::copy(b, *Q[0]);
744  }
745 
746  if (!param.is_preconditioner) {
747  qudaDeviceSynchronize(); // ensure solver is complete before ending timing
751 
752  // store flops and reset counters
753  double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9;
754 
755  param.gflops += gflops;
756  param.iter += total_iter;
757 
758  // reset the flops counters
759  blas::flops = 0;
760  mat.flops();
761  matSloppy.flops();
762 
764  }
765 
766  PrintSummary("CA-CG", total_iter, r2, b2, stop, param.tol_hq);
767  }
768 
769 } // namespace quda
cudaColorSpinorField * tmp2
void ax(double a, ColorSpinorField &x)
Definition: blas_quda.cu:508
void caxpyz(const Complex *a, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z)
Compute the block "caxpyz" with over the set of ColorSpinorFields. E.g., it computes.
virtual ~CACG()
Definition: inv_ca_cg.cpp:21
Communication-avoiding CG solver. This solver does un-preconditioned CG, running in steps of nKrylov...
Definition: invert_quda.h:891
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
void operator()(ColorSpinorField &out, ColorSpinorField &in)
Definition: inv_ca_cg.cpp:450
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define checkPrecision(...)
virtual ~CACGNE()
Definition: inv_ca_cg.cpp:66
#define errorQuda(...)
Definition: util_quda.h:121
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:721
Complex * Q_AS
Definition: invert_quda.h:902
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
void operator()(ColorSpinorField &out, ColorSpinorField &in)
Definition: inv_ca_cg.cpp:75
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
Definition: reduce_quda.cu:764
void deflate(std::vector< ColorSpinorField *> vec_defl, std::vector< ColorSpinorField *> vec, std::vector< ColorSpinorField *> evecs, std::vector< Complex > evals)
Deflate vector with Eigenvectors.
void PrintStats(const char *name, int k, double r2, double b2, double hq2)
Prints out the running statistics of the solver (requires a verbosity of QUDA_VERBOSE) ...
Definition: solver.cpp:256
std::vector< ColorSpinorField * > AQ
Definition: invert_quda.h:916
const DiracMatrix & matSloppy
Definition: invert_quda.h:895
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
static ColorSpinorField * Create(const ColorSpinorParam &param)
Complex * alpha
Definition: invert_quda.h:903
bool convergence(double r2, double hq2, double r2_tol, double hq_tol)
Definition: solver.cpp:223
Complex * beta
Definition: invert_quda.h:904
TimeProfile & profile
Definition: invert_quda.h:464
static int R[4]
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: copy_quda.cu:355
QudaCABasis basis
Definition: invert_quda.h:899
double xmyNorm(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:75
QudaPreserveSource preserve_source
Definition: invert_quda.h:154
int max_res_increase_total
Definition: invert_quda.h:96
std::vector< ColorSpinorField * > defl_tmp1
Definition: invert_quda.h:547
This is just a dummy structure we use for trove to define the required structure size.
ColorSpinorField * yp
Definition: invert_quda.h:958
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
Definition: blas_quda.h:37
void operator()(ColorSpinorField &out, ColorSpinorField &in)
Definition: inv_ca_cg.cpp:156
int reliable(double &rNorm, double &maxrr, int &rUpdate, const double &r2, const double &delta)
Definition: inv_ca_cg.cpp:426
std::vector< ColorSpinorField * > Qtmp
Definition: invert_quda.h:915
QudaGaugeParam param
Definition: pack_test.cpp:17
ColorSpinorField * tmp_sloppy
Definition: invert_quda.h:909
std::vector< ColorSpinorField * > defl_tmp2
Definition: invert_quda.h:548
void compute_beta_N(Complex *Q_AQandg, Complex *Q_AS, Complex *beta)
Definition: inv_ca_cg.cpp:356
void compute_beta()
Compute the beta coefficients.
Definition: inv_ca_cg.cpp:376
QudaComputeNullVector compute_null_vector
Definition: invert_quda.h:67
double Last(QudaProfileType idx)
Definition: timer.h:251
#define qudaDeviceSynchronize()
ColorSpinorField * bp
Definition: invert_quda.h:973
void compute_alpha_N(Complex *Q_AQandg, Complex *alpha)
Definition: inv_ca_cg.cpp:280
CACGNR(DiracMatrix &mat, DiracMatrix &matSloppy, SolverParam &param, TimeProfile &profile)
Definition: inv_ca_cg.cpp:143
ColorSpinorField * tmp_sloppy2
Definition: invert_quda.h:910
QudaResidualType residual_type
Definition: invert_quda.h:49
virtual ~CACGNR()
Definition: inv_ca_cg.cpp:148
static double stopping(double tol, double b2, QudaResidualType residual_type)
Set the solver L2 stopping condition.
Definition: solver.cpp:206
ColorSpinorParam csParam
Definition: pack_test.cpp:24
void axpy(double a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:35
ColorSpinorField * rp
Definition: invert_quda.h:906
#define warningQuda(...)
Definition: util_quda.h:133
bool is_preconditioner
verbosity to use for preconditioner
Definition: invert_quda.h:241
void compute_alpha()
Compute the alpha coefficients.
Definition: inv_ca_cg.cpp:302
ColorSpinorField * tmpp
Definition: invert_quda.h:907
void constructDeflationSpace(const ColorSpinorField &meta, const DiracMatrix &mat, bool svd)
Constructs the deflation space.
Definition: solver.cpp:159
double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r)
Definition: reduce_quda.cu:809
const DiracMatrix & mat
Definition: invert_quda.h:894
std::vector< ColorSpinorField * > AS
Definition: invert_quda.h:913
int X[4]
Definition: covdev_test.cpp:70
std::complex< double > Complex
Definition: quda_internal.h:46
EigenSolver * eig_solve
Definition: invert_quda.h:545
std::vector< ColorSpinorField * > Q
Definition: invert_quda.h:914
std::vector< Complex > evals
Definition: invert_quda.h:61
void init()
Create the CUBLAS context.
Definition: blas_cublas.cu:31
void caxpy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.cu:512
void zero(ColorSpinorField &a)
Definition: blas_quda.cu:472
bool lambda_init
Definition: invert_quda.h:898
void Mdag(ColorSpinorField &out, const ColorSpinorField &in) const
Definition: dirac.cpp:90
std::vector< ColorSpinorField * > evecs
Definition: invert_quda.h:58
QudaPrecision precision
Definition: invert_quda.h:142
CACG(DiracMatrix &mat, DiracMatrix &matSloppy, SolverParam &param, TimeProfile &profile)
Definition: inv_ca_cg.cpp:15
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const =0
SolverParam & param
Definition: invert_quda.h:463
DEVICEHOST void swap(Real &a, Real &b)
Definition: svd_quda.h:139
unsigned long long flops() const
Definition: dirac_quda.h:1119
QudaCABasis ca_basis
Definition: test_util.cpp:1631
#define printfQuda(...)
Definition: util_quda.h:115
ColorSpinorField * tmpp2
Definition: invert_quda.h:908
unsigned long long flops
Definition: blas_quda.cu:22
void xpy(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:33
void axpby(double a, ColorSpinorField &x, double b, ColorSpinorField &y)
Definition: blas_quda.h:36
DiracMdagM mdagm
Definition: invert_quda.h:971
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:64
CACGNE(DiracMatrix &mat, DiracMatrix &matSloppy, SolverParam &param, TimeProfile &profile)
Definition: inv_ca_cg.cpp:61
QudaPrecision precision_sloppy
Definition: invert_quda.h:145
void create(ColorSpinorField &b)
Initiate the fields needed by the solver.
Definition: inv_ca_cg.cpp:208
void PrintSummary(const char *name, int k, double r2, double b2, double r2_tol, double hq_tol)
Prints out the summary of the solver convergence (requires a verbosity of QUDA_SUMMARIZE). Assumes SolverParam.true_res and SolverParam.true_res_hq has been set.
Definition: solver.cpp:270
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
DiracMMdag mmdag
Definition: invert_quda.h:955
bool deflate_init
Definition: invert_quda.h:546
const Dirac * Expose() const
Definition: dirac_quda.h:1135
ColorSpinorField * xp
Definition: invert_quda.h:957
void updateR()
update the radius for halos.
Complex * Q_AQandg
Definition: invert_quda.h:901