QUDA  0.9.0
inv_bicgstabl_quda.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 #include <iostream>
5 #include <sstream>
6 #include <complex>
7 
8 #include <quda_internal.h>
9 #include <color_spinor_field.h>
10 #include <blas_quda.h>
11 #include <dslash_quda.h>
12 #include <invert_quda.h>
13 #include <util_quda.h>
14 
15 namespace quda {
16 
17 
18  // Utility functions for Gram-Schmidt. Based on GCR functions.
19  // Big change is we need to go from 1 to nKrylov, not 0 to nKrylov-1.
20 
21  void BiCGstabL::computeTau(Complex **tau, double* sigma, std::vector<ColorSpinorField*> r, int begin, int size, int j)
22  {
23  Complex *Tau = new Complex[size];
24  std::vector<ColorSpinorField*> a(size), b(1);
25  for (int k=0; k<size; k++)
26  {
27  a[k] = r[begin+k];
28  Tau[k] = 0;
29  }
30  b[0] = r[j];
31  blas::cDotProduct(Tau, a, b); // vectorized dot product
32 
33  for (int k=0; k<size; k++)
34  {
35  tau[begin+k][j] = Tau[k]/sigma[begin+k];
36  }
37 
38  }
39 
40  void BiCGstabL::updateR(Complex **tau, std::vector<ColorSpinorField*> r, int begin, int size, int j)
41  {
42 
43  Complex *tau_ = new Complex[size];
44  for (int i=0; i<size; i++)
45  {
46  tau_[i] = -tau[i+begin][j];
47  }
48 
49  std::vector<ColorSpinorField*> r_(r.begin() + begin, r.begin() + begin + size);
50  std::vector<ColorSpinorField*> rj(r.begin() + j, r.begin() + j + 1);
51 
52  blas::caxpy(tau_, r_, rj);
53 
54  delete[] tau_;
55  }
56 
57  void BiCGstabL::orthoDir(Complex **tau, double* sigma, std::vector<ColorSpinorField*> r, int j, int pipeline)
58  {
59 
60  switch (pipeline)
61  {
62  case 0: // no kernel fusion
63  for (int i=1; i<j; i++) // 5 (j-2) memory transactions here. Start at 1 b/c bicgstabl convention.
64  {
65  tau[i][j] = blas::cDotProduct(*r[i], *r[j])/sigma[i];
66  blas::caxpy(-tau[i][j], *r[i], *r[j]);
67  }
68  break;
69  case 1: // basic kernel fusion
70  if (j==1) // start at 1.
71  {
72  break;
73  }
74  tau[1][j] = blas::cDotProduct(*r[1], *r[j])/sigma[1];
75  for (int i=1; i<j-1; i++) // 4 (j-2) memory transactions here. start at 1.
76  {
77  tau[i+1][j] = blas::caxpyDotzy(-tau[i][j], *r[i], *r[j], *r[i+1])/sigma[i+1];
78  }
79  blas::caxpy(-tau[j-1][j], *r[j-1], *r[j]);
80  break;
81  case 2: // two-way pipelining
82  case 3: // three-way pipelining
83  case 4: // four-way pipelining
84  case 5: // five-way pipelining
85  case 6: // six-way pipelining
86  case 7: // seven-way pipelining
87  case 8: // eight-way pipelining
88  {
89  const int N = pipeline;
90  // We're orthogonalizing r[j] against r[1], ..., r[j-1].
91  // We need to do (j-1)/N updates of length N, at 1,1+N,1+2*N,...
92  // After, we do 1 update of length (j-1)%N.
93 
94  // (j-1)/N updates of length N, at 1,1+N,1+2*N,...
95  int step;
96  for (step = 0; step < (j-1)/N; step++)
97  {
98  computeTau(tau, sigma, r, 1+step*N, N, j);
99  updateR(tau, r, 1+step*N, N, j);
100  }
101 
102  if ((j-1)%N != 0) // need to update the remainder
103  {
104  // 1 update of length (j-1)%N.
105  computeTau(tau, sigma, r, 1+step*N, (j-1)%N, j);
106  updateR(tau, r, 1+step*N, (j-1)%N, j);
107  }
108  }
109  break;
110  default:
111  errorQuda("Pipeline length %d type not defined", pipeline);
112  }
113 
114  }
115 
116  void BiCGstabL::updateUend(Complex* gamma, std::vector<ColorSpinorField*> u, int nKrylov)
117  {
118  // for (j = 0; j <= nKrylov; j++) { caxpy(-gamma[j], *u[j], *u[0]); }
119  Complex *gamma_ = new Complex[nKrylov];
120  for (int i = 0; i < nKrylov; i++)
121  {
122  gamma_[i] = -gamma[i+1];
123  }
124 
125  std::vector<ColorSpinorField*> u_(u.begin() + 1, u.end());
126  std::vector<ColorSpinorField*> u0(u.begin(), u.begin() + 1);
127 
128  blas::caxpy(gamma_, u_, u0);
129 
130  delete[] gamma_;
131  }
132 
133  void BiCGstabL::updateXRend(Complex* gamma, Complex* gamma_prime, Complex* gamma_prime_prime,
134  std::vector<ColorSpinorField*> r, ColorSpinorField& x, int nKrylov)
135  {
136  /*
137  blas::caxpy(gamma[1], *r[0], x_sloppy);
138  blas::caxpy(-gamma_prime[nKrylov], *r[nKrylov], *r[0]);
139  for (j = 1; j < nKrylov; j++)
140  {
141  caxpy(gamma_prime_prime[j], *r[j], x);
142  caxpy(-gamma_prime[j], *r[j], *r[0]);
143  }
144  */
145 
146  // This does two "wasted" caxpys (so 2*nKrylov+2 instead of 2*nKrylov), but
147  // the alternative way would be un-fusing some calls, which would require
148  // loading and saving x twice. In a solve where the sloppy precision is lower than
149  // the full precision, this can be a killer.
150  Complex *gamma_prime_prime_ = new Complex[nKrylov+1];
151  Complex *gamma_prime_ = new Complex[nKrylov+1];
152  gamma_prime_prime_[0] = gamma[1];
153  gamma_prime_prime_[nKrylov] = 0.0; // x never gets updated with r[nKrylov]
154  gamma_prime_[0] = 0.0; // r[0] never gets updated with r[0]... obvs.
155  gamma_prime_[nKrylov] = -gamma_prime[nKrylov];
156  for (int i = 1; i < nKrylov; i++)
157  {
158  gamma_prime_prime_[i] = gamma_prime_prime[i];
159  gamma_prime_[i] = -gamma_prime[i];
160  }
161  blas::caxpyBxpz(gamma_prime_prime_, r, x, gamma_prime_, *r[0]);
162 
163  delete[] gamma_prime_prime_;
164  delete[] gamma_prime_;
165  }
166 
182  {
185  };
186 
187  class BiCGstabLUpdate : public Worker {
188 
190  std::vector<ColorSpinorField*> &r;
191  std::vector<ColorSpinorField*> &u;
192 
195 
197 
202  int j_max;
203 
209  int n_update;
210 
211  public:
212  BiCGstabLUpdate(ColorSpinorField* x, std::vector<ColorSpinorField*>& r, std::vector<ColorSpinorField*>& u,
214  x(x), r(r), u(u), alpha(alpha), beta(beta), j_max(j_max),
216  {
217 
218  }
219  virtual ~BiCGstabLUpdate() { }
220 
221  void update_j_max(int new_j_max) { j_max = new_j_max; }
222  void update_update_type(BiCGstabLUpdateType new_update_type) { update_type = new_update_type; }
223 
224  // note that we can't set the stream parameter here so it is
225  // ignored. This is more of a future design direction to consider
226  void apply(const cudaStream_t &stream) {
227  static int count = 0;
228 
229  // on the first call do the first half of the update
231  {
232  for (int i= (count*j_max)/n_update; i<((count+1)*j_max)/n_update && i<j_max; i++)
233  {
234  blas::caxpby(1.0, *r[i], -*beta, *u[i]);
235  }
236  }
237  else // (update_type == BICGSTABL_UPDATE_R)
238  {
239  if (count == 0)
240  {
241  blas::caxpy(*alpha, *u[0], *x);
242  }
243  if (j_max > 0)
244  {
245  for (int i= (count*j_max)/n_update; i<((count+1)*j_max)/n_update && i<j_max; i++)
246  {
247  blas::caxpy(-*alpha, *u[i+1], *r[i]);
248  }
249  }
250  }
251 
252  if (++count == n_update) count = 0;
253 
254  }
255 
256  };
257 
258  // this is the Worker pointer that the dslash uses to launch the shifted updates
259  namespace dslash {
260  extern Worker* aux_worker;
261  }
262 
264  Solver(param, profile), mat(mat), matSloppy(matSloppy), nKrylov(param.Nkrylov), init(false)
265  {
266  r.resize(nKrylov+1);
267  u.resize(nKrylov+1);
268 
269  gamma = new Complex[nKrylov+1];
270  gamma_prime = new Complex[nKrylov+1];
272  sigma = new double[nKrylov+1];
273 
274  tau = new Complex*[nKrylov+1];
275  for (int i = 0; i < nKrylov+1; i++) { tau[i] = new Complex[nKrylov+1]; }
276 
277  std::stringstream ss;
278  ss << "BiCGstab-" << nKrylov;
279  solver_name = ss.str();
280  }
281 
283  profile.TPSTART(QUDA_PROFILE_FREE);
284  delete[] gamma;
285  delete[] gamma_prime;
286  delete[] gamma_prime_prime;
287  delete[] sigma;
288 
289  for (int i = 0; i < nKrylov+1; i++) { delete[] tau[i]; }
290  delete[] tau;
291 
292  if (init) {
293  delete r_sloppy_saved_p;
294  delete u[0];
295  for (int i = 1; i < nKrylov+1; i++) {
296  delete r[i];
297  delete u[i];
298  }
299 
300  delete x_sloppy_saved_p;
301  delete r_fullp;
302  delete r0_saved_p;
303  delete yp;
304  delete tempp;
305 
306  init = false;
307  }
308 
309  profile.TPSTOP(QUDA_PROFILE_FREE);
310 
311  }
312 
313  // Code to check for reliable updates, copied from inv_bicgstab_quda.cpp
314  // Technically, there are ways to check both 'x' and 'r' for reliable updates...
315  // the current status in BiCGstab is to just look for reliable updates in 'r'.
316  int BiCGstabL::reliable(double &rNorm, double &maxrx, double &maxrr, const double &r2, const double &delta) {
317  // reliable updates
318  rNorm = sqrt(r2);
319  if (rNorm > maxrx) maxrx = rNorm;
320  if (rNorm > maxrr) maxrr = rNorm;
321  //int updateR = (rNorm < delta*maxrr && r0Norm <= maxrr) ? 1 : 0;
322  //int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0
323  int updateR = (rNorm < delta*maxrr) ? 1 : 0;
324 
325  //printf("reliable %d %e %e %e %e\n", updateR, rNorm, maxrx, maxrr, r2);
326 
327  return updateR;
328  }
329 
331  {
332  // BiCGstab-l is based on the algorithm outlined in
333  // BICGSTAB(L) FOR LINEAR EQUATIONS INVOLVING UNSYMMETRIC MATRICES WITH COMPLEX SPECTRUM
334  // G. Sleijpen, D. Fokkema, 1993.
335  // My implementation is based on Kate Clark's implementation in CPS, to be found in
336  // src/util/dirac_op/d_op_wilson_types/bicgstab.C
337 
338  // Begin profiling preamble.
340 
341  if (!init) {
342  // Initialize fields.
345 
346  // Full precision variables.
348 
349  // Create temporary.
351 
352  // Sloppy precision variables.
353  csParam.setPrecision(param.precision_sloppy);
354 
355  // Sloppy solution.
356  x_sloppy_saved_p = ColorSpinorField::Create(csParam); // Used depending on precision.
357 
358  // Shadow residual.
359  r0_saved_p = ColorSpinorField::Create(csParam); // Used depending on precision.
360 
361  // Temporary
363 
364  // Residual (+ extra residuals for BiCG steps), Search directions.
365  // Remark: search directions are sloppy in GCR. I wonder if we can
366  // get away with that here.
367  for (int i = 0; i <= nKrylov; i++) {
370  }
371  r_sloppy_saved_p = r[0]; // Used depending on precision.
372 
373  init = true;
374  }
375 
376  double b2 = blas::norm2(b); // norm sq of source.
377  double r2; // norm sq of residual
378 
379  ColorSpinorField &r_full = *r_fullp;
380  ColorSpinorField &y = *yp;
381  ColorSpinorField &temp = *tempp;
382 
383  ColorSpinorField *r0p, *x_sloppyp; // Get assigned below.
384 
385  // Compute initial residual depending on whether we have an initial guess or not.
387  mat(r_full, x, y); // r[0] = Ax
388  r2 = blas::xmyNorm(b, r_full); // r = b - Ax, return norm.
389  blas::copy(y, x);
390  } else {
391  blas::copy(r_full, b); // r[0] = b
392  r2 = b2;
393  blas::zero(x); // defensive measure in case solution isn't already zero
394  blas::zero(y);
395  }
396 
397  // Check to see that we're not trying to invert on a zero-field source
398  if (b2 == 0) {
400  warningQuda("inverting on zero-field source");
401  x = b;
402  param.true_res = 0.0;
403  param.true_res_hq = 0.0;
405  return;
407  b2 = r2;
408  } else {
409  errorQuda("Null vector computing requires non-zero guess!");
410  }
411  }
412 
413 
414 
415  // Set field aliasing according to whether we're doing mixed precision or not.
416  // There probably be bugs and headaches hiding here.
417  if (param.precision_sloppy == x.Precision()) {
418  r[0] = &r_full; // r[0] \equiv r_sloppy points to the same memory location as r.
420  {
421  r0p = &b; // r0, b point to the same vector in memory.
422  }
423  else
424  {
425  r0p = r0_saved_p; // r0p points to the saved r0 memory.
426  *r0p = r_full; // and is set equal to r.
427  }
428  }
429  else
430  {
431  r0p = r0_saved_p; // r0p points to saved r0 memory.
432  r[0] = r_sloppy_saved_p; // r[0] points to saved r_sloppy memory.
433  *r0p = r_full; // and is set equal to r.
434  *r[0] = r_full; // yup.
435  }
436 
438  {
439  x_sloppyp = &x; // x_sloppy and x point to the same vector in memory.
440  blas::zero(*x_sloppyp); // x_sloppy is zeroed out (and, by extension, so is x).
441  }
442  else
443  {
444  x_sloppyp = x_sloppy_saved_p; // x_sloppy point to saved x_sloppy memory.
445  blas::zero(*x_sloppyp); // and is zeroed out.
446  }
447 
448  // Syntatic sugar.
449  ColorSpinorField &r0 = *r0p;
450  ColorSpinorField &x_sloppy = *x_sloppyp;
451 
452  // Zero out the first search direction.
453  blas::zero(*u[0]);
454 
455 
456  // Set some initial values.
457  sigma[0] = blas::norm2(r_full);
458 
459 
460  // Initialize values.
461  for (int i = 1; i <= nKrylov; i++)
462  {
463  blas::zero(*r[i]);
464  }
465 
466  rho0 = 1.0;
467  alpha = 0.0;
468  omega = 1.0;
469 
470  double stop = stopping(param.tol, b2, param.residual_type); // stopping condition of solver.
471 
472  const bool use_heavy_quark_res =
473  (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;
474  double heavy_quark_res = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x,r_full).z) : 0.0;
475  const int heavy_quark_check = param.heavy_quark_check; // how often to check the heavy quark residual
476 
477  blas::flops = 0;
478  //bool l2_converge = false;
479  //double r2_old = r2;
480 
481  int pipeline = param.pipeline;
482 
483  // Create the worker class for updating non-critical r, u vectors.
484  BiCGstabLUpdate bicgstabl_update(&x_sloppy, r, u, &alpha, &beta, BICGSTABL_UPDATE_U, 0, matSloppy.getStencilSteps() );
485 
486 
487  // done with preamble, begin computing.
489  profile.TPSTART(QUDA_PROFILE_COMPUTE);
490 
491  // count iteration counts
492  int k = 0;
493 
494  // Various variables related to reliable updates.
495  int rUpdate = 0; // count reliable updates.
496  double delta = param.delta; // delta for reliable updates.
497  double rNorm = sqrt(r2); // The current residual norm.
498  double maxrr = rNorm; // The maximum residual norm since the last reliable update.
499  double maxrx = rNorm; // The same. Would be different if we did 'x' reliable updates.
500 
501  PrintStats(solver_name.c_str(), k, r2, b2, heavy_quark_res);
502  while(!convergence(r2, 0.0, stop, 0.0) && k < param.maxiter) {
503 
504  // rho0 = -omega*rho0;
505  rho0 *= -omega;
506 
507  // BiCG part of calculation.
508  for (int j = 0; j < nKrylov; j++) {
509  // rho1 = <r0, r_j>, beta = alpha*rho1/rho0, rho0 = rho1;
510  // Can fuse into updateXRend.
511  rho1 = blas::cDotProduct(r0, *r[j]);
512  beta = alpha*rho1/rho0;
513  rho0 = rho1;
514 
515  // for i = 0 .. j, u[i] = r[i] - beta*u[i]
516  // All but i = j is hidden in Dslash auxillary work (overlapping comms and compute).
517  /*for (int i = 0; i <= j; i++)
518  {
519  blas::caxpby(1.0, *r[i], -beta, *u[i]);
520  }*/
521  blas::caxpby(1.0, *r[j], -beta, *u[j]);
522  if (j > 0)
523  {
524  dslash::aux_worker = &bicgstabl_update;
525  bicgstabl_update.update_j_max(j);
526  bicgstabl_update.update_update_type(BICGSTABL_UPDATE_U);
527  }
528  else
529  {
530  dslash::aux_worker = NULL;
531  }
532 
533  // u[j+1] = A ( u[j] )
534  matSloppy(*u[j+1], *u[j], temp);
535 
536  // alpha = rho0/<r0, u[j+1]>
537  // The machinary isn't there yet, but this could be fused with the matSloppy above.
538  alpha = rho0/blas::cDotProduct(r0, *u[j+1]);
539 
540  // for i = 0 .. j, r[i] = r[i] - alpha u[i+1]
541  // All but i = j is hidden in Dslash auxillary work (overlapping comms and compute).
542  /*for (int i = 0; i <= j; i++)
543  {
544  blas::caxpy(-alpha, *u[i+1], *r[i]);
545  }*/
546  blas::caxpy(-alpha, *u[j+1], *r[j]);
547  // We can always at least update x.
548  dslash::aux_worker = &bicgstabl_update;
549  bicgstabl_update.update_j_max(j);
550  bicgstabl_update.update_update_type(BICGSTABL_UPDATE_R);
551 
552  // r[j+1] = A r[j], x = x + alpha*u[0]
553  matSloppy(*r[j+1], *r[j], temp);
554  dslash::aux_worker = NULL;
555 
556  } // End BiCG part.
557 
558  // MR part. Really just modified Gram-Schmidt.
559  // The algorithm uses the byproducts of the Gram-Schmidt to update x
560  // and other such niceties. One day I'll read the paper more closely.
561  // Can take this from 'orthoDir' in inv_gcr_quda.cpp, hard code pipelining up to l = 8.
562  for (int j = 1; j <= nKrylov; j++)
563  {
564 
565 
566  // This becomes a fused operator below.
567  /*for (int i = 1; i < j; i++)
568  {
569  // tau_ij = <r_i,r_j>/sigma_i.
570  tau[i][j] = blas::cDotProduct(*r[i], *r[j])/sigma[i];
571 
572  // r_j = r_j - tau_ij r_i;
573  blas::caxpy(-tau[i][j], *r[i], *r[j]);
574  }*/
575  orthoDir(tau, sigma, r, j, pipeline);
576 
577  // sigma_j = r_j^2, gamma'_j = <r_0, r_j>/sigma_j
578 
579  // This becomes a fused operator below.
580  //sigma[j] = blas::norm2(*r[j]);
581  //gamma_prime[j] = blas::cDotProduct(*r[j], *r[0])/sigma[j];
582 
583  // rjr.x = Re(<r[j],r[0]), rjr.y = Im(<r[j],r[0]>), rjr.z = <r[j],r[j]>
584  double3 rjr = blas::cDotProductNormA(*r[j], *r[0]);
585  sigma[j] = rjr.z;
586  gamma_prime[j] = Complex(rjr.x, rjr.y)/sigma[j];
587  }
588 
589  // gamma[nKrylov] = gamma'[nKrylov], omega = gamma[nKrylov]
591  omega = gamma[nKrylov];
592 
593  // gamma = T^(-1) gamma_prime. It's in the paper, I promise.
594  for (int j = nKrylov-1; j > 0; j--)
595  {
596  // Internal def: gamma[j] = gamma'_j - \sum_{i = j+1 to nKrylov} tau_ji gamma_i
597  gamma[j] = gamma_prime[j];
598  for (int i = j+1; i <= nKrylov; i++)
599  {
600  gamma[j] = gamma[j] - tau[j][i]*gamma[i];
601  }
602  }
603 
604  // gamma'' = T S gamma. Check paper for defn of S.
605  for (int j = 1; j < nKrylov; j++)
606  {
607  gamma_prime_prime[j] = gamma[j+1];
608  for (int i = j+1; i < nKrylov; i++)
609  {
610  gamma_prime_prime[j] = gamma_prime_prime[j] + tau[j][i]*gamma[i+1];
611  }
612  }
613 
614  // Update x, r, u.
615  // x = x+ gamma_1 r_0, r_0 = r_0 - gamma'_l r_l, u_0 = u_0 - gamma_l u_l, where l = nKrylov.
616  // for (j = 0; j < nKrylov; j++) { caxpy(-gamma[j], *u[j], *u[0]); }
618 
619  //blas::caxpy(gamma[1], *r[0], x_sloppy);
620  //blas::caxpy(-gamma_prime[nKrylov], *r[nKrylov], *r[0]);
621  //for (j = 1; j < nKrylov; j++) {
622  // blas::caxpy(gamma_gamma_prime[j], *r[j], x_sloppy);
623  // blas::caxpy(-gamma_prime[j], *r[j], *r[0]);
624  //}
626 
627  // sigma[0] = r_0^2
628  sigma[0] = blas::norm2(*r[0]);
629  r2 = sigma[0];
630 
631  // Check the heavy quark residual if we need to.
632  if (use_heavy_quark_res && k%heavy_quark_check==0) {
633  if (&x != &x_sloppy)
634  {
635  blas::copy(temp,y);
636  heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x_sloppy, temp, *r[0]).z);
637  }
638  else
639  {
640  blas::copy(r_full, *r[0]);
641  heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x, y, r_full).z);
642  }
643  }
644 
645  // Check if we need to do a reliable update.
646  // In inv_bicgstab_quda.cpp, there's a variable 'updateR' that holds the check.
647  // That variable gets carried about because there are a few different places 'r' can get
648  // updated (depending on if you're using pipelining or not). In BiCGstab-L, there's only
649  // one place (for now) to get the updated residual, so we just do away with 'updateR'.
650  // Further remark: "reliable" updates rNorm, maxrr, maxrx!!
651  if (reliable(rNorm, maxrx, maxrr, r2, delta))
652  {
653  if (x.Precision() != x_sloppy.Precision())
654  {
655  blas::copy(x, x_sloppy);
656  }
657 
658  blas::xpy(x, y); // swap these around? (copied from bicgstab)
659 
660  // Don't do aux work!
661  dslash::aux_worker = NULL;
662 
663  // Explicitly recompute the residual.
664  mat(r_full, y, x); // r[0] = Ax
665 
666  r2 = blas::xmyNorm(b, r_full); // r = b - Ax, return norm.
667 
668  sigma[0] = r2;
669 
670  if (x.Precision() != r[0]->Precision())
671  {
672  blas::copy(*r[0], r_full);
673  }
674  blas::zero(x_sloppy);
675 
676  // Update rNorm, maxrr, maxrx.
677  rNorm = sqrt(r2);
678  maxrr = rNorm;
679  maxrx = rNorm;
680 
681  // Increment the reliable update count.
682  rUpdate++;
683  }
684 
685  // Check convergence.
686  k += nKrylov;
687  PrintStats(solver_name.c_str(), k, r2, b2, heavy_quark_res);
688  } // Done iterating.
689 
690  if (x.Precision() != x_sloppy.Precision())
691  {
692  blas::copy(x, x_sloppy);
693  }
694 
695  blas::xpy(y, x);
696 
697  // Done with compute, begin the epilogue.
700 
702  double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9;
703  param.gflops = gflops;
704  param.iter += k;
705 
706  if (k >= param.maxiter) // >= if nKrylov doesn't divide max iter.
707  warningQuda("Exceeded maximum iterations %d", param.maxiter);
708 
709  // Print number of reliable updates.
710  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("%s: Reliable updates = %d\n", solver_name.c_str(), rUpdate);
711 
712  // compute the true residual
713  // !param.is_preconditioner comes from bicgstab, param.compute_true_res came from gcr.
714  if (!param.is_preconditioner && param.compute_true_res) { // do not do the below if this is an inner solver.
715  mat(r_full, x, y);
716  double true_res = blas::xmyNorm(b, r_full);
717  param.true_res = sqrt(true_res / b2);
718 
719  param.true_res_hq = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x,*r[0]).z) : 0.0;
720  }
721 
722  // Reset flops counters.
723  blas::flops = 0;
724  mat.flops();
725 
726  // copy the residual to b so we can use it outside of the solver.
728  {
729  blas::copy(b, r_full);
730  }
731 
732  // Done with epilogue, begin free.
733 
735  profile.TPSTART(QUDA_PROFILE_FREE);
736 
737  // ...yup...
738  PrintSummary(solver_name.c_str(), k, r2, b2);
739 
740  // Done!
741  profile.TPSTOP(QUDA_PROFILE_FREE);
742  return;
743  }
744 
745 } // namespace quda
bool convergence(const double &r2, const double &hq2, const double &r2_tol, const double &hq_tol)
Definition: solver.cpp:139
ColorSpinorField * x_sloppy_saved_p
Definition: invert_quda.h:561
void update_update_type(BiCGstabLUpdateType new_update_type)
ColorSpinorField * yp
Full precision residual.
Definition: invert_quda.h:554
static double stopping(const double &tol, const double &b2, QudaResidualType residual_type)
Definition: solver.cpp:122
double3 cDotProductNormA(ColorSpinorField &a, ColorSpinorField &b)
Definition: reduce_quda.cu:572
ColorSpinorField * r0_saved_p
Sloppy solution vector.
Definition: invert_quda.h:562
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:241
void init()
Definition: blas_quda.cu:64
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
Definition: reduce_quda.cu:500
std::complex< double > Complex
Definition: eig_variables.h:13
cudaStream_t * stream
BiCGstabL(DiracMatrix &mat, DiracMatrix &matSloppy, SolverParam &param, TimeProfile &profile)
double3 xpyHeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &r)
Definition: reduce_quda.cu:742
static ColorSpinorField * Create(const ColorSpinorParam &param)
void computeTau(Complex **tau, double *sigma, std::vector< ColorSpinorField *> r, int begin, int size, int j)
TimeProfile & profile
Definition: invert_quda.h:329
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: copy_quda.cu:263
Complex * gamma
Definition: invert_quda.h:547
double xmyNorm(ColorSpinorField &x, ColorSpinorField &y)
Definition: reduce_quda.cu:364
std::vector< ColorSpinorField * > u
Definition: invert_quda.h:558
Complex * gamma_prime
Definition: invert_quda.h:547
QudaPreserveSource preserve_source
Definition: invert_quda.h:121
ColorSpinorField * r_fullp
Definition: invert_quda.h:553
std::string solver_name
Definition: invert_quda.h:586
DiracMatrix & mat
Definition: invert_quda.h:537
BiCGstabLUpdate(ColorSpinorField *x, std::vector< ColorSpinorField *> &r, std::vector< ColorSpinorField *> &u, Complex *alpha, Complex *beta, BiCGstabLUpdateType update_type, int j_max, int n_update)
void caxpyBxpz(const Complex &, ColorSpinorField &, ColorSpinorField &, const Complex &, ColorSpinorField &)
Definition: blas_quda.cu:438
int pipeline
Definition: test_util.cpp:1632
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
QudaComputeNullVector compute_null_vector
Definition: invert_quda.h:53
std::vector< ColorSpinorField * > & u
double Last(QudaProfileType idx)
Complex * gamma_prime_prime
Definition: invert_quda.h:547
void PrintSummary(const char *name, int k, const double &r2, const double &b2)
Definition: solver.cpp:194
static unsigned int delta
QudaResidualType residual_type
Definition: invert_quda.h:47
Worker * aux_worker
Definition: dslash_quda.cu:78
ColorSpinorParam csParam
Definition: pack_test.cpp:24
void apply(const cudaStream_t &stream)
#define warningQuda(...)
Definition: util_quda.h:101
bool is_preconditioner
verbosity to use for preconditioner
Definition: invert_quda.h:199
double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r)
Definition: reduce_quda.cu:703
std::vector< ColorSpinorField * > r
Sloppy temporary vector.
Definition: invert_quda.h:557
void orthoDir(Complex **tau, double *sigma, std::vector< ColorSpinorField *> r, int j, int pipeline)
const DiracMatrix & matSloppy
Definition: invert_quda.h:538
void updateUend(Complex *gamma, std::vector< ColorSpinorField *> u, int nKrylov)
Complex caxpyDotzy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
Definition: reduce_quda.cu:544
double gamma(double) __attribute__((availability(macosx
void caxpy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.cu:246
void zero(ColorSpinorField &a)
Definition: blas_quda.cu:45
ColorSpinorField * tempp
Full precision temporary.
Definition: invert_quda.h:556
int reliable(double &rNorm, double &maxrx, double &maxrr, const double &r2, const double &delta)
Current residual, in BiCG language.
SolverParam & param
Definition: invert_quda.h:328
void updateR(Complex **tau, std::vector< ColorSpinorField *> r, int begin, int size, int j)
unsigned long long flops() const
Definition: dirac_quda.h:995
void PrintStats(const char *, int k, const double &r2, const double &b2, const double &hq2)
Definition: solver.cpp:179
void updateXRend(Complex *gamma, Complex *gamma_prime, Complex *gamma_prime_prime, std::vector< ColorSpinorField *> r, ColorSpinorField &x, int nKrylov)
#define printfQuda(...)
Definition: util_quda.h:84
ColorSpinorField * r_sloppy_saved_p
Shadow residual, in BiCG language.
Definition: invert_quda.h:563
unsigned long long flops
Definition: blas_quda.cu:42
void xpy(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.cu:128
double * sigma
Definition: invert_quda.h:549
virtual int getStencilSteps() const =0
void caxpby(const Complex &a, ColorSpinorField &x, const Complex &b, ColorSpinorField &y)
Definition: blas_quda.cu:292
std::vector< ColorSpinorField * > & r
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:50
QudaPrecision precision_sloppy
Definition: invert_quda.h:115
bool use_sloppy_partial_accumulator
Definition: invert_quda.h:59
void update_j_max(int new_j_max)
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
QudaPrecision Precision() const
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
Definition: cub_helper.cuh:118
#define a
BiCGstabLUpdateType update_type
void operator()(ColorSpinorField &out, ColorSpinorField &in)
Complex ** tau
Definition: invert_quda.h:548