QUDA  v1.1.0
A library for QCD on GPUs
inv_pcg_quda.cpp
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <cmath>
4 #include <iostream>
5 
6 #include <quda_internal.h>
7 #include <color_spinor_field.h>
8 #include <blas_quda.h>
9 #include <dslash_quda.h>
10 #include <invert_quda.h>
11 #include <util_quda.h>
12 
13 namespace quda
14 {
15 
16  using namespace blas;
17 
18  // set the required parameters for the inner solver
19  static void fillInnerSolverParam(SolverParam &inner, const SolverParam &outer)
20  {
21  inner.tol = outer.tol_precondition;
22  inner.delta = 1e-20; // no reliable updates within the inner solver
23 
24  // most preconditioners are uni-precision solvers, with CG being an exception
25  inner.precision
26  = outer.inv_type_precondition == QUDA_CG_INVERTER ? outer.precision_sloppy : outer.precision_precondition;
27  inner.precision_sloppy = outer.precision_precondition;
28 
29  // this sets a fixed iteration count if we're using the MR solver
30  inner.residual_type
31  = (outer.inv_type_precondition == QUDA_MR_INVERTER) ? QUDA_INVALID_RESIDUAL : QUDA_L2_RELATIVE_RESIDUAL;
32 
33  inner.iter = 0;
34  inner.gflops = 0;
35  inner.secs = 0;
36 
37  inner.inv_type_precondition = QUDA_INVALID_INVERTER;
38  inner.is_preconditioner = true; // used to tell the inner solver it is an inner solver
39  inner.pipeline = true;
40 
41  inner.schwarz_type = outer.schwarz_type;
42  inner.global_reduction = inner.schwarz_type == QUDA_INVALID_SCHWARZ ? true : false;
43 
44  inner.maxiter = outer.maxiter_precondition;
45  if (outer.inv_type_precondition == QUDA_CA_GCR_INVERTER) {
46  inner.Nkrylov = inner.maxiter / outer.precondition_cycle;
47  } else {
48  inner.Nsteps = outer.precondition_cycle;
49  }
50 
51  if (outer.inv_type == QUDA_PCG_INVERTER && outer.precision_sloppy != outer.precision_precondition)
52  inner.preserve_source = QUDA_PRESERVE_SOURCE_NO;
53  else
54  inner.preserve_source = QUDA_PRESERVE_SOURCE_YES;
55 
56  inner.verbosity_precondition = outer.verbosity_precondition;
57 
58  inner.compute_true_res = false;
59  inner.sloppy_converge = true;
60  }
61 
62  PreconCG::PreconCG(const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon,
63  const DiracMatrix &matEig, SolverParam &param, TimeProfile &profile) :
64  Solver(mat, matSloppy, matPrecon, matEig, param, profile),
65  K(0),
66  Kparam(param)
67  {
68  fillInnerSolverParam(Kparam, param);
69  // Preconditioners do not need a deflation space,
70  // so we explicily set this here.
71  Kparam.deflate = false;
72 
74  K = new CG(matPrecon, matPrecon, matPrecon, matEig, Kparam, profile);
76  K = new MR(matPrecon, matPrecon, Kparam, profile);
78  K = new SD(matPrecon, Kparam, profile);
79  } else if (param.inv_type_precondition != QUDA_INVALID_INVERTER) { // unknown preconditioner
80  errorQuda("Unknown inner solver %d", param.inv_type_precondition);
81  }
82  }
83 
85  {
86  profile.TPSTART(QUDA_PROFILE_FREE);
87 
88  if (K) delete K;
90 
91  profile.TPSTOP(QUDA_PROFILE_FREE);
92  }
93 
95  {
96  profile.TPSTART(QUDA_PROFILE_INIT);
97 
98  double b2 = blas::norm2(b);
99 
100  // Check to see that we're not trying to invert on a zero-field source
102  profile.TPSTOP(QUDA_PROFILE_INIT);
103  printfQuda("Warning: inverting on zero-field source\n");
104  x = b;
105  param.true_res = 0.0;
106  param.true_res_hq = 0.0;
107  return;
108  }
109 
110  int k = 0;
111  int rUpdate = 0;
112 
113  if (param.deflate) {
114  // Construct the eigensolver and deflation space if requested.
116  if (deflate_compute) {
117  // compute the deflation space.
118  (*eig_solve)(evecs, evals);
119  deflate_compute = false;
120  }
121  if (recompute_evals) {
123  recompute_evals = false;
124  }
125  }
126 
127  cudaColorSpinorField *minvrPre = NULL;
128  cudaColorSpinorField *rPre = NULL;
129  cudaColorSpinorField *minvr = NULL;
130  cudaColorSpinorField *minvrSloppy = NULL;
131  cudaColorSpinorField *p = NULL;
132 
135  if (K) minvr = new cudaColorSpinorField(b);
138 
139  csParam.setPrecision(param.precision_sloppy);
140 
141  // temporary fields
143  ColorSpinorField *tmp2p = nullptr;
144  ColorSpinorField *tmp3p = nullptr;
145  if (!mat.isStaggered()) {
146  // tmp2 only needed for multi-gpu Wilson-like kernels
148  // additional high-precision temporary if Wilson and mixed-precision
149  csParam.setPrecision(param.precision);
151  csParam.setPrecision(param.precision_sloppy);
152  } else {
153  tmp3p = tmp2p = tmpp;
154  }
155  ColorSpinorField &tmp = *tmpp;
156  ColorSpinorField &tmp2 = *tmp2p;
157  ColorSpinorField &tmp3 = *tmp3p;
158 
159  // compute initial residual
160  double r2 = 0.0;
162  // Compute r = b - A * x
163  mat(r, x, y, tmp3);
164  r2 = blas::xmyNorm(b, r);
165  if (b2 == 0) b2 = r2;
166  // y contains the original guess.
167  blas::copy(y, x);
168  } else {
169  if (&r != &b) blas::copy(r, b);
170  r2 = b2;
171  blas::zero(y);
172  }
173 
174  if (param.deflate && param.maxiter > 1) {
175  // Deflate and accumulate to solution vector
176  eig_solve->deflate(y, r, evecs, evals, true);
177  mat(r, y, x, tmp3);
178  r2 = blas::xmyNorm(b, r);
179  }
180 
182 
183  cudaColorSpinorField *r_sloppy;
184  if (param.precision_sloppy == x.Precision()) {
185  r_sloppy = &r;
186  minvrSloppy = minvr;
187  } else {
189  r_sloppy = new cudaColorSpinorField(r, csParam);
190  if (K) minvrSloppy = new cudaColorSpinorField(*minvr, csParam);
191  }
192 
193  cudaColorSpinorField *x_sloppy;
196  x_sloppy = &static_cast<cudaColorSpinorField &>(x);
197  } else {
199  x_sloppy = new cudaColorSpinorField(x, csParam);
200  }
201 
202  ColorSpinorField &xSloppy = *x_sloppy;
203  ColorSpinorField &rSloppy = *r_sloppy;
204 
205  blas::zero(x);
206  if (&x != &xSloppy) blas::zero(xSloppy);
207 
208  const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;
209 
210  if (K) {
212  csParam.setPrecision(Kparam.precision);
213  rPre = new cudaColorSpinorField(rSloppy, csParam);
214  // Create minvrPre
215  minvrPre = new cudaColorSpinorField(*rPre);
216  (*K)(*minvrPre, *rPre);
217  *minvrSloppy = *minvrPre;
218  p = new cudaColorSpinorField(*minvrSloppy);
219  } else {
220  p = new cudaColorSpinorField(rSloppy);
221  }
222 
223  profile.TPSTOP(QUDA_PROFILE_INIT);
225 
226  double stop = stopping(param.tol, b2, param.residual_type); // stopping condition of solver
227  double heavy_quark_res = 0.0; // heavy quark residual
228  if (use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNorm(x, r).z);
229 
230  double alpha = 0.0, beta = 0.0;
231  double pAp;
232  double rMinvr = 0;
233  double rMinvr_old = 0.0;
234  double r_new_Minvr_old = 0.0;
235  double r2_old = 0;
236  r2 = norm2(r);
237 
238  double rNorm = sqrt(r2);
239  double r0Norm = rNorm;
240  double maxrx = rNorm;
241  double maxrr = rNorm;
242  double maxr_deflate = rNorm; // The maximum residual since the last deflation
243  double delta = param.delta;
244 
245  if (K) rMinvr = reDotProduct(rSloppy, *minvrSloppy);
246 
248  profile.TPSTART(QUDA_PROFILE_COMPUTE);
249 
250  blas::flops = 0;
251 
252  PrintStats("PCG", k, r2, b2, heavy_quark_res);
253 
254  const int maxResIncrease = param.max_res_increase; // check if we reached the limit of our tolerance
255  const int maxResIncreaseTotal = param.max_res_increase_total;
256 
257  int resIncrease = 0;
258  int resIncreaseTotal = 0;
259 
260  while (!convergence(r2, heavy_quark_res, stop, param.tol_hq) && k < param.maxiter) {
261 
262  matSloppy(Ap, *p, tmp, tmp2);
263 
264  double sigma;
265  pAp = reDotProduct(*p, Ap);
266 
267  alpha = (K) ? rMinvr / pAp : r2 / pAp;
268  Complex cg_norm = axpyCGNorm(-alpha, Ap, rSloppy);
269  // r --> r - alpha*A*p
270  r2_old = r2;
271  r2 = real(cg_norm);
272 
273  sigma = imag(cg_norm) >= 0.0 ? imag(cg_norm) : r2; // use r2 if (r_k+1, r_k-1 - r_k) breaks
274 
275  if (K) rMinvr_old = rMinvr;
276 
277  rNorm = sqrt(r2);
278  if (rNorm > maxrx) maxrx = rNorm;
279  if (rNorm > maxrr) maxrr = rNorm;
280 
281  int updateX = (rNorm < delta * r0Norm && r0Norm <= maxrx) ? 1 : 0;
282  int updateR = ((rNorm < delta * maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
283 
284  // force a reliable update if we are within target tolerance (only if doing reliable updates)
285  if (convergence(r2, heavy_quark_res, stop, param.tol_hq) && delta >= param.tol) updateX = 1;
286 
287  if (!(updateR || updateX)) {
288 
289  if (K) {
290  // can fuse these two kernels
291  r_new_Minvr_old = reDotProduct(rSloppy, *minvrSloppy);
292  *rPre = rSloppy;
293 
294  (*K)(*minvrPre, *rPre);
295 
296  // can fuse these two kernels
297  *minvrSloppy = *minvrPre;
298  rMinvr = reDotProduct(rSloppy, *minvrSloppy);
299 
300  beta = (rMinvr - r_new_Minvr_old) / rMinvr_old;
301  axpyZpbx(alpha, *p, xSloppy, *minvrSloppy, beta);
302  } else {
303  beta = sigma / r2_old; // use the alternative beta computation
304  axpyZpbx(alpha, *p, xSloppy, rSloppy, beta);
305  }
306  } else { // reliable update
307 
308  axpy(alpha, *p, xSloppy); // xSloppy += alpha*p
309  xpy(xSloppy, y); // y += x
310  // Now compute r
311  mat(r, y, x, tmp3); // x is just a temporary here
312  r2 = xmyNorm(b, r);
313 
314  if (param.deflate && sqrt(r2) < maxr_deflate * param.tol_restart) {
315  // Deflate and accumulate to solution vector
316  eig_solve->deflate(y, r, evecs, evals, true);
317 
318  // Compute r_defl = RHS - A * LHS
319  mat(r, y, x, tmp3);
320  r2 = blas::xmyNorm(b, r);
321 
322  maxr_deflate = sqrt(r2);
323  }
324 
325  copy(rSloppy, r); // copy r to rSloppy
326  zero(xSloppy);
327 
328  // break-out check if we have reached the limit of the precision
329  if (sqrt(r2) > r0Norm && updateX) {
330  resIncrease++;
331  resIncreaseTotal++;
332  // reuse r0Norm for this
333  warningQuda(
334  "PCG: new reliable residual norm %e is greater than previous reliable residual norm %e (total #inc %i)",
335  sqrt(r2), r0Norm, resIncreaseTotal);
336 
337  if (resIncrease > maxResIncrease or resIncreaseTotal > maxResIncreaseTotal) break;
338 
339  } else {
340  resIncrease = 0;
341  }
342 
343  rNorm = sqrt(r2);
344  maxrr = rNorm;
345  maxrx = rNorm;
346  r0Norm = rNorm;
347  ++rUpdate;
348 
349  if (K) {
350  *rPre = rSloppy;
351  (*K)(*minvrPre, *rPre);
352  *minvrSloppy = *minvrPre;
353 
354  rMinvr = reDotProduct(rSloppy, *minvrSloppy);
355  beta = rMinvr / rMinvr_old;
356 
357  xpay(*minvrSloppy, beta, *p); // p = minvrSloppy + beta*p
358  } else { // standard CG - no preconditioning
359 
360  // explicitly restore the orthogonality of the gradient vector
361  double rp = reDotProduct(rSloppy, *p) / (r2);
362  axpy(-rp, rSloppy, *p);
363 
364  beta = r2 / r2_old;
365  xpay(rSloppy, beta, *p);
366  }
367  }
368  ++k;
369  PrintStats("PCG", k, r2, b2, heavy_quark_res);
370  }
371 
373 
375 
376  if (x.Precision() != param.precision_sloppy) copy(x, xSloppy);
377  xpy(y, x); // x += y
378 
380  double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matEig.flops()) * 1e-9;
381  param.gflops = gflops;
382  param.iter += k;
383 
384  if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
385 
386  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("PCG: Reliable updates = %d\n", rUpdate);
387 
388  // compute the true residual
389  mat(r, x, y, tmp3);
390  double true_res = xmyNorm(b, r);
391  param.true_res = sqrt(true_res / b2);
392 
393  // reset the flops counters
394  blas::flops = 0;
395  mat.flops();
396  matSloppy.flops();
397  matPrecon.flops();
398  matEig.flops();
399 
401  profile.TPSTART(QUDA_PROFILE_FREE);
402 
403  if (tmpp) delete tmpp;
404  if (!mat.isStaggered()) {
405  if (tmp2p && tmpp != tmp2p) delete tmp2p;
406  if (tmp3p && tmpp != tmp3p && param.precision != param.precision_sloppy) delete tmp3p;
407  }
408 
409  if (K) { // These are only needed if preconditioning is used
410  delete minvrPre;
411  delete rPre;
412  delete minvr;
413  if (x.Precision() != param.precision_sloppy) delete minvrSloppy;
414  }
415  delete p;
416 
417  if (param.precision_sloppy != x.Precision()) {
418  delete r_sloppy;
419  if (param.use_sloppy_partial_accumulator) { delete x_sloppy; }
420  }
421 
422  profile.TPSTOP(QUDA_PROFILE_FREE);
423  return;
424  }
425 
426 } // namespace quda
Conjugate-Gradient Solver.
Definition: invert_quda.h:639
static ColorSpinorField * Create(const ColorSpinorParam &param)
bool isStaggered() const
return if the operator is a staggered operator
Definition: dirac_quda.h:1935
unsigned long long flops() const
Definition: dirac_quda.h:1909
void deflate(std::vector< ColorSpinorField * > &sol, const std::vector< ColorSpinorField * > &src, const std::vector< ColorSpinorField * > &evecs, const std::vector< Complex > &evals, bool accumulate=false) const
Deflate a set of source vectors with a given eigenspace.
void computeEvals(const DiracMatrix &mat, std::vector< ColorSpinorField * > &evecs, std::vector< Complex > &evals, int size)
Compute eigenvalues and their residiua.
QudaPrecision Precision() const
virtual ~PreconCG()
void operator()(ColorSpinorField &out, ColorSpinorField &in)
PreconCG(const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon, const DiracMatrix &matEig, SolverParam &param, TimeProfile &profile)
bool deflate_compute
Definition: invert_quda.h:475
TimeProfile & profile
Definition: invert_quda.h:471
const DiracMatrix & mat
Definition: invert_quda.h:465
bool convergence(double r2, double hq2, double r2_tol, double hq_tol)
Definition: solver.cpp:328
bool recompute_evals
Definition: invert_quda.h:476
std::vector< ColorSpinorField * > evecs
Definition: invert_quda.h:477
void destroyDeflationSpace()
Destroy the allocated deflation space.
Definition: solver.cpp:229
const DiracMatrix & matEig
Definition: invert_quda.h:468
SolverParam & param
Definition: invert_quda.h:470
static double stopping(double tol, double b2, QudaResidualType residual_type)
Set the solver L2 stopping condition.
Definition: solver.cpp:311
std::vector< Complex > evals
Definition: invert_quda.h:478
EigenSolver * eig_solve
Definition: invert_quda.h:473
void PrintStats(const char *name, int k, double r2, double b2, double hq2)
Prints out the running statistics of the solver (requires a verbosity of QUDA_VERBOSE)
Definition: solver.cpp:373
void constructDeflationSpace(const ColorSpinorField &meta, const DiracMatrix &mat)
Constructs the deflation space and eigensolver.
Definition: solver.cpp:168
const DiracMatrix & matPrecon
Definition: invert_quda.h:467
const DiracMatrix & matSloppy
Definition: invert_quda.h:466
double Last(QudaProfileType idx)
Definition: timer.h:254
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:34
@ QUDA_USE_INIT_GUESS_YES
Definition: enum_quda.h:430
@ QUDA_VERBOSE
Definition: enum_quda.h:267
@ QUDA_HEAVY_QUARK_RESIDUAL
Definition: enum_quda.h:195
@ QUDA_INVALID_RESIDUAL
Definition: enum_quda.h:196
@ QUDA_L2_RELATIVE_RESIDUAL
Definition: enum_quda.h:193
@ QUDA_MR_INVERTER
Definition: enum_quda.h:110
@ QUDA_PCG_INVERTER
Definition: enum_quda.h:114
@ QUDA_CA_GCR_INVERTER
Definition: enum_quda.h:132
@ QUDA_SD_INVERTER
Definition: enum_quda.h:112
@ QUDA_CG_INVERTER
Definition: enum_quda.h:107
@ QUDA_INVALID_INVERTER
Definition: enum_quda.h:133
@ QUDA_PRESERVE_SOURCE_NO
Definition: enum_quda.h:238
@ QUDA_PRESERVE_SOURCE_YES
Definition: enum_quda.h:239
@ QUDA_INVALID_SCHWARZ
Definition: enum_quda.h:189
@ QUDA_ZERO_FIELD_CREATE
Definition: enum_quda.h:361
@ QUDA_COPY_FIELD_CREATE
Definition: enum_quda.h:362
@ QUDA_REFERENCE_FIELD_CREATE
Definition: enum_quda.h:363
@ QUDA_COMPUTE_NULL_VECTOR_NO
Definition: enum_quda.h:441
Complex axpyCGNorm(double a, ColorSpinorField &x, ColorSpinorField &y)
void axpyZpbx(double a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, double b)
double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r)
double xmyNorm(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:79
unsigned long long flops
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
Definition: blas_quda.h:45
void zero(ColorSpinorField &a)
double norm2(const ColorSpinorField &a)
double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
void axpy(double a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:43
void xpy(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:41
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: blas_quda.h:24
void stop()
Stop profiling.
Definition: device.cpp:228
double norm2(const CloverField &a, bool inverse=false)
__device__ __host__ void zero(double &a)
Definition: float_vector.h:318
std::complex< double > Complex
Definition: quda_internal.h:86
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
@ QUDA_PROFILE_INIT
Definition: timer.h:106
@ QUDA_PROFILE_EPILOGUE
Definition: timer.h:110
@ QUDA_PROFILE_COMPUTE
Definition: timer.h:108
@ QUDA_PROFILE_FREE
Definition: timer.h:111
@ QUDA_PROFILE_PREAMBLE
Definition: timer.h:107
__host__ __device__ std::enable_if<!isFixed< T1 >::value &&!isFixed< T2 >::value, void >::type copy(T1 &a, const T2 &b)
Copy function which is trival between floating point types. When converting to an integer type,...
Definition: convert.h:64
ColorSpinorParam csParam
Definition: pack_test.cpp:25
QudaGaugeParam param
Definition: pack_test.cpp:18
void updateR()
update the radius for halos.
QudaPrecision precision
Definition: invert_quda.h:136
QudaComputeNullVector compute_null_vector
Definition: invert_quda.h:61
bool use_sloppy_partial_accumulator
Definition: invert_quda.h:70
int max_res_increase_total
Definition: invert_quda.h:90
QudaResidualType residual_type
Definition: invert_quda.h:49
QudaPrecision precision_sloppy
Definition: invert_quda.h:139
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:58
QudaInverterType inv_type_precondition
Definition: invert_quda.h:28
#define printfQuda(...)
Definition: util_quda.h:114
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define warningQuda(...)
Definition: util_quda.h:132
#define errorQuda(...)
Definition: util_quda.h:120