QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
inv_gcr_quda.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 
5 #include <complex>
6 
7 #include <quda_internal.h>
8 #include <blas_quda.h>
9 #include <dslash_quda.h>
10 #include <invert_quda.h>
11 #include <util_quda.h>
12 
13 #include<face_quda.h>
14 
15 #include <color_spinor_field.h>
16 
17 #include <sys/time.h>
18 
19 namespace quda {
20 
21  double timeInterval(struct timeval start, struct timeval end) {
22  long ds = end.tv_sec - start.tv_sec;
23  long dus = end.tv_usec - start.tv_usec;
24  return ds + 0.000001*dus;
25  }
26 
27  // set the required parameters for the inner solver
28  void fillInnerSolveParam(SolverParam &inner, const SolverParam &outer) {
29  inner.tol = outer.tol_precondition;
30  inner.maxiter = outer.maxiter_precondition;
31  inner.delta = 1e-20; // no reliable updates within the inner solver
32 
33  inner.precision = outer.precision_precondition; // preconditioners are uni-precision solvers
35 
36  inner.iter = 0;
37  inner.gflops = 0;
38  inner.secs = 0;
39 
40  inner.inv_type_precondition = QUDA_GCR_INVERTER; // used to tell the inner solver it is an inner solver
41 
42  if (outer.inv_type == QUDA_GCR_INVERTER && outer.precision_sloppy != outer.precision_precondition)
45 
46  }
47 
48  void orthoDir(Complex **beta, cudaColorSpinorField *Ap[], int k) {
49  int type = 1;
50 
51  switch (type) {
52  case 0: // no kernel fusion
53  for (int i=0; i<k; i++) { // 5 (k-1) memory transactions here
54  beta[i][k] = cDotProductCuda(*Ap[i], *Ap[k]);
55  caxpyCuda(-beta[i][k], *Ap[i], *Ap[k]);
56  }
57  break;
58  case 1: // basic kernel fusion
59  if (k==0) break;
60  beta[0][k] = cDotProductCuda(*Ap[0], *Ap[k]);
61  for (int i=0; i<k-1; i++) { // 4 (k-1) memory transactions here
62  beta[i+1][k] = caxpyDotzyCuda(-beta[i][k], *Ap[i], *Ap[k], *Ap[i+1]);
63  }
64  caxpyCuda(-beta[k-1][k], *Ap[k-1], *Ap[k]);
65  break;
66  case 2: //
67  for (int i=0; i<k-2; i+=3) { // 5 (k-1) memory transactions here
68  for (int j=i; j<i+3; j++) beta[j][k] = cDotProductCuda(*Ap[j], *Ap[k]);
69  caxpbypczpwCuda(-beta[i][k], *Ap[i], -beta[i+1][k], *Ap[i+1], -beta[i+2][k], *Ap[i+2], *Ap[k]);
70  }
71 
72  if (k%3 != 0) { // need to update the remainder
73  if ((k - 3*(k/3)) % 2 == 0) {
74  beta[k-2][k] = cDotProductCuda(*Ap[k-2], *Ap[k]);
75  beta[k-1][k] = cDotProductCuda(*Ap[k-1], *Ap[k]);
76  caxpbypzCuda(beta[k-2][k], *Ap[k-2], beta[k-1][k], *Ap[k-1], *Ap[k]);
77  } else {
78  beta[k-1][k] = cDotProductCuda(*Ap[k-1], *Ap[k]);
79  caxpyCuda(beta[k-1][k], *Ap[k-1], *Ap[k]);
80  }
81  }
82 
83  break;
84  case 3:
85  for (int i=0; i<k-1; i+=2) {
86  for (int j=i; j<i+2; j++) beta[j][k] = cDotProductCuda(*Ap[j], *Ap[k]);
87  caxpbypzCuda(-beta[i][k], *Ap[i], -beta[i+1][k], *Ap[i+1], *Ap[k]);
88  }
89 
90  if (k%2 != 0) { // need to update the remainder
91  beta[k-1][k] = cDotProductCuda(*Ap[k-1], *Ap[k]);
92  caxpyCuda(beta[k-1][k], *Ap[k-1], *Ap[k]);
93  }
94  break;
95  default:
96  errorQuda("Orthogonalization type not defined");
97  }
98 
99  }
100 
101  void backSubs(const Complex *alpha, Complex** const beta, const double *gamma, Complex *delta, int n) {
102  for (int k=n-1; k>=0;k--) {
103  delta[k] = alpha[k];
104  for (int j=k+1;j<n; j++) {
105  delta[k] -= beta[k][j]*delta[j];
106  }
107  delta[k] /= gamma[k];
108  }
109  }
110 
111  void updateSolution(cudaColorSpinorField &x, const Complex *alpha, Complex** const beta,
112  double *gamma, int k, cudaColorSpinorField *p[]) {
113 
114  Complex *delta = new Complex[k];
115 
116  // Update the solution vector
117  backSubs(alpha, beta, gamma, delta, k);
118 
119  //for (int i=0; i<k; i++) caxpyCuda(delta[i], *p[i], x);
120 
121  for (int i=0; i<k-2; i+=3)
122  caxpbypczpwCuda(delta[i], *p[i], delta[i+1], *p[i+1], delta[i+2], *p[i+2], x);
123 
124  if (k%3 != 0) { // need to update the remainder
125  if ((k - 3*(k/3)) % 2 == 0) caxpbypzCuda(delta[k-2], *p[k-2], delta[k-1], *p[k-1], x);
126  else caxpyCuda(delta[k-1], *p[k-1], x);
127  }
128 
129  delete []delta;
130  }
131 
133  TimeProfile &profile) :
134  Solver(param, profile), mat(mat), matSloppy(matSloppy), matPrecon(matPrecon), K(0), Kparam(param)
135  {
136 
137  fillInnerSolveParam(Kparam, param);
138 
139  if (param.inv_type_precondition == QUDA_CG_INVERTER) // inner CG preconditioner
140  K = new CG(matPrecon, matPrecon, Kparam, profile);
141  else if (param.inv_type_precondition == QUDA_BICGSTAB_INVERTER) // inner BiCGstab preconditioner
142  K = new BiCGstab(matPrecon, matPrecon, matPrecon, Kparam, profile);
143  else if (param.inv_type_precondition == QUDA_MR_INVERTER) // inner MR preconditioner
144  K = new MR(matPrecon, Kparam, profile);
145  else if (param.inv_type_precondition == QUDA_SD_INVERTER) // inner MR preconditioner
146  K = new SD(matPrecon, Kparam, profile);
147  else if (param.inv_type_precondition != QUDA_INVALID_INVERTER) // unknown preconditioner
148  errorQuda("Unknown inner solver %d", param.inv_type_precondition);
149 
150  }
151 
152  /*
153  GCR(DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matPrecon,
154  SolverParam &param, TimeProfile &profile);
155 
156  */
157 
160 
161  if (K) delete K;
162 
164  }
165 
167  {
169 
170  int Nkrylov = param.Nkrylov; // size of Krylov space
171 
173  csParam.create = QUDA_ZERO_FIELD_CREATE;
174  cudaColorSpinorField r(x, csParam);
175  cudaColorSpinorField y(x, csParam); // high precision accumulator
176 
177  // create sloppy fields used for orthogonalization
179  cudaColorSpinorField **p = new cudaColorSpinorField*[Nkrylov];
180  cudaColorSpinorField **Ap = new cudaColorSpinorField*[Nkrylov];
181  for (int i=0; i<Nkrylov; i++) {
182  p[i] = new cudaColorSpinorField(x, csParam);
183  Ap[i] = new cudaColorSpinorField(x, csParam);
184  }
185 
186  cudaColorSpinorField tmp(x, csParam); //temporary for sloppy mat-vec
187 
188  cudaColorSpinorField *x_sloppy, *r_sloppy;
191  x_sloppy = new cudaColorSpinorField(x, csParam);
192  r_sloppy = new cudaColorSpinorField(x, csParam);
193  } else {
194  x_sloppy = &x;
195  r_sloppy = &r;
196  }
197 
198  cudaColorSpinorField &xSloppy = *x_sloppy;
199  cudaColorSpinorField &rSloppy = *r_sloppy;
200 
201  // these low precision fields are used by the inner solver
202  bool precMatch = true;
203  cudaColorSpinorField *r_pre, *p_pre;
206  p_pre = new cudaColorSpinorField(x, csParam);
207  r_pre = new cudaColorSpinorField(x, csParam);
208  precMatch = false;
209  } else {
210  p_pre = NULL;
211  r_pre = r_sloppy;
212  }
213  cudaColorSpinorField &rPre = *r_pre;
214 
216 
217  Complex *alpha = new Complex[Nkrylov];
218  Complex **beta = new Complex*[Nkrylov];
219  for (int i=0; i<Nkrylov; i++) beta[i] = new Complex[Nkrylov];
220  double *gamma = new double[Nkrylov];
221 
222  // compute parity of the node
223  int parity = 0;
224  for (int i=0; i<4; i++) parity += commCoords(i);
225  parity = parity % 2;
226 
227  double b2 = normCuda(b); // norm sq of source
228  double r2; // norm sq of residual
229 
230  // compute initial residual depending on whether we have an initial guess or not
232  mat(r, x, y);
233  r2 = xmyNormCuda(b, r);
234  copyCuda(y, x);
235  if (&x == &xSloppy) zeroCuda(x); // need to zero x when doing uni-precision solver
236  } else {
237  copyCuda(r, b);
238  r2 = b2;
239  zeroCuda(x); // defensive measure in case solution isn't already zero
240  }
241 
242  // Check to see that we're not trying to invert on a zero-field source
243  if (b2 == 0) {
245  warningQuda("inverting on zero-field source\n");
246  x = b;
247  param.true_res = 0.0;
248  param.true_res_hq = 0.0;
249  return;
250  }
251 
252  double stop = stopping(param.tol, b2, param.residual_type); // stopping condition of solver
253 
254  const bool use_heavy_quark_res =
255  (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;
256 
257  // this parameter determines how many consective reliable update
258  // reisudal increases we tolerate before terminating the solver,
259  // i.e., how long do we want to keep trying to converge
260  const int maxResIncrease = param.max_res_increase; // check if we reached the limit of our tolerance
261  const int maxResIncreaseTotal = param.max_res_increase_total;
262 
263  double heavy_quark_res = 0.0; // heavy quark residual
264  if(use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
265 
266  int resIncrease = 0;
267  int resIncreaseTotal = 0;
268 
271 
272  blas_flops = 0;
273 
274  copyCuda(rSloppy, r);
275 
276  int total_iter = 0;
277  int restart = 0;
278  double r2_old = r2;
279  bool l2_converge = false;
280 
283 
284  int k = 0;
285  PrintStats("GCR", total_iter+k, r2, b2, heavy_quark_res);
286  while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) &&
287  total_iter < param.maxiter) {
288 
289  for (int m=0; m<param.precondition_cycle; m++) {
291  cudaColorSpinorField &pPre = (precMatch ? *p[k] : *p_pre);
292 
293  if (m==0) { // residual is just source
294  copyCuda(rPre, rSloppy);
295  } else { // compute residual
296  copyCuda(*rM, rSloppy);
297  axpyCuda(-1.0, *Ap[k], *rM);
298  copyCuda(rPre, *rM);
299  }
300 
301  if ((parity+m)%2 == 0 || param.schwarz_type == QUDA_ADDITIVE_SCHWARZ) (*K)(pPre, rPre);
302  else copyCuda(pPre, rPre);
303 
304  // relaxation p = omega*p + (1-omega)*r
305  //if (param.omega!=1.0) axpbyCuda((1.0-param.omega), rPre, param.omega, pPre);
306 
307  if (m==0) { copyCuda(*p[k], pPre); }
308  else { copyCuda(tmp, pPre); xpyCuda(tmp, *p[k]); }
309 
310  } else { // no preconditioner
311  *p[k] = rSloppy;
312  }
313 
314  matSloppy(*Ap[k], *p[k], tmp);
316  printfQuda("GCR debug iter=%d: Ap2=%e, p2=%e, rPre2=%e\n", total_iter, norm2(*Ap[k]), norm2(*p[k]), norm2(rPre));
317  }
318 
319  orthoDir(beta, Ap, k);
320 
321  double3 Apr = cDotProductNormACuda(*Ap[k], rSloppy);
322 
324  printfQuda("GCR debug iter=%d: Apr=(%e,%e,%e)\n", total_iter, Apr.x, Apr.y, Apr.z);
325  for (int i=0; i<k; i++)
326  for (int j=0; j<=k; j++)
327  printfQuda("GCR debug iter=%d: beta[%d][%d] = (%e,%e)\n",
328  total_iter, i, j, real(beta[i][j]), imag(beta[i][j]));
329  }
330 
331  gamma[k] = sqrt(Apr.z); // gamma[k] = Ap[k]
332  if (gamma[k] == 0.0) errorQuda("GCR breakdown\n");
333  alpha[k] = Complex(Apr.x, Apr.y) / gamma[k]; // alpha = (1/|Ap|) * (Ap, r)
334 
335  // r -= (1/|Ap|^2) * (Ap, r) r, Ap *= 1/|Ap|
336  r2 = cabxpyAxNormCuda(1.0/gamma[k], -alpha[k], *Ap[k], rSloppy);
337 
338  k++;
339  total_iter++;
340 
341  PrintStats("GCR", total_iter, r2, b2, heavy_quark_res);
342 
343  // update since Nkrylov or maxiter reached, converged or reliable update required
344  // note that the heavy quark residual will by definition only be checked every Nkrylov steps
345  if (k==Nkrylov || total_iter==param.maxiter || (r2 < stop && !l2_converge) || sqrt(r2/r2_old) < param.delta) {
346 
347  // update the solution vector
348  updateSolution(xSloppy, alpha, beta, gamma, k, p);
349 
350  // recalculate residual in high precision
351  copyCuda(x, xSloppy);
352  xpyCuda(x, y);
353  mat(r, y, x);
354  r2 = xmyNormCuda(b, r);
355 
356  if (use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(y, r).z);
357 
358  // break-out check if we have reached the limit of the precision
359  if (r2 > r2_old) {
360  resIncrease++;
361  resIncreaseTotal++;
362  warningQuda("GCR: new reliable residual norm %e is greater than previous reliable residual norm %e (total #inc %i)",
363  sqrt(r2), sqrt(r2_old), resIncreaseTotal);
364  if (resIncrease > maxResIncrease or resIncreaseTotal > maxResIncreaseTotal) break;
365  } else {
366  resIncrease = 0;
367  }
368 
369  k = 0;
370 
371  if ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) ) {
372  restart++; // restarting if residual is still too great
373 
374  PrintStats("GCR (restart)", restart, r2, b2, heavy_quark_res);
375  copyCuda(rSloppy, r);
376  zeroCuda(xSloppy);
377 
378  r2_old = r2;
379 
380  // prevent ending the Krylov space prematurely if other convergence criteria not met
381  if (r2 < stop) l2_converge = true;
382  }
383 
384  r2_old = r2;
385 
386  }
387 
388  }
389 
390  if (total_iter > 0) copyCuda(x, y);
391 
394 
396 
397  double gflops = (blas_flops + mat.flops() + matSloppy.flops() + matPrecon.flops())*1e-9;
398  reduceDouble(gflops);
399 
400  if (k>=param.maxiter && getVerbosity() >= QUDA_SUMMARIZE)
401  warningQuda("Exceeded maximum iterations %d", param.maxiter);
402 
403  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("GCR: number of restarts = %d\n", restart);
404 
405  // Calculate the true residual
406  mat(r, x);
407  double true_res = xmyNormCuda(b, r);
408  param.true_res = sqrt(true_res / b2);
409 #if (__COMPUTE_CAPABILITY__ >= 200)
411 #else
412  param.true_res_hq = 0.0;
413 #endif
414 
415  param.gflops += gflops;
416  param.iter += total_iter;
417 
418  // reset the flops counters
419  blas_flops = 0;
420  mat.flops();
421  matSloppy.flops();
422  matPrecon.flops();
423 
426 
427  PrintSummary("GCR", total_iter, r2, b2);
428 
429  if (param.precondition_cycle > 1) delete rM;
430 
432  delete x_sloppy;
433  delete r_sloppy;
434  }
435 
437  delete p_pre;
438  delete r_pre;
439  }
440 
441  for (int i=0; i<Nkrylov; i++) {
442  delete p[i];
443  delete Ap[i];
444  }
445  delete[] p;
446  delete[] Ap;
447 
448  delete []alpha;
449  for (int i=0; i<Nkrylov; i++) delete []beta[i];
450  delete []beta;
451  delete []gamma;
452 
454 
455  return;
456  }
457 
458 } // namespace quda
bool convergence(const double &r2, const double &hq2, const double &r2_tol, const double &hq_tol)
Definition: solver.cpp:82
double timeInterval(struct timeval start, struct timeval end)
void setPrecision(QudaPrecision precision)
QudaSchwarzType schwarz_type
Definition: invert_quda.h:137
void caxpyCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:207
static double stopping(const double &tol, const double &b2, QudaResidualType residual_type)
Definition: solver.cpp:65
int y[4]
QudaInverterType inv_type
Definition: invert_quda.h:18
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:73
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
std::complex< double > Complex
Definition: eig_variables.h:13
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
TimeProfile & profile
Definition: invert_quda.h:224
double cabxpyAxNormCuda(const double &a, const Complex &b, cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: reduce_quda.cu:440
QudaInverterType inv_type_precondition
Definition: invert_quda.h:24
QudaPreserveSource preserve_source
Definition: invert_quda.h:90
int max_res_increase_total
Definition: invert_quda.h:54
void orthoDir(Complex **beta, cudaColorSpinorField *Ap[], int k)
void fillInnerSolveParam(SolverParam &inner, const SolverParam &outer)
void updateSolution(cudaColorSpinorField &x, const Complex *alpha, Complex **const beta, double *gamma, int k, cudaColorSpinorField *p[])
unsigned long long flops() const
Definition: dirac_quda.h:587
QudaGaugeParam param
Definition: pack_test.cpp:17
void backSubs(const Complex *alpha, Complex **const beta, const double *gamma, Complex *delta, int n)
GCR(DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matPrecon, SolverParam &param, TimeProfile &profile)
cudaColorSpinorField * tmp
void PrintSummary(const char *name, int k, const double &r2, const double &b2)
Definition: solver.cpp:137
QudaResidualType residual_type
Definition: invert_quda.h:35
Complex cDotProductCuda(cudaColorSpinorField &, cudaColorSpinorField &)
Definition: reduce_quda.cu:468
virtual ~GCR()
ColorSpinorParam csParam
Definition: pack_test.cpp:24
#define warningQuda(...)
Definition: util_quda.h:84
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
Definition: copy_quda.cu:235
double normCuda(const cudaColorSpinorField &b)
Definition: reduce_quda.cu:145
void axpyCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:115
Complex caxpyDotzyCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z)
Definition: reduce_quda.cu:559
int commCoords(int)
void caxpbypczpwCuda(const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, cudaColorSpinorField &)
Definition: blas_quda.cu:429
double tol_precondition
Definition: invert_quda.h:126
int x[4]
QudaPrecision precision_precondition
Definition: invert_quda.h:87
unsigned long long blas_flops
Definition: blas_quda.cu:37
QudaPrecision precision
Definition: invert_quda.h:81
SolverParam & param
Definition: invert_quda.h:223
void xpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:98
void Stop(QudaProfileType idx)
void PrintStats(const char *, int k, const double &r2, const double &b2, const double &hq2)
Definition: solver.cpp:122
double Last(QudaProfileType idx)
void reduceDouble(double &)
void operator()(cudaColorSpinorField &out, cudaColorSpinorField &in)
#define printfQuda(...)
Definition: util_quda.h:67
void caxpbypzCuda(const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, cudaColorSpinorField &)
Definition: blas_quda.cu:407
void zeroCuda(cudaColorSpinorField &a)
Definition: blas_quda.cu:40
double3 cDotProductNormACuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
Definition: reduce_quda.cu:591
void Start(QudaProfileType idx)
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:38
QudaPrecision precision_sloppy
Definition: invert_quda.h:84
double3 HeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &r)
Definition: reduce_quda.cu:777
double norm2(const ColorSpinorField &)
double xmyNormCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
Definition: reduce_quda.cu:343
const QudaParity parity
Definition: dslash_test.cpp:29
void end()