QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
inv_gcr_quda.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 
5 #include <complex>
6 
7 #include <quda_internal.h>
8 #include <blas_quda.h>
9 #include <dslash_quda.h>
10 #include <invert_quda.h>
11 #include <util_quda.h>
12 
13 #include<face_quda.h>
14 
15 #include <color_spinor_field.h>
16 
17 #include <sys/time.h>
18 
19 namespace quda {
20 
21  double timeInterval(struct timeval start, struct timeval end) {
22  long ds = end.tv_sec - start.tv_sec;
23  long dus = end.tv_usec - start.tv_usec;
24  return ds + 0.000001*dus;
25  }
26 
27  // set the required parameters for the inner solver
29  inner.tol = outer.tol_precondition;
30  inner.maxiter = outer.maxiter_precondition;
31  inner.reliable_delta = 1e-20; // no reliable updates within the inner solver
32 
33  inner.cuda_prec = outer.cuda_prec_precondition; // preconditioners are uni-precision solvers
35 
36  inner.verbosity = outer.verbosity_precondition;
37 
38  inner.iter = 0;
39  inner.gflops = 0;
40  inner.secs = 0;
41 
42  inner.inv_type_precondition = QUDA_GCR_INVERTER; // used to tell the inner solver it is an inner solver
43 
44  if (outer.inv_type == QUDA_GCR_INVERTER && outer.cuda_prec_sloppy != outer.cuda_prec_precondition)
47 
48  }
49 
50  void orthoDir(Complex **beta, cudaColorSpinorField *Ap[], int k) {
51  int type = 1;
52 
53  switch (type) {
54  case 0: // no kernel fusion
55  for (int i=0; i<k; i++) { // 5 (k-1) memory transactions here
56  beta[i][k] = cDotProductCuda(*Ap[i], *Ap[k]);
57  caxpyCuda(-beta[i][k], *Ap[i], *Ap[k]);
58  }
59  break;
60  case 1: // basic kernel fusion
61  if (k==0) break;
62  beta[0][k] = cDotProductCuda(*Ap[0], *Ap[k]);
63  for (int i=0; i<k-1; i++) { // 4 (k-1) memory transactions here
64  beta[i+1][k] = caxpyDotzyCuda(-beta[i][k], *Ap[i], *Ap[k], *Ap[i+1]);
65  }
66  caxpyCuda(-beta[k-1][k], *Ap[k-1], *Ap[k]);
67  break;
68  case 2: //
69  for (int i=0; i<k-2; i+=3) { // 5 (k-1) memory transactions here
70  for (int j=i; j<i+3; j++) beta[j][k] = cDotProductCuda(*Ap[j], *Ap[k]);
71  caxpbypczpwCuda(-beta[i][k], *Ap[i], -beta[i+1][k], *Ap[i+1], -beta[i+2][k], *Ap[i+2], *Ap[k]);
72  }
73 
74  if (k%3 != 0) { // need to update the remainder
75  if ((k - 3*(k/3)) % 2 == 0) {
76  beta[k-2][k] = cDotProductCuda(*Ap[k-2], *Ap[k]);
77  beta[k-1][k] = cDotProductCuda(*Ap[k-1], *Ap[k]);
78  caxpbypzCuda(beta[k-2][k], *Ap[k-2], beta[k-1][k], *Ap[k-1], *Ap[k]);
79  } else {
80  beta[k-1][k] = cDotProductCuda(*Ap[k-1], *Ap[k]);
81  caxpyCuda(beta[k-1][k], *Ap[k-1], *Ap[k]);
82  }
83  }
84 
85  break;
86  case 3:
87  for (int i=0; i<k-1; i+=2) {
88  for (int j=i; j<i+2; j++) beta[j][k] = cDotProductCuda(*Ap[j], *Ap[k]);
89  caxpbypzCuda(-beta[i][k], *Ap[i], -beta[i+1][k], *Ap[i+1], *Ap[k]);
90  }
91 
92  if (k%2 != 0) { // need to update the remainder
93  beta[k-1][k] = cDotProductCuda(*Ap[k-1], *Ap[k]);
94  caxpyCuda(beta[k-1][k], *Ap[k-1], *Ap[k]);
95  }
96  break;
97  default:
98  errorQuda("Orthogonalization type not defined");
99  }
100 
101  }
102 
103  void backSubs(const Complex *alpha, Complex** const beta, const double *gamma, Complex *delta, int n) {
104  for (int k=n-1; k>=0;k--) {
105  delta[k] = alpha[k];
106  for (int j=k+1;j<n; j++) {
107  delta[k] -= beta[k][j]*delta[j];
108  }
109  delta[k] /= gamma[k];
110  }
111  }
112 
113  void updateSolution(cudaColorSpinorField &x, const Complex *alpha, Complex** const beta,
114  double *gamma, int k, cudaColorSpinorField *p[]) {
115 
116  Complex *delta = new Complex[k];
117 
118  // Update the solution vector
119  backSubs(alpha, beta, gamma, delta, k);
120 
121  //for (int i=0; i<k; i++) caxpyCuda(delta[i], *p[i], x);
122 
123  for (int i=0; i<k-2; i+=3)
124  caxpbypczpwCuda(delta[i], *p[i], delta[i+1], *p[i+1], delta[i+2], *p[i+2], x);
125 
126  if (k%3 != 0) { // need to update the remainder
127  if ((k - 3*(k/3)) % 2 == 0) caxpbypzCuda(delta[k-2], *p[k-2], delta[k-1], *p[k-1], x);
128  else caxpyCuda(delta[k-1], *p[k-1], x);
129  }
130 
131  delete []delta;
132  }
133 
134  GCR::GCR(DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matPrecon, QudaInvertParam &invParam,
135  TimeProfile &profile) :
136  Solver(invParam, profile), mat(mat), matSloppy(matSloppy), matPrecon(matPrecon), K(0)
137  {
138 
139  Kparam = newQudaInvertParam();
140  fillInnerInvertParam(Kparam, invParam);
141 
142  if (invParam.inv_type_precondition == QUDA_CG_INVERTER) // inner CG preconditioner
143  K = new CG(matPrecon, matPrecon, Kparam, profile);
144  else if (invParam.inv_type_precondition == QUDA_BICGSTAB_INVERTER) // inner BiCGstab preconditioner
145  K = new BiCGstab(matPrecon, matPrecon, matPrecon, Kparam, profile);
146  else if (invParam.inv_type_precondition == QUDA_MR_INVERTER) // inner MR preconditioner
147  K = new MR(matPrecon, Kparam, profile);
148  else if (invParam.inv_type_precondition != QUDA_INVALID_INVERTER) // unknown preconditioner
149  errorQuda("Unknown inner solver %d", invParam.inv_type_precondition);
150 
151  }
152 
154  profile[QUDA_PROFILE_FREE].Start();
155 
156  if (K) delete K;
157 
158  profile[QUDA_PROFILE_FREE].Stop();
159  }
160 
162  {
163  profile[QUDA_PROFILE_INIT].Start();
164 
165  int Nkrylov = invParam.gcrNkrylov; // size of Krylov space
166 
169  cudaColorSpinorField r(x, param);
170  cudaColorSpinorField y(x, param); // high precision accumulator
171 
172  // create sloppy fields used for orthogonalization
174  cudaColorSpinorField **p = new cudaColorSpinorField*[Nkrylov];
175  cudaColorSpinorField **Ap = new cudaColorSpinorField*[Nkrylov];
176  for (int i=0; i<Nkrylov; i++) {
177  p[i] = new cudaColorSpinorField(x, param);
178  Ap[i] = new cudaColorSpinorField(x, param);
179  }
180 
181  cudaColorSpinorField tmp(x, param); //temporary for sloppy mat-vec
182 
183  cudaColorSpinorField *x_sloppy, *r_sloppy;
186  x_sloppy = new cudaColorSpinorField(x, param);
187  r_sloppy = new cudaColorSpinorField(x, param);
188  } else {
189  x_sloppy = &x;
190  r_sloppy = &r;
191  }
192 
193  cudaColorSpinorField &xSloppy = *x_sloppy;
194  cudaColorSpinorField &rSloppy = *r_sloppy;
195 
196  // these low precision fields are used by the inner solver
197  bool precMatch = true;
198  cudaColorSpinorField *r_pre, *p_pre;
201  p_pre = new cudaColorSpinorField(x, param);
202  r_pre = new cudaColorSpinorField(x, param);
203  precMatch = false;
204  } else {
205  p_pre = NULL;
206  r_pre = r_sloppy;
207  }
208  cudaColorSpinorField &rPre = *r_pre;
209 
210  Complex *alpha = new Complex[Nkrylov];
211  Complex **beta = new Complex*[Nkrylov];
212  for (int i=0; i<Nkrylov; i++) beta[i] = new Complex[Nkrylov];
213  double *gamma = new double[Nkrylov];
214 
215  double b2 = normCuda(b);
216 
217  const bool use_heavy_quark_res =
219  double stop = b2*invParam.tol*invParam.tol; // stopping condition of solver
220  double heavy_quark_res = 0.0; // heavy quark residual
221  if(use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
222 
223  int k = 0;
224 
225  // compute parity of the node
226  int parity = 0;
227  for (int i=0; i<4; i++) parity += commCoords(i);
228  parity = parity % 2;
229 
230  cudaColorSpinorField rM(rSloppy);
231  cudaColorSpinorField xM(rSloppy);
232 
233  profile[QUDA_PROFILE_INIT].Stop();
235 
236  blas_flops = 0;
237 
238  // calculate initial residual
239  mat(r, x, y);
240  zeroCuda(y);
241  double r2 = xmyNormCuda(b, r);
242  copyCuda(rSloppy, r);
243 
244  int total_iter = 0;
245  int restart = 0;
246  double r2_old = r2;
247  bool l2_converge = false;
248 
250  profile[QUDA_PROFILE_COMPUTE].Start();
251 
252  PrintStats("GCR", total_iter+k, r2, b2, heavy_quark_res);
253  while ( !convergence(r2, heavy_quark_res, stop, invParam.tol_hq) &&
254  total_iter < invParam.maxiter) {
255 
256  for (int m=0; m<invParam.precondition_cycle; m++) {
258  cudaColorSpinorField &pPre = (precMatch ? *p[k] : *p_pre);
259 
260  if (m==0) { // residual is just source
261  copyCuda(rPre, rSloppy);
262  } else { // compute residual
263  copyCuda(rM,rSloppy);
264  axpyCuda(-1.0, *Ap[k], rM);
265  copyCuda(rPre, rM);
266  }
267 
268  if ((parity+m)%2 == 0 || invParam.schwarz_type == QUDA_ADDITIVE_SCHWARZ) (*K)(pPre, rPre);
269  else copyCuda(pPre, rPre);
270 
271  // relaxation p = omega*p + (1-omega)*r
272  //if (invParam.omega!=1.0) axpbyCuda((1.0-invParam.omega), rPre, invParam.omega, pPre);
273 
274  if (m==0) { copyCuda(*p[k], pPre); }
275  else { copyCuda(tmp, pPre); xpyCuda(tmp, *p[k]); }
276 
277  } else { // no preconditioner
278  *p[k] = rSloppy;
279  }
280 
281  matSloppy(*Ap[k], *p[k], tmp);
282  }
283 
284  orthoDir(beta, Ap, k);
285 
286  double3 Apr = cDotProductNormACuda(*Ap[k], rSloppy);
287 
288  gamma[k] = sqrt(Apr.z); // gamma[k] = Ap[k]
289  if (gamma[k] == 0.0) errorQuda("GCR breakdown\n");
290  alpha[k] = Complex(Apr.x, Apr.y) / gamma[k]; // alpha = (1/|Ap|) * (Ap, r)
291 
292  // r -= (1/|Ap|^2) * (Ap, r) r, Ap *= 1/|Ap|
293  r2 = cabxpyAxNormCuda(1.0/gamma[k], -alpha[k], *Ap[k], rSloppy);
294 
295  k++;
296  total_iter++;
297 
298  PrintStats("GCR", total_iter, r2, b2, heavy_quark_res);
299 
300  // update since Nkrylov or maxiter reached, converged or reliable update required
301  // note that the heavy quark residual will by definition only be checked every Nkrylov steps
302  if (k==Nkrylov || total_iter==invParam.maxiter || (r2 < stop && !l2_converge) || r2/r2_old < invParam.reliable_delta) {
303 
304  // update the solution vector
305  updateSolution(xSloppy, alpha, beta, gamma, k, p);
306 
307  // recalculate residual in high precision
308  copyCuda(x, xSloppy);
309  xpyCuda(x, y);
310 
311  k = 0;
312  mat(r, y, x);
313  r2 = xmyNormCuda(b, r);
314 
315  if (use_heavy_quark_res) {
316  heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(y, r).z);
317  }
318 
319  if ( !convergence(r2, heavy_quark_res, stop, invParam.tol_hq) ) {
320  restart++; // restarting if residual is still too great
321 
322  PrintStats("GCR (restart)", restart, r2, b2, heavy_quark_res);
323  copyCuda(rSloppy, r);
324  zeroCuda(xSloppy);
325 
326  r2_old = r2;
327 
328  // prevent ending the Krylov space prematurely if other convergence criteria not met
329  if (r2 < stop) l2_converge = true;
330  }
331 
332  }
333 
334  }
335 
336  if (total_iter > 0) copyCuda(x, y);
337 
340 
342 
343  double gflops = (blas_flops + mat.flops() + matSloppy.flops() + matPrecon.flops())*1e-9;
344  reduceDouble(gflops);
345 
347  warningQuda("Exceeded maximum iterations %d", invParam.maxiter);
348 
349  if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("GCR: number of restarts = %d\n", restart);
350 
351  // Calculate the true residual
352  mat(r, x);
353  double true_res = xmyNormCuda(b, r);
354  invParam.true_res = sqrt(true_res / b2);
355 #if (__COMPUTE_CAPABILITY__ >= 200)
357 #else
358  invParam.true_res_hq = 0.0;
359 #endif
360 
361  invParam.gflops += gflops;
362  invParam.iter += total_iter;
363 
364  // reset the flops counters
365  blas_flops = 0;
366  mat.flops();
367  matSloppy.flops();
368  matPrecon.flops();
369 
371  profile[QUDA_PROFILE_FREE].Start();
372 
373  PrintSummary("GCR", total_iter, r2, b2);
374 
376  delete x_sloppy;
377  delete r_sloppy;
378  }
379 
381  delete p_pre;
382  delete r_pre;
383  }
384 
385  for (int i=0; i<Nkrylov; i++) {
386  delete p[i];
387  delete Ap[i];
388  }
389  delete[] p;
390  delete[] Ap;
391 
392  delete []alpha;
393  for (int i=0; i<Nkrylov; i++) delete []beta[i];
394  delete []beta;
395  delete []gamma;
396 
397  profile[QUDA_PROFILE_FREE].Stop();
398 
399  return;
400  }
401 
402 } // namespace quda