QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
inv_multi_cg_quda.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 
5 #include <quda_internal.h>
6 #include <color_spinor_field.h>
7 #include <blas_quda.h>
8 #include <dslash_quda.h>
9 #include <invert_quda.h>
10 #include <util_quda.h>
11 #include <face_quda.h>
12 
23 namespace quda {
24 
26  TimeProfile &profile)
27  : MultiShiftSolver(invParam, profile), mat(mat), matSloppy(matSloppy) {
28 
29  }
30 
32 
33  }
34 
38  void updateAlphaZeta(double *alpha, double *zeta, double *zeta_old,
39  const double *r2, const double *beta, const double pAp,
40  const double *offset, const int nShift, const int j_low) {
41  double alpha_old[QUDA_MAX_MULTI_SHIFT];
42  for (int j=0; j<nShift; j++) alpha_old[j] = alpha[j];
43 
44  alpha[0] = r2[0] / pAp;
45  zeta[0] = 1.0;
46  for (int j=1; j<nShift; j++) {
47  double c0 = zeta[j] * zeta_old[j] * alpha_old[j_low];
48  double c1 = alpha[j_low] * beta[j_low] * (zeta_old[j]-zeta[j]);
49  double c2 = zeta_old[j] * alpha_old[j_low] * (1.0+(offset[j]-offset[0])*alpha[j_low]);
50 
51  zeta_old[j] = zeta[j];
52  zeta[j] = c0 / (c1 + c2);
53  alpha[j] = alpha[j_low] * zeta[j] / zeta_old[j];
54  }
55  }
56 
58  {
59  profile[QUDA_PROFILE_INIT].Start();
60 
61  int num_offset = invParam.num_offset;
62  double *offset = invParam.offset;
63 
64  if (num_offset == 0) return;
65 
66  const double b2 = normCuda(b);
67  // Check to see that we're not trying to invert on a zero-field source
68  if(b2 == 0){
69  profile[QUDA_PROFILE_INIT].Stop();
70  printfQuda("Warning: inverting on zero-field source\n");
71  for(int i=0; i<num_offset; ++i){
72  *(x[i]) = b;
73  invParam.true_res_offset[i] = 0.0;
75  }
76  return;
77  }
78 
79 
80  double *zeta = new double[num_offset];
81  double *zeta_old = new double[num_offset];
82  double *alpha = new double[num_offset];
83  double *beta = new double[num_offset];
84 
85  int j_low = 0;
86  int num_offset_now = num_offset;
87  for (int i=0; i<num_offset; i++) {
88  zeta[i] = zeta_old[i] = 1.0;
89  beta[i] = 0.0;
90  alpha[i] = 1.0;
91  }
92 
93  // flag whether we will be using reliable updates or not
94  bool reliable = false;
95  for (int j=0; j<num_offset; j++)
96  if (invParam.tol_offset[j] < invParam.reliable_delta) reliable = true;
97 
99  cudaColorSpinorField *r_sloppy;
100  cudaColorSpinorField **x_sloppy = new cudaColorSpinorField*[num_offset];
101  cudaColorSpinorField **y = reliable ? new cudaColorSpinorField*[num_offset] : NULL;
102 
105 
106  if (reliable)
107  for (int i=0; i<num_offset; i++) y[i] = new cudaColorSpinorField(*r, param);
108 
110 
111  if (invParam.cuda_prec_sloppy == x[0]->Precision()) {
112  for (int i=0; i<num_offset; i++){
113  x_sloppy[i] = x[i];
114  zeroCuda(*x_sloppy[i]);
115  }
116  r_sloppy = r;
117  } else {
118  for (int i=0; i<num_offset; i++)
119  x_sloppy[i] = new cudaColorSpinorField(*x[i], param);
121  r_sloppy = new cudaColorSpinorField(*r, param);
122  }
123 
124  cudaColorSpinorField **p = new cudaColorSpinorField*[num_offset];
125  for (int i=0; i<num_offset; i++) p[i]= new cudaColorSpinorField(*r_sloppy);
126 
128  cudaColorSpinorField* Ap = new cudaColorSpinorField(*r_sloppy, param);
129 
130  cudaColorSpinorField tmp1(*Ap, param);
131  cudaColorSpinorField *tmp2_p = &tmp1;
132  // tmp only needed for multi-gpu Wilson-like kernels
133  if (mat.Type() != typeid(DiracStaggeredPC).name() &&
134  mat.Type() != typeid(DiracStaggered).name()) {
135  tmp2_p = new cudaColorSpinorField(*Ap, param);
136  }
137  cudaColorSpinorField &tmp2 = *tmp2_p;
138 
139  profile[QUDA_PROFILE_INIT].Stop();
141 
142 
143  // stopping condition of each shift
144  double stop[QUDA_MAX_MULTI_SHIFT];
145  double r2[QUDA_MAX_MULTI_SHIFT];
146  for (int i=0; i<num_offset; i++) {
147  r2[i] = b2;
148  stop[i] = r2[i] * invParam.tol_offset[i] * invParam.tol_offset[i];
149  }
150 
151  double r2_old;
152  double pAp;
153 
154  double rNorm[QUDA_MAX_MULTI_SHIFT];
155  double r0Norm[QUDA_MAX_MULTI_SHIFT];
156  double maxrx[QUDA_MAX_MULTI_SHIFT];
157  double maxrr[QUDA_MAX_MULTI_SHIFT];
158  for (int i=0; i<num_offset; i++) {
159  rNorm[i] = sqrt(r2[i]);
160  r0Norm[i] = rNorm[i];
161  maxrx[i] = rNorm[i];
162  maxrr[i] = rNorm[i];
163  }
164  double delta = invParam.reliable_delta;
165 
166  int k = 0;
167  int rUpdate = 0;
168  quda::blas_flops = 0;
169 
171  profile[QUDA_PROFILE_COMPUTE].Start();
172 
174  printfQuda("MultiShift CG: %d iterations, <r,r> = %e, |r|/|b| = %e\n", k, r2[0], sqrt(r2[0]/b2));
175 
176  while (r2[0] > stop[0] && k < invParam.maxiter) {
177  matSloppy(*Ap, *p[0], tmp1, tmp2);
178  if (invParam.dslash_type != QUDA_ASQTAD_DSLASH) axpyCuda(offset[0], *p[0], *Ap);
179 
180  pAp = reDotProductCuda(*p[0], *Ap);
181 
182  // compute zeta and alpha
183  updateAlphaZeta(alpha, zeta, zeta_old, r2, beta, pAp, offset, num_offset_now, j_low);
184 
185  r2_old = r2[0];
186  Complex cg_norm = axpyCGNormCuda(-alpha[j_low], *Ap, *r_sloppy);
187  r2[0] = real(cg_norm);
188  double zn = imag(cg_norm);
189 
190  // reliable update conditions
191  rNorm[0] = sqrt(r2[0]);
192  for (int j=1; j<num_offset_now; j++) rNorm[j] = rNorm[0] * zeta[j];
193 
194  int updateX=0, updateR=0;
195  int reliable_shift = -1; // this is the shift that sets the reliable_shift
196  for (int j=num_offset_now-1; j>=0; j--) {
197  if (rNorm[j] > maxrx[j]) maxrx[j] = rNorm[j];
198  if (rNorm[j] > maxrr[j]) maxrr[j] = rNorm[j];
199  updateX = (rNorm[j] < delta*r0Norm[j] && r0Norm[j] <= maxrx[j]) ? 1 : updateX;
200  updateR = ((rNorm[j] < delta*maxrr[j] && r0Norm[j] <= maxrr[j]) || updateX) ? 1 : updateR;
201  if ((updateX || updateR) && reliable_shift == -1) reliable_shift = j;
202  }
203 
204  if ( !(updateR || updateX) || !reliable) {
205  //beta[0] = r2[0] / r2_old;
206  beta[0] = zn / r2_old;
207  // update p[0] and x[0]
208  axpyZpbxCuda(alpha[0], *p[0], *x_sloppy[0], *r_sloppy, beta[0]);
209 
210  for (int j=1; j<num_offset_now; j++) {
211  beta[j] = beta[j_low] * zeta[j] * alpha[j] / (zeta_old[j] * alpha[j_low]);
212  // update p[i] and x[i]
213  axpyBzpcxCuda(alpha[j], *p[j], *x_sloppy[j], zeta[j], *r_sloppy, beta[j]);
214  }
215  } else {
216  for (int j=0; j<num_offset_now; j++) {
217  axpyCuda(alpha[j], *p[j], *x_sloppy[j]);
218  copyCuda(*x[j], *x_sloppy[j]);
219  xpyCuda(*x[j], *y[j]);
220  }
221 
222  mat(*r, *y[0], *x[0]); // here we can use x as tmp
223  if (invParam.dslash_type != QUDA_ASQTAD_DSLASH) axpyCuda(offset[0], *y[0], *r);
224 
225  r2[0] = xmyNormCuda(b, *r);
226  for (int j=1; j<num_offset_now; j++) r2[j] = zeta[j] * zeta[j] * r2[0];
227  for (int j=0; j<num_offset_now; j++) zeroCuda(*x_sloppy[j]);
228 
229  copyCuda(*r_sloppy, *r);
230 
231  // break-out check if we have reached the limit of the precision
232  if (sqrt(r2[reliable_shift]) > r0Norm[reliable_shift]) { // reuse r0Norm for this
233  warningQuda("MultiShiftCG: Shift %d, updated residual %e is greater than previous residual %e",
234  reliable_shift, sqrt(r2[reliable_shift]), r0Norm[reliable_shift]);
235  k++;
236  rUpdate++;
237  if (reliable_shift == j_low) break;
238  }
239 
240  // update beta and p
241  beta[0] = r2[0] / r2_old;
242  xpayCuda(*r_sloppy, beta[0], *p[0]);
243  for (int j=1; j<num_offset_now; j++) {
244  beta[j] = beta[j_low] * zeta[j] * alpha[j] / (zeta_old[j] * alpha[j_low]);
245  axpbyCuda(zeta[j], *r_sloppy, beta[j], *p[j]);
246  }
247 
248  // update reliable update parameters for the system that triggered the update
249  int m = reliable_shift;
250  rNorm[m] = sqrt(r2[0]) * zeta[m];
251  maxrr[m] = rNorm[m];
252  maxrx[m] = rNorm[m];
253  r0Norm[m] = rNorm[m];
254  rUpdate++;
255  }
256 
257  // now we can check if any of the shifts have converged and remove them
258  for (int j=1; j<num_offset_now; j++) {
259  r2[j] = zeta[j] * zeta[j] * r2[0];
260  if (r2[j] < stop[j]) {
262  printfQuda("MultiShift CG: Shift %d converged after %d iterations\n", j, k+1);
263  num_offset_now--;
264  }
265  }
266 
267  k++;
268 
270  printfQuda("MultiShift CG: %d iterations, <r,r> = %e, |r|/|b| = %e\n", k, r2[0], sqrt(r2[0]/b2));
271  }
272 
273 
274  for (int i=0; i<num_offset; i++) {
275  copyCuda(*x[i], *x_sloppy[i]);
276  if (reliable) xpyCuda(*y[i], *x[i]);
277  }
278 
281 
282  if (k==invParam.maxiter) warningQuda("Exceeded maximum iterations %d\n", invParam.maxiter);
283 
285  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
286  reduceDouble(gflops);
287  invParam.gflops = gflops;
288  invParam.iter += k;
289 
290  for(int i=0; i < num_offset; i++) {
291  mat(*r, *x[i]);
293  axpyCuda(offset[i], *x[i], *r); // Offset it.
294  } else if (i!=0) {
295  axpyCuda(offset[i]-offset[0], *x[i], *r); // Offset it.
296  }
297  double true_res = xmyNormCuda(b, *r);
298  invParam.true_res_offset[i] = sqrt(true_res/b2);
299 #if (__COMPUTE_CAPABILITY__ >= 200)
301 #else
302  invParam.true_res_hq_offset[i] = 0.0;
303 #endif
304  }
305 
307  printfQuda("MultiShift CG: Converged after %d iterations\n", k);
308  for(int i=0; i < num_offset; i++) {
309  printfQuda(" shift=%d, relative residua: iterated = %e, true = %e\n",
310  i, sqrt(r2[i]/b2), invParam.true_res_offset[i]);
311  }
312  }
313 
314  // reset the flops counters
315  quda::blas_flops = 0;
316  mat.flops();
317  matSloppy.flops();
318 
320  profile[QUDA_PROFILE_FREE].Start();
321 
322  if (&tmp2 != &tmp1) delete tmp2_p;
323 
324  delete r;
325  for (int i=0; i<num_offset; i++) delete p[i];
326  delete []p;
327 
328  if (reliable) {
329  for (int i=0; i<num_offset; i++) delete y[i];
330  delete []y;
331  }
332 
333  delete Ap;
334 
335  if (invParam.cuda_prec_sloppy != x[0]->Precision()) {
336  for (int i=0; i<num_offset; i++) delete x_sloppy[i];
337  delete r_sloppy;
338  }
339  delete []x_sloppy;
340 
341  delete []zeta_old;
342  delete []zeta;
343  delete []alpha;
344  delete []beta;
345 
346  profile[QUDA_PROFILE_FREE].Stop();
347 
348  return;
349  }
350 
351 } // namespace quda