QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
inv_multi_cg_quda.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 
5 #include <quda_internal.h>
6 #include <color_spinor_field.h>
7 #include <blas_quda.h>
8 #include <dslash_quda.h>
9 #include <invert_quda.h>
10 #include <util_quda.h>
11 #include <face_quda.h>
12 
23 namespace quda {
24 
26  TimeProfile &profile)
27  : MultiShiftSolver(param, profile), mat(mat), matSloppy(matSloppy) {
28 
29  }
30 
32 
33  }
34 
38  void updateAlphaZeta(double *alpha, double *zeta, double *zeta_old,
39  const double *r2, const double *beta, const double pAp,
40  const double *offset, const int nShift, const int j_low) {
41  double alpha_old[QUDA_MAX_MULTI_SHIFT];
42  for (int j=0; j<nShift; j++) alpha_old[j] = alpha[j];
43 
44  alpha[0] = r2[0] / pAp;
45  zeta[0] = 1.0;
46  for (int j=1; j<nShift; j++) {
47  double c0 = zeta[j] * zeta_old[j] * alpha_old[j_low];
48  double c1 = alpha[j_low] * beta[j_low] * (zeta_old[j]-zeta[j]);
49  double c2 = zeta_old[j] * alpha_old[j_low] * (1.0+(offset[j]-offset[0])*alpha[j_low]);
50 
51  zeta_old[j] = zeta[j];
52  if (c1+c2 != 0.0){
53  zeta[j] = c0 / (c1 + c2);
54  }
55  else {
56  zeta[j] = 0.0;
57  }
58  if (zeta[j] != 0.0){
59  alpha[j] = alpha[j_low] * zeta[j] / zeta_old[j];
60  }
61  else {
62  alpha[j] = 0.0;
63  }
64  }
65  }
66 
68  {
70 
71  int num_offset = param.num_offset;
72  double *offset = param.offset;
73 
74  if (num_offset == 0) return;
75 
76  const double b2 = normCuda(b);
77  // Check to see that we're not trying to invert on a zero-field source
78  if(b2 == 0){
80  printfQuda("Warning: inverting on zero-field source\n");
81  for(int i=0; i<num_offset; ++i){
82  *(x[i]) = b;
83  param.true_res_offset[i] = 0.0;
84  param.true_res_hq_offset[i] = 0.0;
85  }
86  return;
87  }
88 
89 
90  double *zeta = new double[num_offset];
91  double *zeta_old = new double[num_offset];
92  double *alpha = new double[num_offset];
93  double *beta = new double[num_offset];
94 
95  int j_low = 0;
96  int num_offset_now = num_offset;
97  for (int i=0; i<num_offset; i++) {
98  zeta[i] = zeta_old[i] = 1.0;
99  beta[i] = 0.0;
100  alpha[i] = 1.0;
101  }
102 
103  // flag whether we will be using reliable updates or not
104  bool reliable = false;
105  for (int j=0; j<num_offset; j++)
106  if (param.tol_offset[j] < param.delta) reliable = true;
107 
108 
110  cudaColorSpinorField **y = reliable ? new cudaColorSpinorField*[num_offset] : NULL;
111 
113  csParam.create = QUDA_ZERO_FIELD_CREATE;
114 
115  if (reliable)
116  for (int i=0; i<num_offset; i++) y[i] = new cudaColorSpinorField(*r, csParam);
117 
119 
120  cudaColorSpinorField *r_sloppy;
121  if (param.precision_sloppy == x[0]->Precision()) {
122  r_sloppy = r;
123  } else {
124  csParam.create = QUDA_COPY_FIELD_CREATE;
125  r_sloppy = new cudaColorSpinorField(*r, csParam);
126  }
127 
128  cudaColorSpinorField **x_sloppy = new cudaColorSpinorField*[num_offset];
129  if (param.precision_sloppy == x[0]->Precision() ||
131  for (int i=0; i<num_offset; i++) x_sloppy[i] = x[i];
132  } else {
133  csParam.create = QUDA_ZERO_FIELD_CREATE;
134  for (int i=0; i<num_offset; i++)
135  x_sloppy[i] = new cudaColorSpinorField(*x[i], csParam);
136  }
137 
138  cudaColorSpinorField **p = new cudaColorSpinorField*[num_offset];
139  for (int i=0; i<num_offset; i++) p[i]= new cudaColorSpinorField(*r_sloppy);
140 
141  csParam.create = QUDA_ZERO_FIELD_CREATE;
142  cudaColorSpinorField* Ap = new cudaColorSpinorField(*r_sloppy, csParam);
143 
144  cudaColorSpinorField tmp1(*Ap, csParam);
145 
146  // tmp2 only needed for multi-gpu Wilson-like kernels
147  cudaColorSpinorField *tmp2_p = !mat.isStaggered() ?
148  new cudaColorSpinorField(*Ap, csParam) : &tmp1;
149  cudaColorSpinorField &tmp2 = *tmp2_p;
150 
151  // additional high-precision temporary if Wilson and mixed-precision
152  csParam.setPrecision(param.precision);
153  cudaColorSpinorField *tmp3_p =
155  new cudaColorSpinorField(*r, csParam) : &tmp1;
156  cudaColorSpinorField &tmp3 = *tmp3_p;
157 
160 
161  // stopping condition of each shift
162  double stop[QUDA_MAX_MULTI_SHIFT];
163  double r2[QUDA_MAX_MULTI_SHIFT];
164  for (int i=0; i<num_offset; i++) {
165  r2[i] = b2;
167  }
168 
169  double r2_old;
170  double pAp;
171 
172  double rNorm[QUDA_MAX_MULTI_SHIFT];
173  double r0Norm[QUDA_MAX_MULTI_SHIFT];
174  double maxrx[QUDA_MAX_MULTI_SHIFT];
175  double maxrr[QUDA_MAX_MULTI_SHIFT];
176  for (int i=0; i<num_offset; i++) {
177  rNorm[i] = sqrt(r2[i]);
178  r0Norm[i] = rNorm[i];
179  maxrx[i] = rNorm[i];
180  maxrr[i] = rNorm[i];
181  }
182  double delta = param.delta;
183 
184  // this parameter determines how many consective reliable update
185  // reisudal increases we tolerate before terminating the solver,
186  // i.e., how long do we want to keep trying to converge
187  const int maxResIncrease = param.max_res_increase; // check if we reached the limit of our tolerance
188  const int maxResIncreaseTotal = param.max_res_increase_total;
189 
190  int resIncrease = 0;
191  int resIncreaseTotal[QUDA_MAX_MULTI_SHIFT];
192  for (int i=0; i<num_offset; i++) {
193  resIncreaseTotal[i]=0;
194  }
195 
196  int k = 0;
197  int rUpdate = 0;
198  quda::blas_flops = 0;
199 
202 
203  if (getVerbosity() >= QUDA_VERBOSE)
204  printfQuda("MultiShift CG: %d iterations, <r,r> = %e, |r|/|b| = %e\n", k, r2[0], sqrt(r2[0]/b2));
205 
206  while (r2[0] > stop[0] && k < param.maxiter) {
207  matSloppy(*Ap, *p[0], tmp1, tmp2);
208  // FIXME - this should be curried into the Dirac operator
209  if (r->Nspin()==4) axpyCuda(offset[0], *p[0], *Ap);
210 
211  pAp = reDotProductCuda(*p[0], *Ap);
212 
213  // compute zeta and alpha
214  updateAlphaZeta(alpha, zeta, zeta_old, r2, beta, pAp, offset, num_offset_now, j_low);
215 
216  r2_old = r2[0];
217  Complex cg_norm = axpyCGNormCuda(-alpha[j_low], *Ap, *r_sloppy);
218  r2[0] = real(cg_norm);
219  double zn = imag(cg_norm);
220 
221  // reliable update conditions
222  rNorm[0] = sqrt(r2[0]);
223  for (int j=1; j<num_offset_now; j++) rNorm[j] = rNorm[0] * zeta[j];
224 
225  int updateX=0, updateR=0;
226  int reliable_shift = -1; // this is the shift that sets the reliable_shift
227  for (int j=num_offset_now-1; j>=0; j--) {
228  if (rNorm[j] > maxrx[j]) maxrx[j] = rNorm[j];
229  if (rNorm[j] > maxrr[j]) maxrr[j] = rNorm[j];
230  updateX = (rNorm[j] < delta*r0Norm[j] && r0Norm[j] <= maxrx[j]) ? 1 : updateX;
231  updateR = ((rNorm[j] < delta*maxrr[j] && r0Norm[j] <= maxrr[j]) || updateX) ? 1 : updateR;
232  if ((updateX || updateR) && reliable_shift == -1) reliable_shift = j;
233  }
234 
235  if ( !(updateR || updateX) || !reliable) {
236  //beta[0] = r2[0] / r2_old;
237  beta[0] = zn / r2_old;
238  // update p[0] and x[0]
239  axpyZpbxCuda(alpha[0], *p[0], *x_sloppy[0], *r_sloppy, beta[0]);
240 
241  for (int j=1; j<num_offset_now; j++) {
242  beta[j] = beta[j_low] * zeta[j] * alpha[j] / (zeta_old[j] * alpha[j_low]);
243  // update p[i] and x[i]
244  axpyBzpcxCuda(alpha[j], *p[j], *x_sloppy[j], zeta[j], *r_sloppy, beta[j]);
245  }
246  } else {
247  for (int j=0; j<num_offset_now; j++) {
248  axpyCuda(alpha[j], *p[j], *x_sloppy[j]);
249  copyCuda(*x[j], *x_sloppy[j]);
250  xpyCuda(*x[j], *y[j]);
251  }
252 
253  mat(*r, *y[0], *x[0], tmp3); // here we can use x as tmp
254  if (r->Nspin()==4) axpyCuda(offset[0], *y[0], *r);
255 
256  r2[0] = xmyNormCuda(b, *r);
257  for (int j=1; j<num_offset_now; j++) r2[j] = zeta[j] * zeta[j] * r2[0];
258  for (int j=0; j<num_offset_now; j++) zeroCuda(*x_sloppy[j]);
259 
260  copyCuda(*r_sloppy, *r);
261 
262  // break-out check if we have reached the limit of the precision
263 
264  if (sqrt(r2[reliable_shift]) > r0Norm[reliable_shift]) { // reuse r0Norm for this
265  resIncrease++;
266  resIncreaseTotal[reliable_shift]++;
267  warningQuda("MultiShiftCG: Shift %d, updated residual %e is greater than previous residual %e (total #inc %i)",
268  reliable_shift, sqrt(r2[reliable_shift]), r0Norm[reliable_shift], resIncreaseTotal[reliable_shift]);
269 
270 
271  if (resIncrease > maxResIncrease or resIncreaseTotal[reliable_shift] > maxResIncreaseTotal) break; // check if we reached the limit of our tolerancebreak;
272  } else {
273  resIncrease = 0;
274  }
275 
276  // explicitly restore the orthogonality of the gradient vector
277  for (int j=0; j<num_offset_now; j++) {
278  double rp = reDotProductCuda(*r_sloppy, *p[j]) / (r2[0]);
279  axpyCuda(-rp, *r_sloppy, *p[j]);
280  }
281 
282  // update beta and p
283  beta[0] = r2[0] / r2_old;
284  xpayCuda(*r_sloppy, beta[0], *p[0]);
285  for (int j=1; j<num_offset_now; j++) {
286  beta[j] = beta[j_low] * zeta[j] * alpha[j] / (zeta_old[j] * alpha[j_low]);
287  axpbyCuda(zeta[j], *r_sloppy, beta[j], *p[j]);
288  }
289 
290  // update reliable update parameters for the system that triggered the update
291  int m = reliable_shift;
292  rNorm[m] = sqrt(r2[0]) * zeta[m];
293  maxrr[m] = rNorm[m];
294  maxrx[m] = rNorm[m];
295  r0Norm[m] = rNorm[m];
296  rUpdate++;
297  }
298 
299  // now we can check if any of the shifts have converged and remove them
300  for (int j=1; j<num_offset_now; j++) {
301  if (zeta[j] == 0.0) {
302  num_offset_now--;
303  if (getVerbosity() >= QUDA_VERBOSE)
304  printfQuda("MultiShift CG: Shift %d converged after %d iterations\n", j, k + 1);
305  }
306  else {
307  r2[j] = zeta[j] * zeta[j] * r2[0];
308  if (r2[j] < stop[j]) {
309  num_offset_now--;
310  if (getVerbosity() >= QUDA_VERBOSE)
311  printfQuda("MultiShift CG: Shift %d converged after %d iterations\n", j, k+1);
312  }
313  }
314  }
315 
316  k++;
317 
318  if (getVerbosity() >= QUDA_VERBOSE)
319  printfQuda("MultiShift CG: %d iterations, <r,r> = %e, |r|/|b| = %e\n", k, r2[0], sqrt(r2[0]/b2));
320  }
321 
322 
323  for (int i=0; i<num_offset; i++) {
324  copyCuda(*x[i], *x_sloppy[i]);
325  if (reliable) xpyCuda(*y[i], *x[i]);
326  }
327 
330 
331  if (getVerbosity() >= QUDA_VERBOSE)
332  printfQuda("MultiShift CG: Reliable updates = %d\n", rUpdate);
333 
334  if (k==param.maxiter) warningQuda("Exceeded maximum iterations %d\n", param.maxiter);
335 
337  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
338  reduceDouble(gflops);
339  param.gflops = gflops;
340  param.iter += k;
341 
342  for(int i=0; i < num_offset; i++) {
343  mat(*r, *x[i]);
344  if (r->Nspin()==4) {
345  axpyCuda(offset[i], *x[i], *r); // Offset it.
346  } else if (i!=0) {
347  axpyCuda(offset[i]-offset[0], *x[i], *r); // Offset it.
348  }
349  double true_res = xmyNormCuda(b, *r);
350  param.true_res_offset[i] = sqrt(true_res/b2);
351 #if (__COMPUTE_CAPABILITY__ >= 200)
353 #else
354  param.true_res_hq_offset[i] = 0.0;
355 #endif
356  }
357 
358  if (getVerbosity() >= QUDA_SUMMARIZE){
359  printfQuda("MultiShift CG: Converged after %d iterations\n", k);
360  for(int i=0; i < num_offset; i++) {
361  printfQuda(" shift=%d, relative residual: iterated = %e, true = %e\n",
362  i, sqrt(r2[i]/b2), param.true_res_offset[i]);
363  }
364  }
365 
366  // reset the flops counters
367  quda::blas_flops = 0;
368  mat.flops();
369  matSloppy.flops();
370 
373 
374  if (&tmp3 != &tmp1) delete tmp3_p;
375  if (&tmp2 != &tmp1) delete tmp2_p;
376 
377  if (r_sloppy->Precision() != r->Precision()) delete r_sloppy;
378  for (int i=0; i<num_offset; i++)
379  if (x_sloppy[i]->Precision() != x[i]->Precision()) delete x_sloppy[i];
380  delete []x_sloppy;
381 
382  delete r;
383  for (int i=0; i<num_offset; i++) delete p[i];
384  delete []p;
385 
386  if (reliable) {
387  for (int i=0; i<num_offset; i++) delete y[i];
388  delete []y;
389  }
390 
391  delete Ap;
392 
393  delete []zeta_old;
394  delete []zeta;
395  delete []alpha;
396  delete []beta;
397 
399 
400  return;
401  }
402 
403 } // namespace quda
void setPrecision(QudaPrecision precision)
static double stopping(const double &tol, const double &b2, QudaResidualType residual_type)
Definition: solver.cpp:65
int y[4]
#define QUDA_MAX_MULTI_SHIFT
Maximum number of shifts supported by the multi-shift solver. This number may be changed if need be...
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
SolverParam & param
Definition: invert_quda.h:476
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
std::complex< double > Complex
Definition: eig_variables.h:13
const DiracMatrix & mat
Definition: invert_quda.h:490
void axpbyCuda(const double &a, cudaColorSpinorField &x, const double &b, cudaColorSpinorField &y)
Definition: blas_quda.cu:82
cudaColorSpinorField * tmp1
Definition: dslash_test.cpp:41
void axpyZpbxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z, const double &b)
Definition: blas_quda.cu:338
void updateAlphaZeta(double *alpha, double *zeta, double *zeta_old, const double *r2, const double *beta, const double pAp, const double *offset, const int nShift, const int j_low)
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
double tol_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:104
double offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:101
TimeProfile & profile
Definition: invert_quda.h:477
int max_res_increase_total
Definition: invert_quda.h:54
Complex axpyCGNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: reduce_quda.cu:682
unsigned long long flops() const
Definition: dirac_quda.h:587
QudaGaugeParam param
Definition: pack_test.cpp:17
cudaColorSpinorField * tmp2
Definition: dslash_test.cpp:41
void axpyBzpcxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y, const double &b, cudaColorSpinorField &z, const double &c)
Definition: blas_quda.cu:311
QudaResidualType residual_type
Definition: invert_quda.h:35
const DiracMatrix & matSloppy
Definition: invert_quda.h:491
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:113
ColorSpinorParam csParam
Definition: pack_test.cpp:24
#define warningQuda(...)
Definition: util_quda.h:84
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
Definition: copy_quda.cu:235
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:110
double normCuda(const cudaColorSpinorField &b)
Definition: reduce_quda.cu:145
void axpyCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:115
int x[4]
void operator()(cudaColorSpinorField **out, cudaColorSpinorField &in)
unsigned long long blas_flops
Definition: blas_quda.cu:37
QudaPrecision precision
Definition: invert_quda.h:81
void xpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:98
double reDotProductCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
Definition: reduce_quda.cu:170
void Stop(QudaProfileType idx)
QudaPrecision Precision() const
double Last(QudaProfileType idx)
void reduceDouble(double &)
MultiShiftCG(DiracMatrix &mat, DiracMatrix &matSloppy, SolverParam &param, TimeProfile &profile)
#define printfQuda(...)
Definition: util_quda.h:67
int reliable(double &rNorm, double &maxrx, double &maxrr, const double &r2, const double &delta)
void zeroCuda(cudaColorSpinorField &a)
Definition: blas_quda.cu:40
void Start(QudaProfileType idx)
bool isStaggered() const
Definition: dirac_quda.h:594
QudaPrecision precision_sloppy
Definition: invert_quda.h:84
bool use_sloppy_partial_accumulator
Definition: invert_quda.h:44
void xpayCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y)
Definition: blas_quda.cu:138
double3 HeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &r)
Definition: reduce_quda.cu:777
double xmyNormCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
Definition: reduce_quda.cu:343