QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
inv_eigcg_quda.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 
5 #include <quda_internal.h>
6 #include <color_spinor_field.h>
7 #include <blas_quda.h>
8 #include <dslash_quda.h>
9 #include <invert_quda.h>
10 #include <util_quda.h>
11 #include <sys/time.h>
12 #include <string.h>
13 
14 #include <face_quda.h>
15 
16 #include <iostream>
17 
18 #include <blas_magma.h>
19 
20 #define MAX_EIGENVEC_WINDOW 16
21 
22 #define SINGLE_PRECISION_EPSILON 1e-7
23 
24 /*
25 Based on eigCG(nev, m) algorithm:
26 A. Stathopolous and K. Orginos, arXiv:0707.0131
27 */
28 
29 namespace quda {
30 
31  static DeflationParam *defl_param = 0;
32  static double eigcg_global_stop = 0.0;
33 
34  struct DeflationParam {
35  //host projection matrix:
36  Complex *proj_matrix; //VH A V
37 
38  QudaPrecision ritz_prec; //keep it right now.
39  cudaColorSpinorField *cudaRitzVectors; //device buffer for Ritz vectors
40  double *ritz_values;
41 
42  int ld; //projection matrix leading dimension
43  int tot_dim; //projection matrix full (maximum) dimension (nev*deflation_grid)
44  int cur_dim; //current dimension (must match rhs_idx: if(rhs_idx < deflation_grid) curr_nevs <= nev * rhs_idx)
45  int rtz_dim; //number of ritz values contained in ritz_values array
47 
50 
52 
53  if(param.nev == 0 || param.deflation_grid == 0) errorQuda("\nIncorrect deflation space parameters...\n");
54 
55  tot_dim = param.deflation_grid*param.nev;
56 
57  ld = ((tot_dim+15) / 16) * tot_dim;
58 
59  //allocate deflation resources:
61  ritz_values = (double*)calloc(tot_dim, sizeof(double));
62 
63  ritz_prec = param.precision_ritz;
64 
65  eigv_param.setPrecision(ritz_prec);//the same as for full precision iterations (see diracDeflateParam)
66  eigv_param.create = QUDA_ZERO_FIELD_CREATE;
67  eigv_param.eigv_dim = tot_dim;
68  eigv_param.eigv_id = -1;
69 
70  //if(eigv_param.siteSubset == QUDA_FULL_SITE_SUBSET) eigv_param.siteSubset = QUDA_PARITY_SITE_SUBSET;
71  cudaRitzVectors = new cudaColorSpinorField(eigv_param);
72 
73  return;
74  }
75 
77  if(proj_matrix) delete[] proj_matrix;
78 
80 
81  if(ritz_values) free(ritz_values);
82  }
83 
84  //reset current dimension:
85  void ResetDeflationCurrentDim(const int addedvecs){
86 
87  if(addedvecs == 0) return; //nothing to do
88 
89  if((cur_dim+addedvecs) > tot_dim) errorQuda("\nCannot reset projection matrix dimension.\n");
90 
91  added_nevs = addedvecs;
93 
94  return;
95  }
96 
97  //print information about the deflation space:
98  void PrintInfo(){
99 
100  printfQuda("\nProjection matrix information:\n");
101  printfQuda("Leading dimension %d\n", ld);
102  printfQuda("Total dimension %d\n", tot_dim);
103  printfQuda("Current dimension %d\n", cur_dim);
104  printfQuda("Host pointer: %p\n", proj_matrix);
105 
106  return;
107 
108  }
109 
111  {
112  if( cuda_ritz_alloc ){
113 
114  delete cudaRitzVectors;
115 
116  cuda_ritz_alloc = false;
117  }
118 
119  return;
120  }
121 
122  void ReshapeDeviceRitzVectorsSet(const int nev, QudaPrecision new_ritz_prec = QUDA_INVALID_PRECISION)//reset param.ritz_prec?
123  {
124  if(nev > tot_dim || (nev == tot_dim && new_ritz_prec == QUDA_INVALID_PRECISION)) return;//nothing to do
125 
126  if(!cuda_ritz_alloc) errorQuda("\nCannot reshape Ritz vectors set.\n");
127  //
128  ColorSpinorParam cudaEigvParam(cudaRitzVectors->Eigenvec(0));
129 
130  cudaEigvParam.create = QUDA_ZERO_FIELD_CREATE;
131  cudaEigvParam.eigv_dim = nev;
132  cudaEigvParam.eigv_id = -1;
133 
134  if(new_ritz_prec != QUDA_INVALID_PRECISION)
135  {
136  ritz_prec = new_ritz_prec;
137 
138  cudaEigvParam.setPrecision(ritz_prec);
139  }
140 
142 
143  cudaRitzVectors = new cudaColorSpinorField(cudaEigvParam);
144 
145  cur_dim = nev;
146 
147  cuda_ritz_alloc = true;
148 
149  return;
150  }
151 
152  };
153 
154 
155  template<typename Float, typename CudaComplex>
156  class EigCGArgs{
157 
158  private:
159  BlasMagmaArgs *eigcg_magma_args;
160 
161  //host Lanczos matrice, and its eigenvalue/vector arrays:
162  std::complex<Float> *hTm;//VH A V
163  Float *hTvalm; //eigenvalues of both T[m, m ] and T[m-1, m-1] (re-used)
164 
165  //device Lanczos matrix, and its eigenvalue/vector arrays:
166  CudaComplex *dTm; //VH A V
167  CudaComplex *dTvecm; //eigenvectors of T[m, m ]
168  CudaComplex *dTvecm1; //eigenvectors of T[m-1,m-1]
169 
170  int m;
171  int nev;
172  int ldm;
173 
174  public:
175 
176  EigCGArgs(int m, int nev);
177  ~EigCGArgs();
178 
179  //methods for constructing Lanczos matrix:
180  void LoadLanczosDiag(int idx, double alpha, double alpha0, double beta0);
181  void LoadLanczosOffDiag(int idx, double alpha0, double beta0);
182 
183  //methods for Rayleigh Ritz procedure:
184  int RestartVm(void* vm, const int cld, const int clen, const int vprec);//complex length
185 
186  //methods
187  void FillLanczosDiag(const int _2nev);
188  void FillLanczosOffDiag(const int _2nev, cudaColorSpinorField *v, cudaColorSpinorField *u, double inv_sqrt_r2);
189  //
190  void CheckEigenvalues(const cudaColorSpinorField *Vm, const DiracMatrix &matDefl, const int restart_num);//this method is designed to monitor eigcg effeciency, not for production runs
191  };
192 
193  template<typename Float, typename CudaComplex>
194  EigCGArgs<Float, CudaComplex>::EigCGArgs(int m, int nev): m(m), nev(nev){
195  //include pad?
196  ldm = ((m+15)/16)*16;//too naive
197 
198  //magma initialization:
199  const int prec = sizeof(Float);
200  eigcg_magma_args = new BlasMagmaArgs(m, nev, ldm, prec);
201 
202  hTm = new std::complex<Float>[ldm*m];//VH A V
203  hTvalm = (Float*)malloc(m*sizeof(Float)); //eigenvalues of both T[m, m ] and T[m-1, m-1] (re-used)
204 
205  //allocate dTm etc. buffers on GPU:
206  cudaMalloc(&dTm, ldm*m*sizeof(CudaComplex));//
207  cudaMalloc(&dTvecm, ldm*m*sizeof(CudaComplex));
208  cudaMalloc(&dTvecm1, ldm*m*sizeof(CudaComplex));
209 
210  //set everything to zero:
211  cudaMemset(dTm, 0, ldm*m*sizeof(CudaComplex));//?
212  cudaMemset(dTvecm, 0, ldm*m*sizeof(CudaComplex));
213  cudaMemset(dTvecm1, 0, ldm*m*sizeof(CudaComplex));
214 
215  //Error check...
216  checkCudaError();
217 
218  return;
219  }
220 
221  template<typename Float, typename CudaComplex>
223  delete[] hTm;
224 
225  free(hTvalm);
226 
227  cudaFree(dTm);
228  cudaFree(dTvecm);
229  cudaFree(dTvecm1);
230 
231  delete eigcg_magma_args;
232 
233  return;
234  }
235 
236  template<typename Float, typename CudaComplex>
237  void EigCGArgs<Float, CudaComplex>::LoadLanczosDiag(int idx, double alpha, double alpha0, double beta0)
238  {
239  hTm[idx*ldm+idx] = std::complex<Float>((Float)(1.0/alpha + beta0/alpha0), 0.0);
240  return;
241  }
242 
243  template<typename Float, typename CudaComplex>
244  void EigCGArgs<Float, CudaComplex>::LoadLanczosOffDiag(int idx, double alpha, double beta)
245  {
246  hTm[(idx+1)*ldm+idx] = std::complex<Float>((Float)(-sqrt(beta)/alpha), 0.0f);//'U'
247  hTm[idx*ldm+(idx+1)] = hTm[(idx+1)*ldm+idx];//'L'
248  return;
249  }
250 
251  template<typename Float, typename CudaComplex>
252  int EigCGArgs<Float, CudaComplex>::RestartVm(void* v, const int cld, const int clen, const int vprec)
253  {
254  //Create device version of the Lanczos matrix:
255  cudaMemcpy(dTm, hTm, ldm*m*sizeof(CudaComplex), cudaMemcpyDefault);
256 
257  //Solve m-dimensional eigenproblem:
258  cudaMemcpy(dTvecm, dTm, ldm*m*sizeof(CudaComplex), cudaMemcpyDefault);
259  eigcg_magma_args->MagmaHEEVD((void*)dTvecm, (void*)hTvalm, m);
260 
261  //Solve (m-1)-dimensional eigenproblem:
262  cudaMemcpy(dTvecm1, dTm, ldm*m*sizeof(CudaComplex), cudaMemcpyDefault);
263  eigcg_magma_args->MagmaHEEVD((void*)dTvecm1, (void*)hTvalm, m-1);
264 
265  //Zero the last row (coloumn-major format of the matrix re-interpreted as 2D row-major formated):
266  cudaMemset2D(&dTvecm1[(m-1)], ldm*sizeof(CudaComplex), 0, sizeof(CudaComplex), (m-1));
267 
268  //Attach nev old vectors to nev new vectors (note 2*nev << m):
269  cudaMemcpy(&dTvecm[ldm*nev], dTvecm1, ldm*nev*sizeof(CudaComplex), cudaMemcpyDefault);
270 
271  //Perform QR-factorization and compute QH*Tm*Q:
272  int i = eigcg_magma_args->MagmaORTH_2nev((void*)dTvecm, (void*)dTm);
273 
274  //Solve 2nev-dimensional eigenproblem:
275  eigcg_magma_args->MagmaHEEVD((void*)dTm, (void*)hTvalm, i);
276 
277  //solve zero unused part of the eigenvectors in dTm:
278  cudaMemset2D(&(dTm[i]), ldm*sizeof(CudaComplex), 0, (m-i)*sizeof(CudaComplex), i);//check..
279 
280  //Restart V:
281  eigcg_magma_args->RestartV(v, cld, clen, vprec, (void*)dTvecm, (void*)dTm);
282 
283  return i;
284  }
285 
286 
287  template<typename Float, typename CudaComplex>
289  {
290  memset(hTm, 0, ldm*m*sizeof(std::complex<Float>));
291  for (int i = 0; i < _2nev; i++) hTm[i*ldm+i]= hTvalm[i];//fill-up diagonal
292 
293  return;
294  }
295 
296  template<typename Float, typename CudaComplex>
298  {
299  if(v->Precision() != u->Precision()) errorQuda("\nIncorrect precision...\n");
300  for (int i = 0; i < _2nev; i++){
301  std::complex<double> s = cDotProductCuda(*v, u->Eigenvec(i));
302  s *= inv_sqrt_r2;
303  hTm[_2nev*ldm+i] = std::complex<Float>((Float)s.real(), (Float)s.imag());
304  hTm[i*ldm+_2nev] = conj(hTm[_2nev*ldm+i]);
305  }
306  }
307 
308  //not implemented
309  template<typename Float, typename CudaComplex>
310  void EigCGArgs<Float, CudaComplex>::CheckEigenvalues(const cudaColorSpinorField *Vm, const DiracMatrix &matDefl, const int restart_num)
311  {
312  printfQuda("\nPrint eigenvalue accuracy after %d restart.\n", restart_num);
313 
314  Complex *hproj = new Complex[nev*nev];
315 
318 
319  csParam.eigv_dim = 0;
320  csParam.eigv_id = -1;
321 
324 
326 
327  Complex alpha;
328 
329  for (int j = 0; j < nev; j++)//
330  {
331  matDefl(*W, Vm->Eigenvec(j), tmp);
332 
333  //off-diagonal:
334  for (int i = 0; i < j; i++)//row id
335  {
336  alpha = cDotProductCuda(Vm->Eigenvec(i), *W);
337  //
338  hproj[j*nev+i] = alpha;
339  hproj[i*nev+j] = conj(alpha);//conj
340  }
341 
342  //diagonal:
343  alpha = cDotProductCuda(Vm->Eigenvec(j), *W);
344  //
345  hproj[j*nev+j] = alpha;
346  }
347 
348  double *evals = (double*)calloc(nev, sizeof(double));
349 
350  cudaHostRegister(hproj, nev*nev*sizeof(Complex), cudaHostRegisterMapped);
351 
352  BlasMagmaArgs magma_args2(nev, nev, sizeof(double));//change precision..
353 
354  magma_args2.MagmaHEEVD(hproj, evals, nev, true);
355 
356  for(int i = 0; i < nev; i++)//newnev
357  {
358  for(int j = 0; j < nev; j++) caxpyCuda(hproj[i*nev+j], Vm->Eigenvec(j), *W);
359 
360  double norm2W = normCuda(*W);
361 
362  matDefl(*W2, *W, tmp);
363 
364  Complex dotWW2 = cDotProductCuda(*W, *W2);
365 
366  evals[i] = dotWW2.real() / norm2W;
367 
368  axCuda(evals[i], *W);
369 
370  mxpyCuda(*W2, *W);
371 
372  zeroCuda(*W);
373 
374  double relerr = sqrt( normCuda(*W) / norm2W );
375 
376  printfQuda("Eigenvalue %d: %1.12e Res.: %1.12e\n", i+1, evals[i], relerr);
377 
378  }
379 
380  delete W;
381  //
382  delete W2;
383  //
384  cudaHostUnregister(hproj);
385  //
386  delete [] hproj;
387 
388  delete evals;
389 
390  return;
391  }
392 
393  // set the required parameters for the initCG solver
394  void fillInitCGSolveParam(SolverParam &initCGparam) {
395  initCGparam.iter = 0;
396  initCGparam.gflops = 0;
397  initCGparam.secs = 0;
398 
399  initCGparam.inv_type = QUDA_CG_INVERTER; // use CG solver
400  initCGparam.use_init_guess = QUDA_USE_INIT_GUESS_YES;// use deflated initial guess...
401  }
402 
404  DeflatedSolver(param, profile), mat(mat), matSloppy(matSloppy), matCGSloppy(matCGSloppy), matDefl(matDefl), search_space_prec(QUDA_INVALID_PRECISION),
405  Vm(0), initCGparam(param), profile(profile), eigcg_alloc(false)
406  {
407  if((param.rhs_idx < param.deflation_grid) || (param.inv_type == QUDA_EIGCG_INVERTER))
408  {
409  if(param.nev > MAX_EIGENVEC_WINDOW )
410  {
411  warningQuda("\nWarning: the eigenvector window is too big, using default value %d.\n", MAX_EIGENVEC_WINDOW);
412  param.nev = MAX_EIGENVEC_WINDOW;
413  }
414 
415  search_space_prec = param.precision_ritz;
416  //
417  use_eigcg = true;
418  //
419  printfQuda("\nInitialize eigCG(m=%d, nev=%d) solver.\n", param.m, param.nev);
420  }
421  else
422  {
423  fillInitCGSolveParam(initCGparam);
424  //
425  use_eigcg = false;
426  //
427  printfQuda("\nIncEigCG will deploy initCG solver.\n");
428  }
429 
430  return;
431  }
432 
434 
435  if(eigcg_alloc) delete Vm;
436 
437  }
438 
440  {
441 
442  if (eigcg_precision != x.Precision()) errorQuda("\nInput/output field precision is incorrect (solver precision: %u spinor precision: %u).\n", eigcg_precision, x.Precision());
443 
444  profile.Start(QUDA_PROFILE_INIT);
445 
446  // Check to see that we're not trying to invert on a zero-field source
447  const double b2 = norm2(b);
448 
449  if(b2 == 0){
450  profile.Stop(QUDA_PROFILE_INIT);
451  printfQuda("Warning: inverting on zero-field source\n");
452  x=b;
453  param.true_res = 0.0;
454  param.true_res_hq = 0.0;
455  return 0;
456  }
457 
459 
461  csParam.create = QUDA_ZERO_FIELD_CREATE;
462  cudaColorSpinorField y(b, csParam);
463 
464  //mat(r, x, y);
465  //double r2 = xmyNormCuda(b, r);//compute residual
466 
467  csParam.setPrecision(eigcg_precision);
468 
469  cudaColorSpinorField Ap(x, csParam);
470 
471  cudaColorSpinorField tmp(x, csParam);
472 
473  cudaColorSpinorField *tmp2_p = &tmp;
474 
475  //matSloppy(r, x, tmp, tmp2);
476  //double r2 = xmyNormCuda(b, r);//compute residual
477 
478  // tmp only needed for multi-gpu Wilson-like kernels
479  if (mat.Type() != typeid(DiracStaggeredPC).name() &&
480  mat.Type() != typeid(DiracStaggered).name()) {
481  tmp2_p = new cudaColorSpinorField(x, csParam);
482  }
483  cudaColorSpinorField &tmp2 = *tmp2_p;
484 
485  matSloppy(r, x, tmp, tmp2);
486  double r2 = xmyNormCuda(b, r);//compute residual
487 
489 
490  zeroCuda(y);
491 
492  const bool use_heavy_quark_res =
493  (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;
494 
495  profile.Stop(QUDA_PROFILE_INIT);
496  profile.Start(QUDA_PROFILE_PREAMBLE);
497 
498  double r2_old;
499  double stop = b2*param.tol*param.tol; // stopping condition of solver
500 
501  double heavy_quark_res = 0.0; // heavy quark residual
502  if(use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
503  int heavy_quark_check = 10; // how often to check the heavy quark residual
504 
505  double alpha=1.0, beta=0.0;
506 
507  double pAp;
508 
509  int eigvRestart = 0;
510 
511  profile.Stop(QUDA_PROFILE_PREAMBLE);
512  profile.Start(QUDA_PROFILE_COMPUTE);
513  blas_flops = 0;
514 
515 //eigCG specific code:
516  if(eigcg_alloc == false){
517 
518  printfQuda("\nAllocating resources for the EigCG solver...\n");
519 
520  //Create an eigenvector set:
521  csParam.create = QUDA_ZERO_FIELD_CREATE;
522  csParam.setPrecision(search_space_prec);//eigCG internal search space precision: must be adjustable.
523  csParam.eigv_dim = param.m;
524 
525  Vm = new cudaColorSpinorField(csParam); //search space for Ritz vectors
526 
527  checkCudaError();
528  printfQuda("\n..done.\n");
529 
530  eigcg_alloc = true;
531  }
532 
533  ColorSpinorParam eigParam(Vm->Eigenvec(0));
534  eigParam.create = QUDA_ZERO_FIELD_CREATE;
535  cudaColorSpinorField *v0 = new cudaColorSpinorField(Vm->Eigenvec(0), eigParam); //temporary field.
536 
537  cudaColorSpinorField Ap0(Ap);
538 
539  //create EigCG objects:
540  EigCGArgs<double, cuDoubleComplex> *eigcg_args = new EigCGArgs<double, cuDoubleComplex>(param.m, param.nev); //must be adjustable..
541 
542  //EigCG additional parameters:
543  double alpha0 = 1.0, beta0 = 0.0;
544 
545 //Begin CG iterations:
546  int k=0, l=0;
547 
548  PrintStats("EigCG", k, r2, b2, heavy_quark_res);
549 
550  double sigma = 0.0;
551 
552  while ( (!convergence(r2, heavy_quark_res, stop, param.tol_hq) && !convergence(r2, heavy_quark_res, eigcg_global_stop, param.tol_hq)) && k < param.maxiter) {
553 
554  if(k > 0)
555  {
556  beta0 = beta;
557 
558  beta = sigma / r2_old;
559  axpyZpbxCuda(alpha, p, x, r, beta);
560 
561  if (use_heavy_quark_res && k%heavy_quark_check==0) {
562  heavy_quark_res = sqrt(xpyHeavyQuarkResidualNormCuda(x, y, r).z);//note:y is a zero array here.
563  }
564  }
565 
566  //save previous mat-vec result
567  if (l == param.m) copyCuda(Ap0, Ap);
568 
569  //mat(Ap, p, tmp, tmp2); // tmp as tmp
570  matSloppy(Ap, p, tmp, tmp2);
571 
572  //construct the Lanczos matrix:
573  if(l > 0){
574  eigcg_args->LoadLanczosDiag(l-1, alpha, alpha0, beta0);
575  }
576 
577  //Begin Rayleigh-Ritz procedure:
578  if (l == param.m){
579 
580  eigvRestart++;
581 
582  //Restart search space :
583  int cldn = Vm->EigvTotalLength() >> 1; //complex leading dimension
584  int clen = Vm->EigvLength() >> 1; //complex vector length
585  //
586  int _2nev = eigcg_args->RestartVm(Vm->V(), cldn, clen, Vm->Precision());
587 
588  if(getVerbosity() >= QUDA_DEBUG_VERBOSE) eigcg_args->CheckEigenvalues(Vm, matDefl, eigvRestart);
589 
590  //Fill-up diagonal elements of the matrix T
591  eigcg_args->FillLanczosDiag(_2nev);
592 
593  //Compute Ap0 = Ap - beta*Ap0:
594  xpayCuda(Ap, -beta, Ap0);//mind precision...
595 
596  copyCuda(*v0, Ap0);//convert arrays here:
597  eigcg_args->FillLanczosOffDiag(_2nev, v0, Vm, 1.0 / sqrt(r2));
598 
599  l = _2nev;
600 
601  } else{ //no-RR branch:
602 
603  if(l > 0){
604  eigcg_args->LoadLanczosOffDiag(l-1, alpha, beta);
605  }
606  }
607 
608  //construct Lanczos basis:
609  copyCuda(Vm->Eigenvec(l), r);//convert arrays
610 
611  //rescale the vector
612  axCuda(1.0 / sqrt(r2), Vm->Eigenvec(l));
613 
614  //update search space index
615  l += 1;
616 
617  //end of RR-procedure
618  alpha0 = alpha;
619 
620 
621  pAp = reDotProductCuda(p, Ap);
622  alpha = r2 / pAp;
623 
624  // here we are deploying the alternative beta computation
625 
626  r2_old = r2;
627  Complex cg_norm = axpyCGNormCuda(-alpha, Ap, r);
628  r2 = real(cg_norm); // (r_new, r_new)
629  sigma = imag(cg_norm) >= 0.0 ? imag(cg_norm) : r2; // use r2 if (r_k+1, r_k+1-r_k) breaks
630 
631  k++;
632 
633  PrintStats("EigCG", k, r2, b2, heavy_quark_res);
634  }
635 
636 //Free eigcg resources:
637  delete eigcg_args;
638 
639  profile.Stop(QUDA_PROFILE_COMPUTE);
640  profile.Start(QUDA_PROFILE_EPILOGUE);
641 
642  param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
643  double gflops = (quda::blas_flops + mat.flops())*1e-9;
644  reduceDouble(gflops);
645  param.gflops = gflops;
646  param.iter += k;
647 
648  if (k==param.maxiter)
649  warningQuda("Exceeded maximum iterations %d", param.maxiter);
650 
651  if (getVerbosity() >= QUDA_VERBOSE){
652  printfQuda("EigCG: Eigenspace restarts = %d\n", eigvRestart);
653  }
654 
655  // compute the true residuals
656  //mat(r, x, y);
657  matSloppy(r, x, tmp, tmp2);
658 
659  param.true_res = sqrt(xmyNormCuda(b, r) / b2);
660 #if (__COMPUTE_CAPABILITY__ >= 200)
662 #else
663  param.true_res_hq = 0.0;
664 #endif
665  PrintSummary("EigCG", k, r2, b2);
666 
667  // reset the flops counters
668  quda::blas_flops = 0;
669  mat.flops();
670 
671  profile.Stop(QUDA_PROFILE_EPILOGUE);
672  profile.Start(QUDA_PROFILE_FREE);
673 
674  if (&tmp2 != &tmp) delete tmp2_p;
675 
676 //Clean EigCG resources:
677  delete v0;
678 
679  profile.Stop(QUDA_PROFILE_FREE);
680 
681  return eigvRestart;
682  }
683 
684 //END of eigcg solver.
685 
686 //Deflation space management:
688  {
689  printfQuda("\nCreate deflation space...\n");
690 
691  if(eigcgSpinor.SiteSubset() != QUDA_PARITY_SITE_SUBSET) errorQuda("\nRitz spinors must be parity spinors\n");//or adjust it
692 
693  ColorSpinorParam cudaEigvParam(eigcgSpinor);
694 
695  dpar = new DeflationParam(cudaEigvParam, param);
696 
697  printfQuda("\n...done.\n");
698 
699  //dpar->PrintInfo();
700 
701  return;
702  }
703 
705  {
706  if(dpar != 0)
707  {
708  delete dpar;
709 
710  dpar = 0;
711  }
712 
713  return;
714  }
715 
717  {
718  if(eigcg_alloc)
719  {
720  delete Vm;
721 
722  Vm = 0;
723 
724  eigcg_alloc = false;
725  }
726 
727  return;
728  }
729 
730 
731  void IncEigCG::ExpandDeflationSpace(DeflationParam *dpar, const int newnevs)
732  {
733  if(!use_eigcg || (newnevs == 0)) return; //nothing to do
734 
735  if(!eigcg_alloc) errorQuda("\nError: cannot expand deflation spase (eigcg ritz vectors were cleaned).\n");
736 
737  printfQuda("\nConstruct projection matrix..\n");
738 
739  int addednev = 0;
740 
741  if((newnevs + dpar->cur_dim) > dpar->ld) errorQuda("\nIncorrect deflation space...\n"); //nothing to do...
742 
743  //GS orthogonalization
744 
745  Complex alpha;
746 
747  for(int i = dpar->cur_dim; i < (dpar->cur_dim + newnevs); i++)
748  {
749  for(int j = 0; j < i; j++)
750  {
751  alpha = cDotProductCuda(dpar->cudaRitzVectors->Eigenvec(j), dpar->cudaRitzVectors->Eigenvec(i));//<j,i>
752  Complex scale = Complex(-alpha.real(), -alpha.imag());
753  caxpyCuda(scale, dpar->cudaRitzVectors->Eigenvec(j), dpar->cudaRitzVectors->Eigenvec(i)); //i-<j,i>j
754  }
755 
756  alpha = norm2(dpar->cudaRitzVectors->Eigenvec(i));
757  if(alpha.real() > 1e-16)
758  {
759  axCuda(1.0 /sqrt(alpha.real()), dpar->cudaRitzVectors->Eigenvec(i));
760  addednev += 1;
761  }
762  else
763  {
764  errorQuda("\nCannot orthogonalize %dth vector\n", i);
765  }
766  }
767 
770 
771  csParam.eigv_dim = 0;
772  csParam.eigv_id = -1;
773 
776 
778 
779  for (int j = dpar->cur_dim; j < (dpar->cur_dim+addednev); j++)//
780  {
781  matDefl(*W, dpar->cudaRitzVectors->Eigenvec(j), tmp);
782 
783  //off-diagonal:
784  for (int i = 0; i < j; i++)//row id
785  {
786  alpha = cDotProductCuda(dpar->cudaRitzVectors->Eigenvec(i), *W);
787  //
788  dpar->proj_matrix[j*dpar->ld+i] = alpha;
789  dpar->proj_matrix[i*dpar->ld+j] = conj(alpha);//conj
790  }
791 
792  //diagonal:
793  alpha = cDotProductCuda(dpar->cudaRitzVectors->Eigenvec(j), *W);
794  //
795  dpar->proj_matrix[j*dpar->ld+j] = alpha;
796  }
797 
798  dpar->ResetDeflationCurrentDim(addednev);
799 
800  printfQuda("\n.. done.\n");
801 
802  delete W;
803  delete W2;
804 
805  return;
806  }
807 
808 //new:
809  void IncEigCG::ReportEigenvalueAccuracy(DeflationParam *dpar, int nevs_to_print)
810  {
811  int curr_evals = dpar->cur_dim;
812 
813  double *evals = (double*)calloc(curr_evals,sizeof(double));
814 
815  Complex *projm = new Complex[dpar->ld*dpar->tot_dim];
816 
817  cudaHostRegister(projm, dpar->ld*dpar->tot_dim*sizeof(Complex), cudaHostRegisterMapped);
818 
819  memcpy(projm, dpar->proj_matrix, dpar->ld*curr_evals*sizeof(Complex));
820 
821  BlasMagmaArgs magma_args(dpar->tot_dim, dpar->ld, sizeof(double));//change precision..
822 
823  magma_args.MagmaHEEVD(projm, evals, curr_evals, true);
824 
826 
828 
829  csParam.eigv_dim = 0;
830  csParam.eigv_id = -1;
831 
834 
836 
837  for(int i = 0; i < nevs_to_print; i++)//newnev
838  {
839  for(int j = 0; j < curr_evals; j++) caxpyCuda(projm[i*dpar->ld+j], dpar->cudaRitzVectors->Eigenvec(j), *W);
840 
841  double norm2W = normCuda(*W);
842 
843  matDefl(*W2, *W, tmp);
844 
845  Complex dotWW2 = cDotProductCuda(*W, *W2);
846 
847  evals[i] = dotWW2.real() / norm2W;
848 
849  axCuda(evals[i], *W);
850 
851  mxpyCuda(*W2, *W);
852 
853  double relerr = sqrt( norm2(*W) / norm2W );
854 
855  zeroCuda(*W);
856 
857  printfQuda("Eigenvalue %d: %1.12e Residual: %1.12e\n", i+1, evals[i], relerr);
858 
859  }
860 
861  delete W;
862 
863  delete W2;
864 
865  free(evals);
866 
867  cudaHostUnregister(projm);
868 
869  delete projm;
870 
871  return;
872  }
873 
874 
875  void IncEigCG::LoadEigenvectors(DeflationParam *dpar, int max_nevs, double tol /*requested tolerance for the eigenvalues*/)
876  {
877  if(dpar->cur_dim < max_nevs)
878  {
879  printf("\nToo big number of eigenvectors was requested, switched to maximum available number %d\n", dpar->cur_dim);
880  max_nevs = dpar->cur_dim;
881  }
882 
883  double *evals = (double*)calloc(dpar->cur_dim, sizeof(double));//WARNING: Ritz values always in double.
884 
885  Complex *projm = new Complex[dpar->ld*dpar->tot_dim];
886 
887  cudaHostRegister(projm, dpar->ld*dpar->tot_dim*sizeof(Complex), cudaHostRegisterMapped);
888 
889  memcpy(projm, dpar->proj_matrix, dpar->ld*dpar->cur_dim*sizeof(Complex));
890 
891  BlasMagmaArgs magma_args(dpar->tot_dim, dpar->ld, sizeof(double));//change precision..
892 
893  magma_args.MagmaHEEVD(projm, evals, dpar->cur_dim, true);
894 
895  //reset projection matrix:
896  for(int i = 0; i < dpar->cur_dim; i++)
897  {
898  if(fabs(evals[i]) > 1e-16)
899  {
900  dpar->ritz_values[i] = 1.0 / evals[i];
901  }
902  else
903  {
904  errorQuda("\nCannot invert Ritz value.\n");
905  }
906  }
907 
910 
911  csParam.eigv_dim = 0;
912  csParam.eigv_id = -1;
913 
916 
918 
919  if(eigcg_alloc == false){//bug!
920 
921  printfQuda("\nAllocating resources for the eigenvectors...\n");
922 
923  //Create an eigenvector set:
925  //csParam.setPrecision(search_space_prec);//eigCG internal search space precision: must be adjustable.
926  csParam.eigv_dim = max_nevs;
927 
928  Vm = new cudaColorSpinorField(csParam); //search space for Ritz vectors
929 
930  checkCudaError();
931  printfQuda("\n..done.\n");
932 
933  eigcg_alloc = true;
934  }
935 
936  int idx = 0;
937 
938  double relerr = 0.0;
939 
940  while ((relerr < tol) && (idx < max_nevs))//newnev
941  {
942  for(int j = 0; j < dpar->cur_dim; j++) caxpyCuda(projm[idx*dpar->ld+j], dpar->cudaRitzVectors->Eigenvec(j), *W);
943  //load aigenvector into temporary buffer:
944  copyCuda(Vm->Eigenvec(idx), *W);
945 
946  if(getVerbosity() >= QUDA_VERBOSE)
947  {
948  double norm2W = normCuda(*W);
949 
950  matDefl(*W2, *W, tmp);
951 
952  Complex dotWW2 = cDotProductCuda(*W, *W2);
953 
954  evals[idx] = dotWW2.real() / norm2W;
955 
956  axCuda(evals[idx], *W);
957 
958  mxpyCuda(*W2, *W);
959 
960  relerr = sqrt( normCuda(*W) / norm2W );
961  //
962  printfQuda("Eigenvalue %d: %1.12e Residual: %1.12e\n", (idx+1), evals[idx], relerr);
963  }
964 
965  zeroCuda(*W);
966 
967  idx += 1;
968  }
969 
970  dpar->ReshapeDeviceRitzVectorsSet(idx);//
971 
972  //copy all the stuff to cudaRitzVectors set:
973  for(int i = 0; i < idx; i++) copyCuda(dpar->cudaRitzVectors->Eigenvec(i), Vm->Eigenvec(i));
974 
975  //reset current dimension:
976  printfQuda("\nUsed eigenvectors: %d\n", idx);
977 
978  dpar->rtz_dim = idx;//idx never exceeds cur_dim.
979 
980  delete W;
981 
982  delete W2;
983 
984  free(evals);
985 
986  cudaHostUnregister(projm);
987 
988  delete projm;
989 
990  return;
991  }
992 
994  {
995  if(set2zero) zeroCuda(x);
996  if(dpar->cur_dim == 0) return;//nothing to do
997 
998  BlasMagmaArgs magma_args(sizeof(double));//change precision..
999 
1000  Complex *vec = new Complex[dpar->ld];
1001 
1002  double check_nrm2 = norm2(b);
1003  printfQuda("\nSource norm (gpu): %1.15e\n", sqrt(check_nrm2));
1004 
1005 
1006  for(int i = 0; i < dpar->cur_dim; i++)
1007  {
1008  vec[i] = cDotProductCuda(dpar->cudaRitzVectors->Eigenvec(i), b);//<i, b>
1009  }
1010 
1011  magma_args.SolveProjMatrix((void*)vec, dpar->ld, dpar->cur_dim, (void*)dpar->proj_matrix, dpar->ld);
1012 
1013  for(int i = 0; i < dpar->cur_dim; i++)
1014  {
1015  caxpyCuda(vec[i], dpar->cudaRitzVectors->Eigenvec(i), x); //a*i+x
1016  }
1017 
1018  check_nrm2 = norm2(x);
1019  printfQuda("\nDeflated guess spinor norm (gpu): %1.15e\n", sqrt(check_nrm2));
1020 
1021  delete [] vec;
1022 
1023  return;
1024  }
1025 
1027  {
1028  if(set2zero) zeroCuda(x);
1029 
1030  if(dpar->rtz_dim == 0) return;//nothing to do
1031 
1032  double check_nrm2 = norm2(b);
1033  printfQuda("\nSource norm (gpu): %1.15e\n", sqrt(check_nrm2));
1034 
1035 
1036  for(int i = 0; i < dpar->rtz_dim; i++)
1037  {
1038  Complex tmp = cDotProductCuda(dpar->cudaRitzVectors->Eigenvec(i), b);//<i, b>
1039 
1040  tmp = tmp * dpar->ritz_values[i];
1041 
1042  caxpyCuda(tmp, dpar->cudaRitzVectors->Eigenvec(i), x); //a*i+x
1043  }
1044 
1045  check_nrm2 = norm2(x);
1046  printfQuda("\nDeflated guess spinor norm (gpu): %1.15e\n", sqrt(check_nrm2));
1047 
1048  return;
1049  }
1050 
1051 //copy EigCG ritz vectors.
1052  void IncEigCG::SaveEigCGRitzVecs(DeflationParam *dpar, bool cleanEigCGResources)
1053  {
1054  const int first_idx = dpar->cur_dim;
1055 
1056  if(dpar->cudaRitzVectors->EigvDim() < (first_idx+param.nev)) errorQuda("\nNot enough space to copy %d vectors..\n", param.nev);
1057 
1058  else if(!eigcg_alloc || !dpar->cuda_ritz_alloc) errorQuda("\nEigCG resources were cleaned.\n");
1059 
1060  for(int i = 0; i < param.nev; i++) copyCuda(dpar->cudaRitzVectors->Eigenvec(first_idx+i), Vm->Eigenvec(i));
1061 
1062  if(cleanEigCGResources)
1063  {
1065  }
1066  else//just call zeroCuda..
1067  {
1068  zeroCuda(*Vm);
1069  }
1070 
1071  return;
1072  }
1073 
1074 //not optimal : temorary hack!
1075  void IncEigCG::StoreRitzVecs(void *hu, double *inv_eigenvals, const int *X, QudaInvertParam *inv_par, const int nev, bool cleanResources)
1076  {
1077  const int spinorSize = 24;
1078  size_t h_size = spinorSize*defl_param->ritz_prec*defl_param->cudaRitzVectors->EigvVolume();//WARNING: might be brocken when padding is set!
1079 
1080  int nev_to_copy = nev > defl_param->cur_dim ? defl_param->cur_dim : nev;
1081  if(nev > defl_param->cur_dim) warningQuda("\nWill copy %d eigenvectors (requested %d)\n", nev_to_copy, nev);
1082 
1083  for(int i = 0; i < nev_to_copy; i++)
1084  {
1085 
1086  size_t offset = i*h_size;
1087  void *hptr = (void*)((char*)hu+offset);
1088 
1089  ColorSpinorParam cpuParam(hptr, *inv_par, X, true);
1090 
1091  cpuColorSpinorField *tmp = new cpuColorSpinorField(cpuParam);
1092 
1093  *tmp = defl_param->cudaRitzVectors->Eigenvec(i);//copy gpu field
1094 
1095  delete tmp;
1096  }
1097 
1098  if(inv_eigenvals) memcpy(inv_eigenvals, defl_param->ritz_values, nev_to_copy*sizeof(double));
1099 
1100  if(cleanResources) CleanResources();
1101 
1102  return;
1103  }
1104 
1106  {
1108 
1109  if(defl_param != 0)
1110  {
1111  DeleteDeflationSpace(defl_param);
1112 
1113  defl_param = 0;
1114  }
1115 
1116  return;
1117  }
1118 
1120  {
1121  const bool use_reduced_vector_set = true;
1122 
1123  const bool use_cg_updates = false;
1124 
1125  const int eigcg_min_restarts = 3;
1126 
1127  int eigcg_restarts = 0;
1128 /*
1129  * Set internal (iterative refinement) tolerance for the cg solver
1130  * In general, this is a tunable parameters, e.g.:
1131  * 24^3x48 : 5e-3
1132  * 48^3x96 : 5e-2, works but not perfect, 1e-1 seems to be better...
1133  */
1134  const double cg_iterref_tol = 5e-2;//this must be external for the end-user tuning!
1135 
1136  if(defl_param == 0)
1137  {
1138  CreateDeflationSpace(*in, defl_param);
1139  }
1140  else if(use_eigcg && !defl_param->in_incremental_stage)
1141  {
1142  use_eigcg = false;
1143 
1144  printfQuda("\nWarning: IncEigCG will deploy initCG solver.\n");
1145 
1147 
1148  //temporary solution!
1149  initCGparam.precision_sloppy = QUDA_HALF_PRECISION; //may not be half, in general?
1150  //
1151  initCGparam.use_sloppy_partial_accumulator=0;
1152 
1153  fillInitCGSolveParam(initCGparam);
1154 
1155  }
1156 
1157  //if this operator applied during the first stage of the incremental eigCG (to construct deflation space):
1158  //then: call eigCG inverter
1159  if(use_eigcg){
1160 
1161  const bool use_mixed_prec = (eigcg_precision != param.precision);
1162 
1163  //deflate initial guess ('out'-field):
1164  DeflateSpinor(*out, *in, defl_param);
1165 
1166  if(!use_mixed_prec) //ok, just run full precision eigcg.
1167  {
1168  printf("\nRunning solo precision EigCG.\n");
1169 
1170  eigcg_restarts = EigCG(*out, *in);
1171 
1172  //store computed Ritz vectors:
1173  SaveEigCGRitzVecs(defl_param);
1174  //Construct(extend) projection matrix:
1175  ExpandDeflationSpace(defl_param, param.nev);
1176  }
1177  else //this is mixed precision eigcg
1178  {
1179 
1180  printf("\nRunning mixed precision EigCG.\n");
1181 
1182  const double ext_tol = param.tol;
1183 
1184  const double stop = norm2(*in)*ext_tol*ext_tol;
1185 
1186  double tot_time = 0.0;
1187  //
1188  ColorSpinorParam cudaParam(*out);
1189  //
1190  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
1191  //
1192  cudaParam.setPrecision(eigcg_precision);
1193 
1194  cudaColorSpinorField *outSloppy = new cudaColorSpinorField(cudaParam);
1195  cudaColorSpinorField *inSloppy = new cudaColorSpinorField(cudaParam);
1196 
1197  copyCuda(*inSloppy, *in);//input is outer residual
1198  copyCuda(*outSloppy, *out);
1199 
1200  if (ext_tol < SINGLE_PRECISION_EPSILON) param.tol = SINGLE_PRECISION_EPSILON;//single precision eigcg tolerance
1201 
1202  //the first eigcg cycle:
1203  eigcg_restarts = EigCG(*outSloppy, *inSloppy);
1204 
1205  tot_time += param.secs;
1206 
1207  //store computed Ritz vectors:
1208  SaveEigCGRitzVecs(defl_param);
1209 
1210  //Construct(extend) projection matrix:
1211  ExpandDeflationSpace(defl_param, param.nev);
1212 
1213  cudaColorSpinorField y(*in);//full precision accumulator
1214  cudaColorSpinorField r(*in);//full precision residual
1215 
1216  //launch again eigcg:
1217  copyCuda(*out, *outSloppy);
1218  //
1219  mat(r, *out, y); //here we can use y as tmp
1220  //
1221  double r2 = xmyNormCuda(*in, r);//new residual (and RHS)
1222  //
1223  double stop_div_r2 = stop / r2;
1224  //
1225  eigcg_global_stop = stop;//for the eigcg stuff only
1226 
1227  Solver *initCG = 0;
1228 
1229  initCGparam.tol = cg_iterref_tol;
1230  initCGparam.precision = eigcg_precision;//the same as eigcg
1231  //
1232  initCGparam.precision_sloppy = QUDA_HALF_PRECISION; //may not be half, in general?
1233  initCGparam.use_sloppy_partial_accumulator=0; //more stable single-half solver
1234 
1235  fillInitCGSolveParam(initCGparam);
1236 
1237  //too messy..
1238  bool cg_updates = (use_cg_updates || (eigcg_restarts < eigcg_min_restarts) || (defl_param->cur_dim == defl_param->tot_dim) || (stop_div_r2 > cg_iterref_tol));
1239 
1240  bool eigcg_updates = !cg_updates;
1241 
1242  if(cg_updates) initCG = new CG(matSloppy, matCGSloppy, initCGparam, profile);
1243 
1244  //start the main loop:
1245  while(r2 > stop)
1246  {
1247  zeroCuda(y);
1248  //deflate initial guess:
1249  DeflateSpinor(y, r, defl_param);
1250  //
1251  copyCuda(*inSloppy, r);
1252  //
1253  copyCuda(*outSloppy, y);
1254  //
1255  if(eigcg_updates) //call low precision eigcg solver
1256  {
1257  eigcg_restarts = EigCG(*outSloppy, *inSloppy);
1258  }
1259  else //if(initCG) call low precision initCG solver
1260  {
1261  (*initCG)(*outSloppy, *inSloppy);
1262  }
1263 
1264  copyCuda(y, *outSloppy);
1265  //
1266  xpyCuda(y, *out);
1267  //
1268  mat(r, *out, y);
1269  //
1270  r2 = xmyNormCuda(*in, r);
1271  //
1272  stop_div_r2 = stop / r2;
1273 
1274  tot_time += (eigcg_updates ? param.secs : initCGparam.secs);
1275 
1276  if(eigcg_updates)
1277  {
1278  if(((eigcg_restarts >= eigcg_min_restarts) || (stop_div_r2 < cg_iterref_tol)) && (defl_param->cur_dim < defl_param->tot_dim))
1279  {
1280  SaveEigCGRitzVecs(defl_param);//accumulate
1281  //
1282  ExpandDeflationSpace(defl_param, param.nev);
1283  //
1284  }
1285  else
1286  {
1287  if(!initCG && (r2 > stop)) initCG = new CG(matSloppy, matCGSloppy, initCGparam, profile);
1288  //param.tol = cg_iterref_tol;
1289  cg_updates = true;
1290 
1291  eigcg_updates = false;
1292  }
1293  }
1294  }//endof while loop
1295 
1296  if(cg_updates && initCG)
1297  {
1298  delete initCG;
1299 
1300  initCG = 0;
1301  }
1302 
1303  delete outSloppy;
1304 
1305  delete inSloppy;
1306 
1307  //copy solver statistics:
1308  param.iter += initCGparam.iter;
1309  //
1310  param.gflops += initCGparam.gflops;
1311  //
1312  param.secs = tot_time;
1313 
1314  }//end of the mixed precision branch
1315 
1316  if(getVerbosity() >= QUDA_VERBOSE)
1317  {
1318  printfQuda("\neigCG stat: %i iter / %g secs = %g Gflops. \n", param.iter, param.secs, param.gflops);
1319 
1320  printfQuda("\ninitCG stat: %i iter / %g secs = %g Gflops. \n", initCGparam.iter, initCGparam.secs, initCGparam.gflops);
1321 
1323 
1324  ReportEigenvalueAccuracy(defl_param, param.nev);
1325  }
1326  }
1327  //else: use deflated CG solver with proper restarting.
1328  else{
1329  double full_tol = initCGparam.tol;
1330 
1331  double restart_tol = initCGparam.tol_restart;
1332 
1333  ColorSpinorParam cudaParam(*out);
1334 
1335  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
1336 
1337  cudaParam.eigv_dim = 0;
1338 
1339  cudaParam.eigv_id = -1;
1340 
1341  cudaColorSpinorField *W = new cudaColorSpinorField(cudaParam);
1342 
1343  cudaColorSpinorField tmp (*W, cudaParam);
1344 
1345  Solver *initCG = 0;
1346 
1347  if(use_reduced_vector_set)
1348  DeflateSpinorReduced(*out, *in, defl_param, true);
1349  else
1350  DeflateSpinor(*out, *in, defl_param, true);
1351 
1352  //initCGparam.precision_sloppy = QUDA_HALF_PRECISION;
1353 
1354  const int max_restart_num = 3;//must be external for end-user tuning
1355 
1356  int restart_idx = 0;
1357 
1358  const double inc_tol = 1e-2;//must be external for the end-user tuning
1359 
1360  //In many cases, there is no need to use full precision accumulator, since low-mode deflation stabilizes
1361  //the mixed precision solver. Moreover, full precision acummulation results in worse performance of the deflated solver, upto 15% in my experiments
1362  //However, this parameter should be exposed to the enduser for the performance tuning, just in some rare cases when low-mode deflation will be insufficient
1363  //for stable double-half mixed precision CG.
1364  initCGparam.use_sloppy_partial_accumulator = 1;
1365 
1366  initCGparam.delta = 1e-2; // might be a bit better than the default value 1e-1 (think about this)
1367 
1368  //launch initCG:
1369  while((restart_tol > full_tol) && (restart_idx < max_restart_num))//currently just one restart, think about better algorithm for the restarts.
1370  {
1371  initCGparam.tol = restart_tol;
1372 
1373  initCG = new CG(mat, matCGSloppy, initCGparam, profile);
1374 
1375  (*initCG)(*out, *in);
1376 
1377  delete initCG;
1378 
1379  matDefl(*W, *out, tmp);
1380 
1381  xpayCuda(*in, -1, *W);
1382 
1383  if(use_reduced_vector_set)
1384 
1385  DeflateSpinorReduced(*out, *W, defl_param, false);
1386  else
1387 
1388  DeflateSpinor(*out, *W, defl_param, false);
1389 
1390  if(getVerbosity() >= QUDA_VERBOSE)
1391  {
1392  printfQuda("\ninitCG stat: %i iter / %g secs = %g Gflops. \n", initCGparam.iter, initCGparam.secs, initCGparam.gflops);
1393  }
1394 
1395  double new_restart_tol = restart_tol*inc_tol;
1396 
1397  restart_tol = (new_restart_tol > full_tol) ? new_restart_tol : full_tol;
1398 
1399  restart_idx += 1;
1400 
1401  param.secs += initCGparam.secs;
1402  }
1403 
1404  initCGparam.tol = full_tol;
1405 
1406  initCG = new CG(mat, matCGSloppy, initCGparam, profile);
1407 
1408  (*initCG)(*out, *in);
1409 
1410  delete initCG;
1411 
1412  if(getVerbosity() >= QUDA_VERBOSE)
1413  {
1414  printfQuda("\ninitCG total stat (%d restarts): %i iter / %g secs = %g Gflops. \n", restart_idx, initCGparam.iter, initCGparam.secs, initCGparam.gflops);
1415  }
1416 
1417  //copy solver statistics:
1418  param.iter += initCGparam.iter;
1419  //
1420  param.secs += initCGparam.secs;
1421  //
1422  param.gflops += initCGparam.gflops;
1423 
1424  delete W;
1425  }
1426 
1427  if( (defl_param->cur_dim == defl_param->tot_dim) && use_eigcg )
1428  {
1429  defl_param->in_incremental_stage = false;//stop the incremental stage now.
1430 
1432 
1433  if(use_reduced_vector_set){
1434 
1435  const int max_nev = defl_param->cur_dim;//param.m;
1436 
1437  double eigenval_tol = 1e-1;
1438 
1439  LoadEigenvectors(defl_param, max_nev, eigenval_tol);
1440 
1441  printfQuda("\n...done. \n");
1442  }
1443  }
1444 
1445  //compute true residual:
1446  ColorSpinorParam cudaParam(*out);
1447  //
1448  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
1449  //
1450  cudaColorSpinorField *final_r = new cudaColorSpinorField(cudaParam);
1452 
1453 
1454  mat(*final_r, *out, *tmp2);
1455 
1456  param.true_res = sqrt(xmyNormCuda(*in, *final_r) / norm2(*in));
1457 
1458  delete final_r;
1459 
1460  delete tmp2;
1461 
1462  param.rhs_idx += 1;
1463 
1464  return;
1465  }
1466 
1467 
1468 } // namespace quda
void DeleteDeflationSpace(DeflationParam *&param)
#define SINGLE_PRECISION_EPSILON
void setPrecision(QudaPrecision precision)
void caxpyCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:207
enum QudaPrecision_s QudaPrecision
void FillLanczosOffDiag(const int _2nev, cudaColorSpinorField *v, cudaColorSpinorField *u, double inv_sqrt_r2)
void LoadLanczosDiag(int idx, double alpha, double alpha0, double beta0)
int y[4]
QudaInverterType inv_type
Definition: invert_quda.h:18
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
void StoreRitzVecs(void *host_buf, double *inv_eigenvals, const int *X, QudaInvertParam *inv_par, const int nev, bool cleanResources=false)
#define errorQuda(...)
Definition: util_quda.h:73
void ReportEigenvalueAccuracy(DeflationParam *param, int nevs_to_print)
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
DeflationParam(ColorSpinorParam &eigv_param, SolverParam &param)
std::complex< double > Complex
Definition: eig_variables.h:13
void axpyZpbxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z, const double &b)
Definition: blas_quda.cu:338
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
QudaPrecision precision_ritz
Definition: invert_quda.h:147
void SolveProjMatrix(void *rhs, const int ldn, const int n, void *H, const int ldH)
Definition: blas_magma.cpp:398
IncEigCG(DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matCGSloppy, DiracMatrix &matDefl, SolverParam &param, TimeProfile &profile)
int EigCG(cudaColorSpinorField &out, cudaColorSpinorField &in)
void DeflateSpinor(cudaColorSpinorField &out, cudaColorSpinorField &in, DeflationParam *param, bool set2zero=true)
QudaPrecision ritz_prec
Complex axpyCGNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: reduce_quda.cu:682
unsigned long long flops() const
Definition: dirac_quda.h:587
cudaColorSpinorField * cudaRitzVectors
QudaGaugeParam param
Definition: pack_test.cpp:17
#define MAX_EIGENVEC_WINDOW
cudaColorSpinorField * tmp2
Definition: dslash_test.cpp:41
cudaColorSpinorField * tmp
void ResetDeflationCurrentDim(const int addedvecs)
QudaResidualType residual_type
Definition: invert_quda.h:35
int RestartVm(void *vm, const int cld, const int clen, const int vprec)
Complex cDotProductCuda(cudaColorSpinorField &, cudaColorSpinorField &)
Definition: reduce_quda.cu:468
void mxpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:154
FloatingPoint< float > Float
Definition: gtest.h:7350
ColorSpinorParam csParam
Definition: pack_test.cpp:24
cpuColorSpinorField * in
void fillInitCGSolveParam(SolverParam &initCGparam)
#define warningQuda(...)
Definition: util_quda.h:84
int EigvDim() const
for eigcg only:
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
Definition: copy_quda.cu:235
bool convergence(const double &r2, const double &hq2, const double &r2_tol, const double &hq_tol)
Definition: solver.cpp:164
double normCuda(const cudaColorSpinorField &b)
Definition: reduce_quda.cu:145
void FillLanczosDiag(const int _2nev)
void operator()(cudaColorSpinorField *out, cudaColorSpinorField *in)
void ExpandDeflationSpace(DeflationParam *param, const int new_nev)
int x[4]
void PrintSummary(const char *name, int k, const double &r2, const double &b2)
Definition: solver.cpp:194
unsigned long long blas_flops
Definition: blas_quda.cu:37
double3 xpyHeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &r)
Definition: reduce_quda.cu:782
QudaPrecision precision
Definition: invert_quda.h:81
void DeflateSpinorReduced(cudaColorSpinorField &out, cudaColorSpinorField &in, DeflationParam *param, bool set2zero=true)
void xpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:98
double reDotProductCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
Definition: reduce_quda.cu:170
cpuColorSpinorField * out
void Stop(QudaProfileType idx)
void DeleteEigCGSearchSpace()
void * memset(void *s, int c, size_t n)
void CreateDeflationSpace(cudaColorSpinorField &eigcgSpinor, DeflationParam *&param)
QudaPrecision Precision() const
double Last(QudaProfileType idx)
void reduceDouble(double &)
SolverParam & param
Definition: invert_quda.h:533
#define printfQuda(...)
Definition: util_quda.h:67
void MagmaHEEVD(void *dTvecm, void *hTvalm, const int problem_size, bool host=false)
Definition: blas_magma.cpp:203
void zeroCuda(cudaColorSpinorField &a)
Definition: blas_quda.cu:40
void LoadLanczosOffDiag(int idx, double alpha0, double beta0)
void Start(QudaProfileType idx)
std::string Type() const
Definition: dirac_quda.h:592
void ReshapeDeviceRitzVectorsSet(const int nev, QudaPrecision new_ritz_prec=QUDA_INVALID_PRECISION)
void PrintStats(const char *, int k, const double &r2, const double &b2, const double &hq2)
Definition: solver.cpp:179
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:38
QudaPrecision precision_sloppy
Definition: invert_quda.h:84
#define checkCudaError()
Definition: util_quda.h:110
bool use_sloppy_partial_accumulator
Definition: invert_quda.h:44
void SaveEigCGRitzVecs(DeflationParam *param, bool cleanResources=false)
void xpayCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y)
Definition: blas_quda.cu:138
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
double3 HeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &r)
Definition: reduce_quda.cu:777
VOLATILE spinorFloat * s
QudaPrecision prec
Definition: test_util.cpp:1551
void CheckEigenvalues(const cudaColorSpinorField *Vm, const DiracMatrix &matDefl, const int restart_num)
void axCuda(const double &a, cudaColorSpinorField &x)
Definition: blas_quda.cu:171
double norm2(const ColorSpinorField &)
void LoadEigenvectors(DeflationParam *param, int max_nevs, double tol=1e-3)
double xmyNormCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
Definition: reduce_quda.cu:343
QudaSiteSubset SiteSubset() const
QudaPrecision eigcg_precision
Definition: invert_quda.h:538
EigCGArgs(int m, int nev)
cudaColorSpinorField & Eigenvec(const int idx) const
for deflated solvers: