QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
inv_ca_gcr.cpp
Go to the documentation of this file.
1 #include <invert_quda.h>
2 #include <blas_quda.h>
3 #include <Eigen/Dense>
4 
5 namespace quda {
6 
8  : Solver(param, profile), mat(mat), matSloppy(matSloppy), init(false),
9  basis(param.ca_basis), alpha(nullptr), rp(nullptr), tmpp(nullptr), tmp_sloppy(nullptr) { }
10 
13 
14  if (init) {
15  if (alpha) delete []alpha;
16  bool use_source = (param.preserve_source == QUDA_PRESERVE_SOURCE_NO &&
19  if (basis == QUDA_POWER_BASIS) {
20  for (int i=0; i<param.Nkrylov+1; i++) if (i>0 || !use_source) delete p[i];
21  } else {
22  for (int i=0; i<param.Nkrylov; i++) if (i>0 || !use_source) delete p[i];
23  for (int i=0; i<param.Nkrylov; i++) delete q[i];
24  }
25  if (tmp_sloppy) delete tmp_sloppy;
26  if (tmpp) delete tmpp;
27  if (rp) delete rp;
28  }
29 
30  if (deflate_init) {
31  for (auto veci : param.evecs)
32  if (veci) delete veci;
33  delete defl_tmp1[0];
34  delete defl_tmp2[0];
35  }
36 
38  }
39 
41  {
42  if (!init) {
43  if (!param.is_preconditioner) {
44  blas::flops = 0;
45  profile.TPSTART(QUDA_PROFILE_INIT);
46  }
47 
48  alpha = new Complex[param.Nkrylov];
49 
50  bool mixed = param.precision != param.precision_sloppy;
51  bool use_source = (param.preserve_source == QUDA_PRESERVE_SOURCE_NO && !mixed &&
53 
56 
57  // Source needs to be preserved if we're computing the true residual
58  rp = (mixed && !use_source) ? ColorSpinorField::Create(csParam) : nullptr;
59  tmpp = ColorSpinorField::Create(csParam);
60 
61  // now allocate sloppy fields
63 
64  if (basis != QUDA_POWER_BASIS) {
65  warningQuda("CA-GCR does not support any basis besides QUDA_POWER_BASIS. Switching to QUDA_POWER_BASIS...\n");
67  }
68 
69  if (basis == QUDA_POWER_BASIS) {
70  // in power basis q[k] = p[k+1], so we don't need a separate q array
71  p.resize(param.Nkrylov+1);
72  q.resize(param.Nkrylov);
73  for (int i=0; i<param.Nkrylov+1; i++) {
74  p[i] = (i==0 && use_source) ? &b : ColorSpinorField::Create(csParam);
75  if (i>0) q[i-1] = p[i];
76  }
77  } else {
78  p.resize(param.Nkrylov);
79  q.resize(param.Nkrylov);
80  for (int i=0; i<param.Nkrylov; i++) {
81  p[i] = (i==0 && use_source) ? &b : ColorSpinorField::Create(csParam);
82  q[i] = ColorSpinorField::Create(csParam);
83  }
84  }
85 
86  // Once the GCR operator is called, we are able to construct an appropriate
87  // Krylov space for deflation
89 
90  //sloppy temporary for mat-vec
91  tmp_sloppy = mixed ? ColorSpinorField::Create(csParam) : nullptr;
92 
94 
95  init = true;
96  } // init
97  }
98 
99  void CAGCR::solve(Complex *psi_, std::vector<ColorSpinorField*> &q, ColorSpinorField &b)
100  {
101  using namespace Eigen;
102  typedef Matrix<Complex, Dynamic, Dynamic> matrix;
103  typedef Matrix<Complex, Dynamic, 1> vector;
104 
105  const int N = q.size();
106  vector phi(N), psi(N);
107  matrix A(N,N);
108 
109 #if 1
110  // only a single reduction but requires using the full dot product
111  // compute rhs vector phi = Q* b = (q_i, b)
112  std::vector<ColorSpinorField*> Q;
113  for (int i=0; i<N; i++) Q.push_back(q[i]);
114  Q.push_back(&b);
115 
116  // Construct the matrix Q* Q = (A P)* (A P) = (q_i, q_j) = (A p_i, A p_j)
117  Complex *A_ = new Complex[N*(N+1)];
118  blas::cDotProduct(A_, q, Q);
119  for (int i=0; i<N; i++) {
120  phi(i) = A_[i*(N+1)+N];
121  for (int j=0; j<N; j++) {
122  A(i,j) = A_[i*(N+1)+j];
123  }
124  }
125  delete[] A_;
126 #else
127  // two reductions but uses the Hermitian block dot product
128  // compute rhs vector phi = Q* b = (q_i, b)
129  std::vector<ColorSpinorField*> B;
130  B.push_back(&b);
131  Complex *phi_ = new Complex[N];
132  blas::cDotProduct(phi_,q, B);
133  for (int i=0; i<N; i++) phi(i) = phi_[i];
134  delete phi_;
135 
136  // Construct the matrix Q* Q = (A P)* (A P) = (q_i, q_j) = (A p_i, A p_j)
137  Complex *A_ = new Complex[N*N];
138  blas::hDotProduct(A_, q, q);
139  for (int i=0; i<N; i++)
140  for (int j=0; j<N; j++)
141  A(i,j) = A_[i*N+j];
142  delete[] A_;
143 #endif
144 
145  if (!param.is_preconditioner) {
148  profile.TPSTART(QUDA_PROFILE_EIGEN);
149  }
150 
151  // use Cholesky LDL since this seems plenty stable
152  LDLT<matrix> cholesky(A);
153  psi = cholesky.solve(phi);
154 
155  for (int i=0; i<N; i++) psi_[i] = psi(i);
156 
157  if (!param.is_preconditioner) {
158  profile.TPSTOP(QUDA_PROFILE_EIGEN);
160  profile.TPSTART(QUDA_PROFILE_COMPUTE);
161  }
162 
163  }
164 
165  /*
166  The main CA-GCR algorithm, which consists of three main steps:
167  1. Build basis vectors q_k = A p_k for k = 1..Nkrlylov
168  2. Minimize the residual in this basis
169  3. Update solution and residual vectors
170  4. (Optional) restart if convergence or maxiter not reached
171  */
173  {
174  const int nKrylov = param.Nkrylov;
175 
176  if (checkPrecision(x,b) != param.precision) errorQuda("Precision mismatch %d %d", checkPrecision(x,b), param.precision);
177  if (param.return_residual && param.preserve_source == QUDA_PRESERVE_SOURCE_YES) errorQuda("Cannot preserve source and return the residual");
178 
179  if (param.maxiter == 0 || nKrylov == 0) {
181  return;
182  }
183 
184  create(b);
185 
186  ColorSpinorField &r = rp ? *rp : *p[0];
188  ColorSpinorField &tmpSloppy = tmp_sloppy ? *tmp_sloppy : tmp;
189 
191 
192  // compute b2, but only if we need to
193  bool fixed_iteration = param.sloppy_converge && nKrylov==param.maxiter && !param.compute_true_res;
194  double b2 = !fixed_iteration ? blas::norm2(b) : 1.0;
195  double r2 = 0.0; // if zero source then we will exit immediately doing no work
196 
197  // compute intitial residual depending on whether we have an initial guess or not
199  mat(r, x, tmp);
200  //r = b - Ax0
201  if (!fixed_iteration) {
202  r2 = blas::xmyNorm(b, r);
203  } else {
204  blas::xpay(b, -1.0, r);
205  r2 = b2; // dummy setting
206  }
207  } else {
208  r2 = b2;
209  blas::copy(r, b);
210  blas::zero(x);
211  }
212 
213  if (param.deflate == true) {
214  std::vector<ColorSpinorField *> rhs;
215  // Use residual from supplied guess r, or original
216  // rhs b. use `defl_tmp2` as a temp.
217  blas::copy(*defl_tmp2[0], r);
218  rhs.push_back(defl_tmp2[0]);
219 
220  // Deflate: Hardcoded to SVD
222 
223  // Compute r_defl = RHS - A * LHS
224  mat(r, *defl_tmp1[0]);
225  r2 = blas::xmyNorm(*rhs[0], r);
226 
227  // defl_tmp must be added to the solution at the end
228  blas::axpy(1.0, *defl_tmp1[0], x);
229  }
230 
231  // Check to see that we're not trying to invert on a zero-field source
232  if (b2 == 0) {
234  warningQuda("inverting on zero-field source\n");
235  x = b;
236  param.true_res = 0.0;
237  param.true_res_hq = 0.0;
238  return;
239  } else {
240  b2 = r2;
241  }
242  }
243 
244  double stop = !fixed_iteration ? stopping(param.tol, b2, param.residual_type) : 0.0; // stopping condition of solver
245 
246  const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;
247 
248  // this parameter determines how many consective reliable update
249  // reisudal increases we tolerate before terminating the solver,
250  // i.e., how long do we want to keep trying to converge
251  const int maxResIncrease = param.max_res_increase; // check if we reached the limit of our tolerance
252  const int maxResIncreaseTotal = param.max_res_increase_total;
253 
254  double heavy_quark_res = 0.0; // heavy quark residual
255  if(use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x,r).z);
256 
257  int resIncrease = 0;
258  int resIncreaseTotal = 0;
259 
260  if (!param.is_preconditioner) {
261  blas::flops = 0;
263  profile.TPSTART(QUDA_PROFILE_COMPUTE);
264  }
265  int total_iter = 0;
266  int restart = 0;
267  double r2_old = r2;
268  bool l2_converge = false;
269 
270  blas::copy(*p[0], r); // no op if uni-precision
271 
272  PrintStats("CA-GCR", total_iter, r2, b2, heavy_quark_res);
273  while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) && total_iter < param.maxiter) {
274 
275  // build up a space of size nKrylov
276  for (int k=0; k<nKrylov; k++) {
277  matSloppy(*q[k], *p[k], tmpSloppy);
278  if (k<nKrylov-1 && basis != QUDA_POWER_BASIS) blas::copy(*p[k+1], *q[k]);
279  }
280 
281  solve(alpha, q, *p[0]);
282 
283  // update the solution vector
284  std::vector<ColorSpinorField*> X;
285  X.push_back(&x);
286  // need to make sure P is only length nKrylov
287  std::vector<ColorSpinorField*> P;
288  for (int i=0; i<nKrylov; i++) P.push_back(p[i]);
289  blas::caxpy(alpha, P, X);
290 
291  // no need to compute residual vector if not returning
292  // residual vector and only doing a single fixed iteration
293  if (!fixed_iteration || param.return_residual) {
294  // update the residual vector
295  std::vector<ColorSpinorField*> R;
296  R.push_back(&r);
297  for (int i=0; i<nKrylov; i++) alpha[i] = -alpha[i];
298  blas::caxpy(alpha, q, R);
299  }
300 
301  total_iter+=nKrylov;
302  if ( !fixed_iteration || getVerbosity() >= QUDA_DEBUG_VERBOSE) {
303  // only compute the residual norm if we need to
304  r2 = blas::norm2(r);
305  }
306 
307  PrintStats("CA-GCR", total_iter, r2, b2, heavy_quark_res);
308 
309  // update since nKrylov or maxiter reached, converged or reliable update required
310  // note that the heavy quark residual will by definition only be checked every nKrylov steps
311  if (total_iter>=param.maxiter || (r2 < stop && !l2_converge) || sqrt(r2/r2_old) < param.delta) {
312 
313  if ( (r2 < stop || total_iter>=param.maxiter) && param.sloppy_converge) break;
314  mat(r, x, tmp);
315  r2 = blas::xmyNorm(b, r);
316  if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r).z);
317 
318  // break-out check if we have reached the limit of the precision
319  if (r2 > r2_old) {
320  resIncrease++;
321  resIncreaseTotal++;
322  warningQuda("CA-GCR: new reliable residual norm %e is greater than previous reliable residual norm %e (total #inc %i)",
323  sqrt(r2), sqrt(r2_old), resIncreaseTotal);
324  if (resIncrease > maxResIncrease or resIncreaseTotal > maxResIncreaseTotal) {
325  warningQuda("CA-GCR: solver exiting due to too many true residual norm increases");
326  break;
327  }
328  } else {
329  resIncrease = 0;
330  }
331 
332  r2_old = r2;
333  }
334 
335  // No matter what, if we haven't converged, we do a restart.
336  if ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) ) {
337  restart++; // restarting if residual is still too great
338 
339  PrintStats("CA-GCR (restart)", restart, r2, b2, heavy_quark_res);
340  blas::copy(*p[0],r); // no-op if uni-precision
341 
342  r2_old = r2;
343 
344  // prevent ending the Krylov space prematurely if other convergence criteria not met
345  if (r2 < stop) l2_converge = true;
346  }
347 
348  }
349 
350  if (total_iter>param.maxiter && getVerbosity() >= QUDA_SUMMARIZE)
351  warningQuda("Exceeded maximum iterations %d", param.maxiter);
352 
353  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("CA-GCR: number of restarts = %d\n", restart);
354 
355  if (param.compute_true_res) {
356  // Calculate the true residual
357  mat(r, x, tmp);
358  double true_res = blas::xmyNorm(b, r);
359  param.true_res = sqrt(true_res / b2);
361  if (param.return_residual) blas::copy(b, r);
362  } else {
363  if (param.return_residual) blas::copy(b, r);
364  }
365 
366  if (!param.is_preconditioner) {
367  qudaDeviceSynchronize(); // ensure solver is complete before ending timing
371 
372  // store flops and reset counters
373  double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9;
374 
375  param.gflops += gflops;
376  param.iter += total_iter;
377 
378  // reset the flops counters
379  blas::flops = 0;
380  mat.flops();
381  matSloppy.flops();
382 
384  }
385 
386  PrintSummary("CA-GCR", total_iter, r2, b2, stop, param.tol_hq);
387  }
388 
389 } // namespace quda
Complex * alpha
Definition: invert_quda.h:1001
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
void create(ColorSpinorField &b)
Initiate the fields needed by the solver.
Definition: inv_ca_gcr.cpp:40
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define checkPrecision(...)
#define errorQuda(...)
Definition: util_quda.h:121
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:721
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
Definition: reduce_quda.cu:764
void PrintStats(const char *name, int k, double r2, double b2, double hq2)
Prints out the running statistics of the solver (requires a verbosity of QUDA_VERBOSE) ...
Definition: solver.cpp:256
CAGCR(DiracMatrix &mat, DiracMatrix &matSloppy, SolverParam &param, TimeProfile &profile)
Definition: inv_ca_gcr.cpp:7
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
static ColorSpinorField * Create(const ColorSpinorParam &param)
bool convergence(double r2, double hq2, double r2_tol, double hq_tol)
Definition: solver.cpp:223
const DiracMatrix & mat
Definition: invert_quda.h:993
TimeProfile & profile
Definition: invert_quda.h:464
static int R[4]
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: copy_quda.cu:355
double xmyNorm(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:75
QudaPreserveSource preserve_source
Definition: invert_quda.h:154
int max_res_increase_total
Definition: invert_quda.h:96
virtual ~CAGCR()
Definition: inv_ca_gcr.cpp:11
std::vector< ColorSpinorField * > defl_tmp1
Definition: invert_quda.h:547
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
Definition: blas_quda.h:37
void solve(Complex *psi_, std::vector< ColorSpinorField *> &q, ColorSpinorField &b)
Solve the equation A p_k psi_k = q_k psi_k = b by minimizing the least square residual using Eigen&#39;s ...
Definition: inv_ca_gcr.cpp:99
QudaGaugeParam param
Definition: pack_test.cpp:17
ColorSpinorField * rp
Definition: invert_quda.h:1003
std::vector< ColorSpinorField * > defl_tmp2
Definition: invert_quda.h:548
QudaComputeNullVector compute_null_vector
Definition: invert_quda.h:67
void deflateSVD(std::vector< ColorSpinorField *> vec_defl, std::vector< ColorSpinorField *> vec, std::vector< ColorSpinorField *> evecs, std::vector< Complex > evals)
Deflate vector with both left and Right singular vectors.
double Last(QudaProfileType idx)
Definition: timer.h:251
#define qudaDeviceSynchronize()
QudaResidualType residual_type
Definition: invert_quda.h:49
static double stopping(double tol, double b2, QudaResidualType residual_type)
Set the solver L2 stopping condition.
Definition: solver.cpp:206
ColorSpinorParam csParam
Definition: pack_test.cpp:24
void axpy(double a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:35
#define warningQuda(...)
Definition: util_quda.h:133
bool is_preconditioner
verbosity to use for preconditioner
Definition: invert_quda.h:241
std::vector< ColorSpinorField * > q
Definition: invert_quda.h:1008
void constructDeflationSpace(const ColorSpinorField &meta, const DiracMatrix &mat, bool svd)
Constructs the deflation space.
Definition: solver.cpp:159
double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r)
Definition: reduce_quda.cu:809
int X[4]
Definition: covdev_test.cpp:70
std::complex< double > Complex
Definition: quda_internal.h:46
EigenSolver * eig_solve
Definition: invert_quda.h:545
std::vector< Complex > evals
Definition: invert_quda.h:61
void init()
Create the CUBLAS context.
Definition: blas_cublas.cu:31
void caxpy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.cu:512
QudaCABasis basis
Definition: invert_quda.h:999
ColorSpinorField * tmp_sloppy
Definition: invert_quda.h:1005
void zero(ColorSpinorField &a)
Definition: blas_quda.cu:472
std::vector< ColorSpinorField * > evecs
Definition: invert_quda.h:58
QudaPrecision precision
Definition: invert_quda.h:142
std::vector< ColorSpinorField * > p
Definition: invert_quda.h:1007
SolverParam & param
Definition: invert_quda.h:463
void hDotProduct(Complex *result, std::vector< ColorSpinorField *> &a, std::vector< ColorSpinorField *> &b)
Computes the matrix of inner products between the vector set a and the vector set b...
const DiracMatrix & matSloppy
Definition: invert_quda.h:994
unsigned long long flops() const
Definition: dirac_quda.h:1119
QudaCABasis ca_basis
Definition: test_util.cpp:1631
#define printfQuda(...)
Definition: util_quda.h:115
unsigned long long flops
Definition: blas_quda.cu:22
ColorSpinorField * tmpp
Definition: invert_quda.h:1004
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:64
void operator()(ColorSpinorField &out, ColorSpinorField &in)
Definition: inv_ca_gcr.cpp:172
QudaPrecision precision_sloppy
Definition: invert_quda.h:145
void PrintSummary(const char *name, int k, double r2, double b2, double r2_tol, double hq_tol)
Prints out the summary of the solver convergence (requires a verbosity of QUDA_SUMMARIZE). Assumes SolverParam.true_res and SolverParam.true_res_hq has been set.
Definition: solver.cpp:270
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
bool deflate_init
Definition: invert_quda.h:546
const Dirac * Expose() const
Definition: dirac_quda.h:1135