QUDA  v1.1.0
A library for QCD on GPUs
inv_mpcg_quda.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 #include <iostream>
5 
6 #include <quda_internal.h>
7 #include <color_spinor_field.h>
8 #include <blas_quda.h>
9 #include <dslash_quda.h>
10 #include <invert_quda.h>
11 #include <util_quda.h>
12 #include <sys/time.h>
13 
14 namespace quda {
15 
16 
17  template<typename T>
18  static void applyT(T d_out[], const T d_in[], const T gamma[], const T rho[], int N)
19  {
20  if(N <= 0) return;
21  for(int i=0; i<N; ++i){
22  d_out[i] = d_in[i]/gamma[i];
23  }
24 
25  for(int i=0; i<N-1; ++i){
26  d_out[i] += (d_in[i+1]*(1-rho[i+1])/(gamma[i+1]*rho[i+1]));
27  }
28 
29  for(int i=1; i<N; ++i){
30  d_out[i] -= d_in[i-1]/(rho[i-1]*gamma[i-1]);
31  }
32 
33  return;
34  }
35 
36  template<typename T>
37  static void applyB(T d_out[], const T d_in[], int N)
38  {
39  d_out[0] = static_cast<T>(0);
40  for(int i=1; i<N; ++i) d_out[i] = d_in[i-1];
41  return;
42  }
43 
44  void print(const double d[], int n){
45  for(int i=0; i<n; ++i){
46  std::cout << d[i] << " ";
47  }
48  std::cout << std::endl;
49  }
50 
51  template<typename T>
52  static void zero(T d[], int N){
53  for(int i=0; i<N; ++i) d[i] = static_cast<T>(0);
54  }
55 
56  template<typename T>
57  static void applyThirdTerm(T d_out[], const T d_in[], int k, int j, int s, const T gamma[], const T rho[], const T gamma_kprev[], const T rho_kprev[])
58  {
59  // s is the number of steps
60  // The input and output vectors are of dimension 2*s + 1
61  const int dim = 2*s + 1;
62 
63  zero(d_out, dim);
64  if(k) applyT(d_out, d_in, gamma_kprev, rho_kprev, s); // compute the upper half of the vector
65 
66 
67  applyB(d_out+s, d_in+s, s+1); // update the lower half
68 
69  // This has to come after applyB
70  if(k) d_out[s] -= d_in[s-1]/(rho_kprev[s-1]*gamma_kprev[s-1]);
71 
72  // Finally scale everything
73  for(int i=0; i<dim; ++i) d_out[i] *= -rho[j]*gamma[j];
74 
75  return;
76  }
77 
78  template<typename T>
79  static void computeCoeffs(T d_out[], const T d_p1[], const T d_p2[], int k, int j, int s, const T gamma[], const T rho[], const T gamma_kprev[], const T rho_kprev[])
80  {
81  applyThirdTerm(d_out, d_p1, k, j-1, s, gamma, rho, gamma_kprev, rho_kprev);
82 
83 
84  for(int i=0; i<(2*s+1); ++i){
85  d_out[i] += rho[j-1]*d_p1[i] + (1 - rho[j-1])*d_p2[i];
86  }
87  return;
88  }
89 
91  Solver(mat, mat, mat, mat, param, profile)
92  {
93  }
94 
96 
97  }
98 
99  void MPCG::computeMatrixPowers(cudaColorSpinorField out[], cudaColorSpinorField &in, int nvec)
100  {
101  cudaColorSpinorField temp(in);
102  out[0] = in;
103  for(int i=1; i<nvec; ++i){
104  mat(out[i], out[i-1], temp);
105  }
106  return;
107  }
108 
109  void MPCG::computeMatrixPowers(std::vector<cudaColorSpinorField>& out, std::vector<cudaColorSpinorField>& in, int nsteps)
110  {
111  cudaColorSpinorField temp(in[0]);
112 
113  for(int i=0; i<=nsteps; ++i) out[i] = in[i];
114 
115  for(int i=(nsteps+1); i<=(2*nsteps); ++i){
116  mat(out[i], out[i-1], temp);
117  }
118  return;
119  }
120 
121 #ifdef SSTEP
122  static void computeGramMatrix(double** G, std::vector<cudaColorSpinorField>& v, double* mu){
123 
124  const int dim = v.size();
125  const int nsteps = (dim-1)/2;
126 
127  {
128  std::vector<cudaColorSpinorField*> vp1; vp1.reserve((nsteps+1)*nsteps);
129  std::vector<cudaColorSpinorField*> vp2; vp2.reserve((nsteps+1)*nsteps);
130  double g[(nsteps+1)*nsteps];
131 
132  for(int i=0; i<nsteps; ++i){
133  for(int j=nsteps; j<dim; j++){
134  vp1.push_back(&v[i]);
135  vp2.push_back(&v[j]);
136  }
137  }
138  blas::reDotProduct(g, vp1, vp2);
139  int k=0;
140  for(int i=0; i<nsteps; ++i){
141  for(int j=nsteps; j<dim; j++){
142  G[i][j] = g[k++];
143  }
144  }
145  }
146 
147 
148  const int num = dim-nsteps;
149  const int offset = nsteps;
150  double d[2*nsteps+1];
151  std::vector<cudaColorSpinorField*> vp1; vp1.reserve(2*nsteps+1);
152  std::vector<cudaColorSpinorField*> vp2; vp2.reserve(2*nsteps+1);
153  for(int i=0; i<=nsteps; ++i){
154  vp1.push_back(&v[0+offset]);
155  vp2.push_back(&v[i+offset]);
156  }
157  for(int i=1; i<=nsteps; ++i){
158  vp1.push_back(&v[i+offset]);
159  vp2.push_back(&v[nsteps+offset]);
160  }
161 
162 
163  blas::reDotProduct(d,vp1,vp2);
164 
165  for(int i=0; i<num; ++i){
166  for(int j=0; j<=i; ++j){
167  G[j+offset][i+offset] = G[i+offset][j+offset] = d[i+j];
168  }
169  }
170  for(int i=0; i<nsteps; ++i){
171  G[i][i] = mu[i];
172  }
173  }
174 
175 
176  static void computeMuNu(double& result, const double* u, double** G, const double* v, int dim){
177 
178  result = 0.0;
179  const int nsteps = (dim-1)/2;
180 
181  for(int i=nsteps; i<dim; ++i){
182  for(int j=nsteps; j<dim; ++j){
183  result += u[i]*v[j]*G[i][j];
184  }
185  }
186 
187  for(int i=0; i<nsteps; ++i){
188  for(int j=nsteps; j<dim; ++j){
189  result += (u[i]*v[j] + u[j]*v[i])*G[i][j];
190  }
191  result += u[i]*v[i]*G[i][i];
192  }
193 
194 
195  return;
196  }
197 #endif // SSTEP
198 
200  {
201 #ifndef SSTEP
202  errorQuda("S-step solvers not built\n");
203 #else
204  // Check to see that we're not trying to invert on a zero-field source
205  const double b2 = blas::norm2(b);
206  if(b2 == 0){
207  profile.TPSTOP(QUDA_PROFILE_INIT);
208  printfQuda("Warning: inverting on zero-field source\n");
209  x=b;
210  param.true_res = 0.0;
211  param.true_res_hq = 0.0;
212  return;
213  }
214 
215 
216  cudaColorSpinorField temp(b); // temporary field
217 
218  // Use ColorSpinorParam to create zerod fields
221  cudaColorSpinorField x_prev(x,csParam);
222  cudaColorSpinorField x_new(x,csParam);
223 
224 
225  const int s = 2;
226 
227  // create the residual array and the matrix powers array
228  std::vector<cudaColorSpinorField> R(s+1,cudaColorSpinorField(b,csParam));
229  std::vector<cudaColorSpinorField> V(2*s+1,cudaColorSpinorField(b,csParam));
230 
231  // Set up the first residual
232  for(int i=0; i<s; ++i) blas::zero(R[i]);
233 
234 
235  mat(R[s], x, temp);
236  double r2 = blas::xmyNorm(b,R[s]);
237 
238  double stop = stopping(param.tol, b2, param.residual_type);
239 
240 
241  int it = 0;
242  double* d = new double[2*s+1];
243  double* d_p1 = new double[2*s+1];
244  double* d_p2 = new double[2*s+1];
245  double* g = new double[2*s+1];
246  double* g_p1 = new double[2*s+1];
247  double* g_p2 = new double[2*s+1];
248  double** G = new double*[2*s+1];
249  for(int i=0; i<(2*s+1); ++i){
250  G[i] = new double[2*s+1];
251  }
252 
253 
254  // Matrix powers kernel
255  // The first s vectors hold the previous s residuals
256  // v[s] holds current residual
257  // v[s+1] holds A r
258  // v[s+2] holds A^(2)r
259  // v[2*s] holds A^(s)r
261 
262  double rAr;
263 
264  // double gamma_prev = 0.0;
265  // double mu_prev = 0.0;
266  // double rho_prev = 0.0;
267  double rho[s];
268  double mu[s];
269  double gamma[s];
270  double rho_kprev[s];
271  double gamma_kprev[s];
272 
273  zero(mu,s);
274 
275  for(int i=0; i<(2*s+1); ++i){
276  zero(G[i], (2*s+1));
277  }
278 
279 
280  int k = 0;
281  while(!convergence(r2,0.0,stop,0.0) && it < param.maxiter){
282  // compute the matrix powers kernel - need to set r[s] above
283  computeMatrixPowers(V, R, s);
284  computeGramMatrix(G,V, mu);
285 
286  R[0] = R[s];
287 
288  int j = 0;
289  while(!convergence(r2,0.0,stop,0.0) && j<s){
290  const int prev_idx = j ? j-1 : s-1;
291  cudaColorSpinorField& R_prev = R[prev_idx];
292  double& mu_prev = mu[prev_idx];
293  double& rho_prev = rho[prev_idx];
294  double& gamma_prev = gamma[prev_idx];
295 
296 
297 
298  if(j == 0){
299  zero(d, 2*s+1); d[s+1] = 1.0;
300  w = V[s+1];
301  zero(g, 2*s+1); g[s] = 1.0;
302  }else{
303  if(j==1){
304  zero(d_p2, 2*s+1);
305  if(k > 0){
306  d_p2[s-2] = (1 - rho_kprev[s-1])/(rho_kprev[s-1]*gamma_kprev[s-1]);
307  d_p2[s-1] = (1./gamma_kprev[s-1]);
308  d_p2[s] = (-1./(gamma_kprev[s-1]*rho_kprev[s-1]));
309  }
310  zero(g_p2, 2*s+1);
311  if(k > 0) g_p2[s-1] = 1.0;
312  }
313  // compute g coeffs
314  for(int i=0; i<(2*s+1); ++i){
315  g[i] = rho[j-1]*g_p1[i] - rho[j-1]*gamma[j-1]*d_p1[i]
316  + (1 - rho[j-1])*g_p2[i];
317  }
318 
319  // compute d coeffs
320  computeCoeffs(d, d_p1, d_p2, k, j, s, gamma, rho, gamma_kprev, rho_kprev);
321  blas::zero(w);
322  for(int i=0; i<(2*s+1); ++i){
323  if(d[i] != 0.) blas::axpy(d[i], V[i], w);
324  }
325  }
326 
327  computeMuNu(r2, g, G, g, 2*s+1);
328  computeMuNu(rAr, g, G, d, 2*s+1);
329 
330  mu[j] = r2;
331  gamma[j] = r2/rAr;
332  rho[j] = (it==0) ? 1.0 : 1.0/(1.0 - (gamma[j]/gamma_prev)*(mu[j]/mu_prev)*(1.0/rho_prev));
333 
334  R[j+1] = R_prev;
335  blas::ax((1.0 - rho[j]), R[j+1]);
336  blas::axpy(rho[j], R[j], R[j+1]);
337  blas::axpy(-rho[j]*gamma[j], w, R[j+1]);
338 
339  x_new = x_prev;
340  blas::ax((1.0 - rho[j]), x_new);
341  blas::axpy(rho[j], x, x_new);
342  blas::axpy(gamma[j]*rho[j], R[j], x_new);
343 
344 
345 
346  // copy d to d_p1
347  if(j>0){
348  for(int i=0; i<(2*s+1); ++i) d_p2[i] = d_p1[i];
349  for(int i=0; i<(2*s+1); ++i) g_p2[i] = g_p1[i];
350  }
351  for(int i=0; i<(2*s+1); ++i) d_p1[i] = d[i];
352  for(int i=0; i<(2*s+1); ++i) g_p1[i] = g[i];
353 
354 
355  PrintStats("MPCG", it, r2, b2, 0.0);
356  it++;
357 
358  x_prev = x;
359  x = x_new;
360  ++j;
361  } // loop over j
362 
363 
364 
365  for(int i=0; i<s; ++i){
366  rho_kprev[i] = rho[i];
367  gamma_kprev[i] = gamma[i];
368  }
369  k++;
370  }
371 
372 
373  mat(R[0], x_prev, temp);
374  param.true_res = sqrt(blas::xmyNorm(b, R[0]) / b2);
375 
376 
377  PrintSummary("MPCG", it, r2, b2, stop, param.tol_hq);
378 
379  delete[] d;
380  delete[] d_p1;
381  delete[] d_p2;
382 
383  delete[] g;
384  delete[] g_p1;
385  delete[] g_p2;
386 
387  for(int i=0; i<(2*s+1); ++i){
388  delete[] G[i];
389  }
390  delete G;
391 #endif // sstep
392  return;
393  }
394 
395 } // namespace quda
void operator()(ColorSpinorField &out, ColorSpinorField &in)
virtual ~MPCG()
MPCG(const DiracMatrix &mat, SolverParam &param, TimeProfile &profile)
TimeProfile & profile
Definition: invert_quda.h:471
const DiracMatrix & mat
Definition: invert_quda.h:465
bool convergence(double r2, double hq2, double r2_tol, double hq_tol)
Definition: solver.cpp:328
void PrintSummary(const char *name, int k, double r2, double b2, double r2_tol, double hq_tol)
Prints out the summary of the solver convergence (requires a verbosity of QUDA_SUMMARIZE)....
Definition: solver.cpp:386
SolverParam & param
Definition: invert_quda.h:470
static double stopping(double tol, double b2, QudaResidualType residual_type)
Set the solver L2 stopping condition.
Definition: solver.cpp:311
void PrintStats(const char *name, int k, double r2, double b2, double hq2)
Prints out the running statistics of the solver (requires a verbosity of QUDA_VERBOSE)
Definition: solver.cpp:373
std::array< int, 4 > dim
quda::mgarray< int > nvec
double mu
int V
Definition: host_utils.cpp:37
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
@ QUDA_ZERO_FIELD_CREATE
Definition: enum_quda.h:361
double xmyNorm(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:79
void ax(double a, ColorSpinorField &x)
void zero(ColorSpinorField &a)
double norm2(const ColorSpinorField &a)
double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
void axpy(double a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:43
void stop()
Stop profiling.
Definition: device.cpp:228
__device__ __host__ void zero(double &a)
Definition: float_vector.h:318
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
@ QUDA_PROFILE_INIT
Definition: timer.h:106
void print(const double d[], int n)
ColorSpinorParam csParam
Definition: pack_test.cpp:25
QudaGaugeParam param
Definition: pack_test.cpp:18
QudaResidualType residual_type
Definition: invert_quda.h:49
#define printfQuda(...)
Definition: util_quda.h:114
#define errorQuda(...)
Definition: util_quda.h:120