QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
inv_mpcg_quda.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 
5 #include <quda_internal.h>
6 #include <color_spinor_field.h>
7 #include <blas_quda.h>
8 #include <dslash_quda.h>
9 #include <invert_quda.h>
10 #include <util_quda.h>
11 #include <sys/time.h>
12 
13 #include <face_quda.h>
14 
15 #include <iostream>
16 
17 namespace quda {
18 
19 
20  template<typename T>
21  static void applyT(T d_out[], const T d_in[], const T gamma[], const T rho[], int N)
22  {
23  if(N <= 0) return;
24  for(int i=0; i<N; ++i){
25  d_out[i] = d_in[i]/gamma[i];
26  }
27 
28  for(int i=0; i<N-1; ++i){
29  d_out[i] += (d_in[i+1]*(1-rho[i+1])/(gamma[i+1]*rho[i+1]));
30  }
31 
32  for(int i=1; i<N; ++i){
33  d_out[i] -= d_in[i-1]/(rho[i-1]*gamma[i-1]);
34  }
35 
36  return;
37  }
38 
39  template<typename T>
40  static void applyB(T d_out[], const T d_in[], int N)
41  {
42  d_out[0] = static_cast<T>(0);
43  for(int i=1; i<N; ++i) d_out[i] = d_in[i-1];
44  return;
45  }
46 
47  void print(const double d[], int n){
48  for(int i=0; i<n; ++i){
49  std::cout << d[i] << " ";
50  }
51  std::cout << std::endl;
52  }
53 
54  template<typename T>
55  static void zero(T d[], int N){
56  for(int i=0; i<N; ++i) d[i] = static_cast<T>(0);
57  }
58 
59  template<typename T>
60  static void applyThirdTerm(T d_out[], const T d_in[], int k, int j, int s, const T gamma[], const T rho[], const T gamma_kprev[], const T rho_kprev[])
61  {
62  // s is the number of steps
63  // The input and output vectors are of dimension 2*s + 1
64  const int dim = 2*s + 1;
65 
66  zero(d_out, dim);
67  if(k) applyT(d_out, d_in, gamma_kprev, rho_kprev, s); // compute the upper half of the vector
68 
69 
70  applyB(d_out+s, d_in+s, s+1); // update the lower half
71 
72  // This has to come after applyB
73  if(k) d_out[s] -= d_in[s-1]/(rho_kprev[s-1]*gamma_kprev[s-1]);
74 
75  // Finally scale everything
76  for(int i=0; i<dim; ++i) d_out[i] *= -rho[j]*gamma[j];
77 
78  return;
79  }
80 
81  template<typename T>
82  static void computeCoeffs(T d_out[], const T d_p1[], const T d_p2[], int k, int j, int s, const T gamma[], const T rho[], const T gamma_kprev[], const T rho_kprev[])
83  {
84  applyThirdTerm(d_out, d_p1, k, j-1, s, gamma, rho, gamma_kprev, rho_kprev);
85 
86 
87  for(int i=0; i<(2*s+1); ++i){
88  d_out[i] += rho[j-1]*d_p1[i] + (1 - rho[j-1])*d_p2[i];
89  }
90  return;
91  }
92 
93 
94 
95 
97  Solver(param, profile), mat(mat)
98  {
99 
100  }
101 
103 
104  }
105 
106  void MPCG::computeMatrixPowers(cudaColorSpinorField out[], cudaColorSpinorField &in, int nvec)
107  {
108  cudaColorSpinorField temp(in);
109  out[0] = in;
110  for(int i=1; i<nvec; ++i){
111  mat(out[i], out[i-1], temp);
112  }
113  return;
114  }
115 
116  void MPCG::computeMatrixPowers(std::vector<cudaColorSpinorField>& out, std::vector<cudaColorSpinorField>& in, int nsteps)
117  {
118  cudaColorSpinorField temp(in[0]);
119 
120  for(int i=0; i<=nsteps; ++i) out[i] = in[i];
121 
122  for(int i=(nsteps+1); i<=(2*nsteps); ++i){
123  mat(out[i], out[i-1], temp);
124  }
125  return;
126  }
127 
128 #ifdef SSTEP
129  static void computeGramMatrix(double** G, std::vector<cudaColorSpinorField>& v, double* mu){
130 
131  const int dim = v.size();
132  const int nsteps = (dim-1)/2;
133 
134  {
135  std::vector<cudaColorSpinorField*> vp1; vp1.reserve((nsteps+1)*nsteps);
136  std::vector<cudaColorSpinorField*> vp2; vp2.reserve((nsteps+1)*nsteps);
137  double g[(nsteps+1)*nsteps];
138 
139  for(int i=0; i<nsteps; ++i){
140  for(int j=nsteps; j<dim; j++){
141  vp1.push_back(&v[i]);
142  vp2.push_back(&v[j]);
143  }
144  }
145  reDotProductCuda(g, vp1, vp2);
146  int k=0;
147  for(int i=0; i<nsteps; ++i){
148  for(int j=nsteps; j<dim; j++){
149  G[i][j] = g[k++];
150  }
151  }
152  }
153 
154 
155  const int num = dim-nsteps;
156  const int offset = nsteps;
157  double d[2*nsteps+1];
158  std::vector<cudaColorSpinorField*> vp1; vp1.reserve(2*nsteps+1);
159  std::vector<cudaColorSpinorField*> vp2; vp2.reserve(2*nsteps+1);
160  for(int i=0; i<=nsteps; ++i){
161  vp1.push_back(&v[0+offset]);
162  vp2.push_back(&v[i+offset]);
163  }
164  for(int i=1; i<=nsteps; ++i){
165  vp1.push_back(&v[i+offset]);
166  vp2.push_back(&v[nsteps+offset]);
167  }
168 
169 
170  reDotProductCuda(d,vp1,vp2);
171 
172  for(int i=0; i<num; ++i){
173  for(int j=0; j<=i; ++j){
174  G[j+offset][i+offset] = G[i+offset][j+offset] = d[i+j];
175  }
176  }
177  for(int i=0; i<nsteps; ++i){
178  G[i][i] = mu[i];
179  }
180  }
181 
182 
183  static void computeMuNu(double& result, const double* u, double** G, const double* v, int dim){
184 
185  result = 0.0;
186  const int nsteps = (dim-1)/2;
187 
188  for(int i=nsteps; i<dim; ++i){
189  for(int j=nsteps; j<dim; ++j){
190  result += u[i]*v[j]*G[i][j];
191  }
192  }
193 
194  for(int i=0; i<nsteps; ++i){
195  for(int j=nsteps; j<dim; ++j){
196  result += (u[i]*v[j] + u[j]*v[i])*G[i][j];
197  }
198  result += u[i]*v[i]*G[i][i];
199  }
200 
201 
202  return;
203  }
204 #endif // SSTEP
205 
207  {
208 #ifndef SSTEP
209  errorQuda("S-step solvers not built\n");
210 #else
211  // Check to see that we're not trying to invert on a zero-field source
212  const double b2 = norm2(b);
213  if(b2 == 0){
215  printfQuda("Warning: inverting on zero-field source\n");
216  x=b;
217  param.true_res = 0.0;
218  param.true_res_hq = 0.0;
219  return;
220  }
221 
222 
223  cudaColorSpinorField temp(b); // temporary field
224 
225  // Use ColorSpinorParam to create zerod fields
227  csParam.create = QUDA_ZERO_FIELD_CREATE;
228  cudaColorSpinorField x_prev(x,csParam);
229  cudaColorSpinorField x_new(x,csParam);
230 
231 
232  const int s = 2;
233 
234  // create the residual array and the matrix powers array
235  std::vector<cudaColorSpinorField> R(s+1,cudaColorSpinorField(b,csParam));
236  std::vector<cudaColorSpinorField> V(2*s+1,cudaColorSpinorField(b,csParam));
237 
238  // Set up the first residual
239  for(int i=0; i<s; ++i) zeroCuda(R[i]);
240 
241 
242  mat(R[s], x, temp);
243  double r2 = xmyNormCuda(b,R[s]);
244 
245  double stop = stopping(param.tol, b2, param.residual_type);
246 
247 
248  int it = 0;
249  double* d = new double[2*s+1];
250  double* d_p1 = new double[2*s+1];
251  double* d_p2 = new double[2*s+1];
252  double* g = new double[2*s+1];
253  double* g_p1 = new double[2*s+1];
254  double* g_p2 = new double[2*s+1];
255  double** G = new double*[2*s+1];
256  for(int i=0; i<(2*s+1); ++i){
257  G[i] = new double[2*s+1];
258  }
259 
260 
261  // Matrix powers kernel
262  // The first s vectors hold the previous s residuals
263  // v[s] holds current residual
264  // v[s+1] holds A r
265  // v[s+2] holds A^(2)r
266  // v[2*s] holds A^(s)r
268 
269  double rAr;
270 
271  // double gamma_prev = 0.0;
272  // double mu_prev = 0.0;
273  // double rho_prev = 0.0;
274  double rho[s];
275  double mu[s];
276  double gamma[s];
277  double rho_kprev[s];
278  double gamma_kprev[s];
279 
280  zero(mu,s);
281 
282  for(int i=0; i<(2*s+1); ++i){
283  zero(G[i], (2*s+1));
284  }
285 
286 
287  int k = 0;
288  while(!convergence(r2,0.0,stop,0.0) && it < param.maxiter){
289  // compute the matrix powers kernel - need to set r[s] above
290  computeMatrixPowers(V, R, s);
291  computeGramMatrix(G,V, mu);
292 
293  R[0] = R[s];
294 
295  int j = 0;
296  while(!convergence(r2,0.0,stop,0.0) && j<s){
297  const int prev_idx = j ? j-1 : s-1;
298  cudaColorSpinorField& R_prev = R[prev_idx];
299  double& mu_prev = mu[prev_idx];
300  double& rho_prev = rho[prev_idx];
301  double& gamma_prev = gamma[prev_idx];
302 
303 
304 
305  if(j == 0){
306  zero(d, 2*s+1); d[s+1] = 1.0;
307  w = V[s+1];
308  zero(g, 2*s+1); g[s] = 1.0;
309  }else{
310  if(j==1){
311  zero(d_p2, 2*s+1);
312  if(k > 0){
313  d_p2[s-2] = (1 - rho_kprev[s-1])/(rho_kprev[s-1]*gamma_kprev[s-1]);
314  d_p2[s-1] = (1./gamma_kprev[s-1]);
315  d_p2[s] = (-1./(gamma_kprev[s-1]*rho_kprev[s-1]));
316  }
317  zero(g_p2, 2*s+1);
318  if(k > 0) g_p2[s-1] = 1.0;
319  }
320  // compute g coeffs
321  for(int i=0; i<(2*s+1); ++i){
322  g[i] = rho[j-1]*g_p1[i] - rho[j-1]*gamma[j-1]*d_p1[i]
323  + (1 - rho[j-1])*g_p2[i];
324  }
325 
326  // compute d coeffs
327  computeCoeffs(d, d_p1, d_p2, k, j, s, gamma, rho, gamma_kprev, rho_kprev);
328  zeroCuda(w);
329  for(int i=0; i<(2*s+1); ++i){
330  if(d[i] != 0.) axpyCuda(d[i], V[i], w);
331  }
332  }
333 
334  computeMuNu(r2, g, G, g, 2*s+1);
335  computeMuNu(rAr, g, G, d, 2*s+1);
336 
337  mu[j] = r2;
338  gamma[j] = r2/rAr;
339  rho[j] = (it==0) ? 1.0 : 1.0/(1.0 - (gamma[j]/gamma_prev)*(mu[j]/mu_prev)*(1.0/rho_prev));
340 
341  R[j+1] = R_prev;
342  axCuda((1.0 - rho[j]), R[j+1]);
343  axpyCuda(rho[j], R[j], R[j+1]);
344  axpyCuda(-rho[j]*gamma[j], w, R[j+1]);
345 
346  x_new = x_prev;
347  axCuda((1.0 - rho[j]), x_new);
348  axpyCuda(rho[j], x, x_new);
349  axpyCuda(gamma[j]*rho[j], R[j], x_new);
350 
351 
352 
353  // copy d to d_p1
354  if(j>0){
355  for(int i=0; i<(2*s+1); ++i) d_p2[i] = d_p1[i];
356  for(int i=0; i<(2*s+1); ++i) g_p2[i] = g_p1[i];
357  }
358  for(int i=0; i<(2*s+1); ++i) d_p1[i] = d[i];
359  for(int i=0; i<(2*s+1); ++i) g_p1[i] = g[i];
360 
361 
362  PrintStats("MPCG", it, r2, b2, 0.0);
363  it++;
364 
365  x_prev = x;
366  x = x_new;
367  ++j;
368  } // loop over j
369 
370 
371 
372  for(int i=0; i<s; ++i){
373  rho_kprev[i] = rho[i];
374  gamma_kprev[i] = gamma[i];
375  }
376  k++;
377  }
378 
379 
380  mat(R[0], x_prev, temp);
381  param.true_res = sqrt(xmyNormCuda(b, R[0]) / b2);
382 
383 
384  PrintSummary("MPCG", it, r2, b2);
385 
386  delete[] d;
387  delete[] d_p1;
388  delete[] d_p2;
389 
390  delete[] g;
391  delete[] g_p1;
392  delete[] g_p2;
393 
394  for(int i=0; i<(2*s+1); ++i){
395  delete[] G[i];
396  }
397  delete G;
398 #endif // sstep
399  return;
400  }
401 
402 } // namespace quda
bool convergence(const double &r2, const double &hq2, const double &r2_tol, const double &hq_tol)
Definition: solver.cpp:82
virtual ~MPCG()
int V
Definition: test_util.cpp:29
static double stopping(const double &tol, const double &b2, QudaResidualType residual_type)
Definition: solver.cpp:65
#define errorQuda(...)
Definition: util_quda.h:73
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
TimeProfile & profile
Definition: invert_quda.h:224
QudaGaugeParam param
Definition: pack_test.cpp:17
MPCG(DiracMatrix &mat, SolverParam &param, TimeProfile &profile)
void PrintSummary(const char *name, int k, const double &r2, const double &b2)
Definition: solver.cpp:137
QudaResidualType residual_type
Definition: invert_quda.h:35
ColorSpinorParam csParam
Definition: pack_test.cpp:24
cpuColorSpinorField * in
void axpyCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:115
int x[4]
SolverParam & param
Definition: invert_quda.h:223
double reDotProductCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
Definition: reduce_quda.cu:170
cpuColorSpinorField * out
void Stop(QudaProfileType idx)
void PrintStats(const char *, int k, const double &r2, const double &b2, const double &hq2)
Definition: solver.cpp:122
#define printfQuda(...)
Definition: util_quda.h:67
void zeroCuda(cudaColorSpinorField &a)
Definition: blas_quda.cu:40
void operator()(cudaColorSpinorField &out, cudaColorSpinorField &in)
void print(const double d[], int n)
VOLATILE spinorFloat * s
void axCuda(const double &a, cudaColorSpinorField &x)
Definition: blas_quda.cu:171
double norm2(const ColorSpinorField &)
double xmyNormCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
Definition: reduce_quda.cu:343