QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
inv_mpcg_quda.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 #include <iostream>
5 
6 #include <quda_internal.h>
7 #include <color_spinor_field.h>
8 #include <blas_quda.h>
9 #include <dslash_quda.h>
10 #include <invert_quda.h>
11 #include <util_quda.h>
12 #include <sys/time.h>
13 
14 namespace quda {
15 
16 
17  template<typename T>
18  static void applyT(T d_out[], const T d_in[], const T gamma[], const T rho[], int N)
19  {
20  if(N <= 0) return;
21  for(int i=0; i<N; ++i){
22  d_out[i] = d_in[i]/gamma[i];
23  }
24 
25  for(int i=0; i<N-1; ++i){
26  d_out[i] += (d_in[i+1]*(1-rho[i+1])/(gamma[i+1]*rho[i+1]));
27  }
28 
29  for(int i=1; i<N; ++i){
30  d_out[i] -= d_in[i-1]/(rho[i-1]*gamma[i-1]);
31  }
32 
33  return;
34  }
35 
36  template<typename T>
37  static void applyB(T d_out[], const T d_in[], int N)
38  {
39  d_out[0] = static_cast<T>(0);
40  for(int i=1; i<N; ++i) d_out[i] = d_in[i-1];
41  return;
42  }
43 
44  void print(const double d[], int n){
45  for(int i=0; i<n; ++i){
46  std::cout << d[i] << " ";
47  }
48  std::cout << std::endl;
49  }
50 
51  template<typename T>
52  static void zero(T d[], int N){
53  for(int i=0; i<N; ++i) d[i] = static_cast<T>(0);
54  }
55 
56  template<typename T>
57  static void applyThirdTerm(T d_out[], const T d_in[], int k, int j, int s, const T gamma[], const T rho[], const T gamma_kprev[], const T rho_kprev[])
58  {
59  // s is the number of steps
60  // The input and output vectors are of dimension 2*s + 1
61  const int dim = 2*s + 1;
62 
63  zero(d_out, dim);
64  if(k) applyT(d_out, d_in, gamma_kprev, rho_kprev, s); // compute the upper half of the vector
65 
66 
67  applyB(d_out+s, d_in+s, s+1); // update the lower half
68 
69  // This has to come after applyB
70  if(k) d_out[s] -= d_in[s-1]/(rho_kprev[s-1]*gamma_kprev[s-1]);
71 
72  // Finally scale everything
73  for(int i=0; i<dim; ++i) d_out[i] *= -rho[j]*gamma[j];
74 
75  return;
76  }
77 
78  template<typename T>
79  static void computeCoeffs(T d_out[], const T d_p1[], const T d_p2[], int k, int j, int s, const T gamma[], const T rho[], const T gamma_kprev[], const T rho_kprev[])
80  {
81  applyThirdTerm(d_out, d_p1, k, j-1, s, gamma, rho, gamma_kprev, rho_kprev);
82 
83 
84  for(int i=0; i<(2*s+1); ++i){
85  d_out[i] += rho[j-1]*d_p1[i] + (1 - rho[j-1])*d_p2[i];
86  }
87  return;
88  }
89 
90 
91 
92 
94  Solver(param, profile), mat(mat)
95  {
96 
97  }
98 
100 
101  }
102 
104  {
105  cudaColorSpinorField temp(in);
106  out[0] = in;
107  for(int i=1; i<nvec; ++i){
108  mat(out[i], out[i-1], temp);
109  }
110  return;
111  }
112 
113  void MPCG::computeMatrixPowers(std::vector<cudaColorSpinorField>& out, std::vector<cudaColorSpinorField>& in, int nsteps)
114  {
115  cudaColorSpinorField temp(in[0]);
116 
117  for(int i=0; i<=nsteps; ++i) out[i] = in[i];
118 
119  for(int i=(nsteps+1); i<=(2*nsteps); ++i){
120  mat(out[i], out[i-1], temp);
121  }
122  return;
123  }
124 
125 #ifdef SSTEP
126  static void computeGramMatrix(double** G, std::vector<cudaColorSpinorField>& v, double* mu){
127 
128  const int dim = v.size();
129  const int nsteps = (dim-1)/2;
130 
131  {
132  std::vector<cudaColorSpinorField*> vp1; vp1.reserve((nsteps+1)*nsteps);
133  std::vector<cudaColorSpinorField*> vp2; vp2.reserve((nsteps+1)*nsteps);
134  double g[(nsteps+1)*nsteps];
135 
136  for(int i=0; i<nsteps; ++i){
137  for(int j=nsteps; j<dim; j++){
138  vp1.push_back(&v[i]);
139  vp2.push_back(&v[j]);
140  }
141  }
142  blas::reDotProduct(g, vp1, vp2);
143  int k=0;
144  for(int i=0; i<nsteps; ++i){
145  for(int j=nsteps; j<dim; j++){
146  G[i][j] = g[k++];
147  }
148  }
149  }
150 
151 
152  const int num = dim-nsteps;
153  const int offset = nsteps;
154  double d[2*nsteps+1];
155  std::vector<cudaColorSpinorField*> vp1; vp1.reserve(2*nsteps+1);
156  std::vector<cudaColorSpinorField*> vp2; vp2.reserve(2*nsteps+1);
157  for(int i=0; i<=nsteps; ++i){
158  vp1.push_back(&v[0+offset]);
159  vp2.push_back(&v[i+offset]);
160  }
161  for(int i=1; i<=nsteps; ++i){
162  vp1.push_back(&v[i+offset]);
163  vp2.push_back(&v[nsteps+offset]);
164  }
165 
166 
167  blas::reDotProduct(d,vp1,vp2);
168 
169  for(int i=0; i<num; ++i){
170  for(int j=0; j<=i; ++j){
171  G[j+offset][i+offset] = G[i+offset][j+offset] = d[i+j];
172  }
173  }
174  for(int i=0; i<nsteps; ++i){
175  G[i][i] = mu[i];
176  }
177  }
178 
179 
180  static void computeMuNu(double& result, const double* u, double** G, const double* v, int dim){
181 
182  result = 0.0;
183  const int nsteps = (dim-1)/2;
184 
185  for(int i=nsteps; i<dim; ++i){
186  for(int j=nsteps; j<dim; ++j){
187  result += u[i]*v[j]*G[i][j];
188  }
189  }
190 
191  for(int i=0; i<nsteps; ++i){
192  for(int j=nsteps; j<dim; ++j){
193  result += (u[i]*v[j] + u[j]*v[i])*G[i][j];
194  }
195  result += u[i]*v[i]*G[i][i];
196  }
197 
198 
199  return;
200  }
201 #endif // SSTEP
202 
204  {
205 #ifndef SSTEP
206  errorQuda("S-step solvers not built\n");
207 #else
208  // Check to see that we're not trying to invert on a zero-field source
209  const double b2 = blas::norm2(b);
210  if(b2 == 0){
211  profile.TPSTOP(QUDA_PROFILE_INIT);
212  printfQuda("Warning: inverting on zero-field source\n");
213  x=b;
214  param.true_res = 0.0;
215  param.true_res_hq = 0.0;
216  return;
217  }
218 
219 
220  cudaColorSpinorField temp(b); // temporary field
221 
222  // Use ColorSpinorParam to create zerod fields
224  csParam.create = QUDA_ZERO_FIELD_CREATE;
225  cudaColorSpinorField x_prev(x,csParam);
226  cudaColorSpinorField x_new(x,csParam);
227 
228 
229  const int s = 2;
230 
231  // create the residual array and the matrix powers array
232  std::vector<cudaColorSpinorField> R(s+1,cudaColorSpinorField(b,csParam));
233  std::vector<cudaColorSpinorField> V(2*s+1,cudaColorSpinorField(b,csParam));
234 
235  // Set up the first residual
236  for(int i=0; i<s; ++i) blas::zero(R[i]);
237 
238 
239  mat(R[s], x, temp);
240  double r2 = blas::xmyNorm(b,R[s]);
241 
242  double stop = stopping(param.tol, b2, param.residual_type);
243 
244 
245  int it = 0;
246  double* d = new double[2*s+1];
247  double* d_p1 = new double[2*s+1];
248  double* d_p2 = new double[2*s+1];
249  double* g = new double[2*s+1];
250  double* g_p1 = new double[2*s+1];
251  double* g_p2 = new double[2*s+1];
252  double** G = new double*[2*s+1];
253  for(int i=0; i<(2*s+1); ++i){
254  G[i] = new double[2*s+1];
255  }
256 
257 
258  // Matrix powers kernel
259  // The first s vectors hold the previous s residuals
260  // v[s] holds current residual
261  // v[s+1] holds A r
262  // v[s+2] holds A^(2)r
263  // v[2*s] holds A^(s)r
265 
266  double rAr;
267 
268  // double gamma_prev = 0.0;
269  // double mu_prev = 0.0;
270  // double rho_prev = 0.0;
271  double rho[s];
272  double mu[s];
273  double gamma[s];
274  double rho_kprev[s];
275  double gamma_kprev[s];
276 
277  zero(mu,s);
278 
279  for(int i=0; i<(2*s+1); ++i){
280  zero(G[i], (2*s+1));
281  }
282 
283 
284  int k = 0;
285  while(!convergence(r2,0.0,stop,0.0) && it < param.maxiter){
286  // compute the matrix powers kernel - need to set r[s] above
287  computeMatrixPowers(V, R, s);
288  computeGramMatrix(G,V, mu);
289 
290  R[0] = R[s];
291 
292  int j = 0;
293  while(!convergence(r2,0.0,stop,0.0) && j<s){
294  const int prev_idx = j ? j-1 : s-1;
295  cudaColorSpinorField& R_prev = R[prev_idx];
296  double& mu_prev = mu[prev_idx];
297  double& rho_prev = rho[prev_idx];
298  double& gamma_prev = gamma[prev_idx];
299 
300 
301 
302  if(j == 0){
303  zero(d, 2*s+1); d[s+1] = 1.0;
304  w = V[s+1];
305  zero(g, 2*s+1); g[s] = 1.0;
306  }else{
307  if(j==1){
308  zero(d_p2, 2*s+1);
309  if(k > 0){
310  d_p2[s-2] = (1 - rho_kprev[s-1])/(rho_kprev[s-1]*gamma_kprev[s-1]);
311  d_p2[s-1] = (1./gamma_kprev[s-1]);
312  d_p2[s] = (-1./(gamma_kprev[s-1]*rho_kprev[s-1]));
313  }
314  zero(g_p2, 2*s+1);
315  if(k > 0) g_p2[s-1] = 1.0;
316  }
317  // compute g coeffs
318  for(int i=0; i<(2*s+1); ++i){
319  g[i] = rho[j-1]*g_p1[i] - rho[j-1]*gamma[j-1]*d_p1[i]
320  + (1 - rho[j-1])*g_p2[i];
321  }
322 
323  // compute d coeffs
324  computeCoeffs(d, d_p1, d_p2, k, j, s, gamma, rho, gamma_kprev, rho_kprev);
325  blas::zero(w);
326  for(int i=0; i<(2*s+1); ++i){
327  if(d[i] != 0.) blas::axpy(d[i], V[i], w);
328  }
329  }
330 
331  computeMuNu(r2, g, G, g, 2*s+1);
332  computeMuNu(rAr, g, G, d, 2*s+1);
333 
334  mu[j] = r2;
335  gamma[j] = r2/rAr;
336  rho[j] = (it==0) ? 1.0 : 1.0/(1.0 - (gamma[j]/gamma_prev)*(mu[j]/mu_prev)*(1.0/rho_prev));
337 
338  R[j+1] = R_prev;
339  blas::ax((1.0 - rho[j]), R[j+1]);
340  blas::axpy(rho[j], R[j], R[j+1]);
341  blas::axpy(-rho[j]*gamma[j], w, R[j+1]);
342 
343  x_new = x_prev;
344  blas::ax((1.0 - rho[j]), x_new);
345  blas::axpy(rho[j], x, x_new);
346  blas::axpy(gamma[j]*rho[j], R[j], x_new);
347 
348 
349 
350  // copy d to d_p1
351  if(j>0){
352  for(int i=0; i<(2*s+1); ++i) d_p2[i] = d_p1[i];
353  for(int i=0; i<(2*s+1); ++i) g_p2[i] = g_p1[i];
354  }
355  for(int i=0; i<(2*s+1); ++i) d_p1[i] = d[i];
356  for(int i=0; i<(2*s+1); ++i) g_p1[i] = g[i];
357 
358 
359  PrintStats("MPCG", it, r2, b2, 0.0);
360  it++;
361 
362  x_prev = x;
363  x = x_new;
364  ++j;
365  } // loop over j
366 
367 
368 
369  for(int i=0; i<s; ++i){
370  rho_kprev[i] = rho[i];
371  gamma_kprev[i] = gamma[i];
372  }
373  k++;
374  }
375 
376 
377  mat(R[0], x_prev, temp);
378  param.true_res = sqrt(blas::xmyNorm(b, R[0]) / b2);
379 
380 
381  PrintSummary("MPCG", it, r2, b2, stop, param.tol_hq);
382 
383  delete[] d;
384  delete[] d_p1;
385  delete[] d_p2;
386 
387  delete[] g;
388  delete[] g_p1;
389  delete[] g_p2;
390 
391  for(int i=0; i<(2*s+1); ++i){
392  delete[] G[i];
393  }
394  delete G;
395 #endif // sstep
396  return;
397  }
398 
399 } // namespace quda
void ax(double a, ColorSpinorField &x)
Definition: blas_quda.cu:508
virtual ~MPCG()
double mu
Definition: test_util.cpp:1648
#define errorQuda(...)
Definition: util_quda.h:121
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:721
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
void PrintStats(const char *name, int k, double r2, double b2, double hq2)
Prints out the running statistics of the solver (requires a verbosity of QUDA_VERBOSE) ...
Definition: solver.cpp:256
bool convergence(double r2, double hq2, double r2_tol, double hq_tol)
Definition: solver.cpp:223
double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
Definition: reduce_quda.cu:728
TimeProfile & profile
Definition: invert_quda.h:464
static int R[4]
double xmyNorm(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:75
const DiracMatrix & mat
Definition: invert_quda.h:675
static void computeCoeffs(T d_out[], const T d_p1[], const T d_p2[], int k, int j, int s, const T gamma[], const T rho[], const T gamma_kprev[], const T rho_kprev[])
static void applyT(T d_out[], const T d_in[], const T gamma[], const T rho[], int N)
QudaGaugeParam param
Definition: pack_test.cpp:17
MPCG(DiracMatrix &mat, SolverParam &param, TimeProfile &profile)
QudaResidualType residual_type
Definition: invert_quda.h:49
static double stopping(double tol, double b2, QudaResidualType residual_type)
Set the solver L2 stopping condition.
Definition: solver.cpp:206
ColorSpinorParam csParam
Definition: pack_test.cpp:24
void axpy(double a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:35
cpuColorSpinorField * in
int nvec[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1637
static map::iterator it
Definition: tune.cpp:109
static void applyB(T d_out[], const T d_in[], int N)
void zero(ColorSpinorField &a)
Definition: blas_quda.cu:472
int V
Definition: test_util.cpp:27
void computeMatrixPowers(cudaColorSpinorField out[], cudaColorSpinorField &in, int nvec)
SolverParam & param
Definition: invert_quda.h:463
cpuColorSpinorField * out
__shared__ float s[]
#define printfQuda(...)
Definition: util_quda.h:115
void operator()(ColorSpinorField &out, ColorSpinorField &in)
void print(const double d[], int n)
void PrintSummary(const char *name, int k, double r2, double b2, double r2_tol, double hq_tol)
Prints out the summary of the solver convergence (requires a verbosity of QUDA_SUMMARIZE). Assumes SolverParam.true_res and SolverParam.true_res_hq has been set.
Definition: solver.cpp:270
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
static void applyThirdTerm(T d_out[], const T d_in[], int k, int j, int s, const T gamma[], const T rho[], const T gamma_kprev[], const T rho_kprev[])
__device__ __host__ void zero(vector_type< scalar, n > &v)
Definition: cub_helper.cuh:54