QUDA  0.9.0
coarse_op.cuh
Go to the documentation of this file.
1 
2 namespace quda {
3 
4  // For coarsening un-preconditioned operators we use uni-directional
5  // coarsening to reduce the set up code. For debugging we can force
6  // bi-directional coarsening.
7  static bool bidirectional_debug = false;
8 
9  template <typename Float, typename coarseGauge, typename fineGauge, typename fineSpinor,
10  typename fineSpinorTmp, typename fineClover>
11  struct CalculateYArg {
12 
13  coarseGauge Y;
14  coarseGauge X;
15  coarseGauge Xinv;
17  fineSpinorTmp UV;
18  fineSpinor AV;
20  const fineGauge U;
21  const fineSpinor V;
22  const fineClover C;
23  const fineClover Cinv;
29  const int spin_bs;
33  Float kappa;
34  Float mu;
35  Float mu_factor;
37  const int fineVolumeCB;
38  const int coarseVolumeCB;
40  CalculateYArg(coarseGauge &Y, coarseGauge &X, coarseGauge &Xinv, fineSpinorTmp &UV, fineSpinor &AV, const fineGauge &U, const fineSpinor &V,
41  const fineClover &C, const fineClover &Cinv, double kappa, double mu, double mu_factor, const int *x_size_, const int *xc_size_, int *geo_bs_, int spin_bs_)
42  : Y(Y), X(X), Xinv(Xinv), UV(UV), AV(AV), U(U), V(V), C(C), Cinv(Cinv), spin_bs(spin_bs_), kappa(static_cast<Float>(kappa)), mu(static_cast<Float>(mu)),
43  mu_factor(static_cast<Float>(mu_factor)), fineVolumeCB(V.VolumeCB()), coarseVolumeCB(X.VolumeCB())
44  {
45  if (V.GammaBasis() != QUDA_DEGRAND_ROSSI_GAMMA_BASIS)
46  errorQuda("Gamma basis %d not supported", V.GammaBasis());
47 
48  for (int i=0; i<QUDA_MAX_DIM; i++) {
49  x_size[i] = x_size_[i];
50  xc_size[i] = xc_size_[i];
51  geo_bs[i] = geo_bs_[i];
53  }
54  }
55  };
56 
  /**
     Computes UV = U_mu(x) * W(x+mu) for a single fine site, parity and coarse
     color column, where W is V (forwards) or AV (backwards).  When coarsening
     an already-coarse operator (from_coarse), the link carries fine-spin
     (chirality) indices and the result is stored with a doubled spin range.
     Runs on host or device; one call handles one (parity, x_cb, ic_c) tuple.
   */
  template<bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg>
  __device__ __host__ inline void computeUV(Arg &arg, int parity, int x_cb, int ic_c) {

    // only for preconditioned clover is V != AV
    auto &W = (dir == QUDA_FORWARDS) ? arg.V : arg.AV;

    int coord[5];
    coord[4] = 0;  // 5th dimension unused here
    getCoords(coord, x_cb, arg.x_size, parity);

    // from_coarse doubles the spin range: index encodes (s_col*fineSpin + s)
    constexpr int uvSpin = fineSpin * (from_coarse ? 2 : 1);

    // per-thread accumulator; written back to arg.UV once at the end
    complex<Float> UV[uvSpin][fineColor];

    for(int s = 0; s < uvSpin; s++) {
      for(int c = 0; c < fineColor; c++) {
	UV[s][c] = static_cast<Float>(0.0);
      }
    }

    // If the neighbour in +dim lies off this node, read W from the ghost zone.
    if ( arg.comm_dim[dim] && (coord[dim] + 1 >= arg.x_size[dim]) ) {
      int nFace = 1;
      int ghost_idx = ghostFaceIndex<1>(coord, arg.x_size, dim, nFace);

      for(int s = 0; s < fineSpin; s++) {  //Fine Spin
	for(int ic = 0; ic < fineColor; ic++) { //Fine Color rows of gauge field
	  for(int jc = 0; jc < fineColor; jc++) { //Fine Color columns of gauge field
	    if (!from_coarse)
	      UV[s][ic] += arg.U(dim, parity, x_cb, ic, jc) * W.Ghost(dim, 1, (parity+1)&1, ghost_idx, s, jc, ic_c);
	    else
	      for (int s_col=0; s_col<fineSpin; s_col++) {
		// on the coarse lattice if forwards then use the forwards links
		UV[s_col*fineSpin+s][ic] += arg.U(dim + (dir == QUDA_FORWARDS ? 4 : 0), parity, x_cb, s, s_col, ic, jc) *
		  W.Ghost(dim, 1, (parity+1)&1, ghost_idx, s_col, jc, ic_c);
	      } // which chiral block
	  } //Fine color columns
	} //Fine color rows
      } //Fine Spin

    } else {
      // interior site: neighbour index on the opposite parity
      int y_cb = linkIndexP1(coord, arg.x_size, dim);

      for(int s = 0; s < fineSpin; s++) {  //Fine Spin
	for(int ic = 0; ic < fineColor; ic++) { //Fine Color rows of gauge field
	  for(int jc = 0; jc < fineColor; jc++) { //Fine Color columns of gauge field
	    if (!from_coarse)
	      UV[s][ic] += arg.U(dim, parity, x_cb, ic, jc) * W((parity+1)&1, y_cb, s, jc, ic_c);
	    else
	      for (int s_col=0; s_col<fineSpin; s_col++) {
		// on the coarse lattice if forwards then use the forwards links
		UV[s_col*fineSpin+s][ic] +=
		  arg.U(dim + (dir == QUDA_FORWARDS ? 4 : 0), parity, x_cb, s, s_col, ic, jc) *
		  W((parity+1)&1, y_cb, s_col, jc, ic_c);
	      } // which chiral block
	  } //Fine color columns
	} //Fine color rows
      } //Fine Spin

    }

    // single write-back of the accumulated result
    for(int s = 0; s < uvSpin; s++) {
      for(int c = 0; c < fineColor; c++) {
	arg.UV(parity,x_cb,s,c,ic_c) = UV[s][c];
      }
    }

  } // computeUV
130 
131  template<bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg>
132  void ComputeUVCPU(Arg &arg) {
133  for (int parity=0; parity<2; parity++) {
134  for (int x_cb=0; x_cb<arg.fineVolumeCB; x_cb++) {
135  for (int ic_c=0; ic_c < coarseColor; ic_c++) // coarse color
136  computeUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Arg>(arg, parity, x_cb, ic_c);
137  } // c/b volume
138  } // parity
139  }
140 
141  template<bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg>
142  __global__ void ComputeUVGPU(Arg arg) {
143  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
144  if (x_cb >= arg.fineVolumeCB) return;
145 
146  int parity = blockDim.y*blockIdx.y + threadIdx.y;
147  int ic_c = blockDim.z*blockIdx.z + threadIdx.z; // coarse color
148  if (ic_c >= coarseColor) return;
149  computeUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Arg>(arg, parity, x_cb, ic_c);
150  }
151 
156  template<typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
157  __device__ __host__ inline void computeAV(Arg &arg, int parity, int x_cb, int ic_c) {
158 
159  for(int s = 0; s < fineSpin; s++) {
160  for(int c = 0; c < fineColor; c++) {
161  arg.AV(parity,x_cb,s,c,ic_c) = static_cast<Float>(0.0);
162  }
163  }
164 
165  for(int s = 0; s < fineSpin; s++) { //Fine Spin
166  const int s_c = s/arg.spin_bs;
167 
168  //On the fine lattice, the clover field is chirally blocked, so loop over rows/columns
169  //in the same chiral block.
170  for(int s_col = s_c*arg.spin_bs; s_col < (s_c+1)*arg.spin_bs; s_col++) { //Loop over fine spin column
171 
172  for(int ic = 0; ic < fineColor; ic++) { //Fine Color rows of gauge field
173  for(int jc = 0; jc < fineColor; jc++) { //Fine Color columns of gauge field
174  arg.AV(parity, x_cb, s, ic, ic_c) +=
175  arg.Cinv(0, parity, x_cb, s, s_col, ic, jc) * arg.V(parity, x_cb, s_col, jc, ic_c);
176  } //Fine color columns
177  } //Fine color rows
178  }
179  } //Fine Spin
180 
181  } // computeAV
182 
183  template<typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
184  void ComputeAVCPU(Arg &arg) {
185  for (int parity=0; parity<2; parity++) {
186  for (int x_cb=0; x_cb<arg.fineVolumeCB; x_cb++) {
187  for (int ic_c=0; ic_c < coarseColor; ic_c++) // coarse color
188  computeAV<Float,fineSpin,fineColor,coarseColor,Arg>(arg, parity, x_cb, ic_c);
189  } // c/b volume
190  } // parity
191  }
192 
193  template<typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
194  __global__ void ComputeAVGPU(Arg arg) {
195  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
196  if (x_cb >= arg.fineVolumeCB) return;
197 
198  int parity = blockDim.y*blockIdx.y + threadIdx.y;
199  int ic_c = blockDim.z*blockIdx.z + threadIdx.z; // coarse color
200  if (ic_c >= coarseColor) return;
201  computeAV<Float,fineSpin,fineColor,coarseColor,Arg>(arg, parity, x_cb, ic_c);
202  }
203 
208  template<typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
209  __device__ __host__ inline void computeTMAV(Arg &arg, int parity, int x_cb, int v) {
210 
211  complex<Float> fp(1./(1.+arg.mu*arg.mu),-arg.mu/(1.+arg.mu*arg.mu));
212  complex<Float> fm(1./(1.+arg.mu*arg.mu),+arg.mu/(1.+arg.mu*arg.mu));
213 
214  for(int s = 0; s < fineSpin/2; s++) {
215  for(int c = 0; c < fineColor; c++) {
216  arg.AV(parity,x_cb,s,c,v) = arg.V(parity,x_cb,s,c,v)*fp;
217  }
218  }
219 
220  for(int s = fineSpin/2; s < fineSpin; s++) {
221  for(int c = 0; c < fineColor; c++) {
222  arg.AV(parity,x_cb,s,c,v) = arg.V(parity,x_cb,s,c,v)*fm;
223  }
224  }
225 
226  } // computeTMAV
227 
228  template<typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
229  void ComputeTMAVCPU(Arg &arg) {
230  for (int parity=0; parity<2; parity++) {
231  for (int x_cb=0; x_cb<arg.fineVolumeCB; x_cb++) {
232  for (int v=0; v<coarseColor; v++) // coarse color
233  computeTMAV<Float,fineSpin,fineColor,coarseColor,Arg>(arg, parity, x_cb, v);
234  } // c/b volume
235  } // parity
236  }
237 
238  template<typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
239  __global__ void ComputeTMAVGPU(Arg arg) {
240  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
241  if (x_cb >= arg.fineVolumeCB) return;
242 
243  int parity = blockDim.y*blockIdx.y + threadIdx.y;
244  int v = blockDim.z*blockIdx.z + threadIdx.z; // coarse color
245  if (v >= coarseColor) return;
246 
247  computeTMAV<Float,fineSpin,fineColor,coarseColor,Arg>(arg, parity, x_cb, v);
248  }
249 
250 #ifdef DYNAMIC_CLOVER
251  #ifdef UGLY_DYNCLOV
252  #include<dyninv_clover_mg.cuh>
253  #else
254 
  /**
     Applies (C^2 + mu^2)^{-1} to arg.UV, accumulating the result into arg.AV,
     one chiral block at a time.  Each chiral block is a Hermitian 6x6 matrix
     (2 spins x 3 colors) stored as 6 real diagonal entries plus 15
     lower-triangular complex entries (packed index k*(k-1)/2 + j for row k,
     col j).  The inverse is formed in place via a Cholesky factorization
     followed by forward/backward substitution.  Only used when DYNAMIC_CLOVER
     is defined (the inverse is computed on the fly rather than stored).
   */
  template<typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
  __device__ __host__ inline void applyInvClover(Arg &arg, int parity, int x_cb) {
    /* Applies the inverse of the clover term squared plus mu2 to the spinor */
    /* Compute (T^2 + mu2) first, then invert */
    /* We proceed by chiral blocks */

    for (int ch = 0; ch < 2; ch++) { /* Loop over chiral blocks */
      Float diag[6], tmp[6];
      complex<Float> tri[15]; /* Off-diagonal components of the inverse clover term */

      /* This macro avoid the infinitely long expansion of the tri products */
      /* Cl(s1,c1,s2,c2) reads clover element (spin s1+2*ch, color c1) x
	 (spin s2+2*ch, color c2) of the current chiral block. */

#define Cl(s1,c1,s2,c2) (arg.C(0, parity, x_cb, s1+2*ch, s2+2*ch, c1, c2))

      /* Lower triangle of T^2 for this chiral block (row-major packed).
	 The .real() factors use that the clover diagonal is real. */
      tri[0] = Cl(0,1,0,0)*Cl(0,0,0,0).real() + Cl(0,1,0,1)*Cl(0,1,0,0) + Cl(0,1,0,2)*Cl(0,2,0,0) + Cl(0,1,1,0)*Cl(1,0,0,0) + Cl(0,1,1,1)*Cl(1,1,0,0) + Cl(0,1,1,2)*Cl(1,2,0,0);
      tri[1] = Cl(0,2,0,0)*Cl(0,0,0,0).real() + Cl(0,2,0,2)*Cl(0,2,0,0) + Cl(0,2,0,1)*Cl(0,1,0,0) + Cl(0,2,1,0)*Cl(1,0,0,0) + Cl(0,2,1,1)*Cl(1,1,0,0) + Cl(0,2,1,2)*Cl(1,2,0,0);
      tri[3] = Cl(1,0,0,0)*Cl(0,0,0,0).real() + Cl(1,0,1,0)*Cl(1,0,0,0) + Cl(1,0,0,1)*Cl(0,1,0,0) + Cl(1,0,0,2)*Cl(0,2,0,0) + Cl(1,0,1,1)*Cl(1,1,0,0) + Cl(1,0,1,2)*Cl(1,2,0,0);
      tri[6] = Cl(1,1,0,0)*Cl(0,0,0,0).real() + Cl(1,1,1,1)*Cl(1,1,0,0) + Cl(1,1,0,1)*Cl(0,1,0,0) + Cl(1,1,0,2)*Cl(0,2,0,0) + Cl(1,1,1,0)*Cl(1,0,0,0) + Cl(1,1,1,2)*Cl(1,2,0,0);
      tri[10] = Cl(1,2,0,0)*Cl(0,0,0,0).real() + Cl(1,2,1,2)*Cl(1,2,0,0) + Cl(1,2,0,1)*Cl(0,1,0,0) + Cl(1,2,0,2)*Cl(0,2,0,0) + Cl(1,2,1,0)*Cl(1,0,0,0) + Cl(1,2,1,1)*Cl(1,1,0,0);

      tri[2] = Cl(0,2,0,1)*Cl(0,1,0,1).real() + Cl(0,2,0,2)*Cl(0,2,0,1) + Cl(0,2,0,0)*Cl(0,0,0,1) + Cl(0,2,1,0)*Cl(1,0,0,1) + Cl(0,2,1,1)*Cl(1,1,0,1) + Cl(0,2,1,2)*Cl(1,2,0,1);
      tri[4] = Cl(1,0,0,1)*Cl(0,1,0,1).real() + Cl(1,0,1,0)*Cl(1,0,0,1) + Cl(1,0,0,0)*Cl(0,0,0,1) + Cl(1,0,0,2)*Cl(0,2,0,1) + Cl(1,0,1,1)*Cl(1,1,0,1) + Cl(1,0,1,2)*Cl(1,2,0,1);
      tri[7] = Cl(1,1,0,1)*Cl(0,1,0,1).real() + Cl(1,1,1,1)*Cl(1,1,0,1) + Cl(1,1,0,0)*Cl(0,0,0,1) + Cl(1,1,0,2)*Cl(0,2,0,1) + Cl(1,1,1,0)*Cl(1,0,0,1) + Cl(1,1,1,2)*Cl(1,2,0,1);
      tri[11] = Cl(1,2,0,1)*Cl(0,1,0,1).real() + Cl(1,2,1,2)*Cl(1,2,0,1) + Cl(1,2,0,0)*Cl(0,0,0,1) + Cl(1,2,0,2)*Cl(0,2,0,1) + Cl(1,2,1,0)*Cl(1,0,0,1) + Cl(1,2,1,1)*Cl(1,1,0,1);

      tri[5] = Cl(1,0,0,2)*Cl(0,2,0,2).real() + Cl(1,0,1,0)*Cl(1,0,0,2) + Cl(1,0,0,0)*Cl(0,0,0,2) + Cl(1,0,0,1)*Cl(0,1,0,2) + Cl(1,0,1,1)*Cl(1,1,0,2) + Cl(1,0,1,2)*Cl(1,2,0,2);
      tri[8] = Cl(1,1,0,2)*Cl(0,2,0,2).real() + Cl(1,1,1,1)*Cl(1,1,0,2) + Cl(1,1,0,0)*Cl(0,0,0,2) + Cl(1,1,0,1)*Cl(0,1,0,2) + Cl(1,1,1,0)*Cl(1,0,0,2) + Cl(1,1,1,2)*Cl(1,2,0,2);
      tri[12] = Cl(1,2,0,2)*Cl(0,2,0,2).real() + Cl(1,2,1,2)*Cl(1,2,0,2) + Cl(1,2,0,0)*Cl(0,0,0,2) + Cl(1,2,0,1)*Cl(0,1,0,2) + Cl(1,2,1,0)*Cl(1,0,0,2) + Cl(1,2,1,1)*Cl(1,1,0,2);

      tri[9] = Cl(1,1,1,0)*Cl(1,0,1,0).real() + Cl(1,1,1,1)*Cl(1,1,1,0) + Cl(1,1,0,0)*Cl(0,0,1,0) + Cl(1,1,0,1)*Cl(0,1,1,0) + Cl(1,1,0,2)*Cl(0,2,1,0) + Cl(1,1,1,2)*Cl(1,2,1,0);
      tri[13] = Cl(1,2,1,0)*Cl(1,0,1,0).real() + Cl(1,2,1,2)*Cl(1,2,1,0) + Cl(1,2,0,0)*Cl(0,0,1,0) + Cl(1,2,0,1)*Cl(0,1,1,0) + Cl(1,2,0,2)*Cl(0,2,1,0) + Cl(1,2,1,1)*Cl(1,1,1,0);
      tri[14] = Cl(1,2,1,1)*Cl(1,1,1,1).real() + Cl(1,2,1,2)*Cl(1,2,1,1) + Cl(1,2,0,0)*Cl(0,0,1,1) + Cl(1,2,0,1)*Cl(0,1,1,1) + Cl(1,2,0,2)*Cl(0,2,1,1) + Cl(1,2,1,0)*Cl(1,0,1,1);

      /* Real diagonal of T^2 + mu^2: squared diagonal plus squared norms of
	 the off-diagonal row entries, shifted by mu^2. */
      diag[0] = arg.mu*arg.mu + Cl(0,0,0,0).real()*Cl(0,0,0,0).real() + norm(Cl(0,1,0,0)) + norm(Cl(0,2,0,0)) + norm(Cl(1,0,0,0)) + norm(Cl(1,1,0,0)) + norm(Cl(1,2,0,0));
      diag[1] = arg.mu*arg.mu + Cl(0,1,0,1).real()*Cl(0,1,0,1).real() + norm(Cl(0,0,0,1)) + norm(Cl(0,2,0,1)) + norm(Cl(1,0,0,1)) + norm(Cl(1,1,0,1)) + norm(Cl(1,2,0,1));
      diag[2] = arg.mu*arg.mu + Cl(0,2,0,2).real()*Cl(0,2,0,2).real() + norm(Cl(0,0,0,2)) + norm(Cl(0,1,0,2)) + norm(Cl(1,0,0,2)) + norm(Cl(1,1,0,2)) + norm(Cl(1,2,0,2));
      diag[3] = arg.mu*arg.mu + Cl(1,0,1,0).real()*Cl(1,0,1,0).real() + norm(Cl(0,0,1,0)) + norm(Cl(0,1,1,0)) + norm(Cl(0,2,1,0)) + norm(Cl(1,1,1,0)) + norm(Cl(1,2,1,0));
      diag[4] = arg.mu*arg.mu + Cl(1,1,1,1).real()*Cl(1,1,1,1).real() + norm(Cl(0,0,1,1)) + norm(Cl(0,1,1,1)) + norm(Cl(0,2,1,1)) + norm(Cl(1,0,1,1)) + norm(Cl(1,2,1,1));
      diag[5] = arg.mu*arg.mu + Cl(1,2,1,2).real()*Cl(1,2,1,2).real() + norm(Cl(0,0,1,2)) + norm(Cl(0,1,1,2)) + norm(Cl(0,2,1,2)) + norm(Cl(1,0,1,2)) + norm(Cl(1,1,1,2));

#undef Cl

      /* INVERSION STARTS */

      /* In-place Cholesky factorization: after this loop, diag holds the
	 (real) Cholesky diagonal, tmp its reciprocals, and tri the scaled
	 sub-diagonal columns. */
      for (int j=0; j<6; j++) {
	diag[j] = sqrt(diag[j]);
	tmp[j] = 1./diag[j];

	/* scale column j below the diagonal */
	for (int k=j+1; k<6; k++) {
	  int kj = k*(k-1)/2+j;
	  tri[kj] *= tmp[j];
	}

	/* rank-1 update of the trailing submatrix */
	for(int k=j+1;k<6;k++){
	  int kj=k*(k-1)/2+j;
	  diag[k] -= (tri[kj] * conj(tri[kj])).real();
	  for(int l=k+1;l<6;l++){
	    int lj=l*(l-1)/2+j;
	    int lk=l*(l-1)/2+k;
	    tri[lk] -= tri[lj] * conj(tri[kj]);
	  }
	}
      }

      /* Now use forward and backward substitution to construct inverse */
      /* Column k of the inverse is obtained by solving L L^dagger x = e_k. */
      complex<Float> v1[6];
      for (int k=0;k<6;k++) {
	for(int l=0;l<k;l++) v1[l] = complex<Float>(0.0, 0.0);

	/* Forward substitute */
	v1[k] = complex<Float>(tmp[k], 0.0);
	for(int l=k+1;l<6;l++){
	  complex<Float> sum = complex<Float>(0.0, 0.0);
	  for(int j=k;j<l;j++){
	    int lj=l*(l-1)/2+j;
	    sum -= tri[lj] * v1[j];
	  }
	  v1[l] = sum * tmp[l];
	}

	/* Backward substitute */
	v1[5] = v1[5] * tmp[5];
	for(int l=4;l>=k;l--){
	  complex<Float> sum = v1[l];
	  for(int j=l+1;j<6;j++){
	    int jl=j*(j-1)/2+l;
	    sum -= conj(tri[jl]) * v1[j];
	  }
	  v1[l] = sum * tmp[l];
	}

	/* Overwrite column k: the inverse reuses the diag/tri storage
	   (only the lower triangle is kept; the matrix is Hermitian). */
	diag[k] = v1[k].real();
	for(int l=k+1;l<6;l++){
	  int lk=l*(l-1)/2+k;
	  tri[lk] = v1[l];
	}
      }

      /* Calculate the product for the current chiral block */

      //Then we calculate AV = Cinv UV, so [AV = (C^2 + mu^2)^{-1} (Clover -/+ i mu)·Vector]
      //for in twisted-clover fermions, Cinv keeps (C^2 + mu^2)^{-1}

      for(int ic_c = 0; ic_c < coarseColor; ic_c++) { // Coarse Color
	// flattened (spin-within-block, color) row index j = s*fineColor + ic
	for (int j=0; j<(fineSpin/2)*fineColor; j++) { // This won't work for anything different than fineColor = 3, fineSpin = 4
	  int s = j / fineColor, ic = j % fineColor;

	  arg.AV(parity, x_cb, s+2*ch, ic, ic_c) += diag[j] * arg.UV(parity, x_cb, s+2*ch, ic, ic_c); // Diagonal clover

	  /* lower-triangle entries (k < j) */
	  for (int k=0; k<j; k++) {
	    const int jk = j*(j-1)/2 + k;
	    const int s_col = k / fineColor, jc = k % fineColor;

	    arg.AV(parity, x_cb, s+2*ch, ic, ic_c) += tri[jk] * arg.UV(parity, x_cb, s_col+2*ch, jc, ic_c); // Off-diagonal
	  }

	  /* upper-triangle entries via Hermiticity (k > j) */
	  for (int k=j+1; k<(fineSpin/2)*fineColor; k++) {
	    int kj = k*(k-1)/2 + j;
	    int s_col = k / fineColor, jc = k % fineColor;

	    arg.AV(parity, x_cb, s+2*ch, ic, ic_c) += conj(tri[kj]) * arg.UV(parity, x_cb, s_col+2*ch, jc, ic_c); // Off-diagonal
	  }
	}
      } // Coarse color
    } // Chirality
  }
382 
383  #endif // UGLY_DYNCLOV
384 #endif // DYNAMIC_CLOVER
385 
  /**
     Computes the twisted-clover AV = (C^2 + mu^2)^{-1} (C -/+ i mu) V for one
     fine site and parity, over all coarse colors.  UV is used as scratch to
     hold the intermediate (C -/+ i mu) V product.  When DYNAMIC_CLOVER is
     defined the inverse is applied on the fly via applyInvClover; otherwise
     arg.Cinv is assumed to already hold (C^2 + mu^2)^{-1}.
   */
  template<typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
  __device__ __host__ inline void computeTMCAV(Arg &arg, int parity, int x_cb) {

    complex<Float> mu(0.,arg.mu);  // i*mu

    // zero both scratch (UV) and output (AV) for this site
    for(int s = 0; s < fineSpin; s++) {
      for(int c = 0; c < fineColor; c++) {
	for(int v = 0; v < coarseColor; v++) {
	  arg.UV(parity,x_cb,s,c,v) = static_cast<Float>(0.0);
	  arg.AV(parity,x_cb,s,c,v) = static_cast<Float>(0.0);
	}
      }
    }

    //First we store in UV the product [(Clover -/+ i mu)·Vector]
    for(int s = 0; s < fineSpin; s++) { //Fine Spin
      const int s_c = s/arg.spin_bs;

      //On the fine lattice, the clover field is chirally blocked, so loop over rows/columns
      //in the same chiral block.
      for(int s_col = s_c*arg.spin_bs; s_col < (s_c+1)*arg.spin_bs; s_col++) { //Loop over fine spin column

	for(int ic_c = 0; ic_c < coarseColor; ic_c++) { //Coarse Color
	  for(int ic = 0; ic < fineColor; ic++) { //Fine Color rows of gauge field
	    for(int jc = 0; jc < fineColor; jc++) { //Fine Color columns of gauge field
	      arg.UV(parity, x_cb, s, ic, ic_c) +=
		arg.C(0, parity, x_cb, s, s_col, ic, jc) * arg.V(parity, x_cb, s_col, jc, ic_c);
	    } //Fine color columns
	  } //Fine color rows
	} //Coarse color
      }
    } //Fine Spin

    // subtract i*mu*V on the lower chirality ...
    for(int s = 0; s < fineSpin/2; s++) { //Fine Spin
      for(int ic_c = 0; ic_c < coarseColor; ic_c++) { //Coarse Color
	for(int ic = 0; ic < fineColor; ic++) { //Fine Color
	  arg.UV(parity, x_cb, s, ic, ic_c) -= mu * arg.V(parity, x_cb, s, ic, ic_c);
	} //Fine color
      } //Coarse color
    } //Fine Spin

    // ... and add it on the upper chirality (gamma5 structure of the twist)
    for(int s = fineSpin/2; s < fineSpin; s++) { //Fine Spin
      for(int ic_c = 0; ic_c < coarseColor; ic_c++) { //Coarse Color
	for(int ic = 0; ic < fineColor; ic++) { //Fine Color
	  arg.UV(parity, x_cb, s, ic, ic_c) += mu * arg.V(parity, x_cb, s, ic, ic_c);
	} //Fine color
      } //Coarse color
    } //Fine Spin

#ifndef DYNAMIC_CLOVER
    //Then we calculate AV = Cinv UV, so [AV = (C^2 + mu^2)^{-1} (Clover -/+ i mu)·Vector]
    //for in twisted-clover fermions, Cinv keeps (C^2 + mu^2)^{-1}
    for(int s = 0; s < fineSpin; s++) { //Fine Spin
      const int s_c = s/arg.spin_bs;

      //On the fine lattice, the clover field is chirally blocked, so loop over rows/columns
      //in the same chiral block.
      for(int s_col = s_c*arg.spin_bs; s_col < (s_c+1)*arg.spin_bs; s_col++) { //Loop over fine spin column

	for(int ic_c = 0; ic_c < coarseColor; ic_c++) { //Coarse Color
	  for(int ic = 0; ic < fineColor; ic++) { //Fine Color rows of gauge field
	    for(int jc = 0; jc < fineColor; jc++) { //Fine Color columns of gauge field
	      arg.AV(parity, x_cb, s, ic, ic_c) +=
		arg.Cinv(0, parity, x_cb, s, s_col, ic, jc) * arg.UV(parity, x_cb, s_col, jc, ic_c);
	    } //Fine color columns
	  } //Fine color rows
	} //Coarse color
      }
    } //Fine Spin
#else
    // inverse not stored: build and apply (C^2 + mu^2)^{-1} on the fly
    applyInvClover<Float,fineSpin,fineColor,coarseColor,Arg>(arg, parity, x_cb);
#endif
  } // computeTMCAV
463 
464  template<typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
465  void ComputeTMCAVCPU(Arg &arg) {
466  for (int parity=0; parity<2; parity++) {
467  for (int x_cb=0; x_cb<arg.fineVolumeCB; x_cb++) {
468  computeTMCAV<Float,fineSpin,fineColor,coarseColor,Arg>(arg, parity, x_cb);
469  } // c/b volume
470  } // parity
471  }
472 
473  template<typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
474  __global__ void ComputeTMCAVGPU(Arg arg) {
475  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
476  if (x_cb >= arg.fineVolumeCB) return;
477 
478  int parity = blockDim.y*blockIdx.y + threadIdx.y;
479  computeTMCAV<Float,fineSpin,fineColor,coarseColor,Arg>(arg, parity, x_cb);
480  }
481 
493  template <bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg>
494  __device__ __host__ inline void multiplyVUV(complex<Float> vuv[], Arg &arg, int parity, int x_cb, int ic_c) {
495 
497 
498  for (int i=0; i<coarseSpin*coarseSpin*coarseColor; i++) vuv[i] = 0.0;
499 
500  if (!from_coarse) { // fine grid is top level
501 
502  for(int s = 0; s < fineSpin; s++) { //Loop over fine spin
503 
504  //Spin part of the color matrix. Will always consist
505  //of two terms - diagonal and off-diagonal part of
506  //P_mu = (1+/-\gamma_mu)
507 
508  int s_c_row = s/arg.spin_bs; //Coarse spin row index
509 
510  //Use Gamma to calculate off-diagonal coupling and
511  //column index. Diagonal coupling is always 1.
512  // If computing the backwards (forwards) direction link then
513  // we desire the positive (negative) projector
514 
515  int s_col;
516  complex<Float> coupling = gamma.getrowelem(s, s_col);
517  int s_c_col = s_col/arg.spin_bs;
518 
519  { //for(int ic_c = 0; ic_c < coarseColor; ic_c++) { //Coarse Color row
520  for(int jc_c = 0; jc_c < coarseColor; jc_c++) { //Coarse Color column
521  for(int ic = 0; ic < fineColor; ic++) { //Sum over fine color
522  if (dir == QUDA_BACKWARDS) {
523  // here UV is really UAV
524  //Diagonal Spin
525  // vuv[((s_c_row*coarseSpin+s_c_row)*coarseColor+ic_c)*coarseColor+jc_c] +=
526  vuv[(s_c_row*coarseSpin+s_c_row)*coarseColor+jc_c] +=
527  conj(arg.V(parity, x_cb, s, ic, ic_c)) * arg.UV(parity, x_cb, s, ic, jc_c);
528 
529  //Off-diagonal Spin (backward link / positive projector applied)
530  //vuv[((s_c_row*coarseSpin+s_c_col)*coarseColor+ic_c)*coarseColor+jc_c] +=
531  vuv[(s_c_row*coarseSpin+s_c_col)*coarseColor+jc_c] +=
532  coupling * conj(arg.V(parity, x_cb, s, ic, ic_c)) * arg.UV(parity, x_cb, s_col, ic, jc_c);
533  } else {
534  //Diagonal Spin
535  //vuv[((s_c_row*coarseSpin+s_c_row)*coarseColor+ic_c)*coarseColor+jc_c] +=
536  vuv[(s_c_row*coarseSpin+s_c_row)*coarseColor+jc_c] +=
537  conj(arg.AV(parity, x_cb, s, ic, ic_c)) * arg.UV(parity, x_cb, s, ic, jc_c);
538 
539  //Off-diagonal Spin (forward link / negative projector applied)
540  //vuv[((s_c_row*coarseSpin+s_c_col)*coarseColor+ic_c)*coarseColor+jc_c] -=
541  vuv[(s_c_row*coarseSpin+s_c_col)*coarseColor+jc_c] -=
542  coupling * conj(arg.AV(parity, x_cb, s, ic, ic_c)) * arg.UV(parity, x_cb, s_col, ic, jc_c);
543  }
544  } //Fine color
545  } //Coarse Color column
546  } //Coarse Color row
547  }
548 
549  } else { // fine grid operator is a coarse operator
550 
551  for (int s_col=0; s_col<fineSpin; s_col++) { // which chiral block
552  for (int s = 0; s < fineSpin; s++) {
553  //for(int ic_c = 0; ic_c < coarseColor; ic_c++) { //Coarse Color row
554  for(int jc_c = 0; jc_c < coarseColor; jc_c++) { //Coarse Color column
555  for(int ic = 0; ic < fineColor; ic++) { //Sum over fine color
556  //vuv[((s*coarseSpin+s_col)*coarseColor+ic_)c*coarseColo+jc_c] +=
557  vuv[(s*coarseSpin+s_col)*coarseColor+jc_c] +=
558  conj(arg.AV(parity, x_cb, s, ic, ic_c)) * arg.UV(parity, x_cb, s_col*fineSpin+s, ic, jc_c);
559  } //Fine color
560  } //Coarse Color column
561  //} //Coarse Color row
562  } //Fine spin
563  }
564 
565  } // from_coarse
566 
567  }
568 
  /**
     Accumulates the V^dagger U V contraction for one fine site, parity and
     coarse color row into the coarse fields: into Y (off-diagonal link) when
     the neighbouring fine site lies in a different coarse block, otherwise
     into X (backwards) or Xinv (forwards).  Accumulation is atomic since many
     fine sites map to the same coarse site.
   */
  template<bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg>
  __device__ __host__ void computeVUV(Arg &arg, int parity, int x_cb, int c_row) {

    const int nDim = 4;
    int coord[QUDA_MAX_DIM];
    int coord_coarse[QUDA_MAX_DIM];
    int coarse_size = 1;  // NOTE(review): computed but not used below
    for(int d = 0; d<nDim; d++) coarse_size *= arg.xc_size[d];

    getCoords(coord, x_cb, arg.x_size, parity);
    for(int d = 0; d < nDim; d++) coord_coarse[d] = coord[d]/arg.geo_bs[d];

    //Check to see if we are on the edge of a block.  If adjacent site
    //is in same block, M = X, else M = Y
    const bool isDiagonal = ((coord[dim]+1)%arg.x_size[dim])/arg.geo_bs[dim] == coord_coarse[dim] ? true : false;

    // store the forward and backward clover contributions separately for now since they can't be added coeherently easily
    auto &M = isDiagonal ? (dir == QUDA_BACKWARDS ? arg.X : arg.Xinv) : arg.Y;
    const int dim_index = isDiagonal ? 0 : (dir == QUDA_BACKWARDS ? dim : dim + 4);

    // coarse parity / checkerboard index of the destination coarse site
    int coarse_parity = 0;
    for (int d=0; d<nDim; d++) coarse_parity += coord_coarse[d];
    coarse_parity &= 1;
    coord_coarse[0] /= 2;
    int coarse_x_cb = ((coord_coarse[3]*arg.xc_size[2]+coord_coarse[2])*arg.xc_size[1]+coord_coarse[1])*(arg.xc_size[0]/2) + coord_coarse[0];
    coord[0] /= 2;

    // one coarse color row's worth of the contraction
    complex<Float> vuv[coarseSpin*coarseSpin*coarseColor];
    multiplyVUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Arg>(vuv, arg, parity, x_cb, c_row);

    for (int s_row = 0; s_row < coarseSpin; s_row++) { // Chiral row block
      for (int s_col = 0; s_col < coarseSpin; s_col++) { // Chiral column block
	for(int c_col = 0; c_col < coarseColor; c_col++) { // Coarse Color column
	  M.atomicAdd(dim_index,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,
		      vuv[(s_row*coarseSpin+s_col)*coarseColor+c_col]);
	} //Coarse Color column
      }
    }

  }
612 
613  template<bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg>
614  void ComputeVUVCPU(Arg arg) {
615  for (int parity=0; parity<2; parity++) {
616  for (int x_cb=0; x_cb<arg.fineVolumeCB; x_cb++) { // Loop over fine volume
617  for (int c_row=0; c_row<coarseColor; c_row++)
618  computeVUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Arg>(arg, parity, x_cb, c_row);
619  } // c/b volume
620  } // parity
621  }
622 
623  template<bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg>
624  __global__ void ComputeVUVGPU(Arg arg) {
625  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
626  if (x_cb >= arg.fineVolumeCB) return;
627 
628  int parity = blockDim.y*blockIdx.y + threadIdx.y;
629  int c_row = blockDim.z*blockIdx.z + threadIdx.z; // coarse color
630  if (c_row >= coarseColor) return;
631  computeVUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Arg>(arg, parity, x_cb, c_row);
632  }
633 
638  template<typename Float, int nSpin, int nColor, typename Arg>
639  __device__ __host__ void computeYreverse(Arg &arg, int parity, int x_cb) {
640  auto &Y = arg.Y;
641 
642  for (int d=0; d<4; d++) {
643  for(int s_row = 0; s_row < nSpin; s_row++) { //Spin row
644  for(int s_col = 0; s_col < nSpin; s_col++) { //Spin column
645 
646  const Float sign = (s_row == s_col) ? static_cast<Float>(1.0) : static_cast<Float>(-1.0);
647 
648  for(int ic_c = 0; ic_c < nColor; ic_c++) { //Color row
649  for(int jc_c = 0; jc_c < nColor; jc_c++) { //Color column
650  Y(d+4,parity,x_cb,s_row,s_col,ic_c,jc_c) = sign*Y(d,parity,x_cb,s_row,s_col,ic_c,jc_c);
651  } //Color column
652  } //Color row
653  } //Spin column
654  } //Spin row
655 
656  } // dimension
657 
658  }
659 
660  template<typename Float, int nSpin, int nColor, typename Arg>
661  void ComputeYReverseCPU(Arg &arg) {
662  for (int parity=0; parity<2; parity++) {
663  for (int x_cb=0; x_cb<arg.coarseVolumeCB; x_cb++) {
664  computeYreverse<Float,nSpin,nColor,Arg>(arg, parity, x_cb);
665  } // c/b volume
666  } // parity
667  }
668 
669  template<typename Float, int nSpin, int nColor, typename Arg>
670  __global__ void ComputeYReverseGPU(Arg arg) {
671  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
672  if (x_cb >= arg.coarseVolumeCB) return;
673 
674  int parity = blockDim.y*blockIdx.y + threadIdx.y;
675  computeYreverse<Float,nSpin,nColor,Arg>(arg, parity, x_cb);
676  }
677 
  /**
     Symmetrizes the local coarse diagonal term in place for one coarse site
     and parity: X <- -kappa * (forward-contribution + backward-contribution^dagger).
     In the bidirectional case the forward contribution was accumulated into
     Xinv; in the unidirectional case it is reconstructed from X itself via
     the chiral sign flip.
   */
  template<bool bidirectional, typename Float, int nSpin, int nColor, typename Arg>
  __device__ __host__ void computeCoarseLocal(Arg &arg, int parity, int x_cb)
  {
    complex<Float> Xlocal[nSpin*nSpin*nColor*nColor];

    for(int s_row = 0; s_row < nSpin; s_row++) { //Spin row
      for(int s_col = 0; s_col < nSpin; s_col++) { //Spin column

	//Copy the Hermitian conjugate term to temp location
	for(int ic_c = 0; ic_c < nColor; ic_c++) { //Color row
	  for(int jc_c = 0; jc_c < nColor; jc_c++) { //Color column
	    //Flip s_col, s_row on the rhs because of Hermitian conjugation.  Color part left untransposed.
	    Xlocal[((nSpin*s_col+s_row)*nColor+ic_c)*nColor+jc_c] = arg.X(0,parity,x_cb,s_row, s_col, ic_c, jc_c);
	  }
	}
      }
    }

    for(int s_row = 0; s_row < nSpin; s_row++) { //Spin row
      for(int s_col = 0; s_col < nSpin; s_col++) { //Spin column

	// chiral sign: +1 on the spin diagonal, -1 off it; only used in the
	// unidirectional branch below
	const Float sign = (s_row == s_col) ? static_cast<Float>(1.0) : static_cast<Float>(-1.0);

	for(int ic_c = 0; ic_c < nColor; ic_c++) { //Color row
	  for(int jc_c = 0; jc_c < nColor; jc_c++) { //Color column
	    if (bidirectional) {
	      // here we have forwards links in Xinv and backwards links in X
	      arg.X(0,parity,x_cb,s_row,s_col,ic_c,jc_c) =
		-arg.kappa*(arg.Xinv(0,parity,x_cb,s_row,s_col,ic_c,jc_c)
			    +conj(Xlocal[((nSpin*s_row+s_col)*nColor+jc_c)*nColor+ic_c]));
	    } else {
	      // here we have just backwards links; the sign flip reconstructs
	      // the forward contribution from the backward one
	      arg.X(0,parity,x_cb,s_row,s_col,ic_c,jc_c) =
		-arg.kappa*(sign*arg.X(0,parity,x_cb,s_row,s_col,ic_c,jc_c)
			    +conj(Xlocal[((nSpin*s_row+s_col)*nColor+jc_c)*nColor+ic_c]));
	    }
	  } //Color column
	} //Color row
      } //Spin column
    } //Spin row

  }
727 
728  template<bool bidirectional, typename Float, int nSpin, int nColor, typename Arg>
730  for (int parity=0; parity<2; parity++) {
731  for (int x_cb=0; x_cb<arg.coarseVolumeCB; x_cb++) {
732  computeCoarseLocal<bidirectional,Float,nSpin,nColor,Arg>(arg, parity, x_cb);
733  } // c/b volume
734  } // parity
735  }
736 
737  template<bool bidirectional, typename Float, int nSpin, int nColor, typename Arg>
738  __global__ void ComputeCoarseLocalGPU(Arg arg) {
739  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
740  if (x_cb >= arg.coarseVolumeCB) return;
741 
742  int parity = blockDim.y*blockIdx.y + threadIdx.y;
743  computeCoarseLocal<bidirectional,Float,nSpin,nColor,Arg>(arg, parity, x_cb);
744  }
745 
746 
  /**
     Computes the coarse clover contribution X += V^dagger C V for one fine
     site, parity and coarse color row ic_c, accumulating atomically into the
     destination coarse site.  On the top level (!from_coarse) the clover is
     chirally blocked; when coarsening an already-coarse operator the spin
     structure is dense.
   */
  template<bool from_coarse, typename Float, int fineSpin, int coarseSpin, int fineColor, int coarseColor, typename Arg>
  __device__ __host__ void computeCoarseClover(Arg &arg, int parity, int x_cb, int ic_c) {

    const int nDim = 4;

    int coord[QUDA_MAX_DIM];
    int coord_coarse[QUDA_MAX_DIM];
    int coarse_size = 1;  // NOTE(review): computed but not used below
    for(int d = 0; d<nDim; d++) coarse_size *= arg.xc_size[d];

    getCoords(coord, x_cb, arg.x_size, parity);
    for (int d=0; d<nDim; d++) coord_coarse[d] = coord[d]/arg.geo_bs[d];

    // coarse parity / checkerboard index of the destination coarse site
    int coarse_parity = 0;
    for (int d=0; d<nDim; d++) coarse_parity += coord_coarse[d];
    coarse_parity &= 1;
    coord_coarse[0] /= 2;
    int coarse_x_cb = ((coord_coarse[3]*arg.xc_size[2]+coord_coarse[2])*arg.xc_size[1]+coord_coarse[1])*(arg.xc_size[0]/2) + coord_coarse[0];

    coord[0] /= 2;

    // per-thread accumulator for one coarse color row
    complex<Float> X[coarseSpin*coarseSpin*coarseColor];
    for (int i=0; i<coarseSpin*coarseSpin*coarseColor; i++) X[i] = 0.0;

    if (!from_coarse) {
      //If Nspin = 4, then the clover term has structure C_{\mu\nu} = \gamma_{\mu\nu}C^{\mu\nu}
      for(int s = 0; s < fineSpin; s++) { //Loop over fine spin row
	int s_c = s/arg.spin_bs;
	//On the fine lattice, the clover field is chirally blocked, so loop over rows/columns
	//in the same chiral block.
	for(int s_col = s_c*arg.spin_bs; s_col < (s_c+1)*arg.spin_bs; s_col++) { //Loop over fine spin column
	  for(int jc_c = 0; jc_c < coarseColor; jc_c++) { //Coarse Color column
	    for(int ic = 0; ic < fineColor; ic++) { //Sum over fine color row
	      for(int jc = 0; jc < fineColor; jc++) { //Sum over fine color column
		// contribution lands on the coarse spin diagonal (s_c, s_c)
		X[ (s_c*coarseSpin + s_c)*coarseColor + jc_c] +=
		  conj(arg.V(parity, x_cb, s, ic, ic_c)) * arg.C(0, parity, x_cb, s, s_col, ic, jc) * arg.V(parity, x_cb, s_col, jc, jc_c);
	      } //Fine color column
	    } //Fine color row
	  } //Coarse Color column
	} //Fine spin column
      } //Fine spin
    } else {
      //If Nspin != 4, then spin structure is a dense matrix and there is now spin aggregation
      //N.B. assumes that no further spin blocking is done in this case.
      for(int s = 0; s < fineSpin; s++) { //Loop over spin row
	for(int s_col = 0; s_col < fineSpin; s_col++) { //Loop over spin column
	  for(int jc_c = 0; jc_c <coarseColor; jc_c++) { //Coarse Color column
	    for(int ic = 0; ic < fineColor; ic++) { //Sum over fine color row
	      for(int jc = 0; jc < fineColor; jc++) { //Sum over fine color column
		X[ (s*coarseSpin + s_col)*coarseColor + jc_c] +=
		  conj(arg.V(parity, x_cb, s, ic, ic_c)) * arg.C(0, parity, x_cb, s, s_col, ic, jc) * arg.V(parity, x_cb, s_col, jc, jc_c);
	      } //Fine color column
	    } //Fine color row
	  } //Coarse Color column
	} //Fine spin column
      } //Fine spin
    }

    // atomic accumulation: many fine sites contribute to the same coarse site
    for (int si = 0; si < coarseSpin; si++) {
      for (int sj = 0; sj < coarseSpin; sj++) {
	for (int jc = 0; jc < coarseColor; jc++) {
	  arg.X.atomicAdd(0,coarse_parity,coarse_x_cb,si,sj,ic_c,jc,X[(si*coarseSpin+sj)*coarseColor+jc]);
	}
      }
    }

  }
820 
821  template <bool from_coarse, typename Float, int fineSpin, int coarseSpin, int fineColor, int coarseColor, typename Arg>
823  for (int parity=0; parity<2; parity++) {
824  for (int x_cb=0; x_cb<arg.fineVolumeCB; x_cb++) {
825  for (int ic_c=0; ic_c<coarseColor; ic_c++) {
826  computeCoarseClover<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>(arg, parity, x_cb, ic_c);
827  }
828  } // c/b volume
829  } // parity
830  }
831 
832  template <bool from_coarse, typename Float, int fineSpin, int coarseSpin, int fineColor, int coarseColor, typename Arg>
833  __global__ void ComputeCoarseCloverGPU(Arg arg) {
834  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
835  if (x_cb >= arg.fineVolumeCB) return;
836  int parity = blockDim.y*blockIdx.y + threadIdx.y;
837  int ic_c = blockDim.z*blockIdx.z + threadIdx.z; // coarse color
838  if (ic_c >= coarseColor) return;
839  computeCoarseClover<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>(arg, parity, x_cb, ic_c);
840  }
841 
842 
843 
844  //Adds the identity matrix to the coarse local term.
845  template<typename Float, int nSpin, int nColor, typename Arg>
847  for (int parity=0; parity<2; parity++) {
848  for (int x_cb=0; x_cb<arg.coarseVolumeCB; x_cb++) {
849  for(int s = 0; s < nSpin; s++) { //Spin
850  for(int c = 0; c < nColor; c++) { //Color
851  arg.X(0,parity,x_cb,s,s,c,c) += static_cast<Float>(1.0);
852  } //Color
853  } //Spin
854  } // x_cb
855  } //parity
856  }
857 
858 
859  //Adds the identity matrix to the coarse local term.
860  template<typename Float, int nSpin, int nColor, typename Arg>
861  __global__ void AddCoarseDiagonalGPU(Arg arg) {
862  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
863  if (x_cb >= arg.coarseVolumeCB) return;
864  int parity = blockDim.y*blockIdx.y + threadIdx.y;
865 
866  for(int s = 0; s < nSpin; s++) { //Spin
867  for(int c = 0; c < nColor; c++) { //Color
868  arg.X(0,parity,x_cb,s,s,c,c) += static_cast<Float>(1.0);
869  } //Color
870  } //Spin
871  }
872 
873  //Adds the twisted-mass term to the coarse local term.
874  template<typename Float, int nSpin, int nColor, typename Arg>
876 
877  const complex<Float> mu(0., arg.mu*arg.mu_factor);
878 
879  for (int parity=0; parity<2; parity++) {
880  for (int x_cb=0; x_cb<arg.coarseVolumeCB; x_cb++) {
881  for(int s = 0; s < nSpin/2; s++) { //Spin
882  for(int c = 0; c < nColor; c++) { //Color
883  arg.X(0,parity,x_cb,s,s,c,c) += mu;
884  } //Color
885  } //Spin
886  for(int s = nSpin/2; s < nSpin; s++) { //Spin
887  for(int c = 0; c < nColor; c++) { //Color
888  arg.X(0,parity,x_cb,s,s,c,c) -= mu;
889  } //Color
890  } //Spin
891  } // x_cb
892  } //parity
893  }
894 
895  //Adds the twisted-mass term to the coarse local term.
896  template<typename Float, int nSpin, int nColor, typename Arg>
897  __global__ void AddCoarseTmDiagonalGPU(Arg arg) {
898  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
899  if (x_cb >= arg.coarseVolumeCB) return;
900  int parity = blockDim.y*blockIdx.y + threadIdx.y;
901 
902  const complex<Float> mu(0., arg.mu*arg.mu_factor);
903 
904  for(int s = 0; s < nSpin/2; s++) { //Spin
905  for(int ic_c = 0; ic_c < nColor; ic_c++) { //Color
906  arg.X(0,parity,x_cb,s,s,ic_c,ic_c) += mu;
907  } //Color
908  } //Spin
909  for(int s = nSpin/2; s < nSpin; s++) { //Spin
910  for(int ic_c = 0; ic_c < nColor; ic_c++) { //Color
911  arg.X(0,parity,x_cb,s,s,ic_c,ic_c) -= mu;
912  } //Color
913  } //Spin
914  }
915 
916  enum ComputeType {
928  };
929 
930  template <bool from_coarse, typename Float, int fineSpin,
931  int fineColor, int coarseSpin, int coarseColor, typename Arg>
932  class CalculateY : public TunableVectorYZ {
933 
934  protected:
935  Arg &arg;
940 
941  int dim;
945 
946  long long flops() const
947  {
948  long long flops_ = 0;
949  switch (type) {
950  case COMPUTE_UV:
951  // when fine operator is coarse take into account that the link matrix has spin dependence
952  flops_ = 2l * arg.fineVolumeCB * 8 * fineSpin * coarseColor * fineColor * fineColor * (!from_coarse ? 1 : fineSpin);
953  break;
954  case COMPUTE_AV:
955  case COMPUTE_TMAV:
956  // # chiral blocks * size of chiral block * number of null space vectors
957  flops_ = 2l * arg.fineVolumeCB * 8 * (fineSpin/2) * (fineSpin/2) * (fineSpin/2) * fineColor * fineColor * coarseColor;
958  break;
959  case COMPUTE_TMCAV:
960  // # Twice chiral blocks * size of chiral block * number of null space vectors
961  flops_ = 4l * arg.fineVolumeCB * 8 * (fineSpin/2) * (fineSpin/2) * (fineSpin/2) * fineColor * fineColor * coarseColor;
962  break;
963  case COMPUTE_VUV:
964  // when the fine operator is truly fine the VUV multiplication is block sparse which halves the number of operations
965  flops_ = 2l * arg.fineVolumeCB * 8 * fineSpin * fineSpin * coarseColor * coarseColor * fineColor / (!from_coarse ? coarseSpin : 1);
966  break;
968  // when the fine operator is truly fine the clover multiplication is block sparse which halves the number of operations
969  flops_ = 2l * arg.fineVolumeCB * 8 * fineSpin * fineSpin * coarseColor * coarseColor * fineColor * fineColor / (!from_coarse ? coarseSpin : 1);
970  break;
971  case COMPUTE_REVERSE_Y:
972  // no floating point operations
973  flops_ = 0;
974  break;
976  // complex addition over all components
977  flops_ = 2l * arg.coarseVolumeCB*coarseSpin*coarseSpin*coarseColor*coarseColor*2;
978  break;
979  case COMPUTE_DIAGONAL:
980  case COMPUTE_TMDIAGONAL:
981  // read addition on the diagonal
982  flops_ = 2l * arg.coarseVolumeCB*coarseSpin*coarseColor;
983  break;
984  default:
985  errorQuda("Undefined compute type %d", type);
986  }
987  // 2 from parity, 8 from complex
988  return flops_;
989  }
990  long long bytes() const
991  {
992  long long bytes_ = 0;
993  switch (type) {
994  case COMPUTE_UV:
995  bytes_ = arg.UV.Bytes() + arg.V.Bytes() + 2*arg.U.Bytes()*coarseColor;
996  break;
997  case COMPUTE_AV:
998  bytes_ = arg.AV.Bytes() + arg.V.Bytes() + 2*arg.C.Bytes();
999  break;
1000  case COMPUTE_TMAV:
1001  bytes_ = arg.AV.Bytes() + arg.V.Bytes();
1002  break;
1003  case COMPUTE_TMCAV:
1004  bytes_ = arg.AV.Bytes() + arg.V.Bytes() + arg.UV.Bytes() + 4*arg.C.Bytes(); // Two clover terms and more temporary storage
1005  break;
1006  case COMPUTE_VUV:
1007  bytes_ = arg.UV.Bytes() + arg.V.Bytes();
1008  break;
1009  case COMPUTE_COARSE_CLOVER:
1010  bytes_ = 2*arg.X.Bytes() + 2*arg.C.Bytes() + arg.V.Bytes(); // 2 from parity
1011  break;
1012  case COMPUTE_REVERSE_Y:
1013  bytes_ = 4*2*2*arg.Y.Bytes(); // 4 from direction, 2 from i/o, 2 from parity
1014  case COMPUTE_COARSE_LOCAL:
1015  case COMPUTE_DIAGONAL:
1016  case COMPUTE_TMDIAGONAL:
1017  bytes_ = 2*2*arg.X.Bytes(); // 2 from i/o, 2 from parity
1018  break;
1019  default:
1020  errorQuda("Undefined compute type %d", type);
1021  }
1022  return bytes_;
1023  }
1024 
1025  unsigned int minThreads() const {
1026  unsigned int threads = 0;
1027  switch (type) {
1028  case COMPUTE_UV:
1029  case COMPUTE_AV:
1030  case COMPUTE_TMAV:
1031  case COMPUTE_TMCAV:
1032  case COMPUTE_VUV:
1033  case COMPUTE_COARSE_CLOVER:
1034  threads = arg.fineVolumeCB;
1035  break;
1036  case COMPUTE_REVERSE_Y:
1037  case COMPUTE_COARSE_LOCAL:
1038  case COMPUTE_DIAGONAL:
1039  case COMPUTE_TMDIAGONAL:
1040  threads = arg.coarseVolumeCB;
1041  break;
1042  default:
1043  errorQuda("Undefined compute type %d", type);
1044  }
1045  return threads;
1046  }
1047 
1048  bool tuneGridDim() const { return false; } // don't tune the grid dimension
1049 
1050  public:
1054  meta(meta), Y(Y), X(X), Xinv(Xinv), dim(0), dir(QUDA_BACKWARDS)
1055  {
1056  strcpy(aux, meta.AuxString());
1058  }
1059  virtual ~CalculateY() { }
1060 
1061  void apply(const cudaStream_t &stream) {
1062  TuneParam tp = tuneLaunch(*this, getTuning(), QUDA_VERBOSE);
1063 
1065 
1066  if (type == COMPUTE_UV) {
1067 
1068  if (dir == QUDA_BACKWARDS) {
1069  if (dim==0) ComputeUVCPU<from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1070  else if (dim==1) ComputeUVCPU<from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1071  else if (dim==2) ComputeUVCPU<from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1072  else if (dim==3) ComputeUVCPU<from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1073  } else if (dir == QUDA_FORWARDS) {
1074  if (dim==0) ComputeUVCPU<from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1075  else if (dim==1) ComputeUVCPU<from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1076  else if (dim==2) ComputeUVCPU<from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1077  else if (dim==3) ComputeUVCPU<from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1078  } else {
1079  errorQuda("Undefined direction %d", dir);
1080  }
1081 
1082  } else if (type == COMPUTE_AV) {
1083 
1084  if (from_coarse) errorQuda("ComputeAV should only be called from the fine grid");
1085  ComputeAVCPU<Float,fineSpin,fineColor,coarseColor>(arg);
1086 
1087  } else if (type == COMPUTE_TMAV) {
1088 
1089  if (from_coarse) errorQuda("ComputeTMAV should only be called from the fine grid");
1090  ComputeTMAVCPU<Float,fineSpin,fineColor,coarseColor>(arg);
1091 
1092  } else if (type == COMPUTE_TMCAV) {
1093 
1094  if (from_coarse) errorQuda("ComputeTMCAV should only be called from the fine grid");
1095  ComputeTMCAVCPU<Float,fineSpin,fineColor,coarseColor>(arg);
1096 
1097  } else if (type == COMPUTE_VUV) {
1098 
1099  if (dir == QUDA_BACKWARDS) {
1100  if (dim==0) ComputeVUVCPU<from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1101  else if (dim==1) ComputeVUVCPU<from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1102  else if (dim==2) ComputeVUVCPU<from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1103  else if (dim==3) ComputeVUVCPU<from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1104  } else if (dir == QUDA_FORWARDS) {
1105  if (dim==0) ComputeVUVCPU<from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1106  else if (dim==1) ComputeVUVCPU<from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1107  else if (dim==2) ComputeVUVCPU<from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1108  else if (dim==3) ComputeVUVCPU<from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
1109  } else {
1110  errorQuda("Undefined direction %d", dir);
1111  }
1112 
1113  } else if (type == COMPUTE_COARSE_CLOVER) {
1114 
1115  ComputeCoarseCloverCPU<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>(arg);
1116 
1117  } else if (type == COMPUTE_REVERSE_Y) {
1118 
1119  ComputeYReverseCPU<Float,coarseSpin,coarseColor>(arg);
1120 
1121  } else if (type == COMPUTE_COARSE_LOCAL) {
1122 
1123  if (bidirectional) ComputeCoarseLocalCPU<true,Float,coarseSpin,coarseColor>(arg);
1124  else ComputeCoarseLocalCPU<false,Float,coarseSpin,coarseColor>(arg);
1125 
1126  } else if (type == COMPUTE_DIAGONAL) {
1127 
1128  AddCoarseDiagonalCPU<Float,coarseSpin,coarseColor>(arg);
1129 
1130  } else if (type == COMPUTE_TMDIAGONAL) {
1131 
1132  AddCoarseTmDiagonalCPU<Float,coarseSpin,coarseColor>(arg);
1133 
1134  } else {
1135  errorQuda("Undefined compute type %d", type);
1136  }
1137  } else {
1138 
1139  if (type == COMPUTE_UV) {
1140 
1141  if (dir == QUDA_BACKWARDS) {
1142  if (dim==0) ComputeUVGPU<from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1143  else if (dim==1) ComputeUVGPU<from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1144  else if (dim==2) ComputeUVGPU<from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1145  else if (dim==3) ComputeUVGPU<from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1146  } else if (dir == QUDA_FORWARDS) {
1147  if (dim==0) ComputeUVGPU<from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1148  else if (dim==1) ComputeUVGPU<from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1149  else if (dim==2) ComputeUVGPU<from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1150  else if (dim==3) ComputeUVGPU<from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1151  } else {
1152  errorQuda("Undefined direction %d", dir);
1153  }
1154 
1155  } else if (type == COMPUTE_AV) {
1156 
1157  if (from_coarse) errorQuda("ComputeAV should only be called from the fine grid");
1158  ComputeAVGPU<Float,fineSpin,fineColor,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1159 
1160  } else if (type == COMPUTE_TMAV) {
1161 
1162  if (from_coarse) errorQuda("ComputeTMAV should only be called from the fine grid");
1163  ComputeTMAVGPU<Float,fineSpin,fineColor,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1164 
1165  } else if (type == COMPUTE_TMCAV) {
1166 
1167  if (from_coarse) errorQuda("ComputeTMCAV should only be called from the fine grid");
1168  ComputeTMCAVGPU<Float,fineSpin,fineColor,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1169 
1170  } else if (type == COMPUTE_VUV) {
1171 
1172  if (dir == QUDA_BACKWARDS) {
1173  if (dim==0) ComputeVUVGPU<from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1174  else if (dim==1) ComputeVUVGPU<from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1175  else if (dim==2) ComputeVUVGPU<from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1176  else if (dim==3) ComputeVUVGPU<from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1177  } else if (dir == QUDA_FORWARDS) {
1178  if (dim==0) ComputeVUVGPU<from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1179  else if (dim==1) ComputeVUVGPU<from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1180  else if (dim==2) ComputeVUVGPU<from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1181  else if (dim==3) ComputeVUVGPU<from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1182  } else {
1183  errorQuda("Undefined direction %d", dir);
1184  }
1185 
1186  } else if (type == COMPUTE_COARSE_CLOVER) {
1187 
1188  ComputeCoarseCloverGPU<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>
1189  <<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1190 
1191  } else if (type == COMPUTE_REVERSE_Y) {
1192 
1193  ComputeYReverseGPU<Float,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1194 
1195  } else if (type == COMPUTE_COARSE_LOCAL) {
1196 
1197  if (bidirectional) ComputeCoarseLocalGPU<true,Float,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1198  else ComputeCoarseLocalGPU<false,Float,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1199 
1200  } else if (type == COMPUTE_DIAGONAL) {
1201 
1202  AddCoarseDiagonalGPU<Float,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1203 
1204  } else if (type == COMPUTE_TMDIAGONAL) {
1205 
1206  AddCoarseTmDiagonalGPU<Float,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1207 
1208  } else {
1209  errorQuda("Undefined compute type %d", type);
1210  }
1211  }
1212  }
1213 
    /** Set which lattice dimension (0-3) subsequent UV/VUV computations act on */
    void setDimension(int dim_) { dim = dim_; }
1218 
    /** Set the link direction (QUDA_BACKWARDS or QUDA_FORWARDS) for subsequent UV/VUV computations */
    void setDirection(QudaDirection dir_) { dir = dir_; }
1223 
1228  type = type_;
1229  switch(type) {
1230  case COMPUTE_UV:
1231  case COMPUTE_AV:
1232  case COMPUTE_TMAV:
1233  case COMPUTE_VUV:
1234  case COMPUTE_COARSE_CLOVER:
1235  resizeVector(2,coarseColor);
1236  break;
1237  default:
1238  resizeVector(2,1);
1239  break;
1240  }
1241  }
1242 
1245  else return false;
1246  }
1247 
1248  TuneKey tuneKey() const {
1249  char Aux[TuneKey::aux_n];
1250  strcpy(Aux,aux);
1251 
1252  if (type == COMPUTE_UV) strcat(Aux,",computeUV");
1253  else if (type == COMPUTE_AV) strcat(Aux,",computeAV");
1254  else if (type == COMPUTE_TMAV) strcat(Aux,",computeTmAV");
1255  else if (type == COMPUTE_TMCAV) strcat(Aux,",computeTmcAV");
1256  else if (type == COMPUTE_VUV) strcat(Aux,",computeVUV");
1257  else if (type == COMPUTE_COARSE_CLOVER) strcat(Aux,",computeCoarseClover");
1258  else if (type == COMPUTE_REVERSE_Y) strcat(Aux,",computeYreverse");
1259  else if (type == COMPUTE_COARSE_LOCAL) strcat(Aux,",computeCoarseLocal");
1260  else if (type == COMPUTE_DIAGONAL) strcat(Aux,",computeCoarseDiagonal");
1261  else if (type == COMPUTE_TMDIAGONAL) strcat(Aux,",computeCoarseTmDiagonal");
1262  else errorQuda("Unknown type=%d\n", type);
1263 
1264  if (type == COMPUTE_UV || type == COMPUTE_VUV) {
1265  if (dim == 0) strcat(Aux,",dim=0");
1266  else if (dim == 1) strcat(Aux,",dim=1");
1267  else if (dim == 2) strcat(Aux,",dim=2");
1268  else if (dim == 3) strcat(Aux,",dim=3");
1269 
1270  if (dir == QUDA_BACKWARDS) strcat(Aux,",dir=back");
1271  else if (dir == QUDA_FORWARDS) strcat(Aux,",dir=fwd");
1272  }
1273 
1274  if (type == COMPUTE_VUV || type == COMPUTE_COARSE_CLOVER) {
1275  strcat(Aux,meta.Location()==QUDA_CUDA_FIELD_LOCATION ? ",GPU," : ",CPU,");
1276  strcat(Aux,"coarse_vol=");
1277  strcat(Aux,X.VolString());
1278  } else {
1279  strcat(Aux,meta.Location()==QUDA_CUDA_FIELD_LOCATION ? ",GPU" : ",CPU");
1280  }
1281 
1282  return TuneKey(meta.VolString(), typeid(*this).name(), Aux);
1283  }
1284 
    /**
       Back up the coarse fields the selected compute type writes to, so
       autotuning trial launches do not corrupt them.  The switch
       fall-throughs are intentional and encode a superset relation:
       COMPUTE_VUV backs up Y, Xinv and (by falling through) X, while
       the local/diagonal/clover types back up only X; the remaining
       types write none of these fields.
    */
    void preTune() {
      switch (type) {
      case COMPUTE_VUV:
        Y.backup();
        Xinv.backup();
        // intentional fall-through: VUV also backs up X
      case COMPUTE_COARSE_LOCAL:
      case COMPUTE_DIAGONAL:
      case COMPUTE_TMDIAGONAL:
      case COMPUTE_COARSE_CLOVER:
        X.backup();
        // intentional fall-through to the no-op cases
      case COMPUTE_UV:
      case COMPUTE_AV:
      case COMPUTE_TMAV:
      case COMPUTE_TMCAV:
      case COMPUTE_REVERSE_Y:
        break;
      default:
        errorQuda("Undefined compute type %d", type);
      }
    }
1305 
    /**
       Restore the fields saved by preTune() once tuning completes.
       Mirrors preTune() exactly: the switch fall-throughs are
       intentional (VUV restores Y, Xinv and X; the local/diagonal/
       clover types restore only X; the remaining types restore nothing).
    */
    void postTune() {
      switch (type) {
      case COMPUTE_VUV:
        Y.restore();
        Xinv.restore();
        // intentional fall-through: VUV also restores X
      case COMPUTE_COARSE_LOCAL:
      case COMPUTE_DIAGONAL:
      case COMPUTE_TMDIAGONAL:
      case COMPUTE_COARSE_CLOVER:
        X.restore();
        // intentional fall-through to the no-op cases
      case COMPUTE_UV:
      case COMPUTE_AV:
      case COMPUTE_TMAV:
      case COMPUTE_TMCAV:
      case COMPUTE_REVERSE_Y:
        break;
      default:
        errorQuda("Undefined compute type %d", type);
      }
    }
1326  };
1327 
1328 
1329  template <typename Flloat, typename Gauge, int n>
1331  Gauge Yhat;
1332  const Gauge Y;
1333  const Gauge Xinv;
1336  int nFace;
1337  const int coarseVolumeCB;
1339  CalculateYhatArg(const Gauge &Yhat, const Gauge Y, const Gauge Xinv, const int *dim, const int *comm_dim, int nFace)
1340  : Yhat(Yhat), Y(Y), Xinv(Xinv), nFace(nFace), coarseVolumeCB(Y.VolumeCB()) {
1341  for (int i=0; i<4; i++) {
1342  this->comm_dim[i] = comm_dim[i];
1343  this->dim[i] = dim[i];
1344  }
1345  }
1346  };
1347 
  /**
     Compute one row (index i) of the preconditioned coarse links Yhat
     at site (parity, x_cb) in dimension d:

       backwards: Yhat_back(x-\mu) = Y_back(x-\mu) * Xinv^\dagger(x)
       forwards:  Yhat_fwd(x)      = Xinv(x) * Y_fwd(x)

     When the backwards neighbour lies off-node (partitioned dimension
     and site within nFace of the boundary) the backwards product is
     written into the ghost buffer instead of the local bulk.
  */
  template<typename Float, int n, typename Arg>
  __device__ __host__ void computeYhat(Arg &arg, int d, int x_cb, int parity, int i) {

    int coord[5];
    getCoords(coord, x_cb, arg.dim, parity);
    coord[4] = 0; // 5th coordinate unused here

    const int ghost_idx = ghostFaceIndex<0>(coord, arg.dim, d, arg.nFace);

    // first do the backwards links Y^{+\mu} * X^{-\dagger}
    if ( arg.comm_dim[d] && (coord[d] - arg.nFace < 0) ) {
      // backwards neighbour is off-node: accumulate row i into the ghost zone
      for(int j = 0; j<n; j++) {
        arg.Yhat.Ghost(d,1-parity,ghost_idx,i,j) = 0.0;
        for(int k = 0; k<n; k++) {
          // conj(Xinv(j,k)) with indices (j,k) swapped relative to a plain
          // product implements multiplication by Xinv^\dagger on the right
          arg.Yhat.Ghost(d,1-parity,ghost_idx,i,j) += arg.Y.Ghost(d,1-parity,ghost_idx,i,k) * conj(arg.Xinv(0,parity,x_cb,j,k));
        }
      }

    } else {
      // backwards neighbour is local: accumulate into the bulk at x-\mu
      const int back_idx = linkIndexM1(coord, arg.dim, d);

      for(int j = 0; j<n; j++) {
        arg.Yhat(d,1-parity,back_idx,i,j) = 0.0;
        for(int k = 0; k<n; k++) {
          arg.Yhat(d,1-parity,back_idx,i,j) += arg.Y(d,1-parity,back_idx,i,k) * conj(arg.Xinv(0,parity,x_cb,j,k));
        }
      }

    }

    // now do the forwards links X^{-1} * Y^{-\mu} (stored at offset d+4)
    for(int j = 0; j<n; j++) {
      arg.Yhat(d+4,parity,x_cb,i,j) = 0.0;
      for(int k = 0; k<n; k++) {
        arg.Yhat(d+4,parity,x_cb,i,j) += arg.Xinv(0,parity,x_cb,i,k) * arg.Y(d+4,parity,x_cb,k,j);
      }
    }

  }
1388 
1389  template<typename Float, int n, typename Arg>
1390  void CalculateYhatCPU(Arg &arg) {
1391 
1392  for (int d=0; d<4; d++) {
1393  for (int parity=0; parity<2; parity++) {
1394  for (int x_cb=0; x_cb<arg.Y.VolumeCB(); x_cb++) {
1395  for (int i=0; i<n; i++) computeYhat<Float,n>(arg, d, x_cb, parity, i);
1396  } // x_cb
1397  } //parity
1398  } // dimension
1399  }
1400 
1401  template<typename Float, int n, typename Arg>
1402  __global__ void CalculateYhatGPU(Arg arg) {
1403  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
1404  if (x_cb >= arg.coarseVolumeCB) return;
1405  int i_parity = blockDim.y*blockIdx.y + threadIdx.y;
1406  if (i_parity >= 2*n) return;
1407  int d = blockDim.z*blockIdx.z + threadIdx.z;
1408  if (d >= 4) return;
1409 
1410  int i = i_parity % n;
1411  int parity = i_parity / n;
1412  // first do the backwards links Y^{+\mu} * X^{-\dagger}
1413  computeYhat<Float,n>(arg, d, x_cb, parity, i);
1414  }
1415 
1416  template <typename Float, int n, typename Arg>
1418 
1419  protected:
1420  Arg &arg;
1422 
    // per-entry cost is an n-term complex dot product (6n mults + 2(n-1) adds = 8n-2 real ops),
    // over n*n entries, 8 link directions and 2 parities
    long long flops() const { return 2l * arg.coarseVolumeCB * 8 * n * n * (8*n-2); } // 8 from dir, 8 from complexity,
    // Xinv read once; Y read and Yhat written for all 8 link directions; 2 from parity
    long long bytes() const { return 2l * (arg.Xinv.Bytes() + 8*arg.Y.Bytes() + 8*arg.Yhat.Bytes()); }

    // one x-thread per coarse checkerboard site
    unsigned int minThreads() const { return arg.coarseVolumeCB; }

    bool tuneGridDim() const { return false; } // don't tune the grid dimension
1429 
1430  public:
1432  {
1434  }
1435  virtual ~CalculateYhat() { }
1436 
1437  void apply(const cudaStream_t &stream) {
1438  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1440  CalculateYhatCPU<Float,n,Arg>(arg);
1441  } else {
1442  CalculateYhatGPU<Float,n,Arg> <<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
1443  }
1444  }
1445 
1448  else return false;
1449  }
1450 
    /** Tuning key: the meta field's volume/aux strings plus the execution location */
    TuneKey tuneKey() const {
      char Aux[TuneKey::aux_n];
      strcpy(Aux,aux);
      strcat(Aux,meta.Location()==QUDA_CUDA_FIELD_LOCATION ? ",GPU" : ",CPU");
      return TuneKey(meta.VolString(), typeid(*this).name(), Aux);
    }
1457  };
1458 
1459 
1485  template<bool from_coarse, typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor,
1486  QudaGaugeFieldOrder gOrder, typename F, typename Ftmp, typename coarseGauge, typename fineGauge, typename fineClover>
1487  void calculateY(coarseGauge &Y, coarseGauge &X, coarseGauge &Xinv, Ftmp &UV, F &AV, F &V, fineGauge &G, fineClover &C, fineClover &Cinv,
1488  GaugeField &Y_, GaugeField &X_, GaugeField &Xinv_, GaugeField &Yhat_, ColorSpinorField &av, const ColorSpinorField &v,
1489  double kappa, double mu, double mu_factor, QudaDiracType dirac, QudaMatPCType matpc) {
1490 
1491  // sanity checks
1493  errorQuda("Unsupported coarsening of matpc = %d", matpc);
1494 
1495  bool is_dirac_coarse = (dirac == QUDA_COARSE_DIRAC || dirac == QUDA_COARSEPC_DIRAC) ? true : false;
1496  if (is_dirac_coarse && fineSpin != 2)
1497  errorQuda("Input Dirac operator %d should have nSpin=2, not nSpin=%d\n", dirac, fineSpin);
1498  if (!is_dirac_coarse && fineSpin != 4)
1499  errorQuda("Input Dirac operator %d should have nSpin=4, not nSpin=%d\n", dirac, fineSpin);
1500  if (!is_dirac_coarse && fineColor != 3)
1501  errorQuda("Input Dirac operator %d should have nColor=3, not nColor=%d\n", dirac, fineColor);
1502 
1503  if (G.Ndim() != 4) errorQuda("Number of dimensions not supported");
1504  const int nDim = 4;
1505 
1506  int x_size[5];
1507  for (int i=0; i<4; i++) x_size[i] = v.X(i);
1508  x_size[4] = 1;
1509 
1510  int xc_size[5];
1511  for (int i=0; i<4; i++) xc_size[i] = X_.X()[i];
1512  xc_size[4] = 1;
1513 
1514  int geo_bs[QUDA_MAX_DIM];
1515  for(int d = 0; d < nDim; d++) geo_bs[d] = x_size[d]/xc_size[d];
1516  int spin_bs = V.Nspin()/Y.NspinCoarse();
1517 
1518  //Calculate UV and then VUV for each dimension, accumulating directly into the coarse gauge field Y
1519 
1521  Arg arg(Y, X, Xinv, UV, AV, G, V, C, Cinv, kappa, mu, mu_factor, x_size, xc_size, geo_bs, spin_bs);
1523 
1524  QudaFieldLocation location = checkLocation(Y_, X_, Xinv_, Yhat_, av, v);
1525  printfQuda("Running link coarsening on the %s\n", location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU");
1526 
1527  // If doing a preconditioned operator with a clover term then we
1528  // have bi-directional links, though we can do the bidirectional setup for all operators for debugging
1529  bool bidirectional_links = (dirac == QUDA_CLOVERPC_DIRAC || dirac == QUDA_COARSEPC_DIRAC || bidirectional_debug ||
1531  if (bidirectional_links) printfQuda("Doing bi-directional link coarsening\n");
1532  else printfQuda("Doing uni-directional link coarsening\n");
1533 
1534  printfQuda("V2 = %e\n", V.norm2());
1535 
1536  // do exchange of null-space vectors
1537  const int nFace = 1;
1538  v.exchangeGhost(QUDA_INVALID_PARITY, nFace, 0);
1539  arg.V.resetGhost(v.Ghost()); // point the accessor to the correct ghost buffer
1540  if (&v == &av) arg.AV.resetGhost(av.Ghost());
1541  LatticeField::bufferIndex = (1 - LatticeField::bufferIndex); // update ghost bufferIndex for next exchange
1542 
1543  // If doing preconditioned clover then we first multiply the
1544  // null-space vectors by the clover inverse matrix, since this is
1545  // needed for the coarse link computation
1547  printfQuda("Computing AV\n");
1548 
1549  y.setComputeType(COMPUTE_AV);
1550  y.apply(0);
1551 
1552  printfQuda("AV2 = %e\n", AV.norm2());
1553  }
1554 
1555  // If doing preconditioned twisted-mass then we first multiply the
1556  // null-space vectors by the inverse twist, since this is
1557  // needed for the coarse link computation
1559  printfQuda("Computing TMAV\n");
1560 
1561  y.setComputeType(COMPUTE_TMAV);
1562  y.apply(0);
1563 
1564  printfQuda("AV2 = %e\n", AV.norm2());
1565  }
1566 
1567  // If doing preconditioned twisted-clover then we first multiply the
1568  // null-space vectors by the inverse of the squared clover matrix plus
1569  // mu^2, and then we multiply the result by the clover matrix. This is
1570  // needed for the coarse link computation
1572  printfQuda("Computing TMCAV\n");
1573 
1574  y.setComputeType(COMPUTE_TMCAV);
1575  y.apply(0);
1576 
1577  printfQuda("AV2 = %e\n", AV.norm2());
1578  }
1579 
1580  // First compute the coarse forward links if needed
1581  if (bidirectional_links) {
1582  for (int d = 0; d < nDim; d++) {
1583  y.setDimension(d);
1584  y.setDirection(QUDA_FORWARDS);
1585  printfQuda("Computing forward %d UV and VUV\n", d);
1586 
1587  y.setComputeType(COMPUTE_UV); // compute U*V product
1588  y.apply(0);
1589  printfQuda("UV2[%d] = %e\n", d, UV.norm2());
1590 
1591  y.setComputeType(COMPUTE_VUV); // compute Y += VUV
1592  y.apply(0);
1593  printfQuda("Y2[%d] = %e\n", d, Y.norm2(4+d));
1594  }
1595  }
1596 
1599  av.exchangeGhost(QUDA_INVALID_PARITY, nFace, 0);
1600  arg.AV.resetGhost(av.Ghost()); // make sure we point to the correct pointer in the accessor
1601  LatticeField::bufferIndex = (1 - LatticeField::bufferIndex); // update ghost bufferIndex for next exchange
1602  }
1603 
1604  // Now compute the backward links
1605  for (int d = 0; d < nDim; d++) {
1606  y.setDimension(d);
1607  y.setDirection(QUDA_BACKWARDS);
1608  printfQuda("Computing backward %d UV and VUV\n", d);
1609 
1610  y.setComputeType(COMPUTE_UV); // compute U*A*V product
1611  y.apply(0);
1612  printfQuda("UAV2[%d] = %e\n", d, UV.norm2());
1613 
1614  y.setComputeType(COMPUTE_VUV); // compute Y += VUV
1615  y.apply(0);
1616  printfQuda("Y2[%d] = %e\n", d, Y.norm2(d));
1617  }
1618  printfQuda("X2 = %e\n", X.norm2(0));
1619 
1620  cudaDeviceSynchronize(); checkCudaError();
1621 
1622  // if not doing a preconditioned operator then we can trivially
1623  // construct the forward links from the backward links
1624  if ( !bidirectional_links ) {
1625  printfQuda("Reversing links\n");
1626  y.setComputeType(COMPUTE_REVERSE_Y); // reverse the links for the forwards direction
1627  y.apply(0);
1628  }
1629 
1630  cudaDeviceSynchronize(); checkCudaError();
1631 
1632  printfQuda("Computing coarse local\n");
1633  y.setComputeType(COMPUTE_COARSE_LOCAL);
1634  y.apply(0);
1635  printfQuda("X2 = %e\n", X.norm2(0));
1636 
1637  cudaDeviceSynchronize(); checkCudaError();
1638 
1639  // Check if we have a clover term that needs to be coarsened
1641  printfQuda("Computing fine->coarse clover term\n");
1642  y.setComputeType(COMPUTE_COARSE_CLOVER);
1643  y.apply(0);
1644  } else { //Otherwise, we just have to add the identity matrix
1645  printfQuda("Summing diagonal contribution to coarse clover\n");
1646  y.setComputeType(COMPUTE_DIAGONAL);
1647  y.apply(0);
1648  }
1649 
1650  cudaDeviceSynchronize(); checkCudaError();
1651 
1652  if (arg.mu*arg.mu_factor!=0 || dirac == QUDA_TWISTED_MASS_DIRAC || dirac == QUDA_TWISTED_CLOVER_DIRAC) {
1654  arg.mu_factor += 1.;
1655  printfQuda("Adding mu = %e\n",arg.mu*arg.mu_factor);
1656  y.setComputeType(COMPUTE_TMDIAGONAL);
1657  y.apply(0);
1658  }
1659 
1660  cudaDeviceSynchronize(); checkCudaError();
1661 
1662  printfQuda("X2 = %e\n", X.norm2(0));
1663 
1664  // invert the clover matrix field
1665  const int n = X_.Ncolor();
1667  GaugeFieldParam param(X_);
1668  // need to copy into AoS format for MAGMA
1669  param.order = QUDA_MILC_GAUGE_ORDER;
1671  cudaGaugeField Xinv(param);
1672  X.copy(X_);
1673  blas::flops += cublas::BatchInvertMatrix((void*)Xinv.Gauge_p(), (void*)X.Gauge_p(), n, X.Volume(), X_.Precision(), X.Location());
1674  Xinv_.copy(Xinv);
1675  } else if (X_.Location() == QUDA_CPU_FIELD_LOCATION && X_.Order() == QUDA_QDP_GAUGE_ORDER) {
1676  cpuGaugeField *X_h = static_cast<cpuGaugeField*>(&X_);
1677  cpuGaugeField *Xinv_h = static_cast<cpuGaugeField*>(&Xinv_);
1678  blas::flops += cublas::BatchInvertMatrix(((void**)Xinv_h->Gauge_p())[0], ((void**)X_h->Gauge_p())[0], n, X_h->Volume(), X_.Precision(), QUDA_CPU_FIELD_LOCATION);
1679  } else {
1680  errorQuda("Unsupported location=%d and order=%d", X_.Location(), X_.Order());
1681  }
1682 
1683  // now exchange Y halos of both forwards and backwards links for multi-process dslash
1685 
1686  // compute the preconditioned links
1687  // Yhat_back(x-\mu) = Y_back(x-\mu) * Xinv^dagger(x) (positive projector)
1688  // Yhat_fwd(x) = Xinv(x) * Y_fwd(x) (negative projector)
1689  {
1690  // use spin-ignorant accessor to make multiplication simpler
1691  // also with new accessor we ensure we're accessing the same ghost buffer in Y_ as was just exchanged
1693  gCoarse yAccessor(const_cast<GaugeField&>(Y_));
1694  gCoarse yHatAccessor(const_cast<GaugeField&>(Yhat_));
1695  gCoarse xInvAccessor(const_cast<GaugeField&>(Xinv_));
1696  printfQuda("Xinv = %e\n", xInvAccessor.norm2(0));
1697 
1698  int comm_dim[4];
1699  for (int i=0; i<4; i++) comm_dim[i] = comm_dim_partitioned(i);
1701  yHatArg arg(yHatAccessor, yAccessor, xInvAccessor, xc_size, comm_dim, 1);
1703  yHat.apply(0);
1704 
1705  for (int d=0; d<8; d++) printfQuda("Yhat[%d] = %e\n", d, Y.norm2(d));
1706  }
1707 
1708  // fill back in the bulk of Yhat so that the backward link is updated on the previous node
1709  // need to put this in the bulk of the previous node - but only send backwards the backwards links to and not overwrite the forwards bulk
1711 
1712  // exchange forwards links for multi-process dslash dagger
1713  // need to put this in the ghost zone of the next node - but only send forwards the forwards links and not overwrite the backwards ghost
1715 
1716  }
1717 
1718 
1719 
1720 } // namespace quda
__device__ __host__ void multiplyVUV(complex< Float > vuv[], Arg &arg, int parity, int x_cb, int ic_c)
Do a single (AV)^\dagger * UV product, where for preconditioned clover, AV corresponds to the clover inverse ...
Definition: coarse_op.cuh:494
int comm_dim[QUDA_MAX_DIM]
Definition: coarse_op.cuh:1335
void setDirection(QudaDirection dir_)
Definition: coarse_op.cuh:1222
TuneKey tuneKey() const
Definition: coarse_op.cuh:1248
dim3 dim3 blockDim
long long flops() const
Definition: coarse_op.cuh:946
__global__ void ComputeVUVGPU(Arg arg)
Definition: coarse_op.cuh:624
int dim[QUDA_MAX_DIM]
Definition: coarse_op.cuh:1334
double mu
Definition: test_util.cpp:1643
void resizeVector(int y, int z)
Definition: tune_quda.h:448
const int coarseVolumeCB
Definition: coarse_op.cuh:38
const char * comm_dim_partitioned_string()
Return a string that defines the comm partitioning (used as a tuneKey)
Definition: comm_mpi.cpp:342
long long bytes() const
Definition: coarse_op.cuh:990
long long bytes() const
Definition: coarse_op.cuh:1424
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:896
const char * AuxString() const
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
int comm_dim[QUDA_MAX_DIM]
Definition: coarse_op.cuh:31
__device__ __host__ void computeCoarseClover(Arg &arg, int parity, int x_cb, int ic_c)
Definition: coarse_op.cuh:748
#define errorQuda(...)
Definition: util_quda.h:90
double mu_factor[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1659
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
int comm_dim(int dim)
__device__ __host__ void computeUV(Arg &arg, int parity, int x_cb, int ic_c)
Definition: coarse_op.cuh:62
cudaStream_t * stream
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
__global__ void ComputeCoarseCloverGPU(Arg arg)
Definition: coarse_op.cuh:833
char * strcpy(char *__dst, const char *__src)
CalculateY(Arg &arg, QudaDiracType dirac, const ColorSpinorField &meta, GaugeField &Y, GaugeField &X, GaugeField &Xinv)
Definition: coarse_op.cuh:1051
const char * VolString() const
virtual void restore()
Restores the cpuGaugeField.
int xc_size[QUDA_MAX_DIM]
Definition: coarse_op.cuh:26
char * strcat(char *__s1, const char *__s2)
__global__ void ComputeTMCAVGPU(Arg arg)
Definition: coarse_op.cuh:474
void ComputeTMAVCPU(Arg &arg)
Definition: coarse_op.cuh:229
__device__ __host__ void computeAV(Arg &arg, int parity, int x_cb, int ic_c)
Definition: coarse_op.cuh:157
coarseGauge Xinv
Definition: coarse_op.cuh:15
virtual ~CalculateY()
Definition: coarse_op.cuh:1059
unsigned int minThreads() const
Definition: coarse_op.cuh:1426
QudaGaugeParam param
Definition: pack_test.cpp:17
void matpc(void *outEven, void **gauge, void *inEven, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision, double mferm)
int Ncolor() const
Definition: gauge_field.h:202
long long flops() const
Definition: coarse_op.cuh:1423
void ComputeVUVCPU(Arg arg)
Definition: coarse_op.cuh:614
enum QudaDirection_s QudaDirection
void AddCoarseTmDiagonalCPU(Arg &arg)
Definition: coarse_op.cuh:875
const int fineVolumeCB
Definition: coarse_op.cuh:37
GaugeField & X
Definition: coarse_op.cuh:938
TuneKey tuneKey() const
Definition: coarse_op.cuh:1451
static int bufferIndex
const fineClover Cinv
Definition: coarse_op.cuh:23
VOLATILE spinorFloat kappa
virtual void exchangeGhost(QudaParity parity, int nFace, int dagger, const MemoryLocation *pack_destination=nullptr, const MemoryLocation *halo_location=nullptr, bool gdr_send=false, bool gdr_recv=false) const =0
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
unsigned int minThreads() const
Definition: coarse_op.cuh:1025
__device__ __host__ void computeCoarseLocal(Arg &arg, int parity, int x_cb)
Definition: coarse_op.cuh:686
const int nColor
Definition: covdev_test.cpp:77
void setComputeType(ComputeType type_)
Definition: coarse_op.cuh:1227
cpuGaugeField * Xinv_h
__host__ __device__ void sum(double &a, double &b)
__global__ void CalculateYhatGPU(Arg arg)
Definition: coarse_op.cuh:1402
long long BatchInvertMatrix(void *Ainv, void *A, const int n, const int batch, QudaPrecision precision, QudaFieldLocation location)
Definition: blas_cublas.cu:33
for(int s=0;s< param.dc.Ls;s++)
void setDimension(int dim_)
Definition: coarse_op.cuh:1217
int V
Definition: test_util.cpp:28
enum QudaMatPCType_s QudaMatPCType
fineSpinorTmp UV
Definition: coarse_op.cuh:17
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define checkLocation(...)
const fineGauge U
Definition: coarse_op.cuh:20
void CalculateYhatCPU(Arg &arg)
Definition: coarse_op.cuh:1390
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
int Volume() const
__global__ void ComputeUVGPU(Arg arg)
Definition: coarse_op.cuh:142
void apply(const cudaStream_t &stream)
Definition: coarse_op.cuh:1437
virtual void backup() const
Backs up the LatticeField.
__device__ __host__ void computeTMCAV(Arg &arg, int parity, int x_cb)
Definition: coarse_op.cuh:391
int x_size[QUDA_MAX_DIM]
Definition: coarse_op.cuh:25
double gamma(double) __attribute__((availability(macosx
virtual void injectGhost(QudaLinkDirection=QUDA_LINK_BACKWARDS)=0
__global__ void AddCoarseDiagonalGPU(Arg arg)
Definition: coarse_op.cuh:861
__device__ __host__ void computeYreverse(Arg &arg, int parity, int x_cb)
Definition: coarse_op.cuh:639
QudaDirection dir
Definition: coarse_op.cuh:942
__global__ void ComputeCoarseLocalGPU(Arg arg)
Definition: coarse_op.cuh:738
int geo_bs[QUDA_MAX_DIM]
Definition: coarse_op.cuh:28
void * Ghost(const int i)
QudaFieldLocation Location() const
GaugeField & Y
Definition: coarse_op.cuh:937
const fineSpinor V
Definition: coarse_op.cuh:21
bool advanceTuneParam(TuneParam &param) const
Definition: coarse_op.cuh:1446
enum QudaFieldLocation_s QudaFieldLocation
GaugeCovDev * dirac
Definition: covdev_test.cpp:75
const ColorSpinorField & meta
Definition: coarse_op.cuh:936
CalculateYhatArg(const Gauge &Yhat, const Gauge Y, const Gauge Xinv, const int *dim, const int *comm_dim, int nFace)
Definition: coarse_op.cuh:1339
void apply(const cudaStream_t &stream)
Definition: coarse_op.cuh:1061
cpuGaugeField * X_h
CalculateYArg(coarseGauge &Y, coarseGauge &X, coarseGauge &Xinv, fineSpinorTmp &UV, fineSpinor &AV, const fineGauge &U, const fineSpinor &V, const fineClover &C, const fineClover &Cinv, double kappa, double mu, double mu_factor, const int *x_size_, const int *xc_size_, int *geo_bs_, int spin_bs_)
Definition: coarse_op.cuh:40
__global__ void ComputeYReverseGPU(Arg arg)
Definition: coarse_op.cuh:670
static const int aux_n
Definition: tune_key.h:12
#define printfQuda(...)
Definition: util_quda.h:84
void ComputeCoarseCloverCPU(Arg &arg)
Definition: coarse_op.cuh:822
virtual ~CalculateYhat()
Definition: coarse_op.cuh:1435
unsigned long long flops
Definition: blas_quda.cu:42
ComputeType
Definition: coarse_op.cuh:916
virtual void exchangeGhost(QudaLinkDirection=QUDA_LINK_BACKWARDS)=0
void ComputeCoarseLocalCPU(Arg &arg)
Definition: coarse_op.cuh:729
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
static bool bidirectional_debug
Definition: coarse_op.cuh:7
const LatticeField & meta
Definition: coarse_op.cuh:1421
void ComputeAVCPU(Arg &arg)
Definition: coarse_op.cuh:184
bool tuneGridDim() const
Definition: coarse_op.cuh:1428
__device__ __host__ void computeYhat(Arg &arg, int d, int x_cb, int parity, int i)
Definition: coarse_op.cuh:1349
void AddCoarseDiagonalCPU(Arg &arg)
Definition: coarse_op.cuh:846
const void * c
ComputeType type
Definition: coarse_op.cuh:943
const int * X() const
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:204
void ComputeYReverseCPU(Arg &arg)
Definition: coarse_op.cuh:661
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
#define checkCudaError()
Definition: util_quda.h:129
CalculateYhat(Arg &arg, const LatticeField &meta)
Definition: coarse_op.cuh:1431
GaugeField & Xinv
Definition: coarse_op.cuh:939
__device__ __host__ void computeVUV(Arg &arg, int parity, int x_cb, int c_row)
Definition: coarse_op.cuh:570
__global__ void ComputeTMAVGPU(Arg arg)
Definition: coarse_op.cuh:239
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaPrecision Precision() const
static __device__ __host__ int linkIndexP1(const int x[], const I X[4], const int mu)
void ComputeUVCPU(Arg &arg)
Definition: coarse_op.cuh:132
__global__ void AddCoarseTmDiagonalGPU(Arg arg)
Definition: coarse_op.cuh:897
QudaParity parity
Definition: covdev_test.cpp:53
const fineClover C
Definition: coarse_op.cuh:22
__device__ __host__ void computeTMAV(Arg &arg, int parity, int x_cb, int v)
Definition: coarse_op.cuh:209
char aux[TuneKey::aux_n]
Definition: tune_quda.h:189
void ComputeTMCAVCPU(Arg &arg)
Definition: coarse_op.cuh:465
virtual void copy(const GaugeField &src)=0
int comm_dim_partitioned(int dim)
void calculateY(coarseGauge &Y, coarseGauge &X, coarseGauge &Xinv, Ftmp &UV, F &AV, F &V, fineGauge &G, fineClover &C, fineClover &Cinv, GaugeField &Y_, GaugeField &X_, GaugeField &Xinv_, GaugeField &Yhat_, ColorSpinorField &av, const ColorSpinorField &v, double kappa, double mu, double mu_factor, QudaDiracType dirac, QudaMatPCType matpc)
Calculate the coarse-link field, include the clover field, and its inverse, and finally also compute ...
Definition: coarse_op.cuh:1487
enum QudaDiracType_s QudaDiracType
const int * X() const
bool tuneGridDim() const
Definition: coarse_op.cuh:1048
virtual bool advanceTuneParam(TuneParam &param) const
Definition: tune_quda.h:260
__global__ void ComputeAVGPU(Arg arg)
Definition: coarse_op.cuh:194
bool advanceTuneParam(TuneParam &param) const
Definition: coarse_op.cuh:1243
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)