QUDA  1.0.0
coarse_op_kernel.cuh
2 #include <gauge_field_order.h>
3 #include <clover_field_order.h>
4 #include <multigrid_helper.cuh>
5 #include <index_helper.cuh>
6 #include <gamma.cuh>
7 #include <linalg.cuh>
8 
9 #define max_color_per_block 8
10 
11 namespace quda {
12 
13  // this is the storage type used when computing the coarse link variables
14  // by using integers we have deterministic atomics
15  typedef int storeType;
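 // Integer (fixed-point) accumulation makes the atomic reductions deterministic:
 // integer addition is associative, so the order in which threads reach atomicAdd
 // cannot change the result, unlike floating-point accumulation. Values are
 // quantized as round(scale * x) before accumulation (see the gauge::fixed_point
 // branches in computeVUV below) and rescaled on conversion to the final format.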
16 
17  template <typename Float, int fineSpin, int coarseSpin, int fineColor, int coarseColor,
18  typename coarseGauge, typename coarseGaugeAtomic, typename fineGauge, typename fineSpinor,
19  typename fineSpinorTmp, typename fineSpinorV, typename fineClover>
20  struct CalculateYArg {
21 
22  coarseGauge Y;
23  coarseGauge X;
25  coarseGaugeAtomic Y_atomic;
26  coarseGaugeAtomic X_atomic;
28  fineSpinorTmp UV;
29  fineSpinor AV;
31  const fineGauge U;
32  const fineSpinorV V;
33  const fineClover C;
34  const fineClover Cinv;
 35  int_fastdiv x_size[QUDA_MAX_DIM]; // dimensions of the fine grid
 36  int xc_size[QUDA_MAX_DIM]; // dimensions of the coarse grid
 37 
 38  int_fastdiv geo_bs[QUDA_MAX_DIM]; // geometric block dimensions
 39 
 40  const int spin_bs; // spin block size
 41  const spin_mapper<fineSpin,coarseSpin> spin_map; // maps fine spin to coarse spin
 42 
 43  int comm_dim[QUDA_MAX_DIM]; // node partitioning (from comm_dim_partitioned)
 44 
45  Float kappa;
46  Float mu;
47  Float mu_factor;
48  Float rescale;
50  const int fineVolumeCB;
51  const int coarseVolumeCB;
53  const int *fine_to_coarse;
54  const int *coarse_to_fine;
55 
56  const bool bidirectional;
57 
58  static constexpr int coarse_color = coarseColor;
59 
60  // To increase L2 locality we can schedule the geometry to grid.y and
61  // the coarse colors to grid.x. This will increase the potential for
62  // L2 reuse since a given wave of thread blocks will be for different
63  // coarse color but the same coarse grid point which will have common
64  // loads.
65  static constexpr bool coarse_color_wave = true;
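 // When enabled, the fine-grid geometry index is taken from blockIdx.y and the
 // combined parity / coarse-color index from blockIdx.x (see getIndicesShared and
 // getIndicesGlobal below), so consecutive thread blocks in a wave differ in coarse
 // color but share the coarse grid point and hence its loads.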
66 
67  // Enable this for shared-memory atomics instead of global atomics.
68  // Doing so means that all (modulo the parity) of the coarsening for a
69  // coarse degree of freedom is handled by a single thread block.
70  // For computeVUV only at present
 71  bool shared_atomic;
 72 
73  // With parity_flip enabled we make parity the slowest running
74  // dimension in the y-thread axis, and coarse color runs faster. This
75  // improves read locality at the expense of write locality
 76  bool parity_flip;
 77 
78  int_fastdiv aggregates_per_block; // number of aggregates per thread block
79  int_fastdiv grid_z; // this is the coarseColor grid that is wrapped into the x grid when coarse_color_wave is enabled
 80  int_fastdiv coarse_color_grid_z; // constant we need to divide by
81 
82  Float max_h; // scalar that stores the maximum element of the dynamic clover inverse
83  Float *max_d; // array that stores the maximum element per lattice site of the dynamic clover inverse
84 
85  int dim_index; // which direction / dimension we are working on
86 
87  CalculateYArg(coarseGauge &Y, coarseGauge &X,
88  coarseGaugeAtomic &Y_atomic, coarseGaugeAtomic &X_atomic,
89  fineSpinorTmp &UV, fineSpinor &AV, const fineGauge &U, const fineSpinorV &V,
90  const fineClover &C, const fineClover &Cinv, double kappa, double mu, double mu_factor,
91  const int *x_size_, const int *xc_size_, int *geo_bs_, int spin_bs_,
92  const int *fine_to_coarse, const int *coarse_to_fine, bool bidirectional)
93  : Y(Y), X(X), Y_atomic(Y_atomic), X_atomic(X_atomic),
94  UV(UV), AV(AV), U(U), V(V), C(C), Cinv(Cinv), spin_bs(spin_bs_), spin_map(),
95  kappa(static_cast<Float>(kappa)), mu(static_cast<Float>(mu)), mu_factor(static_cast<Float>(mu_factor)),
96  fineVolumeCB(V.VolumeCB()), coarseVolumeCB(X.VolumeCB()),
97  fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine),
98  bidirectional(bidirectional), shared_atomic(false), parity_flip(shared_atomic ? true : false),
99  aggregates_per_block(1), max_d(nullptr)
100  {
101  if (V.GammaBasis() != QUDA_DEGRAND_ROSSI_GAMMA_BASIS)
102  errorQuda("Gamma basis %d not supported", V.GammaBasis());
103 
104  for (int i=0; i<QUDA_MAX_DIM; i++) {
105  x_size[i] = x_size_[i];
106  xc_size[i] = xc_size_[i];
107  geo_bs[i] = geo_bs_[i];
108  comm_dim[i] = comm_dim_partitioned(i);
109  }
110  }
111 
113  };
114 
115  // complex multiply-add with optimal use of fma
116  template<typename Float>
117  inline __device__ __host__ void caxpy(const complex<Float> &a, const complex<Float> &x, complex<Float> &y) {
118  y.x += a.x*x.x;
119  y.x -= a.y*x.y;
120  y.y += a.y*x.x;
121  y.y += a.x*x.y;
122  }
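 // caxpy above accumulates the complex product
 //   y += a*x = (a.x*x.x - a.y*x.y) + i*(a.y*x.x + a.x*x.y)
 // as four separate multiply-adds so that each one can map onto a fused
 // multiply-add instruction instead of forming the full product first.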
123 
128  template<bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor,
129  int coarseSpin, int coarseColor, typename Wtype, typename Arg>
130  __device__ __host__ inline void computeUV(Arg &arg, const Wtype &W, int parity, int x_cb, int ic_c) {
131 
132  int coord[4];
133  getCoords(coord, x_cb, arg.x_size, parity);
134 
135  constexpr int uvSpin = fineSpin * (from_coarse ? 2 : 1);
136 
137  complex<Float> UV[uvSpin][fineColor];
138 
139  for(int s = 0; s < uvSpin; s++) {
140  for(int c = 0; c < fineColor; c++) {
141  UV[s][c] = static_cast<Float>(0.0);
142  }
143  }
144 
145  if ( arg.comm_dim[dim] && (coord[dim] + 1 >= arg.x_size[dim]) ) {
146  int nFace = 1;
147  int ghost_idx = ghostFaceIndex<1>(coord, arg.x_size, dim, nFace);
148 
149  for(int s = 0; s < fineSpin; s++) { //Fine Spin
150  for(int ic = 0; ic < fineColor; ic++) { //Fine Color rows of gauge field
151  for(int jc = 0; jc < fineColor; jc++) { //Fine Color columns of gauge field
152  if (!from_coarse) {
153  caxpy(arg.U(dim, parity, x_cb, ic, jc), W.Ghost(dim, 1, (parity+1)&1, ghost_idx, s, jc, ic_c), UV[s][ic]);
154  } else {
155  for (int s_col=0; s_col<fineSpin; s_col++) {
156  // on the coarse lattice if forwards then use the forwards links
157  caxpy(arg.U(dim + (dir == QUDA_FORWARDS ? 4 : 0), parity, x_cb, s, s_col, ic, jc),
158  W.Ghost(dim, 1, (parity+1)&1, ghost_idx, s_col, jc, ic_c), UV[s_col*fineSpin+s][ic]);
159  } // which chiral block
160  }
161  } //Fine color columns
162  } //Fine color rows
163  } //Fine Spin
164 
165  } else {
166  int y_cb = linkIndexP1(coord, arg.x_size, dim);
167 
168  for(int s = 0; s < fineSpin; s++) { //Fine Spin
169  for(int ic = 0; ic < fineColor; ic++) { //Fine Color rows of gauge field
170  for(int jc = 0; jc < fineColor; jc++) { //Fine Color columns of gauge field
171  if (!from_coarse) {
172  caxpy(arg.U(dim, parity, x_cb, ic, jc), W((parity+1)&1, y_cb, s, jc, ic_c), UV[s][ic]);
173  } else {
174  for (int s_col=0; s_col<fineSpin; s_col++) {
175  // on the coarse lattice if forwards then use the forwards links
176  caxpy(arg.U(dim + (dir == QUDA_FORWARDS ? 4 : 0), parity, x_cb, s, s_col, ic, jc),
177  W((parity+1)&1, y_cb, s_col, jc, ic_c), UV[s_col*fineSpin+s][ic]);
178  } // which chiral block
179  }
180  } //Fine color columns
181  } //Fine color rows
182  } //Fine Spin
183 
184  }
185 
186 
187  for(int s = 0; s < uvSpin; s++) {
188  for(int c = 0; c < fineColor; c++) {
189  arg.UV(parity,x_cb,s,c,ic_c) = UV[s][c];
190  }
191  }
192 
193 
194  } // computeUV
195 
196  template<bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg>
 197  void ComputeUVCPU(Arg &arg) {
 198 
199  for (int parity=0; parity<2; parity++) {
200 #pragma omp parallel for
201  for (int x_cb=0; x_cb<arg.fineVolumeCB; x_cb++) {
202  for (int ic_c=0; ic_c < coarseColor; ic_c++) // coarse color
203  if (dir == QUDA_FORWARDS) // only for preconditioned clover is V != AV
204  computeUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor>(arg, arg.V, parity, x_cb, ic_c);
205  else
206  computeUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor>(arg, arg.AV, parity, x_cb, ic_c);
207  } // c/b volume
208  } // parity
209  }
210 
211  template<bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg>
212  __global__ void ComputeUVGPU(Arg arg) {
213  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
214  if (x_cb >= arg.fineVolumeCB) return;
215 
216  int parity = blockDim.y*blockIdx.y + threadIdx.y;
217  int ic_c = blockDim.z*blockIdx.z + threadIdx.z; // coarse color
218  if (ic_c >= coarseColor) return;
219  if (dir == QUDA_FORWARDS) // only for preconditioned clover is V != AV
220  computeUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor>(arg, arg.V, parity, x_cb, ic_c);
221  else
222  computeUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor>(arg, arg.AV, parity, x_cb, ic_c);
223  }
224 
229  template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
230  __device__ __host__ inline void computeAV(Arg &arg, int parity, int x_cb, int ch, int ic_c)
231  {
232  constexpr int N = fineSpin * fineColor / 2;
 233  HMatrix<Float, N> A;
 234 
235 #pragma unroll
236  for (int i = 0; i < N; i++) {
237  int s_i = 2 * ch + i / fineColor;
238  int c_i = i % fineColor;
239 #pragma unroll
240  for (int j = 0; j <= i; j++) {
241  int s_j = 2 * ch + j / fineColor;
242  int c_j = j % fineColor;
243 #ifndef DYNAMIC_CLOVER
244  A(i, j) = arg.Cinv(0, parity, x_cb, s_i, s_j, c_i, c_j);
245 #else
246  A(i, j) = arg.C(0, parity, x_cb, s_i, s_j, c_i, c_j);
247 #endif
248  }
249  }
250 
251  ColorSpinor<Float, fineColor, fineSpin / 2> V;
252  for (int s = 0; s < fineSpin / 2; s++) {
253  for (int c = 0; c < fineColor; c++) { V(s, c) = arg.V(parity, x_cb, 2 * ch + s, c, ic_c); }
254  }
255 
256 #ifndef DYNAMIC_CLOVER
257  auto AV = A * V;
258 #else
259  // solve for the matrix
 260  linalg::Cholesky<HMatrix, Float, N> cholesky(A);
 261  auto AV = cholesky.backward(cholesky.forward(V));
262 #endif
263 
264 #pragma unroll
265  for (int s = 0; s < fineSpin / 2; s++) {
266 #pragma unroll
267  for (int ic = 0; ic < fineColor; ic++) { arg.AV(parity, x_cb, 2 * ch + s, ic, ic_c) = AV(s, ic); }
268  }
269 
270  } // computeAV
271 
272  template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg> void ComputeAVCPU(Arg &arg)
273  {
274  for (int parity=0; parity<2; parity++) {
275 #pragma omp parallel for
276  for (int x_cb=0; x_cb<arg.fineVolumeCB; x_cb++) {
277  for (int ch = 0; ch < 2; ch++) { // Loop over chiral blocks
278 
279  for (int ic_c = 0; ic_c < coarseColor; ic_c++) { // coarse color
280  computeAV<Float, fineSpin, fineColor, coarseColor>(arg, parity, x_cb, ch, ic_c);
281  }
282  }
283  } // c/b volume
284  } // parity
285  }
286 
287  template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
288  __global__ void ComputeAVGPU(Arg arg)
289  {
290  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
291  if (x_cb >= arg.fineVolumeCB) return;
292 
293  int ch_parity = blockDim.y * blockIdx.y + threadIdx.y;
294  if (ch_parity >= 4) return;
295  int ch = ch_parity % 2;
296  int parity = ch_parity / 2;
297 
298  int ic_c = blockDim.z * blockIdx.z + threadIdx.z; // coarse color
299  if (ic_c >= coarseColor) return;
300 
301  if (ch == 0)
302  computeAV<Float, fineSpin, fineColor, coarseColor>(arg, parity, x_cb, 0, ic_c);
303  else
304  computeAV<Float, fineSpin, fineColor, coarseColor>(arg, parity, x_cb, 1, ic_c);
305  }
306 
311  template<typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
312  __device__ __host__ inline void computeTMAV(Arg &arg, int parity, int x_cb, int v) {
313 
314  complex<Float> fp(1./(1.+arg.mu*arg.mu),-arg.mu/(1.+arg.mu*arg.mu));
315  complex<Float> fm(1./(1.+arg.mu*arg.mu),+arg.mu/(1.+arg.mu*arg.mu));
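 // fp = 1/(1 + i*mu) and fm = 1/(1 - i*mu), written out as (1 -/+ i*mu)/(1 + mu^2);
 // the first fineSpin/2 spin components are multiplied by fp and the remainder by fm,
 // so the two chiral halves receive opposite twist signs.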
316 
317  for(int s = 0; s < fineSpin/2; s++) {
318  for(int c = 0; c < fineColor; c++) {
319  arg.AV(parity,x_cb,s,c,v) = arg.V(parity,x_cb,s,c,v)*fp;
320  }
321  }
322 
323  for(int s = fineSpin/2; s < fineSpin; s++) {
324  for(int c = 0; c < fineColor; c++) {
325  arg.AV(parity,x_cb,s,c,v) = arg.V(parity,x_cb,s,c,v)*fm;
326  }
327  }
328 
329  } // computeTMAV
330 
331  template<typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
 332  void ComputeTMAVCPU(Arg &arg) {
 333  for (int parity=0; parity<2; parity++) {
334 #pragma omp parallel for
335  for (int x_cb=0; x_cb<arg.fineVolumeCB; x_cb++) {
336  for (int v=0; v<coarseColor; v++) // coarse color
337  computeTMAV<Float,fineSpin,fineColor,coarseColor,Arg>(arg, parity, x_cb, v);
338  } // c/b volume
339  } // parity
340  }
341 
342  template<typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
343  __global__ void ComputeTMAVGPU(Arg arg) {
344  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
345  if (x_cb >= arg.fineVolumeCB) return;
346 
347  int parity = blockDim.y*blockIdx.y + threadIdx.y;
348  int v = blockDim.z*blockIdx.z + threadIdx.z; // coarse color
349  if (v >= coarseColor) return;
350 
351  computeTMAV<Float,fineSpin,fineColor,coarseColor,Arg>(arg, parity, x_cb, v);
352  }
353 
354 #ifdef DYNAMIC_CLOVER
355 
360  template <typename Float, bool twist, typename Arg>
361  __device__ __host__ inline Float computeCloverInvMax(Arg &arg, int parity, int x_cb)
362  {
363 
364  Float max = 0.0;
365 
366  constexpr int nColor = 3;
367  constexpr int nSpin = 4;
368  constexpr int N = nColor * nSpin / 2;
369  typedef HMatrix<Float, N> Mat;
370 
371 #pragma unroll
372  for (int ch = 0; ch < 2; ch++) { /* Loop over chiral blocks */
373  Mat A;
374 
375 #pragma unroll
376  for (int i = 0; i < N; i++) {
377 #pragma unroll
378  for (int j = 0; j <= i; j++) {
379  A(i, j) = arg.C(0, parity, x_cb, 2 * ch + i / nColor, 2 * ch + j / nColor, i % nColor, j % nColor);
380  }
381  }
382 
383  if (twist) {
384  A = A.square();
385  A += arg.mu * arg.mu;
386  }
387 
 388  // compute the Cholesky decomposition
 389  linalg::Cholesky<HMatrix, Float, N> cholesky(A);
 390 
391  Mat Ainv = cholesky.invert();
392 
393  Float inv_max = Ainv.max();
394  max = max > inv_max ? max : inv_max;
395 
396  } // chirality
397 
398  return max;
399  }
400 
401  template <typename Float, bool twist, typename Arg> void ComputeCloverInvMaxCPU(Arg &arg)
402  {
403  Float max = 0.0;
404  for (int parity=0; parity<2; parity++) {
405 #pragma omp parallel for reduction(max:max)
406  for (int x_cb=0; x_cb<arg.fineVolumeCB; x_cb++) {
407  Float max_x = computeCloverInvMax<Float, twist, Arg>(arg, parity, x_cb);
408  max = max > max_x ? max : max_x;
409  } // c/b volume
410  } // parity
411  arg.max_h = max;
412  }
413 
414  template <typename Float, bool twist, typename Arg> __global__ void ComputeCloverInvMaxGPU(Arg arg)
415  {
416  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
417  if (x_cb >= arg.fineVolumeCB) return;
418 
419  int parity = blockDim.y*blockIdx.y + threadIdx.y;
420  arg.max_d[parity + 2 * x_cb] = computeCloverInvMax<Float, twist, Arg>(arg, parity, x_cb);
421  }
422 
423 #endif // DYNAMIC_CLOVER
424 
429  template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
430  __device__ __host__ inline void computeTMCAV(Arg &arg, int parity, int x_cb, int ch, int ic_c)
431  {
432  constexpr int N = fineSpin * fineColor / 2;
 433  HMatrix<Float, N> A;
 434 
435 #pragma unroll
436  for (int i = 0; i < N; i++) {
437  int s_i = 2 * ch + i / fineColor;
438  int c_i = i % fineColor;
439 #pragma unroll
440  for (int j = 0; j <= i; j++) {
441  int s_j = 2 * ch + j / fineColor;
442  int c_j = j % fineColor;
443  A(i, j) = arg.C(0, parity, x_cb, s_i, s_j, c_i, c_j);
444  }
445  }
446 
447  complex<Float> mu(0., arg.mu);
448  if (ch == 0) mu *= static_cast<Float>(-1.0);
449 
450  ColorSpinor<Float, fineColor, fineSpin / 2> V;
451 
452  for (int s = 0; s < fineSpin / 2; s++) {
453  for (int c = 0; c < fineColor; c++) { V(s, c) = arg.V(parity, x_cb, 2 * ch + s, c, ic_c); }
454  }
455 
456  // first apply the clover matrix directly, including mu
457  auto UV = A * V;
458  UV += mu * V;
459 
460  // Then we calculate AV = Cinv UV, so [AV = (C^2 + mu^2)^{-1} (Clover -/+ i mu)·Vector]
 461  // for twisted-clover fermions, Cinv stores (C^2 + mu^2)^{-1}
462 
463 #ifndef DYNAMIC_CLOVER
464  // load in the clover inverse matrix
465  HMatrix<Float, N> Ainv;
466 #pragma unroll
467  for (int i = 0; i < N; i++) {
468  int s_i = 2 * ch + i / fineColor;
469  int c_i = i % fineColor;
470 #pragma unroll
471  for (int j = 0; j <= i; j++) {
472  int s_j = 2 * ch + j / fineColor;
473  int c_j = j % fineColor;
474  Ainv(i, j) = arg.Cinv(0, parity, x_cb, s_i, s_j, c_i, c_j);
475  }
476  }
477  auto AV = Ainv * UV;
478 #else
479  // compute the clover inverse matrix with the already loaded clover matrix
480  A = A.square();
481  A += arg.mu * arg.mu;
482 
 483  linalg::Cholesky<HMatrix, Float, N> cholesky(A);
 484  const auto AV = cholesky.backward(cholesky.forward(UV));
485 #endif
486 
487  for (int s = 0; s < fineSpin / 2; s++)
488  for (int c = 0; c < fineColor; c++) arg.AV(parity, x_cb, 2 * ch + s, c, ic_c) = AV(s, c);
489  } // computeTMCAV
490 
491  template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg> void ComputeTMCAVCPU(Arg &arg)
492  {
493  for (int parity = 0; parity < 2; parity++) {
494 #pragma omp parallel for
495  for (int x_cb=0; x_cb<arg.fineVolumeCB; x_cb++) {
496  for (int ch = 0; ch < 2; ch++) {
497  for (int ic_c = 0; ic_c < coarseColor; ic_c++) { // coarse color
498  computeTMCAV<Float, fineSpin, fineColor, coarseColor, Arg>(arg, parity, x_cb, ch, ic_c);
499  }
500  }
501  } // c/b volume
502  } // parity
503  }
504 
505  template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
506  __global__ void ComputeTMCAVGPU(Arg arg)
507  {
508  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
509  if (x_cb >= arg.fineVolumeCB) return;
510 
511  int ch_parity = blockDim.y * blockIdx.y + threadIdx.y;
512  if (ch_parity >= 4) return;
513  int ch = ch_parity % 2;
514  int parity = ch_parity / 2;
515 
516  int ic_c = blockDim.z * blockIdx.z + threadIdx.z; // coarse color
517  if (ic_c >= coarseColor) return;
518 
519  if (ch == 0)
520  computeTMCAV<Float, fineSpin, fineColor, coarseColor, Arg>(arg, parity, x_cb, 0, ic_c);
521  else
522  computeTMCAV<Float, fineSpin, fineColor, coarseColor, Arg>(arg, parity, x_cb, 1, ic_c);
523  }
524 
536  template <bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg, typename Gamma>
537  __device__ __host__ inline void multiplyVUV(complex<Float> vuv[], const Arg &arg, const Gamma &gamma, int parity, int x_cb, int ic_c, int jc_c) {
538 
539 #pragma unroll
540  for (int i=0; i<coarseSpin*coarseSpin; i++) vuv[i] = 0.0;
541 
542  if (!from_coarse) { // fine grid is top level
543 
544 #pragma unroll
545  for (int s = 0; s < fineSpin; s++) { //Loop over fine spin
546 
547  //Spin part of the color matrix. Will always consist
548  //of two terms - diagonal and off-diagonal part of
549  //P_mu = (1+/-\gamma_mu)
550 
551  const int s_c_row = arg.spin_map(s,parity); // Coarse spin row index
552 
553  //Use Gamma to calculate off-diagonal coupling and
554  //column index. Diagonal coupling is always 1.
555  // If computing the backwards (forwards) direction link then
556  // we desire the positive (negative) projector
557 
558  const int s_col = gamma.getcol(s);
559  const int s_c_col = arg.spin_map(s_col,parity); // Coarse spin col index
560 
561 #pragma unroll
562  for (int ic = 0; ic < fineColor; ic++) { //Sum over fine color
563  if (dir == QUDA_BACKWARDS) {
564  complex<Float> V = arg.V(parity, x_cb, s, ic, ic_c);
565 
566  // here UV is really UAV
567  //Diagonal Spin
568  caxpy(conj(V), arg.UV(parity, x_cb, s, ic, jc_c), vuv[s_c_row*coarseSpin+s_c_row]);
569 
570  //Off-diagonal Spin (backward link / positive projector applied)
571  caxpy( gamma.apply(s, conj(V)), arg.UV(parity, x_cb, s_col, ic, jc_c), vuv[s_c_row*coarseSpin+s_c_col]);
572  } else {
573  complex<Float> AV = arg.AV(parity, x_cb, s, ic, ic_c);
574 
575  //Diagonal Spin
576  caxpy(conj(AV), arg.UV(parity, x_cb, s, ic, jc_c), vuv[s_c_row*coarseSpin+s_c_row]);
577 
578  //Off-diagonal Spin (forward link / negative projector applied)
579  caxpy( -gamma.apply(s, conj(AV)), arg.UV(parity, x_cb, s_col, ic, jc_c), vuv[s_c_row*coarseSpin+s_c_col]);
580  }
581  } //Fine color
582  }
583 
584  } else { // fine grid operator is a coarse operator
585 
586 #pragma unroll
587  for(int ic = 0; ic < fineColor; ic++) { //Sum over fine color
588 #pragma unroll
589  for (int s = 0; s < fineSpin; s++) {
590  complex<Float> AV = arg.AV(parity, x_cb, s, ic, ic_c);
591 #pragma unroll
592  for (int s_col=0; s_col<fineSpin; s_col++) { // which chiral block
593  complex<Float> UV = arg.UV(parity, x_cb, s_col*fineSpin+s, ic, jc_c);
594  caxpy(conj(AV), UV, vuv[s*coarseSpin+s_col]);
595  } //Fine color
596  } //Fine spin
597  }
598 
599  } // from_coarse
600 
601  }
602 
603  template<typename Arg>
604  __device__ __host__ inline int virtualThreadIdx(const Arg &arg) {
605  constexpr int warp_size = 32;
606  int warp_id = threadIdx.x / warp_size;
607  int warp_lane = threadIdx.x % warp_size;
608  int tx = warp_id * (warp_size / arg.aggregates_per_block) + warp_lane / arg.aggregates_per_block;
609  return tx;
610  }
611 
612  template<typename Arg>
613  __device__ __host__ inline int virtualBlockDim(const Arg &arg) {
614  int block_dim_x = blockDim.x / arg.aggregates_per_block;
615  return block_dim_x;
616  }
617 
618  template<typename Arg>
619  __device__ __host__ inline int coarseIndex(const Arg &arg) {
620  constexpr int warp_size = 32;
621  int warp_lane = threadIdx.x % warp_size;
622  int x_coarse = (arg.coarse_color_wave ? blockIdx.y : blockIdx.x)*arg.aggregates_per_block + warp_lane % arg.aggregates_per_block;
623  return x_coarse;
624  }
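 // Worked example of the virtualization above (warp_size = 32): with blockDim.x = 128
 // and aggregates_per_block = 4, virtualBlockDim() = 32 fine sites are processed per
 // aggregate, virtualThreadIdx() enumerates those 32 sites (8 per warp), and
 // coarseIndex() uses warp_lane % 4 to pick which of the block's 4 aggregates
 // (coarse sites) a lane works on.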
625 
626  template<bool shared_atomic, bool parity_flip, bool from_coarse, typename Float, int dim, QudaDirection dir,
627  int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg, typename Gamma>
628  __device__ __host__ void computeVUV(Arg &arg, const Gamma &gamma, int parity, int x_cb, int c_row, int c_col, int parity_coarse_, int coarse_x_cb_) {
629 
630  constexpr int nDim = 4;
631  int coord[QUDA_MAX_DIM];
632  int coord_coarse[QUDA_MAX_DIM];
633 
634  getCoords(coord, x_cb, arg.x_size, parity);
635  for(int d = 0; d < nDim; d++) coord_coarse[d] = coord[d]/arg.geo_bs[d];
636 
637  //Check to see if we are on the edge of a block. If adjacent site
638  //is in same block, M = X, else M = Y
639  const bool isDiagonal = ((coord[dim]+1)%arg.x_size[dim])/arg.geo_bs[dim] == coord_coarse[dim] ? true : false;
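 // e.g. with geo_bs[dim] = 4, a fine site at coord[dim] = 1 has its forward neighbour
 // at 2, inside the same geometric block (isDiagonal, accumulated into X), whereas a
 // site at coord[dim] = 3 hops into the next block and contributes to Y instead.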
640 
641  int coarse_parity = shared_atomic ? parity_coarse_ : 0;
642  if (!shared_atomic) {
643  for (int d=0; d<nDim; d++) coarse_parity += coord_coarse[d];
644  coarse_parity &= 1;
645  coord_coarse[0] /= 2;
646  }
647  int coarse_x_cb = shared_atomic ? coarse_x_cb_ : ((coord_coarse[3]*arg.xc_size[2]+coord_coarse[2])*arg.xc_size[1]+coord_coarse[1])*(arg.xc_size[0]/2) + coord_coarse[0];
648 
649  complex<Float> vuv[coarseSpin*coarseSpin];
650  multiplyVUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Arg>(vuv, arg, gamma, parity, x_cb, c_row, c_col);
651 
652  const int dim_index = arg.dim_index % arg.Y_atomic.geometry;
653 
654  if (shared_atomic) {
655 
656 #ifdef __CUDA_ARCH__
657  __shared__ complex<storeType> X[max_color_per_block][max_color_per_block][4][coarseSpin][coarseSpin];
658  __shared__ complex<storeType> Y[max_color_per_block][max_color_per_block][4][coarseSpin][coarseSpin];
659 
660  int x_ = coarse_x_cb%arg.aggregates_per_block;
661 
662  int tx = virtualThreadIdx(arg);
663  int s_col = tx / coarseSpin;
664  int s_row = tx % coarseSpin;
665 
666  int c_col_block = c_col % max_color_per_block;
667  int c_row_block = c_row % max_color_per_block;
668 
669  if (tx < coarseSpin*coarseSpin) {
670  Y[c_row_block][c_col_block][x_][s_row][s_col] = 0;
671  X[c_row_block][c_col_block][x_][s_row][s_col] = 0;
672  }
673 
674  __syncthreads();
675 
676  if (!isDiagonal) {
677 #pragma unroll
678  for (int s_row = 0; s_row < coarseSpin; s_row++) { // Chiral row block
679 #pragma unroll
680  for (int s_col = 0; s_col < coarseSpin; s_col++) { // Chiral column block
681  if (gauge::fixed_point<Float,storeType>()) {
682  Float scale = arg.Y_atomic.accessor.scale;
683  complex<storeType> a(round(scale * vuv[s_row*coarseSpin+s_col].real()),
684  round(scale * vuv[s_row*coarseSpin+s_col].imag()));
685  atomicAdd(&Y[c_row_block][c_col_block][x_][s_row][s_col],a);
686  } else {
687  atomicAdd(&Y[c_row_block][c_col_block][x_][s_row][s_col],reinterpret_cast<complex<storeType>*>(vuv)[s_row*coarseSpin+s_col]);
688  }
689  }
690  }
691  } else {
692 #pragma unroll
693  for (int s_row = 0; s_row < coarseSpin; s_row++) { // Chiral row block
694 #pragma unroll
695  for (int s_col = 0; s_col < coarseSpin; s_col++) { // Chiral column block
696  vuv[s_row*coarseSpin+s_col] *= -arg.kappa;
697  if (gauge::fixed_point<Float,storeType>()) {
698  Float scale = arg.X_atomic.accessor.scale;
699  complex<storeType> a(round(scale * vuv[s_row*coarseSpin+s_col].real()),
700  round(scale * vuv[s_row*coarseSpin+s_col].imag()));
701  atomicAdd(&X[c_row_block][c_col_block][x_][s_row][s_col],a);
702  } else {
703  atomicAdd(&X[c_row_block][c_col_block][x_][s_row][s_col],reinterpret_cast<complex<storeType>*>(vuv)[s_row*coarseSpin+s_col]);
704  }
705  }
706  }
707  }
708 
709  __syncthreads();
710 
711  if (tx < coarseSpin*coarseSpin && (parity == 0 || parity_flip == 1) ) {
712  arg.Y_atomic.atomicAdd(dim_index,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,Y[c_row_block][c_col_block][x_][s_row][s_col]);
713 
714  if (dir == QUDA_BACKWARDS) {
715  arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_col,s_row,c_col,c_row,conj(X[c_row_block][c_col_block][x_][s_row][s_col]));
716  } else {
717  arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,X[c_row_block][c_col_block][x_][s_row][s_col]);
718  }
719 
720  if (!arg.bidirectional) {
721  if (s_row == s_col) arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,X[c_row_block][c_col_block][x_][s_row][s_col]);
722  else arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,-X[c_row_block][c_col_block][x_][s_row][s_col]);
723  }
724  }
725 #else
726  errorQuda("Shared-memory atomic aggregation not supported on CPU");
727 #endif
728 
729  } else {
730 
731  if (!isDiagonal) {
732 #pragma unroll
733  for (int s_row = 0; s_row < coarseSpin; s_row++) { // Chiral row block
734 #pragma unroll
735  for (int s_col = 0; s_col < coarseSpin; s_col++) { // Chiral column block
736  arg.Y_atomic.atomicAdd(dim_index,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,vuv[s_row*coarseSpin+s_col]);
737  }
738  }
739  } else {
740 
741  for (int s2=0; s2<coarseSpin*coarseSpin; s2++) vuv[s2] *= -arg.kappa;
742 
743  if (dir == QUDA_BACKWARDS) {
744 #pragma unroll
745  for (int s_row = 0; s_row < coarseSpin; s_row++) { // Chiral row block
746 #pragma unroll
747  for (int s_col = 0; s_col < coarseSpin; s_col++) { // Chiral column block
748  arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_col,s_row,c_col,c_row,conj(vuv[s_row*coarseSpin+s_col]));
749  }
750  }
751  } else {
752 #pragma unroll
753  for (int s_row = 0; s_row < coarseSpin; s_row++) { // Chiral row block
754 #pragma unroll
755  for (int s_col = 0; s_col < coarseSpin; s_col++) { // Chiral column block
756  arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,vuv[s_row*coarseSpin+s_col]);
757  }
758  }
759  }
760 
761  if (!arg.bidirectional) {
762 #pragma unroll
763  for (int s_row = 0; s_row < coarseSpin; s_row++) { // Chiral row block
764 #pragma unroll
765  for (int s_col = 0; s_col < coarseSpin; s_col++) { // Chiral column block
766  const Float sign = (s_row == s_col) ? static_cast<Float>(1.0) : static_cast<Float>(-1.0);
767  arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,sign*vuv[s_row*coarseSpin+s_col]);
768  }
769  }
770  }
771 
772  }
773 
774  }
775 
776  }
777 
778  template<bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg>
779  void ComputeVUVCPU(Arg arg) {
780 
 781  Gamma<Float, QUDA_DEGRAND_ROSSI_GAMMA_BASIS, dim> gamma;
 782  constexpr bool shared_atomic = false; // not supported on CPU
783  constexpr bool parity_flip = true;
784 
785  for (int parity=0; parity<2; parity++) {
786 #pragma omp parallel for
787  for (int x_cb=0; x_cb<arg.fineVolumeCB; x_cb++) { // Loop over fine volume
788  for (int c_row=0; c_row<coarseColor; c_row++)
789  for (int c_col=0; c_col<coarseColor; c_col++)
790  computeVUV<shared_atomic,parity_flip,from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor>(arg, gamma, parity, x_cb, c_row, c_col, 0, 0);
791  } // c/b volume
792  } // parity
793  }
794 
795  // compute indices for shared-atomic kernel
796  template <bool parity_flip, typename Arg>
797  __device__ inline void getIndicesShared(const Arg &arg, int &parity, int &x_cb, int &parity_coarse, int &x_coarse_cb, int &c_col, int &c_row) {
798 
799  if (arg.coarse_color_wave) {
800  int parity_c_col_block_z = blockDim.y*blockIdx.x + threadIdx.y;
801  int c_col_block_z = parity_flip ? (parity_c_col_block_z % arg.coarse_color_grid_z ) : (parity_c_col_block_z / 2); // coarse color col index
802  parity = parity_flip ? (parity_c_col_block_z / arg.coarse_color_grid_z ) : (parity_c_col_block_z % 2);
803  c_col = c_col_block_z % arg.coarse_color;
804  c_row = blockDim.z*(c_col_block_z/arg.coarse_color) + threadIdx.z; // coarse color row index
805  } else {
806  int parity_c_col = blockDim.y*blockIdx.y + threadIdx.y;
807  c_col = parity_flip ? (parity_c_col % arg.coarse_color) : (parity_c_col / 2); // coarse color col index
808  parity = parity_flip ? (parity_c_col / arg.coarse_color) : (parity_c_col % 2);
809  c_row = blockDim.z*blockIdx.z + threadIdx.z; // coarse color row index
810  }
811 
812  int block_dim_x = virtualBlockDim(arg);
813  int thread_idx_x = virtualThreadIdx(arg);
814  int x_coarse = coarseIndex(arg);
815 
816  parity_coarse = x_coarse >= arg.coarseVolumeCB ? 1 : 0;
817  x_coarse_cb = x_coarse - parity_coarse*arg.coarseVolumeCB;
818 
819  // obtain fine index from this look-up table
820  // since both parities map to the same block, each thread block must do both parities
821 
822  // threadIdx.x - fine checkerboard offset
823  // threadIdx.y - fine parity offset
824  // blockIdx.x - which coarse block are we working on
825  // assume that coarse_to_fine look up map is ordered as (coarse-block-id + fine-point-id)
826  // and that fine-point-id is parity ordered
827 
828  int x_fine = arg.coarse_to_fine[ (x_coarse*2 + parity) * block_dim_x + thread_idx_x];
829  x_cb = x_fine - parity*arg.fineVolumeCB;
830  }
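 // i.e. for each coarse site the coarse_to_fine table holds 2 * block_dim_x fine
 // indices, ordered as [coarse site][parity][site within aggregate]; the lookup above
 // therefore reads entry (x_coarse*2 + parity)*block_dim_x + thread_idx_x, and the
 // parity offset is subtracted to recover the checkerboard index x_cb.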
831 
832  // compute indices for global-atomic kernel
833  template <bool parity_flip, typename Arg>
834  __device__ inline void getIndicesGlobal(const Arg &arg, int &parity, int &x_cb, int &parity_coarse, int &x_coarse_cb, int &c_col, int &c_row) {
835 
836  x_cb = blockDim.x*(arg.coarse_color_wave ? blockIdx.y : blockIdx.x) + threadIdx.x;
837 
838  if (arg.coarse_color_wave) {
839  int parity_c_col_block_z = blockDim.y*blockIdx.x + threadIdx.y;
840  int c_col_block_z = parity_flip ? (parity_c_col_block_z % arg.coarse_color_grid_z ) : (parity_c_col_block_z / 2); // coarse color col index
841  parity = parity_flip ? (parity_c_col_block_z / arg.coarse_color_grid_z ) : (parity_c_col_block_z % 2);
842  c_col = c_col_block_z % arg.coarse_color;
843  c_row = blockDim.z*(c_col_block_z/arg.coarse_color) + threadIdx.z; // coarse color row index
844  } else {
845  int parity_c_col = blockDim.y*blockIdx.y + threadIdx.y;
846  c_col = parity_flip ? (parity_c_col % arg.coarse_color) : (parity_c_col / 2); // coarse color col index
 847  parity = parity_flip ? (parity_c_col / arg.coarse_color) : (parity_c_col % 2); // parity index
848  c_row = blockDim.z*blockIdx.z + threadIdx.z; // coarse color row index
849  }
850 
851  x_coarse_cb = 0;
852  parity_coarse = 0;
853  }
854 
855  template<bool shared_atomic, bool parity_flip, bool from_coarse, typename Float, int dim, QudaDirection dir,
856  int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg>
857  __global__ void ComputeVUVGPU(Arg arg) {
858 
 859  Gamma<Float, QUDA_DEGRAND_ROSSI_GAMMA_BASIS, dim> gamma;
 860  int parity, x_cb, parity_coarse, x_coarse_cb, c_col, c_row;
861  if (shared_atomic) getIndicesShared<parity_flip>(arg, parity, x_cb, parity_coarse, x_coarse_cb, c_col, c_row);
862  else getIndicesGlobal<parity_flip>(arg, parity, x_cb, parity_coarse, x_coarse_cb, c_col, c_row);
863 
864  if (parity > 1) return;
865  if (c_col >= arg.coarse_color) return;
866  if (c_row >= arg.coarse_color) return;
867  if (!shared_atomic && x_cb >= arg.fineVolumeCB) return;
868 
869  computeVUV<shared_atomic,parity_flip,from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor>(arg, gamma, parity, x_cb, c_row, c_col, parity_coarse, x_coarse_cb);
870  }
871 
876  template<typename Float, int nSpin, int nColor, typename Arg>
877  __device__ __host__ void computeYreverse(Arg &arg, int parity, int x_cb, int ic_c, int jc_c) {
878  auto &Y = arg.Y;
879 
880 #pragma unroll
881  for (int d=0; d<4; d++) {
882 #pragma unroll
883  for (int s_row = 0; s_row < nSpin; s_row++) { //Spin row
884 #pragma unroll
885  for (int s_col = 0; s_col < nSpin; s_col++) { //Spin column
886  if (s_row == s_col)
887  Y(d+4,parity,x_cb,s_row,s_col,ic_c,jc_c) = Y(d,parity,x_cb,s_row,s_col,ic_c,jc_c);
888  else
889  Y(d+4,parity,x_cb,s_row,s_col,ic_c,jc_c) = -Y(d,parity,x_cb,s_row,s_col,ic_c,jc_c);
890  } //Spin column
891  } //Spin row
892 
893  } // dimension
894 
895  }
896 
897  template<typename Float, int nSpin, int nColor, typename Arg>
898  void ComputeYReverseCPU(Arg &arg) {
899  for (int parity=0; parity<2; parity++) {
900 #pragma omp parallel for
901  for (int x_cb=0; x_cb<arg.coarseVolumeCB; x_cb++) {
902  for (int ic_c = 0; ic_c < nColor; ic_c++) { //Color row
903  for (int jc_c = 0; jc_c < nColor; jc_c++) { //Color col
904  computeYreverse<Float,nSpin,nColor,Arg>(arg, parity, x_cb, ic_c, jc_c);
905  }
906  }
907  } // c/b volume
908  } // parity
909  }
910 
911  template<typename Float, int nSpin, int nColor, typename Arg>
912  __global__ void ComputeYReverseGPU(Arg arg) {
913  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
914  if (x_cb >= arg.coarseVolumeCB) return;
915 
916  int parity_jc_c = blockDim.y*blockIdx.y + threadIdx.y; // parity and color col
917  if (parity_jc_c >= 2*nColor) return;
918  int parity = parity_jc_c / nColor;
919  int jc_c = parity_jc_c % nColor;
920 
921  int ic_c = blockDim.z*blockIdx.z + threadIdx.z; // color row
922  if (ic_c >= nColor) return;
923 
924  computeYreverse<Float,nSpin,nColor,Arg>(arg, parity, x_cb, ic_c, jc_c);
925  }
926 
927  template<bool from_coarse, typename Float, int fineSpin, int coarseSpin, int fineColor, int coarseColor, typename Arg>
928  __device__ __host__ void computeCoarseClover(Arg &arg, int parity, int x_cb, int ic_c, int jc_c) {
929 
930  const int nDim = 4;
931 
932  int coord[QUDA_MAX_DIM];
933  int coord_coarse[QUDA_MAX_DIM];
934 
935  getCoords(coord, x_cb, arg.x_size, parity);
936  for (int d=0; d<nDim; d++) coord_coarse[d] = coord[d]/arg.geo_bs[d];
937 
938  int coarse_parity = 0;
939  for (int d=0; d<nDim; d++) coarse_parity += coord_coarse[d];
940  coarse_parity &= 1;
941  coord_coarse[0] /= 2;
942  int coarse_x_cb = ((coord_coarse[3]*arg.xc_size[2]+coord_coarse[2])*arg.xc_size[1]+coord_coarse[1])*(arg.xc_size[0]/2) + coord_coarse[0];
943 
944  coord[0] /= 2;
945 
946  complex<Float> X[coarseSpin*coarseSpin];
947  for (int i=0; i<coarseSpin*coarseSpin; i++) X[i] = 0.0;
948 
949  if (!from_coarse) {
950  //If Nspin = 4, then the clover term has structure C_{\mu\nu} = \gamma_{\mu\nu}C^{\mu\nu}
951  for (int s = 0; s < fineSpin; s++) { //Loop over fine spin row
952  const int s_c = arg.spin_map(s,parity);
953  //On the fine lattice, the clover field is chirally blocked, so loop over rows/columns
954  //in the same chiral block.
955  for (int s_col = s_c*arg.spin_bs; s_col < (s_c+1)*arg.spin_bs; s_col++) { //Loop over fine spin column
956  for (int ic = 0; ic < fineColor; ic++) { //Sum over fine color row
957  for (int jc = 0; jc < fineColor; jc++) { //Sum over fine color column
958  X[s_c*coarseSpin + s_c] +=
959  conj(arg.V(parity, x_cb, s, ic, ic_c)) * arg.C(0, parity, x_cb, s, s_col, ic, jc) * arg.V(parity, x_cb, s_col, jc, jc_c);
960  } //Fine color column
961  } //Fine color row
962  } //Fine spin column
963  } //Fine spin
964  } else {
 965  //If Nspin != 4, then the spin structure is a dense matrix and there is no spin aggregation
966  //N.B. assumes that no further spin blocking is done in this case.
967  for (int s = 0; s < fineSpin; s++) { //Loop over spin row
968  for (int s_col = 0; s_col < fineSpin; s_col++) { //Loop over spin column
969  for (int ic = 0; ic < fineColor; ic++) { //Sum over fine color row
970  for (int jc = 0; jc < fineColor; jc++) { //Sum over fine color column
971  X[s*coarseSpin + s_col] +=
972  conj(arg.V(parity, x_cb, s, ic, ic_c)) * arg.C(0, parity, x_cb, s, s_col, ic, jc) * arg.V(parity, x_cb, s_col, jc, jc_c);
973  } //Fine color column
974  } //Fine color row
975  } //Fine spin column
976  } //Fine spin
977  }
978 
979  for (int si = 0; si < coarseSpin; si++) {
980  for (int sj = 0; sj < coarseSpin; sj++) {
981  arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,si,sj,ic_c,jc_c,X[si*coarseSpin+sj]);
982  }
983  }
984 
985  }
986 
987  template <bool from_coarse, typename Float, int fineSpin, int coarseSpin, int fineColor, int coarseColor, typename Arg>
988  void ComputeCoarseCloverCPU(Arg &arg) {
989  for (int parity=0; parity<2; parity++) {
990 #pragma omp parallel for
991  for (int x_cb=0; x_cb<arg.fineVolumeCB; x_cb++) {
992  for (int jc_c=0; jc_c<coarseColor; jc_c++) {
993  for (int ic_c=0; ic_c<coarseColor; ic_c++) {
994  computeCoarseClover<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>(arg, parity, x_cb, ic_c, jc_c);
995  }
996  }
997  } // c/b volume
998  } // parity
999  }
1000 
1001  template <bool from_coarse, typename Float, int fineSpin, int coarseSpin, int fineColor, int coarseColor, typename Arg>
1002  __global__ void ComputeCoarseCloverGPU(Arg arg) {
1003  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
1004  if (x_cb >= arg.fineVolumeCB) return;
1005 
1006  int parity_c_col = blockDim.y*blockIdx.y + threadIdx.y; // parity and color column
1007  if (parity_c_col >= 2*coarseColor) return;
1008  int jc_c = parity_c_col % coarseColor; // coarse color col index
1009  int parity = parity_c_col / coarseColor;
1010 
1011  int ic_c = blockDim.z*blockIdx.z + threadIdx.z; // coarse color
1012  if (ic_c >= coarseColor) return;
1013  computeCoarseClover<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>(arg, parity, x_cb, ic_c, jc_c);
1014  }
1015 
1016 
1017 
1018  //Adds the identity matrix to the coarse local term.
1019  template<typename Float, int nSpin, int nColor, typename Arg>
1020  void AddCoarseDiagonalCPU(Arg &arg) {
1021  for (int parity=0; parity<2; parity++) {
1022 #pragma omp parallel for
1023  for (int x_cb=0; x_cb<arg.coarseVolumeCB; x_cb++) {
1024  for(int s = 0; s < nSpin; s++) { //Spin
1025  for(int c = 0; c < nColor; c++) { //Color
1026  arg.X_atomic(0,parity,x_cb,s,s,c,c) += complex<Float>(1.0,0.0);
1027  } //Color
1028  } //Spin
1029  } // x_cb
1030  } //parity
1031  }
1032 
1033 
1034  //Adds the identity matrix to the coarse local term.
1035  template<typename Float, int nSpin, int nColor, typename Arg>
1036  __global__ void AddCoarseDiagonalGPU(Arg arg) {
1037  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
1038  if (x_cb >= arg.coarseVolumeCB) return;
1039  int parity = blockDim.y*blockIdx.y + threadIdx.y;
1040 
1041  for(int s = 0; s < nSpin; s++) { //Spin
1042  for(int c = 0; c < nColor; c++) { //Color
1043  arg.X_atomic(0,parity,x_cb,s,s,c,c) += complex<Float>(1.0,0.0);
1044  } //Color
1045  } //Spin
1046  }
1047 
1048  //Adds the twisted-mass term to the coarse local term.
1049  template<typename Float, int nSpin, int nColor, typename Arg>
1050  void AddCoarseTmDiagonalCPU(Arg &arg) {
1051 
1052  const complex<Float> mu(0., arg.mu*arg.mu_factor);
1053 
1054  for (int parity=0; parity<2; parity++) {
1055 #pragma omp parallel for
1056  for (int x_cb=0; x_cb<arg.coarseVolumeCB; x_cb++) {
1057  for(int s = 0; s < nSpin/2; s++) { //Spin
1058  for(int c = 0; c < nColor; c++) { //Color
1059  arg.X_atomic(0,parity,x_cb,s,s,c,c) += mu;
1060  } //Color
1061  } //Spin
1062  for(int s = nSpin/2; s < nSpin; s++) { //Spin
1063  for(int c = 0; c < nColor; c++) { //Color
1064  arg.X_atomic(0,parity,x_cb,s,s,c,c) -= mu;
1065  } //Color
1066  } //Spin
1067  } // x_cb
1068  } //parity
1069  }
1070 
1071  //Adds the twisted-mass term to the coarse local term.
1072  template<typename Float, int nSpin, int nColor, typename Arg>
1073  __global__ void AddCoarseTmDiagonalGPU(Arg arg) {
1074  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
1075  if (x_cb >= arg.coarseVolumeCB) return;
1076  int parity = blockDim.y*blockIdx.y + threadIdx.y;
1077 
1078  const complex<Float> mu(0., arg.mu*arg.mu_factor);
1079 
1080  for(int s = 0; s < nSpin/2; s++) { //Spin
1081  for(int ic_c = 0; ic_c < nColor; ic_c++) { //Color
1082  arg.X_atomic(0,parity,x_cb,s,s,ic_c,ic_c) += mu;
1083  } //Color
1084  } //Spin
1085  for(int s = nSpin/2; s < nSpin; s++) { //Spin
1086  for(int ic_c = 0; ic_c < nColor; ic_c++) { //Color
1087  arg.X_atomic(0,parity,x_cb,s,s,ic_c,ic_c) -= mu;
1088  } //Color
1089  } //Spin
1090  }
1091 
1095  template<typename Float, int nSpin, int nColor, typename Arg>
1096  __device__ __host__ void convert(Arg &arg, int parity, int x_cb, int c_row, int c_col) {
1097 
1098  if (arg.dim_index < 8) {
1099 
1100  const auto &in = arg.Y_atomic;
1101  int d_in = arg.dim_index % in.geometry;
1102  int d_out = arg.dim_index % arg.Y.geometry;
1103 
1104 #pragma unroll
1105  for (int s_row = 0; s_row < nSpin; s_row++) { //Spin row
1106 #pragma unroll
1107  for (int s_col = 0; s_col < nSpin; s_col++) { //Spin column
1108  complex<Float> M = in(d_in,parity,x_cb,s_row,s_col,c_row,c_col);
1109  arg.Y(d_out,parity,x_cb,s_row,s_col,c_row,c_col) = M;
1110  } //Spin column
1111  } //Spin row
1112 
1113  } else {
1114 
1115  const auto &in = arg.X_atomic;
1116  int d_in = arg.dim_index % in.geometry;
1117  int d_out = arg.dim_index % arg.X.geometry;
1118 
1119 #pragma unroll
1120  for (int s_row = 0; s_row < nSpin; s_row++) { //Spin row
1121 #pragma unroll
1122  for (int s_col = 0; s_col < nSpin; s_col++) { //Spin column
1123  complex<Float> M = in(d_in,parity,x_cb,s_row,s_col,c_row,c_col);
1124  arg.X(d_out,parity,x_cb,s_row,s_col,c_row,c_col) = M;
1125  } //Spin column
1126  } //Spin row
1127 
1128  }
1129 
1130  }
1131 
1132  template<typename Float, int nSpin, int nColor, typename Arg>
1133  void ConvertCPU(Arg &arg) {
1134  for (int parity=0; parity<2; parity++) {
1135 #pragma omp parallel for
1136  for (int x_cb=0; x_cb<arg.coarseVolumeCB; x_cb++) {
1137  for(int c_row = 0; c_row < nColor; c_row++) { //Color row
1138  for(int c_col = 0; c_col < nColor; c_col++) { //Color column
1139  convert<Float,nSpin,nColor,Arg>(arg, parity, x_cb, c_row, c_col);
1140  }
1141  }
1142  } // c/b volume
1143  } // parity
1144  }
1145 
1146  template<typename Float, int nSpin, int nColor, typename Arg>
1147  __global__ void ConvertGPU(Arg arg) {
1148  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
1149  if (x_cb >= arg.coarseVolumeCB) return;
1150 
1151  int parity_c_col = blockDim.y*blockIdx.y + threadIdx.y;
1152  if (parity_c_col >= 2*nColor) return;
1153 
1154  int c_col = parity_c_col % nColor; // color col index
1155  int parity = parity_c_col / nColor;
1156 
1157  int c_row = blockDim.z*blockIdx.z + threadIdx.z; // color row index
1158  if (c_row >= nColor) return;
1159 
1160  convert<Float,nSpin,nColor,Arg>(arg, parity, x_cb, c_row, c_col);
1161  }
1162 
1166  template<typename Float, int nSpin, int nColor, typename Arg>
1167  __device__ __host__ void rescaleY(Arg &arg, int parity, int x_cb, int c_row, int c_col) {
1168 
1169 #pragma unroll
1170  for (int s_row = 0; s_row < nSpin; s_row++) { //Spin row
1171 #pragma unroll
1172  for (int s_col = 0; s_col < nSpin; s_col++) { //Spin column
1173  complex<Float> M = arg.Y(arg.dim_index,parity,x_cb,s_row,s_col,c_row,c_col);
1174  arg.Y(arg.dim_index,parity,x_cb,s_row,s_col,c_row,c_col) = arg.rescale*M;
1175  } //Spin column
1176  } //Spin row
1177 
1178  }
1179 
1180  template<typename Float, int nSpin, int nColor, typename Arg>
1181  void RescaleYCPU(Arg &arg) {
1182  for (int parity=0; parity<2; parity++) {
1183 #pragma omp parallel for
1184  for (int x_cb=0; x_cb<arg.coarseVolumeCB; x_cb++) {
1185  for(int c_row = 0; c_row < nColor; c_row++) { //Color row
1186  for(int c_col = 0; c_col < nColor; c_col++) { //Color column
1187  rescaleY<Float,nSpin,nColor,Arg>(arg, parity, x_cb, c_row, c_col);
1188  }
1189  }
1190  } // c/b volume
1191  } // parity
1192  }
1193 
1194  template<typename Float, int nSpin, int nColor, typename Arg>
1195  __global__ void RescaleYGPU(Arg arg) {
1196  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
1197  if (x_cb >= arg.coarseVolumeCB) return;
1198 
1199  int parity_c_col = blockDim.y*blockIdx.y + threadIdx.y;
1200  if (parity_c_col >= 2*nColor) return;
1201 
1202  int c_col = parity_c_col % nColor; // color col index
1203  int parity = parity_c_col / nColor;
1204 
1205  int c_row = blockDim.z*blockIdx.z + threadIdx.z; // color row index
1206  if (c_row >= nColor) return;
1207 
1208  rescaleY<Float,nSpin,nColor,Arg>(arg, parity, x_cb, c_row, c_col);
1209  }
1210 
1211 } // namespace quda