// QUDA 0.9.0
// dslash_coarse.cu
#include <transfer.h>
#include <gauge_field_order.h>
#include <color_spinor_field_order.h>
#include <index_helper.cuh>
#if __COMPUTE_CAPABILITY__ >= 300
#include <generics/shfl.h>
#endif
8 
9 namespace quda {
10 
11 #ifdef GPU_MULTIGRID
12 
  /**
     Which portion of the coarse Dslash to apply: only the interior
     (bulk) contribution, only the exterior (halo) contribution coming
     from partitioned boundaries, or both at once.
  */
  enum DslashType {
    DSLASH_INTERIOR, // interior (bulk) contributions only
    DSLASH_EXTERIOR, // exterior (halo) contributions only
    DSLASH_FULL      // both interior and exterior contributions
  };
18 
19  template <typename Float, int coarseSpin, int coarseColor, QudaFieldOrder csOrder, QudaGaugeFieldOrder gOrder>
20  struct DslashCoarseArg {
21  typedef typename colorspinor::FieldOrderCB<Float,coarseSpin,coarseColor,1,csOrder> F;
22  typedef typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder> G;
23 
24  F out;
25  const F inA;
26  const F inB;
27  const G Y;
28  const G X;
29  const Float kappa;
30  const int parity; // only use this for single parity fields
31  const int nParity; // number of parities we're working on
32  const int nFace; // hard code to 1 for now
33  const int_fastdiv X0h; // X[0]/2
34  const int_fastdiv dim[5]; // full lattice dimensions
35  const int commDim[4]; // whether a given dimension is partitioned or not
36  const int volumeCB;
37 
38  inline DslashCoarseArg(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB,
39  const GaugeField &Y, const GaugeField &X, Float kappa, int parity)
40  : out(const_cast<ColorSpinorField&>(out)), inA(const_cast<ColorSpinorField&>(inA)),
41  inB(const_cast<ColorSpinorField&>(inB)), Y(const_cast<GaugeField&>(Y)),
42  X(const_cast<GaugeField&>(X)), kappa(kappa), parity(parity),
43  nParity(out.SiteSubset()), nFace(1), X0h( ((3-nParity) * out.X(0)) /2),
44  dim{ (3-nParity) * out.X(0), out.X(1), out.X(2), out.X(3), out.Ndim() == 5 ? out.X(4) : 1 },
46  volumeCB(out.VolumeCB()/dim[4])
47  { }
48  };
49 
53  template <DslashType type>
54  static __host__ __device__ bool doHalo() {
55  switch(type) {
56  case DSLASH_EXTERIOR:
57  case DSLASH_FULL:
58  return true;
59  default:
60  return false;
61  }
62  }
63 
67  template <DslashType type>
68  static __host__ __device__ bool doBulk() {
69  switch(type) {
70  case DSLASH_INTERIOR:
71  case DSLASH_FULL:
72  return true;
73  default:
74  return false;
75  }
76  }
77 
86  template <typename I>
87  static __device__ __host__ inline void getCoordsCB(int x[], int cb_index, const I X[], const I X0h, int parity) {
88  //x[3] = cb_index/(X[2]*X[1]*X[0]/2);
89  //x[2] = (cb_index/(X[1]*X[0]/2)) % X[2];
90  //x[1] = (cb_index/(X[0]/2)) % X[1];
91  //x[0] = 2*(cb_index%(X[0]/2)) + ((x[3]+x[2]+x[1]+parity)&1);
92 
93  int za = (cb_index / X0h);
94  int zb = (za / X[1]);
95  x[1] = (za - zb * X[1]);
96  x[3] = (zb / X[2]);
97  x[2] = (zb - x[3] * X[2]);
98  int x1odd = (x[1] + x[2] + x[3] + parity) & 1;
99  x[0] = (2 * cb_index + x1odd - za * X[0]);
100  return;
101  }
102 
  // dynamically-sized shared memory, used on the GPU to stage the partial
  // sums produced by the per-dimension / per-direction thread splits
  // before they are recombined by the master (thread_dim==0, thread_dir==0) thread
  extern __shared__ float s[];
  /**
     @brief Apply the hopping (off-diagonal) part of the coarse Dslash
     at site x_cb, accumulating into out[] the Mc colors owned by this
     thread.  Gathers contributions from the forward and backward
     neighbors in each dimension; when a neighbor lies across a
     partitioned boundary the ghost buffers are read instead (enabled
     by doHalo<type>()), while interior neighbors are processed when
     doBulk<type>() is enabled.  On the GPU the forward (thread_dir==0)
     and backward gathers, and the dimensions (strided by dim_stride),
     are handled by different threads and recombined through the shared
     memory buffer s[]; the final result is scaled by -kappa.  On the
     host a single call does all directions and dimensions.

     @param out Output accumulator for this thread's Mc colors
     @param arg Kernel argument struct (fields, links, geometry)
     @param x_cb 4-d checkerboard site index
     @param src_idx 5th-dimension (source) index
     @param parity Parity of the site being updated
     @param s_row Coarse spin row being computed
     @param color_block First global color index of this thread's Mc-color block
     @param color_offset Column offset when the color dot product is split across the warp
  */
  template <typename Float, int nDim, int Ns, int Nc, int Mc, int color_stride, int dim_stride, int thread_dir, int thread_dim, bool dagger, DslashType type, typename Arg>
  __device__ __host__ inline void applyDslash(complex<Float> out[], Arg &arg, int x_cb, int src_idx, int parity, int s_row, int color_block, int color_offset) {
    const int their_spinor_parity = (arg.nParity == 2) ? 1-parity : 0;

    int coord[5];
    getCoordsCB(coord, x_cb, arg.dim, arg.X0h, parity);
    coord[4] = src_idx;

#ifdef __CUDA_ARCH__
    complex<Float> *shared_sum = (complex<Float>*)s;
    if (!thread_dir) {
#endif

      //Forward gather - compute fwd offset for spinor fetch
#pragma unroll
      for(int d = thread_dim; d < nDim; d+=dim_stride) // loop over dimension
      {
        const int fwd_idx = linkIndexP1(coord, arg.dim, d);

        if ( arg.commDim[d] && (coord[d] + arg.nFace >= arg.dim[d]) ) {
          // forward neighbor lies in another partition: read from the ghost zone
          if (doHalo<type>()) {
            int ghost_idx = ghostFaceIndex<1>(coord, arg.dim, d, arg.nFace);

#pragma unroll
            for(int color_local = 0; color_local < Mc; color_local++) { //Color row
              int c_row = color_block + color_local; // global color index
              int row = s_row*Nc + c_row;
#pragma unroll
              for(int s_col = 0; s_col < Ns; s_col++) { //Spin column
#pragma unroll
                for(int c_col = 0; c_col < Nc; c_col+=color_stride) { //Color column
                  int col = s_col*Nc + c_col + color_offset;
                  // forward hop uses the Y links offset by 4 (daggered uses the first 4)
                  if (!dagger)
                    out[color_local] += arg.Y(d+4, parity, x_cb, row, col)
                      * arg.inA.Ghost(d, 1, their_spinor_parity, ghost_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
                  else
                    out[color_local] += arg.Y(d, parity, x_cb, row, col)
                      * arg.inA.Ghost(d, 1, their_spinor_parity, ghost_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
                }
              }
            }
          }
        } else if (doBulk<type>()) {
          // interior forward neighbor
#pragma unroll
          for(int color_local = 0; color_local < Mc; color_local++) { //Color row
            int c_row = color_block + color_local; // global color index
            int row = s_row*Nc + c_row;
#pragma unroll
            for(int s_col = 0; s_col < Ns; s_col++) { //Spin column
#pragma unroll
              for(int c_col = 0; c_col < Nc; c_col+=color_stride) { //Color column
                int col = s_col*Nc + c_col + color_offset;
                if (!dagger)
                  out[color_local] += arg.Y(d+4, parity, x_cb, row, col)
                    * arg.inA(their_spinor_parity, fwd_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
                else
                  out[color_local] += arg.Y(d, parity, x_cb, row, col)
                    * arg.inA(their_spinor_parity, fwd_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
              }
            }
          }
        }

      } // nDim

#if defined(__CUDA_ARCH__)
      if (thread_dim > 0) { // only need to write to shared memory if not master thread
#pragma unroll
        for (int color_local=0; color_local < Mc; color_local++) {
          shared_sum[((color_local * blockDim.z + threadIdx.z )*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x] = out[color_local];
        }
      }
#endif

#ifdef __CUDA_ARCH__
    } else {
#endif

      //Backward gather - compute back offset for spinor and gauge fetch
#pragma unroll
      for(int d = thread_dim; d < nDim; d+=dim_stride)
      {
        const int back_idx = linkIndexM1(coord, arg.dim, d);
        const int gauge_idx = back_idx;
        if ( arg.commDim[d] && (coord[d] - arg.nFace < 0) ) {
          // backward neighbor lies in another partition: read from the ghost zone
          if (doHalo<type>()) {
            const int ghost_idx = ghostFaceIndex<0>(coord, arg.dim, d, arg.nFace);
#pragma unroll
            for (int color_local=0; color_local<Mc; color_local++) {
              int c_row = color_block + color_local;
              int row = s_row*Nc + c_row;
#pragma unroll
              for (int s_col=0; s_col<Ns; s_col++)
#pragma unroll
                for (int c_col=0; c_col<Nc; c_col+=color_stride) {
                  int col = s_col*Nc + c_col + color_offset;
                  // backward hop uses the conjugate-transposed link at the neighbor site
                  if (!dagger)
                    out[color_local] += conj(arg.Y.Ghost(d, 1-parity, ghost_idx, col, row))
                      * arg.inA.Ghost(d, 0, their_spinor_parity, ghost_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
                  else
                    out[color_local] += conj(arg.Y.Ghost(d+4, 1-parity, ghost_idx, col, row))
                      * arg.inA.Ghost(d, 0, their_spinor_parity, ghost_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
                }
            }
          }
        } else if (doBulk<type>()) {
          // interior backward neighbor
#pragma unroll
          for(int color_local = 0; color_local < Mc; color_local++) {
            int c_row = color_block + color_local;
            int row = s_row*Nc + c_row;
#pragma unroll
            for(int s_col = 0; s_col < Ns; s_col++)
#pragma unroll
              for(int c_col = 0; c_col < Nc; c_col+=color_stride) {
                int col = s_col*Nc + c_col + color_offset;
                if (!dagger)
                  out[color_local] += conj(arg.Y(d, 1-parity, gauge_idx, col, row))
                    * arg.inA(their_spinor_parity, back_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
                else
                  out[color_local] += conj(arg.Y(d+4, 1-parity, gauge_idx, col, row))
                    * arg.inA(their_spinor_parity, back_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
              }
          }
        }

      } //nDim

#if defined(__CUDA_ARCH__)

#pragma unroll
      for (int color_local=0; color_local < Mc; color_local++) {
        shared_sum[ ((color_local * blockDim.z + threadIdx.z )*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x] = out[color_local];
      }

    } // forwards / backwards thread split
#endif

#ifdef __CUDA_ARCH__ // CUDA path has to recombine the foward and backward results
    __syncthreads();

    // (colorspin * dim_stride + dim * 2 + dir)
    if (thread_dim == 0 && thread_dir == 0) {

      // full split over dimension and direction
#pragma unroll
      for (int d=1; d<dim_stride; d++) { // get remaining forward gathers (if any)
        // 4-way 1,2,3 (stride = 4)
        // 2-way 1 (stride = 2)
#pragma unroll
        for (int color_local=0; color_local < Mc; color_local++) {
          out[color_local] +=
            shared_sum[(((color_local*blockDim.z/(2*dim_stride) + threadIdx.z/(2*dim_stride)) * 2 * dim_stride + d * 2 + 0)*blockDim.y+threadIdx.y)*blockDim.x+threadIdx.x];
        }
      }

#pragma unroll
      for (int d=0; d<dim_stride; d++) { // get all backward gathers
#pragma unroll
        for (int color_local=0; color_local < Mc; color_local++) {
          out[color_local] +=
            shared_sum[(((color_local*blockDim.z/(2*dim_stride) + threadIdx.z/(2*dim_stride)) * 2 * dim_stride + d * 2 + 1)*blockDim.y+threadIdx.y)*blockDim.x+threadIdx.x];
        }
      }

      // apply kappa
#pragma unroll
      for (int color_local=0; color_local<Mc; color_local++) out[color_local] *= -arg.kappa;

    }

#else // !__CUDA_ARCH__
    for (int color_local=0; color_local<Mc; color_local++) out[color_local] *= -arg.kappa;
#endif

  }
289 
300  template <typename Float, int Ns, int Nc, int Mc, int color_stride, bool dagger, typename Arg>
301  __device__ __host__ inline void applyClover(complex<Float> out[], Arg &arg, int x_cb, int src_idx, int parity, int s, int color_block, int color_offset) {
302  const int spinor_parity = (arg.nParity == 2) ? parity : 0;
303 
304  // M is number of colors per thread
305 #pragma unroll
306  for(int color_local = 0; color_local < Mc; color_local++) {//Color out
307  int c = color_block + color_local; // global color index
308  int row = s*Nc + c;
309 #pragma unroll
310  for (int s_col = 0; s_col < Ns; s_col++) //Spin in
311 #pragma unroll
312  for (int c_col = 0; c_col < Nc; c_col+=color_stride) { //Color in
313  //Factor of kappa and diagonal addition now incorporated in X
314  int col = s_col*Nc + c_col + color_offset;
315  if (!dagger) {
316  out[color_local] += arg.X(0, parity, x_cb, row, col)
317  * arg.inB(spinor_parity, x_cb+src_idx*arg.volumeCB, s_col, c_col+color_offset);
318  } else {
319  out[color_local] += conj(arg.X(0, parity, x_cb, col, row))
320  * arg.inB(spinor_parity, x_cb+src_idx*arg.volumeCB, s_col, c_col+color_offset);
321  }
322  }
323  }
324 
325  }
326 
  //out(x) = M*in = \sum_mu Y_{-\mu}(x)in(x+mu) + Y^\dagger_mu(x-mu)in(x-mu)
  /**
     @brief Per-thread worker: zero the Mc-color accumulator, apply the
     hopping term (if dslash) and the clover term (if clover, master
     dim/dir thread only, bulk only), reduce any warp-level column
     split with shuffles, and write/accumulate the result into arg.out.
     Only the (dir==0, dim==0) thread performs the clover application
     and the final store; in the bulk case the result is stored, in the
     halo case it is accumulated onto the existing value.
  */
  template <typename Float, int nDim, int Ns, int Nc, int Mc, int color_stride, int dim_thread_split,
	    bool dslash, bool clover, bool dagger, DslashType type, int dir, int dim, typename Arg>
  __device__ __host__ inline void coarseDslash(Arg &arg, int x_cb, int src_idx, int parity, int s, int color_block, int color_offset)
  {
    complex <Float> out[Mc];
#pragma unroll
    for (int c=0; c<Mc; c++) out[c] = 0.0;
    if (dslash) applyDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dir,dim,dagger,type>(out, arg, x_cb, src_idx, parity, s, color_block, color_offset);
    if (doBulk<type>() && clover && dir==0 && dim==0) applyClover<Float,Ns,Nc,Mc,color_stride,dagger>(out, arg, x_cb, src_idx, parity, s, color_block, color_offset);

    if (dir==0 && dim==0) {
      const int my_spinor_parity = (arg.nParity == 2) ? parity : 0;
#pragma unroll
      for (int color_local=0; color_local<Mc; color_local++) {
#if __CUDA_ARCH__ >= 300
	// reduce down to the first group of column-split threads
	constexpr int warp_size = 32; // FIXME - this is buggy when x-dim * color_stride < 32
#pragma unroll
	for (int offset = warp_size/2; offset >= warp_size/color_stride; offset /= 2)
#if (__CUDACC_VER_MAJOR__ >= 9)
	  // NOTE(review): WARP_CONVERGED is presumably a full-warp participation mask -- confirm its definition
	  out[color_local] += __shfl_down_sync(WARP_CONVERGED, out[color_local], offset);
#else
	  out[color_local] += __shfl_down(out[color_local], offset);
#endif

#endif
	int c = color_block + color_local; // global color index
	if (color_offset == 0) {
	  // if not halo we just store, else we accumulate
	  if (doBulk<type>()) arg.out(my_spinor_parity, x_cb+src_idx*arg.volumeCB, s, c) = out[color_local];
	  else arg.out(my_spinor_parity, x_cb+src_idx*arg.volumeCB, s, c) += out[color_local];
	}
      }
    }
  }
363 
364  // CPU kernel for applying the coarse Dslash to a vector
365  template <typename Float, int nDim, int Ns, int Nc, int Mc, bool dslash, bool clover, bool dagger, DslashType type, typename Arg>
366  void coarseDslash(Arg arg)
367  {
368  // the fine-grain parameters mean nothing for CPU variant
369  const int color_stride = 1;
370  const int color_offset = 0;
371  const int dim_thread_split = 1;
372  const int dir = 0;
373  const int dim = 0;
374 
375  for (int parity= 0; parity < arg.nParity; parity++) {
376  // for full fields then set parity from loop else use arg setting
377  parity = (arg.nParity == 2) ? parity : arg.parity;
378 
379  for (int src_idx = 0; src_idx < arg.dim[4]; src_idx++) {
380  //#pragma omp parallel for
381  for(int x_cb = 0; x_cb < arg.volumeCB; x_cb++) { // 4-d volume
382  for (int s=0; s<2; s++) {
383  for (int color_block=0; color_block<Nc; color_block+=Mc) { // Mc=Nc means all colors in a thread
384  coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,dir,dim>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
385  }
386  }
387  } // 4-d volumeCB
388  } // src index
389  } // parity
390 
391  }
392 
  // GPU Kernel for applying the coarse Dslash to a vector
  /**
     @brief GPU kernel wrapper.  Thread geometry:
     - x: checkerboard site index, with each warp optionally split
       color_stride ways so that several lanes cooperate on one site's
       color dot product (color_offset selects the lane's column slice);
     - y: parity (multi-source splitting is compiled out below);
     - z: packed as (( s*(Nc/Mc) + color_block )*dim_thread_split + dim)*2 + dir.
     Requires dynamic shared memory for the per-thread partial sums
     staged by applyDslash (see DslashCoarse::sharedBytesPerThread).
  */
  template <typename Float, int nDim, int Ns, int Nc, int Mc, int color_stride, int dim_thread_split, bool dslash, bool clover, bool dagger, DslashType type, typename Arg>
  __global__ void coarseDslashKernel(Arg arg)
  {
    constexpr int warp_size = 32;
    const int lane_id = threadIdx.x % warp_size;
    const int warp_id = threadIdx.x / warp_size;
    // number of distinct sites handled per warp when the color dot product is split
    const int vector_site_width = warp_size / color_stride;

    int x_cb = blockIdx.x*(blockDim.x/color_stride) + warp_id*(warp_size/color_stride) + lane_id % vector_site_width;

    const int color_offset = lane_id / vector_site_width;

    // for full fields set parity from y thread index else use arg setting
#if 0 // disable multi-src since this has a measurable impact on single src performance
    int paritySrc = blockDim.y*blockIdx.y + threadIdx.y;
    if (paritySrc >= arg.nParity * arg.dim[4]) return;
    const int src_idx = (arg.nParity == 2) ? paritySrc / 2 : paritySrc; // maybe want to swap order or source and parity for improved locality of same parity
    const int parity = (arg.nParity == 2) ? paritySrc % 2 : arg.parity;
#else
    const int src_idx = 0;
    const int parity = (arg.nParity == 2) ? blockDim.y*blockIdx.y + threadIdx.y : arg.parity;
#endif

    // z thread dimension is (( s*(Nc/Mc) + color_block )*dim_thread_split + dim)*2 + dir
    int sMd = blockDim.z*blockIdx.z + threadIdx.z;
    int dir = sMd & 1;
    int sMdim = sMd >> 1;
    int dim = sMdim % dim_thread_split;
    int sM = sMdim / dim_thread_split;
    int s = sM / (Nc/Mc);
    int color_block = (sM % (Nc/Mc)) * Mc;

    // guard the grid tail in the site dimension
    if (x_cb >= arg.volumeCB) return;

    // dispatch the runtime dir/dim to compile-time template arguments
    if (dir == 0) {
      if (dim == 0) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,0>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
      else if (dim == 1) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,1>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
      else if (dim == 2) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,2>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
      else if (dim == 3) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,3>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
    } else if (dir == 1) {
      if (dim == 0) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,0>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
      else if (dim == 1) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,1>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
      else if (dim == 2) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,2>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
      else if (dim == 3) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,3>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
    }
  }
440 
441  template <typename Float, int nDim, int Ns, int Nc, int Mc, bool dslash, bool clover, bool dagger, DslashType type>
442  class DslashCoarse : public Tunable {
443 
444  protected:
445  ColorSpinorField &out;
446  const ColorSpinorField &inA;
447  const ColorSpinorField &inB;
448  const GaugeField &Y;
449  const GaugeField &X;
450  const double kappa;
451  const int parity;
452  const int nParity;
453  const int nSrc;
454 
455 #ifdef EIGHT_WAY_WARP_SPLIT
456  const int max_color_col_stride = 8;
457 #else
458  const int max_color_col_stride = 4;
459 #endif
460  mutable int color_col_stride;
461  mutable int dim_threads;
462  char *saveOut;
463 
464  long long flops() const
465  {
466  return ((dslash*2*nDim+clover*1)*(8*Ns*Nc*Ns*Nc)-2*Ns*Nc)*nParity*(long long)out.VolumeCB();
467  }
468  long long bytes() const
469  {
470  return (dslash||clover) * out.Bytes() + dslash*8*inA.Bytes() + clover*inB.Bytes() +
471  nSrc*nParity*(dslash*Y.Bytes()*Y.VolumeCB()/(2*Y.Stride()) + clover*X.Bytes()/2);
472  }
473  unsigned int sharedBytesPerThread() const { return (sizeof(complex<Float>) * Mc); }
474  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
475  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions
476  bool tuneAuxDim() const { return true; } // Do tune the aux dimensions
477  unsigned int minThreads() const { return color_col_stride * X.VolumeCB(); } // 4-d volume since this x threads only
478  unsigned int maxBlockSize() const { return deviceProp.maxThreadsPerBlock / (dim_threads * 2 * nParity); }
479 
480  bool advanceBlockDim(TuneParam &param) const
481  {
482  dim3 block = param.block;
483  dim3 grid = param.grid;
485  param.block.y = block.y; param.block.z = block.z;
486  param.grid.y = grid.y; param.grid.z = grid.z;
487 
488  if (ret) { // we advanced the block.x so we're done
489  return true;
490  } else { // block.x (spacetime) was reset
491 
492  if (param.block.y < (unsigned int)(nParity * nSrc)) { // advance parity / 5th dimension
493  param.block.y++;
494  param.grid.y = (nParity * nSrc + param.block.y - 1) / param.block.y;
495  return true;
496  } else {
497  // reset parity / 5th dimension
498  param.block.y = 1;
499  param.grid.y = nParity * nSrc;
500 
501  // let's try to advance spin/block-color
502  while(param.block.z <= (unsigned int)(dim_threads * 2 * 2 * (Nc/Mc))) {
503  param.block.z+=dim_threads * 2;
504  if ( (dim_threads*2*2*(Nc/Mc)) % param.block.z == 0) {
505  param.grid.z = (dim_threads * 2 * 2 * (Nc/Mc)) / param.block.z;
506  break;
507  }
508  }
509 
510  // we can advance spin/block-color since this is valid
511  if (param.block.z <= (unsigned int)(dim_threads * 2 * 2 * (Nc/Mc)) &&
512  param.block.z <= (unsigned int)deviceProp.maxThreadsDim[2] ) { //
513  return true;
514  } else { // we have run off the end so let's reset
515  param.block.z = dim_threads * 2;
516  param.grid.z = 2 * (Nc/Mc);
517  return false;
518  }
519  }
520  }
521  }
522 
523  // FIXME: understand why this leads to slower perf and variable correctness
524  //int blockStep() const { return deviceProp.warpSize/4; }
525  //int blockMin() const { return deviceProp.warpSize/4; }
526 
527  // Experimental autotuning of the color column stride
528  bool advanceAux(TuneParam &param) const
529  {
530 
531 #if __COMPUTE_CAPABILITY__ >= 300
532  // we can only split the dot product on Kepler and later since we need the __shfl instruction
533  if (2*param.aux.x <= max_color_col_stride && Nc % (2*param.aux.x) == 0 &&
534  param.block.x % deviceProp.warpSize == 0) {
535  // An x-dimension block size that is not a multiple of the
536  // warp size is incompatible with splitting the dot product
537  // across the warp so we must skip this
538 
539  param.aux.x *= 2; // safe to advance
540  color_col_stride = param.aux.x;
541 
542  // recompute grid size since minThreads() has now been updated
543  param.grid.x = (minThreads()+param.block.x-1)/param.block.x;
544 
545  // check this grid size is valid before returning
546  if (param.grid.x < (unsigned int)deviceProp.maxGridSize[0]) return true;
547  }
548 #endif
549 
550  // reset color column stride if too large or not divisible
551  param.aux.x = 1;
552  color_col_stride = param.aux.x;
553 
554  // recompute grid size since minThreads() has now been updated
555  param.grid.x = (minThreads()+param.block.x-1)/param.block.x;
556 
557  if (2*param.aux.y <= nDim) {
558  param.aux.y *= 2;
559  dim_threads = param.aux.y;
560 
561  // need to reset z-block/grid size/shared_bytes since dim_threads has changed
562  param.block.z = dim_threads * 2;
563  param.grid.z = 2* (Nc / Mc);
564 
565  param.shared_bytes = sharedBytesPerThread()*param.block.x*param.block.y*param.block.z > sharedBytesPerBlock(param) ?
566  sharedBytesPerThread()*param.block.x*param.block.y*param.block.z : sharedBytesPerBlock(param);
567 
568  return true;
569  } else {
570  param.aux.y = 1;
571  dim_threads = param.aux.y;
572 
573  // need to reset z-block/grid size/shared_bytes since
574  // dim_threads has changed. Strictly speaking this isn't needed
575  // since this is the outer dimension to tune, but would be
576  // needed if we added an aux.z tuning dimension
577  param.block.z = dim_threads * 2;
578  param.grid.z = 2* (Nc / Mc);
579 
580  param.shared_bytes = sharedBytesPerThread()*param.block.x*param.block.y*param.block.z > sharedBytesPerBlock(param) ?
581  sharedBytesPerThread()*param.block.x*param.block.y*param.block.z : sharedBytesPerBlock(param);
582 
583  return false;
584  }
585  }
586 
587  virtual void initTuneParam(TuneParam &param) const
588  {
589  param.aux = make_int4(1,1,1,1);
590  color_col_stride = param.aux.x;
591  dim_threads = param.aux.y;
592 
594  param.block.y = 1;
595  param.grid.y = nParity * nSrc;
596  param.block.z = dim_threads * 2;
597  param.grid.z = 2*(Nc/Mc);
598  param.shared_bytes = sharedBytesPerThread()*param.block.x*param.block.y*param.block.z > sharedBytesPerBlock(param) ?
599  sharedBytesPerThread()*param.block.x*param.block.y*param.block.z : sharedBytesPerBlock(param);
600  }
601 
603  virtual void defaultTuneParam(TuneParam &param) const
604  {
605  param.aux = make_int4(1,1,1,1);
606  color_col_stride = param.aux.x;
607  dim_threads = param.aux.y;
608 
610  // ensure that the default x block size is divisible by the warpSize
611  param.block.x = deviceProp.warpSize;
612  param.grid.x = (minThreads()+param.block.x-1)/param.block.x;
613  param.block.y = 1;
614  param.grid.y = nParity * nSrc;
615  param.block.z = dim_threads * 2;
616  param.grid.z = 2*(Nc/Mc);
617  param.shared_bytes = sharedBytesPerThread()*param.block.x*param.block.y*param.block.z > sharedBytesPerBlock(param) ?
618  sharedBytesPerThread()*param.block.x*param.block.y*param.block.z : sharedBytesPerBlock(param);
619  }
620 
621  public:
622  inline DslashCoarse(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB,
623  const GaugeField &Y, const GaugeField &X, double kappa, int parity, MemoryLocation *halo_location)
624  : out(out), inA(inA), inB(inB), Y(Y), X(X), kappa(kappa), parity(parity),
625  nParity(out.SiteSubset()), nSrc(out.Ndim()==5 ? out.X(4) : 1)
626  {
627  strcpy(aux, "policy_kernel,");
628  strcat(aux, out.AuxString());
630 
631  // record the location of where each pack buffer is in [2*dim+dir] ordering
632  // 0 - no packing
633  // 1 - pack to local GPU memory
634  // 2 - pack to local mapped CPU memory
635  // 3 - pack to remote mapped GPU memory
636  switch(type) {
637  case DSLASH_INTERIOR: strcat(aux,",interior"); break;
638  case DSLASH_EXTERIOR: strcat(aux,",exterior"); break;
639  case DSLASH_FULL: strcat(aux,",full"); break;
640  }
641 
642  if (doHalo<type>()) {
643  char label[15] = ",halo=";
644  for (int dim=0; dim<4; dim++) {
645  for (int dir=0; dir<2; dir++) {
646  label[2*dim+dir+6] = !comm_dim_partitioned(dim) ? '0' : halo_location[2*dim+dir] == Device ? '1' : halo_location[2*dim+dir] == Host ? '2' : '3';
647  }
648  }
649  label[14] = '\0';
650  strcat(aux,label);
651  }
652  }
653  virtual ~DslashCoarse() { }
654 
655  inline void apply(const cudaStream_t &stream) {
656 
657  if (out.Location() == QUDA_CPU_FIELD_LOCATION) {
658 
659  if (out.FieldOrder() != QUDA_SPACE_SPIN_COLOR_FIELD_ORDER || Y.FieldOrder() != QUDA_QDP_GAUGE_ORDER)
660  errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", inA.FieldOrder(), Y.FieldOrder());
661 
662  DslashCoarseArg<Float,Ns,Nc,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER,QUDA_QDP_GAUGE_ORDER> arg(out, inA, inB, Y, X, (Float)kappa, parity);
663  coarseDslash<Float,nDim,Ns,Nc,Mc,dslash,clover,dagger,type>(arg);
664  } else {
665 
666  const TuneParam &tp = tuneLaunch(*this, getTuning(), QUDA_VERBOSE /*getVerbosity()*/);
667 
668  if (out.FieldOrder() != QUDA_FLOAT2_FIELD_ORDER || Y.FieldOrder() != QUDA_FLOAT2_GAUGE_ORDER)
669  errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", inA.FieldOrder(), Y.FieldOrder());
670 
671  DslashCoarseArg<Float,Ns,Nc,QUDA_FLOAT2_FIELD_ORDER,QUDA_FLOAT2_GAUGE_ORDER> arg(out, inA, inB, Y, X, (Float)kappa, parity);
672 
673  switch (tp.aux.y) { // dimension gather parallelisation
674  case 1:
675  switch (tp.aux.x) { // this is color_col_stride
676  case 1:
677  coarseDslashKernel<Float,nDim,Ns,Nc,Mc,1,1,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
678  break;
679  case 2:
680  coarseDslashKernel<Float,nDim,Ns,Nc,Mc,2,1,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
681  break;
682  case 4:
683  coarseDslashKernel<Float,nDim,Ns,Nc,Mc,4,1,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
684  break;
685 #ifdef EIGHT_WAY_WARP_SPLIT
686  case 8:
687  coarseDslashKernel<Float,nDim,Ns,Nc,Mc,8,1,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
688  break;
689 #endif
690  default:
691  errorQuda("Color column stride %d not valid", tp.aux.x);
692  }
693  break;
694  case 2:
695  switch (tp.aux.x) { // this is color_col_stride
696  case 1:
697  coarseDslashKernel<Float,nDim,Ns,Nc,Mc,1,2,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
698  break;
699  case 2:
700  coarseDslashKernel<Float,nDim,Ns,Nc,Mc,2,2,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
701  break;
702  case 4:
703  coarseDslashKernel<Float,nDim,Ns,Nc,Mc,4,2,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
704  break;
705 #ifdef EIGHT_WAY_WARP_SPLIT
706  case 8:
707  coarseDslashKernel<Float,nDim,Ns,Nc,Mc,8,2,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
708  break;
709 #endif
710  default:
711  errorQuda("Color column stride %d not valid", tp.aux.x);
712  }
713  break;
714  case 4:
715  switch (tp.aux.x) { // this is color_col_stride
716  case 1:
717  coarseDslashKernel<Float,nDim,Ns,Nc,Mc,1,4,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
718  break;
719  case 2:
720  coarseDslashKernel<Float,nDim,Ns,Nc,Mc,2,4,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
721  break;
722  case 4:
723  coarseDslashKernel<Float,nDim,Ns,Nc,Mc,4,4,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
724  break;
725 #ifdef EIGHT_WAY_WARP_SPLIT
726  case 8:
727  coarseDslashKernel<Float,nDim,Ns,Nc,Mc,8,4,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
728  break;
729 #endif
730  default:
731  errorQuda("Color column stride %d not valid", tp.aux.x);
732  }
733  break;
734  default:
735  errorQuda("Invalid dimension thread splitting %d", tp.aux.y);
736  }
737  }
738  }
739 
740  TuneKey tuneKey() const {
741  return TuneKey(out.VolString(), typeid(*this).name(), aux);
742  }
743 
744  void preTune() {
745  saveOut = new char[out.Bytes()];
746  cudaMemcpy(saveOut, out.V(), out.Bytes(), cudaMemcpyDeviceToHost);
747  }
748 
749  void postTune()
750  {
751  cudaMemcpy(out.V(), saveOut, out.Bytes(), cudaMemcpyHostToDevice);
752  delete[] saveOut;
753  }
754 
755  };
756 
757 
  /**
     @brief Instantiate the DslashCoarse functor for the requested
     runtime combination of dagger/dslash/clover flags (only the
     DSLASH_FULL type is instantiated for the hopping term) and apply
     it on the default stream.  dslash=false, clover=false is invalid,
     as is requesting only the exterior of the pure clover operator.
  */
  template <typename Float, int coarseColor, int coarseSpin>
  inline void ApplyCoarse(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB,
			  const GaugeField &Y, const GaugeField &X, double kappa, int parity, bool dslash,
			  bool clover, bool dagger, DslashType type, MemoryLocation *halo_location) {

    const int colors_per_thread = 1; // Mc: number of colors handled per thread
    const int nDim = 4;

    if (dagger) {
      if (dslash) {
	if (clover) {
	  if (type == DSLASH_FULL) {
	    DslashCoarse<Float,nDim,coarseSpin,coarseColor,colors_per_thread,true,true,true,DSLASH_FULL> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
	    dslash.apply(0);
	  } else { errorQuda("Dslash type %d not instantiated", type); }
	} else {
	  if (type == DSLASH_FULL) {
	    DslashCoarse<Float,nDim,coarseSpin,coarseColor,colors_per_thread,true,false,true,DSLASH_FULL> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
	    dslash.apply(0);
	  } else { errorQuda("Dslash type %d not instantiated", type); }
	}
      } else {
	if (type == DSLASH_EXTERIOR) errorQuda("Cannot call halo on pure clover kernel");
	if (clover) {
	  DslashCoarse<Float,nDim,coarseSpin,coarseColor,colors_per_thread,false,true,true,DSLASH_FULL> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
	  dslash.apply(0);
	} else {
	  errorQuda("Unsupported dslash=false clover=false");
	}
      }
    } else {
      if (dslash) {
	if (clover) {
	  if (type == DSLASH_FULL) {
	    DslashCoarse<Float,nDim,coarseSpin,coarseColor,colors_per_thread,true,true,false,DSLASH_FULL> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
	    dslash.apply(0);
	  } else { errorQuda("Dslash type %d not instantiated", type); }
	} else {
	  if (type == DSLASH_FULL) {
	    DslashCoarse<Float,nDim,coarseSpin,coarseColor,colors_per_thread,true,false,false,DSLASH_FULL> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
	    dslash.apply(0);
	  } else { errorQuda("Dslash type %d not instantiated", type); }
	}
      } else {
	if (type == DSLASH_EXTERIOR) errorQuda("Cannot call halo on pure clover kernel");
	if (clover) {
	  DslashCoarse<Float,nDim,coarseSpin,coarseColor,colors_per_thread,false,true,false,DSLASH_FULL> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
	  dslash.apply(0);
	} else {
	  errorQuda("Unsupported dslash=false clover=false");
	}
      }
    }
  }
812 
813  // template on the number of coarse colors
814  template <typename Float>
815  inline void ApplyCoarse(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB,
816  const GaugeField &Y, const GaugeField &X, double kappa, int parity, bool dslash,
817  bool clover, bool dagger, DslashType type, MemoryLocation *halo_location) {
818 
819  if (Y.FieldOrder() != X.FieldOrder())
820  errorQuda("Field order mismatch Y = %d, X = %d", Y.FieldOrder(), X.FieldOrder());
821 
822  if (inA.FieldOrder() != out.FieldOrder())
823  errorQuda("Field order mismatch inA = %d, out = %d", inA.FieldOrder(), out.FieldOrder());
824 
825  if (inA.Nspin() != 2)
826  errorQuda("Unsupported number of coarse spins %d\n",inA.Nspin());
827 
828  if (inA.Ncolor() == 2) {
829  ApplyCoarse<Float,2,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
830 #if 0
831  } else if (inA.Ncolor() == 4) {
832  ApplyCoarse<Float,4,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
833  } else if (inA.Ncolor() == 8) {
834  ApplyCoarse<Float,8,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
835  } else if (inA.Ncolor() == 12) {
836  ApplyCoarse<Float,12,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
837  } else if (inA.Ncolor() == 16) {
838  ApplyCoarse<Float,16,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
839  } else if (inA.Ncolor() == 20) {
840  ApplyCoarse<Float,20,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
841 #endif
842  } else if (inA.Ncolor() == 24) {
843  ApplyCoarse<Float,24,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
844 #if 0
845  } else if (inA.Ncolor() == 28) {
846  ApplyCoarse<Float,28,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
847 #endif
848  } else if (inA.Ncolor() == 32) {
849  ApplyCoarse<Float,32,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
850  } else {
851  errorQuda("Unsupported number of coarse dof %d\n", Y.Ncolor());
852  }
853  }
854 
855  // this is the Worker pointer that may issue additional work
856  // while we're waiting on communication to finish
857  namespace dslash {
858  extern Worker* aux_worker;
859  }
860 
861 #endif // GPU_MULTIGRID
862 
864  DSLASH_COARSE_BASIC, // stage both sends and recvs in host memory using memcpys
865  DSLASH_COARSE_ZERO_COPY_PACK, // zero copy write pack buffers
866  DSLASH_COARSE_ZERO_COPY_READ, // zero copy read halos in dslash kernel
867  DSLASH_COARSE_ZERO_COPY, // full zero copy
870  DSLASH_COARSE_GDR, // full GDR
871  DSLASH_COARSE_ZERO_COPY_PACK_GDR_RECV, // zero copy write and GDR recv
872  DSLASH_COARSE_GDR_SEND_ZERO_COPY_READ // GDR send and zero copy read
873  };
874 
876 
880  const GaugeField &Y;
881  const GaugeField &X;
882  double kappa;
883  int parity;
884  bool dslash;
885  bool clover;
886  bool dagger;
887 
889  const GaugeField &Y, const GaugeField &X, double kappa, int parity, bool dslash, bool clover, bool dagger)
891 
896 #ifdef GPU_MULTIGRID
897  if (inA.V() == out.V()) errorQuda("Aliasing pointers");
898 
899  // check all precisions match
900  QudaPrecision precision = checkPrecision(out, inA, inB, Y, X);
901 
902  // check all locations match
903  checkLocation(out, inA, inB, Y, X);
904 
905  MemoryLocation pack_destination[2*QUDA_MAX_DIM]; // where we will pack the ghost buffer to
906  MemoryLocation halo_location[2*QUDA_MAX_DIM]; // where we load the halo from
907  for (int i=0; i<2*QUDA_MAX_DIM; i++) {
908  pack_destination[i] = (policy == DSLASH_COARSE_ZERO_COPY_PACK || policy == DSLASH_COARSE_ZERO_COPY ||
912  }
913  bool gdr_send = (policy == DSLASH_COARSE_GDR_SEND || policy == DSLASH_COARSE_GDR ||
915  bool gdr_recv = (policy == DSLASH_COARSE_GDR_RECV || policy == DSLASH_COARSE_GDR ||
917 
918  if (dslash && comm_partitioned()) {
919  const int nFace = 1;
920  inA.exchangeGhost((QudaParity)(1-parity), nFace, dagger, pack_destination, halo_location, gdr_send, gdr_recv);
921  }
922 
924 
925  if (precision == QUDA_DOUBLE_PRECISION) {
926 #ifdef GPU_MULTIGRID_DOUBLE
927  ApplyCoarse<double>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, DSLASH_FULL, halo_location);
928  //if (dslash && comm_partitioned()) ApplyCoarse<double>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, true, halo_location);
929 #else
930  errorQuda("Double precision multigrid has not been enabled");
931 #endif
932  } else if (precision == QUDA_SINGLE_PRECISION) {
933  ApplyCoarse<float>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, DSLASH_FULL, halo_location);
934  //if (dslash && comm_partitioned()) ApplyCoarse<float>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, true, halo_location);
935  } else {
936  errorQuda("Unsupported precision %d\n", Y.Precision());
937  }
938 
940 #else
941  errorQuda("Multigrid has not been built");
942 #endif
943  }
944 
945  };
946 
947  // hooks into tune.cpp variables for policy tuning
948  typedef std::map<TuneKey, TuneParam> map;
949  const map& getTuneCache();
950 
951  void disableProfileCount();
952  void enableProfileCount();
953  void setPolicyTuning(bool);
954 
955  static bool dslash_init = false;
956  static std::vector<DslashCoarsePolicy> policy;
957  static int config = 0; // 3-bit number used to record the machine config (p2p / gdr) and if this changes we will force a retune
958 
960 
962 
963  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
964  bool tuneAuxDim() const { return true; } // Do tune the aux dimensions.
965  unsigned int sharedBytesPerThread() const { return 0; }
966  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
967 
968  public:
970  {
971  strcpy(aux,"policy,");
972  if (dslash.dslash) strcat(aux,"dslash");
973  strcat(aux, dslash.clover ? "clover," : ",");
974  strcat(aux,dslash.inA.AuxString());
976 
977  if (!dslash_init) {
978  policy.reserve(9);
979  static char *dslash_policy_env = getenv("QUDA_ENABLE_DSLASH_COARSE_POLICY");
980 
981  if (dslash_policy_env) { // set the policies to tune for explicitly
982  std::stringstream policy_list(dslash_policy_env);
983 
984  int policy_;
985  while (policy_list >> policy_) {
986  DslashCoarsePolicy dslash_policy = static_cast<DslashCoarsePolicy>(policy_);
987 
988  // check this is a valid policy choice
989  if ( (dslash_policy == DSLASH_COARSE_GDR_SEND || dslash_policy == DSLASH_COARSE_GDR_RECV ||
990  dslash_policy == DSLASH_COARSE_GDR || dslash_policy == DSLASH_COARSE_ZERO_COPY_PACK_GDR_RECV ||
991  dslash_policy == DSLASH_COARSE_GDR_SEND_ZERO_COPY_READ) && !comm_gdr_enabled() ) {
992  errorQuda("Cannot select a GDR policy %d unless QUDA_ENABLE_GDR is set", dslash_policy);
993  }
994 
995  policy.push_back(static_cast<DslashCoarsePolicy>(policy_));
996  if (policy_list.peek() == ',') policy_list.ignore();
997  }
998  } else {
999  policy.push_back(DSLASH_COARSE_BASIC);
1002  policy.push_back(DSLASH_COARSE_ZERO_COPY);
1003  if (comm_gdr_enabled()) {
1004  policy.push_back(DSLASH_COARSE_GDR_SEND);
1005  policy.push_back(DSLASH_COARSE_GDR_RECV);
1006  policy.push_back(DSLASH_COARSE_GDR);
1009  }
1010  }
1011 
1012  config += comm_gdr_enabled();
1014  dslash_init = true;
1015  }
1016 
1017  // before we do policy tuning we must ensure the kernel
1018  // constituents have been tuned since we can't do nested tuning
1019  if (getTuning() && getTuneCache().find(tuneKey()) == getTuneCache().end()) {
1021  for (auto &i : policy) dslash(i);
1023  setPolicyTuning(true);
1024  }
1025  }
1026 
1028 
  // Select the tuned communication policy and execute the coarse dslash with
  // it.  Also sanity-checks that the machine configuration (the 3-bit
  // p2p/gdr "config" fingerprint) matches the one recorded when the tunecache
  // was created.  NOTE(review): the stream argument is unused in the visible
  // body; dispatch happens via dslash(policy[tp.aux.x]).
1029  inline void apply(const cudaStream_t &stream) {
1030  TuneParam tp = tuneLaunch(*this, getTuning(), QUDA_DEBUG_VERBOSE /*getVerbosity()*/);
1031 
  // aux.y stored the machine config at tuning time; a mismatch means the
  // cached policy choice may be invalid (e.g. GDR availability changed)
1032  if (config != tp.aux.y) {
1033  errorQuda("Machine configuration (P2P/GDR=%d) changed since tunecache was created (P2P/GDR=%d). Please delete "
1034  "this file or set the QUDA_RESOURCE_PATH environment variable to point to a new path.",
1035  config, tp.aux.y);
1036  }
1037 
  // aux.x indexes into the static policy vector populated in the constructor
1038  if (tp.aux.x >= (int)policy.size()) errorQuda("Requested policy that is outside of range");
1039  dslash(policy[tp.aux.x]);
1040  }
1041 
1042  int tuningIter() const { return 10; }
1043 
1045  {
1046  if ((unsigned)param.aux.x < policy.size()-1) {
1047  param.aux.x++;
1048  return true;
1049  } else {
1050  param.aux.x = 0;
1051  return false;
1052  }
1053  }
1054 
1056 
1059  param.aux.x = 0; param.aux.y = config; param.aux.z = 0; param.aux.w = 0;
1060  }
1061 
1064  param.aux.x = 0; param.aux.y = config; param.aux.z = 0; param.aux.w = 0;
1065  }
1066 
1067  TuneKey tuneKey() const {
1068  return TuneKey(dslash.inA.VolString(), typeid(*this).name(), aux);
1069  }
1070 
  // Flop count for one application of the coarse operator: each of the 2*nDim
  // hopping directions (if dslash) plus the clover term (if clover)
  // contributes a complex (Ns*Nc)x(Ns*Nc) matrix-vector product (8 flops per
  // complex multiply-add), minus 2*Ns*Nc to correct for the first
  // non-accumulating multiply, scaled by the number of sites.
1071  long long flops() const {
1072  int nDim = 4;
1073  int Ns = dslash.inA.Nspin();
1074  int Nc = dslash.inA.Ncolor();
1075  int nParity = dslash.inA.SiteSubset();
1076  int volumeCB = dslash.inA.VolumeCB();
1077  return ((dslash.dslash*2*nDim+dslash.clover*1)*(8*Ns*Nc*Ns*Nc)-2*Ns*Nc)*nParity*volumeCB;
1078  }
1079 
  // Byte count for one application: output written once if any term applied;
  // the dslash term reads inA once per direction (8 = 2*nDim neighbors) and Y,
  // the clover term reads inB and X.  The Y contribution is rescaled by
  // VolumeCB/Stride to exclude padding bytes from the count.
1080  long long bytes() const {
1081  int nParity = dslash.inA.SiteSubset();
1082  return (dslash.dslash||dslash.clover) * dslash.out.Bytes() +
1083  dslash.dslash*8*dslash.inA.Bytes() + dslash.clover*dslash.inB.Bytes() +
1084  nParity*(dslash.dslash*dslash.Y.Bytes()*dslash.Y.VolumeCB()/(2*dslash.Y.Stride())
1085  + dslash.clover*dslash.X.Bytes()/2);
1086  // multiply Y by volume / stride to correct for pad
1087  }
1088  };
1089 
1090 
1091  //Apply the coarse Dirac matrix to a coarse grid vector
1092  //out(x) = M*in = X*in - kappa*\sum_mu Y_{-\mu}(x)in(x+mu) + Y^\dagger_mu(x-mu)in(x-mu)
1093  // or
1094  //out(x) = M^dagger*in = X^dagger*in - kappa*\sum_mu Y^\dagger_{-\mu}(x)in(x+mu) + Y_mu(x-mu)in(x-mu)
1095  //Uses the kappa normalization for the Wilson operator.
1097  const GaugeField &Y, const GaugeField &X, double kappa, int parity, bool dslash, bool clover, bool dagger) {
1098 
1099  DslashCoarseLaunch Dslash(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger);
1100 
1102  policy.apply(0);
1103 
1104  }//ApplyCoarse
1105 
1106 
1107 } // namespace quda
virtual void apply(const cudaStream_t &stream)=0
dim3 dim3 blockDim
void operator()(DslashCoarsePolicy policy)
Execute the coarse dslash using the given policy.
const GaugeField & X
enum QudaPrecision_s QudaPrecision
static bool dslash_init
const char * comm_dim_partitioned_string()
Return a string that defines the comm partitioning (used as a tuneKey)
Definition: comm_mpi.cpp:342
cudaDeviceProp deviceProp
void disableProfileCount()
Definition: tune.cpp:107
#define checkPrecision(...)
const ColorSpinorField & inB
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
int comm_partitioned()
Loop over comm_dim_partitioned(dim) for all comms dimensions.
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
char * strcpy(char *__dst, const char *__src)
unsigned int sharedBytesPerThread() const
unsigned int sharedBytesPerBlock(const TuneParam &param) const
char * strcat(char *__s1, const char *__s2)
DslashCoarseLaunch(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB, const GaugeField &Y, const GaugeField &X, double kappa, int parity, bool dslash, bool clover, bool dagger)
void enableProfileCount()
Definition: tune.cpp:108
DslashCoarseLaunch & dslash
size_t size_t offset
QudaGaugeParam param
Definition: pack_test.cpp:17
DslashCoarsePolicy
static int bufferIndex
VOLATILE spinorFloat kappa
virtual void exchangeGhost(QudaParity parity, int nFace, int dagger, const MemoryLocation *pack_destination=nullptr, const MemoryLocation *halo_location=nullptr, bool gdr_send=false, bool gdr_recv=false) const =0
Worker * aux_worker
Definition: dslash_quda.cu:78
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
int src_idx
for(int s=0;s< param.dc.Ls;s++)
ColorSpinorField & out
int commDim(int)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define checkLocation(...)
static std::vector< DslashCoarsePolicy > policy
Main header file for host and device accessors to GaugeFields.
bool advanceAux(TuneParam &param) const
enum QudaParity_s QudaParity
void ApplyCoarse(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB, const GaugeField &Y, const GaugeField &X, double kappa, int parity=QUDA_INVALID_PARITY, bool dslash=true, bool clover=true, bool dagger=false)
void setPolicyTuning(bool)
Definition: tune.cpp:457
std::map< TuneKey, TuneParam > map
void apply(const cudaStream_t &stream)
cpuColorSpinorField * out
static int config
if(err !=cudaSuccess)
const GaugeField & Y
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
const ColorSpinorField & inA
const void * c
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:230
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
void defaultTuneParam(TuneParam &param) const
const map & getTuneCache()
Definition: tune.cpp:110
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
virtual bool advanceBlockDim(TuneParam &param) const
Definition: tune_quda.h:102
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
bool advanceTuneParam(TuneParam &param) const
static __inline__ size_t size_t d
QudaPrecision Precision() const
static __device__ __host__ int linkIndexP1(const int x[], const I X[4], const int mu)
void initTuneParam(TuneParam &param) const
QudaParity parity
Definition: covdev_test.cpp:53
DslashCoarsePolicyTune(DslashCoarseLaunch &dslash)
int comm_peer2peer_enabled_global()
char * getenv(const char *)
QudaFieldOrder FieldOrder() const
__syncthreads()
char aux[TuneKey::aux_n]
Definition: tune_quda.h:189
unsigned long long bytes
Definition: blas_quda.cu:43
int comm_dim_partitioned(int dim)
cudaEvent_t cudaEvent_t end
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:254