coarse_op.cuh
1 #include <tune_quda.h>
2 
3 #include <jitify_helper.cuh>
5 
6 namespace quda {
7 
8  // For coarsening un-preconditioned operators we use uni-directional
 9  // coarsening to reduce the setup cost. For debugging we can force
10  // bi-directional coarsening.
11  static bool bidirectional_debug = false;
12 
 13  enum ComputeType {
 14  COMPUTE_UV,
 15  COMPUTE_AV,
 16  COMPUTE_TMAV,
 17  COMPUTE_TMCAV,
 18  COMPUTE_CLOVER_INV_MAX,
 19  COMPUTE_TWISTED_CLOVER_INV_MAX,
 20  COMPUTE_VUV,
 21  COMPUTE_COARSE_CLOVER,
 22  COMPUTE_REVERSE_Y,
 23  COMPUTE_DIAGONAL,
 24  COMPUTE_TMDIAGONAL,
 25  COMPUTE_CONVERT,
 26  COMPUTE_RESCALE,
 27  COMPUTE_INVALID
 28  };
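 // A rough map of the compute types above to what they produce (inferred from the
 // kernels launched in apply() below):
 //   COMPUTE_UV, COMPUTE_VUV                        - form UV = U*V and accumulate Y += V^dagger U V into the atomic coarse links
 //   COMPUTE_AV, COMPUTE_TMAV, COMPUTE_TMCAV        - apply the clover inverse / twist / twisted-clover combination to the null-space vectors
 //   COMPUTE_CLOVER_INV_MAX, COMPUTE_TWISTED_CLOVER_INV_MAX - global absolute maximum of the (twisted) clover inverse (dynamic clover only)
 //   COMPUTE_COARSE_CLOVER, COMPUTE_DIAGONAL, COMPUTE_TMDIAGONAL - build the coarse clover term and add the diagonal (twisted) mass contribution
 //   COMPUTE_REVERSE_Y, COMPUTE_CONVERT, COMPUTE_RESCALE - post-process the coarse links (reverse, convert from atomic format, rescale)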
29 
30  template <bool from_coarse, typename Float, int fineSpin,
31  int fineColor, int coarseSpin, int coarseColor, typename Arg>
32  class CalculateY : public TunableVectorYZ {
33  public:
34 
35  protected:
 36  Arg &arg;
 37  const ColorSpinorField &meta;
 38  GaugeField &Y;
 39  GaugeField &X;
 40  GaugeField &Y_atomic;
 41  GaugeField &X_atomic;
 42 
 43  int dim;
 44  QudaDirection dir;
 45  ComputeType type;
 46 
47  long long flops() const
48  {
49  long long flops_ = 0;
50  switch (type) {
51  case COMPUTE_UV:
52  // when fine operator is coarse take into account that the link matrix has spin dependence
53  flops_ = 2l * arg.fineVolumeCB * 8 * fineSpin * coarseColor * fineColor * fineColor * (!from_coarse ? 1 : fineSpin);
54  break;
55  case COMPUTE_AV:
56  case COMPUTE_TMAV:
57  // # chiral blocks * size of chiral block * number of null space vectors
58  flops_ = 2l * arg.fineVolumeCB * 8 * (fineSpin/2) * (fineSpin/2) * (fineSpin/2) * fineColor * fineColor * coarseColor;
59  break;
60  case COMPUTE_TMCAV:
61  // # Twice chiral blocks * size of chiral block * number of null space vectors
62  flops_ = 4l * arg.fineVolumeCB * 8 * (fineSpin/2) * (fineSpin/2) * (fineSpin/2) * fineColor * fineColor * coarseColor;
63  break;
64  case COMPUTE_VUV:
65  // when the fine operator is truly fine the VUV multiplication is block sparse which halves the number of operations
66  flops_ = 2l * arg.fineVolumeCB * 8 * fineSpin * fineSpin * coarseColor * coarseColor * fineColor / (!from_coarse ? coarseSpin : 1);
67  break;
 68  case COMPUTE_COARSE_CLOVER:
 69  // when the fine operator is truly fine the clover multiplication is block sparse which halves the number of operations
70  flops_ = 2l * arg.fineVolumeCB * 8 * fineSpin * fineSpin * coarseColor * coarseColor * fineColor * fineColor / (!from_coarse ? coarseSpin : 1);
71  break;
72  case COMPUTE_REVERSE_Y:
73  case COMPUTE_CONVERT:
74  case COMPUTE_RESCALE:
75  case COMPUTE_CLOVER_INV_MAX: // FIXME
76  case COMPUTE_TWISTED_CLOVER_INV_MAX: // FIXME
77  // no floating point operations
78  flops_ = 0;
79  break;
80  case COMPUTE_DIAGONAL:
81  case COMPUTE_TMDIAGONAL:
 82  // real addition on the diagonal
83  flops_ = 2l * arg.coarseVolumeCB*coarseSpin*coarseColor;
84  break;
85  default:
86  errorQuda("Undefined compute type %d", type);
87  }
88  // 2 from parity, 8 from complex
89  return flops_;
90  }
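 // Worked example of the flop counts above (illustrative numbers only; they assume a
 // first-level coarsening with from_coarse = false, fineSpin = 4, fineColor = 3 and
 // coarseColor = 24, none of which is fixed by this file):
 //   COMPUTE_UV : flops_ = 2 * fineVolumeCB * 8 * 4 * 24 * 3 * 3         =  13824 * fineVolumeCB
 //   COMPUTE_VUV: flops_ = 2 * fineVolumeCB * 8 * 4 * 4 * 24 * 24 * 3 / 2 = 221184 * fineVolumeCB
 // The leading 2 counts both parities and the 8 counts the operations in a complex multiply-add.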
91  long long bytes() const
92  {
93  long long bytes_ = 0;
94  switch (type) {
95  case COMPUTE_UV:
96  bytes_ = arg.UV.Bytes() + arg.V.Bytes() + 2*arg.U.Bytes()*coarseColor;
97  break;
98  case COMPUTE_AV:
99  bytes_ = arg.AV.Bytes() + arg.V.Bytes() + 2*arg.C.Bytes()*coarseColor;
100  break;
101  case COMPUTE_TMAV:
102  bytes_ = arg.AV.Bytes() + arg.V.Bytes();
103  break;
104  case COMPUTE_TMCAV:
105 #ifdef DYNAMIC_CLOVER
106  bytes_ = arg.AV.Bytes() + arg.V.Bytes() + 2*arg.C.Bytes()*coarseColor; // A single clover field
107 #else
108  bytes_ = arg.AV.Bytes() + arg.V.Bytes() + 4*arg.C.Bytes()*coarseColor; // Both clover and its inverse
109 #endif
110  break;
 111  case COMPUTE_CLOVER_INV_MAX:
 112  case COMPUTE_TWISTED_CLOVER_INV_MAX:
 113  bytes_ = 2*arg.C.Bytes(); // read both parities of the clover field
114  break;
115  case COMPUTE_VUV:
116  {
117  // formula for shared-atomic variant assuming parity_flip = true
118  int writes = 4;
 119  // we use a (coarseColor * coarseColor) matrix of threads so each input element is loaded coarseColor times
120  // we ignore the multiple loads of spin since these are per thread (and should be cached?)
121  bytes_ = 2*writes*arg.Y.Bytes() + (arg.bidirectional ? 1 : 2) * 2*writes*arg.X.Bytes() + coarseColor*(arg.UV.Bytes() + arg.V.Bytes());
122  break;
123  }
 124  case COMPUTE_COARSE_CLOVER:
 125  bytes_ = 2*arg.X.Bytes() + 2*arg.C.Bytes() + arg.V.Bytes(); // 2 from parity
126  break;
127  case COMPUTE_REVERSE_Y:
 128  bytes_ = 4*2*2*arg.Y.Bytes(); break; // 4 from direction, 2 from i/o, 2 from parity
129  case COMPUTE_DIAGONAL:
130  case COMPUTE_TMDIAGONAL:
131  bytes_ = 2*2*arg.X.Bytes(); // 2 from i/o, 2 from parity
132  break;
133  case COMPUTE_CONVERT:
134  bytes_ = dim == 4 ? 2*(arg.X.Bytes() + arg.X_atomic.Bytes()) : 2*(arg.Y.Bytes() + arg.Y_atomic.Bytes());
135  break;
136  case COMPUTE_RESCALE:
137  bytes_ = 2*2*arg.Y.Bytes(); // 2 from i/o, 2 from parity
138  break;
139  default:
140  errorQuda("Undefined compute type %d", type);
141  }
142  return bytes_;
143  }
144 
145  unsigned int minThreads() const {
146  unsigned int threads = 0;
147  switch (type) {
148  case COMPUTE_UV:
149  case COMPUTE_AV:
150  case COMPUTE_TMAV:
151  case COMPUTE_TMCAV:
 152  case COMPUTE_CLOVER_INV_MAX:
 153  case COMPUTE_TWISTED_CLOVER_INV_MAX:
 154  case COMPUTE_VUV:
 155  case COMPUTE_COARSE_CLOVER:
 156  threads = arg.fineVolumeCB;
157  break;
158  case COMPUTE_REVERSE_Y:
159  case COMPUTE_DIAGONAL:
160  case COMPUTE_TMDIAGONAL:
161  case COMPUTE_CONVERT:
162  case COMPUTE_RESCALE:
163  threads = arg.coarseVolumeCB;
164  break;
165  default:
166  errorQuda("Undefined compute type %d", type);
167  }
168  return threads;
169  }
170 
171  bool tuneGridDim() const { return false; } // don't tune the grid dimension
 172  bool tuneAuxDim() const { return type == COMPUTE_VUV; }
173 
174  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
175  if (arg.shared_atomic && type == COMPUTE_VUV) return 4*sizeof(storeType)*max_color_per_block*max_color_per_block*4*coarseSpin*coarseSpin;
 176  return TunableVectorYZ::sharedBytesPerBlock(param);
 177  }
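 // Worked example of the shared-atomic VUV footprint above (illustrative only; it assumes
 // a 4-byte storeType, max_color_per_block = 8 and coarseSpin = 2, none of which is
 // defined in this file): 4 * 4 * 8 * 8 * 4 * 2 * 2 = 16384 bytes per block, i.e. a
 // complex accumulator tile spanning the coarse-color block and both coarse spins.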
178 
179  public:
180  CalculateY(Arg &arg, const ColorSpinorField &meta, GaugeField &Y, GaugeField &X, GaugeField &Y_atomic, GaugeField &X_atomic)
181  : TunableVectorYZ(2,1), arg(arg), type(COMPUTE_INVALID),
182  meta(meta), Y(Y), X(X), Y_atomic(Y_atomic), X_atomic(X_atomic), dim(0), dir(QUDA_BACKWARDS)
183  {
184  if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
185 #ifdef JITIFY
186  create_jitify_program("kernels/coarse_op_kernel.cuh");
187 #endif
188  }
189  strcpy(aux, compile_type_str(meta));
190  strcat(aux, meta.AuxString());
191  strcat(aux, comm_dim_partitioned_string());
192  if (meta.Location() == QUDA_CPU_FIELD_LOCATION) strcat(aux, getOmpThreadStr());
193  }
194  virtual ~CalculateY() { }
195 
196  void apply(const cudaStream_t &stream) {
197  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
198 
199  if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
200 
201  if (type == COMPUTE_UV) {
202 
203  if (dir == QUDA_BACKWARDS) {
204  if (dim==0) ComputeUVCPU<from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
205  else if (dim==1) ComputeUVCPU<from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
206  else if (dim==2) ComputeUVCPU<from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
207  else if (dim==3) ComputeUVCPU<from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
208  } else if (dir == QUDA_FORWARDS) {
209  if (dim==0) ComputeUVCPU<from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
210  else if (dim==1) ComputeUVCPU<from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
211  else if (dim==2) ComputeUVCPU<from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
212  else if (dim==3) ComputeUVCPU<from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
213  } else {
214  errorQuda("Undefined direction %d", dir);
215  }
216 
217  } else if (type == COMPUTE_AV) {
218 
219  if (from_coarse) errorQuda("ComputeAV should only be called from the fine grid");
220 #if defined(GPU_CLOVER_DIRAC) && !defined(COARSECOARSE)
221  ComputeAVCPU<Float,fineSpin,fineColor,coarseColor>(arg);
222 #else
223  errorQuda("Clover dslash has not been built");
224 #endif
225 
226  } else if (type == COMPUTE_TMAV) {
227 
228  if (from_coarse) errorQuda("ComputeTMAV should only be called from the fine grid");
229 #if defined(GPU_TWISTED_MASS_DIRAC) && !defined(COARSECOARSE)
230  ComputeTMAVCPU<Float,fineSpin,fineColor,coarseColor>(arg);
231 #else
232  errorQuda("Twisted mass dslash has not been built");
233 #endif
234 
235  } else if (type == COMPUTE_TMCAV) {
236 
237  if (from_coarse) errorQuda("ComputeTMCAV should only be called from the fine grid");
238 #if defined(GPU_TWISTED_CLOVER_DIRAC) && !defined(COARSECOARSE)
239  ComputeTMCAVCPU<Float,fineSpin,fineColor,coarseColor>(arg);
240 #else
241  errorQuda("Twisted clover dslash has not been built");
242 #endif
243 
244  } else if (type == COMPUTE_CLOVER_INV_MAX) {
245 
246  if (from_coarse) errorQuda("ComputeInvCloverMax should only be called from the fine grid");
247 #if defined(DYNAMIC_CLOVER) && !defined(COARSECOARSE)
248  ComputeCloverInvMaxCPU<Float, false>(arg);
249 #else
250  errorQuda("ComputeInvCloverMax only enabled with dynamic clover");
251 #endif
252 
253  } else if (type == COMPUTE_TWISTED_CLOVER_INV_MAX) {
254 
255  if (from_coarse) errorQuda("ComputeInvCloverMax should only be called from the fine grid");
256 #if defined(DYNAMIC_CLOVER) && !defined(COARSECOARSE)
257  ComputeCloverInvMaxCPU<Float, true>(arg);
258 #else
259  errorQuda("ComputeInvCloverMax only enabled with dynamic clover");
260 #endif
261 
262  } else if (type == COMPUTE_VUV) {
263 
264  arg.dim_index = 4*(dir==QUDA_BACKWARDS ? 0 : 1) + dim;
265 
266  if (dir == QUDA_BACKWARDS) {
267  if (dim==0) ComputeVUVCPU<from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
268  else if (dim==1) ComputeVUVCPU<from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
269  else if (dim==2) ComputeVUVCPU<from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
270  else if (dim==3) ComputeVUVCPU<from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
271  } else if (dir == QUDA_FORWARDS) {
272  if (dim==0) ComputeVUVCPU<from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
273  else if (dim==1) ComputeVUVCPU<from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
274  else if (dim==2) ComputeVUVCPU<from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
275  else if (dim==3) ComputeVUVCPU<from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
276  } else {
277  errorQuda("Undefined direction %d", dir);
278  }
279 
280  } else if (type == COMPUTE_COARSE_CLOVER) {
281 
282  ComputeCoarseCloverCPU<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>(arg);
283 
284  } else if (type == COMPUTE_REVERSE_Y) {
285 
286  ComputeYReverseCPU<Float,coarseSpin,coarseColor>(arg);
287 
288  } else if (type == COMPUTE_DIAGONAL) {
289 
290  AddCoarseDiagonalCPU<Float,coarseSpin,coarseColor>(arg);
291 
292  } else if (type == COMPUTE_TMDIAGONAL) {
293 
294  AddCoarseTmDiagonalCPU<Float,coarseSpin,coarseColor>(arg);
295 
296  } else if (type == COMPUTE_CONVERT) {
297 
298  arg.dim_index = 4*(dir==QUDA_BACKWARDS ? 0 : 1) + dim;
299  ConvertCPU<Float,coarseSpin,coarseColor>(arg);
300 
301  } else if (type == COMPUTE_RESCALE) {
302 
303  arg.dim_index = 4*(dir==QUDA_BACKWARDS ? 0 : 1) + dim;
304  RescaleYCPU<Float,coarseSpin,coarseColor>(arg);
305 
306  } else {
307  errorQuda("Undefined compute type %d", type);
308  }
309  } else {
310 
311  if (type == COMPUTE_UV) {
312 
313  if (dir != QUDA_BACKWARDS && dir != QUDA_FORWARDS) errorQuda("Undefined direction %d", dir);
314 #ifdef JITIFY
315  using namespace jitify::reflection;
316  jitify_error = program->kernel("quda::ComputeUVGPU")
317  .instantiate(from_coarse,Type<Float>(),dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Type<Arg>())
318  .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
319 #else
320  if (dir == QUDA_BACKWARDS) {
321  if (dim==0) ComputeUVGPU<from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
322  else if (dim==1) ComputeUVGPU<from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
323  else if (dim==2) ComputeUVGPU<from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
324  else if (dim==3) ComputeUVGPU<from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
325  } else if (dir == QUDA_FORWARDS) {
326  if (dim==0) ComputeUVGPU<from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
327  else if (dim==1) ComputeUVGPU<from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
328  else if (dim==2) ComputeUVGPU<from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
329  else if (dim==3) ComputeUVGPU<from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
330  }
331 #endif
332 
333  } else if (type == COMPUTE_AV) {
334 
335  if (from_coarse) errorQuda("ComputeAV should only be called from the fine grid");
336 #ifdef JITIFY
337  using namespace jitify::reflection;
338  jitify_error = program->kernel("quda::ComputeAVGPU")
339  .instantiate(Type<Float>(),fineSpin,fineColor,coarseColor,Type<Arg>())
340  .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
341 #else
342 #if defined(GPU_CLOVER_DIRAC) && !defined(COARSECOARSE)
343  ComputeAVGPU<Float,fineSpin,fineColor,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
344 #else
345  errorQuda("Clover dslash has not been built");
346 #endif
347 #endif
348 
349  } else if (type == COMPUTE_TMAV) {
350 
351  if (from_coarse) errorQuda("ComputeTMAV should only be called from the fine grid");
352 #ifdef JITIFY
353  using namespace jitify::reflection;
354  jitify_error = program->kernel("quda::ComputeTMAVGPU")
355  .instantiate(Type<Float>(),fineSpin,fineColor,coarseColor,Type<Arg>())
356  .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
357 #else
358 #if defined(GPU_TWISTED_MASS_DIRAC) && !defined(COARSECOARSE)
359  ComputeTMAVGPU<Float,fineSpin,fineColor,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
360 #else
361  errorQuda("Twisted mass dslash has not been built");
362 #endif
363 #endif
364 
365  } else if (type == COMPUTE_TMCAV) {
366 
367  if (from_coarse) errorQuda("ComputeTMCAV should only be called from the fine grid");
368 #ifdef JITIFY
369  using namespace jitify::reflection;
370  jitify_error = program->kernel("quda::ComputeTMCAVGPU")
371  .instantiate(Type<Float>(),fineSpin,fineColor,coarseColor,Type<Arg>())
372  .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
373 #else
374 #if defined(GPU_TWISTED_CLOVER_DIRAC) && !defined(COARSECOARSE)
375  ComputeTMCAVGPU<Float,fineSpin,fineColor,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
376 #else
377  errorQuda("Twisted clover dslash has not been built");
378 #endif
379 #endif
380 
381  } else if (type == COMPUTE_CLOVER_INV_MAX) {
382 
383  if (from_coarse) errorQuda("ComputeCloverInvMax should only be called from the fine grid");
384  arg.max_d = static_cast<Float*>(pool_device_malloc(2 * arg.fineVolumeCB *sizeof(Float)));
385 
386 #ifdef JITIFY
387  using namespace jitify::reflection;
388  jitify_error = program->kernel("quda::ComputeCloverInvMaxGPU")
389  .instantiate(Type<Float>(), false, Type<Arg>())
390  .configure(tp.grid, tp.block, tp.shared_bytes, stream)
391  .launch(arg);
392 #else
393 #if defined(DYNAMIC_CLOVER) && !defined(COARSECOARSE)
394  ComputeCloverInvMaxGPU<Float, false><<<tp.grid, tp.block, tp.shared_bytes>>>(arg);
395 #else
396  errorQuda("ComputeCloverInvMax only enabled with dynamic clover");
397 #endif
398 #endif
399 
400  if (!activeTuning()) { // only do reduction once tuning is done else thrust will catch tuning failures
 401  thrust_allocator alloc;
 402  thrust::device_ptr<Float> ptr(arg.max_d);
403  arg.max_h = thrust::reduce(thrust::cuda::par(alloc), ptr, ptr + 2 * arg.fineVolumeCB,
404  static_cast<Float>(0.0), thrust::maximum<Float>());
405  }
406  pool_device_free(arg.max_d);
407 
408  } else if (type == COMPUTE_TWISTED_CLOVER_INV_MAX) {
409 
410  if (from_coarse) errorQuda("ComputeCloverInvMax should only be called from the fine grid");
411  arg.max_d = static_cast<Float *>(pool_device_malloc(2 * arg.fineVolumeCB * sizeof(Float)));
412 
413 #ifdef JITIFY
414  using namespace jitify::reflection;
415  jitify_error = program->kernel("quda::ComputeCloverInvMaxGPU")
416  .instantiate(Type<Float>(), true, Type<Arg>())
417  .configure(tp.grid, tp.block, tp.shared_bytes, stream)
418  .launch(arg);
419 #else
420 #if defined(DYNAMIC_CLOVER) && !defined(COARSECOARSE)
421  ComputeCloverInvMaxGPU<Float, true><<<tp.grid, tp.block, tp.shared_bytes>>>(arg);
422 #else
423  errorQuda("ComputeCloverInvMax only enabled with dynamic clover");
424 #endif
425 #endif
426 
427  if (!activeTuning()) { // only do reduction once tuning is done else thrust will catch tuning failures
 428  thrust_allocator alloc;
 429  thrust::device_ptr<Float> ptr(arg.max_d);
430  arg.max_h = thrust::reduce(thrust::cuda::par(alloc), ptr, ptr+2*arg.fineVolumeCB, static_cast<Float>(0.0), thrust::maximum<Float>());
431  }
432  pool_device_free(arg.max_d);
433 
434  } else if (type == COMPUTE_VUV) {
435 
436  arg.dim_index = 4*(dir==QUDA_BACKWARDS ? 0 : 1) + dim;
437 
438  // need to resize the grid since we don't tune over the entire coarseColor dimension
 439  // factor of two comes from mapping parity onto different blocks (i.e. into the grid)
440  tp.grid.y = (2*coarseColor + tp.block.y - 1) / tp.block.y;
441  tp.grid.z = (coarseColor + tp.block.z - 1) / tp.block.z;
442 
443  arg.shared_atomic = tp.aux.y;
444  arg.parity_flip = tp.aux.z;
445 
446  if (arg.shared_atomic) {
447  // check we have a valid problem size for shared atomics
 448  // constraint is due to how shared memory initialization and global store are done
449  int block_size = arg.fineVolumeCB/arg.coarseVolumeCB;
450  if (block_size/2 < coarseSpin*coarseSpin)
451  errorQuda("Block size %d not supported in shared-memory atomic coarsening", block_size);
452 
453  arg.aggregates_per_block = tp.aux.x;
454  tp.block.x *= tp.aux.x;
455  tp.grid.x /= tp.aux.x;
456  }
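 // Illustration of the block-size constraint checked above (example numbers only; a
 // 4^4 geometric aggregate with coarseSpin = 2 is assumed): block_size =
 // fineVolumeCB/coarseVolumeCB = 256 and 256/2 = 128 >= coarseSpin*coarseSpin = 4,
 // so the shared-memory atomic path is usable; an aggregate with block_size/2 <
 // coarseSpin*coarseSpin would instead hit the errorQuda above.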
457 
458  if (arg.coarse_color_wave) {
459  // swap x and y grids
460  std::swap(tp.grid.y,tp.grid.x);
461  // augment x grid with coarseColor row grid (z grid)
462  arg.grid_z = tp.grid.z;
463  arg.coarse_color_grid_z = coarseColor*tp.grid.z;
464  tp.grid.x *= tp.grid.z;
465  tp.grid.z = 1;
466  }
467 
468  tp.shared_bytes -= sharedBytesPerBlock(tp); // shared memory is static so don't include it in launch
469 
470 #ifdef JITIFY
471  using namespace jitify::reflection;
472  jitify_error = program->kernel("quda::ComputeVUVGPU")
473  .instantiate(arg.shared_atomic,arg.parity_flip,from_coarse,Type<Float>(),dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Type<Arg>())
474  .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
475 #else
476  if (arg.shared_atomic) {
477  if (arg.parity_flip != true) errorQuda("parity_flip = %d not instantiated", arg.parity_flip);
478  constexpr bool parity_flip = true;
479 
480  if (dir == QUDA_BACKWARDS) {
481  if (dim==0) ComputeVUVGPU<true,parity_flip,from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
482  else if (dim==1) ComputeVUVGPU<true,parity_flip,from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
483  else if (dim==2) ComputeVUVGPU<true,parity_flip,from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
484  else if (dim==3) ComputeVUVGPU<true,parity_flip,from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
485  } else if (dir == QUDA_FORWARDS) {
486  if (dim==0) ComputeVUVGPU<true,parity_flip,from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
487  else if (dim==1) ComputeVUVGPU<true,parity_flip,from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
488  else if (dim==2) ComputeVUVGPU<true,parity_flip,from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
489  else if (dim==3) ComputeVUVGPU<true,parity_flip,from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
490  } else {
491  errorQuda("Undefined direction %d", dir);
492  }
493  } else {
494  if (arg.parity_flip != false) errorQuda("parity_flip = %d not instantiated", arg.parity_flip);
495  constexpr bool parity_flip = false;
496 
497  if (dir == QUDA_BACKWARDS) {
498  if (dim==0) ComputeVUVGPU<false,parity_flip,from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
499  else if (dim==1) ComputeVUVGPU<false,parity_flip,from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
500  else if (dim==2) ComputeVUVGPU<false,parity_flip,from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
501  else if (dim==3) ComputeVUVGPU<false,parity_flip,from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
502  } else if (dir == QUDA_FORWARDS) {
503  if (dim==0) ComputeVUVGPU<false,parity_flip,from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
504  else if (dim==1) ComputeVUVGPU<false,parity_flip,from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
505  else if (dim==2) ComputeVUVGPU<false,parity_flip,from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
506  else if (dim==3) ComputeVUVGPU<false,parity_flip,from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
507  } else {
508  errorQuda("Undefined direction %d", dir);
509  }
510  }
511 #endif
512  tp.shared_bytes += sharedBytesPerBlock(tp); // restore shared memory
513 
514  if (arg.coarse_color_wave) {
515  // revert the grids
516  tp.grid.z = arg.grid_z;
517  tp.grid.x /= tp.grid.z;
518  std::swap(tp.grid.x,tp.grid.y);
519  }
520 
521  if (arg.shared_atomic) {
522  tp.block.x /= tp.aux.x;
523  tp.grid.x *= tp.aux.x;
524  }
525 
526  } else if (type == COMPUTE_COARSE_CLOVER) {
527 
528 #ifdef JITIFY
529  using namespace jitify::reflection;
530  jitify_error = program->kernel("quda::ComputeCoarseCloverGPU")
531  .instantiate(from_coarse,Type<Float>(),fineSpin,coarseSpin,fineColor,coarseColor,Type<Arg>())
532  .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
533 #else
534  ComputeCoarseCloverGPU<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>
535  <<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
536 #endif
537 
538  } else if (type == COMPUTE_REVERSE_Y) {
539 
540 #ifdef JITIFY
541  using namespace jitify::reflection;
542  jitify_error = program->kernel("quda::ComputeYReverseGPU")
543  .instantiate(Type<Float>(),coarseSpin,coarseColor,Type<Arg>())
544  .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
545 #else
546  ComputeYReverseGPU<Float,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
547 #endif
548  } else if (type == COMPUTE_DIAGONAL) {
549 
550 #ifdef JITIFY
551  using namespace jitify::reflection;
552  jitify_error = program->kernel("quda::AddCoarseDiagonalGPU")
553  .instantiate(Type<Float>(),coarseSpin,coarseColor,Type<Arg>())
554  .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
555 #else
556  AddCoarseDiagonalGPU<Float,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
557 #endif
558  } else if (type == COMPUTE_TMDIAGONAL) {
559 
560 #ifdef JITIFY
561  using namespace jitify::reflection;
562  jitify_error = program->kernel("quda::AddCoarseTmDiagonalGPU")
563  .instantiate(Type<Float>(),coarseSpin,coarseColor,Type<Arg>())
564  .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
565 #else
566  AddCoarseTmDiagonalGPU<Float,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
567 #endif
568  } else if (type == COMPUTE_CONVERT) {
569 
570  arg.dim_index = 4*(dir==QUDA_BACKWARDS ? 0 : 1) + dim;
571 #ifdef JITIFY
572  using namespace jitify::reflection;
573  jitify_error = program->kernel("quda::ConvertGPU")
574  .instantiate(Type<Float>(),coarseSpin,coarseColor,Type<Arg>())
575  .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
576 #else
577  ConvertGPU<Float,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
578 #endif
579  } else if (type == COMPUTE_RESCALE) {
580 
581  arg.dim_index = 4*(dir==QUDA_BACKWARDS ? 0 : 1) + dim;
582 #ifdef JITIFY
583  using namespace jitify::reflection;
584  jitify_error = program->kernel("quda::RescaleYGPU")
585  .instantiate(Type<Float>(),coarseSpin,coarseColor,Type<Arg>())
586  .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
587 #else
588  RescaleYGPU<Float,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
589 #endif
590 
591  } else {
592  errorQuda("Undefined compute type %d", type);
593  }
594  }
595  }
596 
600  void setDimension(int dim_) { dim = dim_; }
601 
605  void setDirection(QudaDirection dir_) { dir = dir_; }
606 
 610  void setComputeType(ComputeType type_) {
 611  type = type_;
612  switch(type) {
613  case COMPUTE_VUV:
614  arg.shared_atomic = false;
615  arg.parity_flip = false;
616  if (arg.shared_atomic) {
617  // if not parity flip then we need to force parity within the block (hence factor of 2)
618  resizeVector( (arg.parity_flip ? 1 : 2) * max_color_per_block,max_color_per_block);
619  } else {
 620  resizeVector(2*max_color_per_block, max_color_per_block);
 621  }
622  break;
623  case COMPUTE_COARSE_CLOVER: // no shared atomic version so keep separate from above
624  case COMPUTE_REVERSE_Y:
625  case COMPUTE_CONVERT:
626  case COMPUTE_RESCALE:
627  resizeVector(2*coarseColor,coarseColor);
628  break;
629  case COMPUTE_UV:
630  case COMPUTE_TMAV: resizeVector(2, coarseColor); break;
631  case COMPUTE_AV:
632  case COMPUTE_TMCAV: resizeVector(4, coarseColor); break; // y dimension is chirality and parity
633  default: resizeVector(2, 1); break;
634  }
635 
636  resizeStep(1,1);
637  if (arg.shared_atomic && type == COMPUTE_VUV && !arg.parity_flip) resizeStep(2,1);
638 
639  // do not tune spatial block size for VUV or COARSE_CLOVER
640  tune_block_x = (type == COMPUTE_VUV || type == COMPUTE_COARSE_CLOVER) ? false : true;
641  }
642 
 643  bool advanceAux(TuneParam &param) const
 644  {
645  if (type != COMPUTE_VUV) return false;
646 
647  // exhausted the global-atomic search space so switch to
648  // shared-atomic space
649  if (param.aux.y == 0) {
650 
651  // pre-Maxwell does not support shared-memory atomics natively so no point in trying
652  if (deviceProp.major < 5) return false;
653 
654  // before advancing, check we can use shared-memory atomics
655  int block_size = arg.fineVolumeCB/arg.coarseVolumeCB;
656  if (block_size/2 < coarseSpin*coarseSpin) return false;
657 
658  arg.shared_atomic = true;
659  arg.parity_flip = true; // this is usually optimal for shared atomics
660 
661  resizeVector( (arg.parity_flip ? 1 : 2) * max_color_per_block,max_color_per_block);
662  if (!arg.parity_flip) resizeStep(2,1);
663 
664  // need to reset since we're switching to shared-memory atomics
665  initTuneParam(param);
666 
667  return true;
668  } else {
669  // already doing shared-memory atomics but can tune number of
670  // coarse grid points per block
671 
672  if (param.aux.x < 4) {
673  param.aux.x *= 2;
674  return true;
675  } else {
676  param.aux.x = 1;
677 
678  // completed all shared-memory tuning so reset to global atomics
679  arg.shared_atomic = false;
680  arg.parity_flip = false; // this is usually optimal for global atomics
681 
682  initTuneParam(param);
683 
684  return false;
685  }
686 
687  }
688 
689  }
690 
 691  bool advanceSharedBytes(TuneParam &param) const {
 692  return ( (!arg.shared_atomic && !from_coarse && type == COMPUTE_VUV) || type == COMPUTE_COARSE_CLOVER) ? false : Tunable::advanceSharedBytes(param);
693  }
694 
 695  bool advanceTuneParam(TuneParam &param) const {
 696  // only do autotuning if we have device fields
 697  if (meta.Location() == QUDA_CUDA_FIELD_LOCATION && Y.MemType() == QUDA_MEMORY_DEVICE) return Tunable::advanceTuneParam(param);
698  else return false;
699  }
700 
 701  void initTuneParam(TuneParam &param) const
 702  {
 703  TunableVectorYZ::initTuneParam(param);
 704  param.aux.x = 1; // aggregates per block
705  param.aux.y = arg.shared_atomic;
706  param.aux.z = arg.parity_flip; // not actually tuned over at present
707 
708  // with shared-atomic VUV, each block.x matches exactly to a c/b aggregate
709  if (arg.shared_atomic && type == COMPUTE_VUV) {
710  param.block.x = arg.fineVolumeCB/(2*arg.coarseVolumeCB); // checker-boarded block size
711  param.grid.x = 2*arg.coarseVolumeCB;
712  }
713  }
714 
 716  void defaultTuneParam(TuneParam &param) const
 717  {
 718  TunableVectorYZ::defaultTuneParam(param);
 719  param.aux.x = 1; // aggregates per block
720  param.aux.y = arg.shared_atomic;
721  param.aux.z = arg.parity_flip; // not actually tuned over at present
722 
723  // with shared-atomic VUV, each block.x matches exactly to a c/b aggregate
724  if (arg.shared_atomic && type == COMPUTE_VUV) {
725  param.block.x = arg.fineVolumeCB/(2*arg.coarseVolumeCB); // checker-boarded block size
726  param.grid.x = 2*arg.coarseVolumeCB;
727  }
728  }
729 
730  TuneKey tuneKey() const {
731  char Aux[TuneKey::aux_n];
732  strcpy(Aux,aux);
733 
734  if (type == COMPUTE_UV) strcat(Aux,",computeUV");
735  else if (type == COMPUTE_AV) strcat(Aux,",computeAV");
736  else if (type == COMPUTE_TMAV) strcat(Aux,",computeTmAV");
737  else if (type == COMPUTE_TMCAV) strcat(Aux,",computeTmcAV");
738  else if (type == COMPUTE_CLOVER_INV_MAX)
739  strcat(Aux, ",computeCloverInverseMax");
740  else if (type == COMPUTE_TWISTED_CLOVER_INV_MAX)
741  strcat(Aux, ",computeTwistedCloverInverseMax");
742  else if (type == COMPUTE_VUV) strcat(Aux,",computeVUV");
743  else if (type == COMPUTE_COARSE_CLOVER) strcat(Aux,",computeCoarseClover");
744  else if (type == COMPUTE_REVERSE_Y) strcat(Aux,",computeYreverse");
745  else if (type == COMPUTE_DIAGONAL) strcat(Aux,",computeCoarseDiagonal");
746  else if (type == COMPUTE_TMDIAGONAL) strcat(Aux,",computeCoarseTmDiagonal");
747  else if (type == COMPUTE_CONVERT) strcat(Aux,",computeConvert");
748  else if (type == COMPUTE_RESCALE) strcat(Aux,",computeRescale");
749  else errorQuda("Unknown type=%d\n", type);
750 
751 #ifdef DYNAMIC_CLOVER
752  if (type == COMPUTE_AV || type == COMPUTE_CLOVER_INV_MAX || // ensure separate tuning for dynamic
 753  type == COMPUTE_TMCAV || type == COMPUTE_TWISTED_CLOVER_INV_MAX)
 754  strcat(Aux, ",Dynamic");
755 #endif
756 
757  if (type == COMPUTE_UV || type == COMPUTE_VUV) {
758  if (dim == 0) strcat(Aux, ",dim=0");
759  else if (dim == 1) strcat(Aux, ",dim=1");
760  else if (dim == 2) strcat(Aux, ",dim=2");
761  else if (dim == 3) strcat(Aux, ",dim=3");
762 
763  if (dir == QUDA_BACKWARDS) strcat(Aux,",dir=back");
764  else if (dir == QUDA_FORWARDS) strcat(Aux,",dir=fwd");
765 
766  if (arg.bidirectional && type == COMPUTE_VUV) strcat(Aux,",bidirectional");
767  }
768 
769  const char *vol_str = (type == COMPUTE_REVERSE_Y || type == COMPUTE_DIAGONAL || type == COMPUTE_TMDIAGONAL ||
770  type == COMPUTE_CONVERT || type == COMPUTE_RESCALE) ? X.VolString () : meta.VolString();
771 
772  if (type == COMPUTE_VUV || type == COMPUTE_COARSE_CLOVER) {
773  strcat(Aux, (meta.Location()==QUDA_CUDA_FIELD_LOCATION && Y.MemType() == QUDA_MEMORY_MAPPED) ? ",GPU-mapped," :
774  meta.Location()==QUDA_CUDA_FIELD_LOCATION ? ",GPU-device," : ",CPU,");
775  strcat(Aux,"coarse_vol=");
776  strcat(Aux,X.VolString());
777  } else {
778  strcat(Aux, (meta.Location()==QUDA_CUDA_FIELD_LOCATION && Y.MemType() == QUDA_MEMORY_MAPPED) ? ",GPU-mapped" :
779  meta.Location()==QUDA_CUDA_FIELD_LOCATION ? ",GPU-device" : ",CPU");
780  }
781 
782  return TuneKey(vol_str, typeid(*this).name(), Aux);
783  }
784 
785  void preTune() {
786  switch (type) {
787  case COMPUTE_VUV:
788  Y_atomic.backup();
789  case COMPUTE_DIAGONAL:
790  case COMPUTE_TMDIAGONAL:
 791  case COMPUTE_COARSE_CLOVER:
 792  X_atomic.backup();
793  break;
794  case COMPUTE_CONVERT:
795  if (Y_atomic.Gauge_p() == Y.Gauge_p()) Y.backup();
796  if (X_atomic.Gauge_p() == X.Gauge_p()) X.backup();
797  break;
798  case COMPUTE_RESCALE:
799  Y.backup();
800  case COMPUTE_UV:
801  case COMPUTE_AV:
802  case COMPUTE_TMAV:
803  case COMPUTE_TMCAV:
 804  case COMPUTE_CLOVER_INV_MAX:
 805  case COMPUTE_TWISTED_CLOVER_INV_MAX:
 806  case COMPUTE_REVERSE_Y:
807  break;
808  default:
809  errorQuda("Undefined compute type %d", type);
810  }
811  }
812 
813  void postTune() {
814  switch (type) {
815  case COMPUTE_VUV:
816  Y_atomic.restore();
817  case COMPUTE_DIAGONAL:
818  case COMPUTE_TMDIAGONAL:
 819  case COMPUTE_COARSE_CLOVER:
 820  X_atomic.restore();
821  break;
822  case COMPUTE_CONVERT:
823  if (Y_atomic.Gauge_p() == Y.Gauge_p()) Y.restore();
824  if (X_atomic.Gauge_p() == X.Gauge_p()) X.restore();
825  break;
826  case COMPUTE_RESCALE:
827  Y.restore();
828  case COMPUTE_UV:
829  case COMPUTE_AV:
830  case COMPUTE_TMAV:
831  case COMPUTE_TMCAV:
 832  case COMPUTE_CLOVER_INV_MAX:
 833  case COMPUTE_TWISTED_CLOVER_INV_MAX:
 834  case COMPUTE_REVERSE_Y:
835  break;
836  default:
837  errorQuda("Undefined compute type %d", type);
838  }
839  }
840  };
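 /*
    Sketch of how CalculateY is typically driven (this simply mirrors the calculateY()
    routine below; using the null-space vector field v as the constructor's meta
    argument is an assumption of this sketch, not something stated here):

      CalculateY<from_coarse, Float, fineSpin, fineColor, coarseSpin, coarseColor, Arg>
        y(arg, v, Y_, X_, Y_atomic_, X_atomic_);

      for (int d = 0; d < 4; d++) {
        y.setDimension(d);
        y.setDirection(QUDA_BACKWARDS);              // plus QUDA_FORWARDS when coarsening bidirectionally
        y.setComputeType(COMPUTE_UV);  y.apply(0);   // UV = U * V (or U * AV)
        y.setComputeType(COMPUTE_VUV); y.apply(0);   // Y_atomic += V^dagger * UV
      }
      y.setComputeType(COMPUTE_COARSE_CLOVER); y.apply(0);  // or COMPUTE_DIAGONAL / COMPUTE_TMDIAGONAL

    Each setComputeType() call resizes the tuned thread vector for that kernel, and
    apply() dispatches to the CPU or GPU implementation based on where the fields live.
 */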
841 
842 
843 
867  template<bool from_coarse, typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename F,
868  typename Ftmp, typename Vt, typename coarseGauge, typename coarseGaugeAtomic, typename fineGauge, typename fineClover>
869  void calculateY(coarseGauge &Y, coarseGauge &X,
870  coarseGaugeAtomic &Y_atomic, coarseGaugeAtomic &X_atomic,
871  Ftmp &UV, F &AV, Vt &V, fineGauge &G, fineClover &C, fineClover &Cinv,
872  GaugeField &Y_, GaugeField &X_, GaugeField &Y_atomic_, GaugeField &X_atomic_,
 873  ColorSpinorField &uv, ColorSpinorField &av, const ColorSpinorField &v,
 874  double kappa, double mu, double mu_factor, QudaDiracType dirac, QudaMatPCType matpc,
875  bool need_bidirectional, const int *fine_to_coarse, const int *coarse_to_fine) {
876 
877  // sanity checks
 878  if (matpc == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc == QUDA_MATPC_ODD_ODD_ASYMMETRIC)
 879  errorQuda("Unsupported coarsening of matpc = %d", matpc);
880 
881  bool is_dirac_coarse = (dirac == QUDA_COARSE_DIRAC || dirac == QUDA_COARSEPC_DIRAC) ? true : false;
882  if (is_dirac_coarse && fineSpin != 2)
883  errorQuda("Input Dirac operator %d should have nSpin=2, not nSpin=%d\n", dirac, fineSpin);
884  if (!is_dirac_coarse && fineSpin != 4)
885  errorQuda("Input Dirac operator %d should have nSpin=4, not nSpin=%d\n", dirac, fineSpin);
886  if (!is_dirac_coarse && fineColor != 3)
887  errorQuda("Input Dirac operator %d should have nColor=3, not nColor=%d\n", dirac, fineColor);
888 
889  if (G.Ndim() != 4) errorQuda("Number of dimensions not supported");
890  const int nDim = 4;
891 
892  int x_size[QUDA_MAX_DIM] = { };
893  for (int i=0; i<4; i++) x_size[i] = v.X(i);
894  x_size[4] = 1;
895 
896  int xc_size[QUDA_MAX_DIM] = { };
897  for (int i=0; i<4; i++) xc_size[i] = X_.X()[i];
898  xc_size[4] = 1;
899 
900  int geo_bs[QUDA_MAX_DIM] = { };
901  for(int d = 0; d < nDim; d++) geo_bs[d] = x_size[d]/xc_size[d];
902  int spin_bs = V.Nspin()/Y.NspinCoarse();
903 
904  // If doing a preconditioned operator with a clover term then we
905  // have bi-directional links, though we can do the bidirectional setup for all operators for debugging
906  bool bidirectional_links = (dirac == QUDA_CLOVERPC_DIRAC || dirac == QUDA_COARSEPC_DIRAC || bidirectional_debug ||
907  dirac == QUDA_TWISTED_MASSPC_DIRAC || dirac == QUDA_TWISTED_CLOVERPC_DIRAC || need_bidirectional);
908 
909  if (getVerbosity() >= QUDA_VERBOSE) {
910  if (bidirectional_links) printfQuda("Doing bi-directional link coarsening\n");
911  else printfQuda("Doing uni-directional link coarsening\n");
912  }
913 
914  //Calculate UV and then VUV for each dimension, accumulating directly into the coarse gauge field Y
915 
917  Arg arg(Y, X, Y_atomic, X_atomic, UV, AV, G, V, C, Cinv, kappa,
918  mu, mu_factor, x_size, xc_size, geo_bs, spin_bs, fine_to_coarse, coarse_to_fine, bidirectional_links);
 919  CalculateY<from_coarse, Float, fineSpin, fineColor, coarseSpin, coarseColor, Arg> y(arg, v, Y_, X_, Y_atomic_, X_atomic_);
 920 
921  QudaFieldLocation location = checkLocation(Y_, X_, av, v);
922  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Running link coarsening on the %s\n", location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU");
923 
924  // do exchange of null-space vectors
925  const int nFace = 1;
926  v.exchangeGhost(QUDA_INVALID_PARITY, nFace, 0);
927  arg.V.resetGhost(v, v.Ghost()); // point the accessor to the correct ghost buffer
928  if (&v == &av) arg.AV.resetGhost(av, av.Ghost());
929  LatticeField::bufferIndex = (1 - LatticeField::bufferIndex); // update ghost bufferIndex for next exchange
930 
931  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("V2 = %e\n", arg.V.norm2());
932 
933  // If doing preconditioned clover then we first multiply the
934  // null-space vectors by the clover inverse matrix, since this is
935  // needed for the coarse link computation
936  if ( dirac == QUDA_CLOVERPC_DIRAC && (matpc == QUDA_MATPC_EVEN_EVEN || matpc == QUDA_MATPC_ODD_ODD) ) {
937  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Computing AV\n");
938 
939  if (av.Precision() == QUDA_HALF_PRECISION) {
940 #ifdef DYNAMIC_CLOVER
 941  y.setComputeType(COMPUTE_CLOVER_INV_MAX);
 942  y.apply(0);
943  double max = 6 * arg.max_h;
944 #else
945  double max = 6*arg.Cinv.abs_max(0);
946 #endif
947  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("clover max %e\n", max);
948  av.Scale(max);
949  arg.AV.resetScale(max);
950  }
951 
 952  y.setComputeType(COMPUTE_AV);
 953  y.apply(0);
954 
955  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("AV2 = %e\n", arg.AV.norm2());
956  }
957 
958  // If doing preconditioned twisted-mass then we first multiply the
959  // null-space vectors by the inverse twist, since this is
960  // needed for the coarse link computation
961  if ( dirac == QUDA_TWISTED_MASSPC_DIRAC && (matpc == QUDA_MATPC_EVEN_EVEN || matpc == QUDA_MATPC_ODD_ODD) ) {
962  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Computing TMAV\n");
963 
964  if (av.Precision() == QUDA_HALF_PRECISION) {
965  // this is just a trivial rescaling kernel, find the maximum
966  complex<Float> fp(1./(1.+arg.mu*arg.mu),-arg.mu/(1.+arg.mu*arg.mu));
967  complex<Float> fm(1./(1.+arg.mu*arg.mu),+arg.mu/(1.+arg.mu*arg.mu));
968  double max = std::max(abs(fp), abs(fm));
969  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("tm max %e\n", max);
970  av.Scale(max);
971  arg.AV.resetScale(max);
972  }
973 
 974  y.setComputeType(COMPUTE_TMAV);
 975  y.apply(0);
976 
977  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("AV2 = %e\n", arg.AV.norm2());
978  }
979 
980  // If doing preconditioned twisted-clover then we first multiply the
981  // null-space vectors by the inverse of the squared clover matrix plus
982  // mu^2, and then we multiply the result by the clover matrix. This is
983  // needed for the coarse link computation
984  if ( dirac == QUDA_TWISTED_CLOVERPC_DIRAC && (matpc == QUDA_MATPC_EVEN_EVEN || matpc == QUDA_MATPC_ODD_ODD) ) {
985  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Computing TMCAV\n");
986 
987  if (av.Precision() == QUDA_HALF_PRECISION) {
988 #ifdef DYNAMIC_CLOVER
 989  y.setComputeType(COMPUTE_TWISTED_CLOVER_INV_MAX);
 990  y.apply(0);
991  double max = 6*sqrt(arg.max_h);
992 #else
993  double max = 6*sqrt(arg.Cinv.abs_max(0));
994 #endif
995  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("tmc max %e\n", max);
996  av.Scale(max);
997  arg.AV.resetScale(max);
998  }
999 
 1000  y.setComputeType(COMPUTE_TMCAV);
 1001  y.apply(0);
1002 
1003  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("AV2 = %e\n", arg.AV.norm2());
1004  }
1005 
1006  // work out what to set the scales to
1007  if (coarseGaugeAtomic::fixedPoint()) {
1008  double max = 500.0; // Should be more than sufficient
1009  arg.Y_atomic.resetScale(max);
1010  arg.X_atomic.resetScale(max);
1011  }
1012 
1013  // zero the atomic fields before we start summing to them
1014  Y_atomic_.zero();
1015  X_atomic_.zero();
1016 
 1017  bool set_scale = false; // records whether the scale has been set already or not
1018 
1019  // First compute the coarse forward links if needed
1020  if (bidirectional_links) {
1021  for (int d = 0; d < nDim; d++) {
1022  y.setDimension(d);
 1023  y.setDirection(QUDA_FORWARDS);
 1024  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Computing forward %d UV and VUV\n", d);
1025 
1026  if (uv.Precision() == QUDA_HALF_PRECISION) {
1027  double U_max = 3.0*arg.U.abs_max(from_coarse ? d+4 : d);
1028  double uv_max = U_max * v.Scale();
1029  uv.Scale(uv_max);
1030  arg.UV.resetScale(uv_max);
1031 
1032  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("%d U_max = %e v_max = %e uv_max = %e\n", d, U_max, v.Scale(), uv_max);
1033  }
1034 
1035  y.setComputeType(COMPUTE_UV); // compute U*V product
1036  y.apply(0);
1037  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("UV2[%d] = %e\n", d, arg.UV.norm2());
1038 
1039  // if we are writing to a temporary, we need to zero it before each computation
1040  if (Y_atomic.Geometry() == 1) Y_atomic_.zero();
1041 
1042  y.setComputeType(COMPUTE_VUV); // compute Y += VUV
1043  y.apply(0);
1044  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Y2[%d] (atomic) = %e\n", 4+d, arg.Y_atomic.norm2( (4+d) % arg.Y_atomic.geometry ));
1045 
1046  // now convert from atomic to application computation format if necessary for Y[d]
1047  if (coarseGaugeAtomic::fixedPoint() || coarseGauge::fixedPoint()) {
1048 
1049  if (coarseGauge::fixedPoint()) {
1050  double y_max = arg.Y_atomic.abs_max( (4+d) % arg.Y_atomic.geometry );
1051 
1052  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Y[%d] (atomic) max = %e Y[%d] scale = %e\n", 4+d, y_max, 4+d, Y_.Scale());
1053  if (!set_scale) {
1054  Y_.Scale(1.1*y_max); // slightly oversize to avoid unnecessary rescaling
1055  arg.Y.resetScale(Y_.Scale());
1056  set_scale = true;
1057  } else if (y_max > Y_.Scale()) {
1058  // we have exceeded the maximum used before so we need to reset the maximum and rescale the elements
1059  arg.rescale = Y_.Scale() / y_max; // how much we need to shrink the elements by
 1060  y.setComputeType(COMPUTE_RESCALE);
 1061 
1062  for (int d_=0; d_<d; d_++) {
1063  y.setDimension(d_);
1064  y.apply(0);
1065  }
1066 
1067  y.setDimension(d);
1068  Y_.Scale(y_max);
1069  arg.Y.resetScale(Y_.Scale());
1070  }
1071  }
1072 
 1073  y.setComputeType(COMPUTE_CONVERT);
 1074  y.apply(0);
1075 
1076  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Y2[%d] = %e\n", 4+d, arg.Y.norm2( 4+d ));
1077  }
1078 
1079  }
1080  }
1081 
1082  if ( (dirac == QUDA_CLOVERPC_DIRAC || dirac == QUDA_TWISTED_MASSPC_DIRAC || dirac == QUDA_TWISTED_CLOVERPC_DIRAC) &&
1083  (matpc == QUDA_MATPC_EVEN_EVEN || matpc == QUDA_MATPC_ODD_ODD) ) {
1084  av.exchangeGhost(QUDA_INVALID_PARITY, nFace, 0);
1085  arg.AV.resetGhost(av, av.Ghost()); // make sure we point to the correct pointer in the accessor
1086  LatticeField::bufferIndex = (1 - LatticeField::bufferIndex); // update ghost bufferIndex for next exchange
1087  }
1088 
1089  // Now compute the backward links
1090  for (int d = 0; d < nDim; d++) {
1091  y.setDimension(d);
 1092  y.setDirection(QUDA_BACKWARDS);
 1093  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Computing backward %d UV and VUV\n", d);
1094 
1095  if (uv.Precision() == QUDA_HALF_PRECISION) {
1096  double U_max = 3.0*arg.U.abs_max(d);
1097  double uv_max = U_max * av.Scale();
1098  uv.Scale(uv_max);
1099  arg.UV.resetScale(uv_max);
1100 
1101  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("%d U_max = %e av_max = %e uv_max = %e\n", d, U_max, av.Scale(), uv_max);
1102  }
1103 
1104  y.setComputeType(COMPUTE_UV); // compute U*A*V product
1105  y.apply(0);
1106  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("UAV2[%d] = %e\n", d, arg.UV.norm2());
1107 
1108  // if we are writing to a temporary, we need to zero it before each computation
1109  if (Y_atomic.Geometry() == 1) Y_atomic_.zero();
1110 
1111  y.setComputeType(COMPUTE_VUV); // compute Y += VUV
1112  y.apply(0);
1113  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Y2[%d] (atomic) = %e\n", d, arg.Y_atomic.norm2( d%arg.Y_atomic.geometry ));
1114 
1115  // now convert from atomic to application computation format if necessary for Y[d]
1116  if (coarseGaugeAtomic::fixedPoint() || coarseGauge::fixedPoint() ) {
1117 
1118  if (coarseGauge::fixedPoint()) {
1119  double y_max = arg.Y_atomic.abs_max( d % arg.Y_atomic.geometry );
1120  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Y[%d] (atomic) max = %e Y[%d] scale = %e\n", d, y_max, d, Y_.Scale());
1121 
1122  if (!set_scale) {
1123  Y_.Scale(1.1*y_max); // slightly oversize to avoid unnecessary rescaling
1124  arg.Y.resetScale(Y_.Scale());
1125  set_scale = true;
1126  } else if (y_max > Y_.Scale()) {
1127  // we have exceeded the maximum used before so we need to reset the maximum and rescale the elements
1128  arg.rescale = Y_.Scale() / y_max; // how much we need to shrink the elements by
 1129  y.setComputeType(COMPUTE_RESCALE);
 1130 
1131  // update all prior compute Y links
1132  if (bidirectional_links) {
 1133  y.setDirection(QUDA_FORWARDS);
 1134  for (int d_=0; d_<4; d_++) {
1135  y.setDimension(d_);
1136  y.apply(0);
1137  }
1138  }
1139 
 1140  y.setDirection(QUDA_BACKWARDS);
 1141  for (int d_=0; d_<d; d_++) {
1142  y.setDimension(d_);
1143  y.apply(0);
1144  }
1145 
1146  y.setDimension(d);
1147  Y_.Scale(y_max);
1148  arg.Y.resetScale(Y_.Scale());
1149  }
1150  }
1151 
 1152  y.setComputeType(COMPUTE_CONVERT);
 1153  y.apply(0);
1154 
1155  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Y2[%d] = %e\n", d, arg.Y.norm2( d ));
1156  }
1157 
1158  }
1159 
1160  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("X2 = %e\n", arg.X_atomic.norm2(0));
1161 
1162  // if not doing a preconditioned operator then we can trivially
1163  // construct the forward links from the backward links
1164  if ( !bidirectional_links ) {
1165  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Reversing links\n");
1166  y.setComputeType(COMPUTE_REVERSE_Y); // reverse the links for the forwards direction
1167  y.apply(0);
1168  }
1169 
1170  // Check if we have a clover term that needs to be coarsened
1171  if (dirac == QUDA_CLOVER_DIRAC || dirac == QUDA_COARSE_DIRAC || dirac == QUDA_TWISTED_CLOVER_DIRAC) {
1172  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Computing fine->coarse clover term\n");
 1173  y.setComputeType(COMPUTE_COARSE_CLOVER);
 1174  y.apply(0);
1175  } else { //Otherwise, we just have to add the identity matrix
1176  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Summing diagonal contribution to coarse clover\n");
 1177  y.setComputeType(COMPUTE_DIAGONAL);
 1178  y.apply(0);
1179  }
1180 
1181  if (arg.mu*arg.mu_factor!=0 || dirac == QUDA_TWISTED_MASS_DIRAC || dirac == QUDA_TWISTED_CLOVER_DIRAC) {
1182  if (dirac == QUDA_TWISTED_MASS_DIRAC || dirac == QUDA_TWISTED_CLOVER_DIRAC)
1183  arg.mu_factor += 1.;
1184  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Adding mu = %e\n",arg.mu*arg.mu_factor);
 1185  y.setComputeType(COMPUTE_TMDIAGONAL);
 1186  y.apply(0);
1187  }
1188 
1189  // now convert from atomic to application computation format if necessary for X field
1190  if (coarseGaugeAtomic::fixedPoint() || coarseGauge::fixedPoint() ) {
1191  // dim=4 corresponds to X field
1192  y.setDimension(8);
1194 
1195  if (coarseGauge::fixedPoint()) {
1196  double x_max = arg.X_atomic.abs_max(0);
1197  X_.Scale(x_max);
1198  arg.X.resetScale(x_max);
1199  }
1200 
 1201  y.setComputeType(COMPUTE_CONVERT);
 1202  y.apply(0);
1203  }
1204 
1205  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("X2 = %e\n", arg.X.norm2(0));
1206 
1207  }
1208 
1209 
1210 } // namespace quda