QUDA  v1.1.0
A library for QCD on GPUs
dslash.h
1 #pragma once
2 
3 #include <typeinfo>
4 
5 #include <color_spinor_field.h>
6 #include <tune_quda.h>
7 #include <dslash_quda.h>
8 #include <dslash_helper.cuh>
9 #include <jitify_helper.cuh>
10 #include <instantiate.h>
11 #include <instantiate_dslash.h>
12 
13 namespace quda
14 {
15 
31  template <template <int, bool, bool, KernelType, typename> class D, typename Arg>
32  class Dslash : public TunableVectorYZ
33  {
34 
35  protected:
36  Arg &arg;
37  const ColorSpinorField &out;
38  const ColorSpinorField &in;
39 
40  const int nDimComms;
41 
42  char aux_base[TuneKey::aux_n - 32];
43  char aux[8][TuneKey::aux_n];
44  char aux_pack[TuneKey::aux_n];
45  char aux_barrier[TuneKey::aux_n];
46 
47  // pointers to ghost buffers we are packing to
48  void *packBuffer[4 * QUDA_MAX_DIM];
49 
50  std::string kernel_file;
55  inline void fillAuxBase()
56  {
57  char comm[5];
58  comm[0] = (arg.commDim[0] ? '1' : '0');
59  comm[1] = (arg.commDim[1] ? '1' : '0');
60  comm[2] = (arg.commDim[2] ? '1' : '0');
61  comm[3] = (arg.commDim[3] ? '1' : '0');
62  comm[4] = '\0';
63  strcpy(aux_base, ",commDim=");
64  strcat(aux_base, comm);
65 
66  if (arg.xpay) strcat(aux_base, ",xpay");
67  if (arg.dagger) strcat(aux_base, ",dagger");
68  }
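  // for example, with all four dimensions partitioned and an xpay, dagger
  // operator, the routine above yields aux_base = ",commDim=1111,xpay,dagger"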
69 
75  inline void fillAux(KernelType kernel_type, const char *kernel_str)
76  {
77  strcpy(aux[kernel_type], kernel_str);
78  if (kernel_type == INTERIOR_KERNEL) strcat(aux[kernel_type], comm_dim_partitioned_string());
79  strncat(aux[kernel_type], aux_base, TuneKey::aux_n - 1);
80  }
81 
82  virtual bool tuneGridDim() const { return arg.kernel_type == EXTERIOR_KERNEL_ALL && arg.shmem > 0; }
83  virtual unsigned int minThreads() const { return arg.threads; }
84 
85  virtual unsigned int minGridSize() const
86  {
87  /* when using nvshmem we perform the exterior Dslash using a grid-strided loop, uniquely assign communication
88  * directions to CUDA blocks, and keep all communication directions resident. We therefore determine the number of
89  * communicating dimensions and ensure that the number of blocks is a multiple of the number of communicating directions (2 * nDimComms)
90  */
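  // for example, with 80 multiprocessors and all four dimensions partitioned
  // (2 * nDimComms = 8 directions) this yields (80 / 8) * 8 = 80 blocks, while
  // with three partitioned dimensions it yields (80 / 6) * 6 = 78 blocks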
91  if (arg.kernel_type == EXTERIOR_KERNEL_ALL && arg.shmem > 0) {
92  int nDimComms = 0;
93  for (int d = 0; d < 4; d++) nDimComms += arg.commDim[d];
94  return ((deviceProp.multiProcessorCount) / (2 * nDimComms)) * (2 * nDimComms);
95  } else {
96  return TunableVectorYZ::minGridSize();
97  }
98  }
99 
100  virtual int gridStep() const
101  {
102  /* see comment for minGridSize above for gridStep choice when using nvshmem */
103  if (arg.kernel_type == EXTERIOR_KERNEL_ALL && arg.shmem > 0) {
104  int nDimComms = 0;
105  for (int d = 0; d < 4; d++) nDimComms += arg.commDim[d];
106  return ((deviceProp.multiProcessorCount) / (2 * nDimComms)) * (2 * nDimComms);
107  } else {
108  return TunableVectorYZ::gridStep();
109  }
110  }
111 
112  inline void setParam(TuneParam &tp)
113  {
114  arg.t_proj_scale = getKernelPackT() ? 1.0 : 2.0;
115 
116  // Need to reset ghost pointers prior to every call since the
117  // ghost buffer may have been changed during policy tuning.
118  // Also, the accessor constructor calls Ghost(), which uses
119  // ghost_buf, but this is only presently set with the
120  // synchronous exchangeGhost.
121  static void *ghost[8] = {}; // needs to be persistent across interior and exterior calls
122  for (int dim = 0; dim < 4; dim++) {
123 
124  for (int dir = 0; dir < 2; dir++) {
125  // if doing the interior kernel, then this is the initial call,
126  // so we set all ghost pointers; if doing an exterior
127  // kernel, we only have to update the non-p2p ghosts,
128  // since these may have been assigned to zero-copy memory
129  if (!comm_peer2peer_enabled(dir, dim) || arg.kernel_type == INTERIOR_KERNEL || arg.kernel_type == UBER_KERNEL) {
130  ghost[2 * dim + dir] = (typename Arg::Float *)((char *)in.Ghost2() + in.GhostOffset(dim, dir));
131  }
132  }
133  }
134 
135  arg.in.resetGhost(in, ghost);
136 
137  if (arg.pack_threads && (arg.kernel_type == INTERIOR_KERNEL || arg.kernel_type == UBER_KERNEL)) {
138  arg.blocks_per_dir = tp.aux.x;
139  arg.setPack(true, this->packBuffer); // need to recompute for the updated blocks_per_dir
140  arg.in_pack.resetGhost(in, this->packBuffer);
141  tp.grid.x += arg.pack_blocks;
143  }
144  if (arg.shmem > 0 && arg.kernel_type == EXTERIOR_KERNEL_ALL) {
145  // if we are doing tuning we should not wait on the sync_arr to be set.
146  arg.counter = (activeTuning() && !policyTuning()) ? 2 : dslash::get_shmem_sync_counter();
147  }
148  if (arg.shmem > 0 && (arg.kernel_type == INTERIOR_KERNEL || arg.kernel_type == UBER_KERNEL)) {
149  arg.counter = activeTuning() ?
152  arg.exterior_blocks = ((arg.shmem & 64) && arg.exterior_dims > 0) ?
153  ((deviceProp.multiProcessorCount) / (2 * arg.exterior_dims)) * (2 * arg.exterior_dims * tp.aux.y) :
154  0;
155  tp.grid.x += arg.exterior_blocks;
156  }
157  }
158 
159  virtual int tuningIter() const { return 10; }
160 
161  virtual int blockStep() const { return 16; }
162  virtual int blockMin() const { return 16; }
163 
164  unsigned int maxSharedBytesPerBlock() const { return maxDynamicSharedBytesPerBlock(); }
165 
166  virtual bool advanceAux(TuneParam &param) const
167  {
168  if (arg.pack_threads && (arg.kernel_type == INTERIOR_KERNEL || arg.kernel_type == UBER_KERNEL)) {
169 
170  int max_threads_per_dir = 0;
171  for (int i = 0; i < 4; ++i) {
172  max_threads_per_dir = std::max(max_threads_per_dir, (arg.threadDimMapUpper[i] - arg.threadDimMapLower[i]) / 2);
173  }
174  int nDimComms = 0;
175  for (int d = 0; d < 4; d++) nDimComms += arg.commDim[d];
176 
177  /* if doing the fused packing + interior kernel we tune how many blocks to use for communication */
178  // use up to a quarter of the GPU for packing (but always allow at least 4 blocks per dir)
179  const int max_blocks_per_dir = std::max((deviceProp.multiProcessorCount) / (8 * nDimComms), 4);
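  // e.g. with 80 multiprocessors and all four dimensions partitioned,
  // 80 / (8 * 4) = 2 blocks per direction (roughly a quarter of the device
  // across all 8 directions), so the floor of 4 blocks per direction applies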
180  if (param.aux.x + 1 <= max_blocks_per_dir
181  && (param.aux.x + 1) * param.block.x < (max_threads_per_dir + param.block.x - 1)) {
182  param.aux.x++;
183  return true;
184  } else {
185  param.aux.x = 1;
186  if (arg.exterior_dims > 0 && arg.shmem & 64) {
187  /* if doing a fused interior+exterior kernel we use aux.y to control the number of blocks we add for the
188  * exterior. We make sure to use multiple blocks per communication direction.
189  */
190  auto maxgridsize = TunableVectorYZ::maxGridSize();
191  if (param.aux.y < 4) {
192  param.aux.y++;
193  return true;
194  } else {
195  param.aux.y = 1;
196  return false;
197  }
198  }
199  return false;
200  }
201  } else {
202  return false;
203  }
204  }
205 
206  virtual bool advanceTuneParam(TuneParam &param) const
207  {
209  }
210 
211  virtual void initTuneParam(TuneParam &param) const
212  {
213  /* for nvshmem uber kernels the current synchronization requires us to keep the y and z dimensions local to the
214  * block. This can be removed when we introduce a finer-grained synchronization which takes the y and
215  * z components into account explicitly */
216  if (arg.shmem & 64) {
219  }
221  if (arg.pack_threads && (arg.kernel_type == INTERIOR_KERNEL || arg.kernel_type == UBER_KERNEL))
222  param.aux.x = 1; // packing blocks per direction
223  if (arg.exterior_dims && arg.kernel_type == UBER_KERNEL) param.aux.y = 1; // exterior blocks
224  }
225 
226  virtual void defaultTuneParam(TuneParam &param) const
227  {
228  /* for nvshmem uber kernels the current synchronization requires us to keep the y and z dimensions local to the
229  * block. This can be removed when we introduce a finer-grained synchronization which takes the y and
230  * z components into account explicitly. */
231  if (arg.shmem & 64) {
234  }
236  if (arg.pack_threads && (arg.kernel_type == INTERIOR_KERNEL || arg.kernel_type == UBER_KERNEL))
237  param.aux.x = 1; // packing blocks per direction
238  if (arg.exterior_dims && arg.kernel_type == UBER_KERNEL) param.aux.y = 1; // exterior blocks
239  }
240 
247  template <template <bool, QudaPCType, typename> class P, int nParity, bool dagger, bool xpay, KernelType kernel_type>
248  inline void launch(TuneParam &tp, const qudaStream_t &stream)
249  {
250  if (deviceProp.major >= 7) { // should test whether this is always optimal on Volta
251  tp.set_max_shared_bytes = true;
252  }
253  qudaLaunchKernel(dslashGPU<D, P, nParity, dagger, xpay, kernel_type, Arg>, tp, stream, arg);
254  }
255 
256 #ifdef JITIFY
260  template <template <bool, QudaPCType, typename> class P> auto kernel_instance()
261  {
262  if (!program) errorQuda("Jitify program has not been created");
263  using namespace jitify::reflection;
264  const auto kernel = "quda::dslashGPU";
265 
266  // we need this hackery to get the naked unbound template class parameters
267  auto D_instance = reflect<D<0, false, false, INTERIOR_KERNEL, Arg>>();
268  auto D_naked = D_instance.substr(0, D_instance.find("<"));
269  auto P_instance = reflect<P<false, QUDA_4D_PC, Arg>>();
270  auto P_naked = P_instance.substr(0, P_instance.find("<"));
271 
272  // Since we pass the operator and packer classes as strings to
273  // jitify, we need to handle the reflection for all other
274  // template parameters here as well, rather than leaving this
275  // to jitify.
276  auto instance = program->kernel(kernel).instantiate({D_naked, P_naked, reflect(arg.nParity), reflect(arg.dagger),
277  reflect(arg.xpay), reflect(arg.kernel_type), reflect<Arg>()});
278 
279  return instance;
280  }
281 #endif
282 
283  public:
290  template <template <bool, QudaPCType, typename> class P, int nParity, bool dagger, bool xpay>
291  inline void instantiate(TuneParam &tp, const qudaStream_t &stream)
292  {
293  if (in.Location() == QUDA_CPU_FIELD_LOCATION) {
294  errorQuda("Not implemented");
295  } else {
296 #ifdef JITIFY
297  Tunable::jitify_error = kernel_instance<P>().configure(tp.grid, tp.block, tp.shared_bytes, stream).launch(arg);
298 #else
299  switch (arg.kernel_type) {
300  case INTERIOR_KERNEL: launch<P, nParity, dagger, xpay, INTERIOR_KERNEL>(tp, stream); break;
301 #ifdef MULTI_GPU
302 #ifdef NVSHMEM_COMMS
303  case UBER_KERNEL: launch<P, nParity, dagger, xpay, UBER_KERNEL>(tp, stream); break;
304 #endif
305  case EXTERIOR_KERNEL_X: launch<P, nParity, dagger, xpay, EXTERIOR_KERNEL_X>(tp, stream); break;
306  case EXTERIOR_KERNEL_Y: launch<P, nParity, dagger, xpay, EXTERIOR_KERNEL_Y>(tp, stream); break;
307  case EXTERIOR_KERNEL_Z: launch<P, nParity, dagger, xpay, EXTERIOR_KERNEL_Z>(tp, stream); break;
308  case EXTERIOR_KERNEL_T: launch<P, nParity, dagger, xpay, EXTERIOR_KERNEL_T>(tp, stream); break;
309  case EXTERIOR_KERNEL_ALL: launch<P, nParity, dagger, xpay, EXTERIOR_KERNEL_ALL>(tp, stream); break;
310  default: errorQuda("Unexpected kernel type %d", arg.kernel_type);
311 #else
312  default: errorQuda("Unexpected kernel type %d for single-GPU build", arg.kernel_type);
313 #endif
314  }
315 #endif // JITIFY
316  }
317  }
318 
325  template <template <bool, QudaPCType, typename> class P, int nParity, bool xpay>
326  inline void instantiate(TuneParam &tp, const qudaStream_t &stream)
327  {
328 #ifdef JITIFY
329  Tunable::jitify_error = kernel_instance<P>().configure(tp.grid, tp.block, tp.shared_bytes, stream).launch(arg);
330 #else
331  if (arg.dagger)
332  instantiate<P, nParity, true, xpay>(tp, stream);
333  else
334  instantiate<P, nParity, false, xpay>(tp, stream);
335 #endif
336  }
337 
344  template <template <bool, QudaPCType, typename> class P, bool xpay>
345  inline void instantiate(TuneParam &tp, const qudaStream_t &stream)
346  {
347 #ifdef JITIFY
348  Tunable::jitify_error = kernel_instance<P>().configure(tp.grid, tp.block, tp.shared_bytes, stream).launch(arg);
349 #else
350  switch (arg.nParity) {
351  case 1: instantiate<P, 1, xpay>(tp, stream); break;
352  case 2: instantiate<P, 2, xpay>(tp, stream); break;
353  default: errorQuda("nParity = %d undefined\n", arg.nParity);
354  }
355 #endif
356  }
357 
364  template <template <bool, QudaPCType, typename> class P>
365  inline void instantiate(TuneParam &tp, const qudaStream_t &stream)
366  {
367 #ifdef JITIFY
368  Tunable::jitify_error = kernel_instance<P>().configure(tp.grid, tp.block, tp.shared_bytes, stream).launch(arg);
369 #else
370  if (arg.xpay)
371  instantiate<P, true>(tp, stream);
372  else
373  instantiate<P, false>(tp, stream);
374 #endif
375  }
376 
377  Arg &dslashParam; // temporary addition for policy compatibility
378 
379  Dslash(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in) :
380  TunableVectorYZ(1, arg.nParity),
381  arg(arg),
382  out(out),
383  in(in),
384  nDimComms(4),
385  dslashParam(arg)
386  {
387  if (checkLocation(out, in) == QUDA_CPU_FIELD_LOCATION)
388  errorQuda("CPU Fields not supported in Dslash framework yet");
389 
390  // this sets the communications pattern for the packing kernel
391  setPackComms(arg.commDim);
392  // strcpy(aux, in.AuxString());
393  fillAuxBase();
394 #ifdef MULTI_GPU
395  fillAux(INTERIOR_KERNEL, "policy_kernel=interior");
396  fillAux(UBER_KERNEL, "policy_kernel=uber");
397  fillAux(EXTERIOR_KERNEL_ALL, "policy_kernel=exterior_all");
398  fillAux(EXTERIOR_KERNEL_X, "policy_kernel=exterior_x");
399  fillAux(EXTERIOR_KERNEL_Y, "policy_kernel=exterior_y");
400  fillAux(EXTERIOR_KERNEL_Z, "policy_kernel=exterior_z");
401  fillAux(EXTERIOR_KERNEL_T, "policy_kernel=exterior_t");
402 #else
403  fillAux(INTERIOR_KERNEL, "policy_kernel=single-GPU");
404 #endif // MULTI_GPU
405  fillAux(KERNEL_POLICY, "policy");
406 
407 #ifdef NVSHMEM_COMMS
408  strcpy(aux_barrier, aux[EXTERIOR_KERNEL_ALL]);
409  strcat(aux_barrier, ",shmem");
410 #endif
411 
412  // extract the filename from the template template class (do
413  // this regardless of jitify to ensure a build error if filename
414  // helper isn't defined)
415  using D_ = D<0, false, false, INTERIOR_KERNEL, Arg>;
416  kernel_file = std::string("kernels/") + D_::filename();
417 #ifdef JITIFY
418  create_jitify_program(kernel_file);
419 #endif
420  }
421 
422  void setShmem(int shmem)
423  {
424 #ifdef NVSHMEM_COMMS
425  arg.shmem = shmem;
426 #endif
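  // bit 64 of shmem selects the fused interior + exterior ("uber") kernel
  // variant (see its use in setParam and advanceAux above), so enable uber
  // tuning whenever that bit is set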
427  setUberTuning(arg.shmem & 64);
428  }
429 
430  void setPack(bool pack, MemoryLocation location)
431  {
432  if (!pack) {
433  arg.setPack(pack, packBuffer);
434  return;
435  }
436 
437  for (int dim = 0; dim < 4; dim++) {
438  for (int dir = 0; dir < 2; dir++) {
439  if ((location & Remote) && comm_peer2peer_enabled(dir, dim)) { // pack to p2p remote
440  packBuffer[2 * dim + dir] = static_cast<char *>(in.remoteFace_d(dir, dim)) + in.GhostOffset(dim, 1 - dir);
441  } else if (location & Host && !comm_peer2peer_enabled(dir, dim)) { // pack to cpu memory
442  packBuffer[2 * dim + dir] = in.myFace_hd(dir, dim);
443  } else if (location & Shmem) {
444  // we check whether we can directly pack into the in.remoteFace_d(dir, dim) buffer on the remote GPU
445  // pack directly into remote or local memory
446  packBuffer[2 * dim + dir] = in.remoteFace_d(dir, dim) ?
447  static_cast<char *>(in.remoteFace_d(dir, dim)) + in.GhostOffset(dim, 1 - dir) :
448  in.myFace_d(dir, dim);
449  // whether we need to shmem_putmem into the receiving buffer
450  packBuffer[2 * QUDA_MAX_DIM + 2 * dim + dir] = in.remoteFace_d(dir, dim) ?
451  nullptr :
452  static_cast<char *>(in.remoteFace_r()) + in.GhostOffset(dim, 1 - dir);
453  } else { // pack to local gpu memory
454  packBuffer[2 * dim + dir] = in.myFace_d(dir, dim);
455  }
456  }
457  }
458 
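  // at this point packBuffer[2 * dim + dir] holds the pack destination for each
  // direction, and, for nvshmem, packBuffer[2 * QUDA_MAX_DIM + 2 * dim + dir]
  // holds the remote receive address that still requires an explicit
  // shmem_putmem (nullptr if we could pack directly into the remote buffer)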
459  arg.setPack(pack, packBuffer);
460  // set the tuning string for the fused interior + packer kernel
461  strcpy(aux_pack, aux[arg.kernel_type]);
462  strcat(aux_pack, "");
463 
464  // label the locations we are packing to
465  // location label is nonp2p-p2p
466  switch ((int)location) {
467  case Device | Remote: strcat(aux_pack, ",device-remote"); break;
468  case Host | Remote: strcat(aux_pack, ",host-remote"); break;
469  case Device: strcat(aux_pack, ",device-device"); break;
470  case Host: strcat(aux_pack, comm_peer2peer_enabled_global() ? ",host-device" : ",host-host"); break;
471  case Shmem:
472  strcat(aux_pack, arg.exterior_dims > 0 ? ",shmemuber" : ",shmem");
473  strcat(aux_pack, (arg.shmem & 1 && arg.shmem & 2) ? "3" : "1");
474  break;
475 
476  default: errorQuda("Unknown pack target location %d\n", location);
477  }
478  }
479 
480  int Nface() const
481  {
482  return 2 * arg.nFace;
483  } // factor of 2 is for forwards/backwards (convention used in dslash policy)
484  int Dagger() const { return arg.dagger; }
485 
486  const char *getAux(KernelType type) const { return aux[type]; }
487 
488  void setAux(KernelType type, const char *aux_) { strcpy(aux[type], aux_); }
489 
490  void augmentAux(KernelType type, const char *extra) { strcat(aux[type], extra); }
491 
492  virtual TuneKey tuneKey() const
493  {
494  auto aux_ = (arg.pack_blocks > 0 && (arg.kernel_type == INTERIOR_KERNEL || arg.kernel_type == UBER_KERNEL)) ?
495  aux_pack :
496  ((arg.shmem > 0 && arg.kernel_type == EXTERIOR_KERNEL_ALL) ? aux_barrier : aux[arg.kernel_type]);
497  return TuneKey(in.VolString(), typeid(*this).name(), aux_);
498  }
499 
504  virtual void preTune()
505  {
506  if (arg.kernel_type != INTERIOR_KERNEL && arg.kernel_type != UBER_KERNEL && arg.kernel_type != KERNEL_POLICY)
507  out.backup();
508  }
509 
513  virtual void postTune()
514  {
515  if (arg.kernel_type != INTERIOR_KERNEL && arg.kernel_type != UBER_KERNEL && arg.kernel_type != KERNEL_POLICY)
516  out.restore();
517  }
518 
519  /*
520  per direction / dimension flops
521  spin project flops = Nc * Ns
522  SU(3) matrix-vector flops = (8 Nc - 2) * Nc
523  spin reconstruction flops = 2 * Nc * Ns (just an accumulation to all components)
524  xpay = 2 * 2 * Nc * Ns
525 
526  So for the full dslash we have, where for the final spin
527  reconstruct we have -1 since the first direction does not
528  require any accumulation.
529 
530  flops = (2 * Nd * Nc * Ns) + (2 * Nd * (Ns/2) * (8*Nc-2) * Nc) + ((2 * Nd - 1) * 2 * Nc * Ns)
531  flops_xpay = flops + 2 * 2 * Nc * Ns
532 
533  For Wilson this should give 1320 for Nc=3, Ns=4 and 1368 for the xpay equivalent
534  */
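  // worked example for Wilson (Nd = 4, Nc = 3, Ns = 4):
  //   spin project: 2 * 4 * 3 * 4 = 96
  //   matrix-vector: 2 * 4 * (4 / 2) * (8 * 3 - 2) * 3 = 1056
  //   spin reconstruct: (2 * 4 - 1) * 2 * 3 * 4 = 168
  //   total = 1320, plus 2 * 2 * 3 * 4 = 48 for xpay, giving 1368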
535  virtual long long flops() const
536  {
537  int mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor(); // SU(3) matrix-vector flops
538  int num_mv_multiply = in.Nspin() == 4 ? 2 : 1;
539  int ghost_flops = (num_mv_multiply * mv_flops + 2 * in.Ncolor() * in.Nspin());
540  int xpay_flops = 2 * 2 * in.Ncolor() * in.Nspin(); // multiply and add per real component
541  int num_dir = 2 * 4; // set to 4-d since we take care of 5-d fermions in derived classes where necessary
542  int pack_flops = (in.Nspin() == 4 ? 2 * in.Nspin() / 2 * in.Ncolor() : 0); // only flops if spin projecting
543 
544  long long flops_ = 0;
545 
546  // FIXME - should we count the xpay flops in the derived kernels
547  // since some kernels require the xpay in the exterior (preconditioned clover)
548 
549  switch (arg.kernel_type) {
550  case EXTERIOR_KERNEL_X:
551  case EXTERIOR_KERNEL_Y:
552  case EXTERIOR_KERNEL_Z:
553  case EXTERIOR_KERNEL_T:
554  flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * 2 * in.GhostFace()[arg.kernel_type];
555  break;
556  case EXTERIOR_KERNEL_ALL: {
557  long long ghost_sites = 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
558  flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * ghost_sites;
559  break;
560  }
561  case INTERIOR_KERNEL:
562  case UBER_KERNEL:
563  if (arg.pack_threads) { flops_ += pack_flops * arg.nParity * in.getDslashConstant().Ls * arg.pack_threads; }
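  // intentional fall-through into the KERNEL_POLICY case below, which counts
  // the bulk (full-volume) flops and then subtracts the exterior contribution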
564  case KERNEL_POLICY: {
565  long long sites = in.Volume();
566  flops_ = (num_dir * (in.Nspin() / 4) * in.Ncolor() * in.Nspin() + // spin project (=0 for staggered)
567  num_dir * num_mv_multiply * mv_flops + // SU(3) matrix-vector multiplies
568  ((num_dir - 1) * 2 * in.Ncolor() * in.Nspin()))
569  * sites; // accumulation
570  if (arg.xpay) flops_ += xpay_flops * sites;
571 
572  if (arg.kernel_type == KERNEL_POLICY) break;
573  // now correct for flops done by exterior kernel
574  long long ghost_sites = 0;
575  for (int d = 0; d < 4; d++)
576  if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
577  flops_ -= ghost_flops * ghost_sites;
578 
579  break;
580  }
581  }
582 
583  return flops_;
584  }
585 
586  virtual long long bytes() const
587  {
588  int gauge_bytes = arg.reconstruct * in.Precision();
589  bool isFixed = (in.Precision() == sizeof(short) || in.Precision() == sizeof(char)) ? true : false;
590  int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed ? sizeof(float) : 0);
591  int proj_spinor_bytes = in.Nspin() == 4 ? spinor_bytes / 2 : spinor_bytes;
592  int ghost_bytes = (proj_spinor_bytes + gauge_bytes) + 2 * spinor_bytes; // 2 since we have to load the partial
593  int num_dir = 2 * 4; // set to 4-d since we take care of 5-d fermions in derived classes where necessary
594  int pack_bytes = 2 * ((in.Nspin() == 4 ? in.Nspin() / 2 : in.Nspin()) + in.Nspin()) * in.Ncolor() * in.Precision();
595 
596  long long bytes_ = 0;
597 
598  switch (arg.kernel_type) {
599  case EXTERIOR_KERNEL_X:
600  case EXTERIOR_KERNEL_Y:
601  case EXTERIOR_KERNEL_Z:
602  case EXTERIOR_KERNEL_T: bytes_ = ghost_bytes * 2 * in.GhostFace()[arg.kernel_type]; break;
603  case EXTERIOR_KERNEL_ALL: {
604  long long ghost_sites = 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
605  bytes_ = ghost_bytes * ghost_sites;
606  break;
607  }
608  case INTERIOR_KERNEL:
609  case UBER_KERNEL:
610  if (arg.pack_threads) { bytes_ += pack_bytes * arg.nParity * in.getDslashConstant().Ls * arg.pack_threads; }
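  // intentional fall-through into the KERNEL_POLICY case below, which counts
  // the bulk (full-volume) traffic and then subtracts the exterior contribution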
611  case KERNEL_POLICY: {
612  long long sites = in.Volume();
613  bytes_ = (num_dir * gauge_bytes + ((num_dir - 2) * spinor_bytes + 2 * proj_spinor_bytes) + spinor_bytes) * sites;
614  if (arg.xpay) bytes_ += spinor_bytes;
615 
616  if (arg.kernel_type == KERNEL_POLICY) break;
617  // now correct for bytes done by exterior kernel
618  long long ghost_sites = 0;
619  for (int d = 0; d < 4; d++)
620  if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
621  bytes_ -= ghost_bytes * ghost_sites;
622 
623  break;
624  }
625  }
626  return bytes_;
627  }
628  };
629 
630 } // namespace quda
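As a usage illustration, the sketch below shows how a derived operator typically drives this class: it autotunes the launch, calls setParam() to reset the ghost and packing pointers for the chosen configuration, and then walks the instantiate() chain to resolve the runtime parameters. The functor my_dslash, the packer MyPack, and the wrapper class are placeholders assumed for this sketch rather than names from dslash.h; the real kernel functors live in the kernels/ headers, while tuneLaunch(), getTuning() and getVerbosity() come from the tuning framework.

namespace quda
{
  // illustrative only: my_dslash and MyPack stand in for real dslash and packer functors
  template <typename Arg> class MyDslashApply : public Dslash<my_dslash, Arg>
  {
    using Base = Dslash<my_dslash, Arg>;

  public:
    MyDslashApply(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in) : Base(arg, out, in) {}

    void apply(const qudaStream_t &stream)
    {
      // autotune block / grid / aux dimensions for this operator and volume
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      // reset ghost and packing pointers for the tuned configuration
      Base::setParam(tp);
      // resolve parity, dagger, xpay and kernel type at run time
      Base::template instantiate<MyPack>(tp, stream);
    }
  };
} // namespace quda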