QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
dslash.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include <color_spinor_field.h>
4 #include <tune_quda.h>
5 #include <dslash_quda.h>
6 #include <dslash_helper.cuh>
7 #include <jitify_helper.cuh>
8 
9 namespace quda
10 {
11 
12  template <typename Float> class Dslash : public TunableVectorYZ
13  {
14 
15 protected:
19 
20  const int nDimComms;
21 
23  char aux[8][TuneKey::aux_n];
24 
25 #ifdef JITIFY
26  // local copy of the static program pointer - this is a work
27  // around for issues with the static program pointer when
28  // HOSTDEBUG compilation is targeted (more precisely -fno-inline)
29  jitify::Program *program_;
30 #endif
31 
36  inline void fillAuxBase()
37  {
38  char comm[5];
39  comm[0] = (arg.commDim[0] ? '1' : '0');
40  comm[1] = (arg.commDim[1] ? '1' : '0');
41  comm[2] = (arg.commDim[2] ? '1' : '0');
42  comm[3] = (arg.commDim[3] ? '1' : '0');
43  comm[4] = '\0';
44  strcpy(aux_base, ",commDim=");
45  strcat(aux_base, comm);
46 
47  if (arg.xpay) strcat(aux_base, ",xpay");
48  if (arg.dagger) strcat(aux_base, ",dagger");
49  }
50 
56  inline void fillAux(KernelType kernel_type, const char *kernel_str)
57  {
58  strcpy(aux[kernel_type], kernel_str);
59  if (kernel_type == INTERIOR_KERNEL) strcat(aux[kernel_type], comm_dim_partitioned_string());
60  strncat(aux[kernel_type], aux_base, TuneKey::aux_n-1);
61  }
62 
  bool tuneGridDim() const { return false; } // grid size is derived from minThreads, not autotuned
  unsigned int minThreads() const { return arg.threads; } // number of active threads required by this dslash
65 
  /**
     @brief Set the dslash argument's per-call state: the temporal
     projection scale and the ghost-zone pointers the accessors read.
     Must be called before every kernel launch.
     @param[in,out] arg Kernel argument struct to update
  */
  template <typename Arg> inline void setParam(Arg &arg)
  {
    // when packing in time the t-projector normalization differs by a factor of 2
    arg.t_proj_scale = getKernelPackT() ? 1.0 : 2.0;

    // Need to reset ghost pointers prior to every call since the
    // ghost buffer may have been changed during policy tuning.
    // Also, the accessor constructor calls Ghost(), which uses
    // ghost_buf, but this is only presently set with the
    // synchronous exchangeGhost.
    static void *ghost[8] = {}; // needs to be persistent across interior and exterior calls
    for (int dim = 0; dim < 4; dim++) {

      for (int dir = 0; dir < 2; dir++) {
        // if doing interior kernel, then this is the initial call,
        // so we set all ghost pointers else if doing exterior
        // kernel, then we only have to update the non-p2p ghosts,
        // since these may have been assigned to zero-copy memory
        if (!comm_peer2peer_enabled(dir, dim) || arg.kernel_type == INTERIOR_KERNEL) {
          // byte-offset into the ghost buffer for this dimension/direction
          ghost[2 * dim + dir] = (Float *)((char *)in.Ghost2() + in.GhostOffset(dim, dir) * in.GhostPrecision());
        }
      }
    }

    arg.in.resetGhost(in, ghost);
  }
91 
  // average over 10 kernel launches per candidate when autotuning
  virtual int tuningIter() const { return 10; }

  // autotune the block size in steps of 16 threads, starting from 16
  int blockStep() const { return 16; }
  int blockMin() const { return 16; }

  // allow the autotuner to use up to the device's maximum dynamic shared memory
  unsigned int maxSharedBytesPerBlock() const { return maxDynamicSharedBytesPerBlock(); }
98 
99 public:
100  template <typename T, typename Arg>
101  inline void launch(T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream)
102  {
103  if (deviceProp.major >= 7) { // should test whether this is always optimal on Volta
105  }
106  void *args[] = {&arg};
107  qudaLaunchKernel((const void *)f, tp.grid, tp.block, args, tp.shared_bytes, stream);
108  }
109 
  /**
     @brief Instantiate the KernelType template and dispatch the launch
     according to the runtime kernel type (interior vs the various
     exterior/halo kernels).
     @param[in] tp Tuning parameters
     @param[in,out] arg Kernel argument struct (arg.kernel_type selects the case)
     @param[in] stream Stream on which the kernel is launched
  */
  template <template <typename, int, int, int, bool, bool, KernelType, typename> class Launch, int nDim, int nColor,
      int nParity, bool dagger, bool xpay, typename Arg>
  inline void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
  {

    if (in.Location() == QUDA_CPU_FIELD_LOCATION) {
      errorQuda("Not implemented");
    } else {
      switch (arg.kernel_type) {
      case INTERIOR_KERNEL:
        Launch<Float, nDim, nColor, nParity, dagger, xpay, INTERIOR_KERNEL, Arg>::launch(*this, tp, arg, stream);
        break;
#ifdef MULTI_GPU
      // exterior (halo-update) kernels only exist in multi-GPU builds
      case EXTERIOR_KERNEL_X:
        Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_X, Arg>::launch(*this, tp, arg, stream);
        break;
      case EXTERIOR_KERNEL_Y:
        Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_Y, Arg>::launch(*this, tp, arg, stream);
        break;
      case EXTERIOR_KERNEL_Z:
        Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_Z, Arg>::launch(*this, tp, arg, stream);
        break;
      case EXTERIOR_KERNEL_T:
        Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_T, Arg>::launch(*this, tp, arg, stream);
        break;
      case EXTERIOR_KERNEL_ALL:
        Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_ALL, Arg>::launch(*this, tp, arg, stream);
        break;
      default: errorQuda("Unexpected kernel type %d", arg.kernel_type);
#else
      default: errorQuda("Unexpected kernel type %d for single-GPU build", arg.kernel_type);
#endif
      }
    }
  }
152 
  /**
     @brief Instantiate the dagger template: dispatch on the runtime
     arg.dagger flag to the compile-time dagger parameter (or hand the
     runtime value straight to jitify when JITIFY is enabled).
     @param[in] tp Tuning parameters
     @param[in,out] arg Kernel argument struct
     @param[in] stream Stream on which the kernel is launched
  */
  template <template <typename, int, int, int, bool, bool, KernelType, typename> class Launch, int nDim, int nColor,
      int nParity, bool xpay, typename Arg>
  inline void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
  {
#ifdef JITIFY
    using namespace jitify::reflection;
    const auto kernel = Launch<void, 0, 0, 0, false, false, INTERIOR_KERNEL, Arg>::kernel;
    // NOTE(review): the assignment target (presumably Tunable::jitify_error,
    // as in the xpay overload) appears to have been lost in extraction —
    // confirm against the pristine header
    = program_->kernel(kernel)
    .instantiate(Type<Float>(), nDim, nColor, nParity, arg.dagger, xpay, arg.kernel_type, Type<Arg>())
    .configure(tp.grid, tp.block, tp.shared_bytes, stream)
    .launch(arg);
#else
    if (arg.dagger)
      instantiate<Launch, nDim, nColor, nParity, true, xpay>(tp, arg, stream);
    else
      instantiate<Launch, nDim, nColor, nParity, false, xpay>(tp, arg, stream);
#endif
  }
179 
  /**
     @brief Instantiate the nParity template: dispatch on the runtime
     arg.nParity value (1 or 2) to the compile-time nParity parameter
     (or hand the runtime value straight to jitify when JITIFY is enabled).
     @param[in] tp Tuning parameters
     @param[in,out] arg Kernel argument struct
     @param[in] stream Stream on which the kernel is launched
  */
  template <template <typename, int, int, int, bool, bool, KernelType, typename> class Launch, int nDim, int nColor,
      bool xpay, typename Arg>
  inline void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
  {
#ifdef JITIFY
    using namespace jitify::reflection;
    const auto kernel = Launch<void, 0, 0, 0, false, false, INTERIOR_KERNEL, Arg>::kernel;
    // NOTE(review): the assignment target (presumably Tunable::jitify_error,
    // as in the xpay overload) appears to have been lost in extraction —
    // confirm against the pristine header
    = program_->kernel(kernel)
    .instantiate(Type<Float>(), nDim, nColor, arg.nParity, arg.dagger, xpay, arg.kernel_type, Type<Arg>())
    .configure(tp.grid, tp.block, tp.shared_bytes, stream)
    .launch(arg);
#else
    switch (arg.nParity) {
    case 1: instantiate<Launch, nDim, nColor, 1, xpay>(tp, arg, stream); break;
    case 2: instantiate<Launch, nDim, nColor, 2, xpay>(tp, arg, stream); break;
    default: errorQuda("nParity = %d undefined\n", arg.nParity);
    }
#endif
  }
207 
  /**
     @brief Instantiate the xpay template: dispatch on the runtime
     arg.xpay flag to the compile-time xpay parameter (or hand the
     runtime value straight to jitify when JITIFY is enabled).
     @param[in] tp Tuning parameters
     @param[in,out] arg Kernel argument struct
     @param[in] stream Stream on which the kernel is launched
  */
  template <template <typename, int, int, int, bool, bool, KernelType, typename> class Launch, int nDim, int nColor,
      typename Arg>
  inline void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
  {
#ifdef JITIFY
    using namespace jitify::reflection;
    const auto kernel = Launch<void, 0, 0, 0, false, false, INTERIOR_KERNEL, Arg>::kernel;
    // record any jitify failure so the autotuner can skip this configuration
    Tunable::jitify_error = program_->kernel(kernel)
    .instantiate(Type<Float>(), nDim, nColor, arg.nParity, arg.dagger, arg.xpay,
        arg.kernel_type, Type<Arg>())
    .configure(tp.grid, tp.block, tp.shared_bytes, stream)
    .launch(arg);
#else
    if (arg.xpay)
      instantiate<Launch, nDim, nColor, true>(tp, arg, stream);
    else
      instantiate<Launch, nDim, nColor, false>(tp, arg, stream);
#endif
  }
234 
235  DslashArg<Float> &dslashParam; // temporary addition for policy compatibility
236 
  /**
     @brief Constructor: records the fields and argument struct, sets
     the packing-kernel communication pattern, and fills the autotuning
     aux strings for every kernel type.
     @param[in,out] arg Dslash argument struct
     @param[in] out Output field (used for location checks and tuning save/restore)
     @param[in] in Input field
     @param[in] src Source-code string used to build the jitify program (JITIFY only)
  */
  Dslash(DslashArg<Float> &arg, const ColorSpinorField &out, const ColorSpinorField &in, const char *src) :
    TunableVectorYZ(1, arg.nParity),
    arg(arg),
    out(out),
    in(in),
    nDimComms(4),
    dslashParam(arg)
  {
    if (checkLocation(out, in) == QUDA_CPU_FIELD_LOCATION)
      errorQuda("CPU Fields not supported in Dslash framework yet");

    // this sets the communications pattern for the packing kernel
    setPackComms(arg.commDim);

    // strcpy(aux, in.AuxString());
    fillAuxBase();
#ifdef MULTI_GPU
    fillAux(INTERIOR_KERNEL, "policy_kernel=interior");
    fillAux(EXTERIOR_KERNEL_ALL, "policy_kernel=exterior_all");
    fillAux(EXTERIOR_KERNEL_X, "policy_kernel=exterior_x");
    fillAux(EXTERIOR_KERNEL_Y, "policy_kernel=exterior_y");
    fillAux(EXTERIOR_KERNEL_Z, "policy_kernel=exterior_z");
    fillAux(EXTERIOR_KERNEL_T, "policy_kernel=exterior_t");
#else
    fillAux(INTERIOR_KERNEL, "policy_kernel=single-GPU");
#endif // MULTI_GPU
    fillAux(KERNEL_POLICY, "policy");

#ifdef JITIFY
    // keep a local copy of the static program pointer (HOSTDEBUG workaround)
    create_jitify_program(src);
    program_ = program;
#endif
  }
270 
  // number of faces exchanged per dimension
  int Nface() const
  {
    return 2 * arg.nFace;
  } // factor of 2 is for forwards/backwards (convention used in dslash policy)
  // whether the dagger (Hermitian-conjugate) operator is applied
  int Dagger() const { return arg.dagger; }

  // read the autotuning aux string for the given kernel type
  const char *getAux(KernelType type) const { return aux[type]; }

  // overwrite the autotuning aux string for the given kernel type
  void setAux(KernelType type, const char *aux_) { strcpy(aux[type], aux_); }

  // append to the autotuning aux string for the given kernel type
  // NOTE(review): unbounded strcat — assumes extra fits in the remaining
  // TuneKey::aux_n space; confirm callers pass short strings
  void augmentAux(KernelType type, const char *extra) { strcat(aux[type], extra); }
282 
287  virtual void preTune()
288  {
289  if (arg.kernel_type != INTERIOR_KERNEL && arg.kernel_type != KERNEL_POLICY) out.backup();
290  }
291 
295  virtual void postTune()
296  {
297  if (arg.kernel_type != INTERIOR_KERNEL && arg.kernel_type != KERNEL_POLICY) out.restore();
298  }
299 
300  /*
301  per direction / dimension flops
302  spin project flops = Nc * Ns
303  SU(3) matrix-vector flops = (8 Nc - 2) * Nc
304  spin reconstruction flops = 2 * Nc * Ns (just an accumulation to all components)
305  xpay = 2 * 2 * Nc * Ns
306 
307  So for the full dslash we have, where for the final spin
308  reconstruct we have -1 since the first direction does not
309  require any accumulation.
310 
311  flops = (2 * Nd * Nc * Ns) + (2 * Nd * (Ns/2) * (8*Nc-2) * Nc) + ((2 * Nd - 1) * 2 * Nc * Ns)
312  flops_xpay = flops + 2 * 2 * Nc * Ns
313 
314  For Wilson this should give 1344 for Nc=3,Ns=2 and 1368 for the xpay equivalent
315  */
  /**
     @brief Floating-point operation count for the current kernel type,
     following the per-direction accounting in the comment block above.
     @return Total flops for this kernel invocation
  */
  virtual long long flops() const
  {
    int mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor(); // SU(3) matrix-vector flops
    int num_mv_multiply = in.Nspin() == 4 ? 2 : 1; // two half-spinor multiplies when spin-projecting
    int ghost_flops = (num_mv_multiply * mv_flops + 2 * in.Ncolor() * in.Nspin());
    int xpay_flops = 2 * 2 * in.Ncolor() * in.Nspin(); // multiply and add per real component
    int num_dir = 2 * 4; // set to 4-d since we take care of 5-d fermions in derived classes where necessary

    long long flops_ = 0;

    // FIXME - should we count the xpay flops in the derived kernels
    // since some kernels require the xpay in the exterior (preconditiond clover)

    switch (arg.kernel_type) {
    case EXTERIOR_KERNEL_X:
    case EXTERIOR_KERNEL_Y:
    case EXTERIOR_KERNEL_Z:
    case EXTERIOR_KERNEL_T:
      // assumes EXTERIOR_KERNEL_{X,Y,Z,T} enumerate to 0..3 so they can
      // index GhostFace() directly — TODO confirm against the enum definition
      flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * 2 * in.GhostFace()[arg.kernel_type];
      break;
    case EXTERIOR_KERNEL_ALL: {
      // sum over all four halo faces, forwards and backwards
      long long ghost_sites = 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
      flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * ghost_sites;
      break;
    }
    case INTERIOR_KERNEL:
    case KERNEL_POLICY: {
      long long sites = in.Volume();
      flops_ = (num_dir * (in.Nspin() / 4) * in.Ncolor() * in.Nspin() + // spin project (=0 for staggered)
          num_dir * num_mv_multiply * mv_flops + // SU(3) matrix-vector multiplies
          ((num_dir - 1) * 2 * in.Ncolor() * in.Nspin()))
        * sites; // accumulation
      if (arg.xpay) flops_ += xpay_flops * sites;

      // KERNEL_POLICY accounts for the full operator, so no halo correction
      if (arg.kernel_type == KERNEL_POLICY) break;
      // now correct for flops done by exterior kernel
      long long ghost_sites = 0;
      for (int d = 0; d < 4; d++)
        if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
      flops_ -= ghost_flops * ghost_sites;

      break;
    }
    }

    return flops_;
  }
363 
  /**
     @brief Byte traffic estimate for the current kernel type
     (gauge-field loads, spinor loads/stores, and ghost traffic).
     @return Total bytes moved by this kernel invocation
  */
  virtual long long bytes() const
  {
    int gauge_bytes = arg.reconstruct * in.Precision(); // bytes per gauge link (reconstruct = reals stored)
    // assumes QudaPrecision enum values equal the byte width of the type —
    // TODO confirm; true for half (2) and quarter (1) as compared here
    bool isFixed = (in.Precision() == sizeof(short) || in.Precision() == sizeof(char)) ? true : false;
    int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed ? sizeof(float) : 0);
    int proj_spinor_bytes = in.Nspin() == 4 ? spinor_bytes / 2 : spinor_bytes; // spin-projected halves for Wilson-like
    int ghost_bytes = (proj_spinor_bytes + gauge_bytes) + 2 * spinor_bytes; // 2 since we have to load the partial
    int num_dir = 2 * 4; // set to 4-d since we take care of 5-d fermions in derived classes where necessary

    long long bytes_ = 0;

    switch (arg.kernel_type) {
    case EXTERIOR_KERNEL_X:
    case EXTERIOR_KERNEL_Y:
    case EXTERIOR_KERNEL_Z:
    case EXTERIOR_KERNEL_T: bytes_ = ghost_bytes * 2 * in.GhostFace()[arg.kernel_type]; break;
    case EXTERIOR_KERNEL_ALL: {
      long long ghost_sites = 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
      bytes_ = ghost_bytes * ghost_sites;
      break;
    }
    case INTERIOR_KERNEL:
    case KERNEL_POLICY: {
      long long sites = in.Volume();
      bytes_ = (num_dir * gauge_bytes + ((num_dir - 2) * spinor_bytes + 2 * proj_spinor_bytes) + spinor_bytes) * sites;
      // NOTE(review): flops() scales its xpay term by sites, but here only a
      // single spinor's bytes are added — looks like a missing "* sites";
      // confirm against the upstream source before changing
      if (arg.xpay) bytes_ += spinor_bytes;

      // KERNEL_POLICY accounts for the full operator, so no halo correction
      if (arg.kernel_type == KERNEL_POLICY) break;
      // now correct for bytes done by exterior kernel
      long long ghost_sites = 0;
      for (int d = 0; d < 4; d++)
        if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
      bytes_ -= ghost_bytes * ghost_sites;

      break;
    }
    }
    return bytes_;
  }
403  };
404 
406  static constexpr QudaReconstructType recon0 = QUDA_RECONSTRUCT_NO;
407  static constexpr QudaReconstructType recon1 = QUDA_RECONSTRUCT_12;
408  static constexpr QudaReconstructType recon2 = QUDA_RECONSTRUCT_8;
409  };
410 
412  static constexpr QudaReconstructType recon0 = QUDA_RECONSTRUCT_NO;
413  static constexpr QudaReconstructType recon1 = QUDA_RECONSTRUCT_13;
414  static constexpr QudaReconstructType recon2 = QUDA_RECONSTRUCT_9;
415  };
416 
424  template <template <typename, int, QudaReconstructType> class Apply, typename Recon, typename Float, int nColor,
425  typename... Args>
426  inline void instantiate(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, Args &&... args)
427  {
428  if (U.Reconstruct() == Recon::recon0) {
429 #if QUDA_RECONSTRUCT & 4
430  Apply<Float, nColor, Recon::recon0>(out, in, U, args...);
431 #else
432  errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-18", QUDA_RECONSTRUCT);
433 #endif
434  } else if (U.Reconstruct() == Recon::recon1) {
435 #if QUDA_RECONSTRUCT & 2
436  Apply<Float, nColor, Recon::recon1>(out, in, U, args...);
437 #else
438  errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-12", QUDA_RECONSTRUCT);
439 #endif
440  } else if (U.Reconstruct() == Recon::recon2) {
441 #if QUDA_RECONSTRUCT & 1
442  Apply<Float, nColor, Recon::recon2>(out, in, U, args...);
443 #else
444  errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-12", QUDA_RECONSTRUCT);
445 #endif
446  } else {
447  errorQuda("Unsupported reconstruct type %d\n", U.Reconstruct());
448  }
449  }
450 
458  template <template <typename, int, QudaReconstructType> class Apply, typename Recon, typename Float, typename... Args>
459  inline void instantiate(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, Args &&... args)
460  {
461  if (in.Ncolor() == 3) {
462  instantiate<Apply, Recon, Float, 3>(out, in, U, args...);
463  } else {
464  errorQuda("Unsupported number of colors %d\n", U.Ncolor());
465  }
466  }
467 
  /**
     @brief Instantiate the precision template: dispatch on the runtime
     gauge-field precision to the compile-time Float type, respecting the
     QUDA_PRECISION build mask (8 = double, 4 = single, 2 = half, 1 = quarter).
     @param[out] out Output result field
     @param[in] in Input field
     @param[in] U Gauge field whose precision is dispatched on
     @param[in] args Additional arguments forwarded to the Apply functor
  */
  template <template <typename, int, QudaReconstructType> class Apply, typename Recon = WilsonReconstruct, typename... Args>
  inline void instantiate(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, Args &&... args)
  {
    if (U.Precision() == QUDA_DOUBLE_PRECISION) {
#if QUDA_PRECISION & 8
      instantiate<Apply, Recon, double>(out, in, U, args...);
#else
      errorQuda("QUDA_PRECISION=%d does not enable double precision", QUDA_PRECISION);
#endif
    } else if (U.Precision() == QUDA_SINGLE_PRECISION) {
#if QUDA_PRECISION & 4
      instantiate<Apply, Recon, float>(out, in, U, args...);
#else
      errorQuda("QUDA_PRECISION=%d does not enable single precision", QUDA_PRECISION);
#endif
    } else if (U.Precision() == QUDA_HALF_PRECISION) {
#if QUDA_PRECISION & 2
      instantiate<Apply, Recon, short>(out, in, U, args...);
#else
      errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
#endif
    } else if (U.Precision() == QUDA_QUARTER_PRECISION) {
#if QUDA_PRECISION & 1
      instantiate<Apply, Recon, char>(out, in, U, args...);
#else
      errorQuda("QUDA_PRECISION=%d does not enable quarter precision", QUDA_PRECISION);
#endif
    } else {
      errorQuda("Unsupported precision %d\n", U.Precision());
    }
  }
506 
507 } // namespace quda
virtual void postTune()
Restore the output field if doing exterior kernel.
Definition: dslash.h:295
void launch(T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream)
Definition: dslash.h:101
KernelType kernel_type
unsigned int minThreads() const
Definition: dslash.h:64
void setParam(Arg &arg)
Definition: dslash.h:66
cudaDeviceProp deviceProp
bool getKernelPackT()
Definition: dslash_quda.cu:26
#define errorQuda(...)
Definition: util_quda.h:121
int Dagger() const
Definition: dslash.h:275
Helper file when using jitify run-time compilation. This file should be included in source code...
QudaPrecision GhostPrecision() const
const int nDimComms
Definition: dslash.h:20
cudaStream_t * stream
unsigned int maxSharedBytesPerBlock() const
The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily t...
Definition: dslash.h:97
void augmentAux(KernelType type, const char *extra)
Definition: dslash.h:281
const QudaReconstructType reconstruct
const char * comm_dim_partitioned_string(const int *comm_dim_override=0)
Return a string that defines the comm partitioning (used as a tuneKey)
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
Definition: blas_quda.h:37
Dslash(DslashArg< Float > &arg, const ColorSpinorField &out, const ColorSpinorField &in, const char *src)
Definition: dslash.h:237
int_fastdiv threads
void setAux(KernelType type, const char *aux_)
Definition: dslash.h:279
int Ncolor() const
Definition: gauge_field.h:249
virtual long long bytes() const
Definition: dslash.h:364
void setMaxDynamicSharedBytesPerBlock(F *func) const
Enable the maximum dynamic shared bytes for the kernel "func" (values given by maxDynamicSharedBytesP...
Definition: tune_quda.h:181
void fillAux(KernelType kernel_type, const char *kernel_str)
Specialize the auxiliary strings for each kernel type.
Definition: dslash.h:56
const ColorSpinorField & in
Definition: dslash.h:18
const int nColor
Definition: covdev_test.cpp:75
void fillAuxBase()
Set the base strings used by the different dslash kernel types for autotuning.
Definition: dslash.h:36
bool tuneGridDim() const
Definition: dslash.h:63
CUresult jitify_error
Definition: tune_quda.h:276
#define checkLocation(...)
virtual void backup() const
Backs up the LatticeField.
DslashArg< Float > & dslashParam
Definition: dslash.h:235
const ColorSpinorField & out
Definition: dslash.h:17
int GhostOffset(const int i) const
bool comm_peer2peer_enabled(int dir, int dim)
QudaFieldLocation Location() const
int blockMin() const
Definition: dslash.h:95
const int nParity
Definition: spinor_noise.cu:25
int Nface() const
Definition: dslash.h:271
enum QudaReconstructType_s QudaReconstructType
DslashArg< Float > & arg
Definition: dslash.h:16
static const int aux_n
Definition: tune_key.h:12
void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
This instantiate function is used to instantiate the the KernelType template required for the multi-G...
Definition: dslash.h:119
void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
This instantiate function is used to instantiate the the nParity template.
Definition: dslash.h:189
const int * GhostFace() const
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:250
char aux_base[TuneKey::aux_n - 32]
Definition: dslash.h:22
void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
This instantiate function is used to instantiate the the xpay template.
Definition: dslash.h:217
virtual const void * Ghost2() const
unsigned int maxDynamicSharedBytesPerBlock() const
This can't be correctly queried in CUDA for all architectures so here we set this. Based on Table 14 of the CUDA Programming Guide 10.0 (Technical Specifications per Compute Capability).
Definition: tune_quda.h:198
void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
This instantiate function is used to instantiate the the dagger template.
Definition: dslash.h:162
char aux[8][TuneKey::aux_n]
Definition: dslash.h:23
virtual long long flops() const
Definition: dslash.h:316
QudaPrecision Precision() const
virtual void preTune()
Save the output field since the output field is both read from and written to in the exterior kernels...
Definition: dslash.h:287
QudaDagType dagger
Definition: test_util.cpp:1620
const char * getAux(KernelType type) const
Definition: dslash.h:277
int blockStep() const
Definition: dslash.h:94
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
virtual void restore() const
Restores the LatticeField.
virtual int tuningIter() const
Definition: dslash.h:92
void setPackComms(const int *dim_pack)
Helper function that sets which dimensions the packing kernel should be packing for.
Definition: dslash_pack2.cu:14