QUDA 1.0.0
dslash5_domain_wall.cu
#include <color_spinor_field.h>
#include <color_spinor_field_order.h>
#include <dslash_quda.h>
#include <index_helper.cuh>

#include <kernels/dslash_domain_wall_m5.cuh>

namespace quda
{

  /*
     FIXME
     - fix flops counters
     - check dagger operators are correct - there might need to be a
       shift by 1 in which coefficients are used and conjugation of coefficients
     - use kappa notation and not b/c for consistency with other codes and sanity
  */

  template <typename Float, int nColor, typename Arg> class Dslash5 : public TunableVectorYZ
  {

protected:
    Arg &arg;
    const ColorSpinorField &meta;
    static constexpr bool shared = true; // whether to use shared memory cache blocking for M5inv

    // whether to use the variable-coefficient inverse algorithm (required for ZMOBIUS)
    static constexpr bool var_inverse = true;

    long long flops() const
    {
      long long Ls = meta.X(4);
      long long bulk = (Ls - 2) * (meta.Volume() / Ls);
      long long wall = 2 * meta.Volume() / Ls;
      long long n = meta.Ncolor() * meta.Nspin();

      long long flops_ = 0;
      switch (arg.type) {
      case DSLASH5_DWF: flops_ = n * (8ll * bulk + 10ll * wall + (arg.xpay ? 4ll * meta.Volume() : 0)); break;
      case DSLASH5_MOBIUS_PRE:
        flops_ = n * (8ll * bulk + 10ll * wall + 14ll * meta.Volume() + (arg.xpay ? 8ll * meta.Volume() : 0));
        break;
      case DSLASH5_MOBIUS:
        flops_ = n * (8ll * bulk + 10ll * wall + 8ll * meta.Volume() + (arg.xpay ? 8ll * meta.Volume() : 0));
        break;
      case M5_INV_DWF:
      case M5_INV_MOBIUS: // FIXME flops
        // flops_ = ((2 + 8 * n) * Ls + (arg.xpay ? 4ll : 0)) * meta.Volume();
        flops_ = (144 * Ls + (arg.xpay ? 4ll : 0)) * meta.Volume();
        break;
      case M5_INV_ZMOBIUS:
        // flops_ = ((12 + 16 * n) * Ls + (arg.xpay ? 8ll : 0)) * meta.Volume();
        flops_ = (144 * Ls + (arg.xpay ? 8ll : 0)) * meta.Volume();
        break;
      default: errorQuda("Unknown Dslash5Type %d", arg.type);
      }

      return flops_;
    }
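
    // Editorial illustration (not part of the original file): for the common case
    // nColor = 3, nSpin = 4, n = 12 spin-color components and the DWF count above reads
    //   flops(DSLASH5_DWF) = 12 * (8 * (Ls - 2) * V4 + 10 * 2 * V4) + (xpay ? 4 * 12 * Ls * V4 : 0)
    // with V4 = meta.Volume() / Ls, i.e. 8 flops per component on the (Ls - 2) bulk
    // slices, 10 per component on the two wall slices, and 4 per component over the
    // full 5-d volume for the optional xpay accumulation.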
61 
62  long long bytes() const
63  {
64  long long Ls = meta.X(4);
65  switch (arg.type) {
66  case DSLASH5_DWF: return arg.out.Bytes() + 2 * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
67  case DSLASH5_MOBIUS_PRE: return arg.out.Bytes() + 3 * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
68  case DSLASH5_MOBIUS: return arg.out.Bytes() + 3 * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
69  case M5_INV_DWF: return arg.out.Bytes() + Ls * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
70  case M5_INV_MOBIUS: return arg.out.Bytes() + Ls * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
71  case M5_INV_ZMOBIUS: return arg.out.Bytes() + Ls * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
72  default: errorQuda("Unknown Dslash5Type %d", arg.type);
73  }
74  return 0ll;
75  }
76 
77  bool tuneGridDim() const { return false; }
78  unsigned int minThreads() const { return arg.volume_4d_cb; }
79  int blockStep() const { return 4; }
80  int blockMin() const { return 4; }
81  unsigned int sharedBytesPerThread() const
82  {
83  if (shared && (arg.type == M5_INV_DWF || arg.type == M5_INV_MOBIUS || arg.type == M5_INV_ZMOBIUS)) {
84  // spin components in shared depend on inversion algorithm
85  int nSpin = var_inverse ? meta.Nspin() / 2 : meta.Nspin();
86  return 2 * nSpin * nColor * sizeof(typename mapper<Float>::type);
87  } else {
88  return 0;
89  }
90  }
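
    // Editorial illustration (not part of the original file): with var_inverse the
    // shared cache holds half the spin components, so for double precision with
    // nColor = 3 and nSpin = 4 this is 2 (re/im) * 2 (spin) * 3 (color) * sizeof(double)
    // = 96 bytes per thread. The tuner below multiplies this by the full block volume
    // (block.x * block.y * block.z), with block.y pinned to Ls for the shared-memory
    // inverse, to obtain the dynamic shared memory request per block.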

    // overloaded to return max dynamic shared memory if doing shared-memory inverse
    unsigned int maxSharedBytesPerBlock() const
    {
      if (shared && (arg.type == M5_INV_DWF || arg.type == M5_INV_MOBIUS || arg.type == M5_INV_ZMOBIUS)) {
        return maxDynamicSharedBytesPerBlock();
      } else {
        return TunableVectorYZ::maxSharedBytesPerBlock();
      }
    }

public:
    Dslash5(Arg &arg, const ColorSpinorField &meta) : TunableVectorYZ(arg.Ls, arg.nParity), arg(arg), meta(meta)
    {
      strcpy(aux, meta.AuxString());
      if (arg.dagger) strcat(aux, ",Dagger");
      if (arg.xpay) strcat(aux, ",xpay");
      switch (arg.type) {
      case DSLASH5_DWF: strcat(aux, ",DSLASH5_DWF"); break;
      case DSLASH5_MOBIUS_PRE: strcat(aux, ",DSLASH5_MOBIUS_PRE"); break;
      case DSLASH5_MOBIUS: strcat(aux, ",DSLASH5_MOBIUS"); break;
      case M5_INV_DWF: strcat(aux, ",M5_INV_DWF"); break;
      case M5_INV_MOBIUS: strcat(aux, ",M5_INV_MOBIUS"); break;
      case M5_INV_ZMOBIUS: strcat(aux, ",M5_INV_ZMOBIUS"); break;
      default: errorQuda("Unknown Dslash5Type %d", arg.type);
      }
    }
    virtual ~Dslash5() {}

    template <typename T> inline void launch(T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream)
    {
      if (shared && (arg.type == M5_INV_DWF || arg.type == M5_INV_MOBIUS || arg.type == M5_INV_ZMOBIUS)) {
        // if inverse kernel uses shared memory then maximize total shared memory pool
        setMaxDynamicSharedBytesPerBlock(f);
      }
      void *args[] = {&arg};
      qudaLaunchKernel((const void *)f, tp.grid, tp.block, args, tp.shared_bytes, stream);
    }
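
    // Editorial note (assumption, not from this file): setMaxDynamicSharedBytesPerBlock
    // is taken to opt the kernel in to the device's full dynamic shared memory allowance,
    // roughly along the lines of
    //
    //   cudaFuncSetAttribute((const void *)f, cudaFuncAttributePreferredSharedMemoryCarveout,
    //                        (int)cudaSharedmemCarveoutMaxShared);
    //   cudaFuncSetAttribute((const void *)f, cudaFuncAttributeMaxDynamicSharedMemorySize,
    //                        maxDynamicSharedBytesPerBlock());
    //
    // which is required on Volta and later before a launch may request more than 48 KiB
    // of dynamic shared memory per block.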

    void apply(const cudaStream_t &stream)
    {
      if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
        errorQuda("CPU variant not instantiated");
      } else {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        if (arg.type == DSLASH5_DWF) {
          if (arg.xpay)
            arg.dagger ? launch(dslash5GPU<Float, nColor, true, true, DSLASH5_DWF, Arg>, tp, arg, stream) :
                         launch(dslash5GPU<Float, nColor, false, true, DSLASH5_DWF, Arg>, tp, arg, stream);
          else
            arg.dagger ? launch(dslash5GPU<Float, nColor, true, false, DSLASH5_DWF, Arg>, tp, arg, stream) :
                         launch(dslash5GPU<Float, nColor, false, false, DSLASH5_DWF, Arg>, tp, arg, stream);
        } else if (arg.type == DSLASH5_MOBIUS_PRE) {
          if (arg.xpay)
            arg.dagger ? launch(dslash5GPU<Float, nColor, true, true, DSLASH5_MOBIUS_PRE, Arg>, tp, arg, stream) :
                         launch(dslash5GPU<Float, nColor, false, true, DSLASH5_MOBIUS_PRE, Arg>, tp, arg, stream);
          else
            arg.dagger ? launch(dslash5GPU<Float, nColor, true, false, DSLASH5_MOBIUS_PRE, Arg>, tp, arg, stream) :
                         launch(dslash5GPU<Float, nColor, false, false, DSLASH5_MOBIUS_PRE, Arg>, tp, arg, stream);
        } else if (arg.type == DSLASH5_MOBIUS) {
          if (arg.xpay)
            arg.dagger ? launch(dslash5GPU<Float, nColor, true, true, DSLASH5_MOBIUS, Arg>, tp, arg, stream) :
                         launch(dslash5GPU<Float, nColor, false, true, DSLASH5_MOBIUS, Arg>, tp, arg, stream);
          else
            arg.dagger ? launch(dslash5GPU<Float, nColor, true, false, DSLASH5_MOBIUS, Arg>, tp, arg, stream) :
                         launch(dslash5GPU<Float, nColor, false, false, DSLASH5_MOBIUS, Arg>, tp, arg, stream);
        } else if (arg.type == M5_INV_DWF) {
          if (arg.xpay)
            arg.dagger ? launch(dslash5invGPU<Float, nColor, true, true, M5_INV_DWF, shared, var_inverse, Arg>, tp, arg, stream) :
                         launch(dslash5invGPU<Float, nColor, false, true, M5_INV_DWF, shared, var_inverse, Arg>, tp, arg, stream);
          else
            arg.dagger ? launch(dslash5invGPU<Float, nColor, true, false, M5_INV_DWF, shared, var_inverse, Arg>, tp, arg, stream) :
                         launch(dslash5invGPU<Float, nColor, false, false, M5_INV_DWF, shared, var_inverse, Arg>, tp, arg, stream);
        } else if (arg.type == M5_INV_MOBIUS) {
          if (arg.xpay)
            arg.dagger ? launch(dslash5invGPU<Float, nColor, true, true, M5_INV_MOBIUS, shared, var_inverse, Arg>, tp, arg, stream) :
                         launch(dslash5invGPU<Float, nColor, false, true, M5_INV_MOBIUS, shared, var_inverse, Arg>, tp, arg, stream);
          else
            arg.dagger ? launch(dslash5invGPU<Float, nColor, true, false, M5_INV_MOBIUS, shared, var_inverse, Arg>, tp, arg, stream) :
                         launch(dslash5invGPU<Float, nColor, false, false, M5_INV_MOBIUS, shared, var_inverse, Arg>, tp, arg, stream);
        } else if (arg.type == M5_INV_ZMOBIUS) {
          if (arg.xpay)
            arg.dagger ? launch(dslash5invGPU<Float, nColor, true, true, M5_INV_ZMOBIUS, shared, var_inverse, Arg>, tp, arg, stream) :
                         launch(dslash5invGPU<Float, nColor, false, true, M5_INV_ZMOBIUS, shared, var_inverse, Arg>, tp, arg, stream);
          else
            arg.dagger ? launch(dslash5invGPU<Float, nColor, true, false, M5_INV_ZMOBIUS, shared, var_inverse, Arg>, tp, arg, stream) :
                         launch(dslash5invGPU<Float, nColor, false, false, M5_INV_ZMOBIUS, shared, var_inverse, Arg>, tp, arg, stream);
        }
      }
    }

    void initTuneParam(TuneParam &param) const
    {
      TunableVectorYZ::initTuneParam(param);
      if (shared && (arg.type == M5_INV_DWF || arg.type == M5_INV_MOBIUS || arg.type == M5_INV_ZMOBIUS)) {
        param.block.y = arg.Ls; // Ls must be contained in the block
        param.grid.y = 1;
        param.shared_bytes = sharedBytesPerThread() * param.block.x * param.block.y * param.block.z;
      }
    }

    void defaultTuneParam(TuneParam &param) const
    {
      TunableVectorYZ::defaultTuneParam(param);
      if (shared && (arg.type == M5_INV_DWF || arg.type == M5_INV_MOBIUS || arg.type == M5_INV_ZMOBIUS)) {
        param.block.y = arg.Ls; // Ls must be contained in the block
        param.grid.y = 1;
        param.shared_bytes = sharedBytesPerThread() * param.block.x * param.block.y * param.block.z;
      }
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
  };

  template <typename Float, int nColor>
  void ApplyDslash5(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f,
                    double m_5, const Complex *b_5, const Complex *c_5, double a, bool dagger, Dslash5Type type)
  {
    Dslash5Arg<Float, nColor> arg(out, in, x, m_f, m_5, b_5, c_5, a, dagger, type);
    Dslash5<Float, nColor, Dslash5Arg<Float, nColor>> dslash(arg, in);
    dslash.apply(streams[Nstream - 1]);
  }

  // template on the number of colors
  template <typename Float>
  void ApplyDslash5(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f,
                    double m_5, const Complex *b_5, const Complex *c_5, double a, bool dagger, Dslash5Type type)
  {
    switch (in.Ncolor()) {
    case 3: ApplyDslash5<Float, 3>(out, in, x, m_f, m_5, b_5, c_5, a, dagger, type); break;
    default: errorQuda("Unsupported number of colors %d\n", in.Ncolor());
    }
  }

  // Apply the 5th dimension dslash operator to a colorspinor field
  // out = Dslash5*in
  void ApplyDslash5(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f,
                    double m_5, const Complex *b_5, const Complex *c_5, double a, bool dagger, Dslash5Type type)
  {
#ifdef GPU_DOMAIN_WALL_DIRAC
    if (in.PCType() != QUDA_4D_PC) errorQuda("Only 4-d preconditioned fields are supported");
    checkLocation(out, in); // check all locations match

    switch (checkPrecision(out, in)) {
    case QUDA_DOUBLE_PRECISION: ApplyDslash5<double>(out, in, x, m_f, m_5, b_5, c_5, a, dagger, type); break;
    case QUDA_SINGLE_PRECISION: ApplyDslash5<float>(out, in, x, m_f, m_5, b_5, c_5, a, dagger, type); break;
    case QUDA_HALF_PRECISION: ApplyDslash5<short>(out, in, x, m_f, m_5, b_5, c_5, a, dagger, type); break;
    case QUDA_QUARTER_PRECISION: ApplyDslash5<char>(out, in, x, m_f, m_5, b_5, c_5, a, dagger, type); break;
    default: errorQuda("Unsupported precision %d\n", in.Precision());
    }
#else
    errorQuda("Domain wall dslash has not been built");
#endif
  }

} // namespace quda
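
A minimal calling sketch (editorial addition, not part of the file): it assumes device-resident, 4-d preconditioned ColorSpinorField objects out, in and x with Ls slices in the fifth dimension have already been constructed, and the Mobius coefficient and mass values below are placeholders. Only the entry point ApplyDslash5, the Complex typedef, and the Dslash5Type enum come from the code above (declared in dslash_quda.h).

  #include <dslash_quda.h> // declares quda::ApplyDslash5, Complex, Dslash5Type
  #include <vector>

  void call_dslash5_example(quda::ColorSpinorField &out, const quda::ColorSpinorField &in,
                            const quda::ColorSpinorField &x, int Ls)
  {
    using quda::Complex;

    // hypothetical Mobius coefficients, one pair per fifth-dimension slice
    std::vector<Complex> b_5(Ls, Complex(1.5, 0.0));
    std::vector<Complex> c_5(Ls, Complex(0.5, 0.0));

    double m_f = 0.01; // fermion mass
    double m_5 = -1.8; // domain-wall height
    double a = 0.0;    // xpay scale factor (assumed inactive for this variant)

    // out = Dslash5_pre * in for the Mobius pre-application term, no dagger
    quda::ApplyDslash5(out, in, x, m_f, m_5, b_5.data(), c_5.data(), a, false, quda::DSLASH5_MOBIUS_PRE);
  }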