QUDA  0.9.0
dslash_domain_wall_4d.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 #include <quda_internal.h>
20 #include <dslash_quda.h>
21 #include <sys/time.h>
22 #include <blas_quda.h>
23 
24 #include <inline_ptx.h>
25 
26 namespace quda {
27 
28  namespace domainwall4d {
29 
30 #undef GPU_STAGGERED_DIRAC
31 #include <dslash_constants.h>
32 #include <dslash_textures.h>
33 #include <dslash_index.cuh>
34 
35  // Enable shared memory dslash for Fermi architecture
36  //#define SHARED_WILSON_DSLASH
37  //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access
38 
39 #ifdef GPU_DOMAIN_WALL_DIRAC
40 #include <dw_dslash4_def.h> // Dslash4 Domain Wall kernels
41 #include <dw_dslash5_def.h> // Dslash5 Domain Wall kernels
42 #include <dw_dslash5inv_def.h> // Dslash5inv Domain Wall kernels
43 #endif
44 
45 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
46 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
47 #endif
48 
49 #include <dslash_quda.cuh>
50  }
51 
52  // declare the dslash events
53 #include <dslash_events.cuh>
54 
55  using namespace domainwall4d;
56 
57 #ifdef GPU_DOMAIN_WALL_DIRAC
58  template <typename sFloat, typename gFloat>
59  class DomainWallDslash4DPCCuda : public DslashCuda {
60 
61  private:
62  const int DS_type;
63 
64  bool checkGrid(TuneParam &param) const {
65  if (param.grid.x > (unsigned int)deviceProp.maxGridSize[0] || param.grid.y > (unsigned int)deviceProp.maxGridSize[1]) {
66  warningQuda("Autotuner is skipping blockDim=(%u,%u,%u), gridDim=(%u,%u,%u) because lattice volume is too large",
67  param.block.x, param.block.y, param.block.z, param.grid.x, param.grid.y, param.grid.z);
68  return false;
69  } else {
70  return true;
71  }
72  }
73 
74  protected:
75  bool advanceBlockDim(TuneParam &param) const
76  {
77  const unsigned int max_shared = 16384; // FIXME: use deviceProp.sharedMemPerBlock;
78  const int step[2] = { deviceProp.warpSize, 1 };
79  bool advance[2] = { false, false };
80 
81  // first try to advance block.x
82  param.block.x += step[0];
83  if (param.block.x > (unsigned int)deviceProp.maxThreadsDim[0] ||
84  sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
85  advance[0] = false;
86  param.block.x = step[0]; // reset block.x
87  } else {
88  advance[0] = true; // successfully advanced block.x
89  }
90 
91  if (!advance[0]) { // if failed to advance block.x, now try block.y
92  param.block.y += step[1];
93 
94  if (param.block.y > (unsigned)in->X(4) ||
95  sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
96  advance[1] = false;
97  param.block.y = step[1]; // reset block.x
98  } else {
99  advance[1] = true; // successfully advanced block.y
100  }
101  }
102 
103  if (advance[0] || advance[1]) {
104  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
105  (in->X(4)+param.block.y-1) / param.block.y, 1);
106 
107  bool advance = true;
108  if (!checkGrid(param)) advance = advanceBlockDim(param);
109  return advance;
110  } else {
111  return false;
112  }
113  }
114 
115  unsigned int sharedBytesPerThread() const { return 0; }
116 
117  public:
118  DomainWallDslash4DPCCuda(cudaColorSpinorField *out, const GaugeField &gauge, const cudaColorSpinorField *in,
119  const cudaColorSpinorField *x, const double mferm,
120  const double a, const double b, const int parity, const int dagger, const int *commOverride, const int DS_type)
121  : DslashCuda(out, in, x, gauge, parity, dagger, commOverride), DS_type(DS_type)
122  {
123  dslashParam.a = a;
124  dslashParam.a_f = a;
125  dslashParam.b = b;
126  dslashParam.b_f = b;
127  dslashParam.mferm = mferm;
128  dslashParam.mferm_f = mferm;
129  }
130  virtual ~DomainWallDslash4DPCCuda() { unbindSpinorTex<sFloat>(in, out, x); }
131 
132  TuneKey tuneKey() const
133  {
134  TuneKey key = DslashCuda::tuneKey();
135  switch(DS_type){
136  case 0:
137  strcat(key.aux,",Dslash4");
138  break;
139  case 1:
140  strcat(key.aux,",Dslash5");
141  break;
142  case 2:
143  strcat(key.aux,",Dslash5inv");
144  break;
145  }
146  return key;
147  }
148 
149  virtual void initTuneParam(TuneParam &param) const
150  {
152  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
153  (in->X(4)+param.block.y-1) / param.block.y, 1);
154  bool ok = true;
155  if (!checkGrid(param)) ok = advanceBlockDim(param);
156  if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
157  }
158 
160  virtual void defaultTuneParam(TuneParam &param) const
161  {
163  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
164  (in->X(4)+param.block.y-1) / param.block.y, 1);
165  bool ok = true;
166  if (!checkGrid(param)) ok = advanceBlockDim(param);
167  if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
168  }
169 
170  void apply(const cudaStream_t &stream)
171  {
172 #ifndef USE_TEXTURE_OBJECTS
173  if (dslashParam.kernel_type == INTERIOR_KERNEL) bindSpinorTex<sFloat>(in, out, x);
174 #endif // USE_TEXTURE_OBJECTS
175  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
176  setParam();
177 
178  switch(DS_type){
179  case 0:
180  DSLASH(domainWallDslash4, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
181  break;
182  case 1:
183  DSLASH(domainWallDslash5, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
184  break;
185  case 2:
186  DSLASH(domainWallDslash5inv, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
187  break;
188  default:
189  errorQuda("invalid Dslash type");
190  }
191  }
192 
193  long long flops() const {
194  long long Ls = in->X(4);
195  long long vol4d = in->VolumeCB() / Ls;
196  long long bulk = (Ls-2)*vol4d;
197  long long wall = 2*vol4d;
198  long long flops = 0;
199  switch(DS_type){
200  case 0:
202  break;
203  case 1:
204  flops = (x ? 48ll : 0 ) * in->VolumeCB() + 96ll*bulk + 120ll*wall;
205  break;
206  case 2:
207  flops = 144ll*in->VolumeCB()*Ls + 3ll*Ls*(Ls-1ll);
208  break;
209  default:
210  errorQuda("invalid Dslash type");
211  }
212  return flops;
213  }
214 
215  long long bytes() const {
216  bool isHalf = in->Precision() == sizeof(short) ? true : false;
217  int spinor_bytes = 2 * in->Ncolor() * in->Nspin() * in->Precision() + (isHalf ? sizeof(float) : 0);
218  long long Ls = in->X(4);
219  long long bytes = 0;
220 
221  switch(DS_type){
222  case 0:
224  break;
225  case 1:
226  bytes = (x ? 5ll : 4ll ) * spinor_bytes * in->VolumeCB();
227  break;
228  case 2:
229  bytes = (x ? Ls + 2 : Ls + 1) * spinor_bytes * in->VolumeCB();
230  break;
231  default:
232  errorQuda("invalid Dslash type");
233  }
234  return bytes;
235  }
236  };
237 #endif // GPU_DOMAIN_WALL_DIRAC
238 
239 #include <dslash_policy.cuh>
240 
241 
242  //-----------------------------------------------------
243  // Modification for 4D preconditioned DWF operator
244  // Additional Arg. is added to give a function name.
245  //
246  // pre-defined DS_type list
247  // 0 = dslash4
248  // 1 = dslash5
249  // 2 = dslash5inv
250  //-----------------------------------------------------
251 
253  const cudaColorSpinorField *in, const int parity, const int dagger,
254  const cudaColorSpinorField *x, const double &m_f, const double &a, const double &b,
255  const int *commOverride, const int DS_type, TimeProfile &profile)
256  {
257 #ifdef GPU_DOMAIN_WALL_DIRAC
258  const_cast<cudaColorSpinorField*>(in)->createComms(1);
259 
260  DslashCuda *dslash = nullptr;
261  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
262  dslash = new DomainWallDslash4DPCCuda<double2,double2>(out, gauge, in, x, m_f, a, b, parity, dagger, commOverride, DS_type);
263  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
264  dslash = new DomainWallDslash4DPCCuda<float4,float4>(out, gauge, in, x, m_f, a, b, parity, dagger, commOverride, DS_type);
265  } else if (in->Precision() == QUDA_HALF_PRECISION) {
266  dslash = new DomainWallDslash4DPCCuda<short4,short4>(out, gauge, in, x, m_f, a, b, parity, dagger, commOverride, DS_type);
267  }
268 
269  // the parameters passed to dslashCuda must be 4-d volume and 3-d
270  // faces because Ls is added as the y-dimension in thread space
271  int ghostFace[QUDA_MAX_DIM];
272  for (int i=0; i<4; i++) ghostFace[i] = in->GhostFace()[i] / in->X(4);
273 
274  DslashPolicyImp* dslashImp = nullptr;
275  if (DS_type != 0) {
276  dslashImp = DslashFactory::create(QudaDslashPolicy::QUDA_DSLASH_NC);
277  (*dslashImp)(*dslash, const_cast<cudaColorSpinorField*>(in), in->Volume()/in->X(4), ghostFace, profile);
278  delete dslashImp;
279  } else {
280  DslashPolicyTune dslash_policy(*dslash, const_cast<cudaColorSpinorField*>(in), in->Volume()/in->X(4), ghostFace, profile);
281  dslash_policy.apply(0);
282  }
283 
284  delete dslash;
285 #else
286  errorQuda("4D preconditioned Domain wall dslash has not been built");
287 #endif
288  }
289 
290 }
virtual long long bytes() const
cudaDeviceProp deviceProp
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
char * strcat(char *__s1, const char *__s2)
#define mferm
int Ls
Definition: test_util.cpp:39
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define warningQuda(...)
Definition: util_quda.h:101
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param)
cpuColorSpinorField * out
void domainWallDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &m_f, const double &k, const int *commDim, TimeProfile &profile)
unsigned long long flops
Definition: blas_quda.cu:42
virtual TuneKey tuneKey() const
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:230
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
virtual long long flops() const
QudaParity parity
Definition: covdev_test.cpp:53
#define a
unsigned long long bytes
Definition: blas_quda.cu:43
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:254