QUDA  0.9.0
dslash_mobius.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 #include <quda_internal.h>
20 #include <dslash_quda.h>
21 #include <sys/time.h>
22 #include <blas_quda.h>
23 
24 #include <inline_ptx.h>
25 
26 namespace quda {
27 
28  namespace mobius {
29 
30 #undef GPU_STAGGERED_DIRAC
31 #include <dslash_constants.h>
32 #include <dslash_textures.h>
33 #include <dslash_index.cuh>
34 
35  // Enable shared memory dslash for Fermi architecture
36  //#define SHARED_WILSON_DSLASH
37  //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access
38 
39 #ifdef GPU_DOMAIN_WALL_DIRAC
40 #include <mdw_dslash4_def.h> // Dslash4, intermediate operator for Mobius Mat_4 kernels
41 #include <mdw_dslash4pre_def.h> // Dslash4pre, intermediate operator for Mobius Mat_4 kernels
42 #include <mdw_dslash5_def.h> // Dslash5 Mobius Domain Wall kernels
43 #include <mdw_dslash5inv_def.h> // Dslash5inv Mobius Domain Wall kernels
44 #endif
45 
46 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
47 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
48 #endif
49 
50 #include <dslash_quda.cuh>
51  }
52 
53  // declare the dslash events
54 #include <dslash_events.cuh>
55 
56  using namespace mobius;
57 
58 #ifdef GPU_DOMAIN_WALL_DIRAC
59  //Dslash class definition for Mobius Domain Wall Fermion
60  template <typename sFloat, typename gFloat>
61  class MDWFDslashPCCuda : public DslashCuda {
62 
63  private:
64  const int DS_type;
65 
66  bool checkGrid(TuneParam &param) const {
67  if (param.grid.x > (unsigned int)deviceProp.maxGridSize[0] || param.grid.y > (unsigned int)deviceProp.maxGridSize[1]) {
68  warningQuda("Autotuner is skipping blockDim=(%u,%u,%u), gridDim=(%u,%u,%u) because lattice volume is too large",
69  param.block.x, param.block.y, param.block.z,
70  param.grid.x, param.grid.y, param.grid.z);
71  return false;
72  } else {
73  return true;
74  }
75  }
76 
77  protected:
78  bool advanceBlockDim(TuneParam &param) const
79  {
80  const unsigned int max_shared = 16384; // FIXME: use deviceProp.sharedMemPerBlock;
81  const int step[2] = { deviceProp.warpSize, 1 };
82  bool advance[2] = { false, false };
83 
84  // first try to advance block.x
85  param.block.x += step[0];
86  if (param.block.x > (unsigned int)deviceProp.maxThreadsDim[0] ||
87  sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
88  advance[0] = false;
89  param.block.x = step[0]; // reset block.x
90  } else {
91  advance[0] = true; // successfully advanced block.x
92  }
93 
94  if (!advance[0]) { // if failed to advance block.x, now try block.y
95  param.block.y += step[1];
96 
97  if (param.block.y > (unsigned)in->X(4) ||
98  sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
99  advance[1] = false;
100  param.block.y = step[1]; // reset block.x
101  } else {
102  advance[1] = true; // successfully advanced block.y
103  }
104  }
105 
106  if (advance[0] || advance[1]) {
107  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
108  (in->X(4)+param.block.y-1) / param.block.y, 1);
109 
110  bool advance = true;
111  if (!checkGrid(param)) advance = advanceBlockDim(param);
112  return advance;
113  } else {
114  return false;
115  }
116  }
117 
118  unsigned int sharedBytesPerThread() const { return 0; }
119 
120  public:
121  MDWFDslashPCCuda(cudaColorSpinorField *out, const GaugeField &gauge, const cudaColorSpinorField *in,
122  const cudaColorSpinorField *x, const double mferm, const double a,
123  const double *b_5, const double *c_5, const double m5,
124  const int parity, const int dagger, const int *commOverride, const int DS_type)
125  : DslashCuda(out, in, x, gauge, parity, dagger, commOverride), DS_type(DS_type)
126  {
127  dslashParam.a = a;
128  dslashParam.a_f = a;
129  dslashParam.mferm = mferm;
130  dslashParam.mferm_f = mferm;
131 
132  memcpy(dslashParam.mdwf_b5_d, b_5, out->X(4)*sizeof(double));
133  memcpy(dslashParam.mdwf_c5_d, c_5, out->X(4)*sizeof(double));
134  for (int s=0; s<out->X(4); s++) {
135  dslashParam.mdwf_b5_f[s] = (float)dslashParam.mdwf_b5_d[s];
136  dslashParam.mdwf_c5_f[s] = (float)dslashParam.mdwf_c5_d[s];
137  }
138 
139  dslashParam.m5_d = m5;
140  dslashParam.m5_f = (float)m5;
141  }
142  virtual ~MDWFDslashPCCuda() { unbindSpinorTex<sFloat>(in, out, x); }
143 
144  TuneKey tuneKey() const
145  {
146  TuneKey key = DslashCuda::tuneKey();
147  switch(DS_type){
148  case 0:
149  strcat(key.aux,",Dslash4");
150  break;
151  case 1:
152  strcat(key.aux,",Dslash4pre");
153  break;
154  case 2:
155  strcat(key.aux,",Dslash5");
156  break;
157  case 3:
158  strcat(key.aux,",Dslash5inv");
159  break;
160  }
161  return key;
162  }
163 
164  virtual void initTuneParam(TuneParam &param) const
165  {
167  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
168  (in->X(4)+param.block.y-1) / param.block.y, 1);
169  bool ok = true;
170  if (!checkGrid(param)) ok = advanceBlockDim(param);
171  if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
172  }
173 
175  virtual void defaultTuneParam(TuneParam &param) const
176  {
178  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
179  (in->X(4)+param.block.y-1) / param.block.y, 1);
180  bool ok = true;
181  if (!checkGrid(param)) ok = advanceBlockDim(param);
182  if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
183  }
184 
185  void apply(const cudaStream_t &stream)
186  {
187 #ifndef USE_TEXTURE_OBJECTS
188  if (dslashParam.kernel_type == INTERIOR_KERNEL) bindSpinorTex<sFloat>(in, out, x);
189 #endif // USE_TEXTURE_OBJECTS
190  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
191  setParam();
192  switch(DS_type){
193  case 0:
194  DSLASH(MDWFDslash4, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
195  break;
196  case 1:
197  DSLASH(MDWFDslash4pre, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
198  break;
199  case 2:
200  DSLASH(MDWFDslash5, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
201  break;
202  case 3:
203  DSLASH(MDWFDslash5inv, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
204  break;
205  default:
206  errorQuda("invalid Dslash type");
207  }
208  }
209 
210  long long flops() const {
211  long long Ls = in->X(4);
212  long long vol4d = in->VolumeCB() / Ls;
213  long long bulk = (Ls-2)*vol4d;
214  long long wall = 2*vol4d;
215  long long flops = 0;
216  switch(DS_type){
217  case 0:
219  break;
220  case 1:
221  flops = 72ll*in->VolumeCB() + 96ll*bulk + 120ll*wall;
222  break;
223  case 2:
224  flops = (x ? 96ll : 48ll)*in->VolumeCB() + 96ll*bulk + 120ll*wall;
225  break;
226  case 3:
227  flops = 144ll*in->VolumeCB()*Ls + 3ll*Ls*(Ls-1ll);
228  break;
229  default:
230  errorQuda("invalid Dslash type");
231  }
232  return flops;
233  }
234 
235  long long bytes() const {
236  bool isHalf = in->Precision() == sizeof(short) ? true : false;
237  int spinor_bytes = 2 * in->Ncolor() * in->Nspin() * in->Precision() + (isHalf ? sizeof(float) : 0);
238  long long Ls = in->X(4);
239  long long bytes = 0;
240 
241  switch(DS_type){
242  case 0:
244  break;
245  case 1:
246  case 2:
247  bytes = (x ? 5ll : 4ll) * spinor_bytes * in->VolumeCB();
248  break;
249  case 3:
250  bytes = (x ? Ls + 2 : Ls + 1) * spinor_bytes * in->VolumeCB();
251  break;
252  default:
253  errorQuda("invalid Dslash type");
254  }
255  return bytes;
256  }
257  };
258 #endif // GPU_DOMAIN_WALL_DIRAC
259 
260 #include <dslash_policy.cuh>
261 
262  //-----------------------------------------------------
263  // Modification for 4D preconditioned Mobius DWF operator
264  // Additional Arg. is added to give a function name.
265  //
266  // pre-defined DS_type list
267  // 0 = MDWF dslash4
268  // 1 = MDWF dslash4pre
269  // 2 = MDWF dslash5
270  // 3 = MDWF dslash5inv
271  //-----------------------------------------------------
272 
274  const cudaColorSpinorField *in, const int parity, const int dagger,
275  const cudaColorSpinorField *x, const double &m_f, const double &k2,
276  const double *b_5, const double *c_5, const double &m5,
277  const int *commOverride, const int DS_type, TimeProfile &profile)
278  {
279 #ifdef GPU_DOMAIN_WALL_DIRAC
280  const_cast<cudaColorSpinorField*>(in)->createComms(1);
281 
282  DslashCuda *dslash = nullptr;
283  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
284  dslash = new MDWFDslashPCCuda<double2,double2>(out, gauge, in, x, m_f, k2, b_5, c_5, m5, parity, dagger, commOverride, DS_type);
285  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
286  dslash = new MDWFDslashPCCuda<float4,float4>(out, gauge, in, x, m_f, k2, b_5, c_5, m5, parity, dagger, commOverride, DS_type);
287  } else if (in->Precision() == QUDA_HALF_PRECISION) {
288  dslash = new MDWFDslashPCCuda<short4,short4>(out, gauge, in, x, m_f, k2, b_5, c_5, m5, parity, dagger, commOverride, DS_type);
289  }
290 
291  // the parameters passed to dslashCuda must be 4-d volume and 3-d
292  // faces because Ls is added as the y-dimension in thread space
293  int ghostFace[QUDA_MAX_DIM];
294  for (int i=0; i<4; i++) ghostFace[i] = in->GhostFace()[i] / in->X(4);
295 
296  DslashPolicyImp* dslashImp = nullptr;
297  if (DS_type != 0) {
298  dslashImp = DslashFactory::create(QudaDslashPolicy::QUDA_DSLASH_NC);
299  (*dslashImp)(*dslash, const_cast<cudaColorSpinorField*>(in), in->Volume()/in->X(4), ghostFace, profile);
300  delete dslashImp;
301  } else {
302  DslashPolicyTune dslash_policy(*dslash, const_cast<cudaColorSpinorField*>(in), in->Volume()/in->X(4), ghostFace, profile);
303  dslash_policy.apply(0);
304  }
305 
306  delete dslash;
307 #else
308  errorQuda("Domain wall dslash has not been built");
309 #endif
310  }
311 
312 }
virtual long long bytes() const
cudaDeviceProp deviceProp
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
void MDWFDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &m_f, const double &k, const double *b5, const double *c_5, const double &m5, const int *commDim, const int DS_type, TimeProfile &profile)
cudaStream_t * stream
#define m5
char * strcat(char *__s1, const char *__s2)
#define mferm
int Ls
Definition: test_util.cpp:39
QudaGaugeParam param
Definition: pack_test.cpp:17
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define warningQuda(...)
Definition: util_quda.h:101
void * memcpy(void *__dst, const void *__src, size_t __n)
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param)
cpuColorSpinorField * out
unsigned long long flops
Definition: blas_quda.cu:42
virtual TuneKey tuneKey() const
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:230
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
virtual long long flops() const
QudaParity parity
Definition: covdev_test.cpp:53
#define a
unsigned long long bytes
Definition: blas_quda.cu:43
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:254