QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dslash_mobius.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 #include <quda_internal.h>
20 #include <dslash_quda.h>
21 #include <sys/time.h>
22 #include <blas_quda.h>
23 #include <face_quda.h>
24 
25 #include <inline_ptx.h>
26 
27 namespace quda {
28 
29  namespace mobius {
30 
31 #undef GPU_STAGGERED_DIRAC
32 #include <dslash_constants.h>
33 #include <dslash_textures.h>
34 #include <dslash_index.cuh>
35 
36  // Enable shared memory dslash for Fermi architecture
37  //#define SHARED_WILSON_DSLASH
38  //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access
39 
40 #ifdef GPU_DOMAIN_WALL_DIRAC
41 #include <mdw_dslash4_def.h> // Dslash4, intermediate operator for Mobius Mat_4 kernels
42 #include <mdw_dslash4pre_def.h> // Dslash4pre, intermediate operator for Mobius Mat_4 kernels
43 #include <mdw_dslash5_def.h> // Dslash5 Mobius Domain Wall kernels
44 #include <mdw_dslash5inv_def.h> // Dslash5inv Mobius Domain Wall kernels
45 #endif
46 
47 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
48 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
49 #endif
50 
51 #include <dslash_quda.cuh>
52  }
53 
54  // declare the dslash events
55 #include <dslash_events.cuh>
56 
57  using namespace mobius;
58 
59 #ifdef GPU_DOMAIN_WALL_DIRAC
60  //Dslash class definition for Mobius Domain Wall Fermion
61  template <typename sFloat, typename gFloat>
62  class MDWFDslashPCCuda : public DslashCuda {
63 
64  private:
65  const gFloat *gauge0, *gauge1;
66  const double mferm, a;
67  double *b5, *c5;
68  const int DS_type;
69 
70  bool checkGrid(TuneParam &param) const {
71  if (param.grid.x > deviceProp.maxGridSize[0] || param.grid.y > deviceProp.maxGridSize[1]) {
72  warningQuda("Autotuner is skipping blockDim=(%u,%u,%u), gridDim=(%u,%u,%u) because lattice volume is too large",
73  param.block.x, param.block.y, param.block.z,
74  param.grid.x, param.grid.y, param.grid.z);
75  return false;
76  } else {
77  return true;
78  }
79  }
80 
81  protected:
82  bool advanceBlockDim(TuneParam &param) const
83  {
84  const unsigned int max_shared = 16384; // FIXME: use deviceProp.sharedMemPerBlock;
85  const int step[2] = { deviceProp.warpSize, 1 };
86  bool advance[2] = { false, false };
87 
88  // first try to advance block.x
89  param.block.x += step[0];
90  if (param.block.x > deviceProp.maxThreadsDim[0] ||
91  sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
92  advance[0] = false;
93  param.block.x = step[0]; // reset block.x
94  } else {
95  advance[0] = true; // successfully advanced block.x
96  }
97 
98  if (!advance[0]) { // if failed to advance block.x, now try block.y
99  param.block.y += step[1];
100 
101  if (param.block.y > in->X(4) ||
102  sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
103  advance[1] = false;
104  param.block.y = step[1]; // reset block.x
105  } else {
106  advance[1] = true; // successfully advanced block.y
107  }
108  }
109 
110  if (advance[0] || advance[1]) {
111  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
112  (in->X(4)+param.block.y-1) / param.block.y, 1);
113 
114  bool advance = true;
115  if (!checkGrid(param)) advance = advanceBlockDim(param);
116  return advance;
117  } else {
118  return false;
119  }
120  }
121 
122  unsigned int sharedBytesPerThread() const { return 0; }
123 
124  public:
125  MDWFDslashPCCuda(cudaColorSpinorField *out, const gFloat *gauge0, const gFloat *gauge1,
126  const QudaReconstructType reconstruct, const cudaColorSpinorField *in,
127  const cudaColorSpinorField *x, const double mferm,
128  const double a, const int dagger, const int DS_type)
129  : DslashCuda(out, in, x, reconstruct, dagger), gauge0(gauge0), gauge1(gauge1),
130  mferm(mferm), a(a), DS_type(DS_type)
131  {
132  bindSpinorTex<sFloat>(in, out, x);
133  }
134  virtual ~MDWFDslashPCCuda() { unbindSpinorTex<sFloat>(in, out, x); }
135 
136  TuneKey tuneKey() const
137  {
138  TuneKey key = DslashCuda::tuneKey();
139  switch(DS_type){
140  case 0:
141  strcat(key.aux,",Dslash4");
142  break;
143  case 1:
144  strcat(key.aux,",Dslash4pre");
145  break;
146  case 2:
147  strcat(key.aux,",Dslash5");
148  break;
149  case 3:
150  strcat(key.aux,",Dslash5inv");
151  break;
152  }
153  return key;
154  }
155 
156  virtual void initTuneParam(TuneParam &param) const
157  {
158  Tunable::initTuneParam(param);
159  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
160  (in->X(4)+param.block.y-1) / param.block.y, 1);
161  bool ok = true;
162  if (!checkGrid(param)) ok = advanceBlockDim(param);
163  if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
164  }
165 
167  virtual void defaultTuneParam(TuneParam &param) const
168  {
170  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
171  (in->X(4)+param.block.y-1) / param.block.y, 1);
172  bool ok = true;
173  if (!checkGrid(param)) ok = advanceBlockDim(param);
174  if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
175  }
176 
177  void apply(const cudaStream_t &stream)
178  {
179  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
180 
181  switch(DS_type){
182  case 0:
183  DSLASH(MDWFDslash4, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
184  (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1, (sFloat*)in->V(),
185  (float*)in->Norm(), mferm, (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
186  break;
187  case 1:
188  DSLASH(MDWFDslash4pre, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
189  (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1, (sFloat*)in->V(),
190  (float*)in->Norm(), mferm, (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
191  break;
192  case 2:
193  DSLASH(MDWFDslash5, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
194  (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1, (sFloat*)in->V(),
195  (float*)in->Norm(), mferm, (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
196  break;
197  case 3:
198  DSLASH(MDWFDslash5inv, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
199  (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1, (sFloat*)in->V(),
200  (float*)in->Norm(), mferm, (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
201  break;
202  default:
203  errorQuda("invalid Dslash type");
204  }
205  }
206 
207  long long flops() const { // FIXME for multi-GPU
208  long long Ls = in->X(4);
209  long long vol4d = in->VolumeCB() / Ls;
210  long long bulk = (Ls-2)*vol4d;
211  long long wall = 2*vol4d;
212  long long flops_Tmp;
213  switch(DS_type){
214  case 0:
215  flops_Tmp = (x ? 1368ll : 1320ll)*in->VolumeCB();
216  break;
217  case 1:
218  flops_Tmp = 72ll*in->VolumeCB() + 96ll*bulk + 120ll*wall;
219  break;
220  case 2:
221  flops_Tmp = (x ? 96ll : 48ll)*in->VolumeCB() + 96ll*bulk + 120ll*wall;
222  break;
223  case 3:
224  flops_Tmp = 144ll*in->VolumeCB()*Ls + 3ll*Ls*(Ls-1ll);
225  break;
226  default:
227  errorQuda("invalid Dslash type");
228  }
229  return flops_Tmp;
230  }
231  };
232 #endif // GPU_DOMAIN_WALL_DIRAC
233 
234 #include <dslash_policy.cuh>
235 
 236  //-----------------------------------------------------
 237  // Modification for the 4-d preconditioned Mobius DWF operator.
 238  // An additional argument, DS_type, selects which kernel to launch:
 239  //
 240  // pre-defined DS_type list
 241  // 0 = MDWF dslash4
 242  // 1 = MDWF dslash4pre
 243  // 2 = MDWF dslash5
 244  // 3 = MDWF dslash5inv
 245  //-----------------------------------------------------
246 
248  const cudaColorSpinorField *in, const int parity, const int dagger,
249  const cudaColorSpinorField *x, const double &m_f, const double &k2,
250  const int *commOverride, const int DS_type, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy)
251  {
252  inSpinor = (cudaColorSpinorField*)in; // EVIL
253 
254  dslashParam.parity = parity;
255 
256 #ifdef GPU_DOMAIN_WALL_DIRAC
257  //currently splitting in space-time is impelemented:
258  int dirs = 4;
259  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
260  for(int i = 0;i < dirs; i++){
261  dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
262  dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
263  dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
264  dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
265  }
266 
267  void *gauge0, *gauge1;
268  bindGaugeTex(gauge, parity, &gauge0, &gauge1);
269 
270  if (in->Precision() != gauge.Precision())
271  errorQuda("Mixing gauge and spinor precision not supported");
272 
273  DslashCuda *dslash = 0;
274  size_t regSize = sizeof(float);
275 
276  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
277 #if (__COMPUTE_CAPABILITY__ >= 130)
278  dslash = new MDWFDslashPCCuda<double2,double2>(out, (double2*)gauge0, (double2*)gauge1,
279  gauge.Reconstruct(), in, x, m_f, k2, dagger, DS_type);
280  regSize = sizeof(double);
281 #else
282  errorQuda("Double precision not supported on this GPU");
283 #endif
284  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
285  dslash = new MDWFDslashPCCuda<float4,float4>(out, (float4*)gauge0, (float4*)gauge1,
286  gauge.Reconstruct(), in, x, m_f, k2, dagger, DS_type);
287  } else if (in->Precision() == QUDA_HALF_PRECISION) {
288  dslash = new MDWFDslashPCCuda<short4,short4>(out, (short4*)gauge0, (short4*)gauge1,
289  gauge.Reconstruct(), in, x, m_f, k2, dagger, DS_type);
290  }
291 
292  // the parameters passed to dslashCuda must be 4-d volume and 3-d
293  // faces because Ls is added as the y-dimension in thread space
294  int ghostFace[QUDA_MAX_DIM];
295  for (int i=0; i<4; i++) ghostFace[i] = in->GhostFace()[i] / in->X(4);
296 
297  DslashPolicyImp* dslashImp = NULL;
298  if (DS_type != 0) {
299  dslashImp = DslashFactory::create(QUDA_DSLASH_NC);
300  } else {
301 #ifndef GPU_COMMS
302  dslashImp = DslashFactory::create(dslashPolicy);
303 #else
304  dslashImp = DslashFactory::create(QUDA_GPU_COMMS_DSLASH);
305 #endif
306  }
307  (*dslashImp)(*dslash, const_cast<cudaColorSpinorField*>(in), regSize, parity, dagger, in->Volume()/in->X(4), ghostFace, profile);
308  delete dslashImp;
309 
310  delete dslash;
311  unbindGaugeTex(gauge);
312 
313  checkCudaError();
314 #else
315  errorQuda("Domain wall dslash has not been built");
316 #endif
317  }
318 
319 }
void unbindGaugeTex(const cudaGaugeField &gauge)
int commDimPartitioned(int dir)
cudaDeviceProp deviceProp
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
const int * X() const
#define errorQuda(...)
Definition: util_quda.h:73
cudaStream_t * stream
int GhostNormOffset(const int i) const
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:175
QudaDagType dagger
Definition: test_util.cpp:1558
int Ls
Definition: test_util.cpp:40
QudaGaugeParam param
Definition: pack_test.cpp:17
__constant__ int ghostFace[QUDA_MAX_DIM+1]
QudaPrecision Precision() const
cpuColorSpinorField * in
enum QudaDslashPolicy_s QudaDslashPolicy
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:168
#define warningQuda(...)
Definition: util_quda.h:84
void MDWFDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &m_f, const double &k, const int *commDim, const int DS_type, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy=QUDA_DSLASH2)
int x[4]
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:199
QudaFieldOrder FieldOrder() const
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
QudaPrecision Precision() const
void bindGaugeTex(const cudaGaugeField &gauge, const int oddBit, void **gauge0, void **gauge1)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
#define checkCudaError()
Definition: util_quda.h:110
QudaTune getTuning()
Definition: util_quda.cpp:32
int GhostOffset(const int i) const
const QudaParity parity
Definition: dslash_test.cpp:29
void * gauge[4]
Definition: su3_test.cpp:15
const int * GhostFace() const