QUDA  v0.7.0
A library for QCD on GPUs
dslash_domain_wall_4d.cu
#include <cstdlib>
#include <cstdio>
#include <string>
#include <iostream>

#include <color_spinor_field.h>
#include <clover_field.h>

// these control the Wilson-type actions
#ifdef GPU_WILSON_DIRAC
//#define DIRECT_ACCESS_LINK
//#define DIRECT_ACCESS_WILSON_SPINOR
//#define DIRECT_ACCESS_WILSON_ACCUM
//#define DIRECT_ACCESS_WILSON_INTER
//#define DIRECT_ACCESS_WILSON_PACK_SPINOR
//#define DIRECT_ACCESS_CLOVER
#endif // GPU_WILSON_DIRAC

#include <quda_internal.h>
#include <dslash_quda.h>
#include <sys/time.h>
#include <blas_quda.h>
#include <face_quda.h>

#include <inline_ptx.h>

namespace quda {

  namespace domainwall4d {

#undef GPU_STAGGERED_DIRAC
#include <dslash_constants.h>
#include <dslash_textures.h>
#include <dslash_index.cuh>

    // Enable shared memory dslash for Fermi architecture
    //#define SHARED_WILSON_DSLASH
    //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access

#ifdef GPU_DOMAIN_WALL_DIRAC
#include <dw_dslash4_def.h>    // Dslash4 Domain Wall kernels
#include <dw_dslash5_def.h>    // Dslash5 Domain Wall kernels
#include <dw_dslash5inv_def.h> // Dslash5inv Domain Wall kernels
#endif

#ifndef DSLASH_SHARED_FLOATS_PER_THREAD
#define DSLASH_SHARED_FLOATS_PER_THREAD 0
#endif

#include <dslash_quda.cuh>
  }

  // declare the dslash events
#include <dslash_events.cuh>

  using namespace domainwall4d;

#ifdef GPU_DOMAIN_WALL_DIRAC
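  // Applies the 4D even-odd preconditioned domain-wall dslash.  DS_type
  // selects which kernel apply() launches: 0 = Dslash4 (the 4D hopping
  // term), 1 = Dslash5 (the fifth-dimension hopping term), 2 = Dslash5inv
  // (the inverse of the fifth-dimension term); see the DS_type list below.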
  template <typename sFloat, typename gFloat>
  class DomainWallDslash4DPCCuda : public DslashCuda {

  private:
    const gFloat *gauge0, *gauge1;
    const double mferm;
    const double a;
    const int DS_type;

    bool checkGrid(TuneParam &param) const {
      if (param.grid.x > deviceProp.maxGridSize[0] || param.grid.y > deviceProp.maxGridSize[1]) {
        warningQuda("Autotuner is skipping blockDim=(%u,%u,%u), gridDim=(%u,%u,%u) because lattice volume is too large",
                    param.block.x, param.block.y, param.block.z,
                    param.grid.x, param.grid.y, param.grid.z);
        return false;
      } else {
        return true;
      }
    }

  protected:
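    // Tuning advances a two-dimensional thread block: block.x runs over the
    // 4D checkerboarded sites and block.y over the fifth dimension (Ls),
    // which is mapped to the y-dimension of thread space (see below).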
    bool advanceBlockDim(TuneParam &param) const
    {
      const unsigned int max_shared = 16384; // FIXME: use deviceProp.sharedMemPerBlock;
      const int step[2] = { deviceProp.warpSize, 1 };
      bool advance[2] = { false, false };

      // first try to advance block.x
      param.block.x += step[0];
      if (param.block.x > deviceProp.maxThreadsDim[0] ||
          sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
        advance[0] = false;
        param.block.x = step[0]; // reset block.x
      } else {
        advance[0] = true; // successfully advanced block.x
      }

      if (!advance[0]) { // if failed to advance block.x, now try block.y
        param.block.y += step[1];

        if (param.block.y > in->X(4) ||
            sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
          advance[1] = false;
          param.block.y = step[1]; // reset block.y
        } else {
          advance[1] = true; // successfully advanced block.y
        }
      }

      if (advance[0] || advance[1]) {
        param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
                           (in->X(4)+param.block.y-1) / param.block.y, 1);

        bool advance = true;
        if (!checkGrid(param)) advance = advanceBlockDim(param);
        return advance;
      } else {
        return false;
      }
    }

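    // the shared-memory dslash path is disabled above, so no dynamic shared
    // memory is requested (DSLASH_SHARED_FLOATS_PER_THREAD is defined to 0)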
    unsigned int sharedBytesPerThread() const { return 0; }

  public:
    DomainWallDslash4DPCCuda(cudaColorSpinorField *out, const gFloat *gauge0, const gFloat *gauge1,
                             const QudaReconstructType reconstruct, const cudaColorSpinorField *in,
                             const cudaColorSpinorField *x, const double mferm,
                             const double a, const int dagger, const int DS_type)
      : DslashCuda(out, in, x, reconstruct, dagger), gauge0(gauge0), gauge1(gauge1),
        mferm(mferm), a(a), DS_type(DS_type)
    {
      bindSpinorTex<sFloat>(in, out, x);
    }
    virtual ~DomainWallDslash4DPCCuda() { unbindSpinorTex<sFloat>(in, out, x); }

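    // append the kernel variant to the tuning key so that each DS_type is
    // tuned and cached independently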
    TuneKey tuneKey() const
    {
      TuneKey key = DslashCuda::tuneKey();
      switch(DS_type){
      case 0:
        strcat(key.aux,",Dslash4");
        break;
      case 1:
        strcat(key.aux,",Dslash5");
        break;
      case 2:
        strcat(key.aux,",Dslash5inv");
        break;
      }
      return key;
    }

    virtual void initTuneParam(TuneParam &param) const
    {
      Tunable::initTuneParam(param);
      param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
                         (in->X(4)+param.block.y-1) / param.block.y, 1);
      bool ok = true;
      if (!checkGrid(param)) ok = advanceBlockDim(param);
      if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
    }

    virtual void defaultTuneParam(TuneParam &param) const
    {
      Tunable::defaultTuneParam(param);
      param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
                         (in->X(4)+param.block.y-1) / param.block.y, 1);
      bool ok = true;
      if (!checkGrid(param)) ok = advanceBlockDim(param);
      if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
    }

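    // launch the kernel selected by DS_type via the DSLASH macro; x, when
    // non-zero, provides the accumulator field for the xpay variant of each
    // kernel, with a as its coefficient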
    void apply(const cudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

      switch(DS_type){
      case 0:
        DSLASH(domainWallDslash4, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
               (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1, (sFloat*)in->V(),
               (float*)in->Norm(), mferm, (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
        break;
      case 1:
        DSLASH(domainWallDslash5, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
               (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1, (sFloat*)in->V(),
               (float*)in->Norm(), mferm, (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
        break;
      case 2:
        DSLASH(domainWallDslash5inv, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
               (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1, (sFloat*)in->V(),
               (float*)in->Norm(), mferm, (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
        break;
      default:
        errorQuda("invalid Dslash type");
      }
    }

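    // flop counts per call: "bulk" counts the interior slices of the fifth
    // dimension and "wall" the two boundary slices, where the fermion mass
    // m_f enters the fifth-dimension hopping term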
    long long flops() const { // FIXME for multi-GPU
      long long Ls = in->X(4);
      long long vol4d = in->VolumeCB() / Ls;
      long long bulk = (Ls-2)*vol4d;
      long long wall = 2*vol4d;
      long long flops_Tmp;
      switch(DS_type){
      case 0:
        flops_Tmp = (x ? 1368ll : 1320ll)*in->VolumeCB();
        break;
      case 1:
        flops_Tmp = (x ? 48ll : 0 ) * in->VolumeCB() + 96ll*bulk + 120ll*wall;
        break;
      case 2:
        flops_Tmp = 144ll*in->VolumeCB()*Ls + 3ll*Ls*(Ls-1ll);
        break;
      default:
        errorQuda("invalid Dslash type");
      }
      return flops_Tmp;
    }
  };
#endif // GPU_DOMAIN_WALL_DIRAC

#include <dslash_policy.cuh>


  //-----------------------------------------------------
  // Modification for 4D preconditioned DWF operator
  // An additional argument (DS_type) selects which kernel to launch.
  //
  // pre-defined DS_type list
  // 0 = dslash4
  // 1 = dslash5
  // 2 = dslash5inv
  //-----------------------------------------------------
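  //
  // A minimal usage sketch (illustrative only; the field names tmp, src, out
  // and the zero xpay arguments are placeholders, not taken from this file):
  //
  //   domainWallDslashCuda_4D(&tmp, gauge, &src, parity, dagger, 0, m_f, 0.0,
  //                           commDim, 0 /* dslash4 */,    profile, policy);
  //   domainWallDslashCuda_4D(&out, gauge, &tmp, parity, dagger, 0, m_f, 0.0,
  //                           commDim, 2 /* dslash5inv */, profile, policy);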

  void domainWallDslashCuda_4D(cudaColorSpinorField *out, const cudaGaugeField &gauge,
                               const cudaColorSpinorField *in, const int parity, const int dagger,
                               const cudaColorSpinorField *x, const double &m_f, const double &k2,
                               const int *commOverride, const int DS_type, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy)
  {
    inSpinor = (cudaColorSpinorField*)in; // EVIL

    dslashParam.parity = parity;

#ifdef GPU_DOMAIN_WALL_DIRAC
    // currently only splitting in space-time is implemented
    int dirs = 4;
    int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
    for(int i = 0;i < dirs; i++){
      dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
      dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
      dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
      dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
    }

    void *gauge0, *gauge1;
    bindGaugeTex(gauge, parity, &gauge0, &gauge1);

    if (in->Precision() != gauge.Precision())
      errorQuda("Mixing gauge and spinor precision not supported");

    DslashCuda *dslash = 0;
    size_t regSize = sizeof(float);

    if (in->Precision() == QUDA_DOUBLE_PRECISION) {
#if (__COMPUTE_CAPABILITY__ >= 130)
      dslash = new DomainWallDslash4DPCCuda<double2,double2>(out, (double2*)gauge0, (double2*)gauge1,
                                                             gauge.Reconstruct(), in, x, m_f, k2, dagger, DS_type);
      regSize = sizeof(double);
#else
      errorQuda("Double precision not supported on this GPU");
#endif
    } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
      dslash = new DomainWallDslash4DPCCuda<float4,float4>(out, (float4*)gauge0, (float4*)gauge1,
                                                           gauge.Reconstruct(), in, x, m_f, k2, dagger, DS_type);
    } else if (in->Precision() == QUDA_HALF_PRECISION) {
      dslash = new DomainWallDslash4DPCCuda<short4,short4>(out, (short4*)gauge0, (short4*)gauge1,
                                                           gauge.Reconstruct(), in, x, m_f, k2, dagger, DS_type);
    }

    // the parameters passed to dslashCuda must be 4-d volume and 3-d
    // faces because Ls is added as the y-dimension in thread space
    int ghostFace[QUDA_MAX_DIM];
    for (int i=0; i<4; i++) ghostFace[i] = in->GhostFace()[i] / in->X(4);

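    // the fifth dimension is never partitioned across nodes, so Dslash5 and
    // Dslash5inv (DS_type != 0) need no halo exchange and use the
    // no-communication dslash policy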
    DslashPolicyImp* dslashImp = NULL;
    if (DS_type != 0) {
      dslashImp = DslashFactory::create(QUDA_DSLASH_NC);
    } else {
#ifndef GPU_COMMS
      dslashImp = DslashFactory::create(dslashPolicy);
#else
      dslashImp = DslashFactory::create(QUDA_GPU_COMMS_DSLASH);
#endif
    }
    (*dslashImp)(*dslash, const_cast<cudaColorSpinorField*>(in), regSize, parity, dagger, in->Volume()/in->X(4), ghostFace, profile);
    delete dslashImp;

    delete dslash;
    unbindGaugeTex(gauge);

    checkCudaError();
#else
    errorQuda("4D preconditioned Domain wall dslash has not been built");
#endif
  }

}