QUDA  0.9.0
dslash_domain_wall.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 #include <quda_internal.h>
20 #include <dslash_quda.h>
21 #include <sys/time.h>
22 #include <blas_quda.h>
23 
24 #include <inline_ptx.h>
25 
26 namespace quda {
27 
28  namespace domainwall {
29 
30 #undef GPU_STAGGERED_DIRAC
31 #include <dslash_constants.h>
32 #include <dslash_textures.h>
33 #include <dslash_index.cuh>
34 
35  // Enable shared memory dslash for Fermi architecture
36  //#define SHARED_WILSON_DSLASH
37  //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access
38 
39 #ifdef GPU_DOMAIN_WALL_DIRAC
40 #include <dw_dslash_def.h> // Domain Wall kernels
41 #endif
42 
43 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
44 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
45 #endif
46 
47 #include <dslash_quda.cuh>
48  }
49 
50  // declare the dslash events
51 #include <dslash_events.cuh>
52 
53  using namespace domainwall;
54 
55 #ifdef GPU_DOMAIN_WALL_DIRAC
56  template <typename sFloat, typename gFloat>
57  class DomainWallDslashCuda : public DslashCuda {
58 
59  private:
60  bool checkGrid(TuneParam &param) const {
61  if (param.grid.x > (unsigned int)deviceProp.maxGridSize[0] || param.grid.y > (unsigned int)deviceProp.maxGridSize[1]) {
62  warningQuda("Autotuner is skipping blockDim=(%u,%u,%u), gridDim=(%u,%u,%u) because lattice volume is too large",
63  param.block.x, param.block.y, param.block.z,
64  param.grid.x, param.grid.y, param.grid.z);
65  return false;
66  } else {
67  return true;
68  }
69  }
70 
71  protected:
72  bool advanceBlockDim(TuneParam &param) const
73  {
74  const unsigned int max_shared = 16384; // FIXME: use deviceProp.sharedMemPerBlock;
75  const int step[2] = { deviceProp.warpSize, 1 };
76  bool advance[2] = { false, false };
77 
78  // first try to advance block.x
79  param.block.x += step[0];
80  if (param.block.x > (unsigned int)deviceProp.maxThreadsDim[0] ||
81  sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
82  advance[0] = false;
83  param.block.x = step[0]; // reset block.x
84  } else {
85  advance[0] = true; // successfully advanced block.x
86  }
87 
88  if (!advance[0]) { // if failed to advance block.x, now try block.y
89  param.block.y += step[1];
90 
91  if (param.block.y > (unsigned)in->X(4) ||
92  sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
93  advance[1] = false;
94  param.block.y = step[1]; // reset block.x
95  } else {
96  advance[1] = true; // successfully advanced block.y
97  }
98  }
99 
100  if (advance[0] || advance[1]) {
101  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
102  (in->X(4)+param.block.y-1) / param.block.y, 1);
103 
104  bool advance = true;
105  if (!checkGrid(param)) advance = advanceBlockDim(param);
106  return advance;
107  } else {
108  return false;
109  }
110  }
111 
112  unsigned int sharedBytesPerThread() const { return 0; }
113 
114  public:
115  DomainWallDslashCuda(cudaColorSpinorField *out, const GaugeField &gauge, const cudaColorSpinorField *in,
116  const cudaColorSpinorField *x, const double mferm, const double a,
117  const int parity, const int dagger, const int* commOverride)
118  : DslashCuda(out, in, x, gauge, parity, dagger, commOverride)
119  {
120  dslashParam.a = a;
121  dslashParam.a_f = a;
122  dslashParam.a_inv = 1.0/a;
123  dslashParam.a_inv_f = 1.0/a;
124  dslashParam.mferm = mferm;
125  dslashParam.mferm_f = mferm;
126  }
127  virtual ~DomainWallDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }
128 
129  virtual void initTuneParam(TuneParam &param) const
130  {
132  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
133  (in->X(4)+param.block.y-1) / param.block.y, 1);
134  bool ok = true;
135  if (!checkGrid(param)) ok = advanceBlockDim(param);
136  if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
137  }
138 
140  virtual void defaultTuneParam(TuneParam &param) const
141  {
143  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
144  (in->X(4)+param.block.y-1) / param.block.y, 1);
145  bool ok = true;
146  if (!checkGrid(param)) ok = advanceBlockDim(param);
147  if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
148  }
149 
150  void apply(const cudaStream_t &stream)
151  {
152 #ifndef USE_TEXTURE_OBJECTS
153  if (dslashParam.kernel_type == INTERIOR_KERNEL) bindSpinorTex<sFloat>(in, out, x);
154 #endif // USE_TEXTURE_OBJECTS
155  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
156  setParam();
157  DSLASH(domainWallDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
158  }
159 
160  long long flops() const {
161  long long flops = DslashCuda::flops();
162  switch(dslashParam.kernel_type) {
163  case EXTERIOR_KERNEL_X:
164  case EXTERIOR_KERNEL_Y:
165  case EXTERIOR_KERNEL_Z:
166  case EXTERIOR_KERNEL_T:
167  case EXTERIOR_KERNEL_ALL:
168  break;
169  case INTERIOR_KERNEL:
170  case KERNEL_POLICY:
171  int Ls = in->X(4);
172  long long bulk = (Ls-2)*(in->VolumeCB()/Ls);
173  long long wall = 2*(in->VolumeCB()/Ls);
174  flops += 96ll*bulk + 120ll*wall;
175  break;
176  }
177  return flops;
178  }
179 
180  virtual long long bytes() const {
181  bool isHalf = in->Precision() == sizeof(short) ? true : false;
182  int spinor_bytes = 2 * in->Ncolor() * in->Nspin() * in->Precision() + (isHalf ? sizeof(float) : 0);
183  long long bytes = DslashCuda::bytes();
184  switch(dslashParam.kernel_type) {
185  case EXTERIOR_KERNEL_X:
186  case EXTERIOR_KERNEL_Y:
187  case EXTERIOR_KERNEL_Z:
188  case EXTERIOR_KERNEL_T:
189  case EXTERIOR_KERNEL_ALL:
190  break;
191  case INTERIOR_KERNEL:
192  case KERNEL_POLICY:
193  bytes += 2 * spinor_bytes * in->VolumeCB();
194  break;
195  }
196  return bytes;
197  }
198  };
199 #endif // GPU_DOMAIN_WALL_DIRAC
200 
201 #include <dslash_policy.cuh>
202 
204  const cudaColorSpinorField *in, const int parity, const int dagger,
205  const cudaColorSpinorField *x, const double &m_f, const double &k2,
206  const int *commOverride, TimeProfile &profile)
207  {
208 #ifdef GPU_DOMAIN_WALL_DIRAC
209  const_cast<cudaColorSpinorField*>(in)->createComms(1);
210 
211  DslashCuda *dslash = 0;
212  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
213  dslash = new DomainWallDslashCuda<double2,double2>(out, gauge, in, x, m_f, k2, parity, dagger, commOverride);
214  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
215  dslash = new DomainWallDslashCuda<float4,float4>(out, gauge, in, x, m_f, k2, parity, dagger, commOverride);
216  } else if (in->Precision() == QUDA_HALF_PRECISION) {
217  dslash = new DomainWallDslashCuda<short4,short4>(out, gauge, in, x, m_f, k2, parity, dagger, commOverride);
218  }
219 
220  // the parameters passed to dslashCuda must be 4-d volume and 3-d
221  // faces because Ls is added as the y-dimension in thread space
222  int ghostFace[QUDA_MAX_DIM];
223  for (int i=0; i<4; i++) ghostFace[i] = in->GhostFace()[i] / in->X(4);
224 
225  DslashPolicyTune dslash_policy(*dslash, const_cast<cudaColorSpinorField*>(in), in->Volume()/in->X(4), ghostFace, profile);
226  dslash_policy.apply(0);
227 
228  delete dslash;
229 #else
230  errorQuda("Domain wall dslash has not been built");
231 #endif
232  }
233 
234 }
virtual long long bytes() const
cudaDeviceProp deviceProp
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
#define mferm
int Ls
Definition: test_util.cpp:39
QudaGaugeParam param
Definition: pack_test.cpp:17
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define warningQuda(...)
Definition: util_quda.h:101
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param)
cpuColorSpinorField * out
void domainWallDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &m_f, const double &k, const int *commDim, TimeProfile &profile)
unsigned long long flops
Definition: blas_quda.cu:42
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:230
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
virtual long long flops() const
QudaParity parity
Definition: covdev_test.cpp:53
#define a
unsigned long long bytes
Definition: blas_quda.cu:43
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:254