QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dslash_domain_wall.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 #include <quda_internal.h>
20 #include <dslash_quda.h>
21 #include <sys/time.h>
22 #include <blas_quda.h>
23 #include <face_quda.h>
24 
25 #include <inline_ptx.h>
26 
27 namespace quda {
28 
29  namespace domainwall {
30 
31 #undef GPU_STAGGERED_DIRAC
32 #include <dslash_constants.h>
33 #include <dslash_textures.h>
34 #include <dslash_index.cuh>
35 
36  // Enable shared memory dslash for Fermi architecture
37  //#define SHARED_WILSON_DSLASH
38  //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access
39 
40 #ifdef GPU_DOMAIN_WALL_DIRAC
41 #include <dw_dslash_def.h> // Domain Wall kernels
42 #endif
43 
44 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
45 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
46 #endif
47 
48 #include <dslash_quda.cuh>
49  }
50 
51  // declare the dslash events
52 #include <dslash_events.cuh>
53 
54  using namespace domainwall;
55 
#ifdef GPU_DOMAIN_WALL_DIRAC
  /**
     Tunable launcher for the 5-d domain-wall dslash kernel.  The 4-d
     checkerboarded volume is mapped onto the x thread dimension and
     the fifth dimension (Ls = in->X(4)) onto the y thread dimension,
     so the autotuner advances a genuinely 2-d block shape.

     @tparam sFloat Spinor word type (double2 / float4 / short4)
     @tparam gFloat Gauge word type (same precision as sFloat)
  */
  template <typename sFloat, typename gFloat>
  class DomainWallDslashCuda : public DslashCuda {

  private:
    const gFloat *gauge0, *gauge1; // even- and odd-parity gauge-field pointers (bound by caller)
    const double mferm;            // domain-wall fermion mass m_f
    const double a;                // xpay scale factor (used when the accumulator x is set)

    /**
       Check that the candidate grid fits within the device limits.
       @param param Tuning parameters whose grid is being validated
       @return false (with a warning) if grid.x or grid.y exceeds the
       corresponding deviceProp.maxGridSize entry, else true
    */
    bool checkGrid(TuneParam &param) const {
      if (param.grid.x > deviceProp.maxGridSize[0] || param.grid.y > deviceProp.maxGridSize[1]) {
        warningQuda("Autotuner is skipping blockDim=(%u,%u,%u), gridDim=(%u,%u,%u) because lattice volume is too large",
                    param.block.x, param.block.y, param.block.z,
                    param.grid.x, param.grid.y, param.grid.z);
        return false;
      } else {
        return true;
      }
    }

  protected:
    /**
       Advance to the next candidate 2-d block shape: block.x steps in
       units of the warp size (covering the 4-d volume), and only when
       block.x wraps around does block.y step by 1 (covering Ls).
       Candidates violating the shared-memory or device limits are
       skipped recursively.
       @param param Tuning parameters to advance in place
       @return false once all candidate block shapes are exhausted
    */
    bool advanceBlockDim(TuneParam &param) const
    {
      const unsigned int max_shared = 16384; // FIXME: use deviceProp.sharedMemPerBlock;
      const int step[2] = { deviceProp.warpSize, 1 };
      bool advance[2] = { false, false };

      // first try to advance block.x
      param.block.x += step[0];
      if (param.block.x > deviceProp.maxThreadsDim[0] ||
          sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
        advance[0] = false;
        param.block.x = step[0]; // reset block.x
      } else {
        advance[0] = true; // successfully advanced block.x
      }

      if (!advance[0]) { // if failed to advance block.x, now try block.y
        param.block.y += step[1];

        // block.y indexes the fifth dimension, so it never exceeds Ls
        if (param.block.y > in->X(4) ||
            sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
          advance[1] = false;
          param.block.y = step[1]; // reset block.y
        } else {
          advance[1] = true; // successfully advanced block.y
        }
      }

      if (advance[0] || advance[1]) {
        param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
                           (in->X(4)+param.block.y-1) / param.block.y, 1);

        bool advance = true;
        // if this grid is too large, recurse until a valid shape is found
        if (!checkGrid(param)) advance = advanceBlockDim(param);
        return advance;
      } else {
        return false;
      }
    }

    // this kernel uses no per-thread shared memory
    unsigned int sharedBytesPerThread() const { return 0; }

  public:
    /**
       @param out         Output spinor field
       @param gauge0      Even-parity gauge-field pointer
       @param gauge1      Odd-parity gauge-field pointer
       @param reconstruct Gauge-field reconstruction type
       @param in          Input spinor field
       @param x           Accumulator field for the xpay variant (may be 0)
       @param mferm       Domain-wall fermion mass
       @param a           xpay scale factor
       @param dagger      Whether to apply the daggered operator
    */
    DomainWallDslashCuda(cudaColorSpinorField *out, const gFloat *gauge0, const gFloat *gauge1,
                         const QudaReconstructType reconstruct, const cudaColorSpinorField *in,
                         const cudaColorSpinorField *x, const double mferm,
                         const double a, const int dagger)
      : DslashCuda(out, in, x, reconstruct, dagger), gauge0(gauge0),
        gauge1(gauge1), mferm(mferm), a(a)
    {
      bindSpinorTex<sFloat>(in, out, x);
    }
    virtual ~DomainWallDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }

    /** Initialize launch parameters with a 2-d grid over (4-d volume, Ls). */
    virtual void initTuneParam(TuneParam &param) const
    {
      Tunable::initTuneParam(param);
      param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
                         (in->X(4)+param.block.y-1) / param.block.y, 1);
      bool ok = true;
      if (!checkGrid(param)) ok = advanceBlockDim(param);
      if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
    }

    /** Default launch parameters for when tuning is disabled. */
    virtual void defaultTuneParam(TuneParam &param) const
    {
      // NOTE(review): base-class call restored to mirror initTuneParam;
      // the extracted listing was missing this line — confirm against
      // the upstream source / tune_quda.h
      Tunable::defaultTuneParam(param);
      param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
                         (in->X(4)+param.block.y-1) / param.block.y, 1);
      bool ok = true;
      if (!checkGrid(param)) ok = advanceBlockDim(param);
      if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
    }

    /** Launch the domain-wall dslash kernel on the given stream with tuned parameters. */
    void apply(const cudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      DSLASH(domainWallDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
             (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1,
             (sFloat*)in->V(), (float*)in->Norm(), mferm, (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
    }

    /** Flop count: bulk and wall 5-th-dimension hops are cheaper than the 4-d stencil. */
    long long flops() const { // FIXME for multi-GPU
      long long Ls = in->X(4);
      long long vol4d = in->VolumeCB()/Ls;
      long long bulk = (Ls-2)*vol4d;
      long long wall = 2*vol4d;
      return (x ? 1368ll : 1320ll)*in->VolumeCB() + 96ll*bulk + 120ll*wall;
    }
  };
#endif // GPU_DOMAIN_WALL_DIRAC
169 
170 #include <dslash_policy.cuh>
171 
173  const cudaColorSpinorField *in, const int parity, const int dagger,
174  const cudaColorSpinorField *x, const double &m_f, const double &k2,
175  const int *commOverride, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy)
176  {
177  inSpinor = (cudaColorSpinorField*)in; // EVIL
178 
179  dslashParam.parity = parity;
180 
181 #ifdef GPU_DOMAIN_WALL_DIRAC
182  //currently splitting in space-time is impelemented:
183  int dirs = 4;
184  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
185  for(int i = 0;i < dirs; i++){
186  dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
187  dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
188  dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
189  dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
190  }
191 
192  void *gauge0, *gauge1;
193  bindGaugeTex(gauge, parity, &gauge0, &gauge1);
194 
195  if (in->Precision() != gauge.Precision())
196  errorQuda("Mixing gauge and spinor precision not supported");
197 
198  DslashCuda *dslash = 0;
199  size_t regSize = sizeof(float);
200 
201  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
202 #if (__COMPUTE_CAPABILITY__ >= 130)
203  dslash = new DomainWallDslashCuda<double2,double2>(out, (double2*)gauge0, (double2*)gauge1,
204  gauge.Reconstruct(), in, x, m_f, k2, dagger);
205  regSize = sizeof(double);
206 #else
207  errorQuda("Double precision not supported on this GPU");
208 #endif
209  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
210  dslash = new DomainWallDslashCuda<float4,float4>(out, (float4*)gauge0, (float4*)gauge1,
211  gauge.Reconstruct(), in, x, m_f, k2, dagger);
212  } else if (in->Precision() == QUDA_HALF_PRECISION) {
213  dslash = new DomainWallDslashCuda<short4,short4>(out, (short4*)gauge0, (short4*)gauge1,
214  gauge.Reconstruct(), in, x, m_f, k2, dagger);
215  }
216 
217  // the parameters passed to dslashCuda must be 4-d volume and 3-d
218  // faces because Ls is added as the y-dimension in thread space
219  int ghostFace[QUDA_MAX_DIM];
220  for (int i=0; i<4; i++) ghostFace[i] = in->GhostFace()[i] / in->X(4);
221  dslashCuda(*dslash, regSize, parity, dagger, in->Volume() / in->X(4), ghostFace, profile);
222 
223 #ifndef GPU_COMMS
224  DslashPolicyImp* dslashImp = DslashFactory::create(dslashPolicy);
225 #else
226  DslashPolicyImp* dslashImp = DslashFactory::create(QUDA_GPU_COMMS_DSLASH);
227 #endif
228 
229  (*dslashImp)(*dslash, const_cast<cudaColorSpinorField*>(in), regSize, parity, dagger, in->Volume()/in->X(4), ghostFace, profile);
230  delete dslashImp;
231 
232  delete dslash;
233  unbindGaugeTex(gauge);
234 
235  checkCudaError();
236 #else
237  errorQuda("Domain wall dslash has not been built");
238 #endif
239  }
240 
241 }
void unbindGaugeTex(const cudaGaugeField &gauge)
int commDimPartitioned(int dir)
cudaDeviceProp deviceProp
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
const int * X() const
#define errorQuda(...)
Definition: util_quda.h:73
cudaStream_t * stream
int GhostNormOffset(const int i) const
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:175
QudaDagType dagger
Definition: test_util.cpp:1558
void domainWallDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &m_f, const double &k, const int *commDim, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy=QUDA_DSLASH)
int Ls
Definition: test_util.cpp:40
QudaGaugeParam param
Definition: pack_test.cpp:17
__constant__ int ghostFace[QUDA_MAX_DIM+1]
QudaPrecision Precision() const
cpuColorSpinorField * in
enum QudaDslashPolicy_s QudaDslashPolicy
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:168
#define warningQuda(...)
Definition: util_quda.h:84
int x[4]
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:199
QudaFieldOrder FieldOrder() const
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
QudaPrecision Precision() const
void bindGaugeTex(const cudaGaugeField &gauge, const int oddBit, void **gauge0, void **gauge1)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
#define checkCudaError()
Definition: util_quda.h:110
QudaTune getTuning()
Definition: util_quda.cpp:32
int GhostOffset(const int i) const
const QudaParity parity
Definition: dslash_test.cpp:29
void * gauge[4]
Definition: su3_test.cpp:15
const int * GhostFace() const