QUDA  0.9.0
dslash_improved_staggered.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 //these are access control for staggered action
10 #ifdef GPU_STAGGERED_DIRAC
11 #if (__COMPUTE_CAPABILITY__ >= 300) // Kepler works best with texture loads only
12 //#define DIRECT_ACCESS_FAT_LINK
13 //#define DIRECT_ACCESS_LONG_LINK
14 //#define DIRECT_ACCESS_SPINOR
15 //#define DIRECT_ACCESS_ACCUM
16 //#define DIRECT_ACCESS_INTER
17 //#define DIRECT_ACCESS_PACK
18 #else // Fermi
19 //#define DIRECT_ACCESS_FAT_LINK
20 //#define DIRECT_ACCESS_LONG_LINK
21 //#define DIRECT_ACCESS_SPINOR
22 //#define DIRECT_ACCESS_ACCUM
23 //#define DIRECT_ACCESS_INTER
24 //#define DIRECT_ACCESS_PACK
25 #endif
26 #endif // GPU_STAGGERED_DIRAC
27 
28 #include <quda_internal.h>
29 #include <dslash_quda.h>
30 #include <sys/time.h>
31 #include <blas_quda.h>
32 
33 #include <inline_ptx.h>
34 
35 namespace quda {
36 
37  namespace improvedstaggered {
38 #include <dslash_constants.h>
39 #include <dslash_textures.h>
40 #include <dslash_index.cuh>
41 
42 #undef GPU_NDEG_TWISTED_MASS_DIRAC
43 #undef GPU_CLOVER_DIRAC
44 #undef GPU_DOMAIN_WALL_DIRAC
45 #define DD_IMPROVED 1
46 #include <staggered_dslash_def.h> // staggered Dslash kernels
47 #undef DD_IMPROVED
48 
49 #include <dslash_quda.cuh>
50  } // end namespace improvedstaggered
51 
52  // declare the dslash events
53 #include <dslash_events.cuh>
54 
55  using namespace improvedstaggered;
56 
57 #ifdef GPU_STAGGERED_DIRAC
  /**
     Driver class for the improved (asqtad) staggered dslash GPU kernels.
     Handles texture binding of the fat- and long-link gauge fields,
     launch-parameter autotuning (including multiple right-hand sides in
     the block/grid y dimension), and flop/byte accounting for the tuner.

     Template parameters select the storage types: sFloat for the spinor,
     fatGFloat/longGFloat for the two gauge fields, phaseFloat for the
     long-link phases.
  */
  template <typename sFloat, typename fatGFloat, typename longGFloat, typename phaseFloat>
  class StaggeredDslashCuda : public DslashCuda {

  private:
    const GaugeField &fatGauge;   // first-neighbor (fat) link field
    const GaugeField &longGauge;  // third-neighbor (long) link field
    const unsigned int nSrc;      // number of right-hand sides, packed as the 5th dimension in->X(4)

  protected:
    bool tuneAuxDim() const { return true; } // Do tune the aux dimensions.

    // Shared-memory requirement per thread for the launch-bounds tuner.
    unsigned int sharedBytesPerThread() const
    {
#ifdef PARALLEL_DIR
      // 6 real components per thread when the parallel-direction kernels
      // stage partial results in shared memory; element size follows the
      // spinor precision (double2 => double, otherwise float).
      int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
      return 6 * reg_size;
#else
      return 0;
#endif
    }

  public:
    /**
       Bind gauge textures and populate dslashParam.
       @param out          Result spinor field
       @param fatGauge     Fat-link gauge field
       @param longGauge    Long-link gauge field
       @param in           Input spinor field
       @param x            Auxiliary spinor for the axpy (may be null)
       @param a            axpy scale factor
       @param parity       Checkerboard parity being updated
       @param dagger       Whether to apply the daggered operator
       @param commOverride Per-dimension communication override flags
    */
    StaggeredDslashCuda(cudaColorSpinorField *out, const GaugeField &fatGauge, const GaugeField &longGauge,
                        const cudaColorSpinorField *in, const cudaColorSpinorField *x, const double a,
                        const int parity, const int dagger, const int *commOverride)
      : DslashCuda(out, in, x, longGauge, parity, dagger, commOverride),
        fatGauge(fatGauge), longGauge(longGauge), nSrc(in->X(4))
    {
#ifdef MULTI_GPU
      // The long links reach 3 sites; a partitioned dimension shorter than
      // 6 (= 2*3 halo depth) cannot host the exchange.
      for(int i=0;i < 4; i++){
        if(comm_dim_partitioned(i) && (fatGauge.X()[i] < 6)){
          errorQuda("ERROR: partitioned dimension with local size less than 6 is not supported in improved staggered dslash\n");
        }
      }
#endif

      bindFatGaugeTex(static_cast<const cudaGaugeField&>(fatGauge), parity, dslashParam);
      bindLongGaugeTex(static_cast<const cudaGaugeField&>(longGauge), parity, dslashParam);

      // NOTE(review): the format string below is missing its closing ')' —
      // cosmetic only, the message still prints.
      if (in->Precision() != fatGauge.Precision() || in->Precision() != longGauge.Precision()){
        errorQuda("Mixing gauge and spinor precision not supported"
                  "(precision=%d, fatlinkGauge.precision=%d, longGauge.precision=%d",
                  in->Precision(), fatGauge.Precision(), longGauge.Precision());
      }

      dslashParam.a = a;                             // double-precision axpy coefficient
      dslashParam.a_f = a;                           // single-precision copy for float kernels
      dslashParam.fat_link_max = fatGauge.LinkMax();
      dslashParam.coeff = 1.0/longGauge.Scale();     // long-link rescaling factor
      dslashParam.coeff_f = (float)dslashParam.coeff;
    }

    // Release spinor and gauge textures bound in the constructor / apply().
    virtual ~StaggeredDslashCuda() {
      unbindSpinorTex<sFloat>(in, out, x);
      unbindFatGaugeTex(static_cast<const cudaGaugeField&>(fatGauge));
      unbindLongGaugeTex(static_cast<const cudaGaugeField&>(longGauge));
    }

    /**
       Launch the tuned dslash kernel on the given stream.  Spinor
       textures are (re)bound only for the interior kernel; exterior
       kernels reuse the existing bindings.
    */
    void apply(const cudaStream_t &stream)
    {
#ifndef USE_TEXTURE_OBJECTS
      if (dslashParam.kernel_type == INTERIOR_KERNEL) bindSpinorTex<sFloat>(in, out, x);
#endif // USE_TEXTURE_OBJECTS
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      setParam();
      dslashParam.gauge_stride = fatGauge.Stride();
      dslashParam.long_gauge_stride = longGauge.Stride();
      dslashParam.swizzle = tp.aux.x; // block-to-SM swizzle factor chosen by the tuner
      IMPROVED_STAGGERED_DSLASH(tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
    }

    /**
       Advance the tuning block dimensions.  First grows block.y (number
       of right-hand sides handled per block) while resource limits
       allow; once exhausted, falls back to the base-class block.x
       advance and resets the y dimensions.
    */
    bool advanceBlockDim(TuneParam &param) const
    {
      const unsigned int max_shared = deviceProp.sharedMemPerBlock;
      // first try to advance block.y (number of right-hand sides per block)
      if (param.block.y < nSrc && param.block.y < (unsigned int)deviceProp.maxThreadsDim[1] &&
          sharedBytesPerThread()*param.block.x*param.block.y < max_shared &&
          (param.block.x*(param.block.y+1u)) <= (unsigned int)deviceProp.maxThreadsPerBlock) {
        param.block.y++;
        param.grid.y = (nSrc + param.block.y - 1) / param.block.y; // ceil-div over sources
        return true;
      } else {
        bool rtn = DslashCuda::advanceBlockDim(param);
        param.block.y = 1;
        param.grid.y = nSrc;
        return rtn;
      }
    }

    // Sweep the aux.x (swizzle) parameter from 1 to 2*SM count when
    // swizzling is compiled in; otherwise there is nothing to tune.
    bool advanceAux(TuneParam &param) const
    {
#ifdef SWIZZLE
      if (param.aux.x < 2*deviceProp.multiProcessorCount) {
        param.aux.x++;
        return true;
      } else {
        param.aux.x = 1;
        return false;
      }
#else
      return false;
#endif
    }

    // Start tuning with one right-hand side per block and no swizzle.
    void initTuneParam(TuneParam &param) const
    {
      DslashCuda::initTuneParam(param);
      param.block.y = 1;
      param.grid.y = nSrc;
      param.aux.x = 1;
    }

    void defaultTuneParam(TuneParam &param) const { initTuneParam(param); }

    // Face depth for halo exchange; long links reach 3 sites (cf. the
    // createComms(3) in the caller), giving 6 here.
    int Nface() const { return 6; }

    /*
      per direction / dimension flops
      SU(3) matrix-vector flops = (8 Nc - 2) * Nc
      xpay = 2 * 2 * Nc * Ns

      So for the full dslash we have
      flops = (2 * 2 * Nd * (8*Nc-2) * Nc) + ((2 * 2 * Nd - 1) * 2 * Nc * Ns)
      flops_xpay = flops + 2 * 2 * Nc * Ns

      For Asqtad this should give 1146 for Nc=3,Ns=2 and 1158 for the axpy equivalent
    */
    virtual long long flops() const {
      int mv_flops = (8 * in->Ncolor() - 2) * in->Ncolor(); // SU(3) matrix-vector flops
      // (3 long-link + 1 fat-link) hops contribute per ghost site
      int ghost_flops = (3 + 1) * (mv_flops + 2*in->Ncolor()*in->Nspin());
      int xpay_flops = 2 * 2 * in->Ncolor() * in->Nspin(); // multiply and add per real component
      int num_dir = 2 * 4; // dir * dim

      long long flops = 0;
      switch(dslashParam.kernel_type) {
      case EXTERIOR_KERNEL_X:
      case EXTERIOR_KERNEL_Y:
      case EXTERIOR_KERNEL_Z:
      case EXTERIOR_KERNEL_T:
        // assumes EXTERIOR_KERNEL_{X,Y,Z,T} enumerate to dimension
        // indices 0..3 so they index GhostFace() directly — TODO confirm
        flops = ghost_flops * 2 * in->GhostFace()[dslashParam.kernel_type];
        break;
      case EXTERIOR_KERNEL_ALL:
        {
          long long ghost_sites = 2 * (in->GhostFace()[0]+in->GhostFace()[1]+in->GhostFace()[2]+in->GhostFace()[3]);
          flops = ghost_flops * ghost_sites;
          break;
        }
      case INTERIOR_KERNEL:
      case KERNEL_POLICY:
        {
          long long sites = in->VolumeCB();
          flops = (2*num_dir*mv_flops + // SU(3) matrix-vector multiplies
                   (2*num_dir-1)*2*in->Ncolor()*in->Nspin()) * sites; // accumulation
          if (x) flops += xpay_flops * sites; // axpy is always on interior

          if (dslashParam.kernel_type == KERNEL_POLICY) break;
          // now correct for flops done by exterior kernel
          long long ghost_sites = 0;
          for (int d=0; d<4; d++) if (dslashParam.commDim[d]) ghost_sites += 2 * in->GhostFace()[d];
          flops -= ghost_flops * ghost_sites;

          break;
        }
      }
      return flops;
    }

    virtual long long bytes() const {
      int gauge_bytes_fat = QUDA_RECONSTRUCT_NO * in->Precision();
      int gauge_bytes_long = reconstruct * in->Precision();
      // NOTE(review): this relies on the half-precision enum value
      // coinciding with sizeof(short) — verify against QudaPrecision.
      bool isHalf = in->Precision() == sizeof(short) ? true : false;
      // half precision carries an extra float norm per site
      int spinor_bytes = 2 * in->Ncolor() * in->Nspin() * in->Precision() + (isHalf ? sizeof(float) : 0);
      int ghost_bytes = 3 * (spinor_bytes + gauge_bytes_long) + (spinor_bytes + gauge_bytes_fat) + spinor_bytes;
      int num_dir = 2 * 4; // set to 4 dimensions since we take care of 5-d fermions in derived classes where necessary

      long long bytes = 0;
      switch(dslashParam.kernel_type) {
      case EXTERIOR_KERNEL_X:
      case EXTERIOR_KERNEL_Y:
      case EXTERIOR_KERNEL_Z:
      case EXTERIOR_KERNEL_T:
        bytes = ghost_bytes * 2 * in->GhostFace()[dslashParam.kernel_type];
        break;
      case EXTERIOR_KERNEL_ALL:
        {
          long long ghost_sites = 2 * (in->GhostFace()[0]+in->GhostFace()[1]+in->GhostFace()[2]+in->GhostFace()[3]);
          bytes = ghost_bytes * ghost_sites;
          break;
        }
      case INTERIOR_KERNEL:
      case KERNEL_POLICY:
        {
          long long sites = in->VolumeCB();
          bytes = (num_dir*(gauge_bytes_fat + gauge_bytes_long) + // gauge reads
                   num_dir*2*spinor_bytes + // spinor reads
                   spinor_bytes)*sites; // spinor write
          // NOTE(review): flops() scales the corresponding xpay term by
          // sites; here spinor_bytes is added only once — verify whether
          // "* sites" was intended.
          if (x) bytes += spinor_bytes;

          if (dslashParam.kernel_type == KERNEL_POLICY) break;
          // now correct for bytes done by exterior kernel
          long long ghost_sites = 0;
          for (int d=0; d<4; d++) if (dslashParam.commDim[d]) ghost_sites += 2*in->GhostFace()[d];
          bytes -= ghost_bytes * ghost_sites;

          break;
        }
      }
      return bytes;
    }

  };
268 #endif // GPU_STAGGERED_DIRAC
269 
270 #include <dslash_policy.cuh>
271 
273  const cudaGaugeField &longGauge, const cudaColorSpinorField *in,
274  const int parity, const int dagger, const cudaColorSpinorField *x,
275  const double &k, const int *commOverride, TimeProfile &profile)
276  {
277 #ifdef GPU_STAGGERED_DIRAC
278  const_cast<cudaColorSpinorField*>(in)->createComms(3);
279 
280  DslashCuda *dslash = nullptr;
281  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
282  dslash = new StaggeredDslashCuda<double2, double2, double2, double>
283  (out, fatGauge, longGauge, in, x, k, parity, dagger, commOverride);
284  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
285  dslash = new StaggeredDslashCuda<float2, float2, float4, float>
286  (out, fatGauge, longGauge, in, x, k, parity, dagger, commOverride);
287  } else if (in->Precision() == QUDA_HALF_PRECISION) {
288  dslash = new StaggeredDslashCuda<short2, short2, short4, short>
289  (out, fatGauge, longGauge, in, x, k, parity, dagger, commOverride);
290  }
291 
292  // the parameters passed to dslashCuda must be 4-d volume and 3-d
293  // faces because Ls is added as the y-dimension in thread space
294  int ghostFace[QUDA_MAX_DIM];
295  for (int i=0; i<4; i++) ghostFace[i] = in->GhostFace()[i] / in->X(4);
296 
297  DslashPolicyTune dslash_policy(*dslash, const_cast<cudaColorSpinorField*>(in), in->Volume()/in->X(4), ghostFace, profile);
298  dslash_policy.apply(0);
299 
300  delete dslash;
301 #else
302  errorQuda("Staggered dslash has not been built");
303 #endif // GPU_STAGGERED_DIRAC
304  }
305 
306 }
void bindFatGaugeTex(const cudaGaugeField &gauge, const int oddBit, T &dslashParam)
cudaDeviceProp deviceProp
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
QudaGaugeParam param
Definition: pack_test.cpp:17
void unbindLongGaugeTex(const cudaGaugeField &gauge)
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
void improvedStaggeredDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &fatGauge, const cudaGaugeField &longGauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile)
cpuColorSpinorField * out
void bindLongGaugeTex(const cudaGaugeField &gauge, const int oddBit, T &dslashParam)
unsigned long long flops
Definition: blas_quda.cu:42
#define IMPROVED_STAGGERED_DSLASH(gridDim, blockDim, shared, stream, param)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaParity parity
Definition: covdev_test.cpp:53
#define a
unsigned long long bytes
Definition: blas_quda.cu:43
int comm_dim_partitioned(int dim)
void unbindFatGaugeTex(const cudaGaugeField &gauge)