QUDA  0.9.0
dslash_staggered.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these macros control how each data type is accessed (direct vs. texture loads) for the staggered action
10 #ifdef GPU_STAGGERED_DIRAC
11 #if (__COMPUTE_CAPABILITY__ >= 300) // Kepler works best with texture loads only
12 //#define DIRECT_ACCESS_FAT_LINK
13 //#define DIRECT_ACCESS_LONG_LINK
14 //#define DIRECT_ACCESS_SPINOR
15 //#define DIRECT_ACCESS_ACCUM
16 //#define DIRECT_ACCESS_INTER
17 //#define DIRECT_ACCESS_PACK
18 #else // Fermi
19 //#define DIRECT_ACCESS_FAT_LINK
20 //#define DIRECT_ACCESS_LONG_LINK
21 //#define DIRECT_ACCESS_SPINOR
22 //#define DIRECT_ACCESS_ACCUM
23 //#define DIRECT_ACCESS_INTER
24 //#define DIRECT_ACCESS_PACK
25 #endif
26 
27 #endif // GPU_STAGGERED_DIRAC
28 
29 #include <quda_internal.h>
30 #include <dslash_quda.h>
31 #include <sys/time.h>
32 #include <blas_quda.h>
33 
34 #include <inline_ptx.h>
35 
36 namespace quda {
37 
38  namespace staggered {
39 #include <dslash_constants.h>
40 #include <dslash_textures.h>
41 #include <dslash_index.cuh>
42 
43 #undef GPU_CLOVER_DIRAC
44 #undef GPU_DOMAIN_WALL_DIRAC
45 #define DD_IMPROVED 0
46 #include <staggered_dslash_def.h> // staggered Dslash kernels
47 #undef DD_IMPROVED
48 
49 #include <dslash_quda.cuh>
50  } // end namespace staggered
51 
52  // declare the dslash events
53 #include <dslash_events.cuh>
54 
55  using namespace staggered;
56 
57 #ifdef GPU_STAGGERED_DIRAC
// Tunable host-side launcher for the staggered dslash kernels.
//
// Template parameters:
//   sFloat - spinor storage type; selects the texture bind/unbind
//            specialisation and, under PARALLEL_DIR, the shared-memory
//            register size.
//   gFloat - gauge storage type (not referenced directly in this class body;
//            presumably consumed by the STAGGERED_DSLASH macro - TODO confirm).
template <typename sFloat, typename gFloat>
class StaggeredDslashCuda : public DslashCuda {

private:
  // Extent of the input field's fifth dimension (in->X(4)).  The tuner treats
  // it as the number of right-hand sides and spreads it over block.y/grid.y.
  const unsigned int nSrc;

protected:
  // Request tuning of the aux dimensions as well (see advanceAux).
  bool tuneAuxDim() const { return true; }

  // Shared-memory footprint per thread in bytes: six values of the underlying
  // real type when PARALLEL_DIR is defined, nothing otherwise.
  unsigned int sharedBytesPerThread() const
  {
#ifdef PARALLEL_DIR
    const bool isDouble = (typeid(sFloat) == typeid(double2));
    const int bytesPerReal = isDouble ? sizeof(double) : sizeof(float);
    return 6 * bytesPerReal;
#else
    return 0;
#endif
  }

public:
  // Forwards the fields and flags to DslashCuda, records the number of
  // right-hand sides and fills the staggered-specific kernel parameters.
  // Reports an error through errorQuda for reconstruct types 9 and 13.
  StaggeredDslashCuda(cudaColorSpinorField *out, const GaugeField &gauge, const cudaColorSpinorField *in,
                      const cudaColorSpinorField *x, const double a,
                      const int parity, const int dagger, const int *commOverride)
    : DslashCuda(out, in, x, gauge, parity, dagger, commOverride), nSrc(in->X(4))
  {
    switch (gauge.Reconstruct()) {
    case QUDA_RECONSTRUCT_9:
    case QUDA_RECONSTRUCT_13:
      errorQuda("Reconstruct %d not supported", gauge.Reconstruct());
      break;
    default:
      break;
    }

    // the scale factor is stored twice: once in dslashParam.a and once in dslashParam.a_f
    dslashParam.a_f = a;
    dslashParam.a = a;
    dslashParam.fat_link_max = gauge.LinkMax();
  }

  // Releases the spinor texture bindings.
  virtual ~StaggeredDslashCuda()
  {
    unbindSpinorTex<sFloat>(in, out, x);
  }

  // Looks up the tuned launch configuration and launches the kernel on
  // `stream`.  The statement order below is kept exactly as in the original.
  void apply(const cudaStream_t &stream)
  {
#ifndef USE_TEXTURE_OBJECTS
    // without texture objects, bind the spinor textures for the interior kernel only
    if (dslashParam.kernel_type == INTERIOR_KERNEL) {
      bindSpinorTex<sFloat>(in, out, x);
    }
#endif // USE_TEXTURE_OBJECTS
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    setParam();
    dslashParam.swizzle = tp.aux.x; // swizzle factor comes from the tuned aux.x
    STAGGERED_DSLASH(tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
  }

  // Advances the block dimensions for the autotuner.  block.y (right-hand
  // sides per block) is grown first; once it cannot grow any further the base
  // class advances its own dimensions and block.y/grid.y are reset.
  // Returns true if a new configuration was produced.
  bool advanceBlockDim(TuneParam &param) const
  {
    const unsigned int max_shared = deviceProp.sharedMemPerBlock;
    const unsigned int maxBlockY = (unsigned int)deviceProp.maxThreadsDim[1];
    const unsigned int maxThreads = (unsigned int)deviceProp.maxThreadsPerBlock;

    const bool roomInY = param.block.y < nSrc && param.block.y < maxBlockY;
    const bool sharedFits = sharedBytesPerThread()*param.block.x*param.block.y < max_shared;
    const bool threadsFit = (param.block.x*(param.block.y+1u)) <= maxThreads;

    if (roomInY && sharedFits && threadsFit) {
      param.block.y++;
      // ceiling division so that grid.y * block.y covers every right-hand side
      param.grid.y = (nSrc + param.block.y - 1) / param.block.y;
      return true;
    }

    // block.y is exhausted: let the base class advance, then restart block.y
    const bool advanced = DslashCuda::advanceBlockDim(param);
    param.block.y = 1;
    param.grid.y = nSrc;
    return advanced;
  }

  // Advances the aux tuning parameter.  With SWIZZLE defined aux.x is stepped
  // up to 2*multiProcessorCount and then wrapped back to 1; without SWIZZLE
  // there is nothing to tune.
  bool advanceAux(TuneParam &param) const
  {
#ifdef SWIZZLE
    if (param.aux.x >= 2*deviceProp.multiProcessorCount) {
      param.aux.x = 1;
      return false;
    }
    param.aux.x++;
    return true;
#else
    return false;
#endif
  }

  // Starting point of the tuning scan: base-class defaults, one right-hand
  // side per block, one grid row per right-hand side, and aux.x = 1.
  void initTuneParam(TuneParam &param) const
  {
    DslashCuda::initTuneParam(param);
    param.aux.x = 1;
    param.grid.y = nSrc;
    param.block.y = 1;
  }

  // The untuned default is identical to the initial tuning point.
  void defaultTuneParam(TuneParam &param) const
  {
    initTuneParam(param);
  }

  // Value reported through the Nface() hook.
  int Nface() const
  {
    return 2;
  }
};
148 #endif // GPU_STAGGERED_DIRAC
149 
150 #include <dslash_policy.cuh>
151 
// Applies the staggered dslash to `in`, writing to `out`.
//
// A StaggeredDslashCuda launcher matching the precision of `in` is created,
// handed to DslashPolicyTune and applied; the launcher is deleted afterwards.
//
//   out          - destination field
//   gauge        - gauge field passed to the launcher
//   in           - source field; its precision selects the kernel instantiation
//   parity       - parity flag forwarded to the launcher
//   dagger       - dagger flag forwarded to the launcher
//   x            - auxiliary field forwarded to the launcher
//   k            - scale factor, stored by the launcher in dslashParam.a / a_f
//   commOverride - per-dimension communication override forwarded to the launcher
//   profile      - time profile handed to the dslash policy
//
// Reports an error through errorQuda when the library was built without
// GPU_STAGGERED_DIRAC or when the precision of `in` is not double, single or
// half.
void staggeredDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge,
                         const cudaColorSpinorField *in, const int parity,
                         const int dagger, const cudaColorSpinorField *x,
                         const double &k, const int *commOverride, TimeProfile &profile)
{
#ifdef GPU_STAGGERED_DIRAC
  const_cast<cudaColorSpinorField*>(in)->createComms(1);

  DslashCuda *dslash = nullptr;
  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
    dslash = new StaggeredDslashCuda<double2, double2>(out, gauge, in, x, k, parity, dagger, commOverride);
  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
    dslash = new StaggeredDslashCuda<float2, float2>(out, gauge, in, x, k, parity, dagger, commOverride);
  } else if (in->Precision() == QUDA_HALF_PRECISION) {
    dslash = new StaggeredDslashCuda<short2, short2>(out, gauge, in, x, k, parity, dagger, commOverride);
  } else {
    // Previously an unrecognised precision left dslash null and it was
    // dereferenced below; fail explicitly instead.
    errorQuda("Unsupported precision %d", in->Precision());
    return;
  }

  // the parameters passed to the dslash policy must be the 4-d volume and
  // 3-d faces because the fifth dimension (in->X(4)) is added as the
  // y-dimension in thread space
  int ghostFace[QUDA_MAX_DIM];
  for (int i=0; i<4; i++) ghostFace[i] = in->GhostFace()[i] / in->X(4);

  DslashPolicyTune dslash_policy(*dslash, const_cast<cudaColorSpinorField*>(in), in->Volume()/in->X(4), ghostFace, profile);
  dslash_policy.apply(0);

  delete dslash;
#else
  errorQuda("Staggered dslash has not been built");
#endif // GPU_STAGGERED_DIRAC
}
182 
183 }
cudaDeviceProp deviceProp
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
QudaGaugeParam param
Definition: pack_test.cpp:17
void staggeredDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile)
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
cpuColorSpinorField * out
#define STAGGERED_DSLASH(gridDim, blockDim, shared, stream, param)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaParity parity
Definition: covdev_test.cpp:53
#define a