QUDA v1.1.0
A library for QCD on GPUs
clover_deriv_quda.cu
#include <cstdio>
#include <cstdlib>

#include <tune_quda.h>
#include <gauge_field.h>
#include <jitify_helper.cuh>
#include <kernels/clover_deriv.cuh>

/**
   @file clover_deriv_quda.cu

   @brief This kernel has been a bit of a pain to optimize since it is
   excessively register bound.  To reduce register pressure we use
   shared memory to offload some of it.  Annoyingly, the optimal
   approach for CUDA 8.0 is not the same as for CUDA 7.5, so the
   implementation is compiler-version dependent.  The CUDA 8.0 optimal
   code runs 10x slower on 7.5, though the 7.5 code runs fine on 8.0.

   CUDA >= 8.0
   - Use shared memory for the force accumulator matrix
   - Template on mu / nu to prevent register spilling of the indexing arrays
   - Force the computation routine to inline

   CUDA <= 7.5
   - Use shared memory for the force accumulator matrix
   - Keep mu / nu dynamic and use shared memory to store the indexing arrays
   - Do not inline the computation routine

   For the shared-memory dynamic indexing arrays we use chars: since
   the arrays hold 4-d coordinates, a full coordinate (four chars) can
   be stored in a single 32-bit word, which means we do not have to
   worry about bank conflicts, and the shared arrays can be passed to
   the usual indexing routines (getCoordsExtended and linkIndexShift)
   with no code changes.  This strategy works as long as each local
   lattice coordinate is less than 256.  A short packing sketch follows
   this comment.
 */
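
// Below is a minimal sketch of the char-packing idea described above
// (guarded out of the build; the function name and values are
// illustrative, not part of QUDA): four coordinates, each less than
// 256, fit in four chars, i.e. one 32-bit word of shared memory per
// thread.
#if 0
__device__ void pack_coords_sketch(char coord[4])
{
  const int x[4] = {12, 3, 7, 1}; // example local coordinates, each < 256
  for (int d = 0; d < 4; d++) coord[d] = static_cast<char>(x[d]);
  // coord[] can now be passed to the templated indexing routines
  // (getCoordsExtended, linkIndexShift) in place of an int array.
}
#endif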

namespace quda {

#ifdef GPU_CLOVER_DIRAC

  template <typename Float, typename Arg>
  class CloverDerivative : public TunableVectorY {

  private:
    Arg arg;
    const GaugeField &meta;

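    // In the sizes below, 18 * sizeof(Float) corresponds to one 3x3
    // complex matrix (18 real numbers) for the per-thread force
    // accumulator; the extra 8 bytes plausibly hold two char-packed
    // 4-d coordinate arrays (one 32-bit word each) for the
    // SHARED_ARRAY indexing scheme described in the file comment.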
#if defined(SHARED_ACCUMULATOR) && defined(SHARED_ARRAY)
    unsigned int sharedBytesPerThread() const { return 18 * sizeof(Float) + 8; }
#elif defined(SHARED_ACCUMULATOR)
    unsigned int sharedBytesPerThread() const { return 18 * sizeof(Float); }
#else
    unsigned int sharedBytesPerThread() const { return 0; }
#endif
    unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

    unsigned int minThreads() const { return arg.volumeCB; }
    bool tuneGridDim() const { return false; } // don't tune the grid dimension

  public:
    CloverDerivative(const Arg &arg, const GaugeField &meta) : TunableVectorY(2), arg(arg), meta(meta)
    {
      writeAuxString("threads=%d,prec=%lu,fstride=%d,gstride=%d,ostride=%d",
                     arg.volumeCB, sizeof(Float), arg.force.stride,
                     arg.gauge.stride, arg.oprod.stride);
#ifdef JITIFY
      create_jitify_program("kernels/clover_deriv.cuh");
#endif
    }
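    // Note: the aux string written above keys the autotuner cache to
    // this specific thread count, precision, and set of field strides,
    // so tuned launch parameters are not reused across incompatible
    // problem geometries.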
    virtual ~CloverDerivative() {}

    void apply(const qudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
#ifdef JITIFY
      using namespace jitify::reflection;
      jitify_error = program->kernel("quda::cloverDerivativeKernel")
                         .instantiate(Type<Float>(), Type<Arg>())
                         .configure(tp.grid, tp.block, tp.shared_bytes, stream)
                         .launch(arg);
#else
      qudaLaunchKernel(cloverDerivativeKernel<Float, Arg>, tp, stream, arg);
#endif
    } // apply

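    // Beyond the x / y tuning provided by TunableVectorY, also sweep
    // the z block dimension from 1 to 4; z appears to index the four
    // mu directions, so the grid is always sized to cover all four.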
    bool advanceBlockDim(TuneParam &param) const
    {
      dim3 block = param.block;
      dim3 grid = param.grid;
      bool rtn = TunableVectorY::advanceBlockDim(param);
      param.block.z = block.z;
      param.grid.z = grid.z;

      if (!rtn) {
        if (param.block.z < 4) {
          param.block.z++;
          param.grid.z = (4 + param.block.z - 1) / param.block.z;
          rtn = true;
        } else {
          param.block.z = 1;
          param.grid.z = 4;
          rtn = false;
        }
      }
      return rtn;
    }

    void initTuneParam(TuneParam &param) const
    {
      TunableVectorY::initTuneParam(param);
      param.block.y = 1;
      param.block.z = 1;
      param.grid.y = 2;
      param.grid.z = 4;
    }

    void defaultTuneParam(TuneParam &param) const { initTuneParam(param); }

    // The force field is updated so we must preserve its initial state
    void preTune() { arg.force.save(); }
    void postTune() { arg.force.load(); }

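    // Cost model: 198 flops is the standard count for one 3x3 complex
    // matrix-matrix multiply; the remaining factors presumably count
    // the multiplies per checkerboard site across the mu / nu loops.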
    long long flops() const { return 16 * 198 * 3 * 4 * 2 * (long long)arg.volumeCB; }
    long long bytes() const { return ((8 * arg.gauge.Bytes() + 4 * arg.oprod.Bytes()) * 3 + 2 * arg.force.Bytes()) * 4 * 2 * arg.volumeCB; }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
  };

  template <typename Float>
  void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff,
                        int parity)
  {
    if (oprod.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("Oprod field does not support reconstruction");

    if (force.Order() != oprod.Order()) errorQuda("Force and Oprod orders must match");

    if (force.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("Force field does not support reconstruction");

    if (force.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
      typedef gauge::FloatNOrder<Float, 18, 2, 18> F;
      typedef gauge::FloatNOrder<Float, 18, 2, 18> O;

      if (gauge.isNative()) {
        if (gauge.Reconstruct() == QUDA_RECONSTRUCT_NO) {
          typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO>::type G;
          typedef CloverDerivArg<Float, F, G, O> Arg;
          Arg arg(F(force), G(gauge), O(oprod), force.X(), oprod.X(), coeff, parity);
          CloverDerivative<Float, Arg> deriv(arg, gauge);
          deriv.apply(0);
#if 0
        } else if (gauge.Reconstruct() == QUDA_RECONSTRUCT_12) {
          typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_12>::type G;
          typedef CloverDerivArg<Float, F, G, O> Arg;
          Arg arg(F(force), G(gauge), O(oprod), force.X(), oprod.X(), coeff, parity);
          CloverDerivative<Float, Arg> deriv(arg, gauge);
          deriv.apply(0);
#endif
        } else {
          errorQuda("Reconstruction type %d not supported", gauge.Reconstruct());
        }
      } else {
        errorQuda("Gauge order %d not supported", gauge.Order());
      }
    } else {
      errorQuda("Force order %d not supported", force.Order());
    } // force / oprod order

    qudaDeviceSynchronize();
  }
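  // The #if 0 guards above (reconstruct-12 gauge fields) and below
  // (single precision) appear to keep alternative instantiations
  // written but compiled out, presumably to bound compile time and
  // binary size; enabling one only requires removing its guard.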
#endif // GPU_CLOVER_DIRAC

  void cloverDerivative(
      cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff, QudaParity parity)
  {
#ifdef GPU_CLOVER_DIRAC
    assert(oprod.Geometry() == QUDA_TENSOR_GEOMETRY);
    assert(force.Geometry() == QUDA_VECTOR_GEOMETRY);

    for (int d = 0; d < 4; d++) {
      if (oprod.X()[d] != gauge.X()[d])
        errorQuda("Incompatible extended dimensions d=%d gauge=%d oprod=%d", d, gauge.X()[d], oprod.X()[d]);
    }

    int device_parity = (parity == QUDA_EVEN_PARITY) ? 0 : 1;

    if (force.Precision() == QUDA_DOUBLE_PRECISION) {
      cloverDerivative<double>(force, gauge, oprod, coeff, device_parity);
#if 0
    } else if (force.Precision() == QUDA_SINGLE_PRECISION) {
      cloverDerivative<float>(force, gauge, oprod, coeff, device_parity);
#endif
    } else {
      errorQuda("Precision %d not supported", force.Precision());
    }

    return;
#else
    errorQuda("Clover has not been built");
#endif
  }

} // namespace quda
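
// A minimal usage sketch (guarded out of the build; field construction
// is elided, and the wrapper function name is illustrative, not part
// of QUDA): the public entry point accumulates the clover-force
// contribution into `force` for one parity at a time, given the
// extended gauge field and the outer-product (oprod) field.
#if 0
void example(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod)
{
  double coeff = 1.0; // overall coefficient multiplying the contribution
  quda::cloverDerivative(force, gauge, oprod, coeff, QUDA_EVEN_PARITY); // even sites
  quda::cloverDerivative(force, gauge, oprod, coeff, QUDA_ODD_PARITY);  // odd sites
}
#endif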