QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
clover_deriv_quda.cu
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <cuda.h>
4 #include <tune_quda.h>
5 #include <gauge_field.h>
6 #include <cassert>
7 
8 #include <jitify_helper.cuh>
10 
41 namespace quda {
42 
43 #ifdef GPU_CLOVER_DIRAC
44 
45  template<typename Float, typename Arg>
46  class CloverDerivative : public TunableVectorY {
47 
48  private:
49  Arg arg;
50  const GaugeField &meta;
51 
52 #if defined(SHARED_ACCUMULATOR) && defined(SHARED_ARRAY)
53  unsigned int sharedBytesPerThread() const { return 18*sizeof(Float) + 8; }
54 #elif defined(SHARED_ACCUMULATOR)
55  unsigned int sharedBytesPerThread() const { return 18*sizeof(Float); }
56 #else
57  unsigned int sharedBytesPerThread() const { return 0; }
58 #endif
59  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
60 
61  unsigned int minThreads() const { return arg.volumeCB; }
62  bool tuneGridDim() const { return false; } // don't tune the grid dimension
63 
64  public:
65  CloverDerivative(const Arg &arg, const GaugeField &meta) : TunableVectorY(2), arg(arg), meta(meta) {
66  writeAuxString("threads=%d,prec=%lu,fstride=%d,gstride=%d,ostride=%d",
67  arg.volumeCB,sizeof(Float),arg.force.stride,
68  arg.gauge.stride,arg.oprod.stride);
69 #ifdef JITIFY
70  create_jitify_program("kernels/clover_deriv.cuh");
71 #endif
72  }
73  virtual ~CloverDerivative() {}
74 
75  void apply(const cudaStream_t &stream){
76  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
77 #ifdef JITIFY
78  using namespace jitify::reflection;
79  jitify_error = program->kernel("quda::cloverDerivativeKernel")
80  .instantiate(Type<Float>(), Type<Arg>())
81  .configure(tp.grid, tp.block, tp.shared_bytes, stream)
82  .launch(arg);
83 #else
84  cloverDerivativeKernel<Float><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
85 #endif
86  } // apply
87 
88  bool advanceBlockDim(TuneParam &param) const {
89  dim3 block = param.block;
90  dim3 grid = param.grid;
91  bool rtn = TunableVectorY::advanceBlockDim(param);
92  param.block.z = block.z;
93  param.grid.z = grid.z;
94 
95  if (!rtn) {
96  if (param.block.z < 4) {
97  param.block.z++;
98  param.grid.z = (4 + param.block.z - 1) / param.block.z;
99  rtn = true;
100  } else {
101  param.block.z = 1;
102  param.grid.z = 4;
103  rtn = false;
104  }
105  }
106  return rtn;
107  }
108 
109  void initTuneParam(TuneParam &param) const {
111  param.block.y = 1;
112  param.block.z = 1;
113  param.grid.y = 2;
114  param.grid.z = 4;
115  }
116 
117  void defaultTuneParam(TuneParam &param) const { initTuneParam(param); }
118 
119  // The force field is updated so we must preserve its initial state
120  void preTune() { arg.force.save(); }
121  void postTune() { arg.force.load(); }
122 
123  long long flops() const { return 16 * 198 * 3 * 4 * 2 * (long long)arg.volumeCB; }
124  long long bytes() const { return ((8*arg.gauge.Bytes() + 4*arg.oprod.Bytes())*3 + 2*arg.force.Bytes()) * 4 * 2 * arg.volumeCB; }
125 
126  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
127  };
128 
129  template<typename Float>
130  void cloverDerivative(cudaGaugeField &force,
131  cudaGaugeField &gauge,
132  cudaGaugeField &oprod,
133  double coeff, int parity) {
134 
135  if (oprod.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("Force field does not support reconstruction");
136 
137  if (force.Order() != oprod.Order()) errorQuda("Force and Oprod orders must match");
138 
139  if (force.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("Force field does not support reconstruction");
140 
141  if (force.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
142  typedef gauge::FloatNOrder<Float, 18, 2, 18> F;
143  typedef gauge::FloatNOrder<Float, 18, 2, 18> O;
144 
145  if (gauge.isNative()) {
146  if (gauge.Reconstruct() == QUDA_RECONSTRUCT_NO) {
147  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
148  typedef CloverDerivArg<Float,F,G,O> Arg;
149  Arg arg(F(force), G(gauge), O(oprod), force.X(), oprod.X(), coeff, parity);
150  CloverDerivative<Float, Arg> deriv(arg, gauge);
151  deriv.apply(0);
152 #if 0
153  } else if (gauge.Reconstruct() == QUDA_RECONSTRUCT_12) {
154  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
155  typedef CloverDerivArg<Float,F,G,O> Arg;
156  Arg arg(F(force), G(gauge), O(oprod), force.X(), oprod.X(), coeff, parity);
157  CloverDerivative<Float, Arg> deriv(arg, gauge);
158  deriv.apply(0);
159 #endif
160  } else {
161  errorQuda("Reconstruction type %d not supported",gauge.Reconstruct());
162  }
163  } else {
164  errorQuda("Gauge order %d not supported", gauge.Order());
165  }
166  } else {
167  errorQuda("Force order %d not supported", force.Order());
168  } // force / oprod order
169 
171  }
172 #endif // GPU_CLOVER
173 
175  cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff, QudaParity parity)
176  {
177 #ifdef GPU_CLOVER_DIRAC
178  assert(oprod.Geometry() == QUDA_TENSOR_GEOMETRY);
179  assert(force.Geometry() == QUDA_VECTOR_GEOMETRY);
180 
181  for (int d=0; d<4; d++) {
182  if (oprod.X()[d] != gauge.X()[d])
183  errorQuda("Incompatible extended dimensions d=%d gauge=%d oprod=%d", d, gauge.X()[d], oprod.X()[d]);
184  }
185 
186  int device_parity = (parity == QUDA_EVEN_PARITY) ? 0 : 1;
187 
188  if(force.Precision() == QUDA_DOUBLE_PRECISION){
189  cloverDerivative<double>(force, gauge, oprod, coeff, device_parity);
190 #if 0
191  } else if (force.Precision() == QUDA_SINGLE_PRECISION){
192  cloverDerivative<float>(force, gauge, oprod, coeff, device_parity);
193 #endif
194  } else {
195  errorQuda("Precision %d not supported", force.Precision());
196  }
197 
198  return;
199 #else
200  errorQuda("Clover has not been built");
201 #endif
202  }
203 
204 } // namespace quda
void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff, QudaParity parity)
Compute the derivative of the clover matrix in the direction mu,nu and compute the resulting force gi...
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define errorQuda(...)
Definition: util_quda.h:121
Helper file when using jitify run-time compilation. This file should be included in source code...
cudaStream_t * stream
void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:466
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:258
bool advanceBlockDim(TuneParam &param) const
Definition: tune_quda.h:440
QudaGaugeParam param
Definition: pack_test.cpp:17
#define qudaDeviceSynchronize()
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
enum QudaParity_s QudaParity
unsigned long long flops
Definition: blas_quda.cu:22
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:54
unsigned long long bytes
Definition: blas_quda.cu:23
const int * X() const