#include <gauge_field.h>
#include <jitify_helper.cuh>
#include <kernels/clover_deriv.cuh>
/**
   @file clover_deriv_quda.cu

   @brief This kernel has been a bit of a pain to optimize since it is
   excessively register bound.  To reduce register pressure we use
   shared memory to help offload some of this pressure.  Annoyingly,
   the optimal approach for CUDA 8.0 is not the same as for CUDA 7.5,
   so the implementation is compiler-version dependent.  The CUDA 8.0
   optimal code runs 10x slower on 7.5, though the 7.5 code runs fine
   on 8.0.
   CUDA >= 8.0 strategy:
   - Use shared memory for the force accumulator matrix
   - Template mu / nu to prevent register spilling of indexing arrays
   - Force the computation routine to inline

   CUDA <= 7.5 strategy:
   - Use shared memory for the force accumulator matrix
   - Keep mu / nu dynamic and use shared memory to store indexing arrays
   - Do not inline the computation routine
   For the shared-memory dynamic indexing arrays we use chars: since
   the arrays are 4-d, a full 4-d coordinate can be stored in a single
   word, which means we do not have to worry about bank conflicts, and
   the shared array can be passed to the usual indexing routines
   (getCoordsExtended and linkIndexShift) with no code changes.  This
   strategy works as long as each local lattice coordinate is less
   than 256.
 */
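
/*
   As an illustrative sketch only (the names below are hypothetical and
   this is not the kernel's actual shared-memory layout), inside a
   kernel one might write

     extern __shared__ char s[];           // dynamic shared memory
     char *coord = &s[4 * threadIdx.x];    // 4 chars = one 32-bit word per thread
     coord[0] = x; coord[1] = y;           // store the local 4-d coordinate;
     coord[2] = z; coord[3] = t;           // each component must fit in a char

   so that consecutive threads touch consecutive 32-bit words, and the
   shared array can be indexed exactly like a per-thread stack array.
*/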
#ifdef GPU_CLOVER_DIRAC

  template <typename Float, typename Arg>
  class CloverDerivative : public TunableVectorY {

    Arg arg;
    const GaugeField &meta;
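
    // Per-thread shared-memory budget: with SHARED_ACCUMULATOR the per-thread
    // force accumulator is a 3x3 complex matrix (18 reals); with SHARED_ARRAY
    // a further 8 bytes hold the char-based 4-d indexing arrays described in
    // the file comment above.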
#if defined(SHARED_ACCUMULATOR) && defined(SHARED_ARRAY)
    unsigned int sharedBytesPerThread() const { return 18 * sizeof(Float) + 8; }
#elif defined(SHARED_ACCUMULATOR)
    unsigned int sharedBytesPerThread() const { return 18 * sizeof(Float); }
#else
    unsigned int sharedBytesPerThread() const { return 0; }
#endif

    unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

    unsigned int minThreads() const { return arg.volumeCB; }
    bool tuneGridDim() const { return false; } // don't tune the grid dimension

  public:
    CloverDerivative(const Arg &arg, const GaugeField &meta) : TunableVectorY(2), arg(arg), meta(meta) {
      writeAuxString("threads=%d,prec=%lu,fstride=%d,gstride=%d,ostride=%d",
                     arg.volumeCB, sizeof(Float), arg.force.stride, arg.gauge.stride, arg.oprod.stride);
#ifdef JITIFY
      create_jitify_program("kernels/clover_deriv.cuh");
#endif
    }
    virtual ~CloverDerivative() {}

    void apply(const qudaStream_t &stream) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
#ifdef JITIFY
      using namespace jitify::reflection;
      jitify_error = program->kernel("quda::cloverDerivativeKernel")
                         .instantiate(Type<Float>(), Type<Arg>())
                         .configure(tp.grid, tp.block, tp.shared_bytes, stream)
                         .launch(arg);
#else
      qudaLaunchKernel(cloverDerivativeKernel<Float, Arg>, tp, stream, arg);
#endif
    }

    bool advanceBlockDim(TuneParam &param) const {
      dim3 block = param.block;
      dim3 grid = param.grid;
      bool rtn = TunableVectorY::advanceBlockDim(param);
      param.block.z = block.z; // preserve the z dimensions, which we advance by hand
      param.grid.z = grid.z;

      if (param.block.z < 4) {
        param.block.z++;
        param.grid.z = (4 + param.block.z - 1) / param.block.z;

    void initTuneParam(TuneParam &param) const {
      TunableVectorY::initTuneParam(param);
      param.block.z = 1;
      param.grid.z = 4;
    }

    void defaultTuneParam(TuneParam &param) const { initTuneParam(param); }

    // The force field is updated so we must preserve its initial state
    void preTune() { arg.force.save(); }
    void postTune() { arg.force.load(); }
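
    // Performance model used for tuning output: 198 flops is the standard cost
    // of one 3x3 complex matrix multiplication; the remaining factors count the
    // number of such multiplies (and, for bytes(), field accesses) per
    // checkerboard site.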
    long long flops() const { return 16 * 198 * 3 * 4 * 2 * (long long)arg.volumeCB; }
    long long bytes() const { return ((8 * arg.gauge.Bytes() + 4 * arg.oprod.Bytes()) * 3 + 2 * arg.force.Bytes()) * 4 * 2 * arg.volumeCB; }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
  };

  template <typename Float>
  void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod,
                        double coeff, int parity)
  {
    if (oprod.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("Oprod field does not support reconstruction");

    if (force.Order() != oprod.Order()) errorQuda("Force and Oprod orders must match");

    if (force.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("Force field does not support reconstruction");
    if (force.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
      // force and oprod are stored as full 18-real (3x3 complex) matrices
      typedef gauge::FloatNOrder<Float, 18, 2, 18> F;
      typedef gauge::FloatNOrder<Float, 18, 2, 18> O;

      if (gauge.isNative()) {
        if (gauge.Reconstruct() == QUDA_RECONSTRUCT_NO) {
          typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO>::type G;
          typedef CloverDerivArg<Float, F, G, O> Arg;
          Arg arg(F(force), G(gauge), O(oprod), force.X(), oprod.X(), coeff, parity);
          CloverDerivative<Float, Arg> deriv(arg, gauge);
          deriv.apply(0);
        } else if (gauge.Reconstruct() == QUDA_RECONSTRUCT_12) {
          typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_12>::type G;
          typedef CloverDerivArg<Float, F, G, O> Arg;
          Arg arg(F(force), G(gauge), O(oprod), force.X(), oprod.X(), coeff, parity);
          CloverDerivative<Float, Arg> deriv(arg, gauge);
          deriv.apply(0);
        } else {
          errorQuda("Reconstruction type %d not supported", gauge.Reconstruct());
        }
      } else {
        errorQuda("Gauge order %d not supported", gauge.Order());
      }
    } else {
      errorQuda("Force order %d not supported", force.Order());
    } // force / oprod order

    qudaDeviceSynchronize();
  }

  void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff,
                        QudaParity parity)
  {
#ifdef GPU_CLOVER_DIRAC
    assert(oprod.Geometry() == QUDA_TENSOR_GEOMETRY);
    assert(force.Geometry() == QUDA_VECTOR_GEOMETRY);

    for (int d = 0; d < 4; d++) {
      if (oprod.X()[d] != gauge.X()[d])
        errorQuda("Incompatible extended dimensions d=%d gauge=%d oprod=%d", d, gauge.X()[d], oprod.X()[d]);
    }
    int device_parity = (parity == QUDA_EVEN_PARITY) ? 0 : 1;

    if (force.Precision() == QUDA_DOUBLE_PRECISION) {
      cloverDerivative<double>(force, gauge, oprod, coeff, device_parity);
    } else if (force.Precision() == QUDA_SINGLE_PRECISION) {
      cloverDerivative<float>(force, gauge, oprod, coeff, device_parity);
    } else {
      errorQuda("Precision %d not supported", force.Precision());
    }
#else
    errorQuda("Clover has not been built");
#endif
  }