43 #ifdef GPU_CLOVER_DIRAC 45 template<
typename Float,
typename Arg>
46 class CloverDerivative :
public TunableVectorY {
50 const GaugeField &meta;
52 #if defined(SHARED_ACCUMULATOR) && defined(SHARED_ARRAY) 53 unsigned int sharedBytesPerThread()
const {
return 18*
sizeof(Float) + 8; }
54 #elif defined(SHARED_ACCUMULATOR) 55 unsigned int sharedBytesPerThread()
const {
return 18*
sizeof(Float); }
57 unsigned int sharedBytesPerThread()
const {
return 0; }
59 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
61 unsigned int minThreads()
const {
return arg.volumeCB; }
62 bool tuneGridDim()
const {
return false; }
65 CloverDerivative(
const Arg &arg,
const GaugeField &meta) : TunableVectorY(2), arg(arg), meta(meta) {
66 writeAuxString(
"threads=%d,prec=%lu,fstride=%d,gstride=%d,ostride=%d",
67 arg.volumeCB,
sizeof(Float),arg.force.stride,
68 arg.gauge.stride,arg.oprod.stride);
70 create_jitify_program(
"kernels/clover_deriv.cuh");
73 virtual ~CloverDerivative() {}
75 void apply(
const cudaStream_t &
stream){
78 using namespace jitify::reflection;
79 jitify_error = program->kernel(
"quda::cloverDerivativeKernel")
80 .instantiate(Type<Float>(), Type<Arg>())
81 .configure(tp.grid, tp.block, tp.shared_bytes, stream)
84 cloverDerivativeKernel<Float><<<tp.grid,tp.block,tp.shared_bytes>>>(
arg);
88 bool advanceBlockDim(TuneParam &
param)
const {
89 dim3 block = param.block;
90 dim3 grid = param.grid;
92 param.block.z = block.z;
93 param.grid.z = grid.z;
96 if (param.block.z < 4) {
98 param.grid.z = (4 + param.block.z - 1) / param.block.z;
109 void initTuneParam(TuneParam ¶m)
const {
117 void defaultTuneParam(TuneParam ¶m)
const { initTuneParam(param); }
120 void preTune() { arg.force.save(); }
121 void postTune() { arg.force.load(); }
123 long long flops()
const {
return 16 * 198 * 3 * 4 * 2 * (
long long)arg.volumeCB; }
124 long long bytes()
const {
return ((8*arg.gauge.Bytes() + 4*arg.oprod.Bytes())*3 + 2*arg.force.Bytes()) * 4 * 2 * arg.volumeCB; }
126 TuneKey tuneKey()
const {
return TuneKey(meta.VolString(),
typeid(*this).name(), aux); }
129 template<
typename Float>
131 cudaGaugeField &gauge,
132 cudaGaugeField &oprod,
133 double coeff,
int parity) {
137 if (force.Order() != oprod.Order())
errorQuda(
"Force and Oprod orders must match");
142 typedef gauge::FloatNOrder<Float, 18, 2, 18> F;
143 typedef gauge::FloatNOrder<Float, 18, 2, 18> O;
145 if (gauge.isNative()) {
147 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
148 typedef CloverDerivArg<Float,F,G,O> Arg;
149 Arg
arg(F(force), G(gauge), O(oprod), force.X(), oprod.X(), coeff,
parity);
150 CloverDerivative<Float, Arg> deriv(arg, gauge);
154 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
155 typedef CloverDerivArg<Float,F,G,O> Arg;
156 Arg
arg(F(force), G(gauge), O(oprod), force.X(), oprod.X(), coeff,
parity);
157 CloverDerivative<Float, Arg> deriv(arg, gauge);
161 errorQuda(
"Reconstruction type %d not supported",gauge.Reconstruct());
164 errorQuda(
"Gauge order %d not supported", gauge.Order());
167 errorQuda(
"Force order %d not supported", force.Order());
177 #ifdef GPU_CLOVER_DIRAC 181 for (
int d=0; d<4; d++) {
182 if (oprod.
X()[d] != gauge.
X()[d])
183 errorQuda(
"Incompatible extended dimensions d=%d gauge=%d oprod=%d", d, gauge.
X()[d], oprod.
X()[d]);
189 cloverDerivative<double>(force, gauge, oprod, coeff, device_parity);
192 cloverDerivative<float>(force, gauge, oprod, coeff, device_parity);
void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff, QudaParity parity)
Compute the derivative of the clover matrix in the direction mu,nu and compute the resulting force gi...
QudaVerbosity getVerbosity()
Helper file when using jitify run-time compilation. This file should be included in source code...
void initTuneParam(TuneParam ¶m) const
QudaFieldGeometry Geometry() const
bool advanceBlockDim(TuneParam ¶m) const
#define qudaDeviceSynchronize()
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
enum QudaParity_s QudaParity
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const