28 template <
typename Float,
int nDim,
int nColor,
int nParity,
bool dagger,
bool xpay, KernelType kernel_type,
typename Arg>
32 static constexpr
const char *kernel =
"quda::covDevGPU";
34 template <
typename Dslash>
35 inline static void launch(Dslash &dslash, TuneParam &tp, Arg &
arg,
const cudaStream_t &
stream)
37 static_assert(
xpay ==
false,
"Covariant derivative operator only defined without xpay");
38 static_assert(nParity == 2,
"Covariant derivative operator only defined for full field");
39 dslash.launch(covDevGPU<Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg>, tp, arg, stream);
43 template <
typename Float,
int nDim,
int nColor,
typename Arg>
class CovDev :
public Dslash<Float>
48 const ColorSpinorField &
in;
51 CovDev(Arg &arg,
const ColorSpinorField &
out,
const ColorSpinorField &in) :
52 Dslash<Float>(arg, out, in,
"kernels/covDev.cuh"),
60 void apply(
const cudaStream_t &
stream)
64 if (arg.xpay)
errorQuda(
"Covariant derivative operator only defined without xpay");
65 if (arg.nParity != 2)
errorQuda(
"Covariant derivative operator only defined for full field");
67 constexpr
bool xpay =
false;
68 constexpr
int nParity = 2;
69 Dslash<Float>::template instantiate<CovDevLaunch, nDim, nColor, nParity, xpay>(tp,
arg,
stream);
72 long long flops()
const 74 int mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor();
75 int num_mv_multiply = in.Nspin();
76 int ghost_flops = num_mv_multiply * mv_flops;
80 switch (arg.kernel_type) {
85 if (arg.kernel_type != dim)
break;
86 flops_ = (ghost_flops)*in.GhostFace()[dim];
89 long long ghost_sites = in.GhostFace()[dim];
90 flops_ = ghost_flops * ghost_sites;
95 long long sites = in.Volume();
96 flops_ = num_mv_multiply * mv_flops * sites;
100 long long ghost_sites = arg.commDim[dim] ? in.GhostFace()[dim] : 0;
101 flops_ -= ghost_flops * ghost_sites;
110 long long bytes()
const 112 int gauge_bytes = arg.reconstruct * in.Precision();
113 bool isFixed = (in.Precision() ==
sizeof(short) || in.Precision() ==
sizeof(char)) ? true :
false;
114 int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed ?
sizeof(float) : 0);
115 int ghost_bytes = gauge_bytes + 3 * spinor_bytes;
116 int dim = arg.mu % 4;
117 long long bytes_ = 0;
119 switch (arg.kernel_type) {
124 if (arg.kernel_type != dim)
break;
125 bytes_ = ghost_bytes * in.GhostFace()[dim];
128 long long ghost_sites = in.GhostFace()[dim];
129 bytes_ = ghost_bytes * ghost_sites;
134 long long sites = in.Volume();
135 bytes_ = (gauge_bytes + 2 * spinor_bytes) * sites;
139 long long ghost_sites = arg.commDim[dim] ? in.GhostFace()[dim] : 0;
140 bytes_ -= ghost_bytes * ghost_sites;
148 TuneKey tuneKey()
const 157 return TuneKey(in.VolString(),
typeid(*this).name(), aux);
161 template <
typename Float,
int nColor, QudaReconstructType recon>
struct CovDevApply {
163 inline CovDevApply(ColorSpinorField &
out,
const ColorSpinorField &in,
const GaugeField &U,
int mu,
int parity,
164 bool dagger,
const int *comm_override, TimeProfile &profile)
167 constexpr
int nDim = 4;
168 CovDevArg<Float, nColor, recon>
arg(out, in, U, mu, parity, dagger, comm_override);
169 CovDev<Float, nDim, nColor, CovDevArg<Float, nColor, recon>>
covDev(arg, out, in);
171 dslash::DslashPolicyTune<decltype(covDev)> policy(
172 covDev, const_cast<cudaColorSpinorField *>(static_cast<const cudaColorSpinorField *>(&in)), in.VolumeCB(),
173 in.GhostFaceCB(), profile);
186 bool dagger,
const int *comm_override,
TimeProfile &profile)
189 if (in.
V() == out.
V())
errorQuda(
"Aliasing pointers");
205 errorQuda(
"Covariant derivative kernels have not been built");
QudaVerbosity getVerbosity()
#define checkPrecision(...)
void ApplyCovDev(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int mu, int parity, bool dagger, const int *comm_override, TimeProfile &profile)
Driver for applying the covariant derivative.
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
__device__ __host__ void covDev(Arg &arg, int idx, int parity)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
#define checkLocation(...)
Main header file for host and device accessors to GaugeFields.
cpuColorSpinorField * out
void u32toa(char *buffer, uint32_t value)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
void pushKernelPackT(bool pack)
char aux[8][TuneKey::aux_n]
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaFieldOrder FieldOrder() const