26 template <
typename Float,
int nDim,
int nColor,
int nParity,
bool dagger,
bool xpay, KernelType kernel_type,
typename Arg>
30 static constexpr
const char *
kernel =
"quda::laplaceGPU";
32 template <
typename Dslash>
35 dslash.
launch(laplaceGPU<Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg>, tp, arg, stream);
39 template <
typename Float,
int nDim,
int nColor,
typename Arg>
class Laplace :
public Dslash<Float>
48 Dslash<Float>(arg, out, in,
"kernels/laplace.cuh"),
66 int num_mv_multiply = in.
Nspin() == 4 ? 2 : 1;
67 int ghost_flops = (num_mv_multiply * mv_flops + 2 * in.
Ncolor() * in.
Nspin());
69 int num_dir = (arg.dir == 4 ? 2 * 4 : 2 * 3);
76 switch (arg.kernel_type) {
81 flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * 2 * in.
GhostFace()[arg.kernel_type];
85 flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * ghost_sites;
90 long long sites = in.
Volume();
92 num_dir * num_mv_multiply * mv_flops +
95 if (arg.xpay) flops_ += xpay_flops * sites;
99 long long ghost_sites = 0;
100 for (
int d = 0; d < 4; d++)
101 if (arg.commDim[d]) ghost_sites += 2 * in.
GhostFace()[d];
102 flops_ -= ghost_flops * ghost_sites;
113 int gauge_bytes = arg.reconstruct * in.
Precision();
115 int spinor_bytes = 2 * in.
Ncolor() * in.
Nspin() * in.
Precision() + (isFixed ?
sizeof(float) : 0);
116 int proj_spinor_bytes = in.
Nspin() == 4 ? spinor_bytes / 2 : spinor_bytes;
117 int ghost_bytes = (proj_spinor_bytes + gauge_bytes) + 2 * spinor_bytes;
118 int num_dir = (arg.dir == 4 ? 2 * 4 : 2 * 3);
120 long long bytes_ = 0;
122 switch (arg.kernel_type) {
129 bytes_ = ghost_bytes * ghost_sites;
134 long long sites = in.
Volume();
135 bytes_ = (num_dir * gauge_bytes + ((num_dir - 2) * spinor_bytes + 2 * proj_spinor_bytes) + spinor_bytes) * sites;
136 if (arg.xpay) bytes_ += spinor_bytes;
140 long long ghost_sites = 0;
141 for (
int d = 0; d < 4; d++)
142 if (arg.commDim[d]) ghost_sites += 2 * in.
GhostFace()[d];
143 bytes_ -= ghost_bytes * ghost_sites;
156 strcat(aux,
",laplace=");
159 strcat(aux, laplace);
164 template <
typename Float,
int nColor, QudaReconstructType recon>
struct LaplaceApply {
171 constexpr
int nDim = 4;
172 LaplaceArg<Float, nColor, recon> arg(out, in, U, dir, a, x, parity, dagger, comm_override);
176 laplace, const_cast<cudaColorSpinorField *>(static_cast<const cudaColorSpinorField *>(&in)), in.
VolumeCB(),
192 if (in.
V() == out.
V())
errorQuda(
"Aliasing pointers");
void launch(T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream)
LaplaceApply(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, double a, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override, TimeProfile &profile)
void apply(const cudaStream_t &stream)
QudaVerbosity getVerbosity()
#define checkPrecision(...)
const ColorSpinorField & in
const char * VolString() const
void ApplyLaplace(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, double kappa, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override, TimeProfile &profile)
Driver for applying the Laplace stencil.
This is a helper class that is used to instantiate the correct templated kernel for the dslash...
virtual long long bytes() const
static void launch(Dslash &dslash, TuneParam &tp, Arg &arg, const cudaStream_t &stream)
Parameter structure for driving the covariant derivative operator.
const int * GhostFaceCB() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
#define checkLocation(...)
Main header file for host and device accessors to GaugeFields.
Laplace(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in)
cpuColorSpinorField * out
void u32toa(char *buffer, uint32_t value)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
const int * GhostFace() const
static constexpr const char * kernel
__device__ __host__ void laplace(Arg &arg, int idx, int parity)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
void apply(const cudaStream_t &stream)
QudaFieldOrder FieldOrder() const