29 jitify::Program *program_;
39 comm[0] = (arg.
commDim[0] ?
'1' :
'0');
40 comm[1] = (arg.
commDim[1] ?
'1' :
'0');
41 comm[2] = (arg.
commDim[2] ?
'1' :
'0');
42 comm[3] = (arg.
commDim[3] ?
'1' :
'0');
44 strcpy(aux_base,
",commDim=");
45 strcat(aux_base, comm);
47 if (arg.
xpay) strcat(aux_base,
",xpay");
48 if (arg.
dagger) strcat(aux_base,
",dagger");
58 strcpy(aux[kernel_type], kernel_str);
75 static void *ghost[8] = {};
76 for (
int dim = 0; dim < 4; dim++) {
78 for (
int dir = 0; dir < 2; dir++) {
89 arg.in.resetGhost(in, ghost);
100 template <
typename T,
typename Arg>
106 void *args[] = {&arg};
117 template <
template <
typename,
int,
int,
int,
bool,
bool, KernelType,
typename>
class Launch,
int nDim,
int nColor,
125 switch (arg.kernel_type) {
127 Launch<Float, nDim, nColor, nParity, dagger, xpay, INTERIOR_KERNEL, Arg>::launch(*
this, tp, arg, stream);
131 Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_X, Arg>::launch(*
this, tp, arg, stream);
134 Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_Y, Arg>::launch(*
this, tp, arg, stream);
137 Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_Z, Arg>::launch(*
this, tp, arg, stream);
140 Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_T, Arg>::launch(*
this, tp, arg, stream);
143 Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_ALL, Arg>::launch(*
this, tp, arg, stream);
145 default:
errorQuda(
"Unexpected kernel type %d", arg.kernel_type);
147 default:
errorQuda(
"Unexpected kernel type %d for single-GPU build", arg.kernel_type);
160 template <
template <
typename,
int,
int,
int,
bool,
bool, KernelType,
typename>
class Launch,
int nDim,
int nColor,
161 int nParity,
bool xpay,
typename Arg>
165 using namespace jitify::reflection;
166 const auto kernel = Launch<void, 0, 0, 0, false, false, INTERIOR_KERNEL, Arg>::kernel;
168 = program_->kernel(kernel)
169 .instantiate(Type<Float>(), nDim, nColor, nParity, arg.dagger, xpay, arg.kernel_type, Type<Arg>())
174 instantiate<Launch, nDim, nColor, nParity, true, xpay>(tp,
arg,
stream);
176 instantiate<Launch, nDim, nColor, nParity, false, xpay>(tp,
arg,
stream);
187 template <
template <
typename,
int,
int,
int,
bool,
bool, KernelType,
typename>
class Launch,
int nDim,
int nColor,
192 using namespace jitify::reflection;
193 const auto kernel = Launch<void, 0, 0, 0, false, false, INTERIOR_KERNEL, Arg>::kernel;
195 = program_->kernel(kernel)
196 .instantiate(Type<Float>(), nDim, nColor, arg.
nParity, arg.dagger, xpay, arg.kernel_type, Type<Arg>())
201 case 1: instantiate<Launch, nDim, nColor, 1, xpay>(tp,
arg,
stream);
break;
202 case 2: instantiate<Launch, nDim, nColor, 2, xpay>(tp,
arg,
stream);
break;
203 default:
errorQuda(
"nParity = %d undefined\n", arg.nParity);
215 template <
template <
typename,
int,
int,
int,
bool,
bool, KernelType,
typename>
class Launch,
int nDim,
int nColor,
220 using namespace jitify::reflection;
221 const auto kernel = Launch<void, 0, 0, 0, false, false, INTERIOR_KERNEL, Arg>::kernel;
223 .instantiate(Type<Float>(), nDim, nColor, arg.
nParity, arg.dagger, arg.xpay,
224 arg.kernel_type, Type<Arg>())
229 instantiate<Launch, nDim, nColor, true>(tp,
arg,
stream);
231 instantiate<Launch, nDim, nColor, false>(tp,
arg,
stream);
246 errorQuda(
"CPU Fields not supported in Dslash framework yet");
266 create_jitify_program(src);
273 return 2 * arg.
nFace;
319 int num_mv_multiply = in.
Nspin() == 4 ? 2 : 1;
320 int ghost_flops = (num_mv_multiply * mv_flops + 2 * in.
Ncolor() * in.
Nspin());
324 long long flops_ = 0;
338 flops_ = (ghost_flops + (arg.
xpay ? xpay_flops : xpay_flops / 2)) * ghost_sites;
343 long long sites = in.
Volume();
345 num_dir * num_mv_multiply * mv_flops +
348 if (arg.
xpay) flops_ += xpay_flops * sites;
352 long long ghost_sites = 0;
353 for (
int d = 0; d < 4; d++)
355 flops_ -= ghost_flops * ghost_sites;
368 int spinor_bytes = 2 * in.
Ncolor() * in.
Nspin() * in.
Precision() + (isFixed ?
sizeof(float) : 0);
369 int proj_spinor_bytes = in.
Nspin() == 4 ? spinor_bytes / 2 : spinor_bytes;
370 int ghost_bytes = (proj_spinor_bytes + gauge_bytes) + 2 * spinor_bytes;
373 long long bytes_ = 0;
382 bytes_ = ghost_bytes * ghost_sites;
387 long long sites = in.
Volume();
388 bytes_ = (num_dir * gauge_bytes + ((num_dir - 2) * spinor_bytes + 2 * proj_spinor_bytes) + spinor_bytes) * sites;
389 if (arg.
xpay) bytes_ += spinor_bytes;
393 long long ghost_sites = 0;
394 for (
int d = 0; d < 4; d++)
396 bytes_ -= ghost_bytes * ghost_sites;
424 template <
template <
typename,
int, QudaReconstructType>
class Apply,
typename Recon,
typename Float,
int nColor,
429 #if QUDA_RECONSTRUCT & 4 430 Apply<Float, nColor, Recon::recon0>(
out,
in, U, args...);
432 errorQuda(
"QUDA_RECONSTRUCT=%d does not enable reconstruct-18", QUDA_RECONSTRUCT);
435 #if QUDA_RECONSTRUCT & 2 436 Apply<Float, nColor, Recon::recon1>(
out,
in, U, args...);
438 errorQuda(
"QUDA_RECONSTRUCT=%d does not enable reconstruct-12", QUDA_RECONSTRUCT);
441 #if QUDA_RECONSTRUCT & 1 442 Apply<Float, nColor, Recon::recon2>(
out,
in, U, args...);
444 errorQuda(
"QUDA_RECONSTRUCT=%d does not enable reconstruct-12", QUDA_RECONSTRUCT);
458 template <
template <
typename,
int, QudaReconstructType>
class Apply,
typename Recon,
typename Float,
typename... Args>
462 instantiate<Apply, Recon, Float, 3>(
out,
in, U, args...);
475 template <
template <
typename,
int, QudaReconstructType>
class Apply,
typename Recon =
WilsonReconstruct,
typename... Args>
479 #if QUDA_PRECISION & 8 480 instantiate<Apply, Recon, double>(
out,
in, U, args...);
482 errorQuda(
"QUDA_PRECISION=%d does not enable double precision", QUDA_PRECISION);
485 #if QUDA_PRECISION & 4 486 instantiate<Apply, Recon, float>(
out,
in, U, args...);
488 errorQuda(
"QUDA_PRECISION=%d does not enable single precision", QUDA_PRECISION);
491 #if QUDA_PRECISION & 2 492 instantiate<Apply, Recon, short>(
out,
in, U, args...);
494 errorQuda(
"QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
497 #if QUDA_PRECISION & 1 498 instantiate<Apply, Recon, char>(
out,
in, U, args...);
500 errorQuda(
"QUDA_PRECISION=%d does not enable quarter precision", QUDA_PRECISION);
virtual void postTune()
Restore the output field if doing exterior kernel.
void launch(T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream)
unsigned int minThreads() const
cudaDeviceProp deviceProp
Helper file when using jitify run-time compilation. This file should be included in source code...
QudaPrecision GhostPrecision() const
unsigned int maxSharedBytesPerBlock() const
The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily t...
void augmentAux(KernelType type, const char *extra)
const QudaReconstructType reconstruct
const char * comm_dim_partitioned_string(const int *comm_dim_override=0)
Return a string that defines the comm partitioning (used as a tuneKey)
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
Dslash(DslashArg< Float > &arg, const ColorSpinorField &out, const ColorSpinorField &in, const char *src)
void setAux(KernelType type, const char *aux_)
virtual long long bytes() const
void setMaxDynamicSharedBytesPerBlock(F *func) const
Enable the maximum dynamic shared bytes for the kernel "func" (values given by maxDynamicSharedBytesP...
void fillAux(KernelType kernel_type, const char *kernel_str)
Specialize the auxiliary strings for each kernel type.
const ColorSpinorField & in
void fillAuxBase()
Set the base strings used by the different dslash kernel types for autotuning.
#define checkLocation(...)
virtual void backup() const
Backs up the LatticeField.
DslashArg< Float > & dslashParam
const ColorSpinorField & out
int GhostOffset(const int i) const
bool comm_peer2peer_enabled(int dir, int dim)
QudaFieldLocation Location() const
enum QudaReconstructType_s QudaReconstructType
void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
This instantiate function is used to instantiate the KernelType template required for the multi-G...
void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
This instantiate function is used to instantiate the nParity template.
const int * GhostFace() const
QudaReconstructType Reconstruct() const
char aux_base[TuneKey::aux_n - 32]
void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
This instantiate function is used to instantiate the xpay template.
virtual const void * Ghost2() const
unsigned int maxDynamicSharedBytesPerBlock() const
This can't be correctly queried in CUDA for all architectures so here we set this. Based on Table 14 of the CUDA Programming Guide 10.0 (Technical Specifications per Compute Capability).
void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
This instantiate function is used to instantiate the dagger template.
char aux[8][TuneKey::aux_n]
virtual long long flops() const
QudaPrecision Precision() const
virtual void preTune()
Save the output field since the output field is both read from and written to in the exterior kernels...
const char * getAux(KernelType type) const
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
virtual void restore() const
Restores the LatticeField.
virtual int tuningIter() const
void setPackComms(const int *dim_pack)
Helper function that sets which dimensions the packing kernel should be packing for.