33 long long Ls = meta.
X(4);
34 long long bulk = (Ls - 2) * (meta.
Volume() /
Ls);
35 long long wall = 2 * meta.
Volume() /
Ls;
40 case DSLASH5_DWF: flops_ = n * (8ll * bulk + 10ll * wall + (arg.xpay ? 4ll * meta.
Volume() : 0));
break;
42 flops_ = n * (8ll * bulk + 10ll * wall + 14ll * meta.
Volume() + (arg.xpay ? 8ll * meta.
Volume() : 0));
45 flops_ = n * (8ll * bulk + 10ll * wall + 8ll * meta.
Volume() + (arg.xpay ? 8ll * meta.
Volume() : 0));
50 flops_ = (144 * Ls + (arg.xpay ? 4ll : 0)) * meta.
Volume();
54 flops_ = (144 * Ls + (arg.xpay ? 8ll : 0)) * meta.
Volume();
56 default:
errorQuda(
"Unknown Dslash5Type %d", arg.type);
64 long long Ls = meta.
X(4);
66 case DSLASH5_DWF:
return arg.out.Bytes() + 2 * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
67 case DSLASH5_MOBIUS_PRE:
return arg.out.Bytes() + 3 * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
68 case DSLASH5_MOBIUS:
return arg.out.Bytes() + 3 * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
69 case M5_INV_DWF:
return arg.out.Bytes() + Ls * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
70 case M5_INV_MOBIUS:
return arg.out.Bytes() + Ls * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
71 case M5_INV_ZMOBIUS:
return arg.out.Bytes() + Ls * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
72 default:
errorQuda(
"Unknown Dslash5Type %d", arg.type);
78 unsigned int minThreads()
const {
return arg.volume_4d_cb; }
85 int nSpin = var_inverse ? meta.
Nspin() / 2 : meta.
Nspin();
106 if (arg.dagger) strcat(
aux,
",Dagger");
107 if (arg.xpay) strcat(
aux,
",xpay");
115 default:
errorQuda(
"Unknown Dslash5Type %d", arg.type);
126 void *args[] = {&arg};
133 errorQuda(
"CPU variant not instantiated");
138 arg.dagger ?
launch(dslash5GPU<Float, nColor, true, true, DSLASH5_DWF, Arg>, tp, arg, stream) :
139 launch(dslash5GPU<Float, nColor, false, true, DSLASH5_DWF, Arg>, tp, arg, stream);
141 arg.dagger ?
launch(dslash5GPU<Float, nColor, true, false, DSLASH5_DWF, Arg>, tp, arg, stream) :
142 launch(dslash5GPU<Float, nColor, false, false, DSLASH5_DWF, Arg>, tp, arg, stream);
145 arg.dagger ?
launch(dslash5GPU<Float, nColor, true, true, DSLASH5_MOBIUS_PRE, Arg>, tp, arg, stream) :
146 launch(dslash5GPU<Float, nColor, false, true, DSLASH5_MOBIUS_PRE, Arg>, tp, arg, stream);
148 arg.dagger ?
launch(dslash5GPU<Float, nColor, true, false, DSLASH5_MOBIUS_PRE, Arg>, tp, arg, stream) :
149 launch(dslash5GPU<Float, nColor, false, false, DSLASH5_MOBIUS_PRE, Arg>, tp, arg, stream);
152 arg.dagger ?
launch(dslash5GPU<Float, nColor, true, true, DSLASH5_MOBIUS, Arg>, tp, arg, stream) :
153 launch(dslash5GPU<Float, nColor, false, true, DSLASH5_MOBIUS, Arg>, tp, arg, stream);
155 arg.dagger ?
launch(dslash5GPU<Float, nColor, true, false, DSLASH5_MOBIUS, Arg>, tp, arg, stream) :
156 launch(dslash5GPU<Float, nColor, false, false, DSLASH5_MOBIUS, Arg>, tp, arg, stream);
160 launch(dslash5invGPU<Float, nColor, true, true, M5_INV_DWF, shared, var_inverse, Arg>, tp, arg, stream) :
161 launch(dslash5invGPU<Float, nColor, false, true, M5_INV_DWF, shared, var_inverse, Arg>, tp, arg, stream);
164 launch(dslash5invGPU<Float, nColor, true, false, M5_INV_DWF, shared, var_inverse, Arg>, tp, arg, stream) :
165 launch(dslash5invGPU<Float, nColor, false, false, M5_INV_DWF, shared, var_inverse, Arg>, tp, arg, stream);
169 dslash5invGPU<Float, nColor, true, true, M5_INV_MOBIUS, shared, var_inverse, Arg>, tp, arg, stream) :
170 launch(dslash5invGPU<Float, nColor, false, true, M5_INV_MOBIUS, shared, var_inverse, Arg>, tp,
174 dslash5invGPU<Float, nColor, true, false, M5_INV_MOBIUS, shared, var_inverse, Arg>, tp, arg, stream) :
175 launch(dslash5invGPU<Float, nColor, false, false, M5_INV_MOBIUS, shared, var_inverse, Arg>, tp,
180 dslash5invGPU<Float, nColor, true, true, M5_INV_ZMOBIUS, shared, var_inverse, Arg>, tp, arg, stream) :
181 launch(dslash5invGPU<Float, nColor, false, true, M5_INV_ZMOBIUS, shared, var_inverse, Arg>, tp,
185 dslash5invGPU<Float, nColor, true, false, M5_INV_ZMOBIUS, shared, var_inverse, Arg>, tp, arg, stream) :
186 launch(dslash5invGPU<Float, nColor, false, false, M5_INV_ZMOBIUS, shared, var_inverse, Arg>,
196 param.
block.y = arg.Ls;
206 param.
block.y = arg.Ls;
215 template <
typename Float,
int nColor>
219 Dslash5Arg<Float, nColor> arg(out, in, x, m_f, m_5, b_5, c_5, a, dagger, type);
225 template <
typename Float>
230 case 3: ApplyDslash5<Float, 3>(
out,
in, x, m_f, m_5, b_5, c_5, a,
dagger, type);
break;
231 default:
errorQuda(
"Unsupported number of colors %d\n", in.Ncolor());
240 #ifdef GPU_DOMAIN_WALL_DIRAC 249 default:
errorQuda(
"Unsupported precision %d\n", in.Precision());
252 errorQuda(
"Domain wall dslash has not been built");
const char * AuxString() const
QudaVerbosity getVerbosity()
static constexpr bool var_inverse
#define checkPrecision(...)
static constexpr bool shared
void defaultTuneParam(TuneParam &param) const
const char * VolString() const
void ApplyDslash5(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f, double m_5, const Complex *b_5, const Complex *c_5, double a, bool dagger, Dslash5Type type)
Apply either the domain-wall / mobius Dslash5 operator or the M5 inverse operator. In the current implementation, it is expected that the color-spinor fields are 4-d preconditioned.
void apply(const cudaStream_t &stream)
Dslash5(Arg &arg, const ColorSpinorField &meta)
const ColorSpinorField & meta
unsigned int sharedBytesPerThread() const
void setMaxDynamicSharedBytesPerBlock(F *func) const
Enable the maximum dynamic shared bytes for the kernel "func" (values given by maxDynamicSharedBytesP...
unsigned int minThreads() const
QudaPCType PCType() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
#define checkLocation(...)
std::complex< double > Complex
void initTuneParam(TuneParam &param) const
void initTuneParam(TuneParam &param) const
QudaFieldLocation Location() const
cpuColorSpinorField * out
void launch(T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream)
unsigned int maxSharedBytesPerBlock() const
The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily t...
unsigned int maxDynamicSharedBytesPerBlock() const
This can't be correctly queried in CUDA for all architectures so here we set this. Based on Table 14 of the CUDA Programming Guide 10.0 (Technical Specifications per Compute Capability).
Parameter structure for applying the Dslash.
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
virtual unsigned int maxSharedBytesPerBlock() const
The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily t...
void defaultTuneParam(TuneParam &param) const