10 #ifdef GPU_WILSON_DIRAC 17 #endif // GPU_WILSON_DIRAC 30 #undef GPU_STAGGERED_DIRAC 39 #ifdef GPU_DOMAIN_WALL_DIRAC 46 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD 47 #define DSLASH_SHARED_FLOATS_PER_THREAD 0 56 using namespace mobius;
58 #ifdef GPU_DOMAIN_WALL_DIRAC 60 template <
typename sFloat,
typename gFloat>
66 bool checkGrid(TuneParam &
param)
const {
68 warningQuda(
"Autotuner is skipping blockDim=(%u,%u,%u), gridDim=(%u,%u,%u) because lattice volume is too large",
78 bool advanceBlockDim(TuneParam &
param)
const 80 const unsigned int max_shared = 16384;
81 const int step[2] = {
deviceProp.warpSize, 1 };
82 bool advance[2] = {
false,
false };
85 param.block.x += step[0];
87 sharedBytesPerThread()*
param.block.x*
param.block.y > max_shared) {
89 param.block.x = step[0];
95 param.block.y += step[1];
97 if (
param.block.y > (
unsigned)
in->X(4) ||
98 sharedBytesPerThread()*
param.block.x*
param.block.y > max_shared) {
100 param.block.y = step[1];
106 if (advance[0] || advance[1]) {
107 param.grid = dim3( (dslashParam.threads+
param.block.x-1) /
param.block.x,
111 if (!checkGrid(
param)) advance = advanceBlockDim(
param);
// Per-thread shared-memory byte count used by advanceBlockDim() when it
// checks sharedBytesPerThread()*block.x*block.y against max_shared.
// This class always reports 0.
118 unsigned int sharedBytesPerThread()
const {
return 0; }
121 MDWFDslashPCCuda(cudaColorSpinorField *
out,
const GaugeField &gauge,
const cudaColorSpinorField *
in,
122 const cudaColorSpinorField *
x,
const double mferm,
const double a,
123 const double *b_5,
const double *c_5,
const double m5,
124 const int parity,
const int dagger,
const int *commOverride,
const int DS_type)
129 dslashParam.mferm =
mferm;
130 dslashParam.mferm_f =
mferm;
134 for (
int s=0;
s<
out->X(4);
s++) {
135 dslashParam.mdwf_b5_f[
s] = (
float)dslashParam.mdwf_b5_d[
s];
136 dslashParam.mdwf_c5_f[
s] = (
float)dslashParam.mdwf_c5_d[
s];
139 dslashParam.m5_d =
m5;
// Destructor: calls unbindSpinorTex<sFloat> on the in, out and x fields.
// NOTE(review): presumably this releases bindings made in the constructor,
// which is not fully visible here -- confirm.
142 virtual ~MDWFDslashPCCuda() { unbindSpinorTex<sFloat>(
in,
out,
x); }
144 TuneKey tuneKey()
const 149 strcat(key.aux,
",Dslash4");
152 strcat(key.aux,
",Dslash4pre");
155 strcat(key.aux,
",Dslash5");
158 strcat(key.aux,
",Dslash5inv");
164 virtual void initTuneParam(TuneParam &
param)
const 167 param.grid = dim3( (dslashParam.threads+
param.block.x-1) /
param.block.x,
170 if (!checkGrid(
param)) ok = advanceBlockDim(
param);
171 if (!ok)
errorQuda(
"Lattice volume is too large for even the largest blockDim");
175 virtual void defaultTuneParam(TuneParam &
param)
const 178 param.grid = dim3( (dslashParam.threads+
param.block.x-1) /
param.block.x,
181 if (!checkGrid(
param)) ok = advanceBlockDim(
param);
182 if (!ok)
errorQuda(
"Lattice volume is too large for even the largest blockDim");
185 void apply(
const cudaStream_t &
stream)
187 #ifndef USE_TEXTURE_OBJECTS 189 #endif // USE_TEXTURE_OBJECTS 194 DSLASH(MDWFDslash4, tp.grid, tp.block, tp.shared_bytes,
stream, dslashParam);
197 DSLASH(MDWFDslash4pre, tp.grid, tp.block, tp.shared_bytes,
stream, dslashParam);
200 DSLASH(MDWFDslash5, tp.grid, tp.block, tp.shared_bytes,
stream, dslashParam);
203 DSLASH(MDWFDslash5inv, tp.grid, tp.block, tp.shared_bytes,
stream, dslashParam);
210 long long flops()
const {
211 long long Ls =
in->X(4);
212 long long vol4d =
in->VolumeCB() /
Ls;
213 long long bulk = (
Ls-2)*vol4d;
214 long long wall = 2*vol4d;
221 flops = 72ll*
in->VolumeCB() + 96ll*bulk + 120ll*wall;
224 flops = (
x ? 96ll : 48ll)*
in->VolumeCB() + 96ll*bulk + 120ll*wall;
235 long long bytes()
const {
236 bool isHalf =
in->Precision() ==
sizeof(short) ?
true :
false;
237 int spinor_bytes = 2 *
in->Ncolor() *
in->Nspin() *
in->Precision() + (isHalf ?
sizeof(
float) : 0);
238 long long Ls =
in->X(4);
247 bytes = (
x ? 5ll : 4ll) * spinor_bytes *
in->VolumeCB();
250 bytes = (
x ?
Ls + 2 :
Ls + 1) * spinor_bytes *
in->VolumeCB();
258 #endif // GPU_DOMAIN_WALL_DIRAC 276 const double *b_5,
const double *c_5,
const double &
m5,
277 const int *commOverride,
const int DS_type,
TimeProfile &profile)
279 #ifdef GPU_DOMAIN_WALL_DIRAC 284 dslash =
new MDWFDslashPCCuda<double2,double2>(
out, gauge,
in,
x, m_f, k2, b_5, c_5,
m5,
parity,
dagger, commOverride, DS_type);
286 dslash =
new MDWFDslashPCCuda<float4,float4>(
out, gauge,
in,
x, m_f, k2, b_5, c_5,
m5,
parity,
dagger, commOverride, DS_type);
288 dslash =
new MDWFDslashPCCuda<short4,short4>(
out, gauge,
in,
x, m_f, k2, b_5, c_5,
m5,
parity,
dagger, commOverride, DS_type);
294 for (
int i=0;
i<4;
i++) ghostFace[
i] =
in->GhostFace()[
i] /
in->X(4);
296 DslashPolicyImp* dslashImp =
nullptr;
298 dslashImp = DslashFactory::create(QudaDslashPolicy::QUDA_DSLASH_NC);
302 DslashPolicyTune dslash_policy(*
dslash, const_cast<cudaColorSpinorField*>(
in),
in->Volume()/
in->X(4), ghostFace, profile);
303 dslash_policy.apply(0);
308 errorQuda(
"Domain wall dslash has not been built");
virtual long long bytes() const
cudaDeviceProp deviceProp
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
void MDWFDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &m_f, const double &k, const double *b5, const double *c_5, const double &m5, const int *commDim, const int DS_type, TimeProfile &profile)
char * strcat(char *__s1, const char *__s2)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void * memcpy(void *__dst, const void *__src, size_t __n)
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param)
cpuColorSpinorField * out
virtual TuneKey tuneKey() const
virtual void initTuneParam(TuneParam ¶m) const
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
virtual long long flops() const
virtual void defaultTuneParam(TuneParam ¶m) const