10 #ifdef GPU_WILSON_DIRAC 17 #endif // GPU_WILSON_DIRAC 28 namespace domainwall4d {
30 #undef GPU_STAGGERED_DIRAC 39 #ifdef GPU_DOMAIN_WALL_DIRAC 45 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD 46 #define DSLASH_SHARED_FLOATS_PER_THREAD 0 55 using namespace domainwall4d;
57 #ifdef GPU_DOMAIN_WALL_DIRAC 58 template <
typename sFloat,
typename gFloat>
59 class DomainWallDslash4DPCCuda :
public DslashCuda {
64 bool checkGrid(TuneParam &
param)
const {
66 warningQuda(
"Autotuner is skipping blockDim=(%u,%u,%u), gridDim=(%u,%u,%u) because lattice volume is too large",
75 bool advanceBlockDim(TuneParam &
param)
const 77 const unsigned int max_shared = 16384;
78 const int step[2] = {
deviceProp.warpSize, 1 };
79 bool advance[2] = {
false,
false };
82 param.block.x += step[0];
84 sharedBytesPerThread()*
param.block.x*
param.block.y > max_shared) {
86 param.block.x = step[0];
92 param.block.y += step[1];
94 if (
param.block.y > (
unsigned)
in->X(4) ||
95 sharedBytesPerThread()*
param.block.x*
param.block.y > max_shared) {
97 param.block.y = step[1];
103 if (advance[0] || advance[1]) {
104 param.grid = dim3( (dslashParam.threads+
param.block.x-1) /
param.block.x,
108 if (!checkGrid(
param)) advance = advanceBlockDim(
param);
// Per-thread byte count that advanceBlockDim() multiplies by block.x*block.y
// and compares against its max_shared limit. This class always returns 0, so
// that product is always 0.
115 unsigned int sharedBytesPerThread()
const {
return 0; }
118 DomainWallDslash4DPCCuda(cudaColorSpinorField *
out,
const GaugeField &gauge,
const cudaColorSpinorField *
in,
119 const cudaColorSpinorField *
x,
const double mferm,
120 const double a,
const double b,
const int parity,
const int dagger,
const int *commOverride,
const int DS_type)
127 dslashParam.mferm =
mferm;
128 dslashParam.mferm_f =
mferm;
// Destructor: calls unbindSpinorTex<sFloat>() on the in, out and x fields.
// NOTE(review): presumably this undoes a texture bind performed in the
// constructor, whose body is not visible here -- confirm.
130 virtual ~DomainWallDslash4DPCCuda() { unbindSpinorTex<sFloat>(
in,
out,
x); }
132 TuneKey tuneKey()
const 137 strcat(key.aux,
",Dslash4");
140 strcat(key.aux,
",Dslash5");
143 strcat(key.aux,
",Dslash5inv");
149 virtual void initTuneParam(TuneParam &
param)
const 152 param.grid = dim3( (dslashParam.threads+
param.block.x-1) /
param.block.x,
155 if (!checkGrid(
param)) ok = advanceBlockDim(
param);
156 if (!ok)
errorQuda(
"Lattice volume is too large for even the largest blockDim");
160 virtual void defaultTuneParam(TuneParam &
param)
const 163 param.grid = dim3( (dslashParam.threads+
param.block.x-1) /
param.block.x,
166 if (!checkGrid(
param)) ok = advanceBlockDim(
param);
167 if (!ok)
errorQuda(
"Lattice volume is too large for even the largest blockDim");
170 void apply(
const cudaStream_t &
stream)
172 #ifndef USE_TEXTURE_OBJECTS 174 #endif // USE_TEXTURE_OBJECTS 180 DSLASH(domainWallDslash4, tp.grid, tp.block, tp.shared_bytes,
stream, dslashParam);
183 DSLASH(domainWallDslash5, tp.grid, tp.block, tp.shared_bytes,
stream, dslashParam);
186 DSLASH(domainWallDslash5inv, tp.grid, tp.block, tp.shared_bytes,
stream, dslashParam);
193 long long flops()
const {
194 long long Ls =
in->X(4);
195 long long vol4d =
in->VolumeCB() /
Ls;
196 long long bulk = (
Ls-2)*vol4d;
197 long long wall = 2*vol4d;
204 flops = (
x ? 48ll : 0 ) *
in->VolumeCB() + 96ll*bulk + 120ll*wall;
215 long long bytes()
const {
216 bool isHalf =
in->Precision() ==
sizeof(short) ?
true :
false;
217 int spinor_bytes = 2 *
in->Ncolor() *
in->Nspin() *
in->Precision() + (isHalf ?
sizeof(
float) : 0);
218 long long Ls =
in->X(4);
226 bytes = (
x ? 5ll : 4ll ) * spinor_bytes *
in->VolumeCB();
229 bytes = (
x ?
Ls + 2 :
Ls + 1) * spinor_bytes *
in->VolumeCB();
237 #endif // GPU_DOMAIN_WALL_DIRAC 255 const int *commOverride,
const int DS_type,
TimeProfile &profile)
257 #ifdef GPU_DOMAIN_WALL_DIRAC 262 dslash =
new DomainWallDslash4DPCCuda<double2,double2>(
out, gauge,
in,
x, m_f,
a,
b,
parity,
dagger, commOverride, DS_type);
264 dslash =
new DomainWallDslash4DPCCuda<float4,float4>(
out, gauge,
in,
x, m_f,
a,
b,
parity,
dagger, commOverride, DS_type);
266 dslash =
new DomainWallDslash4DPCCuda<short4,short4>(
out, gauge,
in,
x, m_f,
a,
b,
parity,
dagger, commOverride, DS_type);
272 for (
int i=0;
i<4;
i++) ghostFace[
i] =
in->GhostFace()[
i] /
in->X(4);
274 DslashPolicyImp* dslashImp =
nullptr;
276 dslashImp = DslashFactory::create(QudaDslashPolicy::QUDA_DSLASH_NC);
280 DslashPolicyTune dslash_policy(*
dslash, const_cast<cudaColorSpinorField*>(
in),
in->Volume()/
in->X(4), ghostFace, profile);
281 dslash_policy.apply(0);
286 errorQuda(
"4D preconditioned Domain wall dslash has not been built");
virtual long long bytes() const
cudaDeviceProp deviceProp
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
char * strcat(char *__s1, const char *__s2)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param)
cpuColorSpinorField * out
void domainWallDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &m_f, const double &k, const int *commDim, TimeProfile &profile)
virtual TuneKey tuneKey() const
virtual void initTuneParam(TuneParam &param) const
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
virtual long long flops() const
virtual void defaultTuneParam(TuneParam &param) const