10 #ifdef GPU_WILSON_DIRAC 17 #endif // GPU_WILSON_DIRAC 28 namespace domainwall {
30 #undef GPU_STAGGERED_DIRAC 39 #ifdef GPU_DOMAIN_WALL_DIRAC 43 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD 44 #define DSLASH_SHARED_FLOATS_PER_THREAD 0 53 using namespace domainwall;
55 #ifdef GPU_DOMAIN_WALL_DIRAC 56 template <
typename sFloat,
typename gFloat>
57 class DomainWallDslashCuda :
public DslashCuda {
60 bool checkGrid(TuneParam &
param)
const {
62 warningQuda(
"Autotuner is skipping blockDim=(%u,%u,%u), gridDim=(%u,%u,%u) because lattice volume is too large",
72 bool advanceBlockDim(TuneParam &
param)
const 74 const unsigned int max_shared = 16384;
75 const int step[2] = {
deviceProp.warpSize, 1 };
76 bool advance[2] = {
false,
false };
79 param.block.x += step[0];
81 sharedBytesPerThread()*
param.block.x*
param.block.y > max_shared) {
83 param.block.x = step[0];
89 param.block.y += step[1];
91 if (
param.block.y > (
unsigned)
in->X(4) ||
92 sharedBytesPerThread()*
param.block.x*
param.block.y > max_shared) {
94 param.block.y = step[1];
100 if (advance[0] || advance[1]) {
101 param.grid = dim3( (dslashParam.threads+
param.block.x-1) /
param.block.x,
105 if (!checkGrid(
param)) advance = advanceBlockDim(
param);
112 unsigned int sharedBytesPerThread()
const {
return 0; }
115 DomainWallDslashCuda(cudaColorSpinorField *
out,
const GaugeField &gauge,
const cudaColorSpinorField *
in,
116 const cudaColorSpinorField *
x,
const double mferm,
const double a,
117 const int parity,
const int dagger,
const int* commOverride)
122 dslashParam.a_inv = 1.0/
a;
123 dslashParam.a_inv_f = 1.0/
a;
124 dslashParam.mferm =
mferm;
125 dslashParam.mferm_f =
mferm;
127 virtual ~DomainWallDslashCuda() { unbindSpinorTex<sFloat>(
in,
out,
x); }
129 virtual void initTuneParam(TuneParam &
param)
const 132 param.grid = dim3( (dslashParam.threads+
param.block.x-1) /
param.block.x,
135 if (!checkGrid(
param)) ok = advanceBlockDim(
param);
136 if (!ok)
errorQuda(
"Lattice volume is too large for even the largest blockDim");
140 virtual void defaultTuneParam(TuneParam &
param)
const 143 param.grid = dim3( (dslashParam.threads+
param.block.x-1) /
param.block.x,
146 if (!checkGrid(
param)) ok = advanceBlockDim(
param);
147 if (!ok)
errorQuda(
"Lattice volume is too large for even the largest blockDim");
150 void apply(
const cudaStream_t &
stream)
152 #ifndef USE_TEXTURE_OBJECTS 154 #endif // USE_TEXTURE_OBJECTS 157 DSLASH(domainWallDslash, tp.grid, tp.block, tp.shared_bytes,
stream, dslashParam);
160 long long flops()
const {
162 switch(dslashParam.kernel_type) {
172 long long bulk = (
Ls-2)*(
in->VolumeCB()/
Ls);
173 long long wall = 2*(
in->VolumeCB()/
Ls);
174 flops += 96ll*bulk + 120ll*wall;
180 virtual long long bytes()
const {
181 bool isHalf =
in->Precision() ==
sizeof(short) ?
true :
false;
182 int spinor_bytes = 2 *
in->Ncolor() *
in->Nspin() *
in->Precision() + (isHalf ?
sizeof(
float) : 0);
184 switch(dslashParam.kernel_type) {
193 bytes += 2 * spinor_bytes *
in->VolumeCB();
199 #endif // GPU_DOMAIN_WALL_DIRAC 208 #ifdef GPU_DOMAIN_WALL_DIRAC 223 for (
int i=0;
i<4;
i++) ghostFace[
i] =
in->GhostFace()[
i] /
in->X(4);
225 DslashPolicyTune dslash_policy(*
dslash, const_cast<cudaColorSpinorField*>(
in),
in->Volume()/
in->X(4), ghostFace, profile);
226 dslash_policy.apply(0);
230 errorQuda(
"Domain wall dslash has not been built");
virtual long long bytes() const
cudaDeviceProp deviceProp
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param)
cpuColorSpinorField * out
void domainWallDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &m_f, const double &k, const int *commDim, TimeProfile &profile)
virtual void initTuneParam(TuneParam &param) const
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
virtual long long flops() const
virtual void defaultTuneParam(TuneParam &param) const