#ifdef GPU_WILSON_DIRAC
#endif // GPU_WILSON_DIRAC
namespace domainwall {

#undef GPU_STAGGERED_DIRAC
#include <dslash_index.cuh>
#ifdef GPU_DOMAIN_WALL_DIRAC
#include <dw_dslash_def.h> // domain wall Dslash kernel definitions
#endif // GPU_DOMAIN_WALL_DIRAC
#ifndef DSLASH_SHARED_FLOATS_PER_THREAD
#define DSLASH_SHARED_FLOATS_PER_THREAD 0
#endif
#include <dslash_quda.cuh>

} // end namespace domainwall
#include <dslash_events.cuh>

using namespace domainwall;

#ifdef GPU_DOMAIN_WALL_DIRAC
template <typename sFloat, typename gFloat>
class DomainWallDslashCuda : public DslashCuda {

private:
  const gFloat *gauge0, *gauge1;
  const double mferm;
  const double a;
  bool checkGrid(TuneParam &param) const {
    if (param.grid.x > deviceProp.maxGridSize[0] || param.grid.y > deviceProp.maxGridSize[1]) {
      warningQuda("Autotuner is skipping blockDim=(%u,%u,%u), gridDim=(%u,%u,%u) because lattice volume is too large",
                  param.block.x, param.block.y, param.block.z,
                  param.grid.x, param.grid.y, param.grid.z);
      return false;
    } else {
      return true;
    }
  }
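
  // Block-size advancement used by the autotuner: block.x indexes the 4-d checkerboard
  // volume and is advanced in warp-size steps, while block.y indexes the fifth dimension
  // and is advanced one slice at a time up to Ls. A candidate configuration is rejected
  // if it exceeds the device thread limits or the 16 kB shared-memory budget assumed below.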
  bool advanceBlockDim(TuneParam &param) const
  {
    const unsigned int max_shared = 16384; // shared-memory budget per block (bytes)
    const int step[2] = { deviceProp.warpSize, 1 };
    bool advance[2] = { false, false };

    // first try to advance block.x (the 4-d volume dimension)
    param.block.x += step[0];
    if (param.block.x > deviceProp.maxThreadsDim[0] ||
        sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
      advance[0] = false;
      param.block.x = step[0]; // reset block.x
    } else {
      advance[0] = true; // successfully advanced block.x
    }

    // if block.x could not be advanced, try to advance block.y (the Ls dimension)
    if (!advance[0]) {
      param.block.y += step[1];

      if (param.block.y > in->X(4) ||
          sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
        advance[1] = false;
        param.block.y = step[1]; // reset block.y
      } else {
        advance[1] = true; // successfully advanced block.y
      }
    }

    if (advance[0] || advance[1]) {
      param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
                         (in->X(4)+param.block.y-1) / param.block.y, 1);

      bool advance = true;
      if (!checkGrid(param)) advance = advanceBlockDim(param);
      return advance;
    } else {
      return false;
    }
  }
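
  // This kernel requests no per-thread shared memory, so the shared-memory term in the
  // block-size bound above never restricts the search for this class.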
  unsigned int sharedBytesPerThread() const { return 0; }

public:
  DomainWallDslashCuda(cudaColorSpinorField *out, const gFloat *gauge0, const gFloat *gauge1,
                       const QudaReconstructType reconstruct, const cudaColorSpinorField *in,
                       const cudaColorSpinorField *x, const double mferm,
                       const double a, const int dagger)
    : DslashCuda(out, in, x, reconstruct, dagger), gauge0(gauge0),
      gauge1(gauge1), mferm(mferm), a(a)
  {
    bindSpinorTex<sFloat>(in, out, x);
  }
  virtual ~DomainWallDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }
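
  // The launch geometry is two-dimensional: grid.x covers the 4-d checkerboard volume and
  // grid.y covers the fifth dimension, so both tune-parameter initializers recompute the
  // grid from the chosen block and validate it with checkGrid().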
  virtual void initTuneParam(TuneParam &param) const
  {
    Tunable::initTuneParam(param);
    param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
                       (in->X(4)+param.block.y-1) / param.block.y, 1);
    bool ok = true;
    if (!checkGrid(param)) ok = advanceBlockDim(param);
    if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
  }

  /** Sets default values for when tuning is disabled */
  virtual void defaultTuneParam(TuneParam &param) const
  {
    Tunable::defaultTuneParam(param);
    param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
                       (in->X(4)+param.block.y-1) / param.block.y, 1);
    bool ok = true;
    if (!checkGrid(param)) ok = advanceBlockDim(param);
    if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
  }

  void apply(const cudaStream_t &stream)
  {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    DSLASH(domainWallDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
           (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1,
           (sFloat*)in->V(), (float*)in->Norm(), mferm,
           (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
  }
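
  // Flop counting: 1320 flops per site for the 4-d hopping term (1368 when the xpay
  // accumulation into x is included), plus the fifth-dimension hopping term, which costs
  // 96 flops per interior (bulk) site and 120 flops per site on the two boundary (wall)
  // slices where the fermion mass enters.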
  long long flops() const {
    long long Ls = in->X(4);
    long long vol4d = in->VolumeCB() / Ls;
    long long bulk = (Ls-2)*vol4d;
    long long wall = 2*vol4d;
    return (x ? 1368ll : 1320ll)*in->VolumeCB() + 96ll*bulk + 120ll*wall;
  }
};

#endif // GPU_DOMAIN_WALL_DIRAC

#include <dslash_policy.cuh>

void domainWallDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge,
                          const cudaColorSpinorField *in, const int parity, const int dagger,
                          const cudaColorSpinorField *x, const double &m_f, const double &k,
                          const int *commDim, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy)
{
  dslashParam.parity = parity;

#ifdef GPU_DOMAIN_WALL_DIRAC
  // set up the ghost-zone parameters in the four space-time directions
  const int dirs = 4;
  for (int i = 0; i < dirs; i++) {
    dslashParam.ghostDim[i] = commDimPartitioned(i); // regular or ghost indexing at the boundary
    dslashParam.ghostOffset[i] = in->GhostOffset(i);
    dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i);
    dslashParam.commDim[i] = (!commDim[i]) ? 0 : commDimPartitioned(i); // switch off comms if overridden
  }

  void *gauge0, *gauge1;
  bindGaugeTex(gauge, parity, &gauge0, &gauge1);

  if (in->Precision() != gauge.Precision())
    errorQuda("Mixing gauge and spinor precision not supported");

  DslashCuda *dslash = 0;
  size_t regSize = sizeof(float);

  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
#if (__COMPUTE_CAPABILITY__ >= 130)
    dslash = new DomainWallDslashCuda<double2,double2>(out, (double2*)gauge0, (double2*)gauge1,
                                                       gauge.Reconstruct(), in, x, m_f, k, dagger);
    regSize = sizeof(double);
#else
    errorQuda("Double precision not supported on this GPU");
#endif
  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
    dslash = new DomainWallDslashCuda<float4,float4>(out, (float4*)gauge0, (float4*)gauge1,
                                                     gauge.Reconstruct(), in, x, m_f, k, dagger);
  } else if (in->Precision() == QUDA_HALF_PRECISION) {
    dslash = new DomainWallDslashCuda<short4,short4>(out, (short4*)gauge0, (short4*)gauge1,
                                                     gauge.Reconstruct(), in, x, m_f, k, dagger);
  }

  // the volume and face sizes passed to the launch must be the 4-d values,
  // since the fifth dimension Ls is mapped onto the y-dimension of thread space
  int ghostFace[QUDA_MAX_DIM];
  for (int i=0; i<4; i++) ghostFace[i] = in->GhostFace()[i] / in->X(4);

#ifndef GPU_COMMS
  dslashCuda(*dslash, regSize, parity, dagger, in->Volume() / in->X(4), ghostFace, profile);
#else
  DslashPolicyImp* dslashImp = DslashFactory::create(dslashPolicy);
  (*dslashImp)(*dslash, const_cast<cudaColorSpinorField*>(in), regSize, parity, dagger,
               in->Volume() / in->X(4), ghostFace, profile);
  delete dslashImp;
#endif

  delete dslash;
  unbindGaugeTex(gauge);

  checkCudaError();
#else
  errorQuda("Domain wall dslash has not been built");
#endif // GPU_DOMAIN_WALL_DIRAC
}