10 #ifdef GPU_WILSON_DIRAC
17 #endif // GPU_WILSON_DIRAC
29 namespace domainwall4d {
31 #undef GPU_STAGGERED_DIRAC
34 #include <dslash_index.cuh>
40 #ifdef GPU_DOMAIN_WALL_DIRAC
46 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
47 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
50 #include <dslash_quda.cuh>
54 #include <dslash_events.cuh>
56 using namespace domainwall4d;
58 #ifdef GPU_DOMAIN_WALL_DIRAC
// Autotuned launcher for the 4D-preconditioned domain-wall dslash kernels.
// sFloat is the vector type the spinor buffers are cast to and gFloat the type
// of the gauge buffers; the host wrapper in this file instantiates the class
// with <double2,double2>, <float4,float4> and <short4,short4>.
59 template <
typename sFloat,
typename gFloat>
60 class DomainWallDslash4DPCCuda :
public DslashCuda {
// Gauge-field pointers handed unchanged to every DSLASH launch in apply().
// NOTE(review): presumably the two buffers filled in by bindGaugeTex — confirm.
63 const gFloat *gauge0, *gauge1;
// Launch-geometry check consulted by the tuning routines of this class.
// The only path visible in this excerpt warns that the autotuner is skipping
// the candidate blockDim/gridDim because the lattice volume is too large.
// NOTE(review): the condition that triggers the warning and the return
// statements are on lines not shown here — confirm against the full source.
68 bool checkGrid(TuneParam &
param)
const {
70 warningQuda(
"Autotuner is skipping blockDim=(%u,%u,%u), gridDim=(%u,%u,%u) because lattice volume is too large",
71 param.block.x, param.block.y, param.block.z,
72 param.grid.x, param.grid.y, param.grid.z);
// Moves param.block to the next candidate launch shape for the autotuner.
// block.x grows by one warp per step and block.y by one per step. A dimension
// that exceeds its limit (deviceProp.maxThreadsDim[0] for x, in->X(4) for y,
// or the shared-memory budget for either) is wrapped back to its step size.
// When a dimension has advanced, the grid is recomputed by ceiling division
// and, if checkGrid() rejects it, the search continues recursively.
// NOTE(review): the statements that set advance[], the local declaration that
// receives the recursive result, and the return paths are on lines not shown
// in this excerpt.
80 bool advanceBlockDim(TuneParam &param)
const
// Shared-memory budget per block in bytes (16 KiB).
82 const unsigned int max_shared = 16384;
// Per-step increment: one warp in x, one thread in y.
83 const int step[2] = {
deviceProp.warpSize, 1 };
84 bool advance[2] = {
false,
false };
// Grow x first; wrap it back to one warp if it no longer fits.
87 param.block.x += step[0];
88 if (param.block.x >
deviceProp.maxThreadsDim[0] ||
89 sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
91 param.block.x = step[0];
// Then grow y, bounded by the extent of dimension 4 of the input field.
97 param.block.y += step[1];
99 if (param.block.y >
in->X(4) ||
100 sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
102 param.block.y = step[1];
// A new block shape needs a new grid: x covers dslashParam.threads and
// y covers in->X(4), both rounded up to whole blocks.
108 if (advance[0] || advance[1]) {
109 param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
110 (
in->X(4)+param.block.y-1) / param.block.y, 1);
// Grid rejected: keep searching from the shape just produced.
113 if (!checkGrid(param)) advance = advanceBlockDim(param);
// Shared memory requested per thread, in bytes. Always zero for this class,
// so the shared-memory comparisons in advanceBlockDim() never reject a shape.
120 unsigned int sharedBytesPerThread()
const {
return 0; }
// Constructor: forwards out/in/x, reconstruct and dagger to the DslashCuda
// base, stores the gauge pointers together with mferm, a and DS_type, and
// calls bindSpinorTex<sFloat>() on the three spinor fields.
// NOTE(review): `in` and `reconstruct` are used in the initializer list but
// their parameter declarations are on a line not shown in this excerpt.
// NOTE(review): DS_type presumably selects between the Dslash4 / Dslash5 /
// Dslash5inv variants seen in tuneKey() and apply() — the selecting code is
// not visible here.
123 DomainWallDslash4DPCCuda(cudaColorSpinorField *
out,
const gFloat *gauge0,
const gFloat *gauge1,
125 const cudaColorSpinorField *
x,
const double mferm,
126 const double a,
const int dagger,
const int DS_type)
127 : DslashCuda(out, in, x, reconstruct, dagger), gauge0(gauge0), gauge1(gauge1),
128 mferm(mferm), a(a), DS_type(DS_type)
// Paired with the unbindSpinorTex<sFloat>() call in the destructor.
130 bindSpinorTex<sFloat>(
in,
out,
x);
// Destructor: calls unbindSpinorTex<sFloat>() on the same three fields that
// the constructor passed to bindSpinorTex<sFloat>().
132 virtual ~DomainWallDslash4DPCCuda() { unbindSpinorTex<sFloat>(
in,
out,
x); }
// Builds the autotuning key: starts from DslashCuda::tuneKey() and appends a
// suffix to key.aux naming the kernel variant (",Dslash4", ",Dslash5" or
// ",Dslash5inv").
// NOTE(review): the branch that picks the suffix and the return statement are
// on lines not shown in this excerpt.
// NOTE(review): strcat does no bounds checking — confirm key.aux has room for
// the base key plus the longest suffix.
134 TuneKey tuneKey()
const
136 TuneKey key = DslashCuda::tuneKey();
139 strcat(key.aux,
",Dslash4");
142 strcat(key.aux,
",Dslash5");
145 strcat(key.aux,
",Dslash5inv");
// Prepares the first launch configuration for the autotuner.
// The grid is derived from the current block by ceiling division: x covers
// dslashParam.threads and y covers in->X(4). If checkGrid() rejects that grid,
// advanceBlockDim() is asked for the next shape; if that also fails the run is
// aborted through errorQuda.
// NOTE(review): the declaration of `ok` and any base-class call are on lines
// not shown in this excerpt.
151 virtual void initTuneParam(TuneParam &param)
const
154 param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
155 (
in->X(4)+param.block.y-1) / param.block.y, 1);
// Fall back to the block-shape search when the initial grid is not accepted.
157 if (!checkGrid(param)) ok = advanceBlockDim(param);
158 if (!ok)
errorQuda(
"Lattice volume is too large for even the largest blockDim");
// Prepares the launch configuration used when tuning is not performed.
// The grid is derived from the current block by ceiling division: x covers
// dslashParam.threads and y covers in->X(4). If checkGrid() rejects that grid,
// advanceBlockDim() is asked for the next shape; if that also fails the run is
// aborted through errorQuda.
// NOTE(review): the declaration of `ok` and any base-class call are on lines
// not shown in this excerpt.
162 virtual void defaultTuneParam(TuneParam &param)
const
165 param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
166 (
in->X(4)+param.block.y-1) / param.block.y, 1);
// Fall back to the block-shape search when the default grid is not accepted.
168 if (!checkGrid(param)) ok = advanceBlockDim(param);
169 if (!ok)
errorQuda(
"Lattice volume is too large for even the largest blockDim");
// Launches the dslash kernel on the given stream through the DSLASH macro.
// Three launches are visible — domainWallDslash4, domainWallDslash5 and
// domainWallDslash5inv — and all receive the same argument list: launch
// geometry from tp, the stream, dslashParam, the out/in spinor and norm
// buffers (cast to sFloat* / float*), both gauge pointers, mferm, the optional
// x spinor and norm buffers, and a.
// When x is null, null pointers are passed in place of its data and norm.
// NOTE(review): the definition of tp and the branch choosing between the
// three launches are on lines not shown in this excerpt.
172 void apply(
const cudaStream_t &
stream)
// Variant 1 of 3: domainWallDslash4.
178 DSLASH(domainWallDslash4, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
179 (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1, (sFloat*)
in->V(),
180 (
float*)
in->Norm(), mferm, (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0), a);
// Variant 2 of 3: domainWallDslash5.
183 DSLASH(domainWallDslash5, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
184 (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1, (sFloat*)
in->V(),
185 (
float*)
in->Norm(), mferm, (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0), a);
// Variant 3 of 3: domainWallDslash5inv.
188 DSLASH(domainWallDslash5inv, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
189 (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1, (sFloat*)
in->V(),
190 (
float*)
in->Norm(), mferm, (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0), a);
// Floating-point operation count for one application of the kernel.
// Three alternative formulas are visible; the code selecting between them and
// the return statement are on lines not shown in this excerpt.
// NOTE(review): the order matches Dslash4 / Dslash5 / Dslash5inv in apply(),
// so the formulas presumably map to those variants — confirm.
197 long long flops()
const {
// Ls is the extent of dimension 4 of the input field.
198 long long Ls =
in->X(4);
// Checkerboard volume with dimension 4 divided out.
199 long long vol4d =
in->VolumeCB() /
Ls;
// Split of the Ls slices: Ls-2 counted as bulk, 2 counted as wall.
200 long long bulk = (Ls-2)*vol4d;
201 long long wall = 2*vol4d;
// Formula 1: 1320 flops per checkerboard site, 1368 when x is supplied.
205 flops_Tmp = (
x ? 1368ll : 1320ll)*
in->VolumeCB();
// Formula 2: 96 per bulk site and 120 per wall site, plus 48 per checkerboard
// site when x is supplied.
208 flops_Tmp = (
x ? 48ll : 0 ) *
in->VolumeCB() + 96ll*bulk + 120ll*wall;
// Formula 3: 144 * VolumeCB * Ls plus 3 * Ls * (Ls-1).
// NOTE(review): VolumeCB already contains a factor of Ls (see vol4d above), and
// the second term carries no volume factor — confirm both are intended.
211 flops_Tmp = 144ll*
in->VolumeCB()*Ls + 3ll*Ls*(Ls-1ll);
219 #endif // GPU_DOMAIN_WALL_DIRAC
221 #include <dslash_policy.cuh>
// Body of the host-side entry point that builds a DomainWallDslash4DPCCuda
// functor and hands it to a dslash policy.
// NOTE(review): the function signature and many interior statements are on
// lines not shown in this excerpt; the comments below describe only what is
// visible.
// Record the requested parity in the shared kernel parameter block.
241 dslashParam.parity =
parity;
243 #ifdef GPU_DOMAIN_WALL_DIRAC
// Per-direction setup loop (its body is not visible here).
247 for(
int i = 0;i < dirs; i++){
// Untyped gauge pointers, cast to the precision-specific type below.
254 void *gauge0, *gauge1;
258 errorQuda(
"Mixing gauge and spinor precision not supported");
260 DslashCuda *dslash = 0;
// Register size defaults to float and is raised to double on the double path.
261 size_t regSize =
sizeof(float);
// The double2 instantiation is only compiled for compute capability >= 1.3.
264 #if (__COMPUTE_CAPABILITY__ >= 130)
265 dslash =
new DomainWallDslash4DPCCuda<double2,double2>(
out, (double2*)gauge0, (double2*)gauge1,
267 regSize =
sizeof(double);
269 errorQuda(
"Double precision not supported on this GPU");
// float4 instantiation.
272 dslash =
new DomainWallDslash4DPCCuda<float4,float4>(
out, (float4*)gauge0, (float4*)gauge1,
// short4 instantiation.
275 dslash =
new DomainWallDslash4DPCCuda<short4,short4>(
out, (short4*)gauge0, (short4*)gauge1,
// Ghost-face sizes for the first four dimensions, divided by the extent of
// dimension 4 of the input field.
282 for (
int i=0; i<4; i++) ghostFace[i] = in->
GhostFace()[i] / in->
X(4);
// The policy object is obtained from the factory for the requested policy.
284 DslashPolicyImp* dslashImp = NULL;
289 dslashImp = DslashFactory::create(dslashPolicy);
// Reported when GPU_DOMAIN_WALL_DIRAC is not defined at build time
// (NOTE(review): the matching #else is not visible — confirm).
302 errorQuda(
"4D preconditioned Domain wall dslash has not been built");
void unbindGaugeTex(const cudaGaugeField &gauge)
int commDimPartitioned(int dir)
cudaDeviceProp deviceProp
QudaVerbosity getVerbosity()
int GhostNormOffset(const int i) const
virtual void initTuneParam(TuneParam &param) const
void domainWallDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &m_f, const double &k, const int *commDim, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy=QUDA_DSLASH)
__constant__ int ghostFace[QUDA_MAX_DIM+1]
QudaPrecision Precision() const
enum QudaDslashPolicy_s QudaDslashPolicy
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
QudaReconstructType Reconstruct() const
virtual void defaultTuneParam(TuneParam &param) const
QudaFieldOrder FieldOrder() const
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
QudaPrecision Precision() const
void bindGaugeTex(const cudaGaugeField &gauge, const int oddBit, void **gauge0, void **gauge1)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
int GhostOffset(const int i) const
const int * GhostFace() const