10 #ifdef GPU_WILSON_DIRAC
17 #endif // GPU_WILSON_DIRAC
31 #undef GPU_STAGGERED_DIRAC
34 #include <dslash_index.cuh>
40 #ifdef GPU_DOMAIN_WALL_DIRAC
47 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
48 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
51 #include <dslash_quda.cuh>
55 #include <dslash_events.cuh>
57 using namespace mobius;
59 #ifdef GPU_DOMAIN_WALL_DIRAC
// ---------------------------------------------------------------------------
// NOTE(review): extraction artifact. Original source line numbers are embedded
// at the start of many lines below, interior lines are missing (gaps in the
// embedded numbering, e.g. 62 -> 65, 104 -> 110), and "&param" has been
// mojibaked to "¶m". Code tokens are kept byte-identical; only comments
// are added.
//
// MDWFDslashPCCuda: Mobius domain-wall fermion (MDWF) even/odd-preconditioned
// dslash launcher, derived from DslashCuda. Template parameters:
//   sFloat - spinor storage vector type (double2/float4/short4)
//   gFloat - gauge storage vector type
// ---------------------------------------------------------------------------
61 template <
typename sFloat,
typename gFloat>
62 class MDWFDslashPCCuda :
public DslashCuda {
// Even- and odd-parity gauge field pointers; fermion mass and scale factor.
65 const gFloat *gauge0, *gauge1;
66 const double mferm, a;
// checkGrid: presumably validates a candidate launch grid against device
// limits; only the warning path is visible here (the test and return are
// among the missing lines). TODO(review): confirm against full source.
70 bool checkGrid(TuneParam &
param)
const {
72 warningQuda(
"Autotuner is skipping blockDim=(%u,%u,%u), gridDim=(%u,%u,%u) because lattice volume is too large",
73 param.block.x, param.block.y, param.block.z,
74 param.grid.x, param.grid.y, param.grid.z);
// advanceBlockDim: autotuner step. block.x advances in warp-size increments
// (bounded by deviceProp.maxThreadsDim[0]); block.y advances by 1 up to the
// fifth-dimension extent in->X(4); both are capped by a 16 KB shared-memory
// budget per block.
82 bool advanceBlockDim(TuneParam ¶m)
const
84 const unsigned int max_shared = 16384;
85 const int step[2] = {
deviceProp.warpSize, 1 };
86 bool advance[2] = {
false,
false };
// Advance the x block dimension; wrap back to the first step when it
// exceeds the device limit or the shared-memory budget.
89 param.block.x += step[0];
90 if (param.block.x >
deviceProp.maxThreadsDim[0] ||
91 sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
93 param.block.x = step[0];
// Advance the y block dimension (bounded by the fifth dimension in->X(4)).
99 param.block.y += step[1];
101 if (param.block.y >
in->X(4) ||
102 sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
104 param.block.y = step[1];
// Recompute the launch grid (ceil-division) for the new block shape:
// x covers dslashParam.threads, y covers the fifth dimension.
110 if (advance[0] || advance[1]) {
111 param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
112 (
in->X(4)+param.block.y-1) / param.block.y, 1);
// NOTE(review): "advance" was declared above as bool[2]; assigning the bool
// result to it would not compile, so the full source presumably uses a
// distinct local here (e.g. "bool ret") -- lines are missing. Confirm.
115 if (!checkGrid(param)) advance = advanceBlockDim(param);
// These kernels request no per-thread dynamic shared memory.
122 unsigned int sharedBytesPerThread()
const {
return 0; }
// Constructor: forwards to DslashCuda and binds spinor textures for the
// in/out/x fields. DS_type selects which of the four kernels apply() runs.
125 MDWFDslashPCCuda(cudaColorSpinorField *
out,
const gFloat *gauge0,
const gFloat *gauge1,
127 const cudaColorSpinorField *
x,
const double mferm,
128 const double a,
const int dagger,
const int DS_type)
129 : DslashCuda(out, in, x, reconstruct, dagger), gauge0(gauge0), gauge1(gauge1),
130 mferm(mferm), a(a), DS_type(DS_type)
132 bindSpinorTex<sFloat>(
in,
out,
x);
// Destructor releases the spinor texture bindings taken in the constructor.
134 virtual ~MDWFDslashPCCuda() { unbindSpinorTex<sFloat>(
in,
out,
x); }
// tuneKey: appends a per-kernel suffix to the aux string -- presumably
// selected by DS_type (the switch is among the missing lines) -- so each
// kernel variant is autotuned independently.
136 TuneKey tuneKey()
const
138 TuneKey key = DslashCuda::tuneKey();
141 strcat(key.aux,
",Dslash4");
144 strcat(key.aux,
",Dslash4pre");
147 strcat(key.aux,
",Dslash5");
150 strcat(key.aux,
",Dslash5inv");
// initTuneParam: seed the tuner with a ceil-div grid (x over threads,
// y over the fifth dimension) and abort if no valid configuration exists.
// NOTE(review): the declaration of "ok" is among the missing lines.
156 virtual void initTuneParam(TuneParam ¶m)
const
159 param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
160 (
in->X(4)+param.block.y-1) / param.block.y, 1);
162 if (!checkGrid(param)) ok = advanceBlockDim(param);
163 if (!ok)
errorQuda(
"Lattice volume is too large for even the largest blockDim");
// defaultTuneParam: same seeding/validation as initTuneParam.
167 virtual void defaultTuneParam(TuneParam ¶m)
const
170 param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
171 (
in->X(4)+param.block.y-1) / param.block.y, 1);
173 if (!checkGrid(param)) ok = advanceBlockDim(param);
174 if (!ok)
errorQuda(
"Lattice volume is too large for even the largest blockDim");
// apply: launches one of four MDWF kernels via the DSLASH macro -- the
// dispatch (presumably a switch on DS_type) is among the missing lines.
// All four launches share one argument list: out/in data and norm pointers,
// gauge fields, fermion mass mferm, optional accumulate field x (null-safe
// via the ternaries), and scale factor a.
177 void apply(
const cudaStream_t &
stream)
// DS_type == 0 (per flops() below): 4-d wilson-like hopping term.
183 DSLASH(MDWFDslash4, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
184 (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1, (sFloat*)
in->V(),
185 (
float*)
in->Norm(), mferm, (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0), a);
// DS_type == 1: pre-application fifth-dimension operator.
188 DSLASH(MDWFDslash4pre, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
189 (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1, (sFloat*)
in->V(),
190 (
float*)
in->Norm(), mferm, (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0), a);
// DS_type == 2: fifth-dimension hopping term.
193 DSLASH(MDWFDslash5, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
194 (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1, (sFloat*)
in->V(),
195 (
float*)
in->Norm(), mferm, (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0), a);
// DS_type == 3: inverse of the fifth-dimension operator.
198 DSLASH(MDWFDslash5inv, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
199 (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1, (sFloat*)
in->V(),
200 (
float*)
in->Norm(), mferm, (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0), a);
// flops: per-kernel flop counts. Ls is the fifth-dimension extent, vol4d
// the 4-d checkerboard volume; "bulk" sites have both fifth-dim neighbors,
// "wall" sites sit at the s-boundary. The selecting switch (presumably on
// DS_type, matching the four branches) is among the missing lines.
207 long long flops()
const {
208 long long Ls =
in->X(4);
209 long long vol4d =
in->VolumeCB() /
Ls;
210 long long bulk = (Ls-2)*vol4d;
211 long long wall = 2*vol4d;
// 4-d dslash: 1320 flops/site, +48 when accumulating x.
215 flops_Tmp = (
x ? 1368ll : 1320ll)*
in->VolumeCB();
218 flops_Tmp = 72ll*
in->VolumeCB() + 96ll*bulk + 120ll*wall;
221 flops_Tmp = (
x ? 96ll : 48ll)*
in->VolumeCB() + 96ll*bulk + 120ll*wall;
// Dslash5inv cost grows with Ls^2 (dense in the fifth dimension).
224 flops_Tmp = 144ll*
in->VolumeCB()*Ls + 3ll*Ls*(Ls-1ll);
232 #endif // GPU_DOMAIN_WALL_DIRAC
234 #include <dslash_policy.cuh>
// ---------------------------------------------------------------------------
// NOTE(review): fragment of the host-side MDWF dslash entry point
// (MDWFDslashCuda, per the trailing symbol index). Its signature and many
// interior lines are missing from this view; embedded original line numbers
// show the gaps. Code tokens are kept byte-identical; only comments added.
// ---------------------------------------------------------------------------
254 dslashParam.parity =
parity;
256 #ifdef GPU_DOMAIN_WALL_DIRAC
// Presumably iterates over partitioned communication directions -- the loop
// body is among the missing lines. TODO(review): confirm.
260 for(
int i = 0;i < dirs; i++){
267 void *gauge0, *gauge1;
// Gauge and spinor fields must share a single precision.
271 errorQuda(
"Mixing gauge and spinor precision not supported");
273 DslashCuda *dslash = 0;
274 size_t regSize =
sizeof(float);
// Precision dispatch (the selecting conditionals are missing): double2
// requires compute capability >= 1.3; float4 for single; short4 for half.
277 #if (__COMPUTE_CAPABILITY__ >= 130)
278 dslash =
new MDWFDslashPCCuda<double2,double2>(
out, (double2*)gauge0, (double2*)gauge1,
280 regSize =
sizeof(double);
282 errorQuda(
"Double precision not supported on this GPU");
285 dslash =
new MDWFDslashPCCuda<float4,float4>(
out, (float4*)gauge0, (float4*)gauge1,
288 dslash =
new MDWFDslashPCCuda<short4,short4>(
out, (short4*)gauge0, (short4*)gauge1,
// Ghost faces divided by the fifth-dimension extent in->X(4): presumably
// the per-s-slice 4-d face size. TODO(review): confirm intent.
295 for (
int i=0; i<4; i++) ghostFace[i] = in->
GhostFace()[i] / in->
X(4);
// Select the dslash execution policy implementation from the factory.
297 DslashPolicyImp* dslashImp = NULL;
302 dslashImp = DslashFactory::create(dslashPolicy);
// Reached when GPU_DOMAIN_WALL_DIRAC was not defined at build time.
315 errorQuda(
"Domain wall dslash has not been built");
void unbindGaugeTex(const cudaGaugeField &gauge)
int commDimPartitioned(int dir)
cudaDeviceProp deviceProp
QudaVerbosity getVerbosity()
int GhostNormOffset(const int i) const
virtual void initTuneParam(TuneParam ¶m) const
__constant__ int ghostFace[QUDA_MAX_DIM+1]
QudaPrecision Precision() const
enum QudaDslashPolicy_s QudaDslashPolicy
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
QudaReconstructType Reconstruct() const
void MDWFDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &m_f, const double &k, const int *commDim, const int DS_type, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy=QUDA_DSLASH2)
virtual void defaultTuneParam(TuneParam ¶m) const
QudaFieldOrder FieldOrder() const
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
QudaPrecision Precision() const
void bindGaugeTex(const cudaGaugeField &gauge, const int oddBit, void **gauge0, void **gauge1)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
int GhostOffset(const int i) const
const int * GhostFace() const