#ifdef GPU_STAGGERED_DIRAC
#if (__COMPUTE_CAPABILITY__ >= 300) // Kepler works best with texture loads only
#elif (__COMPUTE_CAPABILITY__ >= 200) // Fermi prefers direct loads for the spinor and fat links
#define DIRECT_ACCESS_SPINOR
#define DIRECT_ACCESS_FAT_LINK
#endif // __COMPUTE_CAPABILITY__
#endif // GPU_STAGGERED_DIRAC
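// Memory-access strategy per architecture: on Kepler (sm_30 and newer) all
// operands are fetched through the texture cache, while on Fermi (sm_20) the
// spinor and fat-link fields are read with ordinary global loads. Which
// DIRECT_ACCESS_* macros are enabled per architecture is a performance tuning
// choice, not a correctness requirement.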
namespace improvedstaggered {

#include <dslash_index.cuh>

#define STAGGERED_TESLA_HACK
#undef GPU_NDEG_TWISTED_MASS_DIRAC
#undef GPU_CLOVER_DIRAC
#undef GPU_DOMAIN_WALL_DIRAC

#include <dslash_quda.cuh>

} // namespace improvedstaggered

// declare the dslash events
#include <dslash_events.cuh>

using namespace improvedstaggered;
#ifdef GPU_STAGGERED_DIRAC
template <typename sFloat, typename fatGFloat, typename longGFloat, typename phaseFloat>
class StaggeredDslashCuda : public DslashCuda {

private:
  const fatGFloat *fat0, *fat1;
  const longGFloat *long0, *long1;
  const phaseFloat *phase0, *phase1;
  const double a; // scale of the accumulate term x, set in the initializer list below
protected:
  unsigned int sharedBytesPerThread() const
  {
    // six registers of the base real type are staged in shared memory per thread
    int reg_size = (typeid(sFloat) == typeid(double2) ? sizeof(double) : sizeof(float));
    return 6 * reg_size;
  }
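  // Worked example of the sizing above: for a hypothetical 128-thread block in
  // double precision (sFloat == double2), the per-block shared memory comes to
  // 6 * sizeof(double) * 128 = 6 * 8 * 128 = 6144 bytes.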
public:
  StaggeredDslashCuda(cudaColorSpinorField *out,
                      const fatGFloat *fat0, const fatGFloat *fat1,
                      const longGFloat *long0, const longGFloat *long1,
                      const phaseFloat *phase0, const phaseFloat *phase1,
                      const QudaReconstructType reconstruct, const cudaColorSpinorField *in,
                      const cudaColorSpinorField *x, const double a, const int dagger)
    : DslashCuda(out, in, x, reconstruct, dagger), fat0(fat0), fat1(fat1), long0(long0),
      long1(long1), phase0(phase0), phase1(phase1), a(a)
  {
    bindSpinorTex<sFloat>(in, out, x);
  }
  virtual ~StaggeredDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }
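  // Texture binding follows the object's lifetime: the constructor binds the
  // in/out/x spinors and the destructor unbinds them, so an instance can be
  // created, applied, and deleted without any explicit cleanup by the caller.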
  void apply(const cudaStream_t &stream)
  {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    dim3 gridDim((dslashParam.threads + tp.block.x - 1) / tp.block.x, 1, 1);
#if (__COMPUTE_CAPABILITY__ >= 200)
    // Fermi and newer: the reconstructed long links carry separate phase arrays
    IMPROVED_STAGGERED_DSLASH(gridDim, tp.block, tp.shared_bytes, stream, dslashParam,
                              (sFloat*)out->V(), (float*)out->Norm(),
                              fat0, fat1, long0, long1, phase0, phase1,
                              (sFloat*)in->V(), (float*)in->Norm(),
                              (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
#else
    IMPROVED_STAGGERED_DSLASH(gridDim, tp.block, tp.shared_bytes, stream, dslashParam,
                              (sFloat*)out->V(), (float*)out->Norm(),
                              fat0, fat1, long0, long1,
                              (sFloat*)in->V(), (float*)in->Norm(),
                              (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
#endif
  }
  // the three-hop long links widen the halo: improved staggered needs a
  // deeper ghost zone than the naive single-hop staggered operator
  int Nface() { return 6; }
  long long flops() const
  {
    // per even-odd (checkerboard) site: 1146 flops for the bare operator,
    // plus 12 more when the accumulate term x is included
    return (x ? 1158ll : 1146ll) * in->VolumeCB();
  }
};
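// A minimal sketch of how the flop count above might feed a performance
// report (hypothetical names: `dslash` is a constructed StaggeredDslashCuda
// and `secs` a wall-clock measurement of one apply() call):
//
//   double gflops = dslash->flops() / (secs * 1e9);
//   printfQuda("improved staggered dslash: %g GFLOP/s\n", gflops);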
#endif // GPU_STAGGERED_DIRAC
#include <dslash_policy.cuh>
void improvedStaggeredDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &fatGauge,
                                 const cudaGaugeField &longGauge, const cudaColorSpinorField *in,
                                 const int parity, const int dagger, const cudaColorSpinorField *x,
                                 const double &k, const int *commDim, TimeProfile &profile,
                                 const QudaDslashPolicy &dslashPolicy)
{
#ifdef GPU_STAGGERED_DIRAC
  // multi-hop stencil: every partitioned dimension must span at least six sites
  for (int i = 0; i < 4; i++) {
    if (commDimPartitioned(i) && (fatGauge.X()[i] < 6)) {
      errorQuda("ERROR: partitioned dimension with local size less than 6 is not supported in staggered dslash\n");
    }
  }
  dslashParam.parity = parity;
  dslashParam.gauge_stride = fatGauge.Stride();
  dslashParam.long_gauge_stride = longGauge.Stride();
  dslashParam.fat_link_max = fatGauge.LinkMax();
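  // Fat links are not unitary, so their elements can exceed one: LinkMax()
  // supplies the bound used to set the fixed-point scale when the links are
  // stored in half precision.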
  // per-dimension ghost-zone bookkeeping for multi-GPU runs
  for (int i = 0; i < 4; i++) {
    dslashParam.ghostDim[i] = commDimPartitioned(i);
    dslashParam.ghostOffset[i] = in->GhostOffset(i);
    dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i);
    dslashParam.commDim[i] = (!commDim[i]) ? 0 : commDimPartitioned(i); // override switches comms off
  }

  void *fatGauge0, *fatGauge1;
  void *longGauge0, *longGauge1;
  bindFatGaugeTex(fatGauge, parity, &fatGauge0, &fatGauge1);
  bindLongGaugeTex(longGauge, parity, &longGauge0, &longGauge1);
  void *longPhase0 = (char*)longGauge0 + longGauge.PhaseOffset();
  void *longPhase1 = (char*)longGauge1 + longGauge.PhaseOffset();
  if (in->Precision() != fatGauge.Precision() || in->Precision() != longGauge.Precision()) {
    errorQuda("Mixing gauge and spinor precision not supported "
              "(precision=%d, fatlinkGauge.precision=%d, longGauge.precision=%d)",
              in->Precision(), fatGauge.Precision(), longGauge.Precision());
  }
  DslashCuda *dslash = 0;
  size_t regSize = sizeof(float);
  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
#if (__COMPUTE_CAPABILITY__ >= 130) // double precision requires sm_13 or newer
    dslash = new StaggeredDslashCuda<double2, double2, double2, double>
      (out, (double2*)fatGauge0, (double2*)fatGauge1,
       (double2*)longGauge0, (double2*)longGauge1,
       (double*)longPhase0, (double*)longPhase1,
       longGauge.Reconstruct(), in, x, k, dagger);
    regSize = sizeof(double);
#else
    errorQuda("Double precision not supported on this GPU");
#endif
  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
    dslash = new StaggeredDslashCuda<float2, float2, float4, float>
      (out, (float2*)fatGauge0, (float2*)fatGauge1,
       (float4*)longGauge0, (float4*)longGauge1,
       (float*)longPhase0, (float*)longPhase1,
       longGauge.Reconstruct(), in, x, k, dagger);
  } else if (in->Precision() == QUDA_HALF_PRECISION) {
    dslash = new StaggeredDslashCuda<short2, short2, short4, short>
      (out, (short2*)fatGauge0, (short2*)fatGauge1,
       (short4*)longGauge0, (short4*)longGauge1,
       (short*)longPhase0, (short*)longPhase1,
       longGauge.Reconstruct(), in, x, k, dagger);
  }
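  // The template arguments pair each field with its texture-fetch vector
  // width: spinors and fat links always use 2-component loads, while single-
  // and half-precision long links use 4-component loads (float4/short4) to
  // match their reconstructed layout; the final parameter is the phase type.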
  DslashPolicyImp *dslashImp = DslashFactory::create(dslashPolicy);
  (*dslashImp)(*dslash, const_cast<cudaColorSpinorField*>(in), regSize, parity, dagger,
               in->Volume(), in->GhostFace(), profile);
  delete dslashImp;

  delete dslash;
  unbindFatGaugeTex(fatGauge);
  unbindLongGaugeTex(longGauge);
#else
  errorQuda("Staggered dslash has not been built");
#endif // GPU_STAGGERED_DIRAC
}
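/* A minimal usage sketch, not taken from this file. Assuming `out`, `in`, and
   `x` are allocated cudaColorSpinorField objects, `fatGauge` and `longGauge`
   are loaded cudaGaugeField links, `profile` is a TimeProfile instance, and
   `parity`, `dagger`, and `k` are the desired checkerboard, dagger flag, and
   accumulate scale, a single application of the operator would look like:

     int commDim[4] = {1, 1, 1, 1}; // keep communication on in every dimension
     improvedStaggeredDslashCuda(&out, fatGauge, longGauge, &in,
                                 parity, dagger, &x, k, commDim, profile);
*/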