// Compile-time configuration, active only when GPU_STAGGERED_DIRAC is defined:
// chooses DIRECT_ACCESS_* macros depending on __COMPUTE_CAPABILITY__.
// NOTE(review): several lines of the original conditional are absent from this
// listing, so which branch each #define below belongs to (and whether further
// macros are set) cannot be determined from here -- confirm against the full file.
10 #ifdef GPU_STAGGERED_DIRAC
11 #if (__COMPUTE_CAPABILITY__ >= 300) // Kepler works best with texture loads only
18 #elif (__COMPUTE_CAPABILITY__ >= 200)
21 #define DIRECT_ACCESS_SPINOR
26 #define DIRECT_ACCESS_FAT_LINK
33 #endif // GPU_STAGGERED_DIRAC
48 #include <dslash_index.cuh>
50 #undef GPU_CLOVER_DIRAC
51 #undef GPU_DOMAIN_WALL_DIRAC
56 #include <dslash_quda.cuh>
60 #include <dslash_events.cuh>
62 using namespace staggered;
// Primary template of RealType: an empty struct with no members for a generic T.
// NOTE(review): presumably specialised elsewhere for concrete vector types -- no
// specialisation is visible in this listing, confirm before relying on it.
64 template<
typename T>
struct RealType {};
// Host-side launcher class for the staggered dslash, compiled only when
// GPU_STAGGERED_DIRAC is defined. It derives from DslashCuda and is templated on
//   sFloat - type used to cast the spinor data pointers (out/in/x ->V()) in apply()
//   gFloat - element type of the two gauge pointers handed to the kernel macro
// NOTE(review): this listing omits a number of original lines (braces, return
// statements, part of the constructor parameter list); the comments below only
// describe what is visible.
71 #ifdef GPU_STAGGERED_DIRAC
72 template <
typename sFloat,
typename gFloat>
73 class StaggeredDslashCuda :
public DslashCuda {
// Gauge-field pointers forwarded unchanged to STAGGERED_DSLASH in apply().
76 const gFloat *gauge0, *gauge1;
// Shared-memory requirement per thread. reg_size is sizeof(double) when sFloat is
// double2 and sizeof(float) for every other sFloat.
// NOTE(review): the expression that turns reg_size into the returned byte count
// is not visible here.
80 unsigned int sharedBytesPerThread()
const
82 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
// Constructor: forwards (out, in, x, reconstruct, dagger) to DslashCuda, stores
// gauge0, gauge1 and a, then binds the spinor textures for in, out and x.
// NOTE(review): the parameters `in` and `reconstruct` used in the initialiser
// list are declared on a line missing from this listing.
87 StaggeredDslashCuda(cudaColorSpinorField *
out,
const gFloat *gauge0,
const gFloat *gauge1,
89 const cudaColorSpinorField *
x,
const double a,
const int dagger)
90 : DslashCuda(out, in, x, reconstruct, dagger), gauge0(gauge0), gauge1(gauge1), a(a)
92 bindSpinorTex<sFloat>(
in,
out,
x);
// Destructor: releases the texture bindings made in the constructor.
95 virtual ~StaggeredDslashCuda() { unbindSpinorTex<sFloat>(
in,
out,
x); }
// Launches the staggered dslash on the given stream.
97 void apply(
const cudaStream_t &
stream)
// 1-D grid: ceiling division of dslashParam.threads by the block width tp.block.x.
// NOTE(review): `tp` is obtained on a line not shown (presumably via tuneLaunch).
100 dim3 gridDim( (dslashParam.threads+tp.block.x-1) / tp.block.x, 1, 1);
// x is optional: when it is null, 0 is passed for both its data and norm pointers.
101 STAGGERED_DSLASH(gridDim, tp.block, tp.shared_bytes, stream, dslashParam,
102 (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1,
103 (sFloat*)
in->V(), (
float*)
in->Norm(),
104 (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0), a);
// Always returns 2.
107 int Nface() {
return 2; }
// Floating-point operation count: 666 per checkerboard site (in->VolumeCB()) when
// x is supplied, 654 per site otherwise.
109 long long flops()
const {
111 flops = (
x ? 666ll : 654ll) *
in->VolumeCB();
115 #endif // GPU_STAGGERED_DIRAC
117 #include <dslash_policy.cuh>
// Fragment of a host-side driver body; its signature line is not part of this
// listing. NOTE(review): presumably this is staggeredDslashCuda(...) -- confirm.
// The branch structure (if/else, #else) between the statements below is also
// missing, so the comments describe individual visible statements only.
126 #ifdef GPU_STAGGERED_DIRAC
// Populate the shared dslashParam from the call arguments and the gauge field.
130 dslashParam.parity =
parity;
131 dslashParam.gauge_stride = gauge.
Stride();
132 dslashParam.fat_link_max = gauge.
LinkMax();
// Loop over four indices; the loop body is not visible in this listing.
135 for(
int i=0;i<4;i++){
// Untyped gauge pointers. NOTE(review): presumably filled by bindFatGaugeTex --
// the call is not shown here.
141 void *gauge0, *gauge1;
// Fatal error reporting the gauge and spinor precisions; the guarding condition
// and the format arguments are on lines not shown.
145 errorQuda(
"Mixing precisions gauge=%d and spinor=%d not supported",
149 DslashCuda *dslash = 0;
// regSize starts as sizeof(float) and is overwritten with sizeof(double) next to
// the double2 instantiation below.
150 size_t regSize =
sizeof(float);
// The double2 instantiation is compiled only for __COMPUTE_CAPABILITY__ >= 130.
// NOTE(review): the "not supported" error below presumably sits in the matching
// #else branch, which is missing from this listing.
153 #if (__COMPUTE_CAPABILITY__ >= 130)
154 dslash =
new StaggeredDslashCuda<double2, double2>
156 regSize =
sizeof(double);
158 errorQuda(
"Double precision not supported on this GPU");
// float2 and short2 instantiations; the conditions selecting them are not shown.
161 dslash =
new StaggeredDslashCuda<float2, float2>
164 dslash =
new StaggeredDslashCuda<short2, short2>
// Obtain the policy implementation object selected by dslashPolicy.
169 DslashPolicyImp* dslashImp = DslashFactory::create(dslashPolicy);
// NOTE(review): presumably the #else branch of GPU_STAGGERED_DIRAC, i.e. the error
// raised when staggered support was not compiled in -- confirm.
183 errorQuda(
"Staggered dslash has not been built");
184 #endif // GPU_STAGGERED_DIRAC
int commDimPartitioned(int dir)
void bindFatGaugeTex(const cudaGaugeField &gauge, const int oddBit, void **gauge0, void **gauge1)
QudaVerbosity getVerbosity()
void staggeredDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy=QUDA_DSLASH2)
int GhostNormOffset(const int i) const
QudaPrecision Precision() const
enum QudaDslashPolicy_s QudaDslashPolicy
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
QudaReconstructType Reconstruct() const
const double & LinkMax() const
QudaFieldOrder FieldOrder() const
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
QudaPrecision Precision() const
int GhostOffset(const int i) const
const int * GhostFace() const
void unbindFatGaugeTex(const cudaGaugeField &gauge)