10 #ifdef GPU_WILSON_DIRAC
17 #endif // GPU_WILSON_DIRAC
34 #include <dslash_index.cuh>
40 #ifdef GPU_WILSON_DIRAC
46 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
47 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
50 #include <dslash_quda.cuh>
55 #include <dslash_events.cuh>
57 using namespace wilson;
59 #ifdef GPU_WILSON_DIRAC
60 template <
typename sFloat,
typename gFloat>
61 class WilsonDslashCuda :
public SharedDslashCuda {
64 const gFloat *gauge0, *gauge1;
68 unsigned int sharedBytesPerThread()
const
70 #if (__COMPUTE_CAPABILITY__ >= 200) // Fermi uses shared memory for common input
72 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
77 #else // Pre-Fermi uses shared memory only for pseudo-registers
78 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
84 WilsonDslashCuda(cudaColorSpinorField *
out,
const gFloat *gauge0,
const gFloat *gauge1,
86 const cudaColorSpinorField *
x,
const double a,
const int dagger)
87 : SharedDslashCuda(out, in, x, reconstruct, dagger), gauge0(gauge0), gauge1(gauge1), a(a)
89 bindSpinorTex<sFloat>(
in,
out,
x);
92 virtual ~WilsonDslashCuda() { unbindSpinorTex<sFloat>(
in,
out,
x); }
94 void apply(
const cudaStream_t &
stream)
96 #ifdef SHARED_WILSON_DSLASH
98 errorQuda(
"Shared dslash does not yet support X-dimension partitioning");
101 DSLASH(dslash, tp.grid, tp.block, tp.shared_bytes, stream,
102 dslashParam, (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1,
103 (sFloat*)
in->V(), (
float*)
in->Norm(), (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0), a);
106 long long flops()
const {
return (
x ? 1368ll : 1320ll) *
in->VolumeCB(); }
108 #endif // GPU_WILSON_DIRAC
110 #include <dslash_policy.cuh>
119 #ifdef GPU_WILSON_DIRAC
121 for(
int i=0;i<4;i++){
128 void *gauge0, *gauge1;
132 errorQuda(
"Mixing gauge %d and spinor %d precision not supported",
135 DslashCuda *dslash = 0;
136 size_t regSize =
sizeof(float);
138 #if (__COMPUTE_CAPABILITY__ >= 130)
139 dslash =
new WilsonDslashCuda<double2, double2>(
out, (double2*)gauge0, (double2*)gauge1,
141 regSize =
sizeof(double);
143 errorQuda(
"Double precision not supported on this GPU");
146 dslash =
new WilsonDslashCuda<float4, float4>(
out, (float4*)gauge0, (float4*)gauge1,
149 dslash =
new WilsonDslashCuda<short4, short4>(
out, (short4*)gauge0, (short4*)gauge1,
154 DslashPolicyImp* dslashImp = DslashFactory::create(dslashPolicy);
167 errorQuda(
"Wilson dslash has not been built");
168 #endif // GPU_WILSON_DIRAC
void unbindGaugeTex(const cudaGaugeField &gauge)
#define DSLASH_SHARED_FLOATS_PER_THREAD
int commDimPartitioned(int dir)
QudaVerbosity getVerbosity()
int GhostNormOffset(const int i) const
QudaPrecision Precision() const
enum QudaDslashPolicy_s QudaDslashPolicy
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
QudaReconstructType Reconstruct() const
void wilsonDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int oddBit, const int daggerBit, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy=QUDA_DSLASH2)
QudaFieldOrder FieldOrder() const
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
QudaPrecision Precision() const
void bindGaugeTex(const cudaGaugeField &gauge, const int oddBit, void **gauge0, void **gauge1)
int GhostOffset(const int i) const
const int * GhostFace() const