10 #ifdef GPU_WILSON_DIRAC
17 #endif // GPU_WILSON_DIRAC
29 namespace asym_clover {
31 #undef GPU_STAGGERED_DIRAC
34 #include <dslash_index.cuh>
40 #ifdef GPU_CLOVER_DIRAC
46 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
47 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
50 #include <dslash_quda.cuh>
55 #include <dslash_events.cuh>
57 using namespace asym_clover;
59 #ifdef GPU_CLOVER_DIRAC
// Dslash launcher class for the asymmetric clover operator, parameterised on
// the spinor (sFloat), gauge (gFloat) and clover (cFloat) storage types.
// NOTE(review): this listing appears to be an excerpt -- the leading integers
// look like original file line numbers and several intermediate lines
// (braces, #else/#endif, some member declarations) are not shown here.
60 template <
typename sFloat,
typename gFloat,
typename cFloat>
61 class AsymCloverDslashCuda :
public SharedDslashCuda {
// Gauge-field pointers handed to the kernel launch in apply().
64 const gFloat *gauge0, *gauge1;
// Clover norm pointer handed to the kernel launch in apply().
// NOTE(review): members `clover` and `a` are initialised in the constructor
// but their declarations are not visible in this excerpt.
66 const float *cloverNorm;
// Shared-memory bytes per thread. Both visible branches compute reg_size as
// sizeof(double) when sFloat is double2 and sizeof(float) otherwise; the
// return expression is not visible in this excerpt.
70 unsigned int sharedBytesPerThread()
const
72 #if (__COMPUTE_CAPABILITY__ >= 200)
74 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
// Presumably the #else branch of the compute-capability check -- TODO confirm.
80 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
// Constructor: forwards out/in/x/reconstruct/dagger to SharedDslashCuda,
// stores the gauge/clover pointers and the scalar a, binds the spinor
// textures, records the clover stride in dslashParam and raises an error
// when no x field is supplied.
// NOTE(review): `reconstruct` and `clover` are used in the initialiser list
// but their parameter declarations are not visible here (presumably on an
// omitted line) -- confirm against the full file.
86 AsymCloverDslashCuda(cudaColorSpinorField *
out,
const gFloat *gauge0,
const gFloat *gauge1,
88 const float *cloverNorm,
int cl_stride,
const cudaColorSpinorField *
in,
89 const cudaColorSpinorField *
x,
const double a,
const int dagger)
90 : SharedDslashCuda(out, in, x, reconstruct, dagger), gauge0(gauge0), gauge1(gauge1), clover(clover),
91 cloverNorm(cloverNorm), a(a)
// NOTE(review): textures are bound with x before the null check on x below;
// verify bindSpinorTex tolerates a null x.
93 bindSpinorTex<sFloat>(
in,
out,
x);
94 dslashParam.cl_stride = cl_stride;
// apply() dereferences x (x->Norm()), so a null x is rejected here.
95 if (!x)
errorQuda(
"Asymmetric clover dslash only defined for Xpay");
// Destructor: releases the spinor texture bindings made in the constructor.
98 virtual ~AsymCloverDslashCuda() { unbindSpinorTex<sFloat>(
in,
out,
x); }
// Launches the asymCloverDslash kernel through the ASYM_DSLASH macro on the
// given stream, using the grid/block/shared_bytes values held in `tp`
// (its definition is not visible in this excerpt).
100 void apply(
const cudaStream_t &
stream)
102 #ifdef SHARED_WILSON_DSLASH
// The condition guarding this error is not visible in this excerpt.
104 errorQuda(
"Shared dslash does not yet support X-dimension partitioning");
// NOTE(review): out and in are passed as ->V() but x is cast directly as
// (sFloat*)x, i.e. the field object pointer rather than x->V(); confirm
// whether this is intentional.
107 ASYM_DSLASH(asymCloverDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
108 (sFloat*)out->V(), (
float*)out->Norm(), gauge0, gauge1, clover, cloverNorm,
109 (sFloat*)in->V(), (
float*)in->Norm(), (sFloat*)x, (
float*)x->Norm(), a);
// Flop count: 1872 multiplied by in->VolumeCB().
112 long long flops()
const {
return 1872ll * in->VolumeCB(); }
114 #endif // GPU_CLOVER_DIRAC
116 #include <dslash_policy.cuh>
125 #ifdef GPU_CLOVER_DIRAC
// Visible fragments of the host-side driver that selects a precision-specific
// AsymCloverDslashCuda instance and a dslash policy.
// NOTE(review): the function header, braces, loop body and all if/else
// conditions are not visible in this excerpt; comments below describe only
// the statements that are shown.
// Loop over four indices (0..3); the loop body is not visible here.
127 for(
int i=0;i<4;i++){
// Untyped clover / clover-norm pointers, cast per precision further down.
134 void *cloverP, *cloverNormP;
// Untyped gauge pointers.
137 void *gauge0, *gauge1;
// Error raised for a gauge/spinor precision mismatch (condition not shown).
141 errorQuda(
"Mixing gauge and spinor precision not supported");
// Error raised for a clover/spinor precision mismatch (condition not shown).
144 errorQuda(
"Mixing clover and spinor precision not supported");
// dslash starts as a null pointer and is assigned by one of the `new`
// expressions below; regSize starts at sizeof(float).
146 DslashCuda *dslash = 0;
147 size_t regSize =
sizeof(float);
// double2 instantiation, compiled only when __COMPUTE_CAPABILITY__ >= 130.
150 #if (__COMPUTE_CAPABILITY__ >= 130)
151 dslash =
new AsymCloverDslashCuda<double2, double2, double2>
153 (double2*)cloverP, (
float*)cloverNormP, cloverInv.
stride,
in,
x, a,
dagger);
// regSize is raised to sizeof(double) alongside the double2 instantiation.
154 regSize =
sizeof(double);
// Presumably the #else branch of the compute-capability check -- TODO confirm.
156 errorQuda(
"Double precision not supported on this GPU");
// float4 instantiation.
159 dslash =
new AsymCloverDslashCuda<float4, float4, float4>
161 (float4*)cloverP, (
float*)cloverNormP, cloverInv.
stride,
in,
x, a,
dagger);
// short4 instantiation.
163 dslash =
new AsymCloverDslashCuda<short4, short4, short4>
165 (short4*)cloverP, (
float*)cloverNormP, cloverInv.
stride,
in,
x, a,
dagger);
// Policy object created by DslashFactory from the requested dslashPolicy.
169 DslashPolicyImp* dslashImp = DslashFactory::create(dslashPolicy);
// Presumably reached when GPU_CLOVER_DIRAC is not defined -- TODO confirm.
182 errorQuda(
"Clover dslash has not been built");
void unbindGaugeTex(const cudaGaugeField &gauge)
enum QudaPrecision_s QudaPrecision
int commDimPartitioned(int dir)
QudaVerbosity getVerbosity()
#define DSLASH_SHARED_FLOATS_PER_THREAD
int GhostNormOffset(const int i) const
QudaPrecision Precision() const
enum QudaDslashPolicy_s QudaDslashPolicy
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
QudaReconstructType Reconstruct() const
void unbindCloverTex(const FullClover clover)
void asymCloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover cloverInv, const cudaColorSpinorField *in, const int oddBit, const int daggerBit, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy=QUDA_DSLASH2)
QudaFieldOrder FieldOrder() const
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
QudaPrecision Precision() const
void bindGaugeTex(const cudaGaugeField &gauge, const int oddBit, void **gauge0, void **gauge1)
QudaPrecision bindCloverTex(const FullClover clover, const int oddBit, void **cloverP, void **cloverNormP)
int GhostOffset(const int i) const
const int * GhostFace() const