10 #ifdef GPU_WILSON_DIRAC
17 #endif // GPU_WILSON_DIRAC
29 namespace twistedclover {
33 #include <dslash_index.cuh>
39 #if (__COMPUTE_CAPABILITY__ >= 200) && defined(GPU_TWISTED_CLOVER_DIRAC)
43 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
44 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
47 #include <dslash_quda.cuh>
52 #include <dslash_events.cuh>
54 using namespace twistedclover;
56 #if (__COMPUTE_CAPABILITY__ >= 200) && defined(GPU_TWISTED_CLOVER_DIRAC)
57 template <
typename sFloat,
typename gFloat,
typename cFloat>
58 class TwistedCloverDslashCuda :
public SharedDslashCuda {
61 const gFloat *gauge0, *gauge1;
66 const cFloat *cloverInv;
70 unsigned int sharedBytesPerThread()
const
72 #if (__COMPUTE_CAPABILITY__ >= 200)
74 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
80 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
86 TwistedCloverDslashCuda(cudaColorSpinorField *
out,
const gFloat *gauge0,
const gFloat *gauge1,
88 const cFloat *cloverInv,
const float *cNrm2,
int cl_stride,
const cudaColorSpinorField *
in,
90 const double mu,
const double epsilon,
const double k,
const int dagger)
91 : SharedDslashCuda(out, in, x, reconstruct,dagger),gauge0(gauge0), gauge1(gauge1), clover(clover),
92 cNorm(cNorm), cloverInv(cloverInv), cNrm2(cNrm2), dslashType(dslashType)
94 bindSpinorTex<sFloat>(
in,
out,
x);
95 dslashParam.cl_stride = cl_stride;
96 dslashParam.fl_stride = in->VolumeCB();
102 virtual ~TwistedCloverDslashCuda() { unbindSpinorTex<sFloat>(
in,
out,
x); }
104 TuneKey tuneKey()
const
106 TuneKey key = DslashCuda::tuneKey();
109 strcat(key.aux,
",CloverTwistInvDslash");
112 strcat(key.aux,
",Dslash");
115 strcat(key.aux,
",DslashCloverTwist");
121 void apply(
const cudaStream_t &
stream)
123 #ifdef SHARED_WILSON_DSLASH
125 errorQuda(
"Shared dslash does not yet support X-dimension partitioning");
131 DSLASH(twistedCloverInvDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
132 (sFloat*)out->V(), (
float*)out->Norm(), gauge0, gauge1, clover, cNorm, cloverInv, cNrm2,
133 (sFloat*)in->V(), (
float*)in->Norm(), a, b, (sFloat*)(x ? x->V() : 0), (
float*)(x ? x->Norm() : 0));
136 DSLASH(twistedCloverDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
137 (sFloat*)out->V(), (
float*)out->Norm(), gauge0, gauge1, clover, cNorm, cloverInv, cNrm2,
138 (sFloat*)in->V(), (
float*)in->Norm(), a, b, (sFloat*)(x ? x->V() : 0), (
float*)(x ? x->Norm() : 0));
141 DSLASH(twistedCloverDslashTwist, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
142 (sFloat*)out->V(), (
float*)out->Norm(), gauge0, gauge1, clover, cNorm, cloverInv, cNrm2,
143 (sFloat*)in->V(), (
float*)in->Norm(), a, b, (sFloat*)x->V(), (
float*)x->Norm());
145 default:
errorQuda(
"Invalid twisted clover dslash type");
149 long long flops()
const {
return (x ? 1416ll : 1392ll) * in->VolumeCB(); }
151 #endif // GPU_TWISTED_CLOVER_DIRAC
153 #include <dslash_policy.cuh>
158 const double &epsilon,
const double &k,
const int *commOverride,
162 errorQuda(
"Twisted-clover dslash does not yet support a fused exterior dslash kernel");
165 #if (__COMPUTE_CAPABILITY__ >= 200) && defined(GPU_TWISTED_CLOVER_DIRAC)
168 int ghost_threads[4] = {0};
171 for(
int i=0;i<4;i++){
184 void *gauge0, *gauge1;
187 void *cloverP, *cloverNormP, *cloverInvP, *cloverInvNormP;
191 errorQuda(
"Mixing clover and spinor precision not supported");
194 errorQuda(
"Mixing gauge and spinor precision not supported");
197 errorQuda(
"clover and cloverInv must have matching strides (%d != %d)", clover->
stride, cloverInv->
stride);
199 DslashCuda *dslash = 0;
200 size_t regSize =
sizeof(float);
203 #if (__COMPUTE_CAPABILITY__ >= 130)
204 dslash =
new TwistedCloverDslashCuda<double2,double2,double2>(
out, (double2*)gauge0,(double2*)gauge1, gauge.
Reconstruct(), (double2*)cloverP, (
float*)cloverNormP,
205 (double2*)cloverInvP, (
float*)cloverInvNormP, clover->
stride,
in,
x, type,
kappa,
mu, epsilon, k,
dagger);
207 regSize =
sizeof(double);
209 errorQuda(
"Double precision not supported on this GPU");
212 dslash =
new TwistedCloverDslashCuda<float4,float4,float4>(
out, (float4*)gauge0,(float4*)gauge1, gauge.
Reconstruct(), (float4*)cloverP, (
float*)cloverNormP,
213 (float4*)cloverInvP, (
float*)cloverInvNormP, clover->
stride,
in,
x, type,
kappa,
mu, epsilon, k,
dagger);
216 dslash =
new TwistedCloverDslashCuda<short4,short4,short4>(
out, (short4*)gauge0,(short4*)gauge1, gauge.
Reconstruct(), (short4*)cloverP, (
float*)cloverNormP,
217 (short4*)cloverInvP, (
float*)cloverInvNormP, clover->
stride,
in,
x, type,
kappa,
mu, epsilon, k,
dagger);
221 DslashPolicyImp* dslashImp = DslashFactory::create(dslashPolicy);
225 (*dslashImp)(*dslash,
const_cast<cudaColorSpinorField*
>(
in), regSize, parity, dagger, bulk_threads, ghost_threads, profile);
236 #if (__COMPUTE_CAPABILITY__ < 200)
237 errorQuda(
"Twisted-clover fermions not supported on pre-Fermi architecture");
239 errorQuda(
"Twisted clover dslash has not been built");
QudaPrecision bindTwistedCloverTex(const FullClover clover, const FullClover cloverInv, const int oddBit, void **cloverP, void **cloverNormP, void **cloverInvP, void **cloverInvNormP)
void unbindGaugeTex(const cudaGaugeField &gauge)
enum QudaPrecision_s QudaPrecision
int commDimPartitioned(int dir)
QudaVerbosity getVerbosity()
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
enum QudaTwistCloverDslashType_s QudaTwistCloverDslashType
int GhostNormOffset(const int i) const
QudaPrecision Precision() const
VOLATILE spinorFloat kappa
enum QudaDslashPolicy_s QudaDslashPolicy
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
QudaReconstructType Reconstruct() const
void twistedCloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover *clover, const FullClover *cloverInv, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const QudaTwistCloverDslashType type, const double &kappa, const double &mu, const double &epsilon, const double &k, const int *commDim, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy=QUDA_DSLASH2)
QudaFieldOrder FieldOrder() const
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
QudaPrecision Precision() const
QudaTwistFlavorType TwistFlavor() const
void bindGaugeTex(const cudaGaugeField &gauge, const int oddBit, void **gauge0, void **gauge1)
#define DSLASH_SHARED_FLOATS_PER_THREAD
void unbindTwistedCloverTex(const FullClover clover)
int GhostOffset(const int i) const
const int * GhostFace() const