10 #ifdef GPU_WILSON_DIRAC
17 #endif // GPU_WILSON_DIRAC
31 #undef GPU_STAGGERED_DIRAC // do not delete - hack for Tesla architecture
32 #define GPU_DOMAIN_WALL_DIRAC // do not delete - work around for CUDA 6.5 alignment bug
36 #include <dslash_index.cuh>
42 #ifdef GPU_CLOVER_DIRAC
48 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
49 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
52 #include <dslash_quda.cuh>
57 #include <dslash_events.cuh>
59 using namespace clover;
61 #ifdef GPU_CLOVER_DIRAC
62 template <
typename sFloat,
typename gFloat,
typename cFloat>
63 class CloverDslashCuda :
public SharedDslashCuda {
66 const gFloat *gauge0, *gauge1;
68 const float *cloverNorm;
72 unsigned int sharedBytesPerThread()
const
74 #if (__COMPUTE_CAPABILITY__ >= 200)
76 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
82 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
87 CloverDslashCuda(cudaColorSpinorField *
out,
const gFloat *gauge0,
const gFloat *gauge1,
89 const float *cloverNorm,
int cl_stride,
const cudaColorSpinorField *
in,
90 const cudaColorSpinorField *
x,
const double a,
const int dagger)
91 : SharedDslashCuda(out, in, x, reconstruct, dagger), gauge0(gauge0), gauge1(gauge1), clover(clover),
92 cloverNorm(cloverNorm), a(a)
94 bindSpinorTex<sFloat>(
in,
out,
x);
95 dslashParam.cl_stride = cl_stride;
97 virtual ~CloverDslashCuda() { unbindSpinorTex<sFloat>(
in,
out,
x); }
99 void apply(
const cudaStream_t &
stream)
101 #ifdef SHARED_WILSON_DSLASH
103 errorQuda(
"Shared dslash does not yet support X-dimension partitioning");
106 DSLASH(cloverDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
107 (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1, clover, cloverNorm,
108 (sFloat*)
in->V(), (
float*)
in->Norm(), (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0), a);
111 long long flops()
const {
return (
x ? 1872ll : 1824ll) *
in->VolumeCB(); }
113 #endif // GPU_CLOVER_DIRAC
115 #include <dslash_policy.cuh>
124 #ifdef GPU_CLOVER_DIRAC
126 for(
int i=0;i<4;i++){
133 void *cloverP, *cloverNormP;
136 void *gauge0, *gauge1;
140 errorQuda(
"Mixing gauge and spinor precision not supported");
143 errorQuda(
"Mixing clover and spinor precision not supported");
145 DslashCuda *dslash = 0;
146 size_t regSize =
sizeof(float);
149 #if (__COMPUTE_CAPABILITY__ >= 130)
150 dslash =
new CloverDslashCuda<double2, double2, double2>
152 (double2*)cloverP, (
float*)cloverNormP, cloverInv.
stride,
in,
x, a,
dagger);
153 regSize =
sizeof(double);
155 errorQuda(
"Double precision not supported on this GPU");
158 dslash =
new CloverDslashCuda<float4, float4, float4>
160 (float4*)cloverP, (
float*)cloverNormP, cloverInv.
stride,
in,
x, a,
dagger);
162 dslash =
new CloverDslashCuda<short4, short4, short4>
164 (short4*)cloverP, (
float*)cloverNormP, cloverInv.
stride,
in,
x, a,
dagger);
168 DslashPolicyImp* dslashImp = DslashFactory::create(dslashPolicy);
181 errorQuda(
"Clover dslash has not been built");
void unbindGaugeTex(const cudaGaugeField &gauge)
enum QudaPrecision_s QudaPrecision
int commDimPartitioned(int dir)
QudaVerbosity getVerbosity()
int GhostNormOffset(const int i) const
#define DSLASH_SHARED_FLOATS_PER_THREAD
void cloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover cloverInv, const cudaColorSpinorField *in, const int oddBit, const int daggerBit, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy=QUDA_DSLASH2)
QudaPrecision Precision() const
enum QudaDslashPolicy_s QudaDslashPolicy
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
QudaReconstructType Reconstruct() const
void unbindCloverTex(const FullClover clover)
QudaFieldOrder FieldOrder() const
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
QudaPrecision Precision() const
void bindGaugeTex(const cudaGaugeField &gauge, const int oddBit, void **gauge0, void **gauge1)
QudaPrecision bindCloverTex(const FullClover clover, const int oddBit, void **cloverP, void **cloverNormP)
int GhostOffset(const int i) const
const int * GhostFace() const