10 #ifdef GPU_WILSON_DIRAC
17 #endif // GPU_WILSON_DIRAC
30 namespace ndegtwisted {
34 #include <dslash_index.cuh>
41 #if (__COMPUTE_CAPABILITY__ >= 200) && defined(GPU_NDEG_TWISTED_MASS_DIRAC)
45 #ifndef NDEGTM_SHARED_FLOATS_PER_THREAD
46 #define NDEGTM_SHARED_FLOATS_PER_THREAD 0
49 #include <dslash_quda.cuh>
54 #include <dslash_events.cuh>
56 using namespace ndegtwisted;
58 #if (__COMPUTE_CAPABILITY__ >= 200) && defined(GPU_NDEG_TWISTED_MASS_DIRAC)
59 template <
typename sFloat,
typename gFloat>
60 class NdegTwistedDslashCuda :
public SharedDslashCuda {
63 const gFloat *gauge0, *gauge1;
68 unsigned int sharedBytesPerThread()
const
70 #if (__COMPUTE_CAPABILITY__ >= 200)
72 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
78 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
84 NdegTwistedDslashCuda(cudaColorSpinorField *
out,
const gFloat *gauge0,
const gFloat *gauge1,
87 const double epsilon,
const double k,
const int dagger)
88 : SharedDslashCuda(out, in, x, reconstruct, dagger), gauge0(gauge0), gauge1(gauge1), dslashType(dslashType)
90 bindSpinorTex<sFloat>(
in,
out,
x);
96 dslashParam.fl_stride = in->VolumeCB()/2;
98 virtual ~NdegTwistedDslashCuda() { unbindSpinorTex<sFloat>(
in,
out,
x); }
100 TuneKey tuneKey()
const
102 TuneKey key = DslashCuda::tuneKey();
103 strcat(key.aux,
",NdegDslash");
107 void apply(
const cudaStream_t &
stream)
110 #ifdef SHARED_WILSON_DSLASH
112 errorQuda(
"Shared dslash does not yet support X-dimension partitioning");
115 NDEG_TM_DSLASH(twistedNdegMassDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
116 (sFloat*)out->V(), (
float*)out->Norm(), gauge0, gauge1,
117 (sFloat*)in->V(), (
float*)in->Norm(), a, b, c, d, (sFloat*)(x ? x->V() : 0), (
float*)(x ? x->Norm() : 0));
120 long long flops()
const {
return (x ? 1416ll : 1392ll) * in->VolumeCB(); }
122 #endif // GPU_NDEG_TWISTED_MASS_DIRAC
125 #include <dslash_policy.cuh>
130 const double &kappa,
const double &mu,
const double &epsilon,
131 const double &k,
const int *commOverride,
TimeProfile &profile,
135 #if (__COMPUTE_CAPABILITY__ >= 200) && defined(GPU_NDEG_TWISTED_MASS_DIRAC)
138 int ghost_threads[4] = {0};
139 int bulk_threads = in->
Volume() / 2;
141 for(
int i=0;i<4;i++){
146 ghost_threads[i] = in->
GhostFace()[i] / 2;
149 void *gauge0, *gauge1;
153 errorQuda(
"Mixing gauge and spinor precision not supported");
155 DslashCuda *dslash = 0;
156 size_t regSize =
sizeof(float);
159 #if (__COMPUTE_CAPABILITY__ >= 130)
160 dslash =
new NdegTwistedDslashCuda<double2,double2>(
out, (double2*)gauge0,(double2*)gauge1, gauge.
Reconstruct(),
in,
x, type,
kappa,
mu, epsilon, k,
dagger);
161 regSize =
sizeof(double);
163 errorQuda(
"Double precision not supported on this GPU");
166 dslash =
new NdegTwistedDslashCuda<float4,float4>(
out, (float4*)gauge0,(float4*)gauge1, gauge.
Reconstruct(),
in,
x, type,
kappa,
mu, epsilon, k,
dagger);
169 dslash =
new NdegTwistedDslashCuda<short4,short4>(
out, (short4*)gauge0,(short4*)gauge1, gauge.
Reconstruct(),
in,
x, type,
kappa,
mu, epsilon, k,
dagger);
173 DslashPolicyImp* dslashImp = DslashFactory::create(dslashPolicy);
177 (*dslashImp)(*dslash,
const_cast<cudaColorSpinorField*
>(
in), regSize, parity, dagger, bulk_threads, ghost_threads, profile);
187 #if (__COMPUTE_CAPABILITY__ < 200)
188 errorQuda(
"Non-degenerate twisted-mass fermions not supported on pre-Fermi architecture");
190 errorQuda(
"Non-degenerate twisted mass dslash has not been built");
void unbindGaugeTex(const cudaGaugeField &gauge)
#define NDEGTM_SHARED_FLOATS_PER_THREAD
int commDimPartitioned(int dir)
QudaVerbosity getVerbosity()
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
int GhostNormOffset(const int i) const
QudaPrecision Precision() const
void ndegTwistedMassDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const QudaTwistDslashType type, const double &kappa, const double &mu, const double &epsilon, const double &k, const int *commDim, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy=QUDA_DSLASH)
VOLATILE spinorFloat kappa
enum QudaDslashPolicy_s QudaDslashPolicy
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
QudaReconstructType Reconstruct() const
QudaFieldOrder FieldOrder() const
enum QudaTwistDslashType_s QudaTwistDslashType
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
QudaPrecision Precision() const
void bindGaugeTex(const cudaGaugeField &gauge, const int oddBit, void **gauge0, void **gauge1)
int GhostOffset(const int i) const
const int * GhostFace() const