10 #ifdef GPU_WILSON_DIRAC 17 #endif // GPU_WILSON_DIRAC 28 namespace twistedclover {
38 #ifdef GPU_TWISTED_CLOVER_DIRAC 42 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD 43 #define DSLASH_SHARED_FLOATS_PER_THREAD 0 53 using namespace twistedclover;
55 #ifdef GPU_TWISTED_CLOVER_DIRAC 56 template <
typename sFloat,
typename gFloat,
typename cFloat>
63 const FullClover &cloverInv;
66 unsigned int sharedBytesPerThread()
const 69 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(
double) :
sizeof(
float));
77 TwistedCloverDslashCuda(cudaColorSpinorField *
out,
const GaugeField &gauge,
const FullClover &
clover,
const FullClover &cloverInv,
80 const double kappa,
const double mu,
const double epsilon,
const double k,
81 const int parity,
const int dagger,
const int *commOverride)
85 if (
in->Precision() != clover_prec)
errorQuda(
"Mixing clover and spinor precision not supported");
87 #ifndef DYNAMIC_CLOVER 88 if (
clover.stride != cloverInv.stride)
89 errorQuda(
"clover and cloverInv must have matching strides (%d != %d)",
clover.stride, cloverInv.stride);
97 dslashParam.twist_a = 0.0;
98 dslashParam.twist_b = 0.0;
99 dslashParam.a =
kappa;
100 dslashParam.a_f =
kappa;
102 dslashParam.b_f =
mu;
103 dslashParam.cl_stride =
clover.stride;
104 dslashParam.fl_stride =
in->VolumeCB();
107 virtual ~TwistedCloverDslashCuda() {
108 unbindSpinorTex<sFloat>(
in,
out,
x);
112 void apply(
const cudaStream_t &
stream)
114 #ifdef SHARED_WILSON_DSLASH 117 #ifndef USE_TEXTURE_OBJECTS 119 #endif // USE_TEXTURE_OBJECTS 122 dslashParam.block[0] = tp.aux.x; dslashParam.block[1] = tp.aux.y; dslashParam.block[2] = tp.aux.z; dslashParam.block[3] = tp.aux.w;
123 for (
int i=0;
i<4;
i++) dslashParam.grid[
i] = ( (
i==0 ? 2 : 1) *
in->X(
i)) / dslashParam.block[
i];
127 DSLASH(twistedCloverInvDslash, tp.grid, tp.block, tp.shared_bytes,
stream, dslashParam);
130 DSLASH(twistedCloverDslash, tp.grid, tp.block, tp.shared_bytes,
stream, dslashParam);
133 DSLASH(twistedCloverDslashTwist, tp.grid, tp.block, tp.shared_bytes,
stream, dslashParam);
136 errorQuda(
"Invalid twisted clover dslash type");
140 TuneKey tuneKey()
const 143 switch (dslashType) {
145 #ifndef DYNAMIC_CLOVER 146 strcat(key.aux,
",CloverTwistInvDslash");
148 strcat(key.aux,
",CloverTwistInvDynDslash");
152 #ifndef DYNAMIC_CLOVER 153 strcat(key.aux,
",Dslash");
155 strcat(key.aux,
",DynDslash");
159 #ifndef DYNAMIC_CLOVER 160 strcat(key.aux,
",DslashCloverTwist");
162 strcat(key.aux,
",DynDslashCloverTwist");
166 errorQuda(
"Unsupported twisted-dslash type %d", dslashType);
171 long long flops()
const {
172 int clover_flops = 504 + 48;
174 switch(dslashParam.kernel_type) {
184 flops += clover_flops *
in->VolumeCB();
190 long long bytes()
const {
191 bool isHalf =
in->Precision() ==
sizeof(short) ?
true :
false;
192 int clover_bytes = 72 *
in->Precision() + (isHalf ? 2*
sizeof(
float) : 0);
194 switch(dslashParam.kernel_type) {
203 bytes += clover_bytes*
in->VolumeCB();
211 #endif // GPU_TWISTED_CLOVER_DIRAC 218 const double &epsilon,
const double &k,
const int *commOverride,
TimeProfile &profile)
220 #ifdef GPU_TWISTED_CLOVER_DIRAC 225 dslash =
new TwistedCloverDslashCuda<double2,double2,double2>
226 (
out, gauge, *
clover, *cloverInv,
in,
x, type,
kappa,
mu, epsilon, k,
parity,
dagger, commOverride);
228 dslash =
new TwistedCloverDslashCuda<float4,float4,float4>
229 (
out, gauge, *
clover, *cloverInv,
in,
x, type,
kappa,
mu, epsilon, k,
parity,
dagger, commOverride);
231 dslash =
new TwistedCloverDslashCuda<short4,short4,short4>
232 (
out, gauge, *
clover, *cloverInv,
in,
x, type,
kappa,
mu, epsilon, k,
parity,
dagger, commOverride);
235 int ghost_threads[4] = {0};
239 DslashPolicyTune dslash_policy(*
dslash, const_cast<cudaColorSpinorField*>(
in), bulk_threads, ghost_threads, profile);
240 dslash_policy.apply(0);
244 errorQuda(
"Twisted clover dslash has not been built");
virtual long long bytes() const
enum QudaPrecision_s QudaPrecision
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
QudaPrecision bindTwistedCloverTex(const FullClover clover, const FullClover cloverInv, const int oddBit, T &dslashParam)
enum QudaTwistCloverDslashType_s QudaTwistCloverDslashType
void twistedCloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover *clover, const FullClover *cloverInv, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const QudaTwistCloverDslashType type, const double &kappa, const double &mu, const double &epsilon, const double &k, const int *commDim, TimeProfile &profile)
char * strcat(char *__s1, const char *__s2)
VOLATILE spinorFloat kappa
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param)
cpuColorSpinorField * out
virtual TuneKey tuneKey() const
#define DSLASH_SHARED_FLOATS_PER_THREAD
void unbindTwistedCloverTex(const FullClover clover)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
virtual long long flops() const
static __inline__ size_t size_t d