10 #ifdef GPU_WILSON_DIRAC 17 #endif // GPU_WILSON_DIRAC 29 namespace ndegtwisted {
39 #ifdef GPU_NDEG_TWISTED_MASS_DIRAC 43 #ifndef NDEGTM_SHARED_FLOATS_PER_THREAD 44 #define NDEGTM_SHARED_FLOATS_PER_THREAD 0 54 using namespace ndegtwisted;
56 #ifdef GPU_NDEG_TWISTED_MASS_DIRAC 57 template <
typename sFloat,
typename gFloat>
65 unsigned int sharedBytesPerThread()
const 68 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(
double) :
sizeof(
float));
76 NdegTwistedDslashCuda(cudaColorSpinorField *
out,
const GaugeField &gauge,
77 const cudaColorSpinorField *
in,
const cudaColorSpinorField *
x,
79 const double epsilon,
const double k,
const int parity,
const int dagger,
const int *commOverride)
86 dslashParam.a =
kappa;
87 dslashParam.a_f =
kappa;
90 dslashParam.c = epsilon;
91 dslashParam.c_f = epsilon;
96 dslashParam.fl_stride =
in->VolumeCB()/2;
98 virtual ~NdegTwistedDslashCuda() { unbindSpinorTex<sFloat>(
in,
out,
x); }
100 TuneKey tuneKey()
const 103 strcat(key.aux,
",NdegDslash");
107 void apply(
const cudaStream_t &
stream)
109 #ifdef SHARED_WILSON_DSLASH 112 #ifndef USE_TEXTURE_OBJECTS 114 #endif // USE_TEXTURE_OBJECTS 120 long long flops()
const {
121 int twisted_flops = 48;
123 switch(dslashParam.kernel_type) {
133 flops += twisted_flops *
in->VolumeCB();
139 #endif // GPU_NDEG_TWISTED_MASS_DIRAC 147 const double &
kappa,
const double &
mu,
const double &epsilon,
148 const double &k,
const int *commOverride,
TimeProfile &profile)
150 #ifdef GPU_NDEG_TWISTED_MASS_DIRAC 155 dslash =
new NdegTwistedDslashCuda<double2,double2>(
out, gauge,
in,
x, type,
kappa,
mu, epsilon, k,
parity,
dagger, commOverride);
157 dslash =
new NdegTwistedDslashCuda<float4,float4>(
out, gauge,
in,
x, type,
kappa,
mu, epsilon, k,
parity,
dagger, commOverride);
159 dslash =
new NdegTwistedDslashCuda<short4,short4>(
out, gauge,
in,
x, type,
kappa,
mu, epsilon, k,
parity,
dagger, commOverride);
162 int bulk_threads =
in->Volume() / 2;
163 int ghost_threads[4] = {0};
164 for(
int i=0;
i<4;
i++) ghost_threads[
i] =
in->GhostFace()[
i] / 2;
165 DslashPolicyTune dslash_policy(*
dslash, const_cast<cudaColorSpinorField*>(
in), bulk_threads, ghost_threads, profile);
166 dslash_policy.apply(0);
170 errorQuda(
"Non-degenerate twisted mass dslash has not been built");
#define NDEGTM_SHARED_FLOATS_PER_THREAD
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
char * strcat(char *__s1, const char *__s2)
VOLATILE spinorFloat kappa
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
enum QudaTwistDslashType_s QudaTwistDslashType
cpuColorSpinorField * out
void ndegTwistedMassDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const QudaTwistDslashType type, const double &kappa, const double &mu, const double &epsilon, const double &k, const int *commDim, TimeProfile &profile)
virtual TuneKey tuneKey() const
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
virtual long long flops() const
static __inline__ size_t size_t d
#define NDEG_TM_DSLASH(FUNC, gridDim, blockDim, shared, stream, param)