10 #ifdef GPU_WILSON_DIRAC 17 #endif // GPU_WILSON_DIRAC 28 namespace asym_clover {
30 #undef GPU_STAGGERED_DIRAC 39 #ifdef GPU_CLOVER_DIRAC 45 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD 46 #define DSLASH_SHARED_FLOATS_PER_THREAD 0 56 using namespace asym_clover;
58 #ifdef GPU_CLOVER_DIRAC 59 template <
typename sFloat,
typename gFloat,
typename cFloat>
65 unsigned int sharedBytesPerThread()
const 68 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(
double) :
sizeof(
float));
76 AsymCloverDslashCuda(cudaColorSpinorField *
out,
const GaugeField &gauge,
const FullClover &
clover,
77 const cudaColorSpinorField *
in,
const cudaColorSpinorField *
x,
const double a,
78 const int parity,
const int dagger,
const int *commOverride)
82 if (
in->Precision() != clover_prec)
errorQuda(
"Mixing clover and spinor precision not supported");
85 dslashParam.cl_stride =
clover.stride;
86 dslashParam.rho =
clover.rho;
87 dslashParam.rho_f =
clover.rho;
89 if (!
x)
errorQuda(
"Asymmetric clover dslash only defined for Xpay");
92 virtual ~AsymCloverDslashCuda() {
93 unbindSpinorTex<sFloat>(
in,
out,
x);
97 void apply(
const cudaStream_t &
stream)
99 #ifdef SHARED_WILSON_DSLASH 102 #ifndef USE_TEXTURE_OBJECTS 104 #endif // USE_TEXTURE_OBJECTS 107 dslashParam.block[0] = tp.aux.x; dslashParam.block[1] = tp.aux.y; dslashParam.block[2] = tp.aux.z; dslashParam.block[3] = tp.aux.w;
108 for (
int i=0;
i<4;
i++) dslashParam.grid[
i] = ( (
i==0 ? 2 : 1) *
in->X(
i)) / dslashParam.block[
i];
109 ASYM_DSLASH(asymCloverDslash, tp.grid, tp.block, tp.shared_bytes,
stream, dslashParam);
112 long long flops()
const {
113 int clover_flops = 504;
115 switch(dslashParam.kernel_type) {
125 flops += clover_flops *
in->VolumeCB();
131 long long bytes()
const {
132 bool isHalf =
in->Precision() ==
sizeof(short) ?
true :
false;
133 int clover_bytes = 72 *
in->Precision() + (isHalf ? 2*
sizeof(
float) : 0);
135 switch(dslashParam.kernel_type) {
144 bytes += clover_bytes*
in->VolumeCB();
152 #endif // GPU_CLOVER_DIRAC 161 #ifdef GPU_CLOVER_DIRAC 166 dslash =
new AsymCloverDslashCuda<double2, double2, double2>(
out, gauge,
clover,
in,
x,
a,
parity,
dagger, commOverride);
168 dslash =
new AsymCloverDslashCuda<float4, float4, float4>(
out, gauge,
clover,
in,
x,
a,
parity,
dagger, commOverride);
170 dslash =
new AsymCloverDslashCuda<short4, short4, short4>(
out, gauge,
clover,
in,
x,
a,
parity,
dagger, commOverride);
173 DslashPolicyTune dslash_policy(*
dslash, const_cast<cudaColorSpinorField*>(
in),
in->Volume(),
in->GhostFace(), profile);
174 dslash_policy.apply(0);
178 errorQuda(
"Clover dslash has not been built");
virtual long long bytes() const
QudaPrecision bindCloverTex(const FullClover &clover, const int oddBit, T &dslashParam)
enum QudaPrecision_s QudaPrecision
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
#define DSLASH_SHARED_FLOATS_PER_THREAD
#define ASYM_DSLASH(FUNC, gridDim, blockDim, shared, stream, param)
void asymCloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover &cloverInv, const cudaColorSpinorField *in, const int oddBit, const int daggerBit, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void unbindCloverTex(const FullClover clover)
cpuColorSpinorField * out
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
virtual long long flops() const