// NOTE(review): this file is an extraction artifact — the leading numeric
// tokens (e.g. "10", "17", "63") are embedded original source line numbers,
// lines are wrapped mid-expression, and several ranges are elided.  Do not
// attempt to compile as-is; comments below document only what is visible.
// Feature gates for the QUDA dslash kernels.  GPU_STAGGERED_DIRAC is
// force-undefined (stated Tesla-architecture hack) and GPU_DOMAIN_WALL_DIRAC
// force-defined (stated workaround for a CUDA 6.5 alignment bug).
10 #ifdef GPU_WILSON_DIRAC 17 #endif // GPU_WILSON_DIRAC 30 #undef GPU_STAGGERED_DIRAC // do not delete - hack for Tesla architecture 32 #ifndef GPU_DOMAIN_WALL_DIRAC 33 #define GPU_DOMAIN_WALL_DIRAC // do not delete - work around for CUDA 6.5 alignment bug 44 #ifdef GPU_CLOVER_DIRAC 50 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD 51 #define DSLASH_SHARED_FLOATS_PER_THREAD 0 63 #ifdef GPU_CLOVER_DIRAC 64 template <
// sFloat: spinor storage type, gFloat: gauge storage type, cFloat: clover
// storage type (instantiated below as double2 / float4 / short4 triples).
typename sFloat,
typename gFloat,
typename cFloat>
// Reference to the inverse clover term applied by the kernel.
// NOTE(review): the class head (original lines ~64-67) is elided here.
68 const FullClover &cloverInv;
// Shared-memory requirement per thread for the tuner: the per-thread
// register size is 8 bytes (double) when the spinor type is double2,
// otherwise 4 bytes (float).  NOTE(review): the return statement
// (original lines ~74-78) is elided from this extraction.
70 unsigned int sharedBytesPerThread()
const 73 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(
double) :
sizeof(
float));
// Constructor: wires the output/input spinors, gauge field, inverse clover
// term, accumulator x, scale a, parity/dagger flags and per-dimension comm
// overrides into the kernel wrapper.  NOTE(review): the base-class
// initializer list (original lines ~82-84) is elided from this extraction.
80 CloverDslashCuda(cudaColorSpinorField *
out,
const GaugeField &gauge,
const FullClover &cloverInv,
const cudaColorSpinorField *
in,
81 const cudaColorSpinorField *
x,
const double a,
const int parity,
const int dagger,
const int *commOverride)
// Mixed precision between the clover field and the spinor is not supported:
// fail hard rather than silently mis-read one of the fields.
85 if (
in->Precision() != clover_prec)
errorQuda(
"Mixing clover and spinor precision not supported");
// Record the inverse-clover field's stride in the kernel parameter struct.
88 dslashParam.cl_stride = cloverInv.stride;
// Destructor: release the spinor texture bindings taken for in/out/x.
91 virtual ~CloverDslashCuda() {
92 unbindSpinorTex<sFloat>(
in,
out,
x);
// Launch the clover-dslash kernel on the given stream using the autotuned
// launch configuration tp.  NOTE(review): the tuneLaunch() call and the
// non-texture-object parameter setup (original lines ~99-105) are elided
// from this extraction.
96 void apply(
const cudaStream_t &
stream)
98 #ifdef SHARED_WILSON_DSLASH 101 #ifndef USE_TEXTURE_OBJECTS 103 #endif // USE_TEXTURE_OBJECTS 106 dslashParam.block[0] = tp.aux.x; dslashParam.block[1] = tp.aux.y; dslashParam.block[2] = tp.aux.z; dslashParam.block[3] = tp.aux.w;
// Derive the 4-d grid from the tuned block sizes; the factor of 2 on the
// first dimension presumably restores the full x-extent from the
// checkerboarded X(0) — TODO confirm against the field layout.
107 for (
int i=0;
i<4;
i++) dslashParam.grid[
i] = ( (
i==0 ? 2 : 1) *
in->X(
i)) / dslashParam.block[
i];
// DSLASH macro expands to the precision/dagger-specific kernel launch.
108 DSLASH(cloverDslash, tp.grid, tp.block, tp.shared_bytes,
stream, dslashParam);
// Flop count for tuning/performance reporting.  clover_flops = 504 is the
// per-site cost of applying the clover term.  NOTE(review): the switch's
// case labels (exterior/interior kernel types) and the base-class flops
// accumulation are elided from this extraction; only the per-case bodies
// are visible below.
111 long long flops()
const {
112 int clover_flops = 504;
114 switch(dslashParam.kernel_type) {
// Single-dimension exterior kernel: clover applied on that ghost face only.
119 flops += clover_flops *
in->GhostFace()[dslashParam.kernel_type];
// All-dimension exterior kernel: both faces of every dimension.
122 flops += clover_flops * 2 * (
in->GhostFace()[0]+
in->GhostFace()[1]+
in->GhostFace()[2]+
in->GhostFace()[3]);
// Interior kernel: clover on every checkerboard site ...
126 flops += clover_flops *
in->VolumeCB();
// ... minus the ghost sites that the exterior kernels handle, summed over
// each communication-partitioned dimension (two faces per dimension).
130 long long ghost_sites = 0;
131 for (
int d=0;
d<4;
d++)
if (dslashParam.commDim[
d]) ghost_sites += 2 *
in->GhostFace()[
d];
132 flops -= clover_flops * ghost_sites;
// Byte-traffic estimate for tuning/performance reporting.
139 long long bytes()
const {
// NOTE(review): detecting half precision by comparing Precision() against
// sizeof(short) relies on the precision enum's numeric value equalling 2 —
// fragile; confirm against the QudaPrecision definition.
140 bool isHalf =
in->Precision() ==
sizeof(short) ?
true :
false;
// 72 real numbers per site for the clover term; half precision adds two
// float norm values per site.
141 int clover_bytes = 72 *
in->Precision() + (isHalf ? 2*
sizeof(
float) : 0);
// NOTE(review): as in flops(), the switch's case labels are elided from
// this extraction; only the per-case bodies are visible below.
144 switch(dslashParam.kernel_type) {
// Single-dimension exterior kernel: traffic on that ghost face (factor 2
// presumably read+write — TODO confirm).
149 bytes += clover_bytes * 2 *
in->GhostFace()[dslashParam.kernel_type];
// All-dimension exterior kernel: both faces of every dimension.
152 bytes += clover_bytes * 2 * (
in->GhostFace()[0]+
in->GhostFace()[1]+
in->GhostFace()[2]+
in->GhostFace()[3]);
// Interior kernel: every checkerboard site ...
156 bytes += clover_bytes*
in->VolumeCB();
// ... minus the ghost sites handled by the exterior kernels.
160 long long ghost_sites = 0;
161 for (
int d=0;
d<4;
d++)
if (dslashParam.commDim[
d]) ghost_sites += 2*
in->GhostFace()[
d];
162 bytes -= clover_bytes * ghost_sites;
171 #endif // GPU_CLOVER_DIRAC 180 #ifdef GPU_CLOVER_DIRAC 185 dslash =
// Host-side dispatch: instantiate the kernel wrapper with the storage-type
// triple (spinor, gauge, clover) matching the field precision —
// double2 / float4 / short4.  NOTE(review): the precision if/else tests
// selecting between the three branches (original lines ~184-190) are
// elided from this extraction.
new CloverDslashCuda<double2, double2, double2>(
out, gauge, cloverInv,
in,
x,
a,
parity,
dagger, commOverride);
187 dslash =
new CloverDslashCuda<float4, float4, float4>(
out, gauge, cloverInv,
in,
x,
a,
parity,
dagger, commOverride);
189 dslash =
new CloverDslashCuda<short4, short4, short4>(
out, gauge, cloverInv,
in,
x,
a,
parity,
dagger, commOverride);
// Autotune the communication/compute overlap policy for this dslash, then
// execute it (argument 0 presumably selects the default stream/policy —
// TODO confirm against DslashPolicyTune::apply).
192 DslashPolicyTune dslash_policy(*
dslash, const_cast<cudaColorSpinorField*>(
in),
in->Volume(),
in->GhostFace(), profile);
193 dslash_policy.apply(0);
// Fallback when QUDA was configured without clover support.
197 errorQuda(
"Clover dslash has not been built");
virtual long long bytes() const
QudaPrecision bindCloverTex(const FullClover &clover, const int oddBit, T &dslashParam)
enum QudaPrecision_s QudaPrecision
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
void cloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover &cloverInv, const cudaColorSpinorField *in, const int oddBit, const int daggerBit, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile)
#define DSLASH_SHARED_FLOATS_PER_THREAD
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void unbindCloverTex(const FullClover clover)
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param)
cpuColorSpinorField * out
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting the QUDA_ENABLE_TUNING environment variable to 0.
virtual long long flops() const
static __inline__ size_t size_t d