10 #ifdef GPU_STAGGERED_DIRAC 11 #if (__COMPUTE_CAPABILITY__ >= 300) // Kepler works best with texture loads only 27 #endif // GPU_STAGGERED_DIRAC 43 #undef GPU_CLOVER_DIRAC 44 #undef GPU_DOMAIN_WALL_DIRAC 55 using namespace staggered;
57 #ifdef GPU_STAGGERED_DIRAC 58 template <
typename sFloat,
typename gFloat>
59 class StaggeredDslashCuda :
public DslashCuda {
62 const unsigned int nSrc;
65 bool tuneAuxDim()
const {
return true; }
66 unsigned int sharedBytesPerThread()
const 69 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(
double) :
sizeof(
float));
77 StaggeredDslashCuda(cudaColorSpinorField *
out,
const GaugeField &gauge,
const cudaColorSpinorField *
in,
78 const cudaColorSpinorField *
x,
const double a,
79 const int parity,
const int dagger,
const int *commOverride)
83 errorQuda(
"Reconstruct %d not supported", gauge.Reconstruct());
87 dslashParam.fat_link_max = gauge.LinkMax();
90 virtual ~StaggeredDslashCuda() { unbindSpinorTex<sFloat>(
in,
out,
x); }
92 void apply(
const cudaStream_t &
stream)
94 #ifndef USE_TEXTURE_OBJECTS 96 #endif // USE_TEXTURE_OBJECTS 99 dslashParam.swizzle = tp.aux.x;
103 bool advanceBlockDim(TuneParam &
param)
const 105 const unsigned int max_shared =
deviceProp.sharedMemPerBlock;
108 sharedBytesPerThread()*
param.block.x*
param.block.y < max_shared &&
114 bool rtn = DslashCuda::advanceBlockDim(
param);
121 bool advanceAux(TuneParam &
param)
const 136 void initTuneParam(TuneParam &
param)
const 138 DslashCuda::initTuneParam(
param);
144 void defaultTuneParam(TuneParam &
param)
const { initTuneParam(
param); }
146 int Nface()
const {
return 2; }
148 #endif // GPU_STAGGERED_DIRAC 155 const double &k,
const int *commOverride,
TimeProfile &profile)
157 #ifdef GPU_STAGGERED_DIRAC 172 for (
int i=0;
i<4;
i++) ghostFace[
i] =
in->GhostFace()[
i] /
in->X(4);
174 DslashPolicyTune dslash_policy(*
dslash, const_cast<cudaColorSpinorField*>(
in),
in->Volume()/
in->X(4), ghostFace, profile);
175 dslash_policy.apply(0);
179 errorQuda(
"Staggered dslash has not been built");
180 #endif // GPU_STAGGERED_DIRAC
cudaDeviceProp deviceProp
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
void staggeredDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
cpuColorSpinorField * out
#define STAGGERED_DSLASH(gridDim, blockDim, shared, stream, param)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...