10 #ifdef GPU_STAGGERED_DIRAC 11 #if (__COMPUTE_CAPABILITY__ >= 300) // Kepler works best with texture loads only 26 #endif // GPU_STAGGERED_DIRAC 37 namespace improvedstaggered {
42 #undef GPU_NDEG_TWISTED_MASS_DIRAC 43 #undef GPU_CLOVER_DIRAC 44 #undef GPU_DOMAIN_WALL_DIRAC 55 using namespace improvedstaggered;
// NOTE(review): this listing is a partially-elided extraction of a QUDA-style
// improved staggered Dslash source.  The numbers fused into each line (57,
// 58, 62, ...) are the ORIGINAL file's line numbers; gaps in that numbering
// mark statements dropped by the extractor.  The code below is therefore not
// compilable as shown — comments describe only what the visible fragments
// establish, and hedge everything else.
//
// GPU driver class for the improved (fat-link + long-link) staggered Dslash.
// Template parameters select storage types: sFloat for the spinor field,
// fatGFloat / longGFloat for the fat- and long-link gauge fields, and
// phaseFloat for the gauge phases.  Derives from the common DslashCuda base.
57 #ifdef GPU_STAGGERED_DIRAC 58 template <
typename sFloat,
typename fatGFloat,
typename longGFloat,
typename phaseFloat>
59 class StaggeredDslashCuda :
public DslashCuda {
// The two gauge fields the improved staggered operator applies.
62 const GaugeField &fatGauge;
63 const GaugeField &longGauge;
// Number of sources; initialized from in->X(4) in the constructor
// (presumably multiple right-hand sides packed in the 5th dimension —
// TODO confirm against the caller).
64 const unsigned int nSrc;
// Opt in to autotuning over the auxiliary (aux) tuning dimension.
67 bool tuneAuxDim()
const {
return true; }
// Per-thread shared-memory request.  Only the register word size is visible:
// sizeof(double) when the spinor type is double2, sizeof(float) otherwise.
// The remainder of the computation (original lines 72-78) is elided.
68 unsigned int sharedBytesPerThread()
const 71 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(
double) :
sizeof(
float));
// Constructor: binds the gauge fields, validates local lattice extents and
// precision consistency, and fills dslashParam.  Several initializer-list
// entries and statements (original lines 82, 84-85, 87, 89-95, 100-103,
// 107-108) are elided in this listing.
79 StaggeredDslashCuda(cudaColorSpinorField *
out,
const GaugeField &fatGauge,
const GaugeField &longGauge,
80 const cudaColorSpinorField *
in,
const cudaColorSpinorField *
x,
const double a,
81 const int parity,
const int dagger,
const int *commOverride)
83 fatGauge(fatGauge), longGauge(longGauge), nSrc(
in->
X(4))
// Scan the four space-time dimensions; the guard condition (original line
// 87) is elided — only the error path for a partitioned dimension with
// local extent < 6 is visible.
86 for(
int i=0;
i < 4;
i++){
88 errorQuda(
"ERROR: partitioned dimension with local size less than 6 is not supported in improved staggered dslash\n");
// The spinor and both gauge fields must all share one precision.
96 if (
in->Precision() != fatGauge.Precision() ||
in->Precision() != longGauge.Precision()){
97 errorQuda(
"Mixing gauge and spinor precision not supported" 98 "(precision=%d, fatlinkGauge.precision=%d, longGauge.precision=%d",
99 in->Precision(), fatGauge.Precision(), longGauge.Precision());
// Kernel parameters: fat-link normalization, and the long-link coefficient
// (reciprocal of the long gauge field's scale) kept in both double and
// float so either-precision kernels can use it.
104 dslashParam.fat_link_max = fatGauge.LinkMax();
105 dslashParam.coeff = 1.0/longGauge.Scale();
106 dslashParam.coeff_f = (
float)dslashParam.coeff;
// Destructor: releases the spinor texture bindings taken for in/out/x.
// (Any other teardown, original lines 111-114, is elided.)
109 virtual ~StaggeredDslashCuda() {
110 unbindSpinorTex<sFloat>(
in,
out,
x);
// Launch the dslash kernel on the given stream.  Visible fragments set the
// gauge strides and the swizzle factor from the tuned parameters; the
// texture-binding branch (between the #ifndef/#endif, original line 118)
// and the actual kernel invocation are elided.
115 void apply(
const cudaStream_t &
stream)
117 #ifndef USE_TEXTURE_OBJECTS 119 #endif // USE_TEXTURE_OBJECTS 122 dslashParam.gauge_stride = fatGauge.Stride();
123 dslashParam.long_gauge_stride = longGauge.Stride();
124 dslashParam.swizzle = tp.aux.x;
// Advance the tuning block dimension, bounding the candidate block by the
// device's shared memory per block.  The surrounding loop/condition
// (original lines 129-138) is only partially visible.
128 bool advanceBlockDim(TuneParam &
param)
const 130 const unsigned int max_shared =
deviceProp.sharedMemPerBlock;
133 sharedBytesPerThread()*
param.block.x*
param.block.y < max_shared &&
139 bool rtn = DslashCuda::advanceBlockDim(
param);
// Advance the auxiliary tuning dimension (body, original lines 147-160,
// entirely elided).
146 bool advanceAux(TuneParam &
param)
const 161 void initTuneParam(TuneParam &
param)
const 163 DslashCuda::initTuneParam(
param);
// Default tuning parameters are just the initial ones.
169 void defaultTuneParam(TuneParam &
param)
const { initTuneParam(
param); }
// Halo depth is 6 faces — presumably because the long links reach three
// sites, doubling the usual staggered depth of 3 — TODO confirm.
171 int Nface()
const {
return 6; }
// Flop count for performance reporting.  mv_flops = (8*Nc - 2)*Nc is the
// real-flop cost of one complex matrix-vector product; ghost_flops and
// xpay_flops follow the same accounting.  The switch-case labels (per
// kernel_type, presumably interior/exterior variants — TODO confirm) are
// elided; only the per-case bodies are visible.
184 virtual long long flops()
const {
185 int mv_flops = (8 *
in->Ncolor() - 2) *
in->Ncolor();
186 int ghost_flops = (3 + 1) * (mv_flops + 2*
in->Ncolor()*
in->Nspin());
187 int xpay_flops = 2 * 2 *
in->Ncolor() *
in->Nspin();
191 switch(dslashParam.kernel_type) {
// Single-face exterior case: both parities of one ghost face.
196 flops = ghost_flops * 2 *
in->GhostFace()[dslashParam.kernel_type];
// All-faces exterior case: sum the four ghost faces.
200 long long ghost_sites = 2 * (
in->GhostFace()[0]+
in->GhostFace()[1]+
in->GhostFace()[2]+
in->GhostFace()[3]);
201 flops = ghost_flops * ghost_sites;
// Full-volume case ('num_dir' is defined outside this fragment).
207 long long sites =
in->VolumeCB();
208 flops = (2*num_dir*mv_flops +
209 (2*num_dir-1)*2*
in->Ncolor()*
in->Nspin()) * sites;
// xpay contribution only when an accumulate field x was supplied.
210 if (
x)
flops += xpay_flops * sites;
// Subtract ghost work already counted for partitioned dimensions.
214 long long ghost_sites = 0;
215 for (
int d=0;
d<4;
d++)
if (dslashParam.commDim[
d]) ghost_sites += 2 *
in->GhostFace()[
d];
216 flops -= ghost_flops * ghost_sites;
// Byte-traffic estimate, mirroring flops() above.  'reconstruct' and
// 'gauge_bytes_fat' are defined outside this fragment (original line 225
// is elided).  Half precision is detected by comparing Precision() to
// sizeof(short) (2 bytes); half spinors then load one extra float per
// site (presumably a norm value — TODO confirm).
224 virtual long long bytes()
const {
226 int gauge_bytes_long = reconstruct *
in->Precision();
227 bool isHalf =
in->Precision() ==
sizeof(short) ?
true :
false;
228 int spinor_bytes = 2 *
in->Ncolor() *
in->Nspin() *
in->Precision() + (isHalf ?
sizeof(
float) : 0);
229 int ghost_bytes = 3 * (spinor_bytes + gauge_bytes_long) + (spinor_bytes + gauge_bytes_fat) + spinor_bytes;
233 switch(dslashParam.kernel_type) {
// Single-face exterior case.
238 bytes = ghost_bytes * 2 *
in->GhostFace()[dslashParam.kernel_type];
// All-faces exterior case.
242 long long ghost_sites = 2 * (
in->GhostFace()[0]+
in->GhostFace()[1]+
in->GhostFace()[2]+
in->GhostFace()[3]);
243 bytes = ghost_bytes * ghost_sites;
// Full-volume case (the trailing term of the sum, original line 252, is
// elided).
249 long long sites =
in->VolumeCB();
250 bytes = (num_dir*(gauge_bytes_fat + gauge_bytes_long) +
251 num_dir*2*spinor_bytes +
253 if (
x)
bytes += spinor_bytes;
// Subtract ghost traffic already counted for partitioned dimensions.
257 long long ghost_sites = 0;
258 for (
int d=0;
d<4;
d++)
if (dslashParam.commDim[
d]) ghost_sites += 2*
in->GhostFace()[
d];
259 bytes -= ghost_bytes * ghost_sites;
// Host-side entry point that instantiates the templated dslash driver for
// the precision in use and launches it through the communication-policy
// tuner.  The signature head (original lines 269-276; full form appears in
// the scraper's index at the bottom of this listing) and the dispatch
// conditions (original lines 278-294) are elided here — only fragments are
// visible.
268 #endif // GPU_STAGGERED_DIRAC 275 const double &k,
const int *commOverride,
TimeProfile &profile)
// Precision dispatch: double / single / half instantiations.  Note the
// vector types: long links use float4/short4 in single/half precision while
// fat links stay float2/short2; the selecting if/else conditions are elided.
277 #ifdef GPU_STAGGERED_DIRAC 282 dslash =
new StaggeredDslashCuda<double2, double2, double2, double>
285 dslash =
new StaggeredDslashCuda<float2, float2, float4, float>
288 dslash =
new StaggeredDslashCuda<short2, short2, short4, short>
// Per-dimension ghost-face sizes, normalized by X(4) (presumably the
// number of right-hand sides packed in the 5th dimension — TODO confirm).
295 for (
int i=0;
i<4;
i++) ghostFace[
i] =
in->GhostFace()[
i] /
in->X(4);
// Autotune across communication policies and run; the volume is likewise
// normalized by X(4).
297 DslashPolicyTune dslash_policy(*
dslash, const_cast<cudaColorSpinorField*>(
in),
in->Volume()/
in->X(4), ghostFace, profile);
298 dslash_policy.apply(0);
// Compile-time fallback when staggered support was not built.
302 errorQuda(
"Staggered dslash has not been built");
303 #endif // GPU_STAGGERED_DIRAC
void bindFatGaugeTex(const cudaGaugeField &gauge, const int oddBit, T &dslashParam)
cudaDeviceProp deviceProp
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
void unbindLongGaugeTex(const cudaGaugeField &gauge)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void improvedStaggeredDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &fatGauge, const cudaGaugeField &longGauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile)
cpuColorSpinorField * out
void bindLongGaugeTex(const cudaGaugeField &gauge, const int oddBit, T &dslashParam)
#define IMPROVED_STAGGERED_DSLASH(gridDim, blockDim, shared, stream, param)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
static __inline__ size_t size_t d
int comm_dim_partitioned(int dim)
void unbindFatGaugeTex(const cudaGaugeField &gauge)