2 #include <cuda_runtime.h> 12 #define MIN_COEFF 1e-7 18 template <
typename Float,
typename Link,
typename Gauge>
36 LinkArg(Link link, Gauge u, Float
coeff,
const GaugeField &link_meta,
const GaugeField &u_meta)
37 : threads(link_meta.VolumeCB()), link(link), u(u),
coeff(
coeff)
39 for (
int d=0;
d<4;
d++) {
40 X[
d] = link_meta.X()[
d];
42 border[
d] = (
E[
d] -
X[
d]) / 2;
47 template <
typename Float,
int dir,
typename Arg>
48 __device__
void longLinkDir(Arg &
arg,
int idx,
int parity) {
50 int dx[4] = {0, 0, 0, 0};
52 int *
y =
arg.u.coords;
54 for (
int d=0;
d<4;
d++)
x[
d] +=
arg.border[
d];
56 typedef Matrix<complex<Float>,3> Link;
70 template <
typename Float,
typename Arg>
71 __global__
void computeLongLink(Arg
arg) {
75 int dir = blockIdx.z*
blockDim.z + threadIdx.z;
76 if (
idx >=
arg.threads)
return;
88 template <
typename Float,
typename Arg>
89 class LongLink :
public TunableVectorYZ {
91 const GaugeField &meta;
92 unsigned int minThreads()
const {
return arg.threads; }
93 bool tuneGridDim()
const {
return false; }
96 LongLink(Arg &
arg,
const GaugeField &meta) : TunableVectorYZ(2,4),
arg(
arg), meta(meta) {}
97 virtual ~LongLink() {}
99 void apply(
const cudaStream_t &
stream) {
101 computeLongLink<Float><<<tp.grid,tp.block,tp.shared_bytes>>>(
arg);
104 TuneKey tuneKey()
const {
105 std::stringstream aux;
106 aux <<
"threads=" <<
arg.threads <<
",prec=" <<
sizeof(Float);
107 return TuneKey(meta.VolString(),
typeid(*this).name(), aux.str().c_str());
110 long long flops()
const {
return 2*4*
arg.threads*198; }
111 long long bytes()
const {
return 2*4*
arg.threads*(3*
arg.u.Bytes()+
arg.link.Bytes()); }
114 void computeLongLink(GaugeField &lng,
const GaugeField &u,
double coeff)
117 typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_NO>::type L;
119 typedef LinkArg<double,L,L> Arg;
120 Arg
arg(L(lng), L(u),
coeff, lng, u);
121 LongLink<double,Arg> longLink(
arg,lng);
124 typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
125 typedef LinkArg<double,L,G> Arg;
126 Arg
arg(L(lng), G(u),
coeff, lng, u);
127 LongLink<double,Arg> longLink(
arg,lng);
130 errorQuda(
"Reconstruct %d is not supported\n", u.Reconstruct());
133 typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_NO>::type L;
135 typedef LinkArg<float,L,L> Arg;
136 Arg
arg(L(lng), L(u),
coeff, lng, u) ;
137 LongLink<float,Arg> longLink(
arg,lng);
140 typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
141 typedef LinkArg<float,L,G> Arg;
142 Arg
arg(L(lng), G(u),
coeff, lng, u);
143 LongLink<float,Arg> longLink(
arg,lng);
147 errorQuda(
"Reconstruct %d is not supported\n", u.Reconstruct());
150 errorQuda(
"Unsupported precision %d\n", u.Precision());
155 template <
typename Float,
typename Arg>
156 __global__
void computeOneLink(Arg
arg) {
160 int dir = blockIdx.z *
blockDim.z + threadIdx.z;
161 if (
idx >=
arg.threads)
return;
162 if (dir >= 4)
return;
164 int *
x =
arg.u.coords;
166 for (
int d=0;
d<4;
d++)
x[
d] +=
arg.border[
d];
168 typedef Matrix<complex<Float>,3> Link;
177 template <
typename Float,
typename Arg>
178 class OneLink :
public TunableVectorYZ {
180 const GaugeField &meta;
181 unsigned int minThreads()
const {
return arg.threads; }
182 bool tuneGridDim()
const {
return false; }
185 OneLink(Arg &
arg,
const GaugeField &meta) : TunableVectorYZ(2,4),
arg(
arg), meta(meta) {}
186 virtual ~OneLink() {}
188 void apply(
const cudaStream_t &
stream) {
190 computeOneLink<Float><<<tp.grid,tp.block>>>(
arg);
193 TuneKey tuneKey()
const {
194 std::stringstream aux;
195 aux <<
"threads=" <<
arg.threads <<
",prec=" <<
sizeof(Float);
196 return TuneKey(meta.VolString(),
typeid(*this).name(), aux.str().c_str());
199 long long flops()
const {
return 2*4*
arg.threads*18; }
200 long long bytes()
const {
return 2*4*
arg.threads*(
arg.u.Bytes()+
arg.link.Bytes()); }
203 void computeOneLink(GaugeField &fat,
const GaugeField &u,
double coeff)
206 typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_NO>::type L;
208 typedef LinkArg<double,L,L> Arg;
209 Arg
arg(L(fat), L(u),
coeff, fat, u);
210 OneLink<double,Arg> oneLink(
arg,fat);
213 typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
214 typedef LinkArg<double,L,G> Arg;
215 Arg
arg(L(fat), G(u),
coeff, fat, u);
216 OneLink<double,Arg> oneLink(
arg,fat);
219 errorQuda(
"Reconstruct %d is not supported\n", u.Reconstruct());
222 typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_NO>::type L;
224 typedef LinkArg<float,L,L> Arg;
225 Arg
arg(L(fat), L(u),
coeff, fat, u);
226 OneLink<float,Arg> oneLink(
arg,fat);
229 typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
230 typedef LinkArg<float,L,G> Arg;
231 Arg
arg(L(fat), G(u),
coeff, fat, u);
232 OneLink<float,Arg> oneLink(
arg,fat);
235 errorQuda(
"Reconstruct %d is not supported\n", u.Reconstruct());
238 errorQuda(
"Unsupported precision %d\n", u.Precision());
243 template <
typename Float,
typename Fat,
typename Staple,
typename Mulink,
typename Gauge>
245 unsigned int threads;
269 StapleArg(Fat fat, Staple staple, Mulink mulink, Gauge u, Float
coeff,
270 const GaugeField &fat_meta,
const GaugeField &u_meta)
271 : threads(1), fat(fat), staple(staple), mulink(mulink), u(u),
coeff(
coeff),
274 for (
int d=0;
d<4;
d++) {
275 X[
d] = (fat_meta.X()[
d] + u_meta.X()[
d]) / 2;
276 E[
d] = u_meta.X()[
d];
277 border[
d] = (
E[
d] -
X[
d]) / 2;
280 inner_X[
d] = fat_meta.X()[
d];
281 inner_border[
d] = (
E[
d] - inner_X[
d]) / 2;
287 template<
typename Float,
int mu,
int nu,
typename Arg>
288 __device__
inline void computeStaple(
Matrix<complex<Float>,3> &staple, Arg &
arg,
int x[],
int parity) {
290 int *
y =
arg.u.coords, *y_mu =
arg.mulink.coords, dx[4] = {0, 0, 0, 0};
341 template<
typename Float,
bool save_staple,
typename Arg>
342 __global__
void computeStaple(Arg
arg,
int nu)
346 if (
idx >=
arg.threads)
return;
348 int mu_idx = blockIdx.z*
blockDim.z + threadIdx.z;
349 if (mu_idx >=
arg.n_mu)
return;
352 case 0:
mu =
arg.mu_map[0];
break;
353 case 1:
mu =
arg.mu_map[1];
break;
354 case 2:
mu =
arg.mu_map[2];
break;
359 for (
int d=0;
d<4;
d++)
x[
d] +=
arg.border[
d];
361 typedef Matrix<complex<Float>,3> Link;
366 case 1: computeStaple<Float,0,1>(staple,
arg,
x,
parity);
break;
367 case 2: computeStaple<Float,0,2>(staple,
arg,
x,
parity);
break;
368 case 3: computeStaple<Float,0,3>(staple,
arg,
x,
parity);
break;
372 case 0: computeStaple<Float,1,0>(staple,
arg,
x,
parity);
break;
373 case 2: computeStaple<Float,1,2>(staple,
arg,
x,
parity);
break;
374 case 3: computeStaple<Float,1,3>(staple,
arg,
x,
parity);
break;
378 case 0: computeStaple<Float,2,0>(staple,
arg,
x,
parity);
break;
379 case 1: computeStaple<Float,2,1>(staple,
arg,
x,
parity);
break;
380 case 3: computeStaple<Float,2,3>(staple,
arg,
x,
parity);
break;
384 case 0: computeStaple<Float,3,0>(staple,
arg,
x,
parity);
break;
385 case 1: computeStaple<Float,3,1>(staple,
arg,
x,
parity);
break;
386 case 2: computeStaple<Float,3,2>(staple,
arg,
x,
parity);
break;
391 if ( !(
x[0] <
arg.inner_border[0] ||
x[0] >=
arg.inner_X[0] +
arg.inner_border[0] ||
392 x[1] <
arg.inner_border[1] ||
x[1] >=
arg.inner_X[1] +
arg.inner_border[1] ||
393 x[2] <
arg.inner_border[2] ||
x[2] >=
arg.inner_X[2] +
arg.inner_border[2] ||
394 x[3] <
arg.inner_border[3] ||
x[3] >=
arg.inner_X[3] +
arg.inner_border[3]) ) {
396 int inner_x[] = {
x[0]-
arg.inner_border[0],
x[1]-
arg.inner_border[1],
x[2]-
arg.inner_border[2],
x[3]-
arg.inner_border[3]};
398 fat +=
arg.coeff * staple;
406 template <
typename Float,
typename Arg>
407 class Staple :
public TunableVectorYZ {
409 const GaugeField &meta;
410 unsigned int minThreads()
const {
return arg.threads; }
411 bool tuneGridDim()
const {
return false; }
418 Staple(Arg &
arg,
int nu,
int dir1,
int dir2,
bool save_staple,
const GaugeField &meta)
419 : TunableVectorYZ(2,(3 - ( (dir1 > -1) ? 1 : 0 ) - ( (dir2 > -1) ? 1 : 0 ))),
420 arg(
arg), meta(meta), nu(nu), dir1(dir1), dir2(dir2), save_staple(save_staple)
426 arg.n_mu = 3 - ( (dir1 > -1) ? 1 : 0 ) - ( (dir2 > -1) ? 1 : 0 );
428 for (
int i=0;
i<4;
i++) {
429 if (
i==nu ||
i==dir1 ||
i==dir2)
continue;
432 assert(j ==
arg.n_mu);
436 void apply(
const cudaStream_t &
stream) {
439 computeStaple<Float,true><<<tp.grid,tp.block>>>(
arg, nu);
441 computeStaple<Float,false><<<tp.grid,tp.block>>>(
arg, nu);
444 TuneKey tuneKey()
const {
445 std::stringstream aux;
446 aux <<
"threads=" <<
arg.threads <<
",prec=" <<
sizeof(Float);
447 aux <<
",nu=" << nu <<
",dir1=" << dir1 <<
",dir2=" << dir2 <<
",save=" << save_staple;
448 return TuneKey(meta.VolString(),
typeid(*this).name(), aux.str().c_str());
451 void preTune() {
arg.fat.save();
arg.staple.save(); }
452 void postTune() {
arg.fat.load();
arg.staple.load(); }
454 long long flops()
const {
455 return 2*
arg.n_mu*
arg.threads*( 4*198 + 18 + 36 );
457 long long bytes()
const {
458 return arg.n_mu*2*meta.VolumeCB()*
arg.fat.Bytes()*2
459 +
arg.n_mu*2*
arg.threads*(4*
arg.u.Bytes() + 2*
arg.mulink.Bytes() + (save_staple ?
arg.staple.Bytes() : 0));
464 void computeStaple(GaugeField &fat, GaugeField &staple,
const GaugeField &mulink,
const GaugeField &u,
465 int nu,
int dir1,
int dir2,
double coeff,
bool save_staple) {
468 typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_NO>::type L;
470 typedef StapleArg<double,L,L,L,L> Arg;
471 Arg
arg(L(fat), L(staple), L(mulink), L(u),
coeff, fat, u);
472 Staple<double,Arg> stapler(
arg, nu, dir1, dir2, save_staple, fat);
475 typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
477 typedef StapleArg<double,L,L,L,G> Arg;
478 Arg
arg(L(fat), L(staple), L(mulink), G(u),
coeff, fat, u);
479 Staple<double,Arg> stapler(
arg, nu, dir1, dir2, save_staple, fat);
482 typedef StapleArg<double,L,L,G,G> Arg;
483 Arg
arg(L(fat), L(staple), G(mulink), G(u),
coeff, fat, u);
484 Staple<double,Arg> stapler(
arg, nu, dir1, dir2, save_staple, fat);
487 errorQuda(
"Reconstruct %d is not supported\n", u.Reconstruct());
490 errorQuda(
"Reconstruct %d is not supported\n", u.Reconstruct());
493 typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_NO>::type L;
495 typedef StapleArg<float,L,L,L,L> Arg;
496 Arg
arg(L(fat), L(staple), L(mulink), L(u),
coeff, fat, u);
497 Staple<float,Arg> stapler(
arg, nu, dir1, dir2, save_staple, fat);
500 typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
502 typedef StapleArg<double,L,L,L,G> Arg;
503 Arg
arg(L(fat), L(staple), L(mulink), G(u),
coeff, fat, u);
504 Staple<float,Arg> stapler(
arg, nu, dir1, dir2, save_staple, fat);
507 typedef StapleArg<double,L,L,G,G> Arg;
508 Arg
arg(L(fat), L(staple), G(mulink), G(u),
coeff, fat, u);
509 Staple<float,Arg> stapler(
arg, nu, dir1, dir2, save_staple, fat);
512 errorQuda(
"Reconstruct %d is not supported\n", u.Reconstruct());
515 errorQuda(
"Reconstruct %d is not supported\n", u.Reconstruct());
518 errorQuda(
"Unsupported precision %d\n", u.Precision());
535 if( ((fat->
X()[0] % 2 != 0) || (fat->
X()[1] % 2 != 0) || (fat->
X()[2] % 2 != 0) || (fat->
X()[3] % 2 != 0))
537 errorQuda(
"Reconstruct %d and odd dimensionsize is not supported by link fattening code (yet)\n",
541 computeOneLink(*fat, u,
coeff[0]-6.0*
coeff[5]);
544 if (lng) computeLongLink(*lng, u,
coeff[1]);
550 for (
int nu = 0; nu < 4; nu++) {
551 computeStaple(*fat, staple, u, u, nu, -1, -1,
coeff[2], 1);
553 if (
coeff[5] != 0.0) computeStaple(*fat, staple, staple, u, nu, -1, -1,
coeff[5], 0);
555 for (
int rho = 0; rho < 4; rho++) {
558 computeStaple(*fat, staple1, staple, u, rho, nu, -1,
coeff[3], 1);
561 for (
int sig = 0; sig < 4; sig++) {
562 if (sig != nu && sig != rho) {
563 computeStaple(*fat, staple, staple1, u, sig, nu, rho,
coeff[4], 0);
574 errorQuda(
"Fat-link computation not enabled");
void fatLongKSLink(cudaGaugeField *fat, cudaGaugeField *lng, const cudaGaugeField &gauge, const double *coeff)
Compute the fat and long links for an improved staggered (Kogut-Susskind) fermions.
int commDimPartitioned(int dir)
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
static __device__ __host__ int linkIndex(const int x[], const I X[4])
QudaVerbosity getVerbosity()
void setPrecision(QudaPrecision precision)
Helper function for setting the precision and corresponding field order for QUDA internal fields...
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaReconstructType reconstruct
QudaReconstructType Reconstruct() const
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
static __inline__ size_t size_t d
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)