9 #define DOUBLE_TOL 1e-15 10 #define SINGLE_TOL 2e-6 14 #ifdef GPU_GAUGE_TOOLS 16 template <
typename Float,
typename GaugeOr,
typename GaugeDs>
17 struct GaugeSTOUTArg {
23 const Float tolerance;
27 GaugeSTOUTArg(GaugeOr &origin, GaugeDs &dest,
const GaugeField &data,
const Float rho,
const Float tolerance)
28 : threads(1), origin(origin), dest(dest), rho(rho), tolerance(tolerance) {
29 for (
int dir = 0; dir < 4; ++dir ) {
30 border[dir] = data.R()[dir];
31 X[dir] = data.X()[dir] - border[dir] * 2;
39 template <
typename Float,
typename GaugeOr,
typename GaugeDs,
typename Float2>
40 __host__ __device__
void computeStaple(GaugeSTOUTArg<Float,GaugeOr,GaugeDs>&
arg,
int idx,
int parity,
int dir,
Matrix<Float2,3> &staple) {
46 for(
int dr=0; dr<4; ++dr)
X[dr] =
arg.X[dr];
50 for(
int dr=0; dr<4; ++dr) {
51 x[dr] +=
arg.border[dr];
52 X[dr] += 2*
arg.border[dr];
58 for (
int mu=0;
mu<3;
mu++) {
65 int dx[4] = {0, 0, 0, 0};
81 staple = staple + U1 * U2 *
conj(U3);
95 staple = staple +
conj(U1) * U2 * U3;
101 template<
typename Float,
typename GaugeOr,
typename GaugeDs>
102 __global__
void computeSTOUTStep(GaugeSTOUTArg<Float,GaugeOr,GaugeDs>
arg){
106 int dir = threadIdx.z + blockIdx.z*
blockDim.z;
107 if (
idx >=
arg.threads)
return;
108 if (dir >= 3)
return;
109 typedef complex<Float>
Complex;
113 for(
int dr=0; dr<4; ++dr)
X[dr] =
arg.X[dr];
117 for(
int dr=0; dr<4; ++dr) {
118 x[dr] +=
arg.border[dr];
119 X[dr] += 2*
arg.border[dr];
122 int dx[4] = {0, 0, 0, 0};
125 Link U, UDag, Stap, Omega, OmegaDiff, ODT, Q, exp_iQ;
130 computeStaple<Float,GaugeOr,GaugeDs,Complex>(
arg,
idx,
parity,dir,Stap);
148 Omega = (
arg.rho * Stap) * UDag;
153 OmegaDiff =
conj(Omega) - Omega;
157 OmegaDiffTr = (1.0/3.0) * OmegaDiffTr;
162 Q = Q - OmegaDiffTr * ODT;
171 error = OmegaDiffTr.real();
172 printf(
"Trace test %d %d %.15e\n",
idx, dir, error);
175 Link Q_diff =
conj(Q);
180 error = OmegaDiffTr.real();
181 printf(
"Herm test %d %d %.15e\n",
idx, dir, error);
189 printf(
"expiQ test %d %d %.15e\n",
idx, dir, error);
196 printf(
"expiQ*u test %d %d %.15e\n",
idx, dir, error);
203 template<
typename Float,
typename GaugeOr,
typename GaugeDs>
204 class GaugeSTOUT : TunableVectorYZ {
205 GaugeSTOUTArg<Float,GaugeOr,GaugeDs>
arg;
206 const GaugeField &meta;
209 bool tuneGridDim()
const {
return false; }
210 unsigned int minThreads()
const {
return arg.threads; }
214 GaugeSTOUT(GaugeSTOUTArg<Float,GaugeOr,GaugeDs> &
arg,
const GaugeField &meta)
215 : TunableVectorYZ(2,3),
arg(
arg), meta(meta) {}
216 virtual ~GaugeSTOUT () {}
218 void apply(
const cudaStream_t &
stream){
221 computeSTOUTStep<<<tp.grid,tp.block,tp.shared_bytes>>>(
arg);
228 TuneKey tuneKey()
const {
229 std::stringstream aux;
230 aux <<
"threads=" <<
arg.threads <<
",prec=" <<
sizeof(Float);
231 return TuneKey(meta.VolString(),
typeid(*this).name(), aux.str().c_str());
234 long long flops()
const {
return 3*(2+2*4)*198ll*
arg.threads; }
235 long long bytes()
const {
return 3*((1+2*6)*
arg.origin.Bytes()+
arg.dest.Bytes())*
arg.threads; }
238 template<
typename Float,
typename GaugeOr,
typename GaugeDs>
239 void STOUTStep(GaugeOr origin, GaugeDs dest,
const GaugeField& dataOr, Float rho) {
241 GaugeSTOUT<Float,GaugeOr,GaugeDs> gaugeSTOUT(
arg,dataOr);
246 template<
typename Float>
247 void STOUTStep(GaugeField &dataDs,
const GaugeField& dataOr, Float rho) {
250 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type GDs;
253 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type GOr;
254 STOUTStep(GOr(dataOr), GDs(dataDs), dataOr, rho);
256 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type GOr;
257 STOUTStep(GOr(dataOr), GDs(dataDs), dataOr, rho);
259 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type GOr;
260 STOUTStep(GOr(dataOr), GDs(dataDs), dataOr, rho);
262 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
265 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type GDs;
267 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type GOr;
268 STOUTStep(GOr(dataOr), GDs(dataDs), dataOr, rho);
270 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type GOr;
271 STOUTStep(GOr(dataOr), GDs(dataDs), dataOr, rho);
273 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type GOr;
274 STOUTStep(GOr(dataOr), GDs(dataDs), dataOr, rho);
276 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
279 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type GDs;
281 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type GOr;
282 STOUTStep(GOr(dataOr), GDs(dataDs), dataOr, rho);
284 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type GOr;
285 STOUTStep(GOr(dataOr), GDs(dataDs), dataOr, rho);
287 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type GOr;
288 STOUTStep(GOr(dataOr), GDs(dataDs), dataOr, rho);
290 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
293 errorQuda(
"Reconstruction type %d of destination gauge field not supported", dataDs.Reconstruct());
302 #ifdef GPU_GAUGE_TOOLS 305 errorQuda(
"Origin and destination fields must have the same precision\n");
309 errorQuda(
"Half precision not supported\n");
319 STOUTStep<float>(dataDs, dataOr, (
float) rho);
321 STOUTStep<double>(dataDs, dataOr, rho);
337 template <
typename Float,
typename GaugeOr,
typename GaugeDs>
351 for (
int dir = 0; dir < 4; ++dir ) {
353 X[dir] = data.
X()[dir] -
border[dir] * 2;
361 template <
typename Float,
typename GaugeOr,
typename GaugeDs,
typename Float2>
369 for(
int dr=0; dr<4; ++dr)
X[dr] =
arg.X[dr];
373 for(
int dr=0; dr<4; ++dr) {
374 x[dr] +=
arg.border[dr];
375 X[dr] += 2*
arg.border[dr];
383 for (
int mu=0;
mu<4;
mu++) {
407 int dx[4] = {0, 0, 0, 0};
408 Link U1, U2, U3, U4, U5, U6, U7;
436 rectangle = rectangle + U1*U2*U3*
conj(U4)*
conj(U5);
454 staple = staple + U1*U2*
conj(U5);
470 rectangle = rectangle + U1 * U2 * U3 *
conj(U4) *
conj(U6);
496 rectangle = rectangle +
conj(U7) * U4 * U3 * U2 *
conj(U5);
529 rectangle = rectangle +
conj(U1) *
conj(U2) * U3 * U4 * U5;
549 staple = staple +
conj(U1) * U2 * U5;
563 rectangle = rectangle +
conj(U1) * U2 * U3 * U4 *
conj(U6);
590 rectangle = rectangle +
conj(U7) *
conj(U4) * U3 * U2 * U5;
597 template<
typename Float,
typename GaugeOr,
typename GaugeDs>
602 int dir = threadIdx.z + blockIdx.z*
blockDim.z;
603 if (
idx >=
arg.threads)
return;
605 typedef complex<Float>
Complex;
609 for(
int dr=0; dr<4; ++dr)
X[dr] =
arg.X[dr];
613 for(
int dr=0; dr<4; ++dr) {
614 x[dr] +=
arg.border[dr];
615 X[dr] += 2*
arg.border[dr];
618 double staple_coeff = (5.0 - 2.0*
arg.epsilon)/3.0;
619 double rectangle_coeff = (1.0 -
arg.epsilon)/12.0;
621 int dx[4] = {0, 0, 0, 0};
624 Link U, UDag, Stap, Rect, Omega, OmegaDiff, ODT, Q, exp_iQ;
631 computeStapleRectangle<Float,GaugeOr,GaugeDs,Complex>(
arg,
idx,
parity,dir,Stap,Rect);
643 Omega = (
arg.rho*staple_coeff)*(Stap);
646 Omega = Omega - (
arg.rho*rectangle_coeff)*(Rect);
647 Omega = Omega * UDag;
652 OmegaDiff =
conj(Omega) - Omega;
656 OmegaDiffTr = (1.0/3.0) * OmegaDiffTr;
661 Q = Q - OmegaDiffTr * ODT;
670 error = OmegaDiffTr.real();
671 printf(
"Trace test %d %d %.15e\n",
idx, dir, error);
674 Link Q_diff =
conj(Q);
679 error = OmegaDiffTr.real();
680 printf(
"Herm test %d %d %.15e\n",
idx, dir, error);
688 printf(
"expiQ test %d %d %.15e\n",
idx, dir, error);
695 printf(
"expiQ*u test %d %d %.15e\n",
idx, dir, error);
703 template<
typename Float,
typename GaugeOr,
typename GaugeDs>
729 std::stringstream
aux;
730 aux <<
"threads=" <<
arg.threads <<
",prec=" <<
sizeof(Float);
734 long long flops()
const {
return 4*(18+2+2*4)*198ll*
arg.threads; }
735 long long bytes()
const {
return 4*((1+2*12)*
arg.origin.Bytes()+
arg.dest.Bytes())*
arg.threads; }
739 template<
typename Float,
typename GaugeOr,
typename GaugeDs>
744 gaugeOvrImpSTOUT.apply(0);
748 template<
typename Float>
795 errorQuda(
"Reconstruction type %d of destination gauge field not supported", dataDs.
Reconstruct());
803 #ifdef GPU_GAUGE_TOOLS 806 errorQuda(
"Origin and destination fields must have the same precision\n");
810 errorQuda(
"Half precision not supported\n");
820 OvrImpSTOUTStep<float>(dataDs, dataOr, (
float) rho, epsilon);
822 OvrImpSTOUTStep<double>(dataDs, dataOr, rho, epsilon);
__device__ __host__ double ErrorSU3(const Matrix< Cmplx, 3 > &matrix)
__device__ __host__ void setZero(Matrix< T, N > *m)
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
QudaVerbosity getVerbosity()
virtual ~GaugeOvrImpSTOUT()
void STOUTStep(GaugeField &dataDs, const GaugeField &dataOr, double rho)
std::complex< double > Complex
void apply(const cudaStream_t &stream)
const char * VolString() const
int printf(const char *,...) __attribute__((__format__(__printf__
GaugeOvrImpSTOUTArg< Float, GaugeOr, GaugeDs > arg
GaugeOvrImpSTOUT(GaugeOvrImpSTOUTArg< Float, GaugeOr, GaugeDs > &arg, const GaugeField &meta)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Main header file for host and device accessors to GaugeFields.
void OvrImpSTOUTStep(GaugeField &dataDs, const GaugeField &dataOr, double rho, double epsilon)
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__host__ __device__ void computeStapleRectangle(GaugeOvrImpSTOUTArg< Float, GaugeOr, GaugeDs > &arg, int idx, int parity, int dir, Matrix< Float2, 3 > &staple, Matrix< Float2, 3 > &rectangle)
__device__ __host__ void setIdentity(Matrix< T, N > *m)
QudaFieldLocation Location() const
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__device__ __host__ void computeMatrixInverse(const Matrix< T, 3 > &u, Matrix< T, 3 > *uinv)
__global__ void computeOvrImpSTOUTStep(GaugeOvrImpSTOUTArg< Float, GaugeOr, GaugeDs > arg)
GaugeOvrImpSTOUTArg(GaugeOr &origin, GaugeDs &dest, const GaugeField &data, const Float rho, const Float epsilon, const Float tolerance)
QudaReconstructType Reconstruct() const
QudaGaugeFieldOrder Order() const
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
__device__ __host__ void exponentiate_iQ(const Matrix< T, 3 > &Q, Matrix< T, 3 > *exp_iQ)
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)
unsigned int minThreads() const