9 #define DOUBLE_TOL 1e-15 10 #define SINGLE_TOL 2e-6 14 #ifdef GPU_GAUGE_TOOLS 16 template <
typename Float,
typename GaugeOr,
typename GaugeDs>
23 const Float tolerance;
27 GaugeAPEArg(GaugeOr &origin, GaugeDs &dest,
const GaugeField &data,
const Float alpha,
const Float tolerance)
28 : threads(1), origin(origin), dest(dest), alpha(alpha), tolerance(tolerance) {
29 for (
int dir = 0; dir < 4; ++dir ) {
30 border[dir] = data.R()[dir];
31 X[dir] = data.X()[dir] - border[dir] * 2;
39 template <
typename Float,
typename GaugeOr,
typename GaugeDs,
typename Float2>
40 __host__ __device__
void computeStaple(GaugeAPEArg<Float,GaugeOr,GaugeDs>&
arg,
int idx,
int parity,
int dir,
Matrix<Float2,3> &staple) {
46 for(
int dr=0; dr<4; ++dr)
X[dr] =
arg.X[dr];
50 for(
int dr=0; dr<4; ++dr) {
51 x[dr] +=
arg.border[dr];
52 X[dr] += 2*
arg.border[dr];
58 for(
int mu=0;
mu<3;
mu++) {
65 int dx[4] = {0, 0, 0, 0};
66 Link U1, U2, U3, tmpS;
81 staple = staple + U1 * U2 *
conj(U3);
95 staple = staple +
conj(U1) * U2 * U3;
101 template<
typename Float,
typename GaugeOr,
typename GaugeDs>
102 __global__
void computeAPEStep(GaugeAPEArg<Float,GaugeOr,GaugeDs>
arg){
106 int dir = threadIdx.z + blockIdx.z*
blockDim.z;
107 if (
idx >=
arg.threads)
return;
108 if (dir >= 3)
return;
109 typedef complex<Float>
Complex;
113 for(
int dr=0; dr<4; ++dr)
X[dr] =
arg.X[dr];
117 for(
int dr=0; dr<4; ++dr) {
118 x[dr] +=
arg.border[dr];
119 X[dr] += 2*
arg.border[dr];
122 int dx[4] = {0, 0, 0, 0};
127 computeStaple<Float,GaugeOr,GaugeDs,Complex>(
arg,
idx,
parity,dir,S);
132 S = S * (
arg.alpha/((Float) (2.*(3. - 1.))));
135 TestU = I*(1.-
arg.alpha) + S*
conj(U);
136 polarSu3<Float>(TestU,
arg.tolerance);
143 template<
typename Float,
typename GaugeOr,
typename GaugeDs>
144 class GaugeAPE : TunableVectorYZ {
145 GaugeAPEArg<Float,GaugeOr,GaugeDs>
arg;
146 const GaugeField &meta;
149 bool tuneGridDim()
const {
return false; }
150 unsigned int minThreads()
const {
return arg.threads; }
154 GaugeAPE(GaugeAPEArg<Float,GaugeOr, GaugeDs> &
arg,
const GaugeField &meta)
155 : TunableVectorYZ(2,3),
arg(
arg), meta(meta) {}
156 virtual ~GaugeAPE () {}
158 void apply(
const cudaStream_t &
stream){
161 computeAPEStep<<<tp.grid,tp.block,tp.shared_bytes>>>(
arg);
168 TuneKey tuneKey()
const {
169 std::stringstream aux;
170 aux <<
"threads=" <<
arg.threads <<
",prec=" <<
sizeof(Float);
171 return TuneKey(meta.VolString(),
typeid(*this).name(), aux.str().c_str());
174 long long flops()
const {
return 3*(2+2*4)*198ll*
arg.threads; }
175 long long bytes()
const {
return 3*((1+2*6)*
arg.origin.Bytes()+
arg.dest.Bytes())*
arg.threads; }
178 template<
typename Float,
typename GaugeOr,
typename GaugeDs>
179 void APEStep(GaugeOr origin, GaugeDs dest,
const GaugeField& dataOr, Float alpha) {
181 GaugeAPE<Float,GaugeOr,GaugeDs> gaugeAPE(
arg,dataOr);
186 template<
typename Float>
187 void APEStep(GaugeField &dataDs,
const GaugeField& dataOr, Float alpha) {
190 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type GDs;
193 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type GOr;
194 APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
196 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type GOr;
197 APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
199 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type GOr;
200 APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
202 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
205 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type GDs;
207 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type GOr;
208 APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
210 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type GOr;
211 APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
213 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type GOr;
214 APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
216 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
219 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type GDs;
221 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type GOr;
222 APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
224 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type GOr;
225 APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
227 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type GOr;
228 APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
230 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
233 errorQuda(
"Reconstruction type %d of destination gauge field not supported", dataDs.Reconstruct());
242 #ifdef GPU_GAUGE_TOOLS 245 errorQuda(
"Orign and destination fields must have the same precision\n");
249 errorQuda(
"Half precision not supported\n");
259 APEStep<float>(dataDs, dataOr, (
float) alpha);
261 APEStep<double>(dataDs, dataOr, alpha);
__device__ __host__ void setZero(Matrix< T, N > *m)
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
QudaVerbosity getVerbosity()
std::complex< double > Complex
void APEStep(GaugeField &dataDs, const GaugeField &dataOr, double alpha)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__device__ __host__ void setIdentity(Matrix< T, N > *m)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaReconstructType Reconstruct() const
QudaGaugeFieldOrder Order() const
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)