7 #define DOUBLE_TOL 1e-15
8 #define SINGLE_TOL 2e-6
12 #ifdef GPU_GAUGE_TOOLS
14 template <
typename Float,
typename GaugeOr,
typename GaugeDs>
23 const Float tolerance;
27 GaugeAPEArg(GaugeOr &origin, GaugeDs &dest,
const GaugeField &data,
const Float alpha,
const Float tolerance)
28 : origin(origin), dest(dest), alpha(alpha), tolerance(tolerance) {
30 for(
int dir=0; dir<4; ++dir){
33 for(
int dir=0; dir<4; ++dir)
X[dir] = data.X()[dir] - border[dir]*2;
35 for(
int dir=0; dir<4; ++dir)
X[dir] = data.X()[dir];
42 __device__ __host__
inline int linkIndex2(
int x[],
int dx[],
const int X[4]) {
44 for (
int i=0; i<4; i++) y[i] = (x[i] + dx[i] + X[i]) % X[i];
45 int idx = (((y[3]*X[2] + y[2])*X[1] + y[1])*X[0] + y[0]) >> 1;
50 __device__ __host__
inline void getCoords2(
int x[4],
int cb_index,
const int X[4],
int parity)
52 x[3] = cb_index/(X[2]*X[1]*X[0]/2);
53 x[2] = (cb_index/(X[1]*X[0]/2)) % X[2];
54 x[1] = (cb_index/(X[0]/2)) % X[1];
55 x[0] = 2*(cb_index%(X[0]/2)) + ((x[3]+x[2]+x[1]+
parity)&1);
60 template <
typename Float2,
typename Float>
68 if (fabs(
in(i,j).x - (*inv)(j,i).x) > tol)
70 if (fabs(
in(i,j).
y + (*inv)(j,i).
y) > tol)
76 template <
typename Float2>
83 printf(
"TESTR: %+.3le %+.3le %+.3le\n",
in(i,j).x, (*inv)(j,i).x, fabs(
in(i,j).x - (*inv)(j,i).x));
84 printf(
"TESTI: %+.3le %+.3le %+.3le\n",
in(i,j).
y, (*inv)(j,i).y, fabs(
in(i,j).y + (*inv)(j,i).y));
85 cudaDeviceSynchronize();
86 if (fabs(
in(i,j).x - (*inv)(j,i).x) > 1e-14)
88 if (fabs(
in(i,j).y + (*inv)(j,i).y) > 1e-14)
94 template <
typename Float2,
typename Float>
97 typedef typename ComplexTypeId<Float>::Type Cmplx;
105 out = out +
conj(inv);
107 }
while(checkUnitary(out, &inv, tol));
122 double mod = det.x*det.x + det.y*det.y;
123 mod =
pow(mod, (1./6.));
124 double angle =
atan2(det.y, det.x);
129 cTemp.x =
cos(angle)/mod;
130 cTemp.y =
sin(angle)/mod;
163 template <
typename Float,
typename GaugeOr,
typename GaugeDs,
typename Float2>
164 __host__ __device__
void computeStaple(GaugeAPEArg<Float,GaugeOr,GaugeDs>&
arg,
int idx,
int parity,
int dir,
Matrix<Float2,3> &staple) {
166 typedef typename ComplexTypeId<Float>::Type Cmplx;
170 for(
int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];
173 getCoords2(x, idx, X, parity);
175 for(
int dr=0; dr<4; ++dr) {
176 x[dr] += arg.border[dr];
177 X[dr] += 2*arg.border[dr];
183 for (
int mu=0;
mu<4;
mu++) {
191 int dx[4] = {0, 0, 0, 0};
193 arg.origin.load((
Float*)(U1.data),linkIndex2(x,dx,X),
mu, parity);
197 arg.origin.load((
Float*)(U2.data),linkIndex2(x,dx,X), nu, 1-parity);
202 arg.origin.load((
Float*)(U3.data),linkIndex2(x,dx,X),
mu, 1-parity);
207 tmpS = tmpS *
conj(U3);
209 staple = staple + tmpS;
213 arg.origin.load((
Float*)(U1.data),linkIndex2(x,dx,X),
mu, 1-parity);
214 arg.origin.load((
Float*)(U2.data),linkIndex2(x,dx,X), nu, 1-parity);
217 arg.origin.load((
Float*)(U3.data),linkIndex2(x,dx,X),
mu, parity);
223 staple = staple + tmpS;
228 template<
typename Float,
typename GaugeOr,
typename GaugeDs>
229 __global__
void computeAPEStep(GaugeAPEArg<Float,GaugeOr,GaugeDs> arg){
230 int idx = threadIdx.x + blockIdx.x*blockDim.x;
231 if(idx >= arg.threads)
return;
232 typedef typename ComplexTypeId<Float>::Type Cmplx;
235 if(idx >= arg.threads/2) {
237 idx -= arg.threads/2;
241 for(
int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];
244 getCoords2(x, idx, X, parity);
246 for(
int dr=0; dr<4; ++dr) {
247 x[dr] += arg.border[dr];
248 X[dr] += 2*arg.border[dr];
252 int dx[4] = {0, 0, 0, 0};
253 for (
int dir=0; dir < 3; dir++) {
256 computeStaple<Float,GaugeOr,GaugeDs,Cmplx>(
arg,
idx,
parity,dir,S);
258 arg.origin.load((
Float*)(U.data),linkIndex2(x,dx,X), dir, parity);
260 U = U * (1. - arg.alpha);
261 S = S * (arg.alpha/6.);
265 polarSu3<Cmplx,Float>(&U, arg.tolerance);
266 arg.dest.save((
Float*)(U.data),linkIndex2(x,dx,X), dir, parity);
270 template<
typename Float,
typename GaugeOr,
typename GaugeDs>
271 class GaugeAPE : Tunable {
272 GaugeAPEArg<Float,GaugeOr,GaugeDs>
arg;
276 unsigned int sharedBytesPerThread()
const {
return 0; }
277 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
279 bool tuneSharedBytes()
const {
return false; }
280 bool tuneGridDim()
const {
return false; }
281 unsigned int minThreads()
const {
return arg.threads; }
286 virtual ~GaugeAPE () {}
288 void apply(
const cudaStream_t &
stream){
290 #if (__COMPUTE_CAPABILITY__ >= 200)
292 computeAPEStep<<<tp.grid,tp.block,tp.shared_bytes>>>(
arg);
294 errorQuda(
"GaugeAPE not supported on pre-Fermi architecture");
302 TuneKey tuneKey()
const {
303 std::stringstream vol, aux;
304 vol << arg.X[0] <<
"x";
305 vol << arg.X[1] <<
"x";
306 vol << arg.X[2] <<
"x";
308 aux <<
"threads=" << arg.threads <<
",prec=" <<
sizeof(
Float);
309 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux.str().c_str());
313 std::string paramString(
const TuneParam ¶m)
const {
314 std::stringstream ps;
315 ps <<
"block=(" << param.block.x <<
"," << param.block.y <<
"," << param.block.z <<
")";
316 ps <<
"shared=" << param.shared_bytes;
322 long long flops()
const {
return (1)*6*arg.threads; }
323 long long bytes()
const {
return (1)*6*arg.threads*
sizeof(
Float); }
327 template<
typename Float,
typename GaugeOr,
typename GaugeDs>
330 GaugeAPEArg<Float,GaugeOr,GaugeDs>
arg(origin, dest, dataOr, alpha,
DOUBLE_TOL);
331 GaugeAPE<Float,GaugeOr,GaugeDs> gaugeAPE(arg, location);
334 GaugeAPEArg<Float,GaugeOr,GaugeDs>
arg(origin, dest, dataOr, alpha,
SINGLE_TOL);
335 GaugeAPE<Float,GaugeOr,GaugeDs> gaugeAPE(arg, location);
338 cudaDeviceSynchronize();
341 template<
typename Float>
351 APEStep(FloatNOrder<Float, 18, 2, 18>(dataOr), FloatNOrder<Float, 18, 2, 18>(dataDs), dataOr, alpha, location);
353 APEStep(FloatNOrder<Float, 18, 2, 12>(dataOr), FloatNOrder<Float, 18, 2, 18>(dataDs), dataOr, alpha, location);
355 APEStep(FloatNOrder<Float, 18, 2, 8>(dataOr), FloatNOrder<Float, 18, 2, 18>(dataDs), dataOr, alpha, location);
357 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
361 APEStep(FloatNOrder<Float, 18, 2, 18>(dataOr), FloatNOrder<Float, 18, 2, 12>(dataDs), dataOr, alpha, location);
363 APEStep(FloatNOrder<Float, 18, 2, 12>(dataOr), FloatNOrder<Float, 18, 2, 12>(dataDs), dataOr, alpha, location);
365 APEStep(FloatNOrder<Float, 18, 2, 8>(dataOr), FloatNOrder<Float, 18, 2, 12>(dataDs), dataOr, alpha, location);
367 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
371 APEStep(FloatNOrder<Float, 18, 2, 18>(dataOr), FloatNOrder<Float, 18, 2, 8>(dataDs), dataOr, alpha, location);
373 APEStep(FloatNOrder<Float, 18, 2, 12>(dataOr), FloatNOrder<Float, 18, 2, 8>(dataDs), dataOr, alpha, location);
375 APEStep(FloatNOrder<Float, 18, 2, 8>(dataOr), FloatNOrder<Float, 18, 2, 8>(dataDs), dataOr, alpha, location);
377 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
380 errorQuda(
"Reconstruction type %d of destination gauge field not supported", dataDs.Reconstruct());
385 APEStep(FloatNOrder<Float, 18, 4, 18>(dataOr), FloatNOrder<Float, 18, 2, 18>(dataDs), dataOr, alpha, location);
387 APEStep(FloatNOrder<Float, 18, 4, 12>(dataOr), FloatNOrder<Float, 18, 2, 18>(dataDs), dataOr, alpha, location);
389 APEStep(FloatNOrder<Float, 18, 4, 8>(dataOr), FloatNOrder<Float, 18, 2, 18>(dataDs), dataOr, alpha, location);
391 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
395 APEStep(FloatNOrder<Float, 18, 4, 18>(dataOr), FloatNOrder<Float, 18, 2, 12>(dataDs), dataOr, alpha, location);
397 APEStep(FloatNOrder<Float, 18, 4, 12>(dataOr), FloatNOrder<Float, 18, 2, 12>(dataDs), dataOr, alpha, location);
399 APEStep(FloatNOrder<Float, 18, 4, 8>(dataOr), FloatNOrder<Float, 18, 2, 12>(dataDs), dataOr, alpha, location);
401 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
405 APEStep(FloatNOrder<Float, 18, 4, 18>(dataOr), FloatNOrder<Float, 18, 2, 8>(dataDs), dataOr, alpha, location);
407 APEStep(FloatNOrder<Float, 18, 4, 12>(dataOr), FloatNOrder<Float, 18, 2, 8>(dataDs), dataOr, alpha, location);
409 APEStep(FloatNOrder<Float, 18, 4, 8>(dataOr), FloatNOrder<Float, 18, 2, 8>(dataDs), dataOr, alpha, location);
411 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
414 errorQuda(
"Reconstruction type %d of destination gauge field not supported", dataDs.Reconstruct());
417 errorQuda(
"Invalid Gauge Order origin field\n");
423 APEStep(FloatNOrder<Float, 18, 2, 18>(dataOr), FloatNOrder<Float, 18, 4, 18>(dataDs), dataOr, alpha, location);
425 APEStep(FloatNOrder<Float, 18, 2, 12>(dataOr), FloatNOrder<Float, 18, 4, 18>(dataDs), dataOr, alpha, location);
427 APEStep(FloatNOrder<Float, 18, 2, 8>(dataOr), FloatNOrder<Float, 18, 4, 18>(dataDs), dataOr, alpha, location);
429 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
433 APEStep(FloatNOrder<Float, 18, 2, 18>(dataOr), FloatNOrder<Float, 18, 4, 12>(dataDs), dataOr, alpha, location);
435 APEStep(FloatNOrder<Float, 18, 2, 12>(dataOr), FloatNOrder<Float, 18, 4, 12>(dataDs), dataOr, alpha, location);
437 APEStep(FloatNOrder<Float, 18, 2, 8>(dataOr), FloatNOrder<Float, 18, 4, 12>(dataDs), dataOr, alpha, location);
439 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
443 APEStep(FloatNOrder<Float, 18, 2, 18>(dataOr), FloatNOrder<Float, 18, 4, 8>(dataDs), dataOr, alpha, location);
445 APEStep(FloatNOrder<Float, 18, 2, 12>(dataOr), FloatNOrder<Float, 18, 4, 8>(dataDs), dataOr, alpha, location);
447 APEStep(FloatNOrder<Float, 18, 2, 8>(dataOr), FloatNOrder<Float, 18, 4, 8>(dataDs), dataOr, alpha, location);
449 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
452 errorQuda(
"Reconstruction type %d of destination gauge field not supported", dataDs.Reconstruct());
457 APEStep(FloatNOrder<Float, 18, 4, 18>(dataOr), FloatNOrder<Float, 18, 4, 18>(dataDs), dataOr, alpha, location);
459 APEStep(FloatNOrder<Float, 18, 4, 12>(dataOr), FloatNOrder<Float, 18, 4, 18>(dataDs), dataOr, alpha, location);
461 APEStep(FloatNOrder<Float, 18, 4, 8>(dataOr), FloatNOrder<Float, 18, 4, 18>(dataDs), dataOr, alpha, location);
463 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
467 APEStep(FloatNOrder<Float, 18, 4, 18>(dataOr), FloatNOrder<Float, 18, 4, 12>(dataDs), dataOr, alpha, location);
469 APEStep(FloatNOrder<Float, 18, 4, 12>(dataOr), FloatNOrder<Float, 18, 4, 12>(dataDs), dataOr, alpha, location);
471 APEStep(FloatNOrder<Float, 18, 4, 8>(dataOr), FloatNOrder<Float, 18, 4, 12>(dataDs), dataOr, alpha, location);
473 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
477 APEStep(FloatNOrder<Float, 18, 4, 18>(dataOr), FloatNOrder<Float, 18, 4, 8>(dataDs), dataOr, alpha, location);
479 APEStep(FloatNOrder<Float, 18, 4, 12>(dataOr), FloatNOrder<Float, 18, 4, 8>(dataDs), dataOr, alpha, location);
481 APEStep(FloatNOrder<Float, 18, 4, 8>(dataOr), FloatNOrder<Float, 18, 4, 8>(dataDs), dataOr, alpha, location);
483 errorQuda(
"Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
486 errorQuda(
"Reconstruction type %d of destination gauge field not supported", dataDs.Reconstruct());
489 errorQuda(
"Invalid Gauge Order origin field\n");
492 errorQuda(
"Invalid Gauge Order destination field\n");
499 #ifdef GPU_GAUGE_TOOLS
502 errorQuda(
"Oriign and destination fields must have the same precision\n");
506 errorQuda(
"Half precision not supported\n");
510 APEStep<float>(dataDs, dataOr, (float) alpha, location);
512 APEStep<double>(dataDs, dataOr, alpha,
location);
__device__ __host__ void setZero(Matrix< T, N > *m)
QudaVerbosity getVerbosity()
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
__global__ void const FloatN FloatM FloatM Float Float int threads
__device__ __host__ T getDeterminant(const Matrix< T, 3 > &a)
QudaPrecision Precision() const
const QudaFieldLocation location
__host__ __device__ ValueType sin(ValueType x)
FloatingPoint< float > Float
__host__ __device__ ValueType atan2(ValueType x, ValueType y)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
void APEStep(GaugeField &dataDs, const GaugeField &dataOr, double alpha, QudaFieldLocation location)
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__device__ __host__ void computeMatrixInverse(const Matrix< T, 3 > &u, Matrix< T, 3 > *uinv)
__host__ __device__ ValueType cos(ValueType x)
__host__ __device__ ValueType conj(ValueType x)