24 #ifndef FL_UNITARIZE_PI 25 #define FL_UNITARIZE_PI 3.14159265358979323846 27 #ifndef FL_UNITARIZE_PI23 28 #define FL_UNITARIZE_PI23 FL_UNITARIZE_PI*0.66666666666666666666 31 static const int max_iter_newton = 20;
32 static const int max_iter = 20;
35 static double max_error = 1e-10;
41 template <
typename Out,
typename In>
42 struct UnitarizeLinksArg {
50 const double max_error;
55 const static bool check_unitarization =
true;
57 UnitarizeLinksArg(Out &output,
const In &input,
const GaugeField &data,
int* fails,
58 int max_iter,
double unitarize_eps,
double max_error,
59 int reunit_allow_svd,
int reunit_svd_only,
double svd_rel_error,
61 : threads(data.VolumeCB()), output(output), input(input), fails(fails), unitarize_eps(unitarize_eps),
62 max_iter(max_iter), max_error(max_error), reunit_allow_svd(reunit_allow_svd),
63 reunit_svd_only(reunit_svd_only), svd_rel_error(svd_rel_error),
64 svd_abs_error(svd_abs_error)
66 for (
int dir=0; dir<4; ++dir) X[dir] = data.X()[dir];
70 #endif // GPU_UNITARIZE 73 bool reunit_allow_svd_,
bool reunit_svd_only_,
74 double svd_rel_error_,
double svd_abs_error_) {
76 unitarize_eps = unitarize_eps_;
77 max_error = max_error_;
78 reunit_allow_svd = reunit_allow_svd_;
79 reunit_svd_only = reunit_svd_only_;
80 svd_rel_error = svd_rel_error_;
81 svd_abs_error = svd_abs_error_;
83 errorQuda(
"Unitarization has not been built");
95 temporary =
conj(initial_matrix)*unitary_matrix;
96 temporary = temporary*temporary -
conj(initial_matrix)*initial_matrix;
98 for(
int i=0; i<3; ++i){
99 for(
int j=0; j<3; ++j){
100 if( fabs(temporary(i,j).x) > max_error || fabs(temporary(i,j).y) > max_error){
111 T getAbsMin(
const T*
const array,
int size){
112 T min = fabs(array[0]);
113 for(
int i=1; i<
size; ++i){
114 T abs_val = fabs(array[i]);
115 if((abs_val) < min){ min = abs_val; }
123 inline bool checkAbsoluteError(Real a, Real b, Real
epsilon)
125 if( fabs(a-b) < epsilon)
return true;
132 inline bool checkRelativeError(Real a, Real b, Real epsilon)
134 if( fabs((a-b)/b) < epsilon )
return true;
142 template<
class Float,
typename Arg>
144 bool reciprocalRoot(
const Matrix<complex<Float>,3>& q,
Matrix<complex<Float>,3>* res,
Arg &
arg){
151 const Float one_third = 0.333333333333333333333;
152 const Float one_ninth = 0.111111111111111111111;
153 const Float one_eighteenth = 0.055555555555555555555;
160 c[2] =
getTrace(tempq).x * one_third;;
162 g[0] = g[1] = g[2] = c[0] * one_third;
164 s = c[1]*one_third - c[0]*c[0]*one_eighteenth;
167 if(fabs(s) >= arg.unitarize_eps){
168 const Float rsqrt_s = rsqrt(s);
169 r = c[2]*0.5 - (c[0]*one_third)*(c[1] - c[0]*c[0]*one_ninth);
170 cosTheta = r*rsqrt_s*rsqrt_s*rsqrt_s;
172 if(fabs(cosTheta) >= 1.0){
173 theta = (r > 0) ? 0.0 : FL_UNITARIZE_PI;
175 theta =
acos(cosTheta);
178 const Float sqrt_s = s*rsqrt_s;
180 #if 0 // experimental version 182 sincos( theta*one_third, &as, &ac );
183 g[0] = c[0]*one_third + 2*sqrt_s*ac;
185 g[1] = c[0]*one_third - 2*sqrt_s*(0.5*ac + as*0.8660254037844386467637);
187 g[2] = c[0]*one_third + 2*sqrt_s*(-0.5*ac + as*0.8660254037844386467637);
189 g[0] = c[0]*one_third + 2*sqrt_s*
cos( theta*one_third );
190 g[1] = c[0]*one_third + 2*sqrt_s*
cos( theta*one_third + FL_UNITARIZE_PI23 );
191 g[2] = c[0]*one_third + 2*sqrt_s*
cos( theta*one_third + 2*FL_UNITARIZE_PI23 );
198 if( fabs(det) < arg.svd_abs_error)
return false;
199 if( checkRelativeError(g[0]*g[1]*g[2],det,arg.svd_rel_error) ==
false )
return false;
204 for(
int i=0; i<3; ++i) c[i] =
sqrt(g[i]);
207 g[0] = c[0]+c[1]+c[2];
208 g[1] = c[0]*c[1] + c[0]*c[2] + c[1]*c[2];
209 g[2] = c[0]*c[1]*c[2];
211 const Float denominator = 1.0 / ( g[2]*(g[0]*g[1]-g[2]) );
212 c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1])) * denominator;
213 c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1]) * denominator;
214 c[2] = g[0] * denominator;
216 tempq = c[1]*q + c[2]*qsq;
218 tempq(0,0).x += c[0];
219 tempq(1,1).x += c[0];
220 tempq(2,2).x += c[0];
230 template<
class Float,
typename Arg>
232 bool unitarizeLinkMILC(
const Matrix<complex<Float>,3>&
in,
Matrix<complex<Float>,3>*
const result,
Arg &arg)
235 if( !arg.reunit_svd_only ){
236 if( reciprocalRoot<Float>(
conj(
in)*
in,&u,arg) ){
244 if( !arg.reunit_allow_svd )
return false;
247 Float singular_values[3];
248 computeSVD<Float>(
in, u, v, singular_values);
254 template<
class Float>
256 bool unitarizeLinkSVD(
const Matrix<complex<Float>,3>&
in,
Matrix<complex<Float>,3>*
const result,
257 const double max_error)
260 Float singular_values[3];
261 computeSVD<Float>(
in, u, v, singular_values);
265 if (result->isUnitary(max_error)==
false)
267 printf(
"ERROR: Link unitarity test failed\n");
268 printf(
"TOLERANCE: %g\n", max_error);
275 template<
class Float>
277 bool unitarizeLinkNewton(
const Matrix<complex<Float>,3>&
in,
Matrix<complex<Float>,3>*
const result,
int max_iter)
282 for(
int i=0; i<max_iter; ++i){
284 u = 0.5*(u +
conj(uinv));
287 if(isUnitarizedLinkConsistent(
in,u,0.0000001)==
false)
289 printf(
"ERROR: Unitarized link is not consistent with incoming link\n");
297 #endif // GPU_UNITARIZE 308 for (
int i=0; i<infield.
Volume(); ++i){
309 for (
int dir=0; dir<4; ++dir){
312 if( unitarizeLinkNewton<double>(inlink, &outlink, max_iter_newton) ==
false ) num_failures++;
316 if( unitarizeLinkNewton<double>(inlink, &outlink, max_iter_newton) ==
false ) num_failures++;
323 errorQuda(
"Unitarization has not been built");
334 for(
int i=0; i<field.
Volume(); ++i){
335 for(
int dir=0; dir<4; ++dir){
343 if (link.
isUnitary(max_error) ==
false) {
344 printf(
"Unitarity failure\n");
345 printf(
"site index = %d,\t direction = %d\n", i, dir);
347 identity =
conj(link)*link;
355 errorQuda(
"Unitarization has not been built");
363 template<
typename Float,
typename Out,
typename In>
364 __global__
void DoUnitarizedLink(UnitarizeLinksArg<Out,In> arg){
365 int idx = threadIdx.x + blockIdx.x*blockDim.x;
366 int parity = threadIdx.y + blockIdx.y*blockDim.y;
367 int mu = threadIdx.z + blockIdx.z*blockDim.z;
368 if (idx >= arg.threads)
return;
376 unitarizeLinkMILC(v, &result, arg);
377 if (arg.check_unitarization) {
382 arg.output(mu, idx, parity) =
tmp;
387 template<
typename Float,
typename Out,
typename In>
389 UnitarizeLinksArg<Out,In>
arg;
392 unsigned int sharedBytesPerThread()
const {
return 0; }
393 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
396 bool tuneGridDim()
const {
return false; }
397 unsigned int minThreads()
const {
return arg.threads; }
400 UnitarizeLinks(UnitarizeLinksArg<Out,In> &arg,
const GaugeField &meta)
403 void apply(
const cudaStream_t &
stream){
407 void preTune() {
if (arg.input.gauge == arg.output.gauge) arg.output.save(); }
409 if (arg.input.gauge == arg.output.gauge) arg.output.load();
410 cudaMemset(arg.fails, 0,
sizeof(
int));
413 long long flops()
const {
415 return 4ll * 2 * arg.threads * 1147;
417 long long bytes()
const {
return 4ll * 2 * arg.threads * (arg.input.Bytes() + arg.output.Bytes()); }
420 std::stringstream aux;
421 aux <<
"threads=" << arg.threads <<
",prec=" <<
sizeof(Float);
427 template<
typename Float,
typename Out,
typename In>
429 UnitarizeLinksArg<Out,In>
arg(output, input, meta, fails, max_iter, unitarize_eps, max_error,
430 reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error);
431 UnitarizeLinks<Float, Out, In> unitlinks(arg, meta);
436 template<
typename Float>
445 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
448 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
451 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
461 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
464 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
467 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
478 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
481 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
484 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
498 #endif // GPU_UNITARIZE 506 unitarizeLinks<float>(output, input, fails);
508 unitarizeLinks<double>(output, input, fails);
513 errorQuda(
"Unitarization has not been built");
522 template <
typename Float,
typename G>
529 : threads(meta.VolumeCB()), u(u), tol(tol), fails(fails) { }
532 template<
typename Float,
typename G>
534 int idx = threadIdx.x + blockIdx.x*blockDim.x;
535 int parity = threadIdx.y + blockIdx.y*blockDim.y;
536 int mu = threadIdx.z + blockIdx.z*blockDim.z;
537 if (idx >= arg.
threads)
return;
542 polarSu3<Float>(u, arg.
tol);
545 if (u.isUnitary(arg.
tol) ==
false) {
549 arg.
u(mu, idx, parity) = u;
552 template<
typename Float,
typename G>
575 cudaMemset(arg.
fails, 0,
sizeof(
int));
578 long long flops()
const {
return 0; }
579 long long bytes()
const {
return 4ll * 2 * arg.
threads * 2 * arg.
u.Bytes(); }
582 std::stringstream aux;
583 aux <<
"threads=" << arg.
threads <<
",prec=" <<
sizeof(Float);
589 template <
typename Float>
608 errorQuda(
"Cannot project gauge field with staggered phases applied");
611 projectSU3<double>(u,
tol, fails);
613 projectSU3<float>(u,
tol, fails);
618 errorQuda(
"Unitarization has not been built");
static bool reunit_allow_svd
unsigned int sharedBytesPerThread() const
__device__ __host__ bool isUnitary(double max_error) const
void apply(const cudaStream_t &stream)
unsigned int minThreads() const
QudaVerbosity getVerbosity()
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
__host__ __device__ ValueType sqrt(ValueType x)
__global__ void ProjectSU3kernel(ProjectSU3Arg< Float, G > arg)
cudaColorSpinorField * tmp
const char * VolString() const
ProjectSU3Arg(G u, const GaugeField &meta, Float tol, int *fails)
ProjectSU3Arg< Float, G > arg
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
void copyLinkToArray(float *array, const Matrix< float2, 3 > &link)
bool isUnitary(const cpuGaugeField &field, double max_error)
static __device__ double2 atomicAdd(double2 *addr, double2 val)
Implementation of double2 atomic addition using two double-precision additions.
static double svd_rel_error
#define qudaDeviceSynchronize()
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
bool StaggeredPhaseApplied() const
static bool reunit_svd_only
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
static double svd_abs_error
Main header file for host and device accessors to GaugeFields.
void projectSU3(cudaGaugeField &U, double tol, int *fails)
Project the input gauge field onto the SU(3) group. This is a destructive operation. The number of link failures is reported so appropriate action can be taken.
void copyArrayToLink(Matrix< float2, 3 > *link, float *array)
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
ProjectSU3(ProjectSU3Arg< Float, G > &arg, const GaugeField &meta)
void unitarizeLinksCPU(cpuGaugeField &outfield, const cpuGaugeField &infield)
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
static double unitarize_eps
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__host__ __device__ ValueType cos(ValueType x)
QudaReconstructType Reconstruct() const
QudaGaugeFieldOrder Order() const
__host__ __device__ ValueType acos(ValueType x)
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
unsigned int sharedBytesPerBlock(const TuneParam &) const