24 #ifndef FL_UNITARIZE_PI 25 #define FL_UNITARIZE_PI 3.14159265358979323846 27 #ifndef FL_UNITARIZE_PI23 28 #define FL_UNITARIZE_PI23 FL_UNITARIZE_PI*0.66666666666666666666 31 static const int max_iter_newton = 20;
32 static const int max_iter = 20;
35 static double max_error = 1
e-10;
41 template <
typename Out,
typename In>
42 struct UnitarizeLinksArg {
50 const double max_error;
55 const static bool check_unitarization =
true;
57 UnitarizeLinksArg(Out &output,
const In &input,
const GaugeField &data,
int* fails,
66 for(
int dir=0; dir<4; ++dir)
X[dir] = data.X()[dir];
67 threads =
X[0]*
X[1]*
X[2]*
X[3];
72 bool reunit_allow_svd_,
bool reunit_svd_only_,
73 double svd_rel_error_,
double svd_abs_error_) {
75 max_error = max_error_;
90 temporary =
conj(initial_matrix)*unitary_matrix;
91 temporary = temporary*temporary -
conj(initial_matrix)*initial_matrix;
93 for(
int i=0;
i<3; ++
i){
94 for(
int j=0; j<3; ++j){
95 if(
fabs(temporary(
i,j).
x) > max_error ||
fabs(temporary(
i,j).
y) > max_error){
106 T getAbsMin(
const T*
const array,
int size){
110 if((abs_val) < min){ min = abs_val; }
118 inline bool checkAbsoluteError(Real
a, Real
b, Real epsilon)
120 if(
fabs(
a-
b) < epsilon)
return true;
127 inline bool checkRelativeError(Real
a, Real
b, Real epsilon)
129 if(
fabs((
a-
b)/
b) < epsilon )
return true;
137 template<
class Float,
typename Arg>
139 bool reciprocalRoot(
const Matrix<complex<Float>,3>& q,
Matrix<complex<Float>,3>* res, Arg &
arg){
146 const Float one_third = 0.333333333333333333333;
147 const Float one_ninth = 0.111111111111111111111;
148 const Float one_eighteenth = 0.055555555555555555555;
157 g[0] = g[1] = g[2] =
c[0] * one_third;
159 s =
c[1]*one_third -
c[0]*
c[0]*one_eighteenth;
163 const Float rsqrt_s = rsqrt(
s);
164 r =
c[2]*0.5 - (
c[0]*one_third)*(
c[1] -
c[0]*
c[0]*one_ninth);
165 cosTheta = r*rsqrt_s*rsqrt_s*rsqrt_s;
167 if(
fabs(cosTheta) >= 1.0){
168 theta = (r > 0) ? 0.0 : FL_UNITARIZE_PI;
170 theta =
acos(cosTheta);
173 const Float sqrt_s =
s*rsqrt_s;
175 #if 0 // experimental version 177 sincos( theta*one_third, &as, &ac );
178 g[0] =
c[0]*one_third + 2*sqrt_s*ac;
180 g[1] =
c[0]*one_third - 2*sqrt_s*(0.5*ac + as*0.8660254037844386467637);
182 g[2] =
c[0]*one_third + 2*sqrt_s*(-0.5*ac + as*0.8660254037844386467637);
184 g[0] =
c[0]*one_third + 2*sqrt_s*
cos( theta*one_third );
185 g[1] =
c[0]*one_third + 2*sqrt_s*
cos( theta*one_third + FL_UNITARIZE_PI23 );
186 g[2] =
c[0]*one_third + 2*sqrt_s*
cos( theta*one_third + 2*FL_UNITARIZE_PI23 );
193 if(
fabs(det) <
arg.svd_abs_error)
return false;
194 if( checkRelativeError(g[0]*g[1]*g[2],det,
arg.svd_rel_error) ==
false )
return false;
202 g[0] =
c[0]+
c[1]+
c[2];
203 g[1] =
c[0]*
c[1] +
c[0]*
c[2] +
c[1]*
c[2];
204 g[2] =
c[0]*
c[1]*
c[2];
206 const Float denominator = 1.0 / ( g[2]*(g[0]*g[1]-g[2]) );
207 c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1])) * denominator;
208 c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1]) * denominator;
209 c[2] = g[0] * denominator;
211 tempq =
c[1]*q +
c[2]*qsq;
213 tempq(0,0).x +=
c[0];
214 tempq(1,1).x +=
c[0];
215 tempq(2,2).x +=
c[0];
225 template<
class Float,
typename Arg>
227 bool unitarizeLinkMILC(
const Matrix<complex<Float>,3>&
in,
Matrix<complex<Float>,3>*
const result, Arg &
arg)
230 if( !
arg.reunit_svd_only ){
239 if( !
arg.reunit_allow_svd )
return false;
242 Float singular_values[3];
243 computeSVD<Float>(
in, u, v, singular_values);
249 template<
class Float>
251 bool unitarizeLinkSVD(
const Matrix<complex<Float>,3>&
in,
Matrix<complex<Float>,3>*
const result,
252 const double max_error)
255 Float singular_values[3];
256 computeSVD<Float>(
in, u, v, singular_values);
262 printf(
"ERROR: Link unitarity test failed\n");
263 printf(
"TOLERANCE: %g\n", max_error);
270 template<
class Float>
272 bool unitarizeLinkNewton(
const Matrix<complex<Float>,3>&
in,
Matrix<complex<Float>,3>*
const result,
int max_iter)
277 for(
int i=0;
i<max_iter; ++
i){
279 u = 0.5*(u +
conj(uinv));
282 if(isUnitarizedLinkConsistent(
in,u,0.0000001)==
false)
284 printf(
"ERROR: Unitarized link is not consistent with incoming link\n");
294 if (infield.Precision() != outfield.Precision())
295 errorQuda(
"Precisions must match (out=%d != in=%d)", outfield.Precision(), infield.Precision());
300 for (
int i=0;
i<infield.Volume(); ++
i){
301 for (
int dir=0; dir<4; ++dir){
304 if( unitarizeLinkNewton<double>(inlink, &outlink, max_iter_newton) ==
false )
num_failures++;
308 if( unitarizeLinkNewton<double>(inlink, &outlink, max_iter_newton) ==
false )
num_failures++;
317 bool isUnitary(
const cpuGaugeField& field,
double max_error)
321 for(
int i=0;
i<field.Volume(); ++
i){
322 for(
int dir=0; dir<4; ++dir){
331 printf(
"Unitarity failure\n");
332 printf(
"site index = %d,\t direction = %d\n",
i, dir);
334 identity =
conj(link)*link;
344 template<
typename Float,
typename Out,
typename In>
345 __global__
void DoUnitarizedLink(UnitarizeLinksArg<Out,In>
arg){
347 if(
idx >=
arg.threads)
return;
349 if(
idx >=
arg.threads/2) {
354 for(
int dr=0; dr<4; ++dr)
X[dr] =
arg.X[dr];
362 for (
int mu = 0;
mu < 4;
mu++) {
366 unitarizeLinkMILC(v, &result,
arg);
367 if (
arg.check_unitarization) {
379 template<
typename Float,
typename Out,
typename In>
380 class UnitarizeLinks : Tunable {
381 UnitarizeLinksArg<Out,In>
arg;
383 unsigned int sharedBytesPerThread()
const {
return 0; }
384 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
387 bool tuneGridDim()
const {
return false; }
388 unsigned int minThreads()
const {
return arg.threads; }
391 UnitarizeLinks(UnitarizeLinksArg<Out,In> &
arg) :
arg(
arg) { }
394 void apply(
const cudaStream_t &
stream){
396 DoUnitarizedLink<Float,Out,In><<<tp.grid, tp.block, 0,
stream>>>(
arg);
398 void preTune() {
if (
arg.input.gauge ==
arg.output.gauge)
arg.output.save(); }
400 if (
arg.input.gauge ==
arg.output.gauge)
arg.output.load();
401 cudaMemset(
arg.fails, 0,
sizeof(
int));
404 long long flops()
const {
406 return 4588LL*
arg.threads;
408 long long bytes()
const {
return 4ll *
arg.threads * (
arg.input.Bytes() +
arg.output.Bytes()); }
410 TuneKey tuneKey()
const {
411 std::stringstream vol, aux;
412 vol <<
arg.X[0] <<
"x";
413 vol <<
arg.X[1] <<
"x";
414 vol <<
arg.X[2] <<
"x";
416 aux <<
"threads=" <<
arg.threads <<
",prec=" <<
sizeof(Float);
417 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux.str().c_str());
422 template<
typename Float,
typename Out,
typename In>
423 void unitarizeLinks(Out output,
const In input,
const cudaGaugeField& meta,
int* fails) {
424 UnitarizeLinksArg<Out,In>
arg(output, input, meta, fails, max_iter,
unitarize_eps, max_error,
426 UnitarizeLinks<Float, Out, In> unitlinks(
arg) ;
431 template<
typename Float>
432 void unitarizeLinks(cudaGaugeField& output,
const cudaGaugeField &input,
int* fails) {
434 if( output.isNative() && input.isNative() ) {
436 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Out;
439 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type In;
440 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
442 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type In;
443 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
445 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type In;
446 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
448 errorQuda(
"Reconstruction type %d of gauge field not supported", input.Reconstruct());
452 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Out;
455 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type In;
456 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
458 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type In;
459 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
461 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type In;
462 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
464 errorQuda(
"Reconstruction type %d of gauge field not supported", input.Reconstruct());
469 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Out;
472 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type In;
473 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
475 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type In;
476 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
478 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type In;
479 unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
481 errorQuda(
"Reconstruction type %d of gauge field not supported", input.Reconstruct());
486 errorQuda(
"Reconstruction type %d of gauge field not supported", output.Reconstruct());
489 errorQuda(
"Invalid Gauge Order (output=%d, input=%d)", output.Order(), input.Order());
501 unitarizeLinks<float>(output, input, fails);
503 unitarizeLinks<double>(output, input, fails);
508 errorQuda(
"Unitarization has not been built");
517 template <
typename Float,
typename G>
526 for(
int dir=0; dir<4; ++dir)
X[dir] = meta.
X()[dir];
531 template<
typename Float,
typename G>
535 if(
idx >=
arg.threads)
return;
539 for (
int mu = 0;
mu < 4;
mu++) {
541 polarSu3<Float>(u,
arg.tol);
550 template<
typename Float,
typename G>
571 long long flops()
const {
return 0; }
572 long long bytes()
const {
return 4ll *
arg.threads *
arg.u.Bytes(); }
575 std::stringstream vol,
aux;
576 vol <<
arg.X[0] <<
"x" <<
arg.X[1] <<
"x" <<
arg.X[2] <<
"x" <<
arg.X[3];
577 aux <<
"threads=" <<
arg.threads <<
",prec=" <<
sizeof(Float);
578 return TuneKey(vol.str().c_str(),
typeid(*this).name(),
aux.str().c_str());
583 template <
typename Float>
600 if (u.StaggeredPhaseApplied())
601 errorQuda(
"Cannot project gauge field with staggered phases applied");
604 projectSU3<double>(u,
tol, fails);
606 projectSU3<float>(u,
tol, fails);
608 errorQuda(
"Precision %d not supported", u.Precision());
611 errorQuda(
"Unitarization has not been built");
unsigned int sharedBytesPerThread() const
void apply(const cudaStream_t &stream)
unsigned int minThreads() const
static __device__ __host__ int linkIndex(const int x[], const I X[4])
QudaVerbosity getVerbosity()
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
__host__ __device__ ValueType sqrt(ValueType x)
ProjectSU3(ProjectSU3Arg< Float, G > &arg)
static bool reunit_svd_only
__global__ void ProjectSU3kernel(ProjectSU3Arg< Float, G > arg)
cudaColorSpinorField * tmp
ProjectSU3Arg(G u, const GaugeField &meta, Float tol, int *fails)
ProjectSU3Arg< Float, G > arg
static double svd_rel_error
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
void copyLinkToArray(float *array, const Matrix< float2, 3 > &link)
bool isUnitary(const cpuGaugeField &field, double max_error)
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
int printf(const char *,...) __attribute__((__format__(__printf__
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
static double unitarize_eps
Main header file for host and device accessors to GaugeFields.
void projectSU3(cudaGaugeField &U, double tol, int *fails)
Project the input gauge field onto the SU(3) group. This is a destructive operation. The number of link failures is reported so appropriate action can be taken.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
void copyArrayToLink(Matrix< float2, 3 > *link, float *array)
static bool reunit_allow_svd
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
void unitarizeLinksCPU(cpuGaugeField &outfield, const cpuGaugeField &infield)
static double svd_abs_error
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__device__ __host__ void computeMatrixInverse(const Matrix< T, 3 > &u, Matrix< T, 3 > *uinv)
__host__ __device__ ValueType cos(ValueType x)
QudaReconstructType Reconstruct() const
__host__ __device__ ValueType acos(ValueType x)
struct cudaExtent unsigned int cudaArray_t array
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
unsigned int sharedBytesPerBlock(const TuneParam &) const
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)