20 #ifndef FL_UNITARIZE_PI
21 #define FL_UNITARIZE_PI 3.14159265358979323846
23 #ifndef FL_UNITARIZE_PI23
24 #define FL_UNITARIZE_PI23 FL_UNITARIZE_PI*2.0/3.0
27 __constant__
int INPUT_PADDING=0;
28 __constant__
int OUTPUT_PADDING=0;
29 __constant__
int DEV_MAX_ITER = 20;
31 static int HOST_MAX_ITER = 20;
33 __constant__
double DEV_FL_MAX_ERROR;
34 __constant__
double DEV_FL_UNITARIZE_EPS;
35 __constant__
bool DEV_FL_REUNIT_ALLOW_SVD;
36 __constant__
bool DEV_FL_REUNIT_SVD_ONLY;
37 __constant__
double DEV_FL_REUNIT_SVD_REL_ERROR;
38 __constant__
double DEV_FL_REUNIT_SVD_ABS_ERROR;
39 __constant__
bool DEV_FL_CHECK_UNITARIZATION;
41 static double HOST_FL_MAX_ERROR;
42 static double HOST_FL_UNITARIZE_EPS;
43 static bool HOST_FL_REUNIT_ALLOW_SVD;
44 static bool HOST_FL_REUNIT_SVD_ONLY;
45 static double HOST_FL_REUNIT_SVD_REL_ERROR;
46 static double HOST_FL_REUNIT_SVD_ABS_ERROR;
47 static bool HOST_FL_CHECK_UNITARIZATION;
51 cudaMemcpyToSymbol(INPUT_PADDING, &input_padding_h,
sizeof(
int));
52 cudaMemcpyToSymbol(OUTPUT_PADDING, &output_padding_h,
sizeof(
int));
63 for(
int i=0; i<3; ++i){
64 if( fabs(identity(i,i).
x - 1.0) > max_error || fabs(identity(i,i).
y) > max_error)
return false;
65 for(
int j=i+1; j<3; ++j){
66 if( fabs(identity(i,j).
x) > max_error || fabs(identity(i,j).y) > max_error
67 || fabs(identity(j,i).x) > max_error || fabs(identity(j,i).y) > max_error ){
84 temporary =
conj(initial_matrix)*unitary_matrix;
85 temporary = temporary*temporary -
conj(initial_matrix)*initial_matrix;
87 for(
int i=0; i<3; ++i){
88 for(
int j=0; j<3; ++j){
89 if( fabs(temporary(i,j).x) > max_error || fabs(temporary(i,j).y) > max_error){
100 bool allow_svd_h,
bool svd_only_h,
101 double svd_rel_error_h,
double svd_abs_error_h,
102 bool check_unitarization_h)
106 static bool not_set=
true;
109 cudaMemcpyToSymbol(DEV_FL_UNITARIZE_EPS, &unitarize_eps_h,
sizeof(
double));
110 cudaMemcpyToSymbol(DEV_FL_REUNIT_ALLOW_SVD, &allow_svd_h,
sizeof(
bool));
111 cudaMemcpyToSymbol(DEV_FL_REUNIT_SVD_ONLY, &svd_only_h,
sizeof(
bool));
112 cudaMemcpyToSymbol(DEV_FL_REUNIT_SVD_REL_ERROR, &svd_rel_error_h,
sizeof(
double));
113 cudaMemcpyToSymbol(DEV_FL_REUNIT_SVD_ABS_ERROR, &svd_abs_error_h,
sizeof(
double));
114 cudaMemcpyToSymbol(DEV_FL_MAX_ERROR, &max_error_h,
sizeof(
double));
115 cudaMemcpyToSymbol(DEV_FL_CHECK_UNITARIZATION, &check_unitarization_h,
sizeof(
bool));
118 HOST_FL_UNITARIZE_EPS = unitarize_eps_h;
119 HOST_FL_REUNIT_ALLOW_SVD = allow_svd_h;
120 HOST_FL_REUNIT_SVD_ONLY = svd_only_h;
121 HOST_FL_REUNIT_SVD_REL_ERROR = svd_rel_error_h;
122 HOST_FL_REUNIT_SVD_ABS_ERROR = svd_abs_error_h;
123 HOST_FL_MAX_ERROR = max_error_h;
124 HOST_FL_CHECK_UNITARIZATION = check_unitarization_h;
135 T getAbsMin(
const T*
const array,
int size){
136 T min = fabs(array[0]);
137 for(
int i=1; i<size; ++i){
138 T abs_val = fabs(array[i]);
139 if((abs_val) < min){ min = abs_val; }
147 inline bool checkAbsoluteError(Real a, Real b, Real epsilon)
149 if( fabs(a-b) < epsilon)
return true;
156 inline bool checkRelativeError(Real a, Real b, Real epsilon)
158 if( fabs((a-b)/b) < epsilon )
return true;
167 template<
class Cmplx>
174 typename RealTypeId<Cmplx>::Type c[3];
175 typename RealTypeId<Cmplx>::Type g[3];
184 g[0] = g[1] = g[2] = c[0]/3.;
185 typename RealTypeId<Cmplx>::Type r,
s,theta;
186 s = c[1]/3. - c[0]*c[0]/18;
189 #define FL_UNITARIZE_EPS DEV_FL_UNITARIZE_EPS
191 #define FL_UNITARIZE_EPS HOST_FL_UNITARIZE_EPS
196 #define FL_REUNIT_SVD_REL_ERROR DEV_FL_REUNIT_SVD_REL_ERROR
197 #define FL_REUNIT_SVD_ABS_ERROR DEV_FL_REUNIT_SVD_ABS_ERROR
199 #define FL_REUNIT_SVD_REL_ERROR HOST_FL_REUNIT_SVD_REL_ERROR
200 #define FL_REUNIT_SVD_ABS_ERROR HOST_FL_REUNIT_SVD_ABS_ERROR
204 typename RealTypeId<Cmplx>::Type cosTheta;
205 if(fabs(s) >= FL_UNITARIZE_EPS){
206 const typename RealTypeId<Cmplx>::Type sqrt_s =
sqrt(s);
207 r = c[2]/2. - (c[0]/3.)*(c[1] - c[0]*c[0]/9.);
208 cosTheta = r/(sqrt_s*sqrt_s*sqrt_s);
209 if(fabs(cosTheta) >= 1.0){
213 theta = FL_UNITARIZE_PI;
216 theta =
acos(cosTheta);
218 g[0] = c[0]/3 + 2*sqrt_s*
cos( theta/3 );
219 g[1] = c[0]/3 + 2*sqrt_s*
cos( theta/3 + FL_UNITARIZE_PI23 );
220 g[2] = c[0]/3 + 2*sqrt_s*
cos( theta/3 + 2*FL_UNITARIZE_PI23 );
226 if( fabs(det) < FL_REUNIT_SVD_ABS_ERROR ){
229 if( checkRelativeError(g[0]*g[1]*g[2],det,FL_REUNIT_SVD_REL_ERROR) ==
false )
return false;
234 for(
int i=0; i<3; ++i) c[i] =
sqrt(g[i]);
237 g[0] = c[0]+c[1]+c[2];
238 g[1] = c[0]*c[1] + c[0]*c[2] + c[1]*c[2];
239 g[2] = c[0]*c[1]*c[2];
241 const typename RealTypeId<Cmplx>::Type & denominator = g[2]*(g[0]*g[1]-g[2]);
242 c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1]))/denominator;
243 c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1])/denominator;
244 c[2] = g[0]/denominator;
246 tempq = c[1]*q + c[2]*qsq;
248 tempq(0,0).x += c[0];
249 tempq(1,1).x += c[0];
250 tempq(2,2).x += c[0];
260 template<
class Cmplx>
266 #define FL_REUNIT_SVD_ONLY DEV_FL_REUNIT_SVD_ONLY
267 #define FL_REUNIT_ALLOW_SVD DEV_FL_REUNIT_ALLOW_SVD
269 #define FL_REUNIT_SVD_ONLY HOST_FL_REUNIT_SVD_ONLY
270 #define FL_REUNIT_ALLOW_SVD HOST_FL_REUNIT_ALLOW_SVD
272 if( !FL_REUNIT_SVD_ONLY ){
273 if( reciprocalRoot<Cmplx>(
conj(in)*in,&u) ){
281 if( !FL_REUNIT_ALLOW_SVD )
return false;
284 typename RealTypeId<Cmplx>::Type singular_values[3];
285 computeSVD<Cmplx>(
in, u, v, singular_values);
291 template<
class Cmplx>
296 typename RealTypeId<Cmplx>::Type singular_values[3];
297 computeSVD<Cmplx>(
in, u, v, singular_values);
302 #define FL_MAX_ERROR DEV_FL_MAX_ERROR
304 #define FL_MAX_ERROR HOST_FL_MAX_ERROR
306 if(
isUnitary(*result,FL_MAX_ERROR)==
false)
308 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
309 printf(
"ERROR: Link unitarity test failed\n");
310 printf(
"TOLERANCE: %g\n", FL_MAX_ERROR);
319 template<
class Cmplx>
327 #define MAX_ITER DEV_MAX_ITER
329 #define MAX_ITER HOST_MAX_ITER
331 for(
int i=0; i<MAX_ITER; ++i){
333 u = 0.5*(u +
conj(uinv));
337 if(isUnitarizedLinkConsistent(in,u,0.0000001)==
false)
339 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
340 printf(
"ERROR: Unitarized link is not consistent with incoming link\n");
355 template<
class Cmplx>
356 __global__
void getUnitarizedField(
const Cmplx* inlink_even,
const Cmplx* inlink_odd,
357 Cmplx* outlink_even, Cmplx* outlink_odd,
358 int* num_failures,
const int threads)
360 int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
361 if (mem_idx >= threads)
return;
366 inlink = inlink_even;
367 outlink = outlink_even;
369 if(mem_idx >= threads/2){
370 mem_idx = mem_idx - (threads/2);
372 outlink = outlink_odd;
377 for(
int dir=0; dir<4; ++dir){
379 unitarizeLinkMILC(v, &result);
381 #define FL_MAX_ERROR DEV_FL_MAX_ERROR
382 #define FL_CHECK_UNITARIZATION DEV_FL_CHECK_UNITARIZATION
384 #define FL_MAX_ERROR HOST_FL_MAX_ERROR
385 #define FL_CHECK_UNITARIZATION HOST_FL_CHECK_UNITARIZATION
387 if(FL_CHECK_UNITARIZATION){
388 if(
isUnitary(result,FL_MAX_ERROR) ==
false)
391 atomicAdd(num_failures, 1);
402 class UnitarizeLinksCuda :
public Tunable {
404 const cudaGaugeField &inField;
405 cudaGaugeField &outField;
408 unsigned int sharedBytesPerThread()
const {
return 0; }
409 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
412 bool tuneGridDim()
const {
return false; }
413 unsigned int minThreads()
const {
return inField.Volume(); }
416 UnitarizeLinksCuda(
const cudaGaugeField& inField, cudaGaugeField& outField,
int* fails) :
417 inField(inField), outField(outField), fails(fails) { ; }
418 virtual ~UnitarizeLinksCuda() { ; }
420 void apply(
const cudaStream_t &
stream) {
424 getUnitarizedField<<<tp.grid,tp.block>>>((float2*)inField.Even_p(), (float2*)inField.Odd_p(),
425 (float2*)outField.Even_p(), (float2*)outField.Odd_p(),
426 fails, inField.Volume());
428 getUnitarizedField<<<tp.grid,tp.block>>>((double2*)inField.Even_p(), (double2*)inField.Odd_p(),
429 (double2*)outField.Even_p(), (double2*)outField.Odd_p(),
430 fails, inField.Volume());
432 errorQuda(
"UnitarizeLinks not implemented for precision %d", inField.Precision());
437 void postTune() { cudaMemset(fails, 0,
sizeof(
int)); }
439 long long flops()
const {
return 0; }
441 TuneKey tuneKey()
const {
442 std::stringstream vol, aux;
443 vol << inField.X()[0] <<
"x";
444 vol << inField.X()[1] <<
"x";
445 vol << inField.X()[2] <<
"x";
446 vol << inField.X()[3] <<
"x";
447 aux <<
"threads=" << inField.Volume() <<
",prec=" << inField.Precision();
448 aux <<
"stride=" << inField.Stride();
449 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux.str().c_str());
454 cudaGaugeField& inField,
455 cudaGaugeField* outField,
457 UnitarizeLinksCuda unitarizeLinks(inField, *outField, fails);
458 unitarizeLinks.apply(0);
463 int num_failures = 0;
466 for(
int i=0; i<infield.Volume(); ++i){
467 for(
int dir=0; dir<4; ++dir){
469 copyArrayToLink(&inlink, ((
float*)(infield.Gauge_p()) + (i*4 + dir)*18));
470 if( unitarizeLinkNewton<double2>(inlink, &outlink) ==
false ) num_failures++;
471 copyLinkToArray(((
float*)(outfield->Gauge_p()) + (i*4 + dir)*18), outlink);
473 copyArrayToLink(&inlink, ((
double*)(infield.Gauge_p()) + (i*4 + dir)*18));
474 if( unitarizeLinkNewton<double2>(inlink, &outlink) ==
false ) num_failures++;
475 copyLinkToArray(((
double*)(outfield->Gauge_p()) + (i*4 + dir)*18), outlink);
487 for(
int i=0; i<field.Volume(); ++i){
488 for(
int dir=0; dir<4; ++dir){
497 printf(
"Unitarity failure\n");
498 printf(
"site index = %d,\t direction = %d\n", i, dir);
500 identity =
conj(link)*link;
QudaVerbosity getVerbosity()
void unitarizeLinksCPU(const QudaGaugeParam ¶m, cpuGaugeField &infield, cpuGaugeField *outfield)
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error, bool check_unitarization=true)
__host__ __device__ ValueType sqrt(ValueType x)
__global__ void const FloatN FloatM FloatM Float Float int threads
__device__ __host__ T getDeterminant(const Matrix< T, 3 > &a)
void setUnitarizeLinksPadding(int input_padding, int output_padding)
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
void copyLinkToArray(float *array, const Matrix< float2, 3 > &link)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
bool isUnitary(const QudaGaugeParam ¶m, cpuGaugeField &field, double max_error)
void unitarizeLinksCuda(const QudaGaugeParam ¶m, cudaGaugeField &infield, cudaGaugeField *outfield, int *num_failures)
void copyArrayToLink(Matrix< float2, 3 > *link, float *array)
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
__device__ void loadLinkVariableFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *link)
__device__ __host__ void computeMatrixInverse(const Matrix< T, 3 > &u, Matrix< T, 3 > *uinv)
__host__ __device__ ValueType cos(ValueType x)
__host__ __device__ ValueType acos(ValueType x)
__device__ void writeLinkVariableToArray(const Matrix< T, 3 > &link, const int dir, const int idx, const int stride, T *const array)
__host__ __device__ ValueType conj(ValueType x)