14 #ifndef FL_UNITARIZE_PI
15 #define FL_UNITARIZE_PI 3.14159265358979323846
17 #ifndef FL_UNITARIZE_PI23
18 #define FL_UNITARIZE_PI23 FL_UNITARIZE_PI*2.0/3.0
25 static int HOST_MAX_ITER = 20;
35 static double HOST_FL_MAX_ERROR;
36 static double HOST_FL_UNITARIZE_EPS;
37 static bool HOST_FL_REUNIT_ALLOW_SVD;
38 static bool HOST_FL_REUNIT_SVD_ONLY;
39 static double HOST_FL_REUNIT_SVD_REL_ERROR;
40 static double HOST_FL_REUNIT_SVD_ABS_ERROR;
41 static bool HOST_FL_CHECK_UNITARIZATION;
45 cudaMemcpyToSymbol(
INPUT_PADDING, &input_padding_h,
sizeof(
int));
46 cudaMemcpyToSymbol(
OUTPUT_PADDING, &output_padding_h,
sizeof(
int));
57 for(
int i=0; i<3; ++i){
58 if( fabs(identity(i,i).
x - 1.0) > max_error || fabs(identity(i,i).y) > max_error)
return false;
59 for(
int j=i+1; j<3; ++j){
60 if( fabs(identity(i,j).
x) > max_error || fabs(identity(i,j).y) > max_error
61 || fabs(identity(j,i).x) > max_error || fabs(identity(j,i).y) > max_error ){
78 temporary =
conj(initial_matrix)*unitary_matrix;
79 temporary = temporary*temporary -
conj(initial_matrix)*initial_matrix;
81 for(
int i=0; i<3; ++i){
82 for(
int j=0; j<3; ++j){
83 if( fabs(temporary(i,j).
x) > max_error || fabs(temporary(i,j).y) > max_error){
94 bool allow_svd_h,
bool svd_only_h,
95 double svd_rel_error_h,
double svd_abs_error_h,
96 bool check_unitarization_h)
100 static bool not_set=
true;
112 HOST_FL_UNITARIZE_EPS = unitarize_eps_h;
113 HOST_FL_REUNIT_ALLOW_SVD = allow_svd_h;
114 HOST_FL_REUNIT_SVD_ONLY = svd_only_h;
115 HOST_FL_REUNIT_SVD_REL_ERROR = svd_rel_error_h;
116 HOST_FL_REUNIT_SVD_ABS_ERROR = svd_abs_error_h;
117 HOST_FL_MAX_ERROR = max_error_h;
118 HOST_FL_CHECK_UNITARIZATION = check_unitarization_h;
130 T min = fabs(array[0]);
131 for(
int i=1; i<size; ++i){
132 T abs_val = fabs(array[i]);
133 if((abs_val) < min){ min = abs_val; }
143 if( fabs(a-b) < epsilon)
return true;
152 if( fabs((a-b)/b) < epsilon )
return true;
161 template<
class Cmplx>
178 g[0] = g[1] = g[2] = c[0]/3.;
180 s = c[1]/3. - c[0]*c[0]/18;
183 #define FL_UNITARIZE_EPS DEV_FL_UNITARIZE_EPS
185 #define FL_UNITARIZE_EPS HOST_FL_UNITARIZE_EPS
190 #define FL_REUNIT_SVD_REL_ERROR DEV_FL_REUNIT_SVD_REL_ERROR
191 #define FL_REUNIT_SVD_ABS_ERROR DEV_FL_REUNIT_SVD_ABS_ERROR
193 #define FL_REUNIT_SVD_REL_ERROR HOST_FL_REUNIT_SVD_REL_ERROR
194 #define FL_REUNIT_SVD_ABS_ERROR HOST_FL_REUNIT_SVD_ABS_ERROR
201 r = c[2]/2. - (c[0]/3.)*(c[1] - c[0]*c[0]/9.);
202 cosTheta = r/(sqrt_s*sqrt_s*sqrt_s);
203 if(fabs(cosTheta) >= 1.0){
210 theta = acos(cosTheta);
212 g[0] = c[0]/3 + 2*sqrt_s*cos( theta/3 );
228 for(
int i=0; i<3; ++i) c[i] = sqrt(g[i]);
231 g[0] = c[0]+c[1]+c[2];
232 g[1] = c[0]*c[1] + c[0]*c[2] + c[1]*c[2];
233 g[2] = c[0]*c[1]*c[2];
236 c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1]))/denominator;
237 c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1])/denominator;
238 c[2] = g[0]/denominator;
240 tempq = c[1]*q + c[2]*qsq;
242 tempq(0,0).x += c[0];
243 tempq(1,1).x += c[0];
244 tempq(2,2).x += c[0];
254 template<
class Cmplx>
260 #define FL_REUNIT_SVD_ONLY DEV_FL_REUNIT_SVD_ONLY
261 #define FL_REUNIT_ALLOW_SVD DEV_FL_REUNIT_ALLOW_SVD
263 #define FL_REUNIT_SVD_ONLY HOST_FL_REUNIT_SVD_ONLY
264 #define FL_REUNIT_ALLOW_SVD HOST_FL_REUNIT_ALLOW_SVD
267 if( reciprocalRoot<Cmplx>(
conj(in)*in,&u) ){
279 computeSVD<Cmplx>(
in, u, v, singular_values);
285 template<
class Cmplx>
291 computeSVD<Cmplx>(
in, u, v, singular_values);
296 #define FL_MAX_ERROR DEV_FL_MAX_ERROR
298 #define FL_MAX_ERROR HOST_FL_MAX_ERROR
302 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
303 printf(
"ERROR: Link unitarity test failed\n");
313 template<
class Cmplx>
321 #define MAX_ITER DEV_MAX_ITER
323 #define MAX_ITER HOST_MAX_ITER
327 u = 0.5*(u +
conj(uinv));
333 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
334 printf(
"ERROR: Unitarized link is not consistent with incoming link\n");
349 template<
class Cmplx>
351 Cmplx* outlink_even, Cmplx* outlink_odd,
352 int* num_failures,
const int threads)
354 int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
355 if (mem_idx >= threads)
return;
360 inlink = inlink_even;
361 outlink = outlink_even;
364 mem_idx = mem_idx -
Vh;
366 outlink = outlink_odd;
375 #define FL_MAX_ERROR DEV_FL_MAX_ERROR
376 #define FL_CHECK_UNITARIZATION DEV_FL_CHECK_UNITARIZATION
378 #define FL_MAX_ERROR HOST_FL_MAX_ERROR
379 #define FL_CHECK_UNITARIZATION HOST_FL_CHECK_UNITARIZATION
385 atomicAdd(num_failures, 1);
402 int sharedBytesPerThread()
const {
return 0; }
403 int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
409 bool advanceBlockDim(
TuneParam ¶m)
const
411 const unsigned int max_threads =
deviceProp.maxThreadsDim[0];
412 const unsigned int max_blocks =
deviceProp.maxGridSize[0];
413 const unsigned int max_shared = 16384;
417 param.
block.x += step;
418 if (param.
block.x > max_threads || sharedBytesPerThread()*param.
block.x > max_shared) {
419 param.
block = dim3((threads+max_blocks-1)/max_blocks, 1, 1);
420 param.
block.x = ((param.
block.x+step-1) / step) * step;
421 if (param.
block.x > max_threads)
errorQuda(
"Local lattice volume is too large for device");
432 inField(inField), outField(outField), fails(fails) { ; }
440 (float2*)outField.
Even_p(), (float2*)outField.
Odd_p(),
444 (double2*)outField.
Even_p(), (double2*)outField.
Odd_p(),
452 void postTune() { cudaMemset(fails, 0,
sizeof(
int)); }
456 const unsigned int max_threads =
deviceProp.maxThreadsDim[0];
457 const unsigned int max_blocks =
deviceProp.maxGridSize[0];
460 param.
block = dim3((threads+max_blocks-1)/max_blocks, 1, 1);
461 param.
block.x = ((param.
block.x+step-1) / step) * step;
462 if (param.
block.x > max_threads)
errorQuda(
"Local lattice volume is too large for device");
464 param.
shared_bytes = sharedBytesPerThread()*param.
block.x > sharedBytesPerBlock(param) ?
465 sharedBytesPerThread()*param.
block.x : sharedBytesPerBlock(param);
473 long long flops()
const {
return 0; }
476 std::stringstream vol, aux;
477 vol << inField.
X()[0] <<
"x";
478 vol << inField.
X()[1] <<
"x";
479 vol << inField.
X()[2] <<
"x";
480 vol << inField.
X()[3] <<
"x";
481 aux <<
"threads=" << inField.
Volume() <<
",prec=" << inField.
Precision();
482 aux <<
"stride=" << inField.
Stride();
483 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
492 unitarizeLinks.
apply(0);
497 int num_failures = 0;
500 for(
int i=0; i<infield.
Volume(); ++i){
504 if( unitarizeLinkNewton<double2>(inlink, &outlink) ==
false ) num_failures++;
508 if( unitarizeLinkNewton<double2>(inlink, &outlink) ==
false ) num_failures++;
521 for(
int i=0; i<field.
Volume(); ++i){
531 printf(
"Unitarity failure\n");
532 printf(
"site index = %d,\t direction = %d\n", i,
dir);
534 identity =
conj(link)*link;