25 #define HISQ_UNITARIZE_PI 3.14159265358979323846
26 #define HISQ_UNITARIZE_PI23 HISQ_UNITARIZE_PI*2.0/3.0
29 __constant__
double DEV_HISQ_UNITARIZE_EPS;
30 __constant__
double DEV_HISQ_FORCE_FILTER;
31 __constant__
double DEV_MAX_DET_ERROR;
32 __constant__
bool DEV_REUNIT_ALLOW_SVD;
33 __constant__
bool DEV_REUNIT_SVD_ONLY;
34 __constant__
double DEV_REUNIT_SVD_REL_ERROR;
35 __constant__
double DEV_REUNIT_SVD_ABS_ERROR;
37 static double HOST_HISQ_UNITARIZE_EPS;
38 static double HOST_HISQ_FORCE_FILTER;
39 static double HOST_MAX_DET_ERROR;
40 static bool HOST_REUNIT_ALLOW_SVD;
41 static bool HOST_REUNIT_SVD_ONLY;
42 static double HOST_REUNIT_SVD_REL_ERROR;
43 static double HOST_REUNIT_SVD_ABS_ERROR;
47 namespace fermion_force{
51 double max_det_error_h,
bool allow_svd_h,
bool svd_only_h,
52 double svd_rel_error_h,
double svd_abs_error_h)
56 static bool not_set=
true;
60 cudaMemcpyToSymbol(DEV_HISQ_UNITARIZE_EPS, &unitarize_eps_h,
sizeof(
double));
61 cudaMemcpyToSymbol(DEV_HISQ_FORCE_FILTER, &hisq_force_filter_h,
sizeof(
double));
62 cudaMemcpyToSymbol(DEV_MAX_DET_ERROR, &max_det_error_h,
sizeof(
double));
63 cudaMemcpyToSymbol(DEV_REUNIT_ALLOW_SVD, &allow_svd_h,
sizeof(
bool));
64 cudaMemcpyToSymbol(DEV_REUNIT_SVD_ONLY, &svd_only_h,
sizeof(
bool));
65 cudaMemcpyToSymbol(DEV_REUNIT_SVD_REL_ERROR, &svd_rel_error_h,
sizeof(
double));
66 cudaMemcpyToSymbol(DEV_REUNIT_SVD_ABS_ERROR, &svd_abs_error_h,
sizeof(
double));
68 HOST_HISQ_UNITARIZE_EPS = unitarize_eps_h;
69 HOST_HISQ_FORCE_FILTER = hisq_force_filter_h;
70 HOST_MAX_DET_ERROR = max_det_error_h;
71 HOST_REUNIT_ALLOW_SVD = allow_svd_h;
72 HOST_REUNIT_SVD_ONLY = svd_only_h;
73 HOST_REUNIT_SVD_REL_ERROR = svd_rel_error_h;
74 HOST_REUNIT_SVD_ABS_ERROR = svd_abs_error_h;
83 class DerivativeCoefficients{
87 Real computeC00(
const Real &,
const Real &,
const Real &);
89 Real computeC01(
const Real &,
const Real &,
const Real &);
91 Real computeC02(
const Real &,
const Real &,
const Real &);
93 Real computeC11(
const Real &,
const Real &,
const Real &);
95 Real computeC12(
const Real &,
const Real &,
const Real &);
97 Real computeC22(
const Real &,
const Real &,
const Real &);
100 __device__ __host__
void set(
const Real & u,
const Real & v,
const Real & w);
102 Real getB00()
const {
return b[0]; }
104 Real getB01()
const {
return b[1]; }
106 Real getB02()
const {
return b[2]; }
108 Real getB11()
const {
return b[3]; }
110 Real getB12()
const {
return b[4]; }
112 Real getB22()
const {
return b[5]; }
117 Real DerivativeCoefficients<Real>::computeC00(
const Real & u,
const Real & v,
const Real & w){
118 Real result = -
pow(w,3)*
pow(u,6)
136 Real DerivativeCoefficients<Real>::computeC01(
const Real & u,
const Real & v,
const Real & w){
137 Real result = -
pow(w,2)*
pow(u,7)
155 Real DerivativeCoefficients<Real>::computeC02(
const Real & u,
const Real & v,
const Real & w){
156 Real result =
pow(w,2)*
pow(u,5)
169 Real DerivativeCoefficients<Real>::computeC11(
const Real & u,
const Real & v,
const Real & w){
170 Real result = - w*
pow(u,8)
187 Real DerivativeCoefficients<Real>::computeC12(
const Real & u,
const Real & v,
const Real & w){
188 Real result = w*
pow(u,6)
201 Real DerivativeCoefficients<Real>::computeC22(
const Real & u,
const Real & v,
const Real & w){
202 Real result = - w*
pow(u,4)
209 template <
class Real>
211 void DerivativeCoefficients<Real>::set(
const Real & u,
const Real & v,
const Real & w){
212 const Real & denominator = 2.0*
pow(w*(u*v-w),3);
213 b[0] = computeC00(u,v,w)/denominator;
214 b[1] = computeC01(u,v,w)/denominator;
215 b[2] = computeC02(u,v,w)/denominator;
216 b[3] = computeC11(u,v,w)/denominator;
217 b[4] = computeC12(u,v,w)/denominator;
218 b[5] = computeC22(u,v,w)/denominator;
223 template<
class Cmplx>
227 const typename RealTypeId<Cmplx>::Type temp = 2.0*
getTrace(left*outer_prod).x;
228 for(
int k=0; k<3; ++k){
229 for(
int l=0; l<3; ++l){
232 result->operator()(k,l).
x += temp*right(k,l).x;
233 result->operator()(k,l).
y += temp*right(k,l).y;
240 template<
class Cmplx>
244 Cmplx temp =
getTrace(left*outer_prod);
245 for(
int k=0; k<3; ++k){
246 for(
int l=0; l<3; ++l){
247 result->operator()(k,l) = temp*right(k,l);
256 T getAbsMin(
const T*
const array,
int size){
257 T min = fabs(array[0]);
258 for(
int i=1; i<size; ++i){
259 T abs_val = fabs(array[i]);
260 if((abs_val) < min){ min = abs_val; }
268 inline bool checkAbsoluteError(Real a, Real b, Real epsilon)
270 if( fabs(a-b) < epsilon)
return true;
277 inline bool checkRelativeError(Real a, Real b, Real epsilon)
279 if( fabs((a-b)/b) < epsilon )
return true;
288 template<
class Cmplx>
290 void reciprocalRoot(
Matrix<Cmplx,3>* res, DerivativeCoefficients<
typename RealTypeId<Cmplx>::Type>* deriv_coeffs,
291 typename RealTypeId<Cmplx>::Type f[3],
Matrix<Cmplx,3> & q,
int *unitarization_failed){
295 typename RealTypeId<Cmplx>::Type c[3];
296 typename RealTypeId<Cmplx>::Type g[3];
299 #define REUNIT_SVD_ONLY DEV_REUNIT_SVD_ONLY
301 #define REUNIT_SVD_ONLY HOST_REUNIT_SVD_ONLY
303 if(!REUNIT_SVD_ONLY){
311 g[0] = g[1] = g[2] = c[0]/3.;
312 typename RealTypeId<Cmplx>::Type r,
s,theta;
313 s = c[1]/3. - c[0]*c[0]/18;
314 r = c[2]/2. - (c[0]/3.)*(c[1] - c[0]*c[0]/9.);
317 #define HISQ_UNITARIZE_EPS DEV_HISQ_UNITARIZE_EPS
319 #define HISQ_UNITARIZE_EPS HOST_HISQ_UNITARIZE_EPS
322 typename RealTypeId<Cmplx>::Type cosTheta = r/
sqrt(s*s*s);
323 if(fabs(s) < HISQ_UNITARIZE_EPS){
327 if(fabs(cosTheta)>1.0){ r>0 ? theta=0.0 : theta=HISQ_UNITARIZE_PI/3.0; }
328 else{ theta =
acos(cosTheta)/3.0; }
331 for(
int i=0; i<3; ++i){
332 g[i] += s*
cos(theta + (i-1)*HISQ_UNITARIZE_PI23);
350 #define REUNIT_ALLOW_SVD DEV_REUNIT_ALLOW_SVD
351 #define REUNIT_SVD_REL_ERROR DEV_REUNIT_SVD_REL_ERROR
352 #define REUNIT_SVD_ABS_ERROR DEV_REUNIT_SVD_ABS_ERROR
354 #define REUNIT_ALLOW_SVD HOST_REUNIT_ALLOW_SVD
355 #define REUNIT_SVD_REL_ERROR HOST_REUNIT_SVD_REL_ERROR
356 #define REUNIT_SVD_ABS_ERROR HOST_REUNIT_SVD_ABS_ERROR
359 if(REUNIT_ALLOW_SVD){
360 bool perform_svd =
true;
361 if(!REUNIT_SVD_ONLY){
363 if( fabs(det) >= REUNIT_SVD_ABS_ERROR){
364 if( checkRelativeError(g[0]*g[1]*g[2],det,REUNIT_SVD_REL_ERROR) ) perform_svd =
false;
371 computeSVD<Cmplx>(q,tempq,
tmp2,g);
375 const typename RealTypeId<Cmplx>::Type determinant =
getDeterminant(q).x;
376 const typename RealTypeId<Cmplx>::Type gprod = g[0]*g[1]*g[2];
379 #define MAX_DET_ERROR DEV_MAX_DET_ERROR
381 #define MAX_DET_ERROR HOST_MAX_DET_ERROR
383 if(fabs(gprod - determinant) > MAX_DET_ERROR){
384 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__ >= 200))
385 printf(
"Warning: Error in determinant computed by SVD : %g > %g\n", fabs(gprod-determinant), MAX_DET_ERROR);
390 atomicAdd(unitarization_failed,1);
392 (*unitarization_failed)++;
400 #define HISQ_FORCE_FILTER DEV_HISQ_FORCE_FILTER
402 #define HISQ_FORCE_FILTER HOST_HISQ_FORCE_FILTER
404 typename RealTypeId<Cmplx>::Type delta = getAbsMin(g,3);
405 if(delta < HISQ_FORCE_FILTER){
406 for(
int i=0; i<3; ++i){
407 g[i] += HISQ_FORCE_FILTER;
408 q(i,i).x += HISQ_FORCE_FILTER;
416 for(
int i=0; i<3; ++i) c[i] =
sqrt(g[i]);
419 g[0] = c[0]+c[1]+c[2];
420 g[1] = c[0]*c[1] + c[0]*c[2] + c[1]*c[2];
421 g[2] = c[0]*c[1]*c[2];
424 deriv_coeffs->set(g[0], g[1], g[2]);
426 const typename RealTypeId<Cmplx>::Type & denominator = g[2]*(g[0]*g[1]-g[2]);
427 c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1]))/denominator;
428 c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1])/denominator;
429 c[2] = g[0]/denominator;
431 tempq = c[1]*q + c[2]*qsq;
433 tempq(0,0).x += c[0];
434 tempq(1,1).x += c[0];
435 tempq(2,2).x += c[0];
448 template<
class Cmplx>
452 typename RealTypeId<Cmplx>::Type f[3];
453 typename RealTypeId<Cmplx>::Type b[6];
460 DerivativeCoefficients<typename RealTypeId<Cmplx>::Type> deriv_coeffs;
462 reciprocalRoot<Cmplx>(&rsqrt_q, &deriv_coeffs, f, q, unitarization_failed);
465 b[0] = deriv_coeffs.getB00();
466 b[1] = deriv_coeffs.getB01();
467 b[2] = deriv_coeffs.getB02();
468 b[3] = deriv_coeffs.getB11();
469 b[4] = deriv_coeffs.getB12();
470 b[5] = deriv_coeffs.getB22();
475 local_result = rsqrt_q*outer_prod;
484 temp = f[1]*v_dagger + f[2]*qv_dagger;
488 temp = f[1]*v + f[2]*v*q;
489 local_result = local_result + outer_prod*temp*v_dagger + f[2]*q*outer_prod*vv_dagger;
491 local_result = local_result + v_dagger*conj_outer_prod*
conj(temp) + f[2]*qv_dagger*conj_outer_prod*v_dagger;
496 Matrix<Cmplx,3> pv_dagger = b[0]*v_dagger + b[1]*qv_dagger + b[2]*qsqv_dagger;
497 accumBothDerivatives(&local_result, v, pv_dagger, outer_prod);
499 Matrix<Cmplx,3> rv_dagger = b[1]*v_dagger + b[3]*qv_dagger + b[4]*qsqv_dagger;
501 accumBothDerivatives(&local_result, vq, rv_dagger, outer_prod);
503 Matrix<Cmplx,3> sv_dagger = b[2]*v_dagger + b[4]*qv_dagger + b[5]*qsqv_dagger;
505 accumBothDerivatives(&local_result, vqsq, sv_dagger, outer_prod);
512 template<
class Cmplx>
513 __global__
void getUnitarizeForceField(
const int threads,
const Cmplx* link_even,
const Cmplx* link_odd,
514 const Cmplx* old_force_even,
const Cmplx* old_force_odd,
515 Cmplx* force_even, Cmplx* force_odd,
516 int* unitarization_failed)
519 int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
521 const int HALF_VOLUME = threads/2;
522 if(mem_idx >= threads)
return;
526 const Cmplx* old_force;
530 old_force = old_force_even;
531 if(mem_idx >= HALF_VOLUME){
532 mem_idx = mem_idx - HALF_VOLUME;
535 old_force = old_force_odd;
542 for(
int dir=0; dir<4; ++dir){
546 getUnitarizeForceSite<double2>(v, oprod, &result, unitarization_failed);
557 int num_failures = 0;
565 for(
int i=0; i<cpuGauge.Volume(); ++i){
566 for(
int dir=0; dir<4; ++dir){
568 copyArrayToLink(&old_force, ((
float*)(cpuOldForce.Gauge_p()) + (i*4 + dir)*18));
570 getUnitarizeForceSite<double2>(v, old_force, &new_force, &num_failures);
571 copyLinkToArray(((
float*)(cpuNewForce->Gauge_p()) + (i*4 + dir)*18), new_force);
573 copyArrayToLink(&old_force, ((
double*)(cpuOldForce.Gauge_p()) + (i*4 + dir)*18));
575 getUnitarizeForceSite<double2>(v, old_force, &new_force, &num_failures);
576 copyLinkToArray(((
double*)(cpuNewForce->Gauge_p()) + (i*4 + dir)*18), new_force);
581 for(
int dir=0; dir<4; ++dir){
582 for(
int i=0; i<cpuGauge.Volume(); ++i){
584 copyArrayToLink(&old_force, ((
float**)(cpuOldForce.Gauge_p()))[dir] + i*18);
586 getUnitarizeForceSite<double2>(v, old_force, &new_force, &num_failures);
587 copyLinkToArray(((
float**)(cpuNewForce->Gauge_p()))[dir] + i*18, new_force);
589 copyArrayToLink(&old_force, ((
double**)(cpuOldForce.Gauge_p()))[dir] + i*18);
591 getUnitarizeForceSite<double2>(v, old_force, &new_force, &num_failures);
592 copyLinkToArray(((
double**)(cpuNewForce->Gauge_p()))[dir] + i*18, new_force);
597 errorQuda(
"Only MILC and QDP gauge orders supported\n");
602 class UnitarizeForceCuda :
public Tunable {
604 const cudaGaugeField &oldForce;
605 const cudaGaugeField &
gauge;
606 cudaGaugeField &newForce;
609 unsigned int sharedBytesPerThread()
const {
return 0; }
610 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
613 bool tuneGridDim()
const {
return false; }
614 unsigned int minThreads()
const {
return gauge.Volume(); }
617 UnitarizeForceCuda(
const cudaGaugeField& oldForce,
const cudaGaugeField&
gauge,
618 cudaGaugeField& newForce,
int* fails) :
619 oldForce(oldForce), gauge(gauge), newForce(newForce), fails(fails) {
620 writeAuxString(
"threads=%d,prec=%lu,stride=%d",
621 gauge.Volume(), gauge.Precision(), gauge.Stride());
623 virtual ~UnitarizeForceCuda() { ; }
625 void apply(
const cudaStream_t &
stream) {
629 getUnitarizeForceField<<<tp.grid,tp.block>>>(gauge.Volume(), (
const float2*)gauge.Even_p(), (
const float2*)gauge.Odd_p(),
630 (
const float2*)oldForce.Even_p(), (
const float2*)oldForce.Odd_p(),
631 (float2*)newForce.Even_p(), (float2*)newForce.Odd_p(),
634 getUnitarizeForceField<<<tp.grid,tp.block>>>(gauge.Volume(), (
const double2*)gauge.Even_p(), (
const double2*)gauge.Odd_p(),
635 (
const double2*)oldForce.Even_p(), (
const double2*)oldForce.Odd_p(),
636 (double2*)newForce.Even_p(), (double2*)newForce.Odd_p(),
642 void postTune() { cudaMemset(fails, 0,
sizeof(
int)); }
644 long long flops()
const {
return 4*4528*gauge.Volume(); }
646 TuneKey tuneKey()
const {
return TuneKey(gauge.VolString(),
typeid(*this).name(), aux); }
650 cudaGaugeField &
cudaGauge, cudaGaugeField *cudaNewForce,
int* unitarization_failed,
long long *flops) {
652 UnitarizeForceCuda unitarizeForce(cudaOldForce, cudaGauge, *cudaNewForce, unitarization_failed);
653 unitarizeForce.apply(0);
654 if(flops) *flops = unitarizeForce.flops();
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
QudaVerbosity getVerbosity()
void unitarizeForceCuda(cudaGaugeField &cudaOldForce, cudaGaugeField &cudaGauge, cudaGaugeField *cudaNewForce, int *unitarization_failed, long long *flops=NULL)
void unitarizeForceCPU(cpuGaugeField &cpuOldForce, cpuGaugeField &cpuGauge, cpuGaugeField *cpuNewForce)
__host__ __device__ ValueType sqrt(ValueType x)
__global__ void const FloatN FloatM FloatM Float Float int threads
__device__ __host__ T getDeterminant(const Matrix< T, 3 > &a)
cudaGaugeField * cudaGauge
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
void copyLinkToArray(float *array, const Matrix< float2, 3 > &link)
cudaColorSpinorField * tmp2
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
void copyArrayToLink(Matrix< float2, 3 > *link, float *array)
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
__device__ void loadLinkVariableFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *link)
__host__ __device__ ValueType cos(ValueType x)
__host__ __device__ ValueType acos(ValueType x)
__device__ void writeLinkVariableToArray(const Matrix< T, 3 > &link, const int dir, const int idx, const int stride, T *const array)
__host__ __device__ ValueType conj(ValueType x)