16 #if defined(__APPLE__) && CUDA_VERSION >= 7000 && CUDA_VERSION < 7050 17 #define EXPONENT_TYPE Real 19 #define EXPONENT_TYPE int 28 #define HISQ_UNITARIZE_PI 3.14159265358979323846 29 #define HISQ_UNITARIZE_PI23 HISQ_UNITARIZE_PI*2.0/3.0 32 static double force_filter;
33 static double max_det_error;
34 static bool allow_svd;
40 namespace fermion_force {
42 template <
typename F,
typename G>
43 struct UnitarizeForceArg {
50 const double force_filter;
51 const double max_det_error;
57 UnitarizeForceArg(
const F &force,
const F &force_old,
const G &gauge,
const GaugeField &meta,
int *fails,
58 double unitarize_eps,
double force_filter,
double max_det_error,
int allow_svd,
61 force_filter(force_filter), max_det_error(max_det_error), allow_svd(allow_svd),
64 for(
int dir=0; dir<4; ++dir) threads *= meta.X()[dir];
70 double max_det_error_,
bool allow_svd_,
bool svd_only_,
71 double svd_rel_error_,
double svd_abs_error_)
74 force_filter = force_filter_;
75 max_det_error = max_det_error_;
76 allow_svd = allow_svd_;
84 class DerivativeCoefficients{
88 Real computeC00(
const Real &,
const Real &,
const Real &);
90 Real computeC01(
const Real &,
const Real &,
const Real &);
92 Real computeC02(
const Real &,
const Real &,
const Real &);
94 Real computeC11(
const Real &,
const Real &,
const Real &);
96 Real computeC12(
const Real &,
const Real &,
const Real &);
98 Real computeC22(
const Real &,
const Real &,
const Real &);
101 __device__ __host__
void set(
const Real & u,
const Real & v,
const Real &
w);
103 Real getB00()
const {
return b[0]; }
105 Real getB01()
const {
return b[1]; }
107 Real getB02()
const {
return b[2]; }
109 Real getB11()
const {
return b[3]; }
111 Real getB12()
const {
return b[4]; }
113 Real getB22()
const {
return b[5]; }
118 Real DerivativeCoefficients<Real>::computeC00(
const Real & u,
const Real & v,
const Real &
w){
119 Real result = -
pow(
w,static_cast<EXPONENT_TYPE>(3)) *
pow(u,static_cast<EXPONENT_TYPE>(6))
120 + 3*v*
pow(
w,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(4))
121 + 3*
pow(v,static_cast<EXPONENT_TYPE>(4))*
w*
pow(u,static_cast<EXPONENT_TYPE>(4))
122 -
pow(v,static_cast<EXPONENT_TYPE>(6))*
pow(u,static_cast<EXPONENT_TYPE>(3))
123 - 4*
pow(
w,static_cast<EXPONENT_TYPE>(4))*
pow(u,static_cast<EXPONENT_TYPE>(3))
124 - 12*
pow(v,static_cast<EXPONENT_TYPE>(3))*
pow(
w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(3))
125 + 16*
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(
w,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(2))
126 + 3*
pow(v,static_cast<EXPONENT_TYPE>(5))*
w*
pow(u,static_cast<EXPONENT_TYPE>(2))
127 - 8*v*
pow(
w,static_cast<EXPONENT_TYPE>(4))*u
128 - 3*
pow(v,static_cast<EXPONENT_TYPE>(4))*
pow(
w,static_cast<EXPONENT_TYPE>(2))*u
129 +
pow(
w,static_cast<EXPONENT_TYPE>(5))
130 +
pow(v,static_cast<EXPONENT_TYPE>(3))*
pow(
w,static_cast<EXPONENT_TYPE>(3));
137 Real DerivativeCoefficients<Real>::computeC01(
const Real & u,
const Real & v,
const Real &
w){
138 Real result = -
pow(
w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(7))
139 -
pow(v,static_cast<EXPONENT_TYPE>(2))*
w*
pow(u,static_cast<EXPONENT_TYPE>(6))
140 +
pow(v,static_cast<EXPONENT_TYPE>(4))*
pow(u,static_cast<EXPONENT_TYPE>(5))
141 + 6*v*
pow(
w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(5))
142 - 5*
pow(
w,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(4))
143 -
pow(v,static_cast<EXPONENT_TYPE>(3))*
w*
pow(u,static_cast<EXPONENT_TYPE>(4))
144 - 2*
pow(v,static_cast<EXPONENT_TYPE>(5))*
pow(u,static_cast<EXPONENT_TYPE>(3))
145 - 6*
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(
w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(3))
146 + 10*v*
pow(
w,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(2))
147 + 6*
pow(v,static_cast<EXPONENT_TYPE>(4))*
w*
pow(u,static_cast<EXPONENT_TYPE>(2))
148 - 3*
pow(
w,static_cast<EXPONENT_TYPE>(4))*u
149 - 6*
pow(v,static_cast<EXPONENT_TYPE>(3))*
pow(
w,static_cast<EXPONENT_TYPE>(2))*u
150 + 2*
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(
w,static_cast<EXPONENT_TYPE>(3));
156 Real DerivativeCoefficients<Real>::computeC02(
const Real & u,
const Real & v,
const Real &
w){
157 Real result =
pow(
w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(5))
158 +
pow(v,static_cast<EXPONENT_TYPE>(2))*
w*
pow(u,static_cast<EXPONENT_TYPE>(4))
159 -
pow(v,static_cast<EXPONENT_TYPE>(4))*
pow(u,static_cast<EXPONENT_TYPE>(3))
160 - 4*v*
pow(
w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(3))
161 + 4*
pow(
w,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(2))
162 + 3*
pow(v,static_cast<EXPONENT_TYPE>(3))*
w*
pow(u,static_cast<EXPONENT_TYPE>(2))
163 - 3*
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(
w,static_cast<EXPONENT_TYPE>(2))*u
164 + v*
pow(
w,static_cast<EXPONENT_TYPE>(3));
170 Real DerivativeCoefficients<Real>::computeC11(
const Real & u,
const Real & v,
const Real &
w){
171 Real result = -
w*
pow(u,static_cast<EXPONENT_TYPE>(8))
172 -
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(7))
173 + 7*v*
w*
pow(u,static_cast<EXPONENT_TYPE>(6))
174 + 4*
pow(v,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(5))
175 - 5*
pow(
w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(5))
176 - 16*
pow(v,static_cast<EXPONENT_TYPE>(2))*
w*
pow(u,static_cast<EXPONENT_TYPE>(4))
177 - 4*
pow(v,static_cast<EXPONENT_TYPE>(4))*
pow(u,static_cast<EXPONENT_TYPE>(3))
178 + 16*v*
pow(
w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(3))
179 - 3*
pow(
w,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(2))
180 + 12*
pow(v,static_cast<EXPONENT_TYPE>(3))*
w*
pow(u,static_cast<EXPONENT_TYPE>(2))
181 - 12*
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(
w,static_cast<EXPONENT_TYPE>(2))*u
182 + 3*v*
pow(
w,static_cast<EXPONENT_TYPE>(3));
188 Real DerivativeCoefficients<Real>::computeC12(
const Real & u,
const Real & v,
const Real &
w){
189 Real result =
w*
pow(u,static_cast<EXPONENT_TYPE>(6))
190 +
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(5))
191 - 5*v*
w*
pow(u,static_cast<EXPONENT_TYPE>(4))
192 - 2*
pow(v,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(3))
193 + 4*
pow(
w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(3))
194 + 6*
pow(v,static_cast<EXPONENT_TYPE>(2))*
w*
pow(u,static_cast<EXPONENT_TYPE>(2))
195 - 6*v*
pow(
w,static_cast<EXPONENT_TYPE>(2))*u
196 +
pow(
w,static_cast<EXPONENT_TYPE>(3));
202 Real DerivativeCoefficients<Real>::computeC22(
const Real & u,
const Real & v,
const Real &
w){
203 Real result = -
w*
pow(u,static_cast<EXPONENT_TYPE>(4))
204 -
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(3))
205 + 3*v*
w*
pow(u,static_cast<EXPONENT_TYPE>(2))
206 - 3*
pow(
w,static_cast<EXPONENT_TYPE>(2))*u;
210 template <
class Real>
213 const Real & denominator = 2.0*
pow(
w*(u*v-
w),static_cast<EXPONENT_TYPE>(3));
214 b[0] = computeC00(u,v,
w)/denominator;
215 b[1] = computeC01(u,v,
w)/denominator;
216 b[2] = computeC02(u,v,
w)/denominator;
217 b[3] = computeC11(u,v,
w)/denominator;
218 b[4] = computeC12(u,v,
w)/denominator;
219 b[5] = computeC22(u,v,
w)/denominator;
224 template<
class Float>
226 void accumBothDerivatives(
Matrix<complex<Float>,3>* result,
const Matrix<complex<Float>,3> &left,
227 const Matrix<complex<Float>,3> &right,
const Matrix<complex<Float>,3> &outer_prod)
229 const Float temp = (2.0*
getTrace(left*outer_prod)).real();;
230 for(
int k=0; k<3; ++k){
231 for(
int l=0; l<3; ++l){
235 result->operator()(k,l).
x += temp*right(k,l).x;
236 result->operator()(k,l).
y += temp*right(k,l).y;
243 template<
class Cmplx>
247 Cmplx temp =
getTrace(left*outer_prod);
248 for(
int k=0; k<3; ++k){
249 for(
int l=0; l<3; ++l){
250 result->operator()(k,l) = temp*right(k,l);
259 T getAbsMin(
const T*
const array,
int size){
263 if ((abs_val) < min){ min = abs_val; }
271 inline bool checkAbsoluteError(Real
a, Real
b, Real epsilon)
273 if(
fabs(
a-
b) < epsilon)
return true;
280 inline bool checkRelativeError(Real
a, Real
b, Real epsilon)
282 if(
fabs((
a-
b)/
b) < epsilon )
return true;
291 template<
class Float,
typename Arg>
293 void reciprocalRoot(
Matrix<complex<Float>,3>* res, DerivativeCoefficients<Float>* deriv_coeffs,
294 Float
f[3],
Matrix<complex<Float>,3> & q, Arg &
arg) {
309 g[0] = g[1] = g[2] =
c[0]/3.;
311 s =
c[1]/3. -
c[0]*
c[0]/18;
312 r =
c[2]/2. - (
c[0]/3.)*(
c[1] -
c[0]*
c[0]/9.);
314 Float cosTheta = r/
sqrt(
s*
s*
s);
319 if(
fabs(cosTheta)>1.0){ r>0 ? theta=0.0 : theta=HISQ_UNITARIZE_PI/3.0; }
320 else{ theta =
acos(cosTheta)/3.0; }
323 for(
int i=0;
i<3; ++
i){
324 g[
i] +=
s*
cos(theta + (
i-1)*HISQ_UNITARIZE_PI23);
342 bool perform_svd =
true;
345 if(
fabs(det) >=
arg.svd_abs_error) {
346 if( checkRelativeError(g[0]*g[1]*g[2],det,
arg.svd_rel_error) ) perform_svd =
false;
353 computeSVD<Float>(q,tempq,
tmp2,g);
358 const Float gprod = g[0]*g[1]*g[2];
360 if (
fabs(gprod - determinant) >
arg.max_det_error) {
361 printf(
"Warning: Error in determinant computed by SVD : %g > %g\n",
fabs(gprod-determinant),
arg.max_det_error);
365 atomicAdd(
arg.fails, 1);
374 Float
delta = getAbsMin(g,3);
376 for (
int i=0;
i<3; ++
i) {
377 g[
i] +=
arg.force_filter;
378 q(
i,
i).x +=
arg.force_filter;
389 g[0] =
c[0]+
c[1]+
c[2];
390 g[1] =
c[0]*
c[1] +
c[0]*
c[2] +
c[1]*
c[2];
391 g[2] =
c[0]*
c[1]*
c[2];
394 deriv_coeffs->set(g[0], g[1], g[2]);
396 const Float& denominator = g[2]*(g[0]*g[1]-g[2]);
397 c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1]))/denominator;
398 c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1])/denominator;
399 c[2] = g[0]/denominator;
401 tempq =
c[1]*q +
c[2]*qsq;
403 tempq(0,0).x +=
c[0];
404 tempq(1,1).x +=
c[0];
405 tempq(2,2).x +=
c[0];
418 template<
class Float,
typename Arg>
420 void getUnitarizeForceSite(
Matrix<complex<Float>,3>& result,
const Matrix<complex<Float>,3> & v,
421 const Matrix<complex<Float>,3> & outer_prod, Arg &
arg)
427 Link v_dagger =
conj(v);
431 DerivativeCoefficients<Float> deriv_coeffs;
433 reciprocalRoot<Float>(&rsqrt_q, &deriv_coeffs,
f, q,
arg);
436 b[0] = deriv_coeffs.getB00();
437 b[1] = deriv_coeffs.getB01();
438 b[2] = deriv_coeffs.getB02();
439 b[3] = deriv_coeffs.getB11();
440 b[4] = deriv_coeffs.getB12();
441 b[5] = deriv_coeffs.getB22();
443 result = rsqrt_q*outer_prod;
446 Link qv_dagger = q*v_dagger;
447 Link vv_dagger = v*v_dagger;
448 Link vqv_dagger = v*qv_dagger;
449 Link temp =
f[1]*vv_dagger +
f[2]*vqv_dagger;
451 temp =
f[1]*v_dagger +
f[2]*qv_dagger;
452 Link conj_outer_prod =
conj(outer_prod);
454 temp =
f[1]*v +
f[2]*v*q;
455 result = result + outer_prod*temp*v_dagger +
f[2]*q*outer_prod*vv_dagger;
456 result = result + v_dagger*conj_outer_prod*
conj(temp) +
f[2]*qv_dagger*conj_outer_prod*v_dagger;
458 Link qsqv_dagger = q*qv_dagger;
459 Link pv_dagger =
b[0]*v_dagger +
b[1]*qv_dagger +
b[2]*qsqv_dagger;
460 accumBothDerivatives(&result, v, pv_dagger, outer_prod);
462 Link rv_dagger =
b[1]*v_dagger +
b[3]*qv_dagger +
b[4]*qsqv_dagger;
464 accumBothDerivatives(&result, vq, rv_dagger, outer_prod);
466 Link sv_dagger =
b[2]*v_dagger +
b[4]*qv_dagger +
b[5]*qsqv_dagger;
468 accumBothDerivatives(&result, vqsq, sv_dagger, outer_prod);
475 template<
typename Float,
typename Arg>
476 __global__
void getUnitarizeForceField(Arg
arg)
479 if(
idx >=
arg.threads)
return;
481 if(
idx >=
arg.threads/2) {
490 for(
int dir=0; dir<4; ++dir){
496 getUnitarizeForceSite<double>(result, v, oprod,
arg);
499 arg.force.save((Float*)(result_tmp.data),
idx, dir,
parity);
505 template <
typename Float,
typename Arg>
511 for (
int i=0;
i<
arg.threads/2;
i++) {
512 for (
int dir=0; dir<4; dir++) {
518 getUnitarizeForceSite<double>(result, v, oprod,
arg);
521 arg.force.save((Float*)(result_tmp.data),
i, dir,
parity);
527 void unitarizeForceCPU(cpuGaugeField& newForce,
const cpuGaugeField& oldForce,
const cpuGaugeField& gauge)
534 typedef gauge::MILCOrder<double,18> G;
537 unitarizeForceCPU<double>(
arg);
539 typedef gauge::MILCOrder<float,18> G;
542 unitarizeForceCPU<float>(
arg);
544 errorQuda(
"Precision = %d not supported", gauge.Precision());
548 typedef gauge::QDPOrder<double,18> G;
551 unitarizeForceCPU<double>(
arg);
553 typedef gauge::QDPOrder<float,18> G;
556 unitarizeForceCPU<float>(
arg);
558 errorQuda(
"Precision = %d not supported", gauge.Precision());
561 errorQuda(
"Only MILC and QDP gauge orders supported\n");
568 template <
typename Float,
typename Arg>
569 class UnitarizeForce :
public Tunable {
572 const GaugeField &meta;
574 unsigned int sharedBytesPerThread()
const {
return 0; }
575 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
578 bool tuneGridDim()
const {
return false; }
579 unsigned int minThreads()
const {
return arg.threads; }
582 UnitarizeForce(Arg &
arg,
const GaugeField& meta) :
arg(
arg), meta(meta) {
583 writeAuxString(
"threads=%d,prec=%lu,stride=%d", meta.Volume(), meta.Precision(), meta.Stride());
585 virtual ~UnitarizeForce() { ; }
587 void apply(
const cudaStream_t &
stream) {
589 getUnitarizeForceField<Float><<<tp.grid,tp.block>>>(
arg);
593 void postTune() { cudaMemset(
arg.fails, 0,
sizeof(
int)); }
595 long long flops()
const {
return 4ll*4528*meta.Volume(); }
596 long long bytes()
const {
return 4ll *
arg.threads * (
arg.force.Bytes() +
arg.force_old.Bytes() +
arg.gauge.Bytes()); }
598 TuneKey tuneKey()
const {
return TuneKey(meta.VolString(),
typeid(*this).name(), aux); }
601 template<
typename Float,
typename Gauge>
602 void unitarizeForce(Gauge newForce,
const Gauge oldForce,
const Gauge gauge,
603 const GaugeField &meta,
int* fails,
long long *
flops) {
605 UnitarizeForceArg<Gauge,Gauge>
arg(newForce, oldForce, gauge, meta, fails,
unitarize_eps, force_filter,
614 void unitarizeForce(cudaGaugeField &newForce,
const cudaGaugeField &oldForce,
const cudaGaugeField &gauge,
615 int* fails,
long long *
flops) {
618 errorQuda(
"Force field should not use reconstruct %d", oldForce.Reconstruct());
621 errorQuda(
"Force field should not use reconstruct %d", newForce.Reconstruct());
624 errorQuda(
"Gauge field should not use reconstruct %d", gauge.Reconstruct());
626 if (gauge.Precision() != oldForce.Precision() || gauge.Precision() != newForce.Precision())
627 errorQuda(
"Mixed precision not supported");
629 if (gauge.Order() != oldForce.Order() || gauge.Order() != newForce.Order())
630 errorQuda(
"Mixed data ordering not supported");
634 typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_NO>::type G;
635 unitarizeForce<double>(G(newForce), G(oldForce), G(gauge), gauge, fails,
flops);
637 typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_NO>::type G;
638 unitarizeForce<float>(G(newForce), G(oldForce), G(gauge), gauge, fails,
flops);
641 errorQuda(
"Data order %d not supported", gauge.Order());
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
Set the constant parameters for the force unitarization.
QudaVerbosity getVerbosity()
__host__ __device__ ValueType sqrt(ValueType x)
__host__ __device__ double set(double &x)
static double svd_rel_error
void unitarizeForceCPU(cpuGaugeField &newForce, const cpuGaugeField &oldForce, const cpuGaugeField &gauge)
Unitarize the fermion force on CPU.
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
static unsigned int delta
int printf(const char *,...) __attribute__((__format__(__printf__
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
int int int enum cudaChannelFormatKind f
static double unitarize_eps
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
void unitarizeForce(cudaGaugeField &newForce, const cudaGaugeField &oldForce, const cudaGaugeField &gauge, int *unitarization_failed, long long *flops=NULL)
Unitarize the fermion force.
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
static double svd_abs_error
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__host__ __device__ ValueType cos(ValueType x)
__host__ __device__ ValueType acos(ValueType x)
struct cudaExtent unsigned int cudaArray_t array
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...