16 #if defined(__APPLE__) && CUDA_VERSION >= 7000 && CUDA_VERSION < 7050 17 #define EXPONENT_TYPE Real 19 #define EXPONENT_TYPE int 28 #define HISQ_UNITARIZE_PI 3.14159265358979323846 29 #define HISQ_UNITARIZE_PI23 HISQ_UNITARIZE_PI*2.0/3.0 32 static double force_filter;
33 static double max_det_error;
34 static bool allow_svd;
40 namespace fermion_force {
42 template <
typename F,
typename G>
43 struct UnitarizeForceArg {
50 const double force_filter;
51 const double max_det_error;
57 UnitarizeForceArg(
const F &force,
const F &force_old,
const G &gauge,
const GaugeField &meta,
int *fails,
58 double unitarize_eps,
double force_filter,
double max_det_error,
int allow_svd,
59 int svd_only,
double svd_rel_error,
double svd_abs_error)
60 : threads(1), force(force), force_old(force_old), gauge(gauge), fails(fails), unitarize_eps(unitarize_eps),
61 force_filter(force_filter), max_det_error(max_det_error), allow_svd(allow_svd),
62 svd_only(svd_only), svd_rel_error(svd_rel_error), svd_abs_error(svd_abs_error)
64 for(
int dir=0; dir<4; ++dir) threads *= meta.X()[dir];
70 double max_det_error_,
bool allow_svd_,
bool svd_only_,
71 double svd_rel_error_,
double svd_abs_error_)
73 unitarize_eps = unitarize_eps_;
74 force_filter = force_filter_;
75 max_det_error = max_det_error_;
76 allow_svd = allow_svd_;
78 svd_rel_error = svd_rel_error_;
79 svd_abs_error = svd_abs_error_;
84 class DerivativeCoefficients{
88 Real computeC00(
const Real &,
const Real &,
const Real &);
90 Real computeC01(
const Real &,
const Real &,
const Real &);
92 Real computeC02(
const Real &,
const Real &,
const Real &);
94 Real computeC11(
const Real &,
const Real &,
const Real &);
96 Real computeC12(
const Real &,
const Real &,
const Real &);
98 Real computeC22(
const Real &,
const Real &,
const Real &);
101 __device__ __host__
void set(
const Real & u,
const Real & v,
const Real & w);
103 Real getB00()
const {
return b[0]; }
105 Real getB01()
const {
return b[1]; }
107 Real getB02()
const {
return b[2]; }
109 Real getB11()
const {
return b[3]; }
111 Real getB12()
const {
return b[4]; }
113 Real getB22()
const {
return b[5]; }
118 Real DerivativeCoefficients<Real>::computeC00(
const Real & u,
const Real & v,
const Real & w){
119 Real result = -
pow(w,static_cast<EXPONENT_TYPE>(3)) *
pow(u,static_cast<EXPONENT_TYPE>(6))
120 + 3*v*
pow(w,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(4))
121 + 3*
pow(v,static_cast<EXPONENT_TYPE>(4))*w*
pow(u,static_cast<EXPONENT_TYPE>(4))
122 -
pow(v,static_cast<EXPONENT_TYPE>(6))*
pow(u,static_cast<EXPONENT_TYPE>(3))
123 - 4*
pow(w,static_cast<EXPONENT_TYPE>(4))*
pow(u,static_cast<EXPONENT_TYPE>(3))
124 - 12*
pow(v,static_cast<EXPONENT_TYPE>(3))*
pow(w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(3))
125 + 16*
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(w,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(2))
126 + 3*
pow(v,static_cast<EXPONENT_TYPE>(5))*w*
pow(u,static_cast<EXPONENT_TYPE>(2))
127 - 8*v*
pow(w,static_cast<EXPONENT_TYPE>(4))*u
128 - 3*
pow(v,static_cast<EXPONENT_TYPE>(4))*
pow(w,static_cast<EXPONENT_TYPE>(2))*u
129 +
pow(w,static_cast<EXPONENT_TYPE>(5))
130 +
pow(v,static_cast<EXPONENT_TYPE>(3))*
pow(w,static_cast<EXPONENT_TYPE>(3));
137 Real DerivativeCoefficients<Real>::computeC01(
const Real & u,
const Real & v,
const Real & w){
138 Real result = -
pow(w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(7))
139 -
pow(v,static_cast<EXPONENT_TYPE>(2))*w*
pow(u,static_cast<EXPONENT_TYPE>(6))
140 +
pow(v,static_cast<EXPONENT_TYPE>(4))*
pow(u,static_cast<EXPONENT_TYPE>(5))
141 + 6*v*
pow(w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(5))
142 - 5*
pow(w,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(4))
143 -
pow(v,static_cast<EXPONENT_TYPE>(3))*w*
pow(u,static_cast<EXPONENT_TYPE>(4))
144 - 2*
pow(v,static_cast<EXPONENT_TYPE>(5))*
pow(u,static_cast<EXPONENT_TYPE>(3))
145 - 6*
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(3))
146 + 10*v*
pow(w,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(2))
147 + 6*
pow(v,static_cast<EXPONENT_TYPE>(4))*w*
pow(u,static_cast<EXPONENT_TYPE>(2))
148 - 3*
pow(w,static_cast<EXPONENT_TYPE>(4))*u
149 - 6*
pow(v,static_cast<EXPONENT_TYPE>(3))*
pow(w,static_cast<EXPONENT_TYPE>(2))*u
150 + 2*
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(w,static_cast<EXPONENT_TYPE>(3));
156 Real DerivativeCoefficients<Real>::computeC02(
const Real & u,
const Real & v,
const Real & w){
157 Real result =
pow(w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(5))
158 +
pow(v,static_cast<EXPONENT_TYPE>(2))*w*
pow(u,static_cast<EXPONENT_TYPE>(4))
159 -
pow(v,static_cast<EXPONENT_TYPE>(4))*
pow(u,static_cast<EXPONENT_TYPE>(3))
160 - 4*v*
pow(w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(3))
161 + 4*
pow(w,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(2))
162 + 3*
pow(v,static_cast<EXPONENT_TYPE>(3))*w*
pow(u,static_cast<EXPONENT_TYPE>(2))
163 - 3*
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(w,static_cast<EXPONENT_TYPE>(2))*u
164 + v*
pow(w,static_cast<EXPONENT_TYPE>(3));
170 Real DerivativeCoefficients<Real>::computeC11(
const Real & u,
const Real & v,
const Real & w){
171 Real result = - w*
pow(u,static_cast<EXPONENT_TYPE>(8))
172 -
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(7))
173 + 7*v*w*
pow(u,static_cast<EXPONENT_TYPE>(6))
174 + 4*
pow(v,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(5))
175 - 5*
pow(w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(5))
176 - 16*
pow(v,static_cast<EXPONENT_TYPE>(2))*w*
pow(u,static_cast<EXPONENT_TYPE>(4))
177 - 4*
pow(v,static_cast<EXPONENT_TYPE>(4))*
pow(u,static_cast<EXPONENT_TYPE>(3))
178 + 16*v*
pow(w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(3))
179 - 3*
pow(w,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(2))
180 + 12*
pow(v,static_cast<EXPONENT_TYPE>(3))*w*
pow(u,static_cast<EXPONENT_TYPE>(2))
181 - 12*
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(w,static_cast<EXPONENT_TYPE>(2))*u
182 + 3*v*
pow(w,static_cast<EXPONENT_TYPE>(3));
188 Real DerivativeCoefficients<Real>::computeC12(
const Real & u,
const Real & v,
const Real & w){
189 Real result = w*
pow(u,static_cast<EXPONENT_TYPE>(6))
190 +
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(5))
191 - 5*v*w*
pow(u,static_cast<EXPONENT_TYPE>(4))
192 - 2*
pow(v,static_cast<EXPONENT_TYPE>(3))*
pow(u,static_cast<EXPONENT_TYPE>(3))
193 + 4*
pow(w,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(3))
194 + 6*
pow(v,static_cast<EXPONENT_TYPE>(2))*w*
pow(u,static_cast<EXPONENT_TYPE>(2))
195 - 6*v*
pow(w,static_cast<EXPONENT_TYPE>(2))*u
196 +
pow(w,static_cast<EXPONENT_TYPE>(3));
202 Real DerivativeCoefficients<Real>::computeC22(
const Real & u,
const Real & v,
const Real & w){
203 Real result = - w*
pow(u,static_cast<EXPONENT_TYPE>(4))
204 -
pow(v,static_cast<EXPONENT_TYPE>(2))*
pow(u,static_cast<EXPONENT_TYPE>(3))
205 + 3*v*w*
pow(u,static_cast<EXPONENT_TYPE>(2))
206 - 3*
pow(w,static_cast<EXPONENT_TYPE>(2))*u;
210 template <
class Real>
213 const Real & denominator = 2.0*
pow(w*(u*v-w),static_cast<EXPONENT_TYPE>(3));
214 b[0] = computeC00(u,v,w)/denominator;
215 b[1] = computeC01(u,v,w)/denominator;
216 b[2] = computeC02(u,v,w)/denominator;
217 b[3] = computeC11(u,v,w)/denominator;
218 b[4] = computeC12(u,v,w)/denominator;
219 b[5] = computeC22(u,v,w)/denominator;
224 template<
class Float>
226 void accumBothDerivatives(
Matrix<complex<Float>,3>* result,
const Matrix<complex<Float>,3> &left,
227 const Matrix<complex<Float>,3> &right,
const Matrix<complex<Float>,3> &outer_prod)
229 const Float temp = (2.0*
getTrace(left*outer_prod)).real();;
230 for(
int k=0; k<3; ++k){
231 for(
int l=0; l<3; ++l){
235 result->operator()(k,l).x += temp*right(k,l).x;
236 result->operator()(k,l).y += temp*right(k,l).y;
243 template<
class Cmplx>
247 Cmplx temp =
getTrace(left*outer_prod);
248 for(
int k=0; k<3; ++k){
249 for(
int l=0; l<3; ++l){
250 result->operator()(k,l) = temp*right(k,l);
259 T getAbsMin(
const T*
const array,
int size){
260 T min = fabs(array[0]);
261 for (
int i=1; i<
size; ++i) {
262 T abs_val = fabs(array[i]);
263 if ((abs_val) < min){ min = abs_val; }
271 inline bool checkAbsoluteError(Real a, Real b, Real
epsilon)
273 if( fabs(a-b) < epsilon)
return true;
280 inline bool checkRelativeError(Real a, Real b, Real epsilon)
282 if( fabs((a-b)/b) < epsilon )
return true;
291 template<
class Float,
typename Arg>
293 void reciprocalRoot(
Matrix<complex<Float>,3>* res, DerivativeCoefficients<Float>* deriv_coeffs,
294 Float f[3],
Matrix<complex<Float>,3> & q, Arg &
arg) {
309 g[0] = g[1] = g[2] = c[0]/3.;
311 s = c[1]/3. - c[0]*c[0]/18;
312 r = c[2]/2. - (c[0]/3.)*(c[1] - c[0]*c[0]/9.);
314 Float cosTheta = r/
sqrt(s*s*s);
315 if (fabs(s) < arg.unitarize_eps) {
319 if(fabs(cosTheta)>1.0){ r>0 ? theta=0.0 : theta=HISQ_UNITARIZE_PI/3.0; }
320 else{ theta =
acos(cosTheta)/3.0; }
323 for(
int i=0; i<3; ++i){
324 g[i] += s*
cos(theta + (i-1)*HISQ_UNITARIZE_PI23);
342 bool perform_svd =
true;
345 if( fabs(det) >= arg.svd_abs_error) {
346 if( checkRelativeError(g[0]*g[1]*g[2],det,arg.svd_rel_error) ) perform_svd =
false;
353 computeSVD<Float>(q,tempq,
tmp2,g);
358 const Float gprod = g[0]*g[1]*g[2];
360 if (fabs(gprod - determinant) > arg.max_det_error) {
361 printf(
"Warning: Error in determinant computed by SVD : %g > %g\n", fabs(gprod-determinant), arg.max_det_error);
374 Float delta = getAbsMin(g,3);
375 if (delta < arg.force_filter) {
376 for (
int i=0; i<3; ++i) {
377 g[i] += arg.force_filter;
378 q(i,i).x += arg.force_filter;
386 for (
int i=0; i<3; ++i) c[i] =
sqrt(g[i]);
389 g[0] = c[0]+c[1]+c[2];
390 g[1] = c[0]*c[1] + c[0]*c[2] + c[1]*c[2];
391 g[2] = c[0]*c[1]*c[2];
394 deriv_coeffs->set(g[0], g[1], g[2]);
396 const Float& denominator = g[2]*(g[0]*g[1]-g[2]);
397 c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1]))/denominator;
398 c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1])/denominator;
399 c[2] = g[0]/denominator;
401 tempq = c[1]*q + c[2]*qsq;
403 tempq(0,0).x += c[0];
404 tempq(1,1).x += c[0];
405 tempq(2,2).x += c[0];
418 template<
class Float,
typename Arg>
420 void getUnitarizeForceSite(
Matrix<complex<Float>,3>& result,
const Matrix<complex<Float>,3> & v,
421 const Matrix<complex<Float>,3> & outer_prod, Arg &arg)
427 Link v_dagger =
conj(v);
431 DerivativeCoefficients<Float> deriv_coeffs;
433 reciprocalRoot<Float>(&rsqrt_q, &deriv_coeffs, f, q,
arg);
436 b[0] = deriv_coeffs.getB00();
437 b[1] = deriv_coeffs.getB01();
438 b[2] = deriv_coeffs.getB02();
439 b[3] = deriv_coeffs.getB11();
440 b[4] = deriv_coeffs.getB12();
441 b[5] = deriv_coeffs.getB22();
443 result = rsqrt_q*outer_prod;
446 Link qv_dagger = q*v_dagger;
447 Link vv_dagger = v*v_dagger;
448 Link vqv_dagger = v*qv_dagger;
449 Link temp = f[1]*vv_dagger + f[2]*vqv_dagger;
451 temp = f[1]*v_dagger + f[2]*qv_dagger;
452 Link conj_outer_prod =
conj(outer_prod);
454 temp = f[1]*v + f[2]*v*q;
455 result = result + outer_prod*temp*v_dagger + f[2]*q*outer_prod*vv_dagger;
456 result = result + v_dagger*conj_outer_prod*
conj(temp) + f[2]*qv_dagger*conj_outer_prod*v_dagger;
458 Link qsqv_dagger = q*qv_dagger;
459 Link pv_dagger = b[0]*v_dagger + b[1]*qv_dagger + b[2]*qsqv_dagger;
460 accumBothDerivatives(&result, v, pv_dagger, outer_prod);
462 Link rv_dagger = b[1]*v_dagger + b[3]*qv_dagger + b[4]*qsqv_dagger;
464 accumBothDerivatives(&result, vq, rv_dagger, outer_prod);
466 Link sv_dagger = b[2]*v_dagger + b[4]*qv_dagger + b[5]*qsqv_dagger;
468 accumBothDerivatives(&result, vqsq, sv_dagger, outer_prod);
475 template<
typename Float,
typename Arg>
476 __global__
void getUnitarizeForceField(Arg arg)
478 int idx = blockIdx.x*blockDim.x + threadIdx.x;
479 if(idx >= arg.threads)
return;
481 if(idx >= arg.threads/2) {
483 idx -= arg.threads/2;
488 Matrix<complex<Float>,3> v_tmp, result_tmp, oprod_tmp;
490 for(
int dir=0; dir<4; ++dir){
491 oprod_tmp = arg.force_old(dir, idx, parity);
492 v_tmp = arg.gauge(dir, idx, parity);
496 getUnitarizeForceSite<double>(result, v, oprod,
arg);
499 arg.force(dir, idx, parity) = result_tmp;
505 template <
typename Float,
typename Arg>
508 Matrix<complex<Float>,3> v_tmp, result_tmp, oprod_tmp;
510 for (
int parity=0; parity<2; parity++) {
511 for (
int i=0; i<arg.threads/2; i++) {
512 for (
int dir=0; dir<4; dir++) {
513 oprod_tmp = arg.force_old(dir, i, parity);
514 v_tmp = arg.gauge(dir, i, parity);
518 getUnitarizeForceSite<double>(result, v, oprod,
arg);
521 arg.force(dir, i, parity) = result_tmp;
527 void unitarizeForceCPU(cpuGaugeField& newForce,
const cpuGaugeField& oldForce,
const cpuGaugeField& gauge)
534 typedef gauge::MILCOrder<double,18> G;
535 UnitarizeForceArg<G,G>
arg(G(newForce), G(oldForce), G(gauge), gauge, &num_failures, unitarize_eps, force_filter,
536 max_det_error, allow_svd, svd_only, svd_rel_error, svd_abs_error);
537 unitarizeForceCPU<double>(
arg);
539 typedef gauge::MILCOrder<float,18> G;
540 UnitarizeForceArg<G,G>
arg(G(newForce), G(oldForce), G(gauge), gauge, &num_failures, unitarize_eps, force_filter,
541 max_det_error, allow_svd, svd_only, svd_rel_error, svd_abs_error);
542 unitarizeForceCPU<float>(
arg);
544 errorQuda(
"Precision = %d not supported", gauge.Precision());
548 typedef gauge::QDPOrder<double,18> G;
549 UnitarizeForceArg<G,G>
arg(G(newForce), G(oldForce), G(gauge), gauge, &num_failures, unitarize_eps, force_filter,
550 max_det_error, allow_svd, svd_only, svd_rel_error, svd_abs_error);
551 unitarizeForceCPU<double>(
arg);
553 typedef gauge::QDPOrder<float,18> G;
554 UnitarizeForceArg<G,G>
arg(G(newForce), G(oldForce), G(gauge), gauge, &num_failures, unitarize_eps, force_filter,
555 max_det_error, allow_svd, svd_only, svd_rel_error, svd_abs_error);
556 unitarizeForceCPU<float>(
arg);
558 errorQuda(
"Precision = %d not supported", gauge.Precision());
561 errorQuda(
"Only MILC and QDP gauge orders supported\n");
564 if (num_failures)
errorQuda(
"Unitarization failed, failures = %d", num_failures);
568 template <
typename Float,
typename Arg>
569 class UnitarizeForce :
public Tunable {
572 const GaugeField &meta;
574 unsigned int sharedBytesPerThread()
const {
return 0; }
575 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
578 bool tuneGridDim()
const {
return false; }
579 unsigned int minThreads()
const {
return arg.threads; }
582 UnitarizeForce(Arg &arg,
const GaugeField& meta) : arg(arg), meta(meta) {
583 writeAuxString(
"threads=%d,prec=%lu,stride=%d", meta.Volume(), meta.Precision(), meta.Stride());
585 virtual ~UnitarizeForce() { ; }
587 void apply(
const cudaStream_t &
stream) {
589 getUnitarizeForceField<Float><<<tp.grid,tp.block>>>(
arg);
593 void postTune() { cudaMemset(arg.fails, 0,
sizeof(
int)); }
595 long long flops()
const {
return 4ll*4528*meta.Volume(); }
596 long long bytes()
const {
return 4ll * arg.threads * (arg.force.Bytes() + arg.force_old.Bytes() + arg.gauge.Bytes()); }
598 TuneKey tuneKey()
const {
return TuneKey(meta.VolString(),
typeid(*this).name(), aux); }
601 template<
typename Float,
typename Gauge>
602 void unitarizeForce(Gauge newForce,
const Gauge oldForce,
const Gauge gauge,
603 const GaugeField &meta,
int* fails) {
605 UnitarizeForceArg<Gauge,Gauge>
arg(newForce, oldForce, gauge, meta, fails, unitarize_eps, force_filter,
606 max_det_error, allow_svd, svd_only, svd_rel_error, svd_abs_error);
607 UnitarizeForce<Float,UnitarizeForceArg<Gauge,Gauge> >
unitarizeForce(arg, meta);
613 void unitarizeForce(cudaGaugeField &newForce,
const cudaGaugeField &oldForce,
const cudaGaugeField &gauge,
617 errorQuda(
"Force field should not use reconstruct %d", oldForce.Reconstruct());
620 errorQuda(
"Force field should not use reconstruct %d", newForce.Reconstruct());
623 errorQuda(
"Gauge field should not use reconstruct %d", gauge.Reconstruct());
625 if (gauge.Precision() != oldForce.Precision() || gauge.Precision() != newForce.Precision())
626 errorQuda(
"Mixed precision not supported");
628 if (gauge.Order() != oldForce.Order() || gauge.Order() != newForce.Order())
629 errorQuda(
"Mixed data ordering not supported");
633 typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_NO>::type G;
634 unitarizeForce<double>(G(newForce), G(oldForce), G(gauge), gauge, fails);
636 typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_NO>::type G;
637 unitarizeForce<float>(G(newForce), G(oldForce), G(gauge), gauge, fails);
640 errorQuda(
"Data order %d not supported", gauge.Order());
cudaColorSpinorField * tmp2
__host__ __device__ double set(double &x)
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
Set the constant parameters for the force unitarization.
QudaVerbosity getVerbosity()
__host__ __device__ ValueType sqrt(ValueType x)
void unitarizeForce(cudaGaugeField &newForce, const cudaGaugeField &oldForce, const cudaGaugeField &gauge, int *unitarization_failed)
Unitarize the fermion force.
void unitarizeForceCPU(cpuGaugeField &newForce, const cpuGaugeField &oldForce, const cpuGaugeField &gauge)
Unitarize the fermion force on CPU.
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
static __device__ double2 atomicAdd(double2 *addr, double2 val)
Implementation of double2 atomic addition using two double-precision additions.
static double svd_rel_error
#define qudaDeviceSynchronize()
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
static double svd_abs_error
Main header file for host and device accessors to GaugeFields.
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
static double unitarize_eps
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__host__ __device__ ValueType cos(ValueType x)
__host__ __device__ ValueType acos(ValueType x)
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...