v0.9.0/doc/unitarize__links__quda_8cu_source.html

 #include <cstdlib>
 #include <cstdio>
 #include <iostream>
 #include <iomanip>
 #include <cuda.h>
 #include <gauge_field.h>
 #include <gauge_field_order.h>

 #include <tune_quda.h>
 #include <quda_matrix.h>
 #include <unitarization_links.h>

 #include <su3_project.cuh>
 #include <index_helper.cuh>


 namespace quda{
 #ifdef GPU_UNITARIZE

 namespace{
   #include <svd_quda.h>
 }

 // pi, and two thirds of pi (the phase offset used between the three cosine
 // terms evaluated in reciprocalRoot)
 #ifndef FL_UNITARIZE_PI
 #define FL_UNITARIZE_PI 3.14159265358979323846
 #endif
 #ifndef FL_UNITARIZE_PI23
 // Parenthesised so the macro behaves as a single value in any expression
 // (e.g. as a divisor); existing uses evaluate to the same numbers as before.
 #define FL_UNITARIZE_PI23 (FL_UNITARIZE_PI*0.66666666666666666666)
 #endif

   // Newton iteration count used by the CPU reference path (unitarizeLinksCPU)
   static const int max_iter_newton = 20;
   // Iteration limit forwarded into UnitarizeLinksArg by the GPU path
   static const int max_iter = 20;

   // Run-time settings below can all be replaced through setUnitarizeLinksConstants().
   // |s| threshold in reciprocalRoot below which the trigonometric eigenvalue formula is skipped
   static double unitarize_eps = 1e-14;
   // tolerance handed to isUnitary() when the kernel validates each output link
   static double max_error = 1e-10;
   // non-zero: unitarizeLinkMILC may fall back to the SVD
   static int reunit_allow_svd = 1;
   // non-zero: unitarizeLinkMILC skips reciprocalRoot and goes straight to the SVD
   static int reunit_svd_only  = 0;
   // relative tolerance for det(q) versus the product of eigenvalues in reciprocalRoot
   static double svd_rel_error = 1e-6;
   // |det(q)| below this value makes reciprocalRoot report failure
   static double svd_abs_error = 1e-6;

   /**
      Argument bundle passed by value to the DoUnitarizedLink kernel.

      Holds the input/output gauge accessors, the lattice extents, the device
      failure counter and a snapshot of the unitarization settings taken at
      construction time.
   */
   template <typename Out, typename In>
   struct UnitarizeLinksArg {
     int threads; // number of active threads required
     int X[4]; // grid dimensions
     Out output;
     const In input;
     int *fails; // counter incremented (atomically) by the kernel for every failed link
     const int max_iter;
     const double unitarize_eps;
     const double max_error;
     const int reunit_allow_svd;
     const int reunit_svd_only;
     const double svd_rel_error;
     const double svd_abs_error;
     const static bool check_unitarization = true;

     /**
        @param output   accessor the unitarized links are written through
        @param input    accessor the links are read through
        @param data     field supplying the lattice extents X()
        @param fails    failure counter (dereferenced by the kernel)
        The remaining parameters are copied verbatim into the members of the
        same name.
     */
     UnitarizeLinksArg(Out &output, const In &input, const GaugeField &data, int* fails,
           int max_iter, double unitarize_eps, double max_error,
           int reunit_allow_svd, int reunit_svd_only, double svd_rel_error,
           double svd_abs_error)
       // initializers listed in member declaration order, which is the order
       // in which they actually run
       : output(output), input(input), fails(fails), max_iter(max_iter),
         unitarize_eps(unitarize_eps), max_error(max_error),
         reunit_allow_svd(reunit_allow_svd), reunit_svd_only(reunit_svd_only),
         svd_rel_error(svd_rel_error), svd_abs_error(svd_abs_error)
     {
       for(int dir=0; dir<4; ++dir) X[dir] = data.X()[dir];
       // one thread per lattice site
       threads = X[0]*X[1]*X[2]*X[3];
     }
   };

   /**
      Replace the file-scope unitarization settings that subsequent
      unitarizeLinks() calls copy into their kernel arguments.

      @param unitarize_eps_     new value of unitarize_eps
      @param max_error_         new value of max_error
      @param reunit_allow_svd_  whether the SVD fallback is permitted
      @param reunit_svd_only_   whether only the SVD is to be used
      @param svd_rel_error_     new value of svd_rel_error
      @param svd_abs_error_     new value of svd_abs_error
   */
   void setUnitarizeLinksConstants(double unitarize_eps_, double max_error_,
           bool reunit_allow_svd_, bool reunit_svd_only_,
           double svd_rel_error_, double svd_abs_error_) {
     // floating-point tolerances
     unitarize_eps = unitarize_eps_;
     max_error     = max_error_;
     svd_rel_error = svd_rel_error_;
     svd_abs_error = svd_abs_error_;

     // the two switches are stored as int flags (0 or 1)
     reunit_allow_svd = reunit_allow_svd_ ? 1 : 0;
     reunit_svd_only  = reunit_svd_only_  ? 1 : 0;
   }


   /**
      Test whether unitary_matrix is consistent with initial_matrix.

      With A = initial_matrix and W = unitary_matrix the residual
      (A^dag W)(A^dag W) - A^dag A is formed, and every real and imaginary
      component of it is compared against max_error.

      @return false as soon as one component exceeds max_error in magnitude,
              true otherwise
   */
   template<class Cmplx>
   __device__ __host__
   bool isUnitarizedLinkConsistent(const Matrix<Cmplx,3>& initial_matrix,
           const Matrix<Cmplx,3>& unitary_matrix,
           double max_error)
   {
     Matrix<Cmplx,3> projected, residual;
     projected = conj(initial_matrix)*unitary_matrix;
     residual = projected*projected - conj(initial_matrix)*initial_matrix;

     for(int row=0; row<3; ++row){
       for(int col=0; col<3; ++col){
         const bool real_too_large = fabs(residual(row,col).x) > max_error;
         const bool imag_too_large = fabs(residual(row,col).y) > max_error;
         if( real_too_large || imag_too_large ) return false;
       }
     }
     return true;
   }


   /**
      Smallest absolute value among the first `size` entries of `array`.
      array[0] is always read, so the array must hold at least one element.
   */
   template<class T>
   __device__ __host__
   T getAbsMin(const T* const array, int size){
     T smallest = fabs(array[0]);
     int k = 1;
     while(k < size){
       const T magnitude = fabs(array[k]);
       smallest = (magnitude < smallest) ? magnitude : smallest;
       ++k;
     }
     return smallest;
   }


   /** True when |a - b| is strictly smaller than epsilon. */
   template<class Real>
   __device__ __host__
   inline bool checkAbsoluteError(Real a, Real b, Real epsilon)
   {
     return fabs(a-b) < epsilon;
   }


   /** True when |(a - b)/b| is strictly smaller than epsilon; b is the reference value. */
   template<class Real>
   __device__ __host__
   inline bool checkRelativeError(Real a, Real b, Real epsilon)
   {
     return fabs((a-b)/b) < epsilon;
   }


   // Compute the reciprocal square root of the matrix q.
   //
   // The three eigenvalues of q are obtained in closed form from the traces of
   // q, q*q and q*q*q; on success *res = c0*I + c1*q + c2*q*q with coefficients
   // built from the square roots of those eigenvalues.
   // q is taken by const reference and is never modified.
   //
   // Reads arg.unitarize_eps, arg.svd_abs_error and arg.svd_rel_error.
   // Returns false, leaving *res untouched, when |det(q)| < arg.svd_abs_error or
   // when the product of the computed eigenvalues fails the relative-error check
   // against det(q); the caller is then expected to use the SVD instead.
   template<class Float, typename Arg>
   __device__  __host__
   bool reciprocalRoot(const Matrix<complex<Float>,3>& q, Matrix<complex<Float>,3>* res, Arg &arg){

     Matrix<complex<Float>,3> qsq, tempq;

     Float c[3];
     Float g[3];

     const Float one_third = 0.333333333333333333333;
     const Float one_ninth = 0.111111111111111111111;
     const Float one_eighteenth = 0.055555555555555555555;

     // q^2 and q^3
     qsq = q*q;
     tempq = qsq*q;

     // c[n-1] = Re tr(q^n) / n
     c[0] = getTrace(q).x;
     c[1] = getTrace(qsq).x * 0.5;
     c[2] = getTrace(tempq).x * one_third;;

     // default eigenvalues: all equal to tr(q)/3; kept when |s| is below unitarize_eps
     g[0] = g[1] = g[2] = c[0] * one_third;
     Float r,s,theta;
     s = c[1]*one_third - c[0]*c[0]*one_eighteenth;

     Float cosTheta;
     if(fabs(s) >= arg.unitarize_eps){ // faster when this conditional is removed?
       const Float rsqrt_s = rsqrt(s);
       r = c[2]*0.5 - (c[0]*one_third)*(c[1] - c[0]*c[0]*one_ninth);
       // cosTheta = r / s^(3/2)
       cosTheta = r*rsqrt_s*rsqrt_s*rsqrt_s;

       // clamp: outside [-1,1] the angle is pinned to 0 or pi according to the sign of r
       if(fabs(cosTheta) >= 1.0){
   theta = (r > 0) ? 0.0 : FL_UNITARIZE_PI;
       }else{
   theta = acos(cosTheta); // this is the primary performance limiter
       }

       // sqrt(s) recovered without a second square-root evaluation
       const Float sqrt_s = s*rsqrt_s;

 #if 0 // experimental version
       Float as, ac;
       sincos( theta*one_third, &as, &ac );
       g[0] = c[0]*one_third + 2*sqrt_s*ac;
       //g[1] = c[0]*one_third + 2*sqrt_s*(ac*cos(1*FL_UNITARIZE_PI23) - as*sin(1*FL_UNITARIZE_PI23));
       g[1] = c[0]*one_third - 2*sqrt_s*(0.5*ac + as*0.8660254037844386467637);
       //g[2] = c[0]*one_third + 2*sqrt_s*(ac*cos(2*FL_UNITARIZE_PI23) - as*sin(2*FL_UNITARIZE_PI23));
       g[2] = c[0]*one_third + 2*sqrt_s*(-0.5*ac + as*0.8660254037844386467637);
 #else
       // three eigenvalues, separated in phase by 2*pi/3
       g[0] = c[0]*one_third + 2*sqrt_s*cos( theta*one_third );
       g[1] = c[0]*one_third + 2*sqrt_s*cos( theta*one_third + FL_UNITARIZE_PI23 );
       g[2] = c[0]*one_third + 2*sqrt_s*cos( theta*one_third + 2*FL_UNITARIZE_PI23 );
 #endif
     }

     // Check the eigenvalues, if the determinant does not match the product of the eigenvalues
     // return false. Then call SVD instead.
     Float det = getDeterminant(q).x;
     if( fabs(det) < arg.svd_abs_error) return false;
     if( checkRelativeError(g[0]*g[1]*g[2],det,arg.svd_rel_error) == false ) return false;


     // At this point we have finished with the c's
     // use these to store sqrt(g)
     for(int i=0; i<3; ++i) c[i] = sqrt(g[i]);

     // done with the g's, use these to store u, v, w
     // (sum, sum of pairwise products, and product of the square roots)
     g[0] = c[0]+c[1]+c[2];
     g[1] = c[0]*c[1] + c[0]*c[2] + c[1]*c[2];
     g[2] = c[0]*c[1]*c[2];

     // c[] is reused once more, now for the coefficients of I, q and q^2
     const Float denominator  = 1.0 / ( g[2]*(g[0]*g[1]-g[2]) );
     c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1])) * denominator;
     c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1]) * denominator;
     c[2] =  g[0] * denominator;

     tempq = c[1]*q + c[2]*qsq;
     // Add a real scalar
     tempq(0,0).x += c[0];
     tempq(1,1).x += c[0];
     tempq(2,2).x += c[0];

     *res = tempq;

     return true;
   }


   /**
      Unitarize a single link.

      Unless arg.reunit_svd_only is set, the Cayley-Hamilton route is tried
      first: result = in * (in^dag in)^(-1/2) via reciprocalRoot(). If that
      route is skipped or reports failure, the SVD is used instead, provided
      arg.reunit_allow_svd is set.

      @return true when *result has been written, false when neither route
              was available (in which case *result is left untouched)
   */
   template<class Float, typename Arg>
   __host__ __device__
   bool unitarizeLinkMILC(const Matrix<complex<Float>,3>& in, Matrix<complex<Float>,3>* const result, Arg &arg)
   {
     Matrix<complex<Float>,3> left;

     // Cayley-Hamilton route (short-circuits when only the SVD is requested)
     const bool try_cayley_hamilton = !arg.reunit_svd_only;
     if( try_cayley_hamilton && reciprocalRoot<Float>(conj(in)*in,&left,arg) ){
       *result = in*left;
       return true;
     }

     // SVD fallback, if permitted
     if( arg.reunit_allow_svd ){
       Matrix<complex<Float>,3> right;
       Float singular_values[3];
       computeSVD<Float>(in, left, right, singular_values);
       *result = left*conj(right);
       return true;
     }

     return false;
   } // unitarizeMILC


   /**
      Unitarize a single link using only the SVD: *result = u * v^dag, where
      u and v are the factors returned by computeSVD().

      *result is written in every case. The return value reports whether it
      passes isUnitary() at tolerance max_error; a diagnostic is printed when
      it does not.
   */
   template<class Float>
   __host__ __device__
   bool unitarizeLinkSVD(const Matrix<complex<Float>,3>& in, Matrix<complex<Float>,3>* const result,
       const double max_error)
   {
     Matrix<complex<Float>,3> left, right;
     Float sigma[3];
     computeSVD<Float>(in, left, right, sigma); // should pass pointers to u,v I guess

     *result = left*conj(right);

     if (isUnitary(*result,max_error)) return true;

     printf("ERROR: Link unitarity test failed\n");
     printf("TOLERANCE: %g\n", max_error);
     return false;
   }


   /**
      Unitarize a single link by running max_iter steps of the iteration
      u <- (u + (u^-1)^dag) / 2, starting from u = in.

      The outcome is validated with isUnitarizedLinkConsistent() at a fixed
      tolerance of 1e-7.

      @return true and *result set on success; false (with a diagnostic
              printed and *result left untouched) when the check fails
   */
   template<class Float>
   __host__ __device__
   bool unitarizeLinkNewton(const Matrix<complex<Float>,3>& in, Matrix<complex<Float>,3>* const result, int max_iter)
   {
     Matrix<complex<Float>,3> iterate, inverse;
     iterate = in;

     for(int remaining=max_iter; remaining>0; --remaining){
       computeMatrixInverse(iterate, &inverse);
       iterate = 0.5*(iterate + conj(inverse));
     }

     if( !isUnitarizedLinkConsistent(in,iterate,0.0000001) ){
       printf("ERROR: Unitarized link is not consistent with incoming link\n");
       return false;
     }

     *result = iterate;
     return true;
   }

   /**
      Host reference implementation: unitarize every link of infield with the
      Newton iteration (always carried out in double precision) and store the
      result in outfield.

      The fields are addressed as 18 real numbers per link, four links per
      site, laid out as (site*4 + dir)*18.

      A link for which the Newton iteration fails is copied through unchanged
      and counted; the number of such links is printed at the end.

      @param outfield  destination field (same precision as infield)
      @param infield   source field
   */
   void unitarizeLinksCPU(cpuGaugeField &outfield, const cpuGaugeField& infield)
   {
     if (infield.Precision() != outfield.Precision())
       errorQuda("Precisions must match (out=%d != in=%d)", outfield.Precision(), infield.Precision());

     int num_failures = 0;
     Matrix<complex<double>,3> inlink, outlink;

     for (int i=0; i<infield.Volume(); ++i){
       for (int dir=0; dir<4; ++dir){
         // computed in size_t: (i*4 + dir)*18 overflows int on large lattices
         const size_t offset = (static_cast<size_t>(i)*4 + dir)*18;

         if (infield.Precision() == QUDA_SINGLE_PRECISION){
           copyArrayToLink(&inlink, ((float*)(infield.Gauge_p()) + offset)); // order of arguments?
           if( unitarizeLinkNewton<double>(inlink, &outlink, max_iter_newton) == false ){
             num_failures++;
             outlink = inlink; // Newton left outlink stale: pass the input link through
           }
           copyLinkToArray(((float*)(outfield.Gauge_p()) + offset), outlink);
         } else if (infield.Precision() == QUDA_DOUBLE_PRECISION){
           copyArrayToLink(&inlink, ((double*)(infield.Gauge_p()) + offset)); // order of arguments?
           if( unitarizeLinkNewton<double>(inlink, &outlink, max_iter_newton) == false ){
             num_failures++;
             outlink = inlink; // Newton left outlink stale: pass the input link through
           }
           copyLinkToArray(((double*)(outfield.Gauge_p()) + offset), outlink);
         } else {
           // same handling as the host-side isUnitary() check
           errorQuda("Unsupported precision\n");
         }
       } // dir
     }  // loop over volume

     if (num_failures > 0)
       printf("WARNING: unitarizeLinksCPU failed to unitarize %d links\n", num_failures);
     return;
   }

   // CPU function which checks that the gauge field is unitary.
   // Every link (four per site, 18 reals per link) is tested with the
   // single-matrix isUnitary() at tolerance max_error. On the first failure the
   // site, direction, offending link and link^dag*link are printed and false is
   // returned; true is returned when all links pass.
   bool isUnitary(const cpuGaugeField& field, double max_error)
   {
     Matrix<complex<double>,3> link, identity;

     for(int i=0; i<field.Volume(); ++i){
       for(int dir=0; dir<4; ++dir){
         // computed in size_t: (i*4 + dir)*18 overflows int on large lattices
         const size_t offset = (static_cast<size_t>(i)*4 + dir)*18;

         if(field.Precision() == QUDA_SINGLE_PRECISION){
           copyArrayToLink(&link, ((float*)(field.Gauge_p()) + offset)); // order of arguments?
         }else if(field.Precision() == QUDA_DOUBLE_PRECISION){
           copyArrayToLink(&link, ((double*)(field.Gauge_p()) + offset)); // order of arguments?
         }else{
           errorQuda("Unsupported precision\n");
         }
         if(isUnitary(link,max_error) == false){
           printf("Unitarity failure\n");
           printf("site index = %d,\t direction = %d\n", i, dir);
           printLink(link);
           identity = conj(link)*link;
           printLink(identity);
           return false;
         }
       } // dir
     } // i
     return true;
   } // is unitary


   /**
      Kernel: unitarize the four links attached to one lattice site per thread.

      Launch layout: a 1-d grid covering arg.threads threads. Threads with
      index below arg.threads/2 handle parity 0, the remainder parity 1.
      No shared memory is used.

      Each link is loaded in precision Float, unitarized in double precision
      with unitarizeLinkMILC() and stored back in precision Float. *arg.fails
      is incremented atomically for every link that could not be unitarized or
      that fails the isUnitary() check at tolerance arg.max_error.
   */
   template<typename Float, typename Out, typename In>
   __global__ void DoUnitarizedLink(UnitarizeLinksArg<Out,In> arg){
     int idx = threadIdx.x + blockIdx.x*blockDim.x;
     if(idx >= arg.threads) return;
     int parity = 0;
     if(idx >= arg.threads/2) {
       parity = 1;
       idx -= arg.threads/2;
     }
     int X[4];
     for(int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];
     int x[4];
     getCoords(x, idx, X, parity);

     idx = linkIndex(x,X);
     // result is always in double precision
     Matrix<complex<double>,3> v, result;
     Matrix<complex<Float>,3> tmp;
     for (int mu = 0; mu < 4; mu++) {
       arg.input.load((Float*)(tmp.data),idx, mu, parity);

       v = tmp;
       if (!unitarizeLinkMILC(v, &result, arg)) {
         // Nothing was written to result, which would otherwise still hold the
         // previous direction's link: count the failure explicitly and leave
         // this link unchanged.
         result = v;
         atomicAdd(arg.fails, 1);
       } else if (arg.check_unitarization) {
         if (isUnitary(result,arg.max_error) == false) atomicAdd(arg.fails, 1);
       }
       tmp = result;

       arg.output.save((Float*)(tmp.data),idx, mu, parity);
     }
   }


   /**
      Autotuned launcher for the DoUnitarizedLink kernel.
   */
   template<typename Float, typename Out, typename In>
   class UnitarizeLinks : Tunable {
     UnitarizeLinksArg<Out,In> arg;

     // the kernel uses no shared memory
     unsigned int sharedBytesPerThread() const { return 0; }
     unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

     // only the block size is tuned; the grid must cover arg.threads threads
     bool tuneGridDim() const { return false; }
     unsigned int minThreads() const { return arg.threads; }

   public:
     UnitarizeLinks(UnitarizeLinksArg<Out,In> &arg) : arg(arg) { }

     /** Look up (or tune) the launch parameters and launch the kernel on `stream`. */
     void apply(const cudaStream_t &stream){
       TuneParam launch = tuneLaunch(*this, getTuning(), getVerbosity());
       DoUnitarizedLink<Float,Out,In><<<launch.grid, launch.block, 0, stream>>>(arg);
     }

     /** When input and output share storage, back the field up before tuning runs. */
     void preTune() {
       const bool in_place = (arg.input.gauge == arg.output.gauge);
       if (in_place) arg.output.save();
     }

     /** Restore an in-place field after tuning and clear the failure counter. */
     void postTune() {
       const bool in_place = (arg.input.gauge == arg.output.gauge);
       if (in_place) arg.output.load();
       cudaMemset(arg.fails, 0, sizeof(int)); // reset fails counter
     }

     long long flops() const {
       // Accounted only the minimum flops for the case reunitarize_svd_only=0
       return 4588LL*arg.threads;
     }

     long long bytes() const { return 4ll * arg.threads * (arg.input.Bytes() + arg.output.Bytes()); }

     /** Tuning key: lattice extents, class type, and thread count plus precision size. */
     TuneKey tuneKey() const {
       std::stringstream vol, aux;
       vol << arg.X[0] << "x" << arg.X[1] << "x" << arg.X[2] << "x" << arg.X[3];
       aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
       return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
     }
   };


   /**
      Build the kernel argument from the current file-scope settings, launch
      the unitarization kernel on stream 0 and wait for it to finish.

      @param output  accessor for the destination field
      @param input   accessor for the source field
      @param meta    field supplying the lattice extents
      @param fails   failure counter incremented by the kernel
   */
   template<typename Float, typename Out, typename In>
   void unitarizeLinks(Out output,  const In input, const cudaGaugeField& meta, int* fails) {
     UnitarizeLinksArg<Out,In> arg(output, input, meta, fails, max_iter, unitarize_eps, max_error,
               reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error);
     UnitarizeLinks<Float, Out, In> unitlinks(arg) ;
     unitlinks.apply(0);
     qudaDeviceSynchronize(); // need to synchronize to ensure failure write has completed
     checkCudaError(); // surface launch/execution errors, as projectSU3 does
   }

 /**
    Helper for the reconstruct dispatch below: with the output accessor type
    Out already chosen, select the input accessor type from
    input.Reconstruct() and launch the unitarization.
 */
 template<typename Float, typename Out>
 void unitarizeLinksSelectInput(cudaGaugeField& output, const cudaGaugeField &input, int* fails) {
   if(input.Reconstruct() == QUDA_RECONSTRUCT_NO) {
     typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type In;
     unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
   } else if(input.Reconstruct() == QUDA_RECONSTRUCT_12) {
     typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type In;
     unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
   } else if(input.Reconstruct() == QUDA_RECONSTRUCT_8) {
     typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type In;
     unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
   } else {
     errorQuda("Reconstruction type %d of gauge field not supported", input.Reconstruct());
   }
 }

 /**
    Dispatch on the reconstruct type of output and input (NO, 12 or 8 for
    each) and run the unitarization in precision Float. Both fields must be
    in native order; anything else is reported through errorQuda.
 */
 template<typename Float>
 void unitarizeLinks(cudaGaugeField& output, const cudaGaugeField &input, int* fails) {

   if( output.isNative() && input.isNative() ) {
     if(output.Reconstruct() == QUDA_RECONSTRUCT_NO) {
       typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Out;
       unitarizeLinksSelectInput<Float,Out>(output, input, fails);
     } else if(output.Reconstruct() == QUDA_RECONSTRUCT_12){
       typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Out;
       unitarizeLinksSelectInput<Float,Out>(output, input, fails);
     } else if(output.Reconstruct() == QUDA_RECONSTRUCT_8){
       typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Out;
       unitarizeLinksSelectInput<Float,Out>(output, input, fails);
     } else {
       errorQuda("Reconstruction type %d of gauge field not supported", output.Reconstruct());
     }
   } else {
     errorQuda("Invalid Gauge Order (output=%d, input=%d)", output.Order(), input.Order());
   }
 }

 #endif

   /**
      Unitarize the links of `input` into `output` on the device.

      Both fields must have the same precision, which must be single or
      double. When the library was built without GPU_UNITARIZE the call is
      reported through errorQuda.

      @param output  destination field
      @param input   source field
      @param fails   failure counter incremented by the kernel
   */
   void unitarizeLinks(cudaGaugeField& output, const cudaGaugeField &input, int* fails) {
 #ifdef GPU_UNITARIZE
     if (input.Precision() != output.Precision())
       // arguments in the order the message names them: input first, then output
       errorQuda("input (%d) and output (%d) precisions must match", input.Precision(), output.Precision());

     if (input.Precision() == QUDA_SINGLE_PRECISION) {
       unitarizeLinks<float>(output, input, fails);
     } else if(input.Precision() == QUDA_DOUBLE_PRECISION) {
       unitarizeLinks<double>(output, input, fails);
     } else {
       errorQuda("Precision %d not supported", input.Precision());
     }
 #else
     errorQuda("Unitarization has not been built");
 #endif
   }

   // In-place variant: unitarizes `links` by passing the same field as both
   // output and input to the three-argument overload.
   void unitarizeLinks(cudaGaugeField &links, int* fails) {
     unitarizeLinks(links, links, fails);
   }


   /**
      Argument bundle passed by value to ProjectSU3kernel: the gauge accessor,
      the tolerance, the failure counter and the lattice extents.
   */
   template <typename Float, typename G>
   struct ProjectSU3Arg {
     int threads; // number of active threads required
     G u;
     Float tol;
     int *fails;
     int X[4];

     ProjectSU3Arg(G u, const GaugeField &meta, Float tol, int *fails)
       : threads(meta.VolumeCB()), u(u), tol(tol), fails(fails) {
       const int *extent = meta.X();
       X[0] = extent[0];
       X[1] = extent[1];
       X[2] = extent[2];
       X[3] = extent[3];
     }
   };

   /**
      Kernel: project the four links of one checkerboard site per thread with
      polarSu3() and store them back in place.

      Launch layout: the x dimension covers arg.threads sites; the y
      dimension enumerates parity. Parity is walked with a stride of
      gridDim.y, so a grid with gridDim.y == 2 handles one parity per block
      row while a grid with gridDim.y == 1 still processes both parities.
      No shared memory is used.

      *arg.fails is incremented atomically for every projected link that
      fails isUnitary() at tolerance arg.tol.
   */
   template<typename Float, typename G>
   __global__ void ProjectSU3kernel(ProjectSU3Arg<Float,G> arg){
     int idx = threadIdx.x + blockIdx.x*blockDim.x;
     if(idx >= arg.threads) return;

     Matrix<complex<Float>,3> u;

     for (int parity = blockIdx.y; parity < 2; parity += gridDim.y) {
       for (int mu = 0; mu < 4; mu++) {
         arg.u.load((Float*)(u.data),idx, mu, parity);
         polarSu3<Float>(u, arg.tol);

         // count number of failures
         if(isUnitary(u, arg.tol) == false) atomicAdd(arg.fails, 1);

         arg.u.save((Float*)(u.data),idx, mu, parity);
       }
     }
   }

   /**
      Autotuned launcher for ProjectSU3kernel.
   */
   template<typename Float, typename G>
   class ProjectSU3 : Tunable {
     ProjectSU3Arg<Float,G> arg;

     // the kernel uses no shared memory
     unsigned int sharedBytesPerThread() const { return 0; }
     unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

     // only the block size is tuned; the grid must cover arg.threads threads
     bool tuneGridDim() const { return false; }
     unsigned int minThreads() const { return arg.threads; }

   public:
     ProjectSU3(ProjectSU3Arg<Float,G> &arg) : arg(arg) { }

     /** Look up (or tune) the launch parameters and launch the kernel on `stream`. */
     void apply(const cudaStream_t &stream){
       TuneParam launch = tuneLaunch(*this, getTuning(), getVerbosity());
       ProjectSU3kernel<Float,G><<<launch.grid, launch.block, 0, stream>>>(arg);
     }

     // the projection is destructive, so the field is backed up around tuning
     void preTune() { arg.u.save(); }
     void postTune() { arg.u.load(); }

     long long flops() const { return 0; } // depends on number of iterations
     long long bytes() const { return 4ll * arg.threads * arg.u.Bytes(); }

     /** Tuning key: lattice extents, class type, and thread count plus precision size. */
     TuneKey tuneKey() const {
       std::stringstream vol, aux;
       vol << arg.X[0] << "x";
       vol << arg.X[1] << "x";
       vol << arg.X[2] << "x";
       vol << arg.X[3];
       aux << "threads=" << arg.threads;
       aux << ",prec=" << sizeof(Float);
       return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
     }
   };


   /**
      Project `u` in precision Float. Only fields without reconstruction
      (QUDA_RECONSTRUCT_NO) are handled; any other type is reported through
      errorQuda. The kernel is launched on stream 0 and waited for, then the
      CUDA error state is checked.
   */
   template <typename Float>
   void projectSU3(cudaGaugeField &u, double tol, int *fails) {
     if (u.Reconstruct() != QUDA_RECONSTRUCT_NO) {
       errorQuda("Reconstruct %d not supported", u.Reconstruct());
     } else {
       typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
       ProjectSU3Arg<Float,G> kernel_arg(G(u), u, static_cast<Float>(tol), fails);
       ProjectSU3<Float,G> launcher(kernel_arg);
       launcher.apply(0);
       qudaDeviceSynchronize();
       checkCudaError();
     }
   }

   /**
      Project the gauge field `u` in place, dispatching on its precision
      (double or single). Fields with staggered phases applied are rejected.
      When the library was built without GPU_UNITARIZE the call is reported
      through errorQuda.

      @param u      field to project
      @param tol    tolerance passed on to the projection kernel
      @param fails  failure counter incremented by the kernel
   */
   void projectSU3(cudaGaugeField &u, double tol, int *fails) {
 #ifdef GPU_UNITARIZE
     // check that the field doesn't have staggered phases applied
     if (u.StaggeredPhaseApplied())
       errorQuda("Cannot project gauge field with staggered phases applied");

     switch (u.Precision()) {
     case QUDA_DOUBLE_PRECISION:
       projectSU3<double>(u, tol, fails);
       break;
     case QUDA_SINGLE_PRECISION:
       projectSU3<float>(u, tol, fails);
       break;
     default:
       errorQuda("Precision %d not supported", u.Precision());
     }
 #else
     errorQuda("Unitarization has not been built");
 #endif
   }

 } // namespace quda

quda::ProjectSU3::sharedBytesPerThread
unsigned int sharedBytesPerThread() const
Definition: unitarize_links_quda.cu:554

QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:66

quda::TuneParam
Definition: tune_quda.h:17

blockDim
dim3 dim3 blockDim
Definition: CMakeCUDACompilerId.cpp1.ii:2471

quda::ProjectSU3::tuneGridDim
bool tuneGridDim() const
Definition: unitarize_links_quda.cu:558

quda::ProjectSU3::apply
void apply(const cudaStream_t &stream)
Definition: unitarize_links_quda.cu:564

mu
double mu
Definition: test_util.cpp:1643

quda::ProjectSU3::minThreads
unsigned int minThreads() const
Definition: unitarize_links_quda.cu:559

quda::linkIndex
static __device__ __host__ int linkIndex(const int x[], const I X[4])
Definition: index_helper.cuh:46

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20

quda::ProjectSU3::postTune
void postTune()
Definition: unitarize_links_quda.cu:569

errorQuda
#define errorQuda(...)
Definition: util_quda.h:90

quda::setUnitarizeLinksConstants
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)

quda::ProjectSU3::tuneKey
TuneKey tuneKey() const
Definition: unitarize_links_quda.cu:574

quda::sqrt
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105

links
void * links[4]
Definition: covdev_test.cpp:47

quda::ProjectSU3::ProjectSU3
ProjectSU3(ProjectSU3Arg< Float, G > &arg)
Definition: unitarize_links_quda.cu:562

reunit_svd_only
static bool reunit_svd_only
Definition: unitarize_link_test.cpp:38

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cu:898

quda::ProjectSU3kernel
__global__ void ProjectSU3kernel(ProjectSU3Arg< Float, G > arg)
Definition: unitarize_links_quda.cu:532

tmp
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44

quda::ProjectSU3Arg::ProjectSU3Arg
ProjectSU3Arg(G u, const GaugeField &meta, Float tol, int *fails)
Definition: unitarize_links_quda.cu:524

quda::ProjectSU3Arg::X
int X[4]
Definition: unitarize_links_quda.cu:523

quda::ProjectSU3::arg
ProjectSU3Arg< Float, G > arg
Definition: unitarize_links_quda.cu:552

svd_rel_error
static double svd_rel_error
Definition: unitarize_link_test.cpp:39

quda::ProjectSU3::bytes
long long bytes() const
Definition: unitarize_links_quda.cu:572

quda::ProjectSU3Arg::tol
Float tol
Definition: unitarize_links_quda.cu:521

quda::TuneParam::grid
dim3 grid
Definition: tune_quda.h:21

num_failures
int num_failures
Definition: gauge_alg_test.cpp:32

quda
Definition: blas_cublas.h:6

quda::printLink
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
Definition: quda_matrix.h:1039

quda::copyLinkToArray
void copyLinkToArray(float *array, const Matrix< float2, 3 > &link)
Definition: quda_matrix.h:978

quda::isUnitary
bool isUnitary(const cpuGaugeField &field, double max_error)

b
#define b
Definition: dw_dslash4_core.h:83

x
p x
Definition: CMakeCUDACompilerId.cpp1.ii:3011

quda::Tunable
Definition: tune_quda.h:60

tol
double tol
Definition: test_util.cpp:1647

quda::unitarizeLinks
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
Definition: unitarize_links_quda.cu:495

QUDA_RECONSTRUCT_12
Definition: enum_quda.h:67

printf
int printf(const char *,...) __attribute__((__format__(__printf__

quda::ProjectSU3Arg
Definition: unitarize_links_quda.cu:518

quda::TuneParam::block
dim3 block
Definition: tune_quda.h:20

in
cpuColorSpinorField * in
Definition: staggered_invert_test.cpp:44

quda::Matrix::data
T data[N *N]
Definition: quda_matrix.h:74

quda::tuneLaunch
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603

fused_exterior_ndeg_tm_dslash_cuda_gen.i
int i
start here
Definition: fused_exterior_ndeg_tm_dslash_cuda_gen.py:816

quda::ProjectSU3::preTune
void preTune()
Definition: unitarize_links_quda.cu:568

quda::cudaGaugeField
Definition: gauge_field.h:298

unitarize_eps
static double unitarize_eps
Definition: unitarize_link_test.cpp:36

gauge_field_order.h
Main header file for host and device accessors to GaugeFields.

su3_project.cuh

quda::projectSU3
void projectSU3(cudaGaugeField &U, double tol, int *fails)
Project the input gauge field onto the SU(3) group. This is a destructive operation. The number of link failures is reported so appropriate action can be taken.
Definition: unitarize_links_quda.cu:584

quda::ProjectSU3
Definition: unitarize_links_quda.cu:551

quda::qudaDeviceSynchronize
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
Definition: quda_cuda_api.cpp:277

quda::copyArrayToLink
void copyArrayToLink(Matrix< float2, 3 > *link, float *array)
Definition: quda_matrix.h:951

tune_quda.h

quda::ProjectSU3::flops
long long flops() const
Definition: unitarize_links_quda.cu:571

X
int X
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:394

QUDA_RECONSTRUCT_8
Definition: enum_quda.h:68

reunit_allow_svd
static bool reunit_allow_svd
Definition: unitarize_link_test.cpp:37

Matrix
Definition: hisq_force_reference2.cpp:131

quda_matrix.h

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:61

unitarization_links.h

quda::getTrace
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:305

index_helper.cuh

quda::unitarizeLinksCPU
void unitarizeLinksCPU(cpuGaugeField &outfield, const cpuGaugeField &infield)

idx
int idx
Definition: staggered_fused_exterior_dslash_core.h:355

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:60

svd_abs_error
static double svd_abs_error
Definition: unitarize_link_test.cpp:40

s
size_t s
Definition: CMakeCUDACompilerId.cpp1.ii:2229

quda::ProjectSU3Arg::threads
int threads
Definition: unitarize_links_quda.cu:519

quda::gauge_mapper
Definition: gauge_field_order.h:2083

quda::LatticeField::VolumeCB
int VolumeCB() const
Definition: lattice_field.h:425

fabs
double fabs(double)

quda::blas::flops
unsigned long long flops
Definition: blas_quda.cu:42

size
size_t size
Definition: CMakeCUDACompilerId.cpp1.ii:2289

e
return e
Definition: CMakeCUDACompilerId.cpp1.ii:3026

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880

quda::computeMatrixInverse
__device__ __host__ void computeMatrixInverse(const Matrix< T, 3 > &u, Matrix< T, 3 > *uinv)
Definition: quda_matrix.h:501

quda::ProjectSU3Arg::u
G u
Definition: unitarize_links_quda.cu:520

svd_quda.h

c
const void * c
Definition: CMakeCUDACompilerId.cpp1.ii:2234

quda::cos
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:35

quda::GaugeField::Reconstruct
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203

quda::acos
__host__ __device__ ValueType acos(ValueType x)
Definition: complex_quda.h:50

checkCudaError
#define checkCudaError()
Definition: util_quda.h:129

array
struct cudaExtent unsigned int cudaArray_t array
Definition: CMakeCUDACompilerId.cpp1.ii:2547

quda::getDeterminant
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
Definition: quda_matrix.h:312

quda::conj
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115

getTuning
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51

quda::LatticeField::Precision
QudaPrecision Precision() const
Definition: lattice_field.h:462

parity
QudaParity parity
Definition: covdev_test.cpp:53

a
#define a
Definition: dw_dslash4_core.h:82

gauge_field.h

quda::Matrix
Definition: quda_matrix.h:68

quda::Tunable::aux
char aux[TuneKey::aux_n]
Definition: tune_quda.h:189

quda::TuneKey
Definition: tune_key.h:8

quda::blas::bytes
unsigned long long bytes
Definition: blas_quda.cu:43

quda::ProjectSU3Arg::fails
int * fails
Definition: unitarize_links_quda.cu:522

y
int y
Definition: CMakeCUDACompilerId.cpp1.ii:2637

quda::LatticeField::X
const int * X() const
Definition: lattice_field.h:415

quda::GaugeField
Definition: gauge_field.h:123

quda::ProjectSU3::sharedBytesPerBlock
unsigned int sharedBytesPerBlock(const TuneParam &) const
Definition: unitarize_links_quda.cu:555

quda::getCoords
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)
Definition: index_helper.cuh:129