quda-ref/v0.7.0/inv__cg__quda_8cpp_source.html

 #include <stdio.h>

 #include <stdlib.h>

 #include <math.h>


 #include <quda_internal.h>

 #include <color_spinor_field.h>

 #include <blas_quda.h>

 #include <dslash_quda.h>

 #include <invert_quda.h>

 #include <util_quda.h>

 #include <sys/time.h>


 #include <face_quda.h>


 #include <iostream>


 namespace quda {


   // Construct a CG solver: the solver parameters and the profiler are handed
   // to the Solver base class, and the two operators passed in are bound to
   // the `mat` and `matSloppy` members.
   CG::CG(DiracMatrix &mat, DiracMatrix &matSloppy, SolverParam &param, TimeProfile &profile)
     : Solver(param, profile),
       mat(mat),
       matSloppy(matSloppy)
   {
     // nothing else to set up: the initializer list does all the work
   }


   // Destructor: the body is intentionally empty.
   CG::~CG()
   {
   }


   /**
    * Run the CG iteration for the operator `mat`, using `matSloppy` inside
    * the iteration loop and `mat` for the reliable-update and true-residual
    * evaluations.
    *
    * @param x  On entry the initial guess (it is fed to `mat` to form the
    *           starting residual); on exit the accumulated solution.
    * @param b  Source field.  If norm2(b) is exactly zero the solver copies
    *           b into x, sets both true residuals to zero and returns at once.
    *
    * Results reported through `param`: true_res, true_res_hq, secs, gflops,
    * and iter (incremented by the number of iterations performed).  The
    * global quda::blas_flops counter is zeroed at the start of the compute
    * phase and again on exit.
    *
    * The loop terminates when the convergence test succeeds, when
    * param.maxiter iterations have been performed, or when the residual
    * measured at reliable updates keeps increasing beyond the limits given by
    * param.max_res_increase / param.max_res_increase_total.
    */
   void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b)
   {
     profile.Start(QUDA_PROFILE_INIT);

     // Check to see that we're not trying to invert on a zero-field source
     const double b2 = norm2(b);
     if(b2 == 0){
       profile.Stop(QUDA_PROFILE_INIT);
       printfQuda("Warning: inverting on zero-field source\n");
       x=b;
       param.true_res = 0.0;
       param.true_res_hq = 0.0;
       return;
     }

     cudaColorSpinorField r(b);

     ColorSpinorParam csParam(x);
     csParam.create = QUDA_ZERO_FIELD_CREATE;
     cudaColorSpinorField y(b, csParam);

     // initial residual: apply the operator to the initial guess (y is
     // passed as the temporary) and combine the result with the source
     mat(r, x, y);

     double r2 = xmyNormCuda(b, r);

     // work fields used inside the iteration live in the sloppy precision
     csParam.setPrecision(param.precision_sloppy);
     cudaColorSpinorField Ap(x, csParam);
     cudaColorSpinorField tmp(x, csParam);

     // tmp2 only needed for multi-gpu Wilson-like kernels
     cudaColorSpinorField *tmp2_p = !mat.isStaggered() ?
       new cudaColorSpinorField(x, csParam) : &tmp;
     cudaColorSpinorField &tmp2 = *tmp2_p;

     // sloppy residual: alias r when no precision change is needed,
     // otherwise own a separate sloppy-precision copy
     cudaColorSpinorField *r_sloppy;
     if (param.precision_sloppy == x.Precision()) {
       r_sloppy = &r;
     } else {
       csParam.create = QUDA_COPY_FIELD_CREATE;
       r_sloppy = new cudaColorSpinorField(r, csParam);
     }

     // sloppy solution accumulator: alias x unless a separate
     // sloppy-precision partial accumulator was requested
     cudaColorSpinorField *x_sloppy;
     if (param.precision_sloppy == x.Precision() ||
         !param.use_sloppy_partial_accumulator) {
       x_sloppy = &x;
     } else {
       csParam.create = QUDA_COPY_FIELD_CREATE;
       x_sloppy = new cudaColorSpinorField(x, csParam);
     }

     // additional high-precision temporary if Wilson and mixed-precision
     csParam.setPrecision(param.precision);
     cudaColorSpinorField *tmp3_p =
       (param.precision != param.precision_sloppy && !mat.isStaggered()) ?
       new cudaColorSpinorField(x, csParam) : &tmp;
     cudaColorSpinorField &tmp3 = *tmp3_p;

     cudaColorSpinorField &xSloppy = *x_sloppy;
     cudaColorSpinorField &rSloppy = *r_sloppy;
     cudaColorSpinorField p(rSloppy);

     // y holds the solution accumulated so far; xSloppy collects the
     // contributions made since the last reliable update
     if(&x != &xSloppy){
       copyCuda(y,x);
       zeroCuda(xSloppy);
     } else {
       zeroCuda(y);
     }

     const bool use_heavy_quark_res =
       (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;
     bool heavy_quark_restart = false;

     profile.Stop(QUDA_PROFILE_INIT);
     profile.Start(QUDA_PROFILE_PREAMBLE);

     double r2_old; // always assigned at the top of each iteration before it is read

     double stop = stopping(param.tol, b2, param.residual_type); // stopping condition of solver

     double heavy_quark_res = 0.0; // heavy quark residual
     double heavy_quark_res_old = 0.0; // heavy quark residual at the previous reliable update

     if (use_heavy_quark_res) {
       heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(x, r).z);
       heavy_quark_res_old = heavy_quark_res; // heavy quark residual
     }
     const int heavy_quark_check = 1; // how often to check the heavy quark residual

     double alpha=0.0, beta=0.0;
     double pAp;
     int rUpdate = 0;

     double rNorm = sqrt(r2);
     double r0Norm = rNorm;
     double maxrx = rNorm;
     double maxrr = rNorm;
     double delta = param.delta;

     // this parameter determines how many consecutive reliable update
     // residual increases we tolerate before terminating the solver,
     // i.e., how long do we want to keep trying to converge
     const int maxResIncrease = (use_heavy_quark_res ? 0 : param.max_res_increase); // check if we reached the limit of our tolerance
     const int maxResIncreaseTotal = param.max_res_increase_total;
     // 0 means we have no tolerance
     // maybe we should expose this as a parameter
     const int hqmaxresIncrease = maxResIncrease + 1;

     int resIncrease = 0;
     int resIncreaseTotal = 0;
     int hqresIncrease = 0;

     // set this to true if maxResIncrease has been exceeded but when we use heavy quark residual we still want to continue the CG
     // only used if we use the heavy_quark_res
     bool L2breakdown =false;

     profile.Stop(QUDA_PROFILE_PREAMBLE);
     profile.Start(QUDA_PROFILE_COMPUTE);
     blas_flops = 0;

     int k=0;

     PrintStats("CG", k, r2, b2, heavy_quark_res);

     int steps_since_reliable = 1;
     bool converged = convergence(r2, heavy_quark_res, stop, param.tol_hq);

     while ( !converged && k < param.maxiter) {
       matSloppy(Ap, p, tmp, tmp2); // tmp as tmp

       double sigma;

       bool breakdown = false;

       if (param.pipeline) {
         double3 triplet = tripleCGReductionCuda(rSloppy, Ap, p);
         r2 = triplet.x; double Ap2 = triplet.y; pAp = triplet.z;
         r2_old = r2;

         alpha = r2 / pAp;
         sigma = alpha*(alpha * Ap2 - pAp);
         if (sigma < 0.0 || steps_since_reliable==0) { // sigma condition has broken down
           r2 = axpyNormCuda(-alpha, Ap, rSloppy);
           sigma = r2;
           breakdown = true;
         }

         r2 = sigma;
       } else {
         r2_old = r2;
         pAp = reDotProductCuda(p, Ap);
         alpha = r2 / pAp;

         // here we are deploying the alternative beta computation
         Complex cg_norm = axpyCGNormCuda(-alpha, Ap, rSloppy);
         r2 = real(cg_norm); // (r_new, r_new)
         sigma = imag(cg_norm) >= 0.0 ? imag(cg_norm) : r2; // use r2 if (r_k+1, r_k+1-r_k) breaks
       }

       // reliable update conditions
       rNorm = sqrt(r2);
       if (rNorm > maxrx) maxrx = rNorm;
       if (rNorm > maxrr) maxrr = rNorm;
       int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
       int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;

       // force a reliable update if we are within target tolerance (only if doing reliable updates)
       if ( convergence(r2, heavy_quark_res, stop, param.tol_hq) && param.delta >= param.tol) updateX = 1;

       // For heavy-quark inversion force a reliable update if we continue
       // after the L2 residual has broken down and the heavy-quark residual
       // test is satisfied
       if (use_heavy_quark_res and L2breakdown and convergenceHQ(r2, heavy_quark_res, stop, param.tol_hq) and param.delta >= param.tol) {
         updateX = 1;
       }

       if ( !(updateR || updateX)) {
         //beta = r2 / r2_old;
         beta = sigma / r2_old; // use the alternative beta computation

         if (param.pipeline && !breakdown) tripleCGUpdateCuda(alpha, beta, Ap, xSloppy, rSloppy, p);
         else axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta);

         if (use_heavy_quark_res && k%heavy_quark_check==0) {
           if (&x != &xSloppy) {
             copyCuda(tmp,y);
             heavy_quark_res = sqrt(xpyHeavyQuarkResidualNormCuda(xSloppy, tmp, rSloppy).z);
           } else {
             copyCuda(r, rSloppy);
             heavy_quark_res = sqrt(xpyHeavyQuarkResidualNormCuda(x, y, r).z);
           }
         }

         steps_since_reliable++;
       } else {

         // reliable update: fold the sloppy accumulator into y and
         // recompute the residual with the full operator
         axpyCuda(alpha, p, xSloppy);
         copyCuda(x, xSloppy); // nop when these pointers alias

         xpyCuda(x, y); // swap these around?
         mat(r, y, x, tmp3); // here we can use x as tmp
         r2 = xmyNormCuda(b, r);

         copyCuda(rSloppy, r); //nop when these pointers alias
         zeroCuda(xSloppy);

         // calculate new reliable HQ residual
         if (use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(y, r).z);

         // break-out check if we have reached the limit of the precision
         if (sqrt(r2) > r0Norm && updateX) { // reuse r0Norm for this
           resIncrease++;
           resIncreaseTotal++;
           warningQuda("CG: new reliable residual norm %e is greater than previous reliable residual norm %e (total #inc %i)",
                       sqrt(r2), r0Norm, resIncreaseTotal);
           if ( resIncrease > maxResIncrease or resIncreaseTotal > maxResIncreaseTotal) {
             if (use_heavy_quark_res) L2breakdown = true;
             else break;
           }
         } else {
           resIncrease = 0;
         }
         // if L2 broke down already we turn off reliable updates and restart the CG
         if (use_heavy_quark_res and L2breakdown) {
           delta = 0;
           warningQuda("CG: Restarting without reliable updates for heavy-quark residual");
           heavy_quark_restart = true;
           if (heavy_quark_res > heavy_quark_res_old) {
             hqresIncrease++;
             warningQuda("CG: new reliable HQ residual norm %e is greater than previous reliable residual norm %e", heavy_quark_res, heavy_quark_res_old);
             // break out if we do not improve here anymore
             if (hqresIncrease > hqmaxresIncrease) break;
           }
         }

         rNorm = sqrt(r2);
         maxrr = rNorm;
         maxrx = rNorm;
         r0Norm = rNorm;
         rUpdate++;

         if (use_heavy_quark_res and heavy_quark_restart) {
           // perform a restart
           copyCuda(p, rSloppy);
           heavy_quark_restart = false;
         }
         else {
           // explicitly restore the orthogonality of the gradient vector
           double rp = reDotProductCuda(rSloppy, p) / (r2);
           axpyCuda(-rp, rSloppy, p);

           beta = r2 / r2_old;
           xpayCuda(rSloppy, beta, p);
         }

         steps_since_reliable = 0;
         heavy_quark_res_old = heavy_quark_res;
       }

       breakdown = false;
       k++;

       PrintStats("CG", k, r2, b2, heavy_quark_res);
       // check convergence, if convergence is satisfied we only need to check that we had a reliable update for the heavy quarks recently
       converged = convergence(r2, heavy_quark_res, stop, param.tol_hq);

       // check for recent enough reliable updates of the HQ residual if we use it
       if (use_heavy_quark_res) {
         // L2 is converged or precision maxed out for L2
         bool L2done = L2breakdown or convergenceL2(r2, heavy_quark_res, stop, param.tol_hq);
         // HQ is converged and if we do reliable update the HQ residual has been calculated using a reliable update
         bool HQdone = (steps_since_reliable == 0 and param.delta > 0) and convergenceHQ(r2, heavy_quark_res, stop, param.tol_hq);
         converged = L2done and HQdone;
       }

     }

     // assemble the final solution from the two accumulators
     copyCuda(x, xSloppy); // nop when these pointers alias
     xpyCuda(y, x);

     profile.Stop(QUDA_PROFILE_COMPUTE);
     profile.Start(QUDA_PROFILE_EPILOGUE);

     param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
     double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
     reduceDouble(gflops);
     param.gflops = gflops;
     param.iter += k;

     // Only warn when the iteration budget ran out without convergence; a
     // solve that converges on the very last permitted iteration also ends
     // with k == param.maxiter and must not be reported as a failure.
     if (!converged && k==param.maxiter)
       warningQuda("Exceeded maximum iterations %d", param.maxiter);

     if (getVerbosity() >= QUDA_VERBOSE)
       printfQuda("CG: Reliable updates = %d\n", rUpdate);

     // compute the true residuals
     mat(r, x, y);
     param.true_res = sqrt(xmyNormCuda(b, r) / b2);
 #if (__COMPUTE_CAPABILITY__ >= 200)
     param.true_res_hq = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
 #else
     param.true_res_hq = 0.0;
 #endif

     PrintSummary("CG", k, r2, b2);

     // reset the flops counters
     quda::blas_flops = 0;
     mat.flops();
     matSloppy.flops();

     profile.Stop(QUDA_PROFILE_EPILOGUE);
     profile.Start(QUDA_PROFILE_FREE);

     // Release only what was allocated above.  Ownership is decided by
     // pointer identity (the same test used for tmp2/tmp3), so a field is
     // freed exactly when it was created with new, independent of how the
     // precisions of x, b and the sloppy fields happen to compare.
     if (&tmp3 != &tmp) delete tmp3_p;
     if (&tmp2 != &tmp) delete tmp2_p;

     if (r_sloppy != &r) delete r_sloppy;
     if (x_sloppy != &x) delete x_sloppy;

     profile.Stop(QUDA_PROFILE_FREE);

     return;
   }


 } // namespace quda

invert_quda.h

quda::Solver::convergence
bool convergence(const double &r2, const double &hq2, const double &r2_tol, const double &hq_tol)
Definition: solver.cpp:82

QUDA_VERBOSE
Definition: enum_quda.h:217

quda::ColorSpinorParam::setPrecision
void setPrecision(QudaPrecision precision)
Definition: color_spinor_field.h:109

quda::tripleCGReductionCuda
double3 tripleCGReductionCuda(cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z)
Definition: reduce_quda.cu:811

quda::Solver::stopping
static double stopping(const double &tol, const double &b2, QudaResidualType residual_type)
Definition: solver.cpp:65

quda::SolverParam::delta
double delta
Definition: invert_quda.h:41

y
int y[4]
Definition: staggered_dslash_core.h:356

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20

quda::QUDA_PROFILE_FREE
Definition: quda_internal.h:150

color_spinor_field.h

quda::SolverParam::secs
double secs
Definition: invert_quda.h:140

quda::SolverParam::true_res_hq
double true_res_hq
Definition: invert_quda.h:72

quda::sqrt
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105

quda::axpyNormCuda
double axpyNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: reduce_quda.cu:321

quda::Complex
std::complex< double > Complex
Definition: eig_variables.h:13

quda::Solver::convergenceL2
bool convergenceL2(const double &r2, const double &hq2, const double &r2_tol, const double &hq_tol)
Definition: solver.cpp:110

quda::axpyZpbxCuda
void axpyZpbxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z, const double &b)
Definition: blas_quda.cu:338

mat
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
Definition: staggered_dslash_reference.cpp:136

quda::Solver::profile
TimeProfile & profile
Definition: invert_quda.h:224

quda::cudaColorSpinorField
Definition: color_spinor_field.h:302

util_quda.h

quda::SolverParam::iter
int iter
Definition: invert_quda.h:78

quda::SolverParam::max_res_increase_total
int max_res_increase_total
Definition: invert_quda.h:54

quda::TimeProfile
Definition: quda_internal.h:171

quda::axpyCGNormCuda
Complex axpyCGNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: reduce_quda.cu:682

quda::DiracMatrix::flops
unsigned long long flops() const
Definition: dirac_quda.h:587

param
QudaGaugeParam param
Definition: pack_test.cpp:17

quda::SolverParam::pipeline
int pipeline
Definition: invert_quda.h:57

tmp2
cudaColorSpinorField * tmp2
Definition: dslash_test.cpp:41

QUDA_COPY_FIELD_CREATE
Definition: enum_quda.h:305

tmp
cudaColorSpinorField * tmp
Definition: staggered_dslash_test.cpp:48

quda::Solver::PrintSummary
void PrintSummary(const char *name, int k, const double &r2, const double &b2)
Definition: solver.cpp:137

quda::SolverParam::gflops
double gflops
Definition: invert_quda.h:143

quda::SolverParam::residual_type
QudaResidualType residual_type
Definition: invert_quda.h:35

quda::CG::CG
CG(DiracMatrix &mat, DiracMatrix &matSloppy, SolverParam &param, TimeProfile &profile)
Definition: inv_cg_quda.cpp:19

quda::QUDA_PROFILE_EPILOGUE
Definition: quda_internal.h:149

quda::SolverParam::maxiter
int maxiter
Definition: invert_quda.h:75

csParam
ColorSpinorParam csParam
Definition: pack_test.cpp:24

face_quda.h

quda::QUDA_PROFILE_COMPUTE
Definition: quda_internal.h:148

warningQuda
#define warningQuda(...)
Definition: util_quda.h:84

quda::copyCuda
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
Definition: copy_quda.cu:235

quda::SolverParam::tol_hq
double tol_hq
Definition: invert_quda.h:66

quda::CG::operator()
void operator()(cudaColorSpinorField &out, cudaColorSpinorField &in)
Definition: inv_cg_quda.cpp:29

quda::QUDA_PROFILE_PREAMBLE
Definition: quda_internal.h:147

quda::axpyCuda
void axpyCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:115

QUDA_HEAVY_QUARK_RESIDUAL
Definition: enum_quda.h:149

quda::SolverParam::max_res_increase
int max_res_increase
Definition: invert_quda.h:49

x
int x[4]
Definition: hisq_paths_force_core.h:99

quda::blas_flops
unsigned long long blas_flops
Definition: blas_quda.cu:37

blas_quda.h

quda::xpyHeavyQuarkResidualNormCuda
double3 xpyHeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &r)
Definition: reduce_quda.cu:782

quda::SolverParam::precision
QudaPrecision precision
Definition: invert_quda.h:81

quda::QUDA_PROFILE_INIT
Definition: quda_internal.h:146

quda::SolverParam::true_res
double true_res
Definition: invert_quda.h:69

quda::Solver
Definition: invert_quda.h:220

quda::ColorSpinorParam
Definition: color_spinor_field.h:14

quda::Solver::param
SolverParam & param
Definition: invert_quda.h:223

quda::xpyCuda
void xpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:98

quda::reDotProductCuda
double reDotProductCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
Definition: reduce_quda.cu:170

quda::TimeProfile::Stop
void Stop(QudaProfileType idx)
Definition: quda_internal.h:194

quda::ColorSpinorField::Precision
QudaPrecision Precision() const
Definition: color_spinor_field.h:242

quda::Solver::PrintStats
void PrintStats(const char *, int k, const double &r2, const double &b2, const double &hq2)
Definition: solver.cpp:122

quda::TimeProfile::Last
double Last(QudaProfileType idx)
Definition: quda_internal.h:204

quda::DiracMatrix
Definition: dirac_quda.h:571

reduceDouble
void reduceDouble(double &)
Definition: face_buffer.cpp:530

dslash_quda.h

printfQuda
#define printfQuda(...)
Definition: util_quda.h:67

quda::zeroCuda
void zeroCuda(cudaColorSpinorField &a)
Definition: blas_quda.cu:40

quda::TimeProfile::Start
void Start(QudaProfileType idx)
Definition: quda_internal.h:184

quda::DiracMatrix::isStaggered
bool isStaggered() const
Definition: dirac_quda.h:594

quda::tripleCGUpdateCuda
void tripleCGUpdateCuda(const double &alpha, const double &beta, cudaColorSpinorField &q, cudaColorSpinorField &r, cudaColorSpinorField &x, cudaColorSpinorField &p)
Definition: blas_quda.cu:480

quda::SolverParam::precision_sloppy
QudaPrecision precision_sloppy
Definition: invert_quda.h:84

quda::SolverParam::use_sloppy_partial_accumulator
bool use_sloppy_partial_accumulator
Definition: invert_quda.h:44

QUDA_ZERO_FIELD_CREATE
Definition: enum_quda.h:304

quda::Solver::convergenceHQ
bool convergenceHQ(const double &r2, const double &hq2, const double &r2_tol, const double &hq_tol)
Definition: solver.cpp:99

quda::CG::~CG
virtual ~CG()
Definition: inv_cg_quda.cpp:25

quda::xpayCuda
void xpayCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y)
Definition: blas_quda.cu:138

quda::ColorSpinorParam::create
QudaFieldCreate create
Definition: color_spinor_field.h:25

quda::SolverParam
Definition: invert_quda.h:14

quda::HeavyQuarkResidualNormCuda
double3 HeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &r)
Definition: reduce_quda.cu:777

quda::norm2
double norm2(const ColorSpinorField &)
Definition: color_spinor_field.cpp:486

quda::xmyNormCuda
double xmyNormCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
Definition: reduce_quda.cu:343

quda::SolverParam::tol
double tol
Definition: invert_quda.h:60

quda_internal.h