22 long ds = end.tv_sec - start.tv_sec;
23 long dus = end.tv_usec - start.tv_usec;
24 return ds + 0.000001*dus;
53 for (
int i=0; i<k; i++) {
61 for (
int i=0; i<k-1; i++) {
62 beta[i+1][k] =
caxpyDotzyCuda(-beta[i][k], *Ap[i], *Ap[k], *Ap[i+1]);
64 caxpyCuda(-beta[k-1][k], *Ap[k-1], *Ap[k]);
67 for (
int i=0; i<k-2; i+=3) {
69 caxpbypczpwCuda(-beta[i][k], *Ap[i], -beta[i+1][k], *Ap[i+1], -beta[i+2][k], *Ap[i+2], *Ap[k]);
73 if ((k - 3*(k/3)) % 2 == 0) {
76 caxpbypzCuda(beta[k-2][k], *Ap[k-2], beta[k-1][k], *Ap[k-1], *Ap[k]);
79 caxpyCuda(beta[k-1][k], *Ap[k-1], *Ap[k]);
85 for (
int i=0; i<k-1; i+=2) {
87 caxpbypzCuda(-beta[i][k], *Ap[i], -beta[i+1][k], *Ap[i+1], *Ap[k]);
92 caxpyCuda(beta[k-1][k], *Ap[k-1], *Ap[k]);
96 errorQuda(
"Orthogonalization type not defined");
102 for (
int k=n-1; k>=0;k--) {
104 for (
int j=k+1;j<n; j++) {
105 delta[k] -= beta[k][j]*delta[j];
107 delta[k] /= gamma[k];
117 backSubs(alpha, beta, gamma, delta, k);
121 for (
int i=0; i<k-2; i+=3)
122 caxpbypczpwCuda(delta[i], *p[i], delta[i+1], *p[i+1], delta[i+2], *p[i+2], x);
125 if ((k - 3*(k/3)) % 2 == 0)
caxpbypzCuda(delta[k-2], *p[k-2], delta[k-1], *p[k-1], x);
134 Solver(param, profile), mat(mat), matSloppy(matSloppy), matPrecon(matPrecon), K(0), Kparam(param)
140 K =
new CG(matPrecon, matPrecon, Kparam, profile);
142 K =
new BiCGstab(matPrecon, matPrecon, matPrecon, Kparam, profile);
144 K =
new MR(matPrecon, Kparam, profile);
146 K =
new SD(matPrecon, Kparam, profile);
181 for (
int i=0; i<Nkrylov; i++) {
202 bool precMatch =
true;
219 for (
int i=0; i<Nkrylov; i++) beta[i] =
new Complex[Nkrylov];
220 double *gamma =
new double[Nkrylov];
224 for (
int i=0; i<4; i++) parity +=
commCoords(i);
254 const bool use_heavy_quark_res =
263 double heavy_quark_res = 0.0;
267 int resIncreaseTotal = 0;
279 bool l2_converge =
false;
285 PrintStats(
"GCR", total_iter+k, r2, b2, heavy_quark_res);
307 if (m==0) {
copyCuda(*p[k], pPre); }
314 matSloppy(*Ap[k], *p[k], tmp);
324 printfQuda(
"GCR debug iter=%d: Apr=(%e,%e,%e)\n", total_iter, Apr.x, Apr.y, Apr.z);
325 for (
int i=0; i<k; i++)
326 for (
int j=0; j<=k; j++)
327 printfQuda(
"GCR debug iter=%d: beta[%d][%d] = (%e,%e)\n",
328 total_iter, i, j, real(beta[i][j]), imag(beta[i][j]));
331 gamma[k] =
sqrt(Apr.z);
332 if (gamma[k] == 0.0)
errorQuda(
"GCR breakdown\n");
333 alpha[k] =
Complex(Apr.x, Apr.y) / gamma[k];
341 PrintStats(
"GCR", total_iter, r2, b2, heavy_quark_res);
362 warningQuda(
"GCR: new reliable residual norm %e is greater than previous reliable residual norm %e (total #inc %i)",
363 sqrt(r2),
sqrt(r2_old), resIncreaseTotal);
364 if (resIncrease > maxResIncrease or resIncreaseTotal > maxResIncreaseTotal)
break;
374 PrintStats(
"GCR (restart)", restart, r2, b2, heavy_quark_res);
381 if (r2 < stop) l2_converge =
true;
409 #if (__COMPUTE_CAPABILITY__ >= 200)
441 for (
int i=0; i<Nkrylov; i++) {
449 for (
int i=0; i<Nkrylov; i++)
delete []beta[i];
bool convergence(const double &r2, const double &hq2, const double &r2_tol, const double &hq_tol)
double timeInterval(struct timeval start, struct timeval end)
void setPrecision(QudaPrecision precision)
QudaSchwarzType schwarz_type
void caxpyCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
static double stopping(const double &tol, const double &b2, QudaResidualType residual_type)
QudaInverterType inv_type
QudaVerbosity getVerbosity()
__host__ __device__ ValueType sqrt(ValueType x)
std::complex< double > Complex
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
double cabxpyAxNormCuda(const double &a, const Complex &b, cudaColorSpinorField &x, cudaColorSpinorField &y)
QudaInverterType inv_type_precondition
QudaPreserveSource preserve_source
int max_res_increase_total
void orthoDir(Complex **beta, cudaColorSpinorField *Ap[], int k)
void fillInnerSolveParam(SolverParam &inner, const SolverParam &outer)
void updateSolution(cudaColorSpinorField &x, const Complex *alpha, Complex **const beta, double *gamma, int k, cudaColorSpinorField *p[])
unsigned long long flops() const
void backSubs(const Complex *alpha, Complex **const beta, const double *gamma, Complex *delta, int n)
GCR(DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matPrecon, SolverParam ¶m, TimeProfile &profile)
cudaColorSpinorField * tmp
void PrintSummary(const char *name, int k, const double &r2, const double &b2)
QudaResidualType residual_type
Complex cDotProductCuda(cudaColorSpinorField &, cudaColorSpinorField &)
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
double normCuda(const cudaColorSpinorField &b)
void axpyCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
Complex caxpyDotzyCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z)
void caxpbypczpwCuda(const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, cudaColorSpinorField &)
QudaPrecision precision_precondition
unsigned long long blas_flops
void xpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
void Stop(QudaProfileType idx)
void PrintStats(const char *, int k, const double &r2, const double &b2, const double &hq2)
double Last(QudaProfileType idx)
void reduceDouble(double &)
void operator()(cudaColorSpinorField &out, cudaColorSpinorField &in)
void caxpbypzCuda(const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, cudaColorSpinorField &)
void zeroCuda(cudaColorSpinorField &a)
double3 cDotProductNormACuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
void Start(QudaProfileType idx)
QudaUseInitGuess use_init_guess
QudaPrecision precision_sloppy
double3 HeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &r)
double norm2(const ColorSpinorField &)
double xmyNormCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)