39 const double *r2,
const double *beta,
const double pAp,
40 const double *offset,
const int nShift,
const int j_low) {
42 for (
int j=0; j<nShift; j++) alpha_old[j] = alpha[j];
44 alpha[0] = r2[0] / pAp;
46 for (
int j=1; j<nShift; j++) {
47 double c0 = zeta[j] * zeta_old[j] * alpha_old[j_low];
48 double c1 = alpha[j_low] * beta[j_low] * (zeta_old[j]-zeta[j]);
49 double c2 = zeta_old[j] * alpha_old[j_low] * (1.0+(offset[j]-offset[0])*alpha[j_low]);
51 zeta_old[j] = zeta[j];
53 zeta[j] = c0 / (c1 + c2);
59 alpha[j] = alpha[j_low] * zeta[j] / zeta_old[j];
74 if (num_offset == 0)
return;
80 printfQuda(
"Warning: inverting on zero-field source\n");
81 for(
int i=0; i<num_offset; ++i){
90 double *zeta =
new double[num_offset];
91 double *zeta_old =
new double[num_offset];
92 double *alpha =
new double[num_offset];
93 double *beta =
new double[num_offset];
96 int num_offset_now = num_offset;
97 for (
int i=0; i<num_offset; i++) {
98 zeta[i] = zeta_old[i] = 1.0;
105 for (
int j=0; j<num_offset; j++)
131 for (
int i=0; i<num_offset; i++) x_sloppy[i] = x[i];
134 for (
int i=0; i<num_offset; i++)
164 for (
int i=0; i<num_offset; i++) {
176 for (
int i=0; i<num_offset; i++) {
177 rNorm[i] =
sqrt(r2[i]);
178 r0Norm[i] = rNorm[i];
192 for (
int i=0; i<num_offset; i++) {
193 resIncreaseTotal[i]=0;
204 printfQuda(
"MultiShift CG: %d iterations, <r,r> = %e, |r|/|b| = %e\n", k, r2[0],
sqrt(r2[0]/b2));
214 updateAlphaZeta(alpha, zeta, zeta_old, r2, beta, pAp, offset, num_offset_now, j_low);
218 r2[0] = real(cg_norm);
219 double zn = imag(cg_norm);
222 rNorm[0] =
sqrt(r2[0]);
223 for (
int j=1; j<num_offset_now; j++) rNorm[j] = rNorm[0] * zeta[j];
225 int updateX=0, updateR=0;
226 int reliable_shift = -1;
227 for (
int j=num_offset_now-1; j>=0; j--) {
228 if (rNorm[j] > maxrx[j]) maxrx[j] = rNorm[j];
229 if (rNorm[j] > maxrr[j]) maxrr[j] = rNorm[j];
230 updateX = (rNorm[j] < delta*r0Norm[j] && r0Norm[j] <= maxrx[j]) ? 1 : updateX;
231 updateR = ((rNorm[j] < delta*maxrr[j] && r0Norm[j] <= maxrr[j]) || updateX) ? 1 : updateR;
232 if ((updateX || updateR) && reliable_shift == -1) reliable_shift = j;
235 if ( !(updateR || updateX) || !
reliable) {
237 beta[0] = zn / r2_old;
239 axpyZpbxCuda(alpha[0], *p[0], *x_sloppy[0], *r_sloppy, beta[0]);
241 for (
int j=1; j<num_offset_now; j++) {
242 beta[j] = beta[j_low] * zeta[j] * alpha[j] / (zeta_old[j] * alpha[j_low]);
244 axpyBzpcxCuda(alpha[j], *p[j], *x_sloppy[j], zeta[j], *r_sloppy, beta[j]);
247 for (
int j=0; j<num_offset_now; j++) {
248 axpyCuda(alpha[j], *p[j], *x_sloppy[j]);
253 mat(*r, *y[0], *x[0], tmp3);
257 for (
int j=1; j<num_offset_now; j++) r2[j] = zeta[j] * zeta[j] * r2[0];
258 for (
int j=0; j<num_offset_now; j++)
zeroCuda(*x_sloppy[j]);
264 if (
sqrt(r2[reliable_shift]) > r0Norm[reliable_shift]) {
266 resIncreaseTotal[reliable_shift]++;
267 warningQuda(
"MultiShiftCG: Shift %d, updated residual %e is greater than previous residual %e (total #inc %i)",
268 reliable_shift,
sqrt(r2[reliable_shift]), r0Norm[reliable_shift], resIncreaseTotal[reliable_shift]);
271 if (resIncrease > maxResIncrease or resIncreaseTotal[reliable_shift] > maxResIncreaseTotal)
break;
277 for (
int j=0; j<num_offset_now; j++) {
283 beta[0] = r2[0] / r2_old;
284 xpayCuda(*r_sloppy, beta[0], *p[0]);
285 for (
int j=1; j<num_offset_now; j++) {
286 beta[j] = beta[j_low] * zeta[j] * alpha[j] / (zeta_old[j] * alpha[j_low]);
287 axpbyCuda(zeta[j], *r_sloppy, beta[j], *p[j]);
291 int m = reliable_shift;
292 rNorm[m] =
sqrt(r2[0]) * zeta[m];
295 r0Norm[m] = rNorm[m];
300 for (
int j=1; j<num_offset_now; j++) {
301 if (zeta[j] == 0.0) {
304 printfQuda(
"MultiShift CG: Shift %d converged after %d iterations\n", j, k + 1);
307 r2[j] = zeta[j] * zeta[j] * r2[0];
308 if (r2[j] < stop[j]) {
311 printfQuda(
"MultiShift CG: Shift %d converged after %d iterations\n", j, k+1);
319 printfQuda(
"MultiShift CG: %d iterations, <r,r> = %e, |r|/|b| = %e\n", k, r2[0],
sqrt(r2[0]/b2));
323 for (
int i=0; i<num_offset; i++) {
325 if (reliable)
xpyCuda(*y[i], *x[i]);
332 printfQuda(
"MultiShift CG: Reliable updates = %d\n", rUpdate);
342 for(
int i=0; i < num_offset; i++) {
347 axpyCuda(offset[i]-offset[0], *x[i], *r);
351 #if (__COMPUTE_CAPABILITY__ >= 200)
359 printfQuda(
"MultiShift CG: Converged after %d iterations\n", k);
360 for(
int i=0; i < num_offset; i++) {
361 printfQuda(
" shift=%d, relative residual: iterated = %e, true = %e\n",
374 if (&tmp3 != &tmp1)
delete tmp3_p;
375 if (&tmp2 != &tmp1)
delete tmp2_p;
378 for (
int i=0; i<num_offset; i++)
379 if (x_sloppy[i]->Precision() != x[i]->
Precision())
delete x_sloppy[i];
383 for (
int i=0; i<num_offset; i++)
delete p[i];
387 for (
int i=0; i<num_offset; i++)
delete y[i];
void setPrecision(QudaPrecision precision)
static double stopping(const double &tol, const double &b2, QudaResidualType residual_type)
#define QUDA_MAX_MULTI_SHIFT
Maximum number of shifts supported by the multi-shift solver. This number may be changed if need be...
QudaVerbosity getVerbosity()
__host__ __device__ ValueType sqrt(ValueType x)
std::complex< double > Complex
void axpbyCuda(const double &a, cudaColorSpinorField &x, const double &b, cudaColorSpinorField &y)
cudaColorSpinorField * tmp1
void axpyZpbxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z, const double &b)
void updateAlphaZeta(double *alpha, double *zeta, double *zeta_old, const double *r2, const double *beta, const double pAp, const double *offset, const int nShift, const int j_low)
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
double tol_offset[QUDA_MAX_MULTI_SHIFT]
double offset[QUDA_MAX_MULTI_SHIFT]
int max_res_increase_total
Complex axpyCGNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
unsigned long long flops() const
cudaColorSpinorField * tmp2
void axpyBzpcxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y, const double &b, cudaColorSpinorField &z, const double &c)
QudaResidualType residual_type
const DiracMatrix & matSloppy
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
double normCuda(const cudaColorSpinorField &b)
void axpyCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
void operator()(cudaColorSpinorField **out, cudaColorSpinorField &in)
unsigned long long blas_flops
void xpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
double reDotProductCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
void Stop(QudaProfileType idx)
QudaPrecision Precision() const
double Last(QudaProfileType idx)
void reduceDouble(double &)
MultiShiftCG(DiracMatrix &mat, DiracMatrix &matSloppy, SolverParam ¶m, TimeProfile &profile)
int reliable(double &rNorm, double &maxrx, double &maxrr, const double &r2, const double &delta)
void zeroCuda(cudaColorSpinorField &a)
void Start(QudaProfileType idx)
QudaPrecision precision_sloppy
bool use_sloppy_partial_accumulator
void xpayCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y)
double3 HeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &r)
double xmyNormCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)