20 #define MAX_EIGENVEC_WINDOW 16
22 #define SINGLE_PRECISION_EPSILON 1e-7
31 static DeflationParam *defl_param = 0;
32 static double eigcg_global_stop = 0.0;
87 if(addedvecs == 0)
return;
100 printfQuda(
"\nProjection matrix information:\n");
131 cudaEigvParam.eigv_dim = nev;
132 cudaEigvParam.eigv_id = -1;
155 template<
typename Float,
typename CudaComplex>
162 std::complex<Float> *hTm;
168 CudaComplex *dTvecm1;
184 int RestartVm(
void* vm,
const int cld,
const int clen,
const int vprec);
193 template<
typename Float,
typename CudaComplex>
196 ldm = ((m+15)/16)*16;
202 hTm =
new std::complex<Float>[ldm*m];
206 cudaMalloc(&dTm, ldm*m*
sizeof(CudaComplex));
207 cudaMalloc(&dTvecm, ldm*m*
sizeof(CudaComplex));
208 cudaMalloc(&dTvecm1, ldm*m*
sizeof(CudaComplex));
211 cudaMemset(dTm, 0, ldm*m*
sizeof(CudaComplex));
212 cudaMemset(dTvecm, 0, ldm*m*
sizeof(CudaComplex));
213 cudaMemset(dTvecm1, 0, ldm*m*
sizeof(CudaComplex));
221 template<
typename Float,
typename CudaComplex>
231 delete eigcg_magma_args;
236 template<
typename Float,
typename CudaComplex>
239 hTm[idx*ldm+
idx] = std::complex<Float>((
Float)(1.0/alpha + beta0/alpha0), 0.0);
243 template<
typename Float,
typename CudaComplex>
246 hTm[(idx+1)*ldm+idx] = std::complex<Float>((
Float)(-
sqrt(beta)/alpha), 0.0f);
247 hTm[idx*ldm+(idx+1)] = hTm[(idx+1)*ldm+
idx];
251 template<
typename Float,
typename CudaComplex>
255 cudaMemcpy(dTm, hTm, ldm*m*
sizeof(CudaComplex), cudaMemcpyDefault);
258 cudaMemcpy(dTvecm, dTm, ldm*m*
sizeof(CudaComplex), cudaMemcpyDefault);
259 eigcg_magma_args->MagmaHEEVD((
void*)dTvecm, (
void*)hTvalm, m);
262 cudaMemcpy(dTvecm1, dTm, ldm*m*
sizeof(CudaComplex), cudaMemcpyDefault);
263 eigcg_magma_args->MagmaHEEVD((
void*)dTvecm1, (
void*)hTvalm, m-1);
266 cudaMemset2D(&dTvecm1[(m-1)], ldm*
sizeof(CudaComplex), 0,
sizeof(CudaComplex), (m-1));
269 cudaMemcpy(&dTvecm[ldm*nev], dTvecm1, ldm*nev*
sizeof(CudaComplex), cudaMemcpyDefault);
272 int i = eigcg_magma_args->MagmaORTH_2nev((
void*)dTvecm, (
void*)dTm);
275 eigcg_magma_args->MagmaHEEVD((
void*)dTm, (
void*)hTvalm, i);
278 cudaMemset2D(&(dTm[i]), ldm*
sizeof(CudaComplex), 0, (m-i)*
sizeof(CudaComplex), i);
281 eigcg_magma_args->RestartV(v, cld, clen, vprec, (
void*)dTvecm, (
void*)dTm);
287 template<
typename Float,
typename CudaComplex>
290 memset(hTm, 0, ldm*m*
sizeof(std::complex<Float>));
291 for (
int i = 0; i < _2nev; i++) hTm[i*ldm+i]= hTvalm[i];
296 template<
typename Float,
typename CudaComplex>
300 for (
int i = 0; i < _2nev; i++){
303 hTm[_2nev*ldm+i] = std::complex<Float>((
Float)s.real(), (
Float)s.imag());
304 hTm[i*ldm+_2nev] =
conj(hTm[_2nev*ldm+i]);
309 template<
typename Float,
typename CudaComplex>
312 printfQuda(
"\nPrint eigenvalue accuracy after %d restart.\n", restart_num);
329 for (
int j = 0; j < nev; j++)
334 for (
int i = 0; i < j; i++)
338 hproj[j*nev+i] = alpha;
339 hproj[i*nev+j] =
conj(alpha);
345 hproj[j*nev+j] = alpha;
348 double *evals = (
double*)calloc(nev,
sizeof(
double));
350 cudaHostRegister(hproj, nev*nev*
sizeof(
Complex), cudaHostRegisterMapped);
354 magma_args2.
MagmaHEEVD(hproj, evals, nev,
true);
356 for(
int i = 0; i < nev; i++)
362 matDefl(*W2, *W, tmp);
366 evals[i] = dotWW2.real() / norm2W;
376 printfQuda(
"Eigenvalue %d: %1.12e Res.: %1.12e\n", i+1, evals[i], relerr);
384 cudaHostUnregister(hproj);
395 initCGparam.
iter = 0;
397 initCGparam.
secs = 0;
405 Vm(0), initCGparam(param), profile(profile), eigcg_alloc(false)
419 printfQuda(
"\nInitialize eigCG(m=%d, nev=%d) solver.\n", param.
m, param.
nev);
427 printfQuda(
"\nIncEigCG will deploy initCG solver.\n");
435 if(eigcg_alloc)
delete Vm;
447 const double b2 =
norm2(b);
451 printfQuda(
"Warning: inverting on zero-field source\n");
485 matSloppy(r, x, tmp, tmp2);
492 const bool use_heavy_quark_res =
501 double heavy_quark_res = 0.0;
503 int heavy_quark_check = 10;
505 double alpha=1.0, beta=0.0;
516 if(eigcg_alloc ==
false){
518 printfQuda(
"\nAllocating resources for the EigCG solver...\n");
543 double alpha0 = 1.0, beta0 = 0.0;
548 PrintStats(
"EigCG", k, r2, b2, heavy_quark_res);
558 beta = sigma / r2_old;
561 if (use_heavy_quark_res && k%heavy_quark_check==0) {
570 matSloppy(Ap, p, tmp, tmp2);
629 sigma = imag(cg_norm) >= 0.0 ? imag(cg_norm) : r2;
633 PrintStats(
"EigCG", k, r2, b2, heavy_quark_res);
652 printfQuda(
"EigCG: Eigenspace restarts = %d\n", eigvRestart);
657 matSloppy(r, x, tmp, tmp2);
660 #if (__COMPUTE_CAPABILITY__ >= 200)
674 if (&tmp2 != &tmp)
delete tmp2_p;
733 if(!use_eigcg || (newnevs == 0))
return;
735 if(!eigcg_alloc)
errorQuda(
"\nError: cannot expand deflation spase (eigcg ritz vectors were cleaned).\n");
737 printfQuda(
"\nConstruct projection matrix..\n");
741 if((newnevs + dpar->
cur_dim) > dpar->
ld)
errorQuda(
"\nIncorrect deflation space...\n");
749 for(
int j = 0; j < i; j++)
757 if(alpha.real() > 1e-16)
764 errorQuda(
"\nCannot orthogonalize %dth vector\n", i);
784 for (
int i = 0; i < j; i++)
811 int curr_evals = dpar->
cur_dim;
813 double *evals = (
double*)calloc(curr_evals,
sizeof(
double));
817 cudaHostRegister(projm, dpar->
ld*dpar->
tot_dim*
sizeof(
Complex), cudaHostRegisterMapped);
823 magma_args.
MagmaHEEVD(projm, evals, curr_evals,
true);
837 for(
int i = 0; i < nevs_to_print; i++)
843 matDefl(*W2, *W, tmp);
847 evals[i] = dotWW2.real() / norm2W;
853 double relerr =
sqrt(
norm2(*W) / norm2W );
857 printfQuda(
"Eigenvalue %d: %1.12e Residual: %1.12e\n", i+1, evals[i], relerr);
867 cudaHostUnregister(projm);
879 printf(
"\nToo big number of eigenvectors was requested, switched to maximum available number %d\n", dpar->
cur_dim);
883 double *evals = (
double*)calloc(dpar->
cur_dim,
sizeof(
double));
887 cudaHostRegister(projm, dpar->
ld*dpar->
tot_dim*
sizeof(
Complex), cudaHostRegisterMapped);
896 for(
int i = 0; i < dpar->
cur_dim; i++)
898 if(fabs(evals[i]) > 1e-16)
904 errorQuda(
"\nCannot invert Ritz value.\n");
919 if(eigcg_alloc ==
false){
921 printfQuda(
"\nAllocating resources for the eigenvectors...\n");
940 while ((relerr < tol) && (idx < max_nevs))
950 matDefl(*W2, *W, tmp);
954 evals[
idx] = dotWW2.real() / norm2W;
962 printfQuda(
"Eigenvalue %d: %1.12e Residual: %1.12e\n", (idx+1), evals[idx], relerr);
986 cudaHostUnregister(projm);
1002 double check_nrm2 =
norm2(b);
1006 for(
int i = 0; i < dpar->
cur_dim; i++)
1013 for(
int i = 0; i < dpar->
cur_dim; i++)
1018 check_nrm2 =
norm2(x);
1019 printfQuda(
"\nDeflated guess spinor norm (gpu): %1.15e\n",
sqrt(check_nrm2));
1030 if(dpar->
rtz_dim == 0)
return;
1032 double check_nrm2 =
norm2(b);
1036 for(
int i = 0; i < dpar->
rtz_dim; i++)
1045 check_nrm2 =
norm2(x);
1046 printfQuda(
"\nDeflated guess spinor norm (gpu): %1.15e\n",
sqrt(check_nrm2));
1054 const int first_idx = dpar->
cur_dim;
1062 if(cleanEigCGResources)
1077 const int spinorSize = 24;
1080 int nev_to_copy = nev > defl_param->
cur_dim ? defl_param->
cur_dim : nev;
1081 if(nev > defl_param->
cur_dim)
warningQuda(
"\nWill copy %d eigenvectors (requested %d)\n", nev_to_copy, nev);
1083 for(
int i = 0; i < nev_to_copy; i++)
1086 size_t offset = i*h_size;
1087 void *hptr = (
void*)((
char*)hu+offset);
1098 if(inv_eigenvals) memcpy(inv_eigenvals, defl_param->
ritz_values, nev_to_copy*
sizeof(
double));
1121 const bool use_reduced_vector_set =
true;
1123 const bool use_cg_updates =
false;
1125 const int eigcg_min_restarts = 3;
1127 int eigcg_restarts = 0;
1134 const double cg_iterref_tol = 5e-2;
1144 printfQuda(
"\nWarning: IncEigCG will deploy initCG solver.\n");
1168 printf(
"\nRunning solo precision EigCG.\n");
1170 eigcg_restarts =
EigCG(*out, *in);
1180 printf(
"\nRunning mixed precision EigCG.\n");
1184 const double stop =
norm2(*in)*ext_tol*ext_tol;
1186 double tot_time = 0.0;
1203 eigcg_restarts =
EigCG(*outSloppy, *inSloppy);
1223 double stop_div_r2 = stop / r2;
1225 eigcg_global_stop = stop;
1229 initCGparam.
tol = cg_iterref_tol;
1238 bool cg_updates = (use_cg_updates || (eigcg_restarts < eigcg_min_restarts) || (defl_param->
cur_dim == defl_param->
tot_dim) || (stop_div_r2 > cg_iterref_tol));
1240 bool eigcg_updates = !cg_updates;
1242 if(cg_updates) initCG =
new CG(matSloppy, matCGSloppy, initCGparam, profile);
1257 eigcg_restarts =
EigCG(*outSloppy, *inSloppy);
1261 (*initCG)(*outSloppy, *inSloppy);
1272 stop_div_r2 = stop / r2;
1274 tot_time += (eigcg_updates ?
param.
secs : initCGparam.
secs);
1278 if(((eigcg_restarts >= eigcg_min_restarts) || (stop_div_r2 < cg_iterref_tol)) && (defl_param->
cur_dim < defl_param->
tot_dim))
1287 if(!initCG && (r2 > stop)) initCG =
new CG(matSloppy, matCGSloppy, initCGparam, profile);
1291 eigcg_updates =
false;
1296 if(cg_updates && initCG)
1320 printfQuda(
"\ninitCG stat: %i iter / %g secs = %g Gflops. \n", initCGparam.
iter, initCGparam.
secs, initCGparam.
gflops);
1329 double full_tol = initCGparam.
tol;
1347 if(use_reduced_vector_set)
1354 const int max_restart_num = 3;
1356 int restart_idx = 0;
1358 const double inc_tol = 1e-2;
1366 initCGparam.
delta = 1e-2;
1369 while((restart_tol > full_tol) && (restart_idx < max_restart_num))
1371 initCGparam.
tol = restart_tol;
1373 initCG =
new CG(mat, matCGSloppy, initCGparam, profile);
1375 (*initCG)(*
out, *
in);
1379 matDefl(*W, *out, tmp);
1383 if(use_reduced_vector_set)
1392 printfQuda(
"\ninitCG stat: %i iter / %g secs = %g Gflops. \n", initCGparam.
iter, initCGparam.
secs, initCGparam.
gflops);
1395 double new_restart_tol = restart_tol*inc_tol;
1397 restart_tol = (new_restart_tol > full_tol) ? new_restart_tol : full_tol;
1404 initCGparam.
tol = full_tol;
1406 initCG =
new CG(mat, matCGSloppy, initCGparam, profile);
1408 (*initCG)(*
out, *
in);
1414 printfQuda(
"\ninitCG total stat (%d restarts): %i iter / %g secs = %g Gflops. \n", restart_idx, initCGparam.
iter, initCGparam.
secs, initCGparam.
gflops);
1433 if(use_reduced_vector_set){
1435 const int max_nev = defl_param->
cur_dim;
1437 double eigenval_tol = 1e-1;
1454 mat(*final_r, *out, *tmp2);
void DeleteDeflationSpace(DeflationParam *¶m)
bool in_incremental_stage
#define SINGLE_PRECISION_EPSILON
void setPrecision(QudaPrecision precision)
void caxpyCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
enum QudaPrecision_s QudaPrecision
void FillLanczosOffDiag(const int _2nev, cudaColorSpinorField *v, cudaColorSpinorField *u, double inv_sqrt_r2)
void LoadLanczosDiag(int idx, double alpha, double alpha0, double beta0)
QudaInverterType inv_type
QudaVerbosity getVerbosity()
void StoreRitzVecs(void *host_buf, double *inv_eigenvals, const int *X, QudaInvertParam *inv_par, const int nev, bool cleanResources=false)
void ReportEigenvalueAccuracy(DeflationParam *param, int nevs_to_print)
__host__ __device__ ValueType sqrt(ValueType x)
DeflationParam(ColorSpinorParam &eigv_param, SolverParam ¶m)
std::complex< double > Complex
void axpyZpbxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z, const double &b)
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
QudaPrecision precision_ritz
void SolveProjMatrix(void *rhs, const int ldn, const int n, void *H, const int ldH)
IncEigCG(DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matCGSloppy, DiracMatrix &matDefl, SolverParam ¶m, TimeProfile &profile)
int EigCG(cudaColorSpinorField &out, cudaColorSpinorField &in)
void DeflateSpinor(cudaColorSpinorField &out, cudaColorSpinorField &in, DeflationParam *param, bool set2zero=true)
Complex axpyCGNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
unsigned long long flops() const
cudaColorSpinorField * cudaRitzVectors
#define MAX_EIGENVEC_WINDOW
cudaColorSpinorField * tmp2
cudaColorSpinorField * tmp
void ResetDeflationCurrentDim(const int addedvecs)
QudaResidualType residual_type
int RestartVm(void *vm, const int cld, const int clen, const int vprec)
Complex cDotProductCuda(cudaColorSpinorField &, cudaColorSpinorField &)
int EigvTotalLength() const
void mxpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
FloatingPoint< float > Float
void fillInitCGSolveParam(SolverParam &initCGparam)
int EigvDim() const
for eigcg only:
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
bool convergence(const double &r2, const double &hq2, const double &r2_tol, const double &hq_tol)
double normCuda(const cudaColorSpinorField &b)
void FillLanczosDiag(const int _2nev)
void operator()(cudaColorSpinorField *out, cudaColorSpinorField *in)
void ExpandDeflationSpace(DeflationParam *param, const int new_nev)
void PrintSummary(const char *name, int k, const double &r2, const double &b2)
unsigned long long blas_flops
double3 xpyHeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &r)
void DeflateSpinorReduced(cudaColorSpinorField &out, cudaColorSpinorField &in, DeflationParam *param, bool set2zero=true)
void xpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
double reDotProductCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
cpuColorSpinorField * out
void Stop(QudaProfileType idx)
void DeleteEigCGSearchSpace()
void * memset(void *s, int c, size_t n)
void CreateDeflationSpace(cudaColorSpinorField &eigcgSpinor, DeflationParam *¶m)
QudaPrecision Precision() const
double Last(QudaProfileType idx)
void reduceDouble(double &)
void MagmaHEEVD(void *dTvecm, void *hTvalm, const int problem_size, bool host=false)
void zeroCuda(cudaColorSpinorField &a)
void LoadLanczosOffDiag(int idx, double alpha0, double beta0)
void Start(QudaProfileType idx)
void ReshapeDeviceRitzVectorsSet(const int nev, QudaPrecision new_ritz_prec=QUDA_INVALID_PRECISION)
void PrintStats(const char *, int k, const double &r2, const double &b2, const double &hq2)
QudaUseInitGuess use_init_guess
QudaPrecision precision_sloppy
bool use_sloppy_partial_accumulator
void SaveEigCGRitzVecs(DeflationParam *param, bool cleanResources=false)
void xpayCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y)
__host__ __device__ ValueType conj(ValueType x)
double3 HeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &r)
void CheckEigenvalues(const cudaColorSpinorField *Vm, const DiracMatrix &matDefl, const int restart_num)
void CleanDeviceRitzVectors()
void axCuda(const double &a, cudaColorSpinorField &x)
double norm2(const ColorSpinorField &)
void LoadEigenvectors(DeflationParam *param, int max_nevs, double tol=1e-3)
double xmyNormCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
QudaSiteSubset SiteSubset() const
QudaPrecision eigcg_precision
EigCGArgs(int m, int nev)
cudaColorSpinorField & Eigenvec(const int idx) const
for deflated solvers: