22 long ds = end.tv_sec - start.tv_sec;
23 long dus = end.tv_usec - start.tv_usec;
24 return ds + 0.000001*dus;
55 for (
int i=0; i<k; i++) {
63 for (
int i=0; i<k-1; i++) {
64 beta[i+1][k] =
caxpyDotzyCuda(-beta[i][k], *Ap[i], *Ap[k], *Ap[i+1]);
66 caxpyCuda(-beta[k-1][k], *Ap[k-1], *Ap[k]);
69 for (
int i=0; i<k-2; i+=3) {
71 caxpbypczpwCuda(-beta[i][k], *Ap[i], -beta[i+1][k], *Ap[i+1], -beta[i+2][k], *Ap[i+2], *Ap[k]);
75 if ((k - 3*(k/3)) % 2 == 0) {
78 caxpbypzCuda(beta[k-2][k], *Ap[k-2], beta[k-1][k], *Ap[k-1], *Ap[k]);
81 caxpyCuda(beta[k-1][k], *Ap[k-1], *Ap[k]);
87 for (
int i=0; i<k-1; i+=2) {
89 caxpbypzCuda(-beta[i][k], *Ap[i], -beta[i+1][k], *Ap[i+1], *Ap[k]);
94 caxpyCuda(beta[k-1][k], *Ap[k-1], *Ap[k]);
98 errorQuda(
"Orthogonalization type not defined");
104 for (
int k=n-1; k>=0;k--) {
106 for (
int j=k+1;j<n; j++) {
107 delta[k] -= beta[k][j]*delta[j];
109 delta[k] /= gamma[k];
119 backSubs(alpha, beta, gamma, delta, k);
123 for (
int i=0; i<k-2; i+=3)
124 caxpbypczpwCuda(delta[i], *p[i], delta[i+1], *p[i+1], delta[i+2], *p[i+2], x);
127 if ((k - 3*(k/3)) % 2 == 0)
caxpbypzCuda(delta[k-2], *p[k-2], delta[k-1], *p[k-1], x);
136 Solver(invParam, profile), mat(mat), matSloppy(matSloppy), matPrecon(matPrecon), K(0)
143 K =
new CG(matPrecon, matPrecon, Kparam, profile);
145 K =
new BiCGstab(matPrecon, matPrecon, matPrecon, Kparam, profile);
147 K =
new MR(matPrecon, Kparam, profile);
176 for (
int i=0; i<Nkrylov; i++) {
197 bool precMatch =
true;
212 for (
int i=0; i<Nkrylov; i++) beta[i] =
new Complex[Nkrylov];
213 double *gamma =
new double[Nkrylov];
217 const bool use_heavy_quark_res =
220 double heavy_quark_res = 0.0;
227 for (
int i=0; i<4; i++) parity +=
commCoords(i);
247 bool l2_converge =
false;
252 PrintStats(
"GCR", total_iter+k, r2, b2, heavy_quark_res);
274 if (m==0) {
copyCuda(*p[k], pPre); }
281 matSloppy(*Ap[k], *p[k], tmp);
288 gamma[k] = sqrt(Apr.z);
289 if (gamma[k] == 0.0)
errorQuda(
"GCR breakdown\n");
290 alpha[k] =
Complex(Apr.x, Apr.y) / gamma[k];
298 PrintStats(
"GCR", total_iter, r2, b2, heavy_quark_res);
315 if (use_heavy_quark_res) {
322 PrintStats(
"GCR (restart)", restart, r2, b2, heavy_quark_res);
329 if (r2 < stop) l2_converge =
true;
355 #if (__COMPUTE_CAPABILITY__ >= 200)
385 for (
int i=0; i<Nkrylov; i++) {
393 for (
int i=0; i<Nkrylov; i++)
delete []beta[i];