QUDA: quda/lib/inv_gcr_quda.cpp Source File

QUDA v0.4.0
A library for QCD on GPUs
00001 #include <stdio.h>
00002 #include <stdlib.h>
00003 #include <math.h>
00004 
00005 #include <complex>
00006 
00007 #include <quda_internal.h>
00008 #include <blas_quda.h>
00009 #include <dslash_quda.h>
00010 #include <invert_quda.h>
00011 #include <util_quda.h>
00012 
00013 #include<face_quda.h>
00014 
00015 #include <color_spinor_field.h>
00016 
00017 #include <sys/time.h>
00018 
00019 struct timeval orth0, orth1, pre0, pre1, mat0, mat1, rst0, rst1;
00020 
00021 double timeInterval(struct timeval start, struct timeval end) {
00022   long ds = end.tv_sec - start.tv_sec;
00023   long dus = end.tv_usec - start.tv_usec;
00024   return ds + 0.000001*dus;
00025 }
00026 
00027 // set the required parameters for the inner solver
00028 void fillInnerInvertParam(QudaInvertParam &inner, const QudaInvertParam &outer) {
00029   inner.tol = outer.tol_precondition;
00030   inner.maxiter = outer.maxiter_precondition;
00031   inner.reliable_delta = 1e-20; // no reliable updates within the inner solver
00032   
00033   inner.cuda_prec = outer.prec_precondition; // preconditioners are uni-precision solvers
00034   inner.cuda_prec_sloppy = outer.prec_precondition;
00035   
00036   inner.verbosity = outer.verbosity_precondition;
00037   
00038   inner.iter = 0;
00039   inner.gflops = 0;
00040   inner.secs = 0;
00041 
00042   inner.inv_type_precondition = QUDA_GCR_INVERTER; // used to tell the inner solver it is an inner solver
00043 
00044   if (outer.inv_type == QUDA_GCR_INVERTER && outer.cuda_prec_sloppy != outer.prec_precondition) 
00045     inner.preserve_source = QUDA_PRESERVE_SOURCE_NO;
00046   else inner.preserve_source = QUDA_PRESERVE_SOURCE_YES;
00047 
00048 }
00049 
00050 void orthoDir(quda::Complex **beta, cudaColorSpinorField *Ap[], int k) {
00051   gettimeofday(&orth0, NULL);
00052 
00053   int type = 1;
00054 
00055   switch (type) {
00056   case 0: // no kernel fusion
00057     for (int i=0; i<k; i++) { // 5 (k-1) memory transactions here
00058       beta[i][k] = cDotProductCuda(*Ap[i], *Ap[k]);
00059       caxpyCuda(-beta[i][k], *Ap[i], *Ap[k]);
00060     }
00061     break;
00062   case 1: // basic kernel fusion
00063     if (k==0) break;
00064     beta[0][k] = cDotProductCuda(*Ap[0], *Ap[k]);
00065     for (int i=0; i<k-1; i++) { // 4 (k-1) memory transactions here
00066       beta[i+1][k] = caxpyDotzyCuda(-beta[i][k], *Ap[i], *Ap[k], *Ap[i+1]);
00067     }
00068     caxpyCuda(-beta[k-1][k], *Ap[k-1], *Ap[k]);
00069     break;
00070   case 2: // 
00071     for (int i=0; i<k-2; i+=3) { // 5 (k-1) memory transactions here
00072       for (int j=i; j<i+3; j++) beta[j][k] = cDotProductCuda(*Ap[j], *Ap[k]);
00073       caxpbypczpwCuda(-beta[i][k], *Ap[i], -beta[i+1][k], *Ap[i+1], -beta[i+2][k], *Ap[i+2], *Ap[k]);
00074     }
00075     
00076     if (k%3 != 0) { // need to update the remainder
00077       if ((k - 3*(k/3)) % 2 == 0) {
00078         beta[k-2][k] = cDotProductCuda(*Ap[k-2], *Ap[k]);
00079         beta[k-1][k] = cDotProductCuda(*Ap[k-1], *Ap[k]);
00080         caxpbypzCuda(beta[k-2][k], *Ap[k-2], beta[k-1][k], *Ap[k-1], *Ap[k]);
00081       } else {
00082         beta[k-1][k] = cDotProductCuda(*Ap[k-1], *Ap[k]);
00083         caxpyCuda(beta[k-1][k], *Ap[k-1], *Ap[k]);
00084       }
00085     }
00086 
00087     break;
00088   case 3:
00089     for (int i=0; i<k-1; i+=2) {
00090       for (int j=i; j<i+2; j++) beta[j][k] = cDotProductCuda(*Ap[j], *Ap[k]);
00091       caxpbypzCuda(-beta[i][k], *Ap[i], -beta[i+1][k], *Ap[i+1], *Ap[k]);
00092     }
00093     
00094     if (k%2 != 0) { // need to update the remainder
00095       beta[k-1][k] = cDotProductCuda(*Ap[k-1], *Ap[k]);
00096       caxpyCuda(beta[k-1][k], *Ap[k-1], *Ap[k]);
00097     }
00098     break;
00099   default:
00100     errorQuda("Orthogonalization type not defined");
00101   }
00102 
00103   gettimeofday(&orth1, NULL);
00104 }   
00105 
00106 void backSubs(const quda::Complex *alpha, quda::Complex** const beta, const double *gamma, quda::Complex *delta, int n) {
00107   for (int k=n-1; k>=0;k--) {
00108     delta[k] = alpha[k];
00109     for (int j=k+1;j<n; j++) {
00110       delta[k] -= beta[k][j]*delta[j];
00111     }
00112     delta[k] /= gamma[k];
00113   }
00114 }
00115 
00116 void updateSolution(cudaColorSpinorField &x, const quda::Complex *alpha, quda::Complex** const beta, 
00117                     double *gamma, int k, cudaColorSpinorField *p[]) {
00118 
00119   quda::Complex *delta = new quda::Complex[k];
00120 
00121   // Update the solution vector
00122   backSubs(alpha, beta, gamma, delta, k);
00123   
00124   //for (int i=0; i<k; i++) caxpyCuda(delta[i], *p[i], x);
00125   
00126   for (int i=0; i<k-2; i+=3) 
00127     caxpbypczpwCuda(delta[i], *p[i], delta[i+1], *p[i+1], delta[i+2], *p[i+2], x); 
00128   
00129   if (k%3 != 0) { // need to update the remainder
00130     if ((k - 3*(k/3)) % 2 == 0) caxpbypzCuda(delta[k-2], *p[k-2], delta[k-1], *p[k-1], x);
00131     else caxpyCuda(delta[k-1], *p[k-1], x);
00132   }
00133 
00134   delete []delta;
00135 }
00136 
00137 GCR::GCR(DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matPrecon, QudaInvertParam &invParam) :
00138   Solver(invParam), mat(mat), matSloppy(matSloppy), matPrecon(matPrecon), K(0)
00139 {
00140 
00141   Kparam = newQudaInvertParam();
00142   fillInnerInvertParam(Kparam, invParam);
00143 
00144   if (invParam.inv_type_precondition == QUDA_CG_INVERTER) // inner CG preconditioner
00145     K = new CG(matPrecon, matPrecon, Kparam);
00146   else if (invParam.inv_type_precondition == QUDA_BICGSTAB_INVERTER) // inner BiCGstab preconditioner
00147     K = new BiCGstab(matPrecon, matPrecon, matPrecon, Kparam);
00148   else if (invParam.inv_type_precondition == QUDA_MR_INVERTER) // inner MR preconditioner
00149     K = new MR(matPrecon, Kparam);
00150   else if (invParam.inv_type_precondition != QUDA_INVALID_INVERTER) // unknown preconditioner
00151     errorQuda("Unknown inner solver %d", invParam.inv_type_precondition);
00152 
00153 }
00154 
00155 GCR::~GCR() {
00156   if (K) delete K;
00157 }
00158 
00159 void GCR::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b)
00160 {
00161   int Nkrylov = invParam.gcrNkrylov; // size of Krylov space
00162 
00163   ColorSpinorParam param(x);
00164   param.create = QUDA_ZERO_FIELD_CREATE;
00165   cudaColorSpinorField r(x, param); 
00166 
00167   cudaColorSpinorField y(x, param); // high precision accumulator
00168 
00169   // create sloppy fields used for orthogonalization
00170   param.precision = invParam.cuda_prec_sloppy;
00171   cudaColorSpinorField **p = new cudaColorSpinorField*[Nkrylov];
00172   cudaColorSpinorField **Ap = new cudaColorSpinorField*[Nkrylov];
00173   for (int i=0; i<Nkrylov; i++) {
00174     p[i] = new cudaColorSpinorField(x, param);
00175     Ap[i] = new cudaColorSpinorField(x, param);
00176   }
00177 
00178   cudaColorSpinorField tmp(x, param); //temporary for sloppy mat-vec
00179 
00180   cudaColorSpinorField *x_sloppy, *r_sloppy;
00181   if (invParam.cuda_prec_sloppy != invParam.cuda_prec) {
00182     param.precision = invParam.cuda_prec_sloppy;
00183     x_sloppy = new cudaColorSpinorField(x, param);
00184     r_sloppy = new cudaColorSpinorField(x, param);
00185   } else {
00186     x_sloppy = &x;
00187     r_sloppy = &r;
00188   }
00189 
00190   cudaColorSpinorField &xSloppy = *x_sloppy;
00191   cudaColorSpinorField &rSloppy = *r_sloppy;
00192 
00193   // these low precision fields are used by the inner solver
00194   bool precMatch = true;
00195   cudaColorSpinorField *r_pre, *p_pre;
00196   if (invParam.prec_precondition != invParam.cuda_prec_sloppy) {
00197     param.precision = invParam.prec_precondition;
00198     p_pre = new cudaColorSpinorField(x, param);
00199     r_pre = new cudaColorSpinorField(x, param);
00200     precMatch = false;
00201   } else {
00202     p_pre = NULL;
00203     r_pre = r_sloppy;
00204   }
00205   cudaColorSpinorField &rPre = *r_pre;
00206 
00207   quda::Complex *alpha = new quda::Complex[Nkrylov];
00208   quda::Complex **beta = new quda::Complex*[Nkrylov];
00209   for (int i=0; i<Nkrylov; i++) beta[i] = new quda::Complex[Nkrylov];
00210   double *gamma = new double[Nkrylov];
00211 
00212   double b2 = normCuda(b);
00213 
00214   double stop = b2*invParam.tol*invParam.tol; // stopping condition of solver
00215 
00216   int k = 0;
00217 
00218   // calculate initial residual
00219   mat(r, x);
00220   double r2 = xmyNormCuda(b, r);  
00221   copyCuda(rSloppy, r);
00222 
00223   quda::blas_flops = 0;
00224 
00225   stopwatchStart();
00226 
00227   int total_iter = 0;
00228   int restart = 0;
00229   double r2_old = r2;
00230 
00231   if (invParam.verbosity >= QUDA_VERBOSE) 
00232     printfQuda("GCR: %d total iterations, %d Krylov iterations, r2 = %e\n", total_iter+k, k, r2);
00233 
00234   double orthT = 0, matT = 0, preT = 0, resT = 0;
00235 
00236   while (r2 > stop && total_iter < invParam.maxiter) {
00237     
00238     gettimeofday(&pre0, NULL);
00239 
00240     if (invParam.inv_type_precondition != QUDA_INVALID_INVERTER) {
00241       cudaColorSpinorField &pPre = (precMatch ? *p[k] : *p_pre);
00242 
00243       copyCuda(rPre, rSloppy);
00244 
00245       (*K)(pPre, rPre);
00246 
00247       // relaxation p = omega*p + (1-omega)*r
00248       if (invParam.omega!=1.0) axpbyCuda((1.0-invParam.omega), rPre, invParam.omega, pPre);
00249 
00250       copyCuda(*p[k], pPre);
00251     } else { // no preconditioner
00252       *p[k] = rSloppy;
00253     } 
00254 
00255 
00256     gettimeofday(&pre1, NULL);
00257 
00258     gettimeofday(&mat0, NULL);
00259     matSloppy(*Ap[k], *p[k], tmp);
00260     gettimeofday(&mat1, NULL);
00261 
00262     orthoDir(beta, Ap, k);
00263 
00264     double3 Apr = cDotProductNormACuda(*Ap[k], rSloppy);
00265 
00266     gamma[k] = sqrt(Apr.z); // gamma[k] = Ap[k]
00267     if (gamma[k] == 0.0) errorQuda("GCR breakdown\n");
00268     alpha[k] = quda::Complex(Apr.x, Apr.y) / gamma[k]; // alpha = (1/|Ap|) * (Ap, r)
00269 
00270     // r -= (1/|Ap|^2) * (Ap, r) r, Ap *= 1/|Ap|
00271     r2 = cabxpyAxNormCuda(1.0/gamma[k], -alpha[k], *Ap[k], rSloppy); 
00272 
00273     if (invParam.verbosity >= QUDA_DEBUG_VERBOSE) {
00274       double x2 = norm2(x);
00275       double p2 = norm2(*p[k]);
00276       double Ap2 = norm2(*Ap[k]);
00277       printfQuda("GCR: alpha = (%e,%e), norm2(x) = %e, norm2(p) = %e, norm2(Ap) = %e\n", 
00278                  real(alpha[k]), imag(alpha[k]), x2, p2, Ap2);
00279     }
00280 
00281     k++;
00282     total_iter++;
00283 
00284     if (invParam.verbosity >= QUDA_VERBOSE) 
00285       printfQuda("GCR: %d total iterations, %d Krylov iterations, r2 = %e\n", total_iter, k, r2);
00286 
00287     gettimeofday(&rst0, NULL);
00288     // update solution and residual since max Nkrylov reached, converged or reliable update required
00289     if (k==Nkrylov || r2 < stop || r2/r2_old < invParam.reliable_delta) { 
00290 
00291       // update the solution vector
00292       updateSolution(xSloppy, alpha, beta, gamma, k, p);
00293 
00294       // recalculate residual in high precision
00295       copyCuda(x, xSloppy);
00296       xpyCuda(x, y);
00297 
00298       double r2Sloppy = r2;
00299 
00300       k = 0;
00301       mat(r, y);
00302       r2 = xmyNormCuda(b, r);  
00303 
00304       if (r2 > stop) {
00305         restart++; // restarting if residual is still too great
00306 
00307         if (invParam.verbosity >= QUDA_VERBOSE) 
00308           printfQuda("\nGCR: restart %d, iterated r2 = %e, true r2 = %e\n", restart, r2Sloppy, r2);
00309       }
00310 
00311       copyCuda(rSloppy, r);
00312       zeroCuda(xSloppy);
00313 
00314       r2_old = r2;
00315     }
00316     gettimeofday(&rst1, NULL);
00317 
00318     orthT += timeInterval(orth0, orth1);
00319     matT += timeInterval(mat0, mat1);
00320     preT += timeInterval(pre0, pre1);
00321     resT += timeInterval(rst0, rst1);
00322 
00323   }
00324 
00325   copyCuda(x, y);
00326 
00327   if (k>=invParam.maxiter && invParam.verbosity >= QUDA_SUMMARIZE) 
00328     warningQuda("Exceeded maximum iterations %d", invParam.maxiter);
00329 
00330   if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("GCR: number of restarts = %d\n", restart);
00331   
00332   invParam.secs += stopwatchReadSeconds();
00333   
00334   double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops() + matPrecon.flops())*1e-9;
00335   reduceDouble(gflops);
00336 
00337   printfQuda("%f gflops %e Preconditoner = %e, Mat-Vec = %e, orthogonolization %e restart %e\n", 
00338              gflops / stopwatchReadSeconds(), invParam.secs, preT, matT, orthT, resT);
00339   invParam.gflops += gflops;
00340   invParam.iter += total_iter;
00341   
00342   if (invParam.verbosity >= QUDA_SUMMARIZE) {
00343     // Calculate the true residual
00344     mat(r, x);
00345     double true_res = xmyNormCuda(b, r);
00346     
00347     printfQuda("GCR: Converged after %d iterations, relative residua: iterated = %e, true = %e\n", 
00348                total_iter, sqrt(r2/b2), sqrt(true_res / b2));    
00349   }
00350 
00351   if (invParam.cuda_prec_sloppy != invParam.cuda_prec) {
00352     delete x_sloppy;
00353     delete r_sloppy;
00354   }
00355 
00356   if (invParam.prec_precondition != invParam.cuda_prec_sloppy) {
00357     delete p_pre;
00358     delete r_pre;
00359   }
00360 
00361   for (int i=0; i<Nkrylov; i++) {
00362     delete p[i];
00363     delete Ap[i];
00364   }
00365   delete[] p;
00366   delete[] Ap;
00367 
00368   delete alpha;
00369   for (int i=0; i<Nkrylov; i++) delete []beta[i];
00370   delete []beta;
00371   delete gamma;
00372 
00373   return;
00374 }