QUDA: quda/lib/inv_multi_cg_quda.cpp Source File

QUDA v0.3.2
A library for QCD on GPUs
00001 #include <stdio.h>
00002 #include <stdlib.h>
00003 #include <math.h>
00004 
00005 #include <quda_internal.h>
00006 #include <color_spinor_field.h>
00007 #include <blas_quda.h>
00008 #include <dslash_quda.h>
00009 #include <invert_quda.h>
00010 #include <util_quda.h>
00011 #include <sys/time.h>
00012 
00013 /* 
00014  * The lowest mass is in offsets[0]
00015  * The calling function must take care of that
00016  */
00017 int invertMultiShiftCgCuda(const DiracMatrix &mat, const DiracMatrix &matSloppy, cudaColorSpinorField **x, cudaColorSpinorField b,
00018                            QudaInvertParam *invert_param, double *offsets, int num_offsets, double *residue_sq)
00019 {
00020   
00021   if (num_offsets == 0) {
00022     return 0;
00023   }
00024 
00025   int finished[num_offsets];
00026   double shifts[num_offsets];
00027   double zeta_i[num_offsets], zeta_im1[num_offsets], zeta_ip1[num_offsets];
00028   double beta_i[num_offsets], beta_im1[num_offsets], alpha[num_offsets];
00029   int i, j;
00030   
00031   int j_low = 0;   
00032   int num_offsets_now = num_offsets;
00033   for (i=0; i<num_offsets; i++) {
00034     finished[i] = 0;
00035     shifts[i] = offsets[i] - offsets[0];
00036     zeta_im1[i] = zeta_i[i] = 1.0;
00037     beta_im1[i] = -1.0;
00038     alpha[i] = 0.0;
00039   }
00040   
00041   //double msq_x4 = offsets[0];
00042 
00043   cudaColorSpinorField *r = new cudaColorSpinorField(b);
00044   
00045   cudaColorSpinorField *x_sloppy[num_offsets], *r_sloppy;
00046   
00047   ColorSpinorParam param;
00048   param.create = QUDA_ZERO_FIELD_CREATE;
00049   param.precision = invert_param->cuda_prec_sloppy;
00050   
00051   if (invert_param->cuda_prec_sloppy == x[0]->Precision()) {
00052     for (i=0; i<num_offsets; i++){
00053       x_sloppy[i] = x[i];
00054       zeroCuda(*x_sloppy[i]);
00055     }
00056     r_sloppy = r;
00057   } else {
00058     for (i=0; i<num_offsets; i++) {
00059       x_sloppy[i] = new cudaColorSpinorField(*x[i], param);
00060     }
00061     param.create = QUDA_COPY_FIELD_CREATE;
00062     r_sloppy = new cudaColorSpinorField(*r, param);
00063   }
00064   
00065   cudaColorSpinorField* p[num_offsets];  
00066   for(i=0;i < num_offsets;i++){
00067     p[i]= new cudaColorSpinorField(*r_sloppy);    
00068   }
00069   
00070   param.create = QUDA_ZERO_FIELD_CREATE;
00071   param.precision = invert_param->cuda_prec_sloppy;
00072   cudaColorSpinorField* Ap = new cudaColorSpinorField(*r_sloppy, param);
00073   
00074   double b2 = 0.0;
00075   b2 = normCuda(b);
00076     
00077   double r2 = b2;
00078   double r2_old;
00079   double stop = r2*invert_param->tol*invert_param->tol; // stopping condition of solver
00080     
00081   double pAp;
00082     
00083   int k = 0;
00084     
00085   stopwatchStart();
00086   while (r2 > stop &&  k < invert_param->maxiter) {
00087     //dslashCuda_st(tmp_sloppy, fatlinkSloppy, longlinkSloppy, p[0], 1 - oddBit, 0);
00088     //dslashAxpyCuda(Ap, fatlinkSloppy, longlinkSloppy, tmp_sloppy, oddBit, 0, p[0], msq_x4);
00089     matSloppy(*Ap, *p[0]);
00090     
00091     pAp = reDotProductCuda(*p[0], *Ap);
00092     beta_i[0] = r2 / pAp;        
00093 
00094     zeta_ip1[0] = 1.0;
00095     for (j=1; j<num_offsets_now; j++) {
00096       zeta_ip1[j] = zeta_i[j] * zeta_im1[j] * beta_im1[j_low];
00097       double c1 = beta_i[j_low] * alpha[j_low] * (zeta_im1[j]-zeta_i[j]);
00098       double c2 = zeta_im1[j] * beta_im1[j_low] * (1.0+shifts[j]*beta_i[j_low]);
00099       /*THISBLOWSUP
00100         zeta_ip1[j] /= c1 + c2;
00101         beta_i[j] = beta_i[j_low] * zeta_ip1[j] / zeta_i[j];
00102       */
00103       /*TRYTHIS*/
00104       if( (c1+c2) != 0.0 )
00105         zeta_ip1[j] /= (c1 + c2); 
00106       else {
00107         zeta_ip1[j] = 0.0;
00108         finished[j] = 1;
00109       }
00110       if( zeta_i[j] != 0.0) {
00111         beta_i[j] = beta_i[j_low] * zeta_ip1[j] / zeta_i[j];
00112       } else  {
00113         zeta_ip1[j] = 0.0;
00114         beta_i[j] = 0.0;
00115         finished[j] = 1;
00116         if (invert_param->verbosity >= QUDA_VERBOSE)
00117           printfQuda("SETTING A ZERO, j=%d, num_offsets_now=%d\n",j,num_offsets_now);
00118         //if(j==num_offsets_now-1)node0_PRINTF("REDUCING OFFSETS\n");
00119         if(j==num_offsets_now-1)num_offsets_now--;
00120         // don't work any more on finished solutions
00121         // this only works if largest offsets are last, otherwise
00122         // just wastes time multiplying by zero
00123       }
00124     }   
00125         
00126     r2_old = r2;
00127     r2 = axpyNormCuda(-beta_i[j_low], *Ap, *r_sloppy);
00128 
00129     alpha[0] = r2 / r2_old;
00130         
00131     for (j=1; j<num_offsets_now; j++) {
00132       /*THISBLOWSUP
00133         alpha[j] = alpha[j_low] * zeta_ip1[j] * beta_i[j] /
00134         (zeta_i[j] * beta_i[j_low]);
00135       */
00136       /*TRYTHIS*/
00137       if( zeta_i[j] * beta_i[j_low] != 0.0)
00138         alpha[j] = alpha[j_low] * zeta_ip1[j] * beta_i[j] /
00139           (zeta_i[j] * beta_i[j_low]);
00140       else {
00141         alpha[j] = 0.0;
00142         finished[j] = 1;
00143       }
00144     }
00145         
00146     axpyZpbxCuda(beta_i[0], *p[0], *x_sloppy[0], *r_sloppy, alpha[0]);  
00147     for (j=1; j<num_offsets_now; j++) {
00148       axpyBzpcxCuda(beta_i[j], *p[j], *x_sloppy[j], zeta_ip1[j], *r_sloppy, alpha[j]);
00149     }
00150     
00151     for (j=0; j<num_offsets_now; j++) {
00152       beta_im1[j] = beta_i[j];
00153       zeta_im1[j] = zeta_i[j];
00154       zeta_i[j] = zeta_ip1[j];
00155     }
00156 
00157     k++;
00158     if (invert_param->verbosity >= QUDA_VERBOSE){
00159       printfQuda("Multimass CG: %d iterations, r2 = %e\n", k, r2);
00160     }
00161   }
00162     
00163   if (x[0]->Precision() != x_sloppy[0]->Precision()) {
00164     for(i=0;i < num_offsets; i++){
00165       copyCuda(*x[i], *x_sloppy[i]);
00166     }
00167   }
00168 
00169   *residue_sq = r2;
00170 
00171   invert_param->secs = stopwatchReadSeconds();
00172     
00173   if (k==invert_param->maxiter) {
00174     warningQuda("Exceeded maximum iterations %d\n", invert_param->maxiter);
00175   }
00176     
00177   float gflops = (blas_quda_flops + mat.flops() + matSloppy.flops())*1e-9;
00178   invert_param->gflops = gflops;
00179   invert_param->iter = k;
00180   //#if 0
00181   // Calculate the true residual
00182   mat(*r, *x[0]);    
00183   double true_res = xmyNormCuda(b, *r);
00184   if (invert_param->verbosity >= QUDA_SUMMARIZE){
00185     printfQuda("Converged after %d iterations, r2 = %e, relative true_r2 = %e\n", 
00186                k,r2, (true_res / b2));
00187   }    
00188   //#endif
00189     
00190   
00191   delete r;
00192   for(i=0;i < num_offsets; i++){
00193     delete p[i];
00194   }
00195   delete Ap;
00196   
00197   if (invert_param->cuda_prec_sloppy != x[0]->Precision()) {
00198     for(i=0;i < num_offsets;i++){
00199       delete x_sloppy[i];
00200     }
00201     delete r_sloppy;
00202   }
00203   
00204   
00205   return k;
00206 }
00207