QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 #include <math.h> 00004 00005 #include <quda_internal.h> 00006 #include <color_spinor_field.h> 00007 #include <blas_quda.h> 00008 #include <dslash_quda.h> 00009 #include <invert_quda.h> 00010 #include <util_quda.h> 00011 #include <face_quda.h> 00012 00013 //#include <sys/time.h> 00014 00015 00016 00027 MultiShiftCG::MultiShiftCG(DiracMatrix &mat, DiracMatrix &matSloppy, QudaInvertParam &invParam) 00028 : MultiShiftSolver(invParam), mat(mat), matSloppy(matSloppy) { 00029 00030 } 00031 00032 MultiShiftCG::~MultiShiftCG() { 00033 00034 } 00035 00036 void MultiShiftCG::operator()(cudaColorSpinorField **x, cudaColorSpinorField &b) 00037 { 00038 00039 int num_offset = invParam.num_offset; 00040 double *offset = invParam.offset; 00041 double *residue_sq = invParam.tol_offset; 00042 00043 if (num_offset == 0) return; 00044 00045 int *finished = new int [num_offset]; 00046 double *zeta_i = new double[num_offset]; 00047 double *zeta_im1 = new double[num_offset]; 00048 double *zeta_ip1 = new double[num_offset]; 00049 double *beta_i = new double[num_offset]; 00050 double *beta_im1 = new double[num_offset]; 00051 double *alpha = new double[num_offset]; 00052 int i, j; 00053 00054 int j_low = 0; 00055 int num_offset_now = num_offset; 00056 for (i=0; i<num_offset; i++) { 00057 finished[i] = 0; 00058 zeta_im1[i] = zeta_i[i] = 1.0; 00059 beta_im1[i] = -1.0; 00060 alpha[i] = 0.0; 00061 } 00062 00063 //double msq_x4 = offset[0]; 00064 00065 cudaColorSpinorField *r = new cudaColorSpinorField(b); 00066 00067 cudaColorSpinorField **x_sloppy = new cudaColorSpinorField*[num_offset], *r_sloppy; 00068 00069 ColorSpinorParam param; 00070 param.create = QUDA_ZERO_FIELD_CREATE; 00071 param.precision = invParam.cuda_prec_sloppy; 00072 00073 if (invParam.cuda_prec_sloppy == x[0]->Precision()) { 00074 for (i=0; i<num_offset; i++){ 00075 x_sloppy[i] = x[i]; 00076 zeroCuda(*x_sloppy[i]); 00077 } 00078 r_sloppy = r; 00079 } else { 00080 for (i=0; i<num_offset; i++) { 00081 x_sloppy[i] = new cudaColorSpinorField(*x[i], param); 00082 } 00083 param.create = QUDA_COPY_FIELD_CREATE; 00084 r_sloppy = new cudaColorSpinorField(*r, param); 00085 } 00086 00087 cudaColorSpinorField **p = new cudaColorSpinorField*[num_offset]; 00088 for(i=0;i < num_offset;i++){ 00089 p[i]= new cudaColorSpinorField(*r_sloppy); 00090 } 00091 00092 param.create = QUDA_ZERO_FIELD_CREATE; 00093 param.precision = invParam.cuda_prec_sloppy; 00094 cudaColorSpinorField* Ap = new cudaColorSpinorField(*r_sloppy, param); 00095 00096 cudaColorSpinorField tmp1(*Ap, param); 00097 00098 double b2 = 0.0; 00099 b2 = normCuda(b); 00100 00101 double r2 = b2; 00102 double r2_old; 00103 double stop = r2*invParam.tol*invParam.tol; // stopping condition of solver 00104 00105 double pAp; 00106 00107 int k = 0; 00108 00109 stopwatchStart(); 00110 while (r2 > stop && k < invParam.maxiter) { 00111 //dslashCuda_st(tmp_sloppy, fatlinkSloppy, longlinkSloppy, p[0], 1 - oddBit, 0); 00112 //dslashAxpyCuda(Ap, fatlinkSloppy, longlinkSloppy, tmp_sloppy, oddBit, 0, p[0], msq_x4); 00113 matSloppy(*Ap, *p[0], tmp1); 00114 if (invParam.dslash_type != QUDA_ASQTAD_DSLASH){ 00115 axpyCuda(offset[0], *p[0], *Ap); 00116 } 00117 pAp = reDotProductCuda(*p[0], *Ap); 00118 beta_i[0] = r2 / pAp; 00119 00120 zeta_ip1[0] = 1.0; 00121 for (j=1; j<num_offset_now; j++) { 00122 zeta_ip1[j] = zeta_i[j] * zeta_im1[j] * beta_im1[j_low]; 00123 double c1 = beta_i[j_low] * alpha[j_low] * (zeta_im1[j]-zeta_i[j]); 00124 double c2 = zeta_im1[j] * beta_im1[j_low] * (1.0+(offset[j]-offset[0])*beta_i[j_low]); 00125 /*THISBLOWSUP 00126 zeta_ip1[j] /= c1 + c2; 00127 beta_i[j] = beta_i[j_low] * zeta_ip1[j] / zeta_i[j]; 00128 */ 00129 /*TRYTHIS*/ 00130 if( (c1+c2) != 0.0 ) 00131 zeta_ip1[j] /= (c1 + c2); 00132 else { 00133 zeta_ip1[j] = 0.0; 00134 finished[j] = 1; 00135 } 00136 if( zeta_i[j] != 0.0) { 00137 beta_i[j] = beta_i[j_low] * zeta_ip1[j] / zeta_i[j]; 00138 } else { 00139 zeta_ip1[j] = 0.0; 00140 beta_i[j] = 0.0; 00141 finished[j] = 1; 00142 if (invParam.verbosity >= QUDA_VERBOSE) 00143 printfQuda("SETTING A ZERO, j=%d, num_offset_now=%d\n",j,num_offset_now); 00144 //if(j==num_offset_now-1)node0_PRINTF("REDUCING OFFSET\n"); 00145 if(j==num_offset_now-1) num_offset_now--; 00146 // don't work any more on finished solutions 00147 // this only works if largest offsets are last, otherwise 00148 // just wastes time multiplying by zero 00149 } 00150 } 00151 00152 r2_old = r2; 00153 r2 = axpyNormCuda(-beta_i[j_low], *Ap, *r_sloppy); 00154 00155 alpha[0] = r2 / r2_old; 00156 00157 for (j=1; j<num_offset_now; j++) { 00158 /*THISBLOWSUP 00159 alpha[j] = alpha[j_low] * zeta_ip1[j] * beta_i[j] / 00160 (zeta_i[j] * beta_i[j_low]); 00161 */ 00162 /*TRYTHIS*/ 00163 if( zeta_i[j] * beta_i[j_low] != 0.0) 00164 alpha[j] = alpha[j_low] * zeta_ip1[j] * beta_i[j] / 00165 (zeta_i[j] * beta_i[j_low]); 00166 else { 00167 alpha[j] = 0.0; 00168 finished[j] = 1; 00169 } 00170 } 00171 00172 axpyZpbxCuda(beta_i[0], *p[0], *x_sloppy[0], *r_sloppy, alpha[0]); 00173 for (j=1; j<num_offset_now; j++) { 00174 axpyBzpcxCuda(beta_i[j], *p[j], *x_sloppy[j], zeta_ip1[j], *r_sloppy, alpha[j]); 00175 } 00176 00177 for (j=0; j<num_offset_now; j++) { 00178 beta_im1[j] = beta_i[j]; 00179 zeta_im1[j] = zeta_i[j]; 00180 zeta_i[j] = zeta_ip1[j]; 00181 } 00182 00183 k++; 00184 if (invParam.verbosity >= QUDA_VERBOSE){ 00185 printfQuda("Multimass CG: %d iterations, r2 = %e\n", k, r2); 00186 } 00187 } 00188 00189 if (x[0]->Precision() != x_sloppy[0]->Precision()) { 00190 for(i=0;i < num_offset; i++){ 00191 copyCuda(*x[i], *x_sloppy[i]); 00192 } 00193 } 00194 00195 *residue_sq = r2; 00196 00197 invParam.secs = stopwatchReadSeconds(); 00198 00199 if (k==invParam.maxiter) { 00200 warningQuda("Exceeded maximum iterations %d\n", invParam.maxiter); 00201 } 00202 00203 double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9; 00204 reduceDouble(gflops); 00205 00206 invParam.gflops = gflops; 00207 invParam.iter = k; 00208 00209 // Calculate the true residual of the system with the smallest shift 00210 mat(*r, *x[0]); 00211 if (invParam.dslash_type != QUDA_ASQTAD_DSLASH){ 00212 axpyCuda(offset[0],*x[0], *r); // Offset it. 00213 } 00214 double true_res = xmyNormCuda(b, *r); 00215 if (invParam.verbosity >= QUDA_SUMMARIZE){ 00216 printfQuda("MultiShift CG: Converged after %d iterations, r2 = %e, relative true_r2 = %e\n", 00217 k,r2, (true_res / b2)); 00218 } 00219 if (invParam.verbosity >= QUDA_VERBOSE){ 00220 printfQuda("MultiShift CG: Converged after %d iterations\n", k); 00221 printfQuda(" shift=0 resid_rel=%e\n", sqrt(true_res/b2)); 00222 for(int i=1; i < num_offset; i++) { 00223 mat(*r, *x[i]); 00224 if (invParam.dslash_type != QUDA_ASQTAD_DSLASH){ 00225 axpyCuda(offset[i],*x[i], *r); // Offset it. 00226 }else{ 00227 axpyCuda(offset[i]-offset[0],*x[i], *r); // Offset it. 00228 } 00229 true_res = xmyNormCuda(b, *r); 00230 printfQuda(" shift=%d resid_rel=%e\n",i, sqrt(true_res/b2)); 00231 } 00232 } 00233 00234 delete r; 00235 for(i=0;i < num_offset; i++){ 00236 delete p[i]; 00237 } 00238 delete p; 00239 delete Ap; 00240 00241 if (invParam.cuda_prec_sloppy != x[0]->Precision()) { 00242 for(i=0;i < num_offset;i++){ 00243 delete x_sloppy[i]; 00244 } 00245 delete r_sloppy; 00246 } 00247 delete x_sloppy; 00248 00249 delete []finished; 00250 delete []zeta_i; 00251 delete []zeta_im1; 00252 delete []zeta_ip1; 00253 delete []beta_i; 00254 delete []beta_im1; 00255 delete []alpha; 00256 00257 } 00258