|
QUDA v0.3.2
A library for QCD on GPUs
|
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 #include <math.h> 00004 00005 #include <quda_internal.h> 00006 #include <color_spinor_field.h> 00007 #include <blas_quda.h> 00008 #include <dslash_quda.h> 00009 #include <invert_quda.h> 00010 #include <util_quda.h> 00011 #include <sys/time.h> 00012 00013 /* 00014 * The lowest mass is in offsets[0] 00015 * The calling function must take care of that 00016 */ 00017 int invertMultiShiftCgCuda(const DiracMatrix &mat, const DiracMatrix &matSloppy, cudaColorSpinorField **x, cudaColorSpinorField b, 00018 QudaInvertParam *invert_param, double *offsets, int num_offsets, double *residue_sq) 00019 { 00020 00021 if (num_offsets == 0) { 00022 return 0; 00023 } 00024 00025 int finished[num_offsets]; 00026 double shifts[num_offsets]; 00027 double zeta_i[num_offsets], zeta_im1[num_offsets], zeta_ip1[num_offsets]; 00028 double beta_i[num_offsets], beta_im1[num_offsets], alpha[num_offsets]; 00029 int i, j; 00030 00031 int j_low = 0; 00032 int num_offsets_now = num_offsets; 00033 for (i=0; i<num_offsets; i++) { 00034 finished[i] = 0; 00035 shifts[i] = offsets[i] - offsets[0]; 00036 zeta_im1[i] = zeta_i[i] = 1.0; 00037 beta_im1[i] = -1.0; 00038 alpha[i] = 0.0; 00039 } 00040 00041 //double msq_x4 = offsets[0]; 00042 00043 cudaColorSpinorField *r = new cudaColorSpinorField(b); 00044 00045 cudaColorSpinorField *x_sloppy[num_offsets], *r_sloppy; 00046 00047 ColorSpinorParam param; 00048 param.create = QUDA_ZERO_FIELD_CREATE; 00049 param.precision = invert_param->cuda_prec_sloppy; 00050 00051 if (invert_param->cuda_prec_sloppy == x[0]->Precision()) { 00052 for (i=0; i<num_offsets; i++){ 00053 x_sloppy[i] = x[i]; 00054 zeroCuda(*x_sloppy[i]); 00055 } 00056 r_sloppy = r; 00057 } else { 00058 for (i=0; i<num_offsets; i++) { 00059 x_sloppy[i] = new cudaColorSpinorField(*x[i], param); 00060 } 00061 param.create = QUDA_COPY_FIELD_CREATE; 00062 r_sloppy = new cudaColorSpinorField(*r, param); 00063 } 00064 00065 cudaColorSpinorField* p[num_offsets]; 00066 for(i=0;i < num_offsets;i++){ 00067 p[i]= new cudaColorSpinorField(*r_sloppy); 00068 } 00069 00070 param.create = QUDA_ZERO_FIELD_CREATE; 00071 param.precision = invert_param->cuda_prec_sloppy; 00072 cudaColorSpinorField* Ap = new cudaColorSpinorField(*r_sloppy, param); 00073 00074 double b2 = 0.0; 00075 b2 = normCuda(b); 00076 00077 double r2 = b2; 00078 double r2_old; 00079 double stop = r2*invert_param->tol*invert_param->tol; // stopping condition of solver 00080 00081 double pAp; 00082 00083 int k = 0; 00084 00085 stopwatchStart(); 00086 while (r2 > stop && k < invert_param->maxiter) { 00087 //dslashCuda_st(tmp_sloppy, fatlinkSloppy, longlinkSloppy, p[0], 1 - oddBit, 0); 00088 //dslashAxpyCuda(Ap, fatlinkSloppy, longlinkSloppy, tmp_sloppy, oddBit, 0, p[0], msq_x4); 00089 matSloppy(*Ap, *p[0]); 00090 00091 pAp = reDotProductCuda(*p[0], *Ap); 00092 beta_i[0] = r2 / pAp; 00093 00094 zeta_ip1[0] = 1.0; 00095 for (j=1; j<num_offsets_now; j++) { 00096 zeta_ip1[j] = zeta_i[j] * zeta_im1[j] * beta_im1[j_low]; 00097 double c1 = beta_i[j_low] * alpha[j_low] * (zeta_im1[j]-zeta_i[j]); 00098 double c2 = zeta_im1[j] * beta_im1[j_low] * (1.0+shifts[j]*beta_i[j_low]); 00099 /*THISBLOWSUP 00100 zeta_ip1[j] /= c1 + c2; 00101 beta_i[j] = beta_i[j_low] * zeta_ip1[j] / zeta_i[j]; 00102 */ 00103 /*TRYTHIS*/ 00104 if( (c1+c2) != 0.0 ) 00105 zeta_ip1[j] /= (c1 + c2); 00106 else { 00107 zeta_ip1[j] = 0.0; 00108 finished[j] = 1; 00109 } 00110 if( zeta_i[j] != 0.0) { 00111 beta_i[j] = beta_i[j_low] * zeta_ip1[j] / zeta_i[j]; 00112 } else { 00113 zeta_ip1[j] = 0.0; 00114 beta_i[j] = 0.0; 00115 finished[j] = 1; 00116 if (invert_param->verbosity >= QUDA_VERBOSE) 00117 printfQuda("SETTING A ZERO, j=%d, num_offsets_now=%d\n",j,num_offsets_now); 00118 //if(j==num_offsets_now-1)node0_PRINTF("REDUCING OFFSETS\n"); 00119 if(j==num_offsets_now-1)num_offsets_now--; 00120 // don't work any more on finished solutions 00121 // this only works if largest offsets are last, otherwise 00122 // just wastes time multiplying by zero 00123 } 00124 } 00125 00126 r2_old = r2; 00127 r2 = axpyNormCuda(-beta_i[j_low], *Ap, *r_sloppy); 00128 00129 alpha[0] = r2 / r2_old; 00130 00131 for (j=1; j<num_offsets_now; j++) { 00132 /*THISBLOWSUP 00133 alpha[j] = alpha[j_low] * zeta_ip1[j] * beta_i[j] / 00134 (zeta_i[j] * beta_i[j_low]); 00135 */ 00136 /*TRYTHIS*/ 00137 if( zeta_i[j] * beta_i[j_low] != 0.0) 00138 alpha[j] = alpha[j_low] * zeta_ip1[j] * beta_i[j] / 00139 (zeta_i[j] * beta_i[j_low]); 00140 else { 00141 alpha[j] = 0.0; 00142 finished[j] = 1; 00143 } 00144 } 00145 00146 axpyZpbxCuda(beta_i[0], *p[0], *x_sloppy[0], *r_sloppy, alpha[0]); 00147 for (j=1; j<num_offsets_now; j++) { 00148 axpyBzpcxCuda(beta_i[j], *p[j], *x_sloppy[j], zeta_ip1[j], *r_sloppy, alpha[j]); 00149 } 00150 00151 for (j=0; j<num_offsets_now; j++) { 00152 beta_im1[j] = beta_i[j]; 00153 zeta_im1[j] = zeta_i[j]; 00154 zeta_i[j] = zeta_ip1[j]; 00155 } 00156 00157 k++; 00158 if (invert_param->verbosity >= QUDA_VERBOSE){ 00159 printfQuda("Multimass CG: %d iterations, r2 = %e\n", k, r2); 00160 } 00161 } 00162 00163 if (x[0]->Precision() != x_sloppy[0]->Precision()) { 00164 for(i=0;i < num_offsets; i++){ 00165 copyCuda(*x[i], *x_sloppy[i]); 00166 } 00167 } 00168 00169 *residue_sq = r2; 00170 00171 invert_param->secs = stopwatchReadSeconds(); 00172 00173 if (k==invert_param->maxiter) { 00174 warningQuda("Exceeded maximum iterations %d\n", invert_param->maxiter); 00175 } 00176 00177 float gflops = (blas_quda_flops + mat.flops() + matSloppy.flops())*1e-9; 00178 invert_param->gflops = gflops; 00179 invert_param->iter = k; 00180 //#if 0 00181 // Calculate the true residual 00182 mat(*r, *x[0]); 00183 double true_res = xmyNormCuda(b, *r); 00184 if (invert_param->verbosity >= QUDA_SUMMARIZE){ 00185 printfQuda("Converged after %d iterations, r2 = %e, relative true_r2 = %e\n", 00186 k,r2, (true_res / b2)); 00187 } 00188 //#endif 00189 00190 00191 delete r; 00192 for(i=0;i < num_offsets; i++){ 00193 delete p[i]; 00194 } 00195 delete Ap; 00196 00197 if (invert_param->cuda_prec_sloppy != x[0]->Precision()) { 00198 for(i=0;i < num_offsets;i++){ 00199 delete x_sloppy[i]; 00200 } 00201 delete r_sloppy; 00202 } 00203 00204 00205 return k; 00206 } 00207
1.7.3