QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 #include <math.h> 00004 00005 #include <complex> 00006 00007 #include <quda_internal.h> 00008 #include <blas_quda.h> 00009 #include <dslash_quda.h> 00010 #include <invert_quda.h> 00011 #include <util_quda.h> 00012 00013 #include<face_quda.h> 00014 00015 #include <color_spinor_field.h> 00016 00017 #include <sys/time.h> 00018 00019 struct timeval orth0, orth1, pre0, pre1, mat0, mat1, rst0, rst1; 00020 00021 double timeInterval(struct timeval start, struct timeval end) { 00022 long ds = end.tv_sec - start.tv_sec; 00023 long dus = end.tv_usec - start.tv_usec; 00024 return ds + 0.000001*dus; 00025 } 00026 00027 // set the required parameters for the inner solver 00028 void fillInnerInvertParam(QudaInvertParam &inner, const QudaInvertParam &outer) { 00029 inner.tol = outer.tol_precondition; 00030 inner.maxiter = outer.maxiter_precondition; 00031 inner.reliable_delta = 1e-20; // no reliable updates within the inner solver 00032 00033 inner.cuda_prec = outer.prec_precondition; // preconditioners are uni-precision solvers 00034 inner.cuda_prec_sloppy = outer.prec_precondition; 00035 00036 inner.verbosity = outer.verbosity_precondition; 00037 00038 inner.iter = 0; 00039 inner.gflops = 0; 00040 inner.secs = 0; 00041 00042 inner.inv_type_precondition = QUDA_GCR_INVERTER; // used to tell the inner solver it is an inner solver 00043 00044 if (outer.inv_type == QUDA_GCR_INVERTER && outer.cuda_prec_sloppy != outer.prec_precondition) 00045 inner.preserve_source = QUDA_PRESERVE_SOURCE_NO; 00046 else inner.preserve_source = QUDA_PRESERVE_SOURCE_YES; 00047 00048 } 00049 00050 void orthoDir(quda::Complex **beta, cudaColorSpinorField *Ap[], int k) { 00051 gettimeofday(&orth0, NULL); 00052 00053 int type = 1; 00054 00055 switch (type) { 00056 case 0: // no kernel fusion 00057 for (int i=0; i<k; i++) { // 5 (k-1) memory transactions here 00058 beta[i][k] = cDotProductCuda(*Ap[i], *Ap[k]); 00059 caxpyCuda(-beta[i][k], *Ap[i], *Ap[k]); 00060 } 00061 break; 00062 case 1: // basic kernel fusion 00063 if (k==0) break; 00064 beta[0][k] = cDotProductCuda(*Ap[0], *Ap[k]); 00065 for (int i=0; i<k-1; i++) { // 4 (k-1) memory transactions here 00066 beta[i+1][k] = caxpyDotzyCuda(-beta[i][k], *Ap[i], *Ap[k], *Ap[i+1]); 00067 } 00068 caxpyCuda(-beta[k-1][k], *Ap[k-1], *Ap[k]); 00069 break; 00070 case 2: // 00071 for (int i=0; i<k-2; i+=3) { // 5 (k-1) memory transactions here 00072 for (int j=i; j<i+3; j++) beta[j][k] = cDotProductCuda(*Ap[j], *Ap[k]); 00073 caxpbypczpwCuda(-beta[i][k], *Ap[i], -beta[i+1][k], *Ap[i+1], -beta[i+2][k], *Ap[i+2], *Ap[k]); 00074 } 00075 00076 if (k%3 != 0) { // need to update the remainder 00077 if ((k - 3*(k/3)) % 2 == 0) { 00078 beta[k-2][k] = cDotProductCuda(*Ap[k-2], *Ap[k]); 00079 beta[k-1][k] = cDotProductCuda(*Ap[k-1], *Ap[k]); 00080 caxpbypzCuda(beta[k-2][k], *Ap[k-2], beta[k-1][k], *Ap[k-1], *Ap[k]); 00081 } else { 00082 beta[k-1][k] = cDotProductCuda(*Ap[k-1], *Ap[k]); 00083 caxpyCuda(beta[k-1][k], *Ap[k-1], *Ap[k]); 00084 } 00085 } 00086 00087 break; 00088 case 3: 00089 for (int i=0; i<k-1; i+=2) { 00090 for (int j=i; j<i+2; j++) beta[j][k] = cDotProductCuda(*Ap[j], *Ap[k]); 00091 caxpbypzCuda(-beta[i][k], *Ap[i], -beta[i+1][k], *Ap[i+1], *Ap[k]); 00092 } 00093 00094 if (k%2 != 0) { // need to update the remainder 00095 beta[k-1][k] = cDotProductCuda(*Ap[k-1], *Ap[k]); 00096 caxpyCuda(beta[k-1][k], *Ap[k-1], *Ap[k]); 00097 } 00098 break; 00099 default: 00100 errorQuda("Orthogonalization type not defined"); 00101 } 00102 00103 gettimeofday(&orth1, NULL); 00104 } 00105 00106 void backSubs(const quda::Complex *alpha, quda::Complex** const beta, const double *gamma, quda::Complex *delta, int n) { 00107 for (int k=n-1; k>=0;k--) { 00108 delta[k] = alpha[k]; 00109 for (int j=k+1;j<n; j++) { 00110 delta[k] -= beta[k][j]*delta[j]; 00111 } 00112 delta[k] /= gamma[k]; 00113 } 00114 } 00115 00116 void updateSolution(cudaColorSpinorField &x, const quda::Complex *alpha, quda::Complex** const beta, 00117 double *gamma, int k, cudaColorSpinorField *p[]) { 00118 00119 quda::Complex *delta = new quda::Complex[k]; 00120 00121 // Update the solution vector 00122 backSubs(alpha, beta, gamma, delta, k); 00123 00124 //for (int i=0; i<k; i++) caxpyCuda(delta[i], *p[i], x); 00125 00126 for (int i=0; i<k-2; i+=3) 00127 caxpbypczpwCuda(delta[i], *p[i], delta[i+1], *p[i+1], delta[i+2], *p[i+2], x); 00128 00129 if (k%3 != 0) { // need to update the remainder 00130 if ((k - 3*(k/3)) % 2 == 0) caxpbypzCuda(delta[k-2], *p[k-2], delta[k-1], *p[k-1], x); 00131 else caxpyCuda(delta[k-1], *p[k-1], x); 00132 } 00133 00134 delete []delta; 00135 } 00136 00137 GCR::GCR(DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matPrecon, QudaInvertParam &invParam) : 00138 Solver(invParam), mat(mat), matSloppy(matSloppy), matPrecon(matPrecon), K(0) 00139 { 00140 00141 Kparam = newQudaInvertParam(); 00142 fillInnerInvertParam(Kparam, invParam); 00143 00144 if (invParam.inv_type_precondition == QUDA_CG_INVERTER) // inner CG preconditioner 00145 K = new CG(matPrecon, matPrecon, Kparam); 00146 else if (invParam.inv_type_precondition == QUDA_BICGSTAB_INVERTER) // inner BiCGstab preconditioner 00147 K = new BiCGstab(matPrecon, matPrecon, matPrecon, Kparam); 00148 else if (invParam.inv_type_precondition == QUDA_MR_INVERTER) // inner MR preconditioner 00149 K = new MR(matPrecon, Kparam); 00150 else if (invParam.inv_type_precondition != QUDA_INVALID_INVERTER) // unknown preconditioner 00151 errorQuda("Unknown inner solver %d", invParam.inv_type_precondition); 00152 00153 } 00154 00155 GCR::~GCR() { 00156 if (K) delete K; 00157 } 00158 00159 void GCR::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) 00160 { 00161 int Nkrylov = invParam.gcrNkrylov; // size of Krylov space 00162 00163 ColorSpinorParam param(x); 00164 param.create = QUDA_ZERO_FIELD_CREATE; 00165 cudaColorSpinorField r(x, param); 00166 00167 cudaColorSpinorField y(x, param); // high precision accumulator 00168 00169 // create sloppy fields used for orthogonalization 00170 param.precision = invParam.cuda_prec_sloppy; 00171 cudaColorSpinorField **p = new cudaColorSpinorField*[Nkrylov]; 00172 cudaColorSpinorField **Ap = new cudaColorSpinorField*[Nkrylov]; 00173 for (int i=0; i<Nkrylov; i++) { 00174 p[i] = new cudaColorSpinorField(x, param); 00175 Ap[i] = new cudaColorSpinorField(x, param); 00176 } 00177 00178 cudaColorSpinorField tmp(x, param); //temporary for sloppy mat-vec 00179 00180 cudaColorSpinorField *x_sloppy, *r_sloppy; 00181 if (invParam.cuda_prec_sloppy != invParam.cuda_prec) { 00182 param.precision = invParam.cuda_prec_sloppy; 00183 x_sloppy = new cudaColorSpinorField(x, param); 00184 r_sloppy = new cudaColorSpinorField(x, param); 00185 } else { 00186 x_sloppy = &x; 00187 r_sloppy = &r; 00188 } 00189 00190 cudaColorSpinorField &xSloppy = *x_sloppy; 00191 cudaColorSpinorField &rSloppy = *r_sloppy; 00192 00193 // these low precision fields are used by the inner solver 00194 bool precMatch = true; 00195 cudaColorSpinorField *r_pre, *p_pre; 00196 if (invParam.prec_precondition != invParam.cuda_prec_sloppy) { 00197 param.precision = invParam.prec_precondition; 00198 p_pre = new cudaColorSpinorField(x, param); 00199 r_pre = new cudaColorSpinorField(x, param); 00200 precMatch = false; 00201 } else { 00202 p_pre = NULL; 00203 r_pre = r_sloppy; 00204 } 00205 cudaColorSpinorField &rPre = *r_pre; 00206 00207 quda::Complex *alpha = new quda::Complex[Nkrylov]; 00208 quda::Complex **beta = new quda::Complex*[Nkrylov]; 00209 for (int i=0; i<Nkrylov; i++) beta[i] = new quda::Complex[Nkrylov]; 00210 double *gamma = new double[Nkrylov]; 00211 00212 double b2 = normCuda(b); 00213 00214 double stop = b2*invParam.tol*invParam.tol; // stopping condition of solver 00215 00216 int k = 0; 00217 00218 // calculate initial residual 00219 mat(r, x); 00220 double r2 = xmyNormCuda(b, r); 00221 copyCuda(rSloppy, r); 00222 00223 quda::blas_flops = 0; 00224 00225 stopwatchStart(); 00226 00227 int total_iter = 0; 00228 int restart = 0; 00229 double r2_old = r2; 00230 00231 if (invParam.verbosity >= QUDA_VERBOSE) 00232 printfQuda("GCR: %d total iterations, %d Krylov iterations, r2 = %e\n", total_iter+k, k, r2); 00233 00234 double orthT = 0, matT = 0, preT = 0, resT = 0; 00235 00236 while (r2 > stop && total_iter < invParam.maxiter) { 00237 00238 gettimeofday(&pre0, NULL); 00239 00240 if (invParam.inv_type_precondition != QUDA_INVALID_INVERTER) { 00241 cudaColorSpinorField &pPre = (precMatch ? *p[k] : *p_pre); 00242 00243 copyCuda(rPre, rSloppy); 00244 00245 (*K)(pPre, rPre); 00246 00247 // relaxation p = omega*p + (1-omega)*r 00248 if (invParam.omega!=1.0) axpbyCuda((1.0-invParam.omega), rPre, invParam.omega, pPre); 00249 00250 copyCuda(*p[k], pPre); 00251 } else { // no preconditioner 00252 *p[k] = rSloppy; 00253 } 00254 00255 00256 gettimeofday(&pre1, NULL); 00257 00258 gettimeofday(&mat0, NULL); 00259 matSloppy(*Ap[k], *p[k], tmp); 00260 gettimeofday(&mat1, NULL); 00261 00262 orthoDir(beta, Ap, k); 00263 00264 double3 Apr = cDotProductNormACuda(*Ap[k], rSloppy); 00265 00266 gamma[k] = sqrt(Apr.z); // gamma[k] = Ap[k] 00267 if (gamma[k] == 0.0) errorQuda("GCR breakdown\n"); 00268 alpha[k] = quda::Complex(Apr.x, Apr.y) / gamma[k]; // alpha = (1/|Ap|) * (Ap, r) 00269 00270 // r -= (1/|Ap|^2) * (Ap, r) r, Ap *= 1/|Ap| 00271 r2 = cabxpyAxNormCuda(1.0/gamma[k], -alpha[k], *Ap[k], rSloppy); 00272 00273 if (invParam.verbosity >= QUDA_DEBUG_VERBOSE) { 00274 double x2 = norm2(x); 00275 double p2 = norm2(*p[k]); 00276 double Ap2 = norm2(*Ap[k]); 00277 printfQuda("GCR: alpha = (%e,%e), norm2(x) = %e, norm2(p) = %e, norm2(Ap) = %e\n", 00278 real(alpha[k]), imag(alpha[k]), x2, p2, Ap2); 00279 } 00280 00281 k++; 00282 total_iter++; 00283 00284 if (invParam.verbosity >= QUDA_VERBOSE) 00285 printfQuda("GCR: %d total iterations, %d Krylov iterations, r2 = %e\n", total_iter, k, r2); 00286 00287 gettimeofday(&rst0, NULL); 00288 // update solution and residual since max Nkrylov reached, converged or reliable update required 00289 if (k==Nkrylov || r2 < stop || r2/r2_old < invParam.reliable_delta) { 00290 00291 // update the solution vector 00292 updateSolution(xSloppy, alpha, beta, gamma, k, p); 00293 00294 // recalculate residual in high precision 00295 copyCuda(x, xSloppy); 00296 xpyCuda(x, y); 00297 00298 double r2Sloppy = r2; 00299 00300 k = 0; 00301 mat(r, y); 00302 r2 = xmyNormCuda(b, r); 00303 00304 if (r2 > stop) { 00305 restart++; // restarting if residual is still too great 00306 00307 if (invParam.verbosity >= QUDA_VERBOSE) 00308 printfQuda("\nGCR: restart %d, iterated r2 = %e, true r2 = %e\n", restart, r2Sloppy, r2); 00309 } 00310 00311 copyCuda(rSloppy, r); 00312 zeroCuda(xSloppy); 00313 00314 r2_old = r2; 00315 } 00316 gettimeofday(&rst1, NULL); 00317 00318 orthT += timeInterval(orth0, orth1); 00319 matT += timeInterval(mat0, mat1); 00320 preT += timeInterval(pre0, pre1); 00321 resT += timeInterval(rst0, rst1); 00322 00323 } 00324 00325 copyCuda(x, y); 00326 00327 if (k>=invParam.maxiter && invParam.verbosity >= QUDA_SUMMARIZE) 00328 warningQuda("Exceeded maximum iterations %d", invParam.maxiter); 00329 00330 if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("GCR: number of restarts = %d\n", restart); 00331 00332 invParam.secs += stopwatchReadSeconds(); 00333 00334 double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops() + matPrecon.flops())*1e-9; 00335 reduceDouble(gflops); 00336 00337 printfQuda("%f gflops %e Preconditoner = %e, Mat-Vec = %e, orthogonolization %e restart %e\n", 00338 gflops / stopwatchReadSeconds(), invParam.secs, preT, matT, orthT, resT); 00339 invParam.gflops += gflops; 00340 invParam.iter += total_iter; 00341 00342 if (invParam.verbosity >= QUDA_SUMMARIZE) { 00343 // Calculate the true residual 00344 mat(r, x); 00345 double true_res = xmyNormCuda(b, r); 00346 00347 printfQuda("GCR: Converged after %d iterations, relative residua: iterated = %e, true = %e\n", 00348 total_iter, sqrt(r2/b2), sqrt(true_res / b2)); 00349 } 00350 00351 if (invParam.cuda_prec_sloppy != invParam.cuda_prec) { 00352 delete x_sloppy; 00353 delete r_sloppy; 00354 } 00355 00356 if (invParam.prec_precondition != invParam.cuda_prec_sloppy) { 00357 delete p_pre; 00358 delete r_pre; 00359 } 00360 00361 for (int i=0; i<Nkrylov; i++) { 00362 delete p[i]; 00363 delete Ap[i]; 00364 } 00365 delete[] p; 00366 delete[] Ap; 00367 00368 delete alpha; 00369 for (int i=0; i<Nkrylov; i++) delete []beta[i]; 00370 delete []beta; 00371 delete gamma; 00372 00373 return; 00374 }