QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <stdlib.h> 00002 #include <stdio.h> 00003 #include <time.h> 00004 #include <math.h> 00005 #include <string.h> 00006 00007 #include <util_quda.h> 00008 #include <test_util.h> 00009 #include <blas_reference.h> 00010 #include <wilson_dslash_reference.h> 00011 #include "misc.h" 00012 00013 #include "face_quda.h" 00014 00015 #ifdef QMP_COMMS 00016 #include <qmp.h> 00017 #endif 00018 00019 #include <gauge_qio.h> 00020 00021 #define MAX(a,b) ((a)>(b)?(a):(b)) 00022 00023 // In a typical application, quda.h is the only QUDA header required. 00024 #include <quda.h> 00025 00026 // Wilson, clover-improved Wilson, and twisted mass are supported. 00027 extern QudaDslashType dslash_type; 00028 extern bool tune; 00029 extern int device; 00030 extern int xdim; 00031 extern int ydim; 00032 extern int zdim; 00033 extern int tdim; 00034 extern int gridsize_from_cmdline[]; 00035 extern QudaReconstructType link_recon; 00036 extern QudaPrecision prec; 00037 extern QudaReconstructType link_recon_sloppy; 00038 extern QudaPrecision prec_sloppy; 00039 00040 extern char latfile[]; 00041 00042 void 00043 display_test_info() 00044 { 00045 printfQuda("running the following test:\n"); 00046 00047 printfQuda("prec sloppy_prec link_recon sloppy_link_recon S_dimension T_dimension\n"); 00048 printfQuda("%s %s %s %s %d/%d/%d %d \n", 00049 get_prec_str(prec),get_prec_str(prec_sloppy), 00050 get_recon_str(link_recon), 00051 get_recon_str(link_recon_sloppy), xdim, ydim, zdim, tdim); 00052 00053 printfQuda("Grid partition info: X Y Z T\n"); 00054 printfQuda(" %d %d %d %d\n", 00055 commDimPartitioned(0), 00056 commDimPartitioned(1), 00057 commDimPartitioned(2), 00058 commDimPartitioned(3)); 00059 00060 return ; 00061 00062 } 00063 00064 extern void usage(char** ); 00065 int main(int argc, char **argv) 00066 { 00067 /* 00068 int ndim=4, dims[4] = {1, 1, 1, 1}; 00069 char dimchar[] = {'X', 'Y', 'Z', 'T'}; 00070 char *gridsizeopt[] = {"--xgridsize", "--ygridsize", "--zgridsize", "--tgridsize"}; 00071 00072 for (int i=1; i<argc; i++) { 00073 for (int d=0; d<ndim; d++) { 00074 if (!strcmp(argv[i], gridsizeopt[d])) { 00075 if (i+1 >= argc) { 00076 printf("Usage: %s <args>\n", argv[0]); 00077 printf("%s\t Set %c comms grid size (default = 1)\n", gridsizeopt[d], dimchar[d]); 00078 exit(1); 00079 } 00080 dims[d] = atoi(argv[i+1]); 00081 if (dims[d] <= 0 ) { 00082 printf("Error: Invalid %c grid size\n", dimchar[d]); 00083 exit(1); 00084 } 00085 i++; 00086 break; 00087 } 00088 } 00089 } 00090 */ 00091 00092 00093 00094 int i; 00095 for (i =1;i < argc; i++){ 00096 00097 if(process_command_line_option(argc, argv, &i) == 0){ 00098 continue; 00099 } 00100 00101 printf("ERROR: Invalid option:%s\n", argv[i]); 00102 usage(argv); 00103 00104 00105 } 00106 00107 00108 if (prec_sloppy == QUDA_INVALID_PRECISION){ 00109 prec_sloppy = prec; 00110 } 00111 if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID){ 00112 link_recon_sloppy = link_recon; 00113 } 00114 00115 00116 initCommsQuda(argc, argv, gridsize_from_cmdline, 4); 00117 00118 // *** QUDA parameters begin here. 00119 00120 int multi_shift = 0; // whether to test multi-shift or standard solver 00121 00122 if (dslash_type != QUDA_WILSON_DSLASH && 00123 dslash_type != QUDA_CLOVER_WILSON_DSLASH && 00124 dslash_type != QUDA_TWISTED_MASS_DSLASH) { 00125 printf("dslash_type %d not supported\n", dslash_type); 00126 exit(0); 00127 } 00128 00129 QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; 00130 QudaPrecision cuda_prec = prec; 00131 QudaPrecision cuda_prec_sloppy = prec_sloppy; 00132 QudaPrecision cuda_prec_precondition = QUDA_HALF_PRECISION; 00133 00134 // offsets used only by multi-shift solver 00135 int num_offsets = 4; 00136 double offsets[4] = {0.01, 0.02, 0.03, 0.04}; 00137 00138 QudaGaugeParam gauge_param = newQudaGaugeParam(); 00139 QudaInvertParam inv_param = newQudaInvertParam(); 00140 00141 gauge_param.X[0] = xdim; 00142 gauge_param.X[1] = ydim; 00143 gauge_param.X[2] = zdim; 00144 gauge_param.X[3] = tdim; 00145 00146 gauge_param.anisotropy = 1.0; 00147 gauge_param.type = QUDA_WILSON_LINKS; 00148 gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; 00149 gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; 00150 00151 gauge_param.cpu_prec = cpu_prec; 00152 gauge_param.cuda_prec = cuda_prec; 00153 gauge_param.reconstruct = link_recon; 00154 gauge_param.cuda_prec_sloppy = cuda_prec_sloppy; 00155 gauge_param.reconstruct_sloppy = link_recon_sloppy; 00156 gauge_param.cuda_prec_precondition = cuda_prec_precondition; 00157 gauge_param.reconstruct_precondition = link_recon_sloppy; 00158 gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; 00159 00160 inv_param.dslash_type = dslash_type; 00161 00162 double mass = -0.2180; 00163 inv_param.kappa = 1.0 / (2.0 * (1 + 3/gauge_param.anisotropy + mass)); 00164 00165 if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { 00166 inv_param.mu = 0.1; 00167 inv_param.twist_flavor = QUDA_TWIST_MINUS; 00168 } 00169 00170 inv_param.solution_type = QUDA_MATPC_SOLUTION; 00171 inv_param.solve_type = QUDA_NORMEQ_PC_SOLVE; 00172 inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; 00173 inv_param.dagger = QUDA_DAG_NO; 00174 inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; 00175 00176 inv_param.inv_type = QUDA_BICGSTAB_INVERTER; 00177 inv_param.gcrNkrylov = 30; 00178 inv_param.tol = 5e-7; 00179 inv_param.maxiter = 30; 00180 inv_param.reliable_delta = 1e-1; // ignored by multi-shift solver 00181 00182 // domain decomposition preconditioner parameters 00183 inv_param.inv_type_precondition = QUDA_INVALID_INVERTER; 00184 inv_param.tol_precondition = 1e-1; 00185 inv_param.maxiter_precondition = 10; 00186 inv_param.verbosity_precondition = QUDA_SILENT; 00187 inv_param.prec_precondition = cuda_prec_precondition; 00188 inv_param.omega = 1.0; 00189 00190 inv_param.cpu_prec = cpu_prec; 00191 inv_param.cuda_prec = cuda_prec; 00192 inv_param.cuda_prec_sloppy = cuda_prec_sloppy; 00193 inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO; 00194 inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; 00195 inv_param.dirac_order = QUDA_DIRAC_ORDER; 00196 00197 inv_param.tune = tune ? QUDA_TUNE_YES : QUDA_TUNE_NO; 00198 00199 gauge_param.ga_pad = 0; // 24*24*24/2; 00200 inv_param.sp_pad = 0; // 24*24*24/2; 00201 inv_param.cl_pad = 0; // 24*24*24/2; 00202 00203 // For multi-GPU, ga_pad must be large enough to store a time-slice 00204 #ifdef MULTI_GPU 00205 int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2; 00206 int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2; 00207 int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2; 00208 int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2; 00209 int pad_size =MAX(x_face_size, y_face_size); 00210 pad_size = MAX(pad_size, z_face_size); 00211 pad_size = MAX(pad_size, t_face_size); 00212 gauge_param.ga_pad = pad_size; 00213 #endif 00214 00215 if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { 00216 inv_param.clover_cpu_prec = cpu_prec; 00217 inv_param.clover_cuda_prec = cuda_prec; 00218 inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy; 00219 inv_param.clover_cuda_prec_precondition = cuda_prec_precondition; 00220 inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; 00221 } 00222 00223 inv_param.verbosity = QUDA_VERBOSE; 00224 00225 // *** Everything between here and the call to initQuda() is 00226 // *** application-specific. 00227 00228 // set parameters for the reference Dslash, and prepare fields to be loaded 00229 setDims(gauge_param.X); 00230 00231 size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); 00232 size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); 00233 00234 void *gauge[4], *clover_inv=0, *clover=0; 00235 00236 for (int dir = 0; dir < 4; dir++) { 00237 gauge[dir] = malloc(V*gaugeSiteSize*gSize); 00238 } 00239 00240 if (strcmp(latfile,"")) { // load in the command line supplied gauge field 00241 read_gauge_field(latfile, gauge, gauge_param.cpu_prec, gauge_param.X, argc, argv); 00242 construct_gauge_field(gauge, 2, gauge_param.cpu_prec, &gauge_param); 00243 } else { // else generate a random SU(3) field 00244 construct_gauge_field(gauge, 1, gauge_param.cpu_prec, &gauge_param); 00245 } 00246 00247 if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { 00248 double norm = 0.0; // clover components are random numbers in the range (-norm, norm) 00249 double diag = 1.0; // constant added to the diagonal 00250 00251 size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); 00252 clover_inv = malloc(V*cloverSiteSize*cSize); 00253 construct_clover_field(clover_inv, norm, diag, inv_param.clover_cpu_prec); 00254 00255 // The uninverted clover term is only needed when solving the unpreconditioned 00256 // system or when using "asymmetric" even/odd preconditioning. 00257 int preconditioned = (inv_param.solve_type == QUDA_DIRECT_PC_SOLVE || 00258 inv_param.solve_type == QUDA_NORMEQ_PC_SOLVE); 00259 int asymmetric = preconditioned && 00260 (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || 00261 inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC); 00262 if (!preconditioned) { 00263 clover = clover_inv; 00264 clover_inv = NULL; 00265 } else if (asymmetric) { // fake it by using the same random matrix 00266 clover = clover_inv; // for both clover and clover_inv 00267 } else { 00268 clover = NULL; 00269 } 00270 } 00271 00272 void *spinorIn = malloc(V*spinorSiteSize*sSize); 00273 void *spinorCheck = malloc(V*spinorSiteSize*sSize); 00274 00275 void *spinorOut = NULL, **spinorOutMulti = NULL; 00276 if (multi_shift) { 00277 spinorOutMulti = (void**)malloc(num_offsets*sizeof(void *)); 00278 for (int i=0; i<num_offsets; i++) { 00279 spinorOutMulti[i] = malloc(V*spinorSiteSize*sSize); 00280 } 00281 } else { 00282 spinorOut = malloc(V*spinorSiteSize*sSize); 00283 } 00284 00285 // create a point source at 0 00286 if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION) *((float*)spinorIn) = 1.0; 00287 else *((double*)spinorIn) = 1.0; 00288 00289 // start the timer 00290 double time0 = -((double)clock()); 00291 00292 // initialize the QUDA library 00293 initQuda(device); 00294 00295 // load the gauge field 00296 loadGaugeQuda((void*)gauge, &gauge_param); 00297 00298 // load the clover term, if desired 00299 if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) loadCloverQuda(clover, clover_inv, &inv_param); 00300 00301 // perform the inversion 00302 if (multi_shift) { 00303 double resid_sq; 00304 invertMultiShiftQuda(spinorOutMulti, spinorIn, &inv_param, 00305 offsets, num_offsets, &resid_sq); 00306 } else { 00307 invertQuda(spinorOut, spinorIn, &inv_param); 00308 } 00309 00310 // stop the timer 00311 time0 += clock(); 00312 time0 /= CLOCKS_PER_SEC; 00313 00314 printf("Device memory used:\n Spinor: %f GiB\n Gauge: %f GiB\n", 00315 inv_param.spinorGiB, gauge_param.gaugeGiB); 00316 if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) printf(" Clover: %f GiB\n", inv_param.cloverGiB); 00317 printf("\nDone: %i iter / %g secs = %g Gflops, total time = %g secs\n", 00318 inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs, time0); 00319 00320 if (multi_shift) { 00321 00322 void *spinorTmp = malloc(V*spinorSiteSize*sSize); 00323 00324 printf("Host residuum checks: \n"); 00325 for(int i=0; i < num_offsets; i++) { 00326 ax(0, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); 00327 00328 if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { 00329 tm_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 00330 inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); 00331 tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 00332 inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); 00333 } else { 00334 wil_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa, inv_param.matpc_type, 0, 00335 inv_param.cpu_prec, gauge_param); 00336 wil_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.matpc_type, 1, 00337 inv_param.cpu_prec, gauge_param); 00338 } 00339 00340 axpy(offsets[i], spinorOutMulti[i], spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); 00341 mxpy(spinorIn, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); 00342 double nrm2 = norm_2(spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); 00343 double src2 = norm_2(spinorIn, V*spinorSiteSize, inv_param.cpu_prec); 00344 printf("Shift i=%d Relative residual: requested = %g, actual = %g\n", i, inv_param.tol, sqrt(nrm2/src2)); 00345 } 00346 free(spinorTmp); 00347 00348 } else { 00349 00350 if (inv_param.solution_type == QUDA_MAT_SOLUTION) { 00351 if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { 00352 tm_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 00353 0, inv_param.cpu_prec, gauge_param); 00354 } else { 00355 wil_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, 0, 00356 inv_param.cpu_prec, gauge_param); 00357 } 00358 if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) { 00359 ax(0.5/inv_param.kappa, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); 00360 } 00361 } else if(inv_param.solution_type == QUDA_MATPC_SOLUTION) { 00362 if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { 00363 tm_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 00364 inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); 00365 } else { 00366 wil_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, 00367 inv_param.cpu_prec, gauge_param); 00368 } 00369 if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) { 00370 ax(0.25/(inv_param.kappa*inv_param.kappa), spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); 00371 } 00372 } 00373 00374 mxpy(spinorIn, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); 00375 double nrm2 = norm_2(spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); 00376 double src2 = norm_2(spinorIn, V*spinorSiteSize, inv_param.cpu_prec); 00377 printf("Relative residual: requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2)); 00378 00379 } 00380 00381 freeGaugeQuda(); 00382 if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) freeCloverQuda(); 00383 00384 // finalize the QUDA library 00385 endQuda(); 00386 00387 // end if the communications layer 00388 endCommsQuda(); 00389 00390 return 0; 00391 }