QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <stdlib.h> 00002 #include <stdio.h> 00003 #include <time.h> 00004 #include <math.h> 00005 00006 #include <test_util.h> 00007 #include <blas_reference.h> 00008 #include <staggered_dslash_reference.h> 00009 #include <quda.h> 00010 #include <string.h> 00011 #include <face_quda.h> 00012 #include "misc.h" 00013 #include <gauge_field.h> 00014 00015 #ifdef MULTI_GPU 00016 #include <face_quda.h> 00017 #endif 00018 00019 #define MAX(a,b) ((a)>(b)?(a):(b)) 00020 #define mySpinorSiteSize 6 00021 00022 extern void usage(char** argv); 00023 void *fatlink[4]; 00024 void *longlink[4]; 00025 00026 #ifdef MULTI_GPU 00027 void** ghost_fatlink, **ghost_longlink; 00028 #endif 00029 00030 extern int device; 00031 extern bool tune; 00032 00033 extern QudaReconstructType link_recon; 00034 extern QudaPrecision prec; 00035 QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; 00036 00037 extern QudaReconstructType link_recon_sloppy; 00038 extern QudaPrecision prec_sloppy; 00039 cpuColorSpinorField* in; 00040 cpuColorSpinorField* out; 00041 cpuColorSpinorField* ref; 00042 cpuColorSpinorField* tmp; 00043 00044 cpuGaugeField *cpuFat = NULL; 00045 cpuGaugeField *cpuLong = NULL; 00046 00047 static double tol = 1e-6; 00048 00049 static int testtype = 0; 00050 extern int xdim; 00051 extern int ydim; 00052 extern int zdim; 00053 extern int tdim; 00054 extern bool kernelPackT; 00055 extern int gridsize_from_cmdline[]; 00056 00057 00058 static void end(); 00059 00060 extern int Z[4]; 00061 extern int V; 00062 extern int Vh; 00063 static int Vs_x, Vs_y, Vs_z, Vs_t; 00064 extern int Vsh_x, Vsh_y, Vsh_z, Vsh_t; 00065 static int Vsh[4]; 00066 00067 00068 template<typename Float> 00069 void constructSpinorField(Float *res) { 00070 for(int i = 0; i < Vh; i++) { 00071 for (int s = 0; s < 1; s++) { 00072 for (int m = 0; m < 3; m++) { 00073 res[i*(1*3*2) + s*(3*2) + m*(2) + 0] = rand() / (Float)RAND_MAX; 00074 res[i*(1*3*2) + s*(3*2) + m*(2) + 1] = rand() / (Float)RAND_MAX; 00075 } 00076 } 00077 } 00078 } 00079 00080 void 00081 setDimConstants(int *X) 00082 { 00083 V = 1; 00084 for (int d=0; d< 4; d++) { 00085 V *= X[d]; 00086 Z[d] = X[d]; 00087 } 00088 Vh = V/2; 00089 00090 Vs_x = X[1]*X[2]*X[3]; 00091 Vs_y = X[0]*X[2]*X[3]; 00092 Vs_z = X[0]*X[1]*X[3]; 00093 Vs_t = X[0]*X[1]*X[2]; 00094 00095 00096 Vsh_x = Vs_x/2; 00097 Vsh_y = Vs_y/2; 00098 Vsh_z = Vs_z/2; 00099 Vsh_t = Vs_t/2; 00100 00101 Vsh[0] = Vsh_x; 00102 Vsh[1] = Vsh_y; 00103 Vsh[2] = Vsh_z; 00104 Vsh[3] = Vsh_t; 00105 } 00106 00107 static void 00108 set_params(QudaGaugeParam* gaugeParam, QudaInvertParam* inv_param, 00109 int X1, int X2, int X3, int X4, 00110 QudaPrecision cpu_prec, QudaPrecision prec, QudaPrecision prec_sloppy, 00111 QudaReconstructType link_recon, QudaReconstructType link_recon_sloppy, 00112 double mass, double tol, int maxiter, double reliable_delta, 00113 double tadpole_coeff 00114 ) 00115 { 00116 gaugeParam->X[0] = X1; 00117 gaugeParam->X[1] = X2; 00118 gaugeParam->X[2] = X3; 00119 gaugeParam->X[3] = X4; 00120 00121 gaugeParam->cpu_prec = cpu_prec; 00122 gaugeParam->cuda_prec = prec; 00123 gaugeParam->reconstruct = link_recon; 00124 gaugeParam->cuda_prec_sloppy = prec_sloppy; 00125 gaugeParam->reconstruct_sloppy = link_recon_sloppy; 00126 gaugeParam->gauge_fix = QUDA_GAUGE_FIXED_NO; 00127 gaugeParam->anisotropy = 1.0; 00128 gaugeParam->tadpole_coeff = tadpole_coeff; 00129 gaugeParam->t_boundary = QUDA_ANTI_PERIODIC_T; 00130 gaugeParam->gauge_order = QUDA_QDP_GAUGE_ORDER; 00131 gaugeParam->ga_pad = X1*X2*X3/2; 00132 00133 inv_param->verbosity = QUDA_VERBOSE; 00134 inv_param->mass = mass; 00135 00136 // outer solver parameters 00137 inv_param->inv_type = QUDA_CG_INVERTER; 00138 inv_param->tol = tol; 00139 inv_param->maxiter = 10000; 00140 inv_param->reliable_delta = 1e-1; // ignored by multi-shift solver 00141 00142 //inv_param->inv_type = QUDA_GCR_INVERTER; 00143 //inv_param->gcrNkrylov = 10; 00144 00145 // domain decomposition preconditioner parameters 00146 //inv_param->inv_type_precondition = QUDA_MR_INVERTER; 00147 //inv_param->tol_precondition = 1e-1; 00148 //inv_param->maxiter_precondition = 100; 00149 //inv_param->verbosity_precondition = QUDA_SILENT; 00150 //inv_param->prec_precondition = prec_sloppy; 00151 00152 inv_param->solution_type = QUDA_MATPCDAG_MATPC_SOLUTION; 00153 inv_param->solve_type = QUDA_NORMEQ_PC_SOLVE; 00154 inv_param->matpc_type = QUDA_MATPC_EVEN_EVEN; 00155 inv_param->dagger = QUDA_DAG_NO; 00156 inv_param->mass_normalization = QUDA_MASS_NORMALIZATION; 00157 00158 inv_param->cpu_prec = cpu_prec; 00159 inv_param->cuda_prec = prec; 00160 inv_param->cuda_prec_sloppy = prec_sloppy; 00161 inv_param->preserve_source = QUDA_PRESERVE_SOURCE_YES; 00162 inv_param->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // this is meaningless, but must be thus set 00163 inv_param->dirac_order = QUDA_DIRAC_ORDER; 00164 inv_param->dslash_type = QUDA_ASQTAD_DSLASH; 00165 inv_param->tune = tune ? QUDA_TUNE_YES : QUDA_TUNE_NO; 00166 inv_param->sp_pad = X1*X2*X3/2; 00167 inv_param->use_init_guess = QUDA_USE_INIT_GUESS_YES; 00168 00169 } 00170 00171 int 00172 invert_test(void) 00173 { 00174 QudaGaugeParam gaugeParam = newQudaGaugeParam(); 00175 QudaInvertParam inv_param = newQudaInvertParam(); 00176 00177 double mass = 0.1; 00178 00179 set_params(&gaugeParam, &inv_param, 00180 xdim, ydim, zdim, tdim, 00181 cpu_prec, prec, prec_sloppy, 00182 link_recon, link_recon_sloppy, mass, tol, 500, 1e-3, 00183 0.8); 00184 00185 // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME) 00186 initQuda(device); 00187 00188 setDims(gaugeParam.X); 00189 setDimConstants(gaugeParam.X); 00190 00191 size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); 00192 for (int dir = 0; dir < 4; dir++) { 00193 fatlink[dir] = malloc(V*gaugeSiteSize*gSize); 00194 longlink[dir] = malloc(V*gaugeSiteSize*gSize); 00195 } 00196 00197 construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam); 00198 00199 for (int dir = 0; dir < 4; dir++) { 00200 for(int i = 0;i < V*gaugeSiteSize;i++){ 00201 if (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){ 00202 ((double*)fatlink[dir])[i] = 0.5 *rand()/RAND_MAX; 00203 }else{ 00204 ((float*)fatlink[dir])[i] = 0.5* rand()/RAND_MAX; 00205 } 00206 } 00207 } 00208 00209 ColorSpinorParam csParam; 00210 csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; 00211 csParam.nColor=3; 00212 csParam.nSpin=1; 00213 csParam.nDim=4; 00214 for(int d = 0; d < 4; d++) { 00215 csParam.x[d] = gaugeParam.X[d]; 00216 } 00217 csParam.x[0] /= 2; 00218 00219 csParam.precision = inv_param.cpu_prec; 00220 csParam.pad = 0; 00221 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00222 csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; 00223 csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; 00224 csParam.gammaBasis = inv_param.gamma_basis; 00225 csParam.create = QUDA_ZERO_FIELD_CREATE; 00226 in = new cpuColorSpinorField(csParam); 00227 out = new cpuColorSpinorField(csParam); 00228 ref = new cpuColorSpinorField(csParam); 00229 tmp = new cpuColorSpinorField(csParam); 00230 00231 if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){ 00232 constructSpinorField((float*)in->V()); 00233 }else{ 00234 constructSpinorField((double*)in->V()); 00235 } 00236 00237 int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2); 00238 tmp_value = MAX(tmp_value, xdim*ydim*tdim/2); 00239 tmp_value = MAX(tmp_value, xdim*ydim*zdim/2); 00240 00241 int fat_pad = tmp_value; 00242 int link_pad = 3*tmp_value; 00243 00244 #ifdef MULTI_GPU 00245 gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; 00246 gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; 00247 GaugeFieldParam cpuFatParam(fatlink, gaugeParam); 00248 cpuFat = new cpuGaugeField(cpuFatParam); 00249 cpuFat->exchangeGhost(); 00250 ghost_fatlink = (void**)cpuFat->Ghost(); 00251 00252 gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; 00253 GaugeFieldParam cpuLongParam(longlink, gaugeParam); 00254 cpuLong = new cpuGaugeField(cpuLongParam); 00255 cpuLong->exchangeGhost(); 00256 ghost_longlink = (void**)cpuLong->Ghost(); 00257 #endif 00258 00259 if(testtype == 6){ 00260 record_gauge(gaugeParam.X, fatlink, fat_pad, 00261 longlink, link_pad, 00262 link_recon, link_recon_sloppy, 00263 &gaugeParam); 00264 }else{ 00265 00266 #ifdef MULTI_GPU 00267 00268 00269 gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; 00270 gaugeParam.ga_pad = fat_pad; 00271 gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; 00272 loadGaugeQuda(fatlink, &gaugeParam); 00273 00274 gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; 00275 gaugeParam.ga_pad = link_pad; 00276 gaugeParam.reconstruct= link_recon; 00277 gaugeParam.reconstruct_sloppy = link_recon_sloppy; 00278 loadGaugeQuda(longlink, &gaugeParam); 00279 #else 00280 gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; 00281 gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; 00282 loadGaugeQuda(fatlink, &gaugeParam); 00283 00284 gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; 00285 gaugeParam.reconstruct = link_recon; 00286 gaugeParam.reconstruct_sloppy = link_recon_sloppy; 00287 loadGaugeQuda(longlink, &gaugeParam); 00288 #endif 00289 } 00290 00291 00292 double time0 = -((double)clock()); // Start the timer 00293 00294 double nrm2=0; 00295 double src2=0; 00296 int ret = 0; 00297 00298 switch(testtype){ 00299 case 0: //even 00300 inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; 00301 00302 invertQuda(out->V(), in->V(), &inv_param); 00303 00304 time0 += clock(); 00305 time0 /= CLOCKS_PER_SEC; 00306 00307 #ifdef MULTI_GPU 00308 matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, 00309 out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY); 00310 #else 00311 matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY); 00312 #endif 00313 00314 mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); 00315 nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); 00316 src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); 00317 00318 break; 00319 00320 case 1: //odd 00321 00322 inv_param.matpc_type = QUDA_MATPC_ODD_ODD; 00323 invertQuda(out->V(), in->V(), &inv_param); 00324 time0 += clock(); // stop the timer 00325 time0 /= CLOCKS_PER_SEC; 00326 00327 #ifdef MULTI_GPU 00328 matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, 00329 out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY); 00330 #else 00331 matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY); 00332 #endif 00333 mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); 00334 nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); 00335 src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); 00336 00337 break; 00338 00339 case 2: //full spinor 00340 00341 errorQuda("full spinor not supported\n"); 00342 break; 00343 00344 case 3: //multi mass CG, even 00345 case 4: 00346 case 5: 00347 case 6: 00348 00349 #define NUM_OFFSETS 7 00350 00351 double masses[NUM_OFFSETS] ={5.05, 1.23, 2.64, 2.33, 2.70, 2.77, 2.81}; 00352 double offsets[NUM_OFFSETS]; 00353 int num_offsets =NUM_OFFSETS; 00354 void* outArray[NUM_OFFSETS]; 00355 int len; 00356 00357 cpuColorSpinorField* spinorOutArray[NUM_OFFSETS]; 00358 spinorOutArray[0] = out; 00359 for(int i=1;i < num_offsets; i++){ 00360 spinorOutArray[i] = new cpuColorSpinorField(csParam); 00361 } 00362 00363 for(int i=0;i < num_offsets; i++){ 00364 outArray[i] = spinorOutArray[i]->V(); 00365 } 00366 00367 for (int i=0; i< num_offsets;i++){ 00368 offsets[i] = 4*masses[i]*masses[i]; 00369 } 00370 00371 len=Vh; 00372 00373 if (testtype == 3 || testtype == 6){ 00374 inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; 00375 } else if (testtype == 4){ 00376 inv_param.matpc_type = QUDA_MATPC_ODD_ODD; 00377 }else { //testtype ==5 00378 errorQuda("test 5 not supported\n"); 00379 } 00380 00381 double residue_sq; 00382 if (testtype == 6){ 00383 invertMultiShiftQudaMixed(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq); 00384 }else{ 00385 invertMultiShiftQuda(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq); 00386 } 00387 cudaThreadSynchronize(); 00388 printfQuda("Final residue squred =%g\n", residue_sq); 00389 time0 += clock(); // stop the timer 00390 time0 /= CLOCKS_PER_SEC; 00391 00392 printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", 00393 time0, inv_param.iter, inv_param.secs, 00394 inv_param.gflops/inv_param.secs); 00395 00396 00397 printfQuda("checking the solution\n"); 00398 QudaParity parity = QUDA_INVALID_PARITY; 00399 if (inv_param.solve_type == QUDA_NORMEQ_SOLVE){ 00400 //parity = QUDA_EVENODD_PARITY; 00401 errorQuda("full parity not supported\n"); 00402 }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){ 00403 parity = QUDA_EVEN_PARITY; 00404 }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){ 00405 parity = QUDA_ODD_PARITY; 00406 }else{ 00407 errorQuda("ERROR: invalid spinor parity \n"); 00408 exit(1); 00409 } 00410 for(int i=0;i < num_offsets;i++){ 00411 printfQuda("%dth solution: mass=%f, ", i, masses[i]); 00412 #ifdef MULTI_GPU 00413 matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, 00414 spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, 00415 gaugeParam.cpu_prec, tmp, parity); 00416 #else 00417 matdagmat(ref->V(), fatlink, longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity); 00418 #endif 00419 mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); 00420 double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); 00421 double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec); 00422 00423 printfQuda("relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2)); 00424 00425 //emperical, if the cpu residue is more than 1 order the target accuracy, the it fails to converge 00426 if (sqrt(nrm2/src2) > 10*inv_param.tol){ 00427 ret |=1; 00428 } 00429 } 00430 00431 if (ret ==1){ 00432 errorQuda("Converge failed!\n"); 00433 } 00434 00435 for(int i=1; i < num_offsets;i++){ 00436 delete spinorOutArray[i]; 00437 } 00438 00439 00440 }//switch 00441 00442 00443 if (testtype <=2){ 00444 00445 printfQuda("Relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2)); 00446 00447 printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", 00448 time0, inv_param.iter, inv_param.secs, 00449 inv_param.gflops/inv_param.secs); 00450 00451 //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge 00452 if (sqrt(nrm2/src2) > 100*inv_param.tol){ 00453 ret = 1; 00454 errorQuda("Convergence failed!\n"); 00455 } 00456 } 00457 00458 end(); 00459 return ret; 00460 } 00461 00462 00463 00464 static void 00465 end(void) 00466 { 00467 for(int i=0;i < 4;i++){ 00468 free(fatlink[i]); 00469 free(longlink[i]); 00470 } 00471 00472 delete in; 00473 delete out; 00474 delete ref; 00475 delete tmp; 00476 00477 if (cpuFat) delete cpuFat; 00478 if (cpuLong) delete cpuLong; 00479 00480 endQuda(); 00481 } 00482 00483 00484 void 00485 display_test_info() 00486 { 00487 printfQuda("running the following test:\n"); 00488 00489 printfQuda("prec sloppy_prec link_recon sloppy_link_recon test_type S_dimension T_dimension\n"); 00490 printfQuda("%s %s %s %s %s %d/%d/%d %d \n", 00491 get_prec_str(prec),get_prec_str(prec_sloppy), 00492 get_recon_str(link_recon), 00493 get_recon_str(link_recon_sloppy), get_test_type(testtype), xdim, ydim, zdim, tdim); 00494 00495 printfQuda("Grid partition info: X Y Z T\n"); 00496 printfQuda(" %d %d %d %d\n", 00497 commDimPartitioned(0), 00498 commDimPartitioned(1), 00499 commDimPartitioned(2), 00500 commDimPartitioned(3)); 00501 00502 return ; 00503 00504 } 00505 00506 void 00507 usage_extra(char** argv ) 00508 { 00509 printfQuda("Extra options:\n"); 00510 printfQuda(" --tol <resid_tol> # Set residual tolerance\n"); 00511 printfQuda(" --test <0/1> # Test method\n"); 00512 printfQuda(" 0: Even even spinor CG inverter\n"); 00513 printfQuda(" 1: Odd odd spinor CG inverter\n"); 00514 printfQuda(" 3: Even even spinor multishift CG inverter\n"); 00515 printfQuda(" 4: Odd odd spinor multishift CG inverter\n"); 00516 printfQuda(" 6: Even even spinor mixed precision multishift CG inverter\n"); 00517 printfQuda(" --cpu_prec <double/single/half> # Set CPU precision\n"); 00518 00519 return ; 00520 } 00521 int main(int argc, char** argv) 00522 { 00523 00524 int i; 00525 for (i =1;i < argc; i++){ 00526 00527 if(process_command_line_option(argc, argv, &i) == 0){ 00528 continue; 00529 } 00530 00531 00532 if( strcmp(argv[i], "--tol") == 0){ 00533 float tmpf; 00534 if (i+1 >= argc){ 00535 usage(argv); 00536 } 00537 sscanf(argv[i+1], "%f", &tmpf); 00538 if (tmpf <= 0){ 00539 printf("ERROR: invalid tol(%f)\n", tmpf); 00540 usage(argv); 00541 } 00542 tol = tmpf; 00543 i++; 00544 continue; 00545 } 00546 00547 00548 if( strcmp(argv[i], "--test") == 0){ 00549 if (i+1 >= argc){ 00550 usage(argv); 00551 } 00552 testtype = atoi(argv[i+1]); 00553 i++; 00554 continue; 00555 } 00556 00557 if( strcmp(argv[i], "--cpu_prec") == 0){ 00558 if (i+1 >= argc){ 00559 usage(argv); 00560 } 00561 cpu_prec= get_prec(argv[i+1]); 00562 i++; 00563 continue; 00564 } 00565 00566 00567 printf("ERROR: Invalid option:%s\n", argv[i]); 00568 usage(argv); 00569 } 00570 00571 00572 if (prec_sloppy == QUDA_INVALID_PRECISION){ 00573 prec_sloppy = prec; 00574 } 00575 if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID){ 00576 link_recon_sloppy = link_recon; 00577 } 00578 00579 00580 initCommsQuda(argc, argv, gridsize_from_cmdline, 4); 00581 display_test_info(); 00582 00583 int ret = invert_test(); 00584 00585 endCommsQuda(); 00586 00587 return ret; 00588 }