QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <iostream> 00002 #include <stdio.h> 00003 #include <stdlib.h> 00004 #include <string.h> 00005 00006 #include <quda.h> 00007 #include <quda_internal.h> 00008 #include <dirac_quda.h> 00009 #include <dslash_quda.h> 00010 #include <invert_quda.h> 00011 #include <util_quda.h> 00012 #include <blas_quda.h> 00013 00014 #include <misc.h> 00015 #include <test_util.h> 00016 #include <staggered_dslash_reference.h> 00017 #include <gauge_field.h> 00018 00019 #include <face_quda.h> 00020 00021 #include <assert.h> 00022 00023 #define MAX(a,b) ((a)>(b)?(a):(b)) 00024 #define staggeredSpinorSiteSize 6 00025 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat) 00026 00027 extern void usage(char** argv ); 00028 00029 int test_type = 0; 00030 00031 extern bool tune; 00032 00033 QudaGaugeParam gaugeParam; 00034 QudaInvertParam inv_param; 00035 00036 cpuGaugeField *cpuFat = NULL; 00037 cpuGaugeField *cpuLong = NULL; 00038 00039 cpuColorSpinorField *spinor, *spinorOut, *spinorRef; 00040 cudaColorSpinorField *cudaSpinor, *cudaSpinorOut; 00041 00042 cudaColorSpinorField* tmp; 00043 00044 void *hostGauge[4]; 00045 void *fatlink[4], *longlink[4]; 00046 00047 #ifdef MULTI_GPU 00048 const void **ghost_fatlink, **ghost_longlink; 00049 #endif 00050 00051 const int loops = 100; 00052 00053 QudaParity parity; 00054 extern QudaDagType dagger; 00055 int transfer = 0; // include transfer time in the benchmark? 00056 extern int xdim; 00057 extern int ydim; 00058 extern int zdim; 00059 extern int tdim; 00060 extern int gridsize_from_cmdline[]; 00061 extern QudaReconstructType link_recon; 00062 extern QudaPrecision prec; 00063 00064 extern int device; 00065 00066 int X[4]; 00067 00068 00069 Dirac* dirac; 00070 extern int Z[4]; 00071 extern int V; 00072 extern int Vh; 00073 static int Vs_x, Vs_y, Vs_z, Vs_t; 00074 extern int Vsh_x, Vsh_y, Vsh_z, Vsh_t; 00075 static int Vsh[4]; 00076 00077 void 00078 setDimConstants(int *X) 00079 { 00080 V = 1; 00081 for (int d=0; d< 4; d++) { 00082 V *= X[d]; 00083 Z[d] = X[d]; 00084 } 00085 Vh = V/2; 00086 00087 Vs_x = X[1]*X[2]*X[3]; 00088 Vs_y = X[0]*X[2]*X[3]; 00089 Vs_z = X[0]*X[1]*X[3]; 00090 Vs_t = X[0]*X[1]*X[2]; 00091 00092 00093 Vsh_x = Vs_x/2; 00094 Vsh_y = Vs_y/2; 00095 Vsh_z = Vs_z/2; 00096 Vsh_t = Vs_t/2; 00097 00098 Vsh[0] = Vsh_x; 00099 Vsh[1] = Vsh_y; 00100 Vsh[2] = Vsh_z; 00101 Vsh[3] = Vsh_t; 00102 } 00103 00104 void init() 00105 { 00106 00107 initQuda(device); 00108 00109 gaugeParam = newQudaGaugeParam(); 00110 inv_param = newQudaInvertParam(); 00111 00112 gaugeParam.X[0] = X[0] = xdim; 00113 gaugeParam.X[1] = X[1] = ydim; 00114 gaugeParam.X[2] = X[2] = zdim; 00115 gaugeParam.X[3] = X[3] = tdim; 00116 00117 setDims(gaugeParam.X); 00118 00119 setDimConstants(gaugeParam.X); 00120 00121 gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION; 00122 gaugeParam.cuda_prec = prec; 00123 gaugeParam.reconstruct = link_recon; 00124 gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct; 00125 gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec; 00126 00127 gaugeParam.anisotropy = 1.0; 00128 gaugeParam.tadpole_coeff = 0.8; 00129 gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER; 00130 gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T; 00131 gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; 00132 gaugeParam.gaugeGiB = 0; 00133 00134 inv_param.cpu_prec = QUDA_DOUBLE_PRECISION; 00135 inv_param.cuda_prec = prec; 00136 inv_param.dirac_order = QUDA_DIRAC_ORDER; 00137 inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; 00138 inv_param.dagger = dagger; 00139 inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; 00140 inv_param.dslash_type = QUDA_ASQTAD_DSLASH; 00141 00142 int tmpint = MAX(X[1]*X[2]*X[3], X[0]*X[2]*X[3]); 00143 tmpint = MAX(tmpint, X[0]*X[1]*X[3]); 00144 tmpint = MAX(tmpint, X[0]*X[1]*X[2]); 00145 00146 00147 gaugeParam.ga_pad = tmpint; 00148 inv_param.sp_pad = tmpint; 00149 00150 ColorSpinorParam csParam; 00151 csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; 00152 csParam.nColor=3; 00153 csParam.nSpin=1; 00154 csParam.nDim=4; 00155 for(int d = 0; d < 4; d++) { 00156 csParam.x[d] = gaugeParam.X[d]; 00157 } 00158 csParam.precision = inv_param.cpu_prec; 00159 csParam.pad = 0; 00160 if (test_type < 2) { 00161 inv_param.solution_type = QUDA_MATPC_SOLUTION; 00162 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00163 csParam.x[0] /= 2; 00164 } else { 00165 inv_param.solution_type = QUDA_MAT_SOLUTION; 00166 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00167 } 00168 00169 csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; 00170 csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; 00171 csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered 00172 csParam.create = QUDA_ZERO_FIELD_CREATE; 00173 00174 spinor = new cpuColorSpinorField(csParam); 00175 spinorOut = new cpuColorSpinorField(csParam); 00176 spinorRef = new cpuColorSpinorField(csParam); 00177 00178 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00179 csParam.x[0] = gaugeParam.X[0]; 00180 00181 printfQuda("Randomizing fields ...\n"); 00182 00183 spinor->Source(QUDA_RANDOM_SOURCE); 00184 00185 size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); 00186 00187 for (int dir = 0; dir < 4; dir++) { 00188 fatlink[dir] = malloc(V*gaugeSiteSize*gSize); 00189 longlink[dir] = malloc(V*gaugeSiteSize*gSize); 00190 } 00191 if (fatlink == NULL || longlink == NULL){ 00192 errorQuda("ERROR: malloc failed for fatlink/longlink"); 00193 } 00194 construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam); 00195 00196 #ifdef MULTI_GPU 00197 gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; 00198 gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; 00199 GaugeFieldParam cpuFatParam(fatlink, gaugeParam); 00200 cpuFat = new cpuGaugeField(cpuFatParam); 00201 cpuFat->exchangeGhost(); 00202 ghost_fatlink = cpuFat->Ghost(); 00203 00204 gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; 00205 GaugeFieldParam cpuLongParam(longlink, gaugeParam); 00206 cpuLong = new cpuGaugeField(cpuLongParam); 00207 cpuLong->exchangeGhost(); 00208 ghost_longlink = cpuLong->Ghost(); 00209 00210 int x_face_size = X[1]*X[2]*X[3]/2; 00211 int y_face_size = X[0]*X[2]*X[3]/2; 00212 int z_face_size = X[0]*X[1]*X[3]/2; 00213 int t_face_size = X[0]*X[1]*X[2]/2; 00214 int pad_size =MAX(x_face_size, y_face_size); 00215 pad_size = MAX(pad_size, z_face_size); 00216 pad_size = MAX(pad_size, t_face_size); 00217 gaugeParam.ga_pad = pad_size; 00218 #endif 00219 00220 gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; 00221 gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; 00222 00223 printfQuda("Fat links sending..."); 00224 loadGaugeQuda(fatlink, &gaugeParam); 00225 printfQuda("Fat links sent\n"); 00226 00227 gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; 00228 00229 #ifdef MULTI_GPU 00230 gaugeParam.ga_pad = 3*pad_size; 00231 #endif 00232 00233 gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = link_recon; 00234 printfQuda("Long links sending..."); 00235 loadGaugeQuda(longlink, &gaugeParam); 00236 printfQuda("Long links sent...\n"); 00237 00238 printfQuda("Sending fields to GPU..."); 00239 00240 if (!transfer) { 00241 00242 //csParam.verbose = QUDA_DEBUG_VERBOSE; 00243 00244 csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; 00245 csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; 00246 csParam.pad = inv_param.sp_pad; 00247 csParam.precision = inv_param.cuda_prec; 00248 if (test_type < 2){ 00249 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00250 csParam.x[0] /=2; 00251 } 00252 00253 printfQuda("Creating cudaSpinor\n"); 00254 cudaSpinor = new cudaColorSpinorField(csParam); 00255 00256 printfQuda("Creating cudaSpinorOut\n"); 00257 cudaSpinorOut = new cudaColorSpinorField(csParam); 00258 00259 printfQuda("Sending spinor field to GPU\n"); 00260 *cudaSpinor = *spinor; 00261 00262 cudaThreadSynchronize(); 00263 checkCudaError(); 00264 00265 double spinor_norm2 = norm2(*spinor); 00266 double cuda_spinor_norm2= norm2(*cudaSpinor); 00267 printfQuda("Source CPU = %f, CUDA=%f\n", spinor_norm2, cuda_spinor_norm2); 00268 00269 if(test_type == 2){ 00270 csParam.x[0] /=2; 00271 } 00272 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00273 tmp = new cudaColorSpinorField(csParam); 00274 00275 bool pc = (test_type != 2); 00276 DiracParam diracParam; 00277 setDiracParam(diracParam, &inv_param, pc); 00278 00279 diracParam.verbose = QUDA_VERBOSE; 00280 diracParam.tmp1=tmp; 00281 00282 dirac = Dirac::create(diracParam); 00283 00284 } else { 00285 errorQuda("Error not suppported"); 00286 } 00287 00288 return; 00289 } 00290 00291 void end(void) 00292 { 00293 for (int dir = 0; dir < 4; dir++) { 00294 free(fatlink[dir]); 00295 free(longlink[dir]); 00296 } 00297 00298 if (!transfer){ 00299 delete dirac; 00300 delete cudaSpinor; 00301 delete cudaSpinorOut; 00302 delete tmp; 00303 } 00304 00305 delete spinor; 00306 delete spinorOut; 00307 delete spinorRef; 00308 00309 if (cpuFat) delete cpuFat; 00310 if (cpuLong) delete cpuLong; 00311 00312 endQuda(); 00313 } 00314 00315 double dslashCUDA(int niter) { 00316 00317 cudaEvent_t start, end; 00318 cudaEventCreate(&start); 00319 cudaEventRecord(start, 0); 00320 cudaEventSynchronize(start); 00321 00322 for (int i = 0; i < niter; i++) { 00323 switch (test_type) { 00324 case 0: 00325 parity = QUDA_EVEN_PARITY; 00326 if (transfer){ 00327 //dslashQuda(spinorOdd, spinorEven, &inv_param, parity); 00328 } else { 00329 dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); 00330 } 00331 break; 00332 case 1: 00333 parity = QUDA_ODD_PARITY; 00334 if (transfer){ 00335 //MatPCQuda(spinorOdd, spinorEven, &inv_param); 00336 } else { 00337 dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); 00338 } 00339 break; 00340 case 2: 00341 if (transfer){ 00342 //MatQuda(spinorGPU, spinor, &inv_param); 00343 } else { 00344 dirac->M(*cudaSpinorOut, *cudaSpinor); 00345 } 00346 } 00347 } 00348 00349 cudaEventCreate(&end); 00350 cudaEventRecord(end, 0); 00351 cudaEventSynchronize(end); 00352 float runTime; 00353 cudaEventElapsedTime(&runTime, start, end); 00354 cudaEventDestroy(start); 00355 cudaEventDestroy(end); 00356 00357 double secs = runTime / 1000; //stopwatchReadSeconds(); 00358 00359 // check for errors 00360 cudaError_t stat = cudaGetLastError(); 00361 if (stat != cudaSuccess) 00362 errorQuda("with ERROR: %s\n", cudaGetErrorString(stat)); 00363 00364 return secs; 00365 } 00366 00367 void staggeredDslashRef() 00368 { 00369 #ifndef MULTI_GPU 00370 int cpu_parity = 0; 00371 #endif 00372 00373 // compare to dslash reference implementation 00374 printfQuda("Calculating reference implementation..."); 00375 fflush(stdout); 00376 switch (test_type) { 00377 case 0: 00378 #ifdef MULTI_GPU 00379 00380 staggered_dslash_mg4dir(spinorRef, fatlink, longlink, (void**)ghost_fatlink, (void**)ghost_longlink, 00381 spinor, parity, dagger, inv_param.cpu_prec, gaugeParam.cpu_prec); 00382 #else 00383 cpu_parity = 0; //EVEN 00384 staggered_dslash(spinorRef->V(), fatlink, longlink, spinor->V(), cpu_parity, dagger, 00385 inv_param.cpu_prec, gaugeParam.cpu_prec); 00386 00387 #endif 00388 00389 00390 break; 00391 case 1: 00392 #ifdef MULTI_GPU 00393 staggered_dslash_mg4dir(spinorRef, fatlink, longlink, (void**)ghost_fatlink, (void**)ghost_longlink, 00394 spinor, parity, dagger, inv_param.cpu_prec, gaugeParam.cpu_prec); 00395 00396 #else 00397 cpu_parity=1; //ODD 00398 staggered_dslash(spinorRef->V(), fatlink, longlink, spinor->V(), cpu_parity, dagger, 00399 inv_param.cpu_prec, gaugeParam.cpu_prec); 00400 #endif 00401 break; 00402 case 2: 00403 //mat(spinorRef->V(), fatlink, longlink, spinor->V(), kappa, dagger, 00404 //inv_param.cpu_prec, gaugeParam.cpu_prec); 00405 break; 00406 default: 00407 errorQuda("Test type not defined"); 00408 } 00409 00410 printfQuda("done.\n"); 00411 00412 } 00413 00414 static int dslashTest() 00415 { 00416 int accuracy_level = 0; 00417 00418 init(); 00419 00420 int attempts = 1; 00421 00422 for (int i=0; i<attempts; i++) { 00423 00424 if (tune) { // warm-up run 00425 printfQuda("Tuning...\n"); 00426 setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE); 00427 dslashCUDA(1); 00428 } 00429 printfQuda("Executing %d kernel loops...", loops); 00430 double secs = dslashCUDA(loops); 00431 00432 #ifdef DSLASH_PROFILING 00433 printDslashProfile(); 00434 #endif 00435 00436 if (!transfer) *spinorOut = *cudaSpinorOut; 00437 00438 printfQuda("\n%fms per loop\n", 1000*secs); 00439 staggeredDslashRef(); 00440 00441 unsigned long long flops = dirac->Flops(); 00442 int link_floats = 8*gaugeParam.reconstruct+8*18; 00443 int spinor_floats = 8*6*2 + 6; 00444 int link_float_size = prec; 00445 int spinor_float_size = 0; 00446 00447 link_floats = test_type ? (2*link_floats) : link_floats; 00448 spinor_floats = test_type ? (2*spinor_floats) : spinor_floats; 00449 00450 int bytes_for_one_site = link_floats * link_float_size + spinor_floats * spinor_float_size; 00451 if (prec == QUDA_HALF_PRECISION) bytes_for_one_site += (8*2 + 1)*4; 00452 00453 printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs); 00454 printfQuda("GB/s = %f\n\n", 1.0*Vh*bytes_for_one_site/((secs/loops)*1e+9)); 00455 00456 if (!transfer) { 00457 double spinor_ref_norm2 = norm2(*spinorRef); 00458 double cuda_spinor_out_norm2 = norm2(*cudaSpinorOut); 00459 double spinor_out_norm2 = norm2(*spinorOut); 00460 printfQuda("Results: CPU=%f, CUDA=%f, CPU-CUDA=%f\n", spinor_ref_norm2, cuda_spinor_out_norm2, 00461 spinor_out_norm2); 00462 } else { 00463 double spinor_ref_norm2 = norm2(*spinorRef); 00464 double spinor_out_norm2 = norm2(*spinorOut); 00465 printfQuda("Result: CPU=%f , CPU-CUDA=%f", spinor_ref_norm2, spinor_out_norm2); 00466 } 00467 00468 accuracy_level = cpuColorSpinorField::Compare(*spinorRef, *spinorOut); 00469 } 00470 end(); 00471 00472 return accuracy_level; 00473 } 00474 00475 00476 void display_test_info() 00477 { 00478 printfQuda("running the following test:\n"); 00479 00480 printfQuda("prec recon test_type dagger S_dim T_dimension\n"); 00481 printfQuda("%s %s %d %d %d/%d/%d %d \n", 00482 get_prec_str(prec), get_recon_str(link_recon), 00483 test_type, dagger, xdim, ydim, zdim, tdim); 00484 printfQuda("Grid partition info: X Y Z T\n"); 00485 printfQuda(" %d %d %d %d\n", 00486 commDimPartitioned(0), 00487 commDimPartitioned(1), 00488 commDimPartitioned(2), 00489 commDimPartitioned(3)); 00490 00491 return ; 00492 00493 } 00494 00495 00496 void 00497 usage_extra(char** argv ) 00498 { 00499 printfQuda("Extra options:\n"); 00500 printfQuda(" --test <0/1> # Test method\n"); 00501 printfQuda(" 0: Even destination spinor\n"); 00502 printfQuda(" 1: Odd destination spinor\n"); 00503 return ; 00504 } 00505 00506 int main(int argc, char **argv) 00507 { 00508 00509 int i; 00510 for (i =1;i < argc; i++){ 00511 00512 if(process_command_line_option(argc, argv, &i) == 0){ 00513 continue; 00514 } 00515 00516 if( strcmp(argv[i], "--test") == 0){ 00517 if (i+1 >= argc){ 00518 usage(argv); 00519 } 00520 test_type = atoi(argv[i+1]); 00521 if (test_type < 0 || test_type > 2){ 00522 errorQuda("Error: invalid test type"); 00523 } 00524 i++; 00525 continue; 00526 } 00527 00528 fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]); 00529 usage(argv); 00530 } 00531 00532 initCommsQuda(argc, argv, gridsize_from_cmdline, 4); 00533 00534 display_test_info(); 00535 00536 int ret =1; 00537 int accuracy_level = dslashTest(); 00538 00539 printfQuda("accuracy_level =%d\n", accuracy_level); 00540 00541 if (accuracy_level >= 1) ret = 0; //probably no error, -1 means no matching 00542 endCommsQuda(); 00543 return ret; 00544 } 00545