QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <iostream> 00002 #include <stdio.h> 00003 #include <stdlib.h> 00004 #include <string.h> 00005 00006 #include <quda.h> 00007 #include <quda_internal.h> 00008 #include <dirac_quda.h> 00009 #include <dslash_quda.h> 00010 #include <invert_quda.h> 00011 #include <util_quda.h> 00012 #include <blas_quda.h> 00013 00014 #include <test_util.h> 00015 #include <wilson_dslash_reference.h> 00016 #include "misc.h" 00017 00018 #include <gauge_qio.h> 00019 00020 #define MAX(a,b) ((a)>(b)?(a):(b)) 00021 00022 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat) 00023 const int test_type = 0; 00024 00025 const QudaParity parity = QUDA_EVEN_PARITY; // even or odd? 00026 const int transfer = 0; // include transfer time in the benchmark? 00027 00028 const int loops = 100; 00029 00030 QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; 00031 QudaPrecision cuda_prec; 00032 00033 QudaGaugeParam gauge_param; 00034 QudaInvertParam inv_param; 00035 00036 cpuColorSpinorField *spinor, *spinorOut, *spinorRef; 00037 cudaColorSpinorField *cudaSpinor, *cudaSpinorOut, *tmp1=0, *tmp2=0; 00038 00039 void *hostGauge[4], *hostClover, *hostCloverInv; 00040 00041 Dirac *dirac; 00042 00043 // Dirac operator type 00044 extern QudaDslashType dslash_type; 00045 00046 extern bool tune; 00047 00048 extern int device; 00049 extern int xdim; 00050 extern int ydim; 00051 extern int zdim; 00052 extern int tdim; 00053 extern int gridsize_from_cmdline[]; 00054 extern QudaReconstructType link_recon; 00055 extern QudaPrecision prec; 00056 extern bool kernelPackT; 00057 extern QudaDagType dagger; 00058 00059 extern char latfile[]; 00060 00061 void init(int argc, char **argv) { 00062 00063 00064 kernelPackT = false; // Set true for kernel T face packing 00065 cuda_prec= prec; 00066 00067 gauge_param = newQudaGaugeParam(); 00068 inv_param = newQudaInvertParam(); 00069 00070 gauge_param.X[0] = xdim; 00071 gauge_param.X[1] = ydim; 00072 gauge_param.X[2] = zdim; 00073 gauge_param.X[3] = tdim; 00074 setDims(gauge_param.X); 00075 00076 gauge_param.anisotropy = 1.0; 00077 00078 gauge_param.type = QUDA_WILSON_LINKS; 00079 gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; 00080 gauge_param.t_boundary = QUDA_PERIODIC_T; 00081 00082 gauge_param.cpu_prec = cpu_prec; 00083 gauge_param.cuda_prec = cuda_prec; 00084 gauge_param.reconstruct = link_recon; 00085 gauge_param.reconstruct_sloppy = link_recon; 00086 gauge_param.cuda_prec_sloppy = cuda_prec; 00087 gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; 00088 00089 inv_param.kappa = 0.1; 00090 00091 if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { 00092 inv_param.mu = 0.01; 00093 inv_param.twist_flavor = QUDA_TWIST_MINUS; 00094 } 00095 00096 inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; 00097 inv_param.dagger = dagger; 00098 00099 inv_param.cpu_prec = cpu_prec; 00100 if (inv_param.cpu_prec != gauge_param.cpu_prec) 00101 errorQuda("Gauge and spinor cpu precisions must match"); 00102 00103 inv_param.cuda_prec = cuda_prec; 00104 00105 #ifndef MULTI_GPU // free parameter for single GPU 00106 gauge_param.ga_pad = 0; 00107 #else // must be this one c/b face for multi gpu 00108 int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2; 00109 int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2; 00110 int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2; 00111 int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2; 00112 int pad_size =MAX(x_face_size, y_face_size); 00113 pad_size = MAX(pad_size, z_face_size); 00114 pad_size = MAX(pad_size, t_face_size); 00115 gauge_param.ga_pad = pad_size; 00116 #endif 00117 inv_param.sp_pad = 0; 00118 inv_param.cl_pad = 0; 00119 00120 //inv_param.sp_pad = 24*24*24; 00121 //inv_param.cl_pad = 24*24*24; 00122 00123 inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // test code only supports DeGrand-Rossi Basis 00124 inv_param.dirac_order = QUDA_DIRAC_ORDER; 00125 00126 if (test_type == 2) { 00127 inv_param.solution_type = QUDA_MAT_SOLUTION; 00128 } else { 00129 inv_param.solution_type = QUDA_MATPC_SOLUTION; 00130 } 00131 00132 inv_param.dslash_type = dslash_type; 00133 00134 if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { 00135 inv_param.clover_cpu_prec = cpu_prec; 00136 inv_param.clover_cuda_prec = cuda_prec; 00137 inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec; 00138 inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; 00139 //if (test_type > 0) { 00140 hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); 00141 hostCloverInv = hostClover; // fake it 00142 /*} else { 00143 hostClover = NULL; 00144 hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); 00145 }*/ 00146 } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { 00147 00148 } 00149 00150 //inv_param.verbosity = QUDA_VERBOSE; 00151 00152 // construct input fields 00153 for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec); 00154 00155 ColorSpinorParam csParam; 00156 00157 csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; 00158 csParam.nColor = 3; 00159 csParam.nSpin = 4; 00160 if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { 00161 csParam.twistFlavor = inv_param.twist_flavor; 00162 } 00163 csParam.nDim = 4; 00164 for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d]; 00165 csParam.precision = inv_param.cpu_prec; 00166 csParam.pad = 0; 00167 if (test_type < 2) { 00168 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00169 csParam.x[0] /= 2; 00170 } else { 00171 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00172 } 00173 csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; 00174 csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; 00175 csParam.gammaBasis = inv_param.gamma_basis; 00176 csParam.create = QUDA_ZERO_FIELD_CREATE; 00177 00178 //csParam.verbose = QUDA_DEBUG_VERBOSE; 00179 00180 spinor = new cpuColorSpinorField(csParam); 00181 spinorOut = new cpuColorSpinorField(csParam); 00182 spinorRef = new cpuColorSpinorField(csParam); 00183 00184 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00185 csParam.x[0] = gauge_param.X[0]; 00186 00187 printfQuda("Randomizing fields... "); 00188 00189 if (strcmp(latfile,"")) { // load in the command line supplied gauge field 00190 read_gauge_field(latfile, hostGauge, gauge_param.cpu_prec, gauge_param.X, argc, argv); 00191 construct_gauge_field(hostGauge, 2, gauge_param.cpu_prec, &gauge_param); 00192 } else { // else generate a random SU(3) field 00193 construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param); 00194 } 00195 00196 spinor->Source(QUDA_RANDOM_SOURCE); 00197 00198 if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { 00199 double norm = 0.0; // clover components are random numbers in the range (-norm, norm) 00200 double diag = 1.0; // constant added to the diagonal 00201 00202 if (test_type == 2) { 00203 construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec); 00204 } else { 00205 construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec); 00206 } 00207 } 00208 printfQuda("done.\n"); fflush(stdout); 00209 00210 initQuda(device); 00211 00212 printfQuda("Sending gauge field to GPU\n"); 00213 loadGaugeQuda(hostGauge, &gauge_param); 00214 00215 if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { 00216 printfQuda("Sending clover field to GPU\n"); 00217 loadCloverQuda(hostClover, hostCloverInv, &inv_param); 00218 //clover = cudaCloverPrecise; 00219 } 00220 00221 if (!transfer) { 00222 csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; 00223 csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; 00224 csParam.pad = inv_param.sp_pad; 00225 csParam.precision = inv_param.cuda_prec; 00226 if (csParam.precision == QUDA_DOUBLE_PRECISION ) { 00227 csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; 00228 } else { 00229 /* Single and half */ 00230 csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER; 00231 } 00232 00233 if (test_type < 2) { 00234 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00235 csParam.x[0] /= 2; 00236 } 00237 00238 printfQuda("Creating cudaSpinor\n"); 00239 cudaSpinor = new cudaColorSpinorField(csParam); 00240 printfQuda("Creating cudaSpinorOut\n"); 00241 cudaSpinorOut = new cudaColorSpinorField(csParam); 00242 00243 if (test_type == 2) csParam.x[0] /= 2; 00244 00245 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00246 tmp1 = new cudaColorSpinorField(csParam); 00247 if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || 00248 dslash_type == QUDA_TWISTED_MASS_DSLASH) { 00249 tmp2 = new cudaColorSpinorField(csParam); 00250 } 00251 00252 printfQuda("Sending spinor field to GPU\n"); 00253 *cudaSpinor = *spinor; 00254 00255 std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << 00256 norm2(*cudaSpinor) << std::endl; 00257 00258 bool pc = (test_type != 2); 00259 DiracParam diracParam; 00260 setDiracParam(diracParam, &inv_param, pc); 00261 diracParam.verbose = QUDA_VERBOSE; 00262 diracParam.tmp1 = tmp1; 00263 diracParam.tmp2 = tmp2; 00264 00265 dirac = Dirac::create(diracParam); 00266 } else { 00267 std::cout << "Source: CPU = " << norm2(*spinor) << std::endl; 00268 } 00269 00270 } 00271 00272 void end() { 00273 if (!transfer) { 00274 delete dirac; 00275 delete cudaSpinor; 00276 delete cudaSpinorOut; 00277 delete tmp1; 00278 delete tmp2; 00279 } 00280 00281 // release memory 00282 delete spinor; 00283 delete spinorOut; 00284 delete spinorRef; 00285 00286 for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]); 00287 if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { 00288 if (test_type == 2) free(hostClover); 00289 else free(hostCloverInv); 00290 } 00291 endQuda(); 00292 00293 } 00294 00295 // execute kernel 00296 double dslashCUDA(int niter) { 00297 00298 cudaEvent_t start, end; 00299 cudaEventCreate(&start); 00300 cudaEventCreate(&end); 00301 cudaEventRecord(start, 0); 00302 00303 for (int i = 0; i < niter; i++) { 00304 switch (test_type) { 00305 case 0: 00306 if (transfer) { 00307 dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity); 00308 } else { 00309 dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); 00310 } 00311 break; 00312 case 1: 00313 case 2: 00314 if (transfer) { 00315 MatQuda(spinorOut->V(), spinor->V(), &inv_param); 00316 } else { 00317 dirac->M(*cudaSpinorOut, *cudaSpinor); 00318 } 00319 break; 00320 } 00321 } 00322 00323 cudaEventRecord(end, 0); 00324 cudaEventSynchronize(end); 00325 float runTime; 00326 cudaEventElapsedTime(&runTime, start, end); 00327 cudaEventDestroy(start); 00328 cudaEventDestroy(end); 00329 00330 double secs = runTime / 1000; //stopwatchReadSeconds(); 00331 00332 // check for errors 00333 cudaError_t stat = cudaGetLastError(); 00334 if (stat != cudaSuccess) 00335 printfQuda("with ERROR: %s\n", cudaGetErrorString(stat)); 00336 00337 return secs; 00338 } 00339 00340 void dslashRef() { 00341 00342 // FIXME: remove once reference clover is finished 00343 if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { 00344 if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) { 00345 inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; 00346 } else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { 00347 inv_param.matpc_type = QUDA_MATPC_ODD_ODD; 00348 } 00349 } 00350 00351 // compare to dslash reference implementation 00352 printfQuda("Calculating reference implementation..."); 00353 fflush(stdout); 00354 00355 if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || 00356 dslash_type == QUDA_WILSON_DSLASH) { 00357 switch (test_type) { 00358 case 0: 00359 wil_dslash(spinorRef->V(), hostGauge, spinor->V(), parity, dagger, inv_param.cpu_prec, gauge_param); 00360 break; 00361 case 1: 00362 wil_matpc(spinorRef->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.matpc_type, dagger, 00363 inv_param.cpu_prec, gauge_param); 00364 break; 00365 case 2: 00366 wil_mat(spinorRef->V(), hostGauge, spinor->V(), inv_param.kappa, dagger, inv_param.cpu_prec, gauge_param); 00367 break; 00368 default: 00369 printfQuda("Test type not defined\n"); 00370 exit(-1); 00371 } 00372 } else { // twisted mass 00373 switch (test_type) { 00374 case 0: 00375 tm_dslash(spinorRef->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 00376 parity, dagger, inv_param.cpu_prec, gauge_param); 00377 break; 00378 case 1: 00379 tm_matpc(spinorRef->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 00380 inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param); 00381 break; 00382 case 2: 00383 tm_mat(spinorRef->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 00384 dagger, inv_param.cpu_prec, gauge_param); 00385 break; 00386 default: 00387 printfQuda("Test type not defined\n"); 00388 exit(-1); 00389 } 00390 } 00391 00392 printfQuda("done.\n"); 00393 } 00394 00395 00396 void display_test_info() 00397 { 00398 printfQuda("running the following test:\n"); 00399 00400 printfQuda("prec recon test_type dagger S_dim T_dimension dslash_type\n"); 00401 printfQuda("%s %s %d %d %d/%d/%d %d %s\n", 00402 get_prec_str(prec), get_recon_str(link_recon), 00403 test_type, dagger, xdim, ydim, zdim, tdim, 00404 get_dslash_type_str(dslash_type)); 00405 printfQuda("Grid partition info: X Y Z T\n"); 00406 printfQuda(" %d %d %d %d\n", 00407 commDimPartitioned(0), 00408 commDimPartitioned(1), 00409 commDimPartitioned(2), 00410 commDimPartitioned(3)); 00411 00412 return ; 00413 00414 } 00415 00416 extern void usage(char**); 00417 00418 00419 int main(int argc, char **argv) 00420 { 00421 00422 for (int i =1;i < argc; i++){ 00423 if(process_command_line_option(argc, argv, &i) == 0){ 00424 continue; 00425 } 00426 00427 fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]); 00428 usage(argv); 00429 } 00430 00431 00432 initCommsQuda(argc, argv, gridsize_from_cmdline, 4); 00433 00434 display_test_info(); 00435 00436 init(argc, argv); 00437 00438 float spinorGiB = (float)Vh*spinorSiteSize*inv_param.cuda_prec / (1 << 30); 00439 printfQuda("\nSpinor mem: %.3f GiB\n", spinorGiB); 00440 printfQuda("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB); 00441 00442 int attempts = 1; 00443 dslashRef(); 00444 for (int i=0; i<attempts; i++) { 00445 00446 if (tune) { // warm-up run 00447 printfQuda("Tuning...\n"); 00448 setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE); 00449 dslashCUDA(1); 00450 } 00451 printfQuda("Executing %d kernel loops...\n", loops); 00452 dirac->Flops(); 00453 double secs = dslashCUDA(loops); 00454 printfQuda("done.\n\n"); 00455 00456 #ifdef DSLASH_PROFILING 00457 printDslashProfile(); 00458 #endif 00459 00460 if (!transfer) *spinorOut = *cudaSpinorOut; 00461 00462 // print timing information 00463 printfQuda("%fms per loop\n", 1000*secs); 00464 00465 unsigned long long flops = 0; 00466 if (!transfer) flops = dirac->Flops(); 00467 int spinor_floats = test_type ? 2*(7*24+24)+24 : 7*24+24; 00468 if (inv_param.cuda_prec == QUDA_HALF_PRECISION) 00469 spinor_floats += test_type ? 2*(7*2 + 2) + 2 : 7*2 + 2; // relative size of norm is twice a short 00470 int gauge_floats = (test_type ? 2 : 1) * (gauge_param.gauge_fix ? 6 : 8) * gauge_param.reconstruct; 00471 if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { 00472 gauge_floats += test_type ? 72*2 : 72; 00473 } 00474 printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs); 00475 printfQuda("GB/s = %f\n\n", 00476 Vh*(spinor_floats+gauge_floats)*inv_param.cuda_prec/((secs/loops)*1e+9)); 00477 00478 if (!transfer) { 00479 double norm2_cpu = norm2(*spinorRef); 00480 double norm2_cuda= norm2(*cudaSpinorOut); 00481 double norm2_cpu_cuda= norm2(*spinorOut); 00482 printfQuda("Results: CPU = %f, CUDA=%f, CPU-CUDA = %f\n", norm2_cpu, norm2_cuda, norm2_cpu_cuda); 00483 } else { 00484 double norm2_cpu = norm2(*spinorRef); 00485 double norm2_cpu_cuda= norm2(*spinorOut); 00486 printfQuda("Result: CPU = %f, CPU-QUDA = %f\n", norm2_cpu, norm2_cpu_cuda); 00487 } 00488 00489 cpuColorSpinorField::Compare(*spinorRef, *spinorOut); 00490 } 00491 end(); 00492 00493 endCommsQuda(); 00494 }