|
QUDA v0.3.2
A library for QCD on GPUs
|
00001 #include <iostream> 00002 #include <stdio.h> 00003 #include <stdlib.h> 00004 #include <string.h> 00005 00006 #include <quda.h> 00007 #include <quda_internal.h> 00008 #include <dirac_quda.h> 00009 #include <dslash_quda.h> 00010 #include <invert_quda.h> 00011 #include <util_quda.h> 00012 #include <blas_quda.h> 00013 00014 #include <misc.h> 00015 #include <test_util.h> 00016 #include <staggered_dslash_reference.h> 00017 00018 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat) 00019 int test_type = 0; 00020 int device = 0; 00021 00022 QudaGaugeParam gauge_param; 00023 QudaInvertParam inv_param; 00024 00025 FullGauge cudaFatLink; 00026 FullGauge cudaLongLink; 00027 00028 cpuColorSpinorField *spinor, *spinorOut, *spinorRef; 00029 cudaColorSpinorField *cudaSpinor, *cudaSpinorOut; 00030 00031 cudaColorSpinorField* tmp; 00032 00033 void *hostGauge[4]; 00034 void *fatlink[4], *longlink[4]; 00035 00036 QudaParity parity; 00037 QudaDagType dagger = QUDA_DAG_NO; 00038 int transfer = 0; // include transfer time in the benchmark? 00039 int tdim = 24; 00040 int sdim = 24; 00041 00042 QudaReconstructType link_recon = QUDA_RECONSTRUCT_12; 00043 QudaPrecision prec = QUDA_SINGLE_PRECISION; 00044 00045 Dirac* dirac; 00046 00047 void init() 00048 { 00049 gauge_param = newQudaGaugeParam(); 00050 inv_param = newQudaInvertParam(); 00051 00052 gauge_param.X[0] = sdim; 00053 gauge_param.X[1] = sdim; 00054 gauge_param.X[2] = sdim; 00055 gauge_param.X[3] = tdim; 00056 00057 setDims(gauge_param.X); 00058 00059 Vh = sdim*sdim*sdim*tdim/2; 00060 00061 gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION; 00062 gauge_param.cuda_prec = prec; 00063 gauge_param.reconstruct = link_recon; 00064 gauge_param.reconstruct_sloppy = gauge_param.reconstruct; 00065 gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec; 00066 00067 gauge_param.tadpole_coeff = 0.8; 00068 gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; 00069 gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; 00070 gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; 00071 gauge_param.gaugeGiB = 0; 00072 00073 inv_param.cpu_prec = QUDA_DOUBLE_PRECISION; 00074 inv_param.cuda_prec = prec; 00075 inv_param.dirac_order = QUDA_DIRAC_ORDER; 00076 inv_param.dagger = dagger; 00077 inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; 00078 inv_param.dslash_type = QUDA_ASQTAD_DSLASH; 00079 00080 gauge_param.ga_pad = sdim*sdim*sdim/2; 00081 inv_param.sp_pad = sdim*sdim*sdim/2; 00082 00083 ColorSpinorParam csParam; 00084 csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; 00085 csParam.nColor=3; 00086 csParam.nSpin=1; 00087 csParam.nDim=4; 00088 for(int d = 0; d < 4; d++) { 00089 csParam.x[d] = gauge_param.X[d]; 00090 } 00091 csParam.precision = inv_param.cpu_prec; 00092 csParam.pad = 0; 00093 if (test_type < 2) { 00094 inv_param.solution_type = QUDA_MATPC_SOLUTION; 00095 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00096 csParam.x[0] /= 2; 00097 } else { 00098 inv_param.solution_type = QUDA_MAT_SOLUTION; 00099 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00100 } 00101 00102 csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; 00103 csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; 00104 csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; 00105 csParam.create = QUDA_ZERO_FIELD_CREATE; 00106 00107 spinor = new cpuColorSpinorField(csParam); 00108 spinorOut = new cpuColorSpinorField(csParam); 00109 spinorRef = new cpuColorSpinorField(csParam); 00110 00111 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00112 csParam.x[0] = gauge_param.X[0]; 00113 00114 printfQuda("Randomizing fields ...\n"); 00115 00116 spinor->Source(QUDA_RANDOM_SOURCE); 00117 00118 size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); 00119 00120 for (int dir = 0; dir < 4; dir++) { 00121 fatlink[dir] = malloc(V*gaugeSiteSize*gSize); 00122 longlink[dir] = malloc(V*gaugeSiteSize*gSize); 00123 } 00124 if (fatlink == NULL || longlink == NULL){ 00125 fprintf(stderr, "ERROR: malloc failed for fatlink/longlink\n"); 00126 exit(1); 00127 } 00128 00129 construct_fat_long_gauge_field(fatlink, longlink, 1, gauge_param.cpu_prec, &gauge_param); 00130 00131 #if 0 00132 //printf("links are:\n"); 00133 //display_link(fatlink[0], 1, gauge_param.cpu_prec); 00134 //display_link(longlink[0], 1, gauge_param.cpu_prec); 00135 00136 for (int i =0;i < 4 ;i++){ 00137 int dir = 2*i; 00138 link_sanity_check(fatlink[i], V, gauge_param.cpu_prec, dir, &gauge_param); 00139 link_sanity_check(longlink[i], V, gauge_param.cpu_prec, dir, &gauge_param); 00140 } 00141 00142 //printf("spinors are:\n"); 00143 //display_spinor(spinor, 10, inv_param.cpu_prec); 00144 #endif 00145 00146 initQuda(device); 00147 00148 gauge_param.type = QUDA_ASQTAD_FAT_LINKS; 00149 gauge_param.reconstruct = gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; 00150 loadGaugeQuda(fatlink, &gauge_param); 00151 00152 gauge_param.type = QUDA_ASQTAD_LONG_LINKS; 00153 gauge_param.reconstruct = gauge_param.reconstruct_sloppy = link_recon; 00154 loadGaugeQuda(longlink, &gauge_param); 00155 00156 cudaFatLink = cudaFatLinkPrecise; 00157 cudaLongLink = cudaLongLinkPrecise; 00158 00159 printf("Sending fields to GPU..."); fflush(stdout); 00160 00161 if (!transfer) { 00162 00163 csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; 00164 csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; 00165 csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; 00166 csParam.pad = inv_param.sp_pad; 00167 csParam.precision = inv_param.cuda_prec; 00168 if (test_type < 2){ 00169 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00170 csParam.x[0] /=2; 00171 } 00172 00173 printfQuda("Creating cudaSpinor\n"); 00174 cudaSpinor = new cudaColorSpinorField(csParam); 00175 00176 printfQuda("Creating cudaSpinorOut\n"); 00177 cudaSpinorOut = new cudaColorSpinorField(csParam); 00178 00179 printfQuda("Sending spinor field to GPU\n"); 00180 *cudaSpinor = *spinor; 00181 00182 cudaThreadSynchronize(); 00183 checkCudaError(); 00184 00185 printf("Source CPU = %f, CUDA=%f\n", norm2(*spinor), norm2(*cudaSpinor)); 00186 00187 if(test_type == 2){ 00188 csParam.x[0] /=2; 00189 } 00190 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00191 tmp = new cudaColorSpinorField(csParam); 00192 00193 bool pc = (test_type != 2); 00194 DiracParam diracParam; 00195 setDiracParam(diracParam, &inv_param, pc); 00196 diracParam.fatGauge = &cudaFatLinkPrecise; 00197 diracParam.longGauge = &cudaLongLinkPrecise; 00198 00199 diracParam.verbose = QUDA_VERBOSE; 00200 diracParam.tmp1=tmp; 00201 dirac = Dirac::create(diracParam); 00202 00203 } else { 00204 printf("ERROR: not suppported\n"); 00205 } 00206 00207 return; 00208 } 00209 00210 void end(void) 00211 { 00212 for (int dir = 0; dir < 4; dir++) { 00213 free(fatlink[dir]); 00214 free(longlink[dir]); 00215 } 00216 if (!transfer){ 00217 delete dirac; 00218 delete cudaSpinor; 00219 delete cudaSpinorOut; 00220 delete tmp; 00221 } 00222 00223 delete spinor; 00224 delete spinorOut; 00225 delete spinorRef; 00226 00227 endQuda(); 00228 } 00229 00230 double dslashCUDA() { 00231 00232 // execute kernel 00233 const int LOOPS = 1; 00234 printf("Executing %d kernel loops...", LOOPS); 00235 fflush(stdout); 00236 stopwatchStart(); 00237 for (int i = 0; i < LOOPS; i++) { 00238 switch (test_type) { 00239 case 0: 00240 parity = QUDA_EVEN_PARITY; 00241 if (transfer){ 00242 //dslashQuda(spinorOdd, spinorEven, &inv_param, parity); 00243 } 00244 else { 00245 dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); 00246 } 00247 break; 00248 case 1: 00249 parity = QUDA_ODD_PARITY; 00250 if (transfer){ 00251 //MatPCQuda(spinorOdd, spinorEven, &inv_param); 00252 }else { 00253 dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); 00254 } 00255 break; 00256 case 2: 00257 if (transfer){ 00258 //MatQuda(spinorGPU, spinor, &inv_param); 00259 } 00260 else { 00261 dirac->M(*cudaSpinorOut, *cudaSpinor); 00262 } 00263 } 00264 } 00265 00266 // check for errors 00267 cudaError_t stat = cudaGetLastError(); 00268 if (stat != cudaSuccess) 00269 printf("with ERROR: %s\n", cudaGetErrorString(stat)); 00270 00271 cudaThreadSynchronize(); 00272 double secs = stopwatchReadSeconds() / LOOPS; 00273 printf("done.\n\n"); 00274 00275 return secs; 00276 } 00277 00278 void staggeredDslashRef() 00279 { 00280 int cpu_parity; 00281 // compare to dslash reference implementation 00282 printf("Calculating reference implementation..."); 00283 fflush(stdout); 00284 switch (test_type) { 00285 case 0: 00286 cpu_parity = 0; //EVEN 00287 staggered_dslash(spinorRef->v, fatlink, longlink, spinor->v, cpu_parity, dagger, 00288 inv_param.cpu_prec, gauge_param.cpu_prec); 00289 break; 00290 case 1: 00291 cpu_parity=1; //ODD 00292 staggered_dslash(spinorRef->v, fatlink, longlink, spinor->v, cpu_parity, dagger, 00293 inv_param.cpu_prec, gauge_param.cpu_prec); 00294 break; 00295 case 2: 00296 //mat(spinorRef->v, fatlink, longlink, spinor->v, kappa, dagger, 00297 //inv_param.cpu_prec, gauge_param.cpu_prec); 00298 break; 00299 default: 00300 printf("Test type not defined\n"); 00301 exit(-1); 00302 } 00303 00304 printf("done.\n"); 00305 00306 } 00307 00308 static void dslashTest() 00309 { 00310 00311 init(); 00312 00313 int attempts = 1; 00314 00315 staggeredDslashRef(); 00316 00317 for (int i=0; i<attempts; i++) { 00318 00319 double secs = dslashCUDA(); 00320 00321 if (!transfer) { 00322 *spinorOut = *cudaSpinorOut; 00323 } 00324 00325 printf("\n%fms per loop\n", 1000*secs); 00326 00327 int flops = dirac->Flops(); 00328 int link_floats = 8*gauge_param.packed_size+8*18; 00329 int spinor_floats = 8*6*2 + 6; 00330 int link_float_size = prec; 00331 int spinor_float_size = 0; 00332 00333 link_floats = test_type ? (2*link_floats) : link_floats; 00334 spinor_floats = test_type ? (2*spinor_floats) : spinor_floats; 00335 00336 int bytes_for_one_site = link_floats * link_float_size + spinor_floats * spinor_float_size; 00337 if (prec == QUDA_HALF_PRECISION) { 00338 bytes_for_one_site += (8*2 + 1)*4; 00339 } 00340 printf("GFLOPS = %f\n", 1.0e-9*flops/secs); 00341 printf("GiB/s = %f\n\n", 1.0*Vh*bytes_for_one_site/(secs*(1<<30))); 00342 00343 if (!transfer) { 00344 std::cout << "Results: CPU = " << norm2(*spinorRef) << ", CUDA = " << norm2(*cudaSpinorOut) << 00345 ", CPU-CUDA = " << norm2(*spinorOut) << std::endl; 00346 } else { 00347 std::cout << "Result: CPU = " << norm2(*spinorRef) << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl; 00348 } 00349 00350 cpuColorSpinorField::Compare(*spinorRef, *spinorOut); 00351 00352 printf("Output spinor:\n"); 00353 spinorOut->PrintVector(0); 00354 00355 printf("Ref spinor:\n"); 00356 spinorRef->PrintVector(0); 00357 00358 } 00359 end(); 00360 00361 } 00362 00363 00364 void display_test_info() 00365 { 00366 printf("running the following test:\n"); 00367 00368 printf("prec recon test_type dagger S_dim T_dimension\n"); 00369 printf("%s %s %d %d %d %d \n", 00370 get_prec_str(prec), get_recon_str(link_recon), 00371 test_type, dagger, sdim, tdim); 00372 return ; 00373 00374 } 00375 00376 void usage(char** argv ) 00377 { 00378 printf("Usage: %s <args>\n", argv[0]); 00379 printf("--prec <double/single/half> \t Precision in GPU\n"); 00380 printf("--recon <8/12> \t\t\t Long link reconstruction type\n"); 00381 printf("--type <0/1/2> \t\t\t Test type\n"); 00382 printf("--dagger \t\t\t Set the dagger to 1\n"); 00383 printf("--tdim \t\t\t\t Set T dimention size(default 24)\n"); 00384 printf("--sdim \t\t\t\t Set space dimention size\n"); 00385 printf("--help \t\t\t\t Print out this message\n"); 00386 exit(1); 00387 return ; 00388 } 00389 00390 int main(int argc, char **argv) 00391 { 00392 int i; 00393 for (i =1;i < argc; i++){ 00394 00395 if( strcmp(argv[i], "--help")== 0){ 00396 usage(argv); 00397 } 00398 00399 if( strcmp(argv[i], "--device") == 0){ 00400 if (i+1 >= argc){ 00401 usage(argv); 00402 } 00403 device = atoi(argv[i+1]); 00404 if (device < 0){ 00405 fprintf(stderr, "Error: invalid device number(%d)\n", device); 00406 exit(1); 00407 } 00408 i++; 00409 continue; 00410 } 00411 00412 if( strcmp(argv[i], "--prec") == 0){ 00413 if (i+1 >= argc){ 00414 usage(argv); 00415 } 00416 prec = get_prec(argv[i+1]); 00417 i++; 00418 continue; 00419 } 00420 00421 00422 if( strcmp(argv[i], "--recon") == 0){ 00423 if (i+1 >= argc){ 00424 usage(argv); 00425 } 00426 link_recon = get_recon(argv[i+1]); 00427 i++; 00428 continue; 00429 } 00430 00431 if( strcmp(argv[i], "--test") == 0){ 00432 if (i+1 >= argc){ 00433 usage(argv); 00434 } 00435 test_type = atoi(argv[i+1]); 00436 if (test_type < 0 || test_type >= 2){ 00437 fprintf(stderr, "Error: invalid test type\n"); 00438 exit(1); 00439 } 00440 i++; 00441 continue; 00442 } 00443 00444 if( strcmp(argv[i], "--tdim") == 0){ 00445 if (i+1 >= argc){ 00446 usage(argv); 00447 } 00448 tdim = atoi(argv[i+1]); 00449 if (tdim < 0 || tdim > 128){ 00450 fprintf(stderr, "Error: invalid t dimention\n"); 00451 exit(1); 00452 } 00453 i++; 00454 continue; 00455 } 00456 00457 if( strcmp(argv[i], "--sdim") == 0){ 00458 if (i+1 >= argc){ 00459 usage(argv); 00460 } 00461 sdim = atoi(argv[i+1]); 00462 if (sdim < 0 || sdim > 128){ 00463 fprintf(stderr, "Error: invalid S dimention\n"); 00464 exit(1); 00465 } 00466 i++; 00467 continue; 00468 } 00469 00470 if( strcmp(argv[i], "--dagger") == 0){ 00471 dagger = QUDA_DAG_YES; 00472 continue; 00473 } 00474 00475 fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]); 00476 usage(argv); 00477 } 00478 00479 display_test_info(); 00480 00481 dslashTest(); 00482 }
1.7.3