QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <cstdio> 00002 #include <cstdlib> 00003 #include <cstring> 00004 00005 #include <quda.h> 00006 #include "test_util.h" 00007 #include "gauge_field.h" 00008 #include "fat_force_quda.h" 00009 #include "misc.h" 00010 #include "hisq_force_reference.h" 00011 #include "hisq_force_quda.h" 00012 #include "hisq_force_utils.h" 00013 #include "hw_quda.h" 00014 #include <sys/time.h> 00015 00016 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec)) 00017 00018 00019 00020 #include "fermion_force_reference.h" 00021 using namespace hisq::fermion_force; 00022 00023 extern void usage(char** argv); 00024 static int device = 0; 00025 cudaGaugeField *cudaGauge = NULL; 00026 cpuGaugeField *cpuGauge = NULL; 00027 00028 cudaGaugeField *cudaForce = NULL; 00029 cpuGaugeField *cpuForce = NULL; 00030 00031 cudaGaugeField *cudaMom = NULL; 00032 cpuGaugeField *cpuMom = NULL; 00033 cpuGaugeField *refMom = NULL; 00034 00035 static FullHw cudaHw; 00036 static QudaGaugeParam gaugeParam; 00037 static void* hw; // the array of half_wilson_vector 00038 00039 00040 cpuGaugeField *cpuOprod = NULL; 00041 cudaGaugeField *cudaOprod = NULL; 00042 cpuGaugeField *cpuLongLinkOprod = NULL; 00043 cudaGaugeField *cudaLongLinkOprod = NULL; 00044 00045 int verify_results = 0; 00046 int ODD_BIT = 1; 00047 extern int xdim, ydim, zdim, tdim; 00048 00049 extern QudaPrecision prec; 00050 extern QudaReconstructType link_recon; 00051 QudaPrecision link_prec = QUDA_DOUBLE_PRECISION; 00052 QudaPrecision hw_prec = QUDA_DOUBLE_PRECISION; 00053 QudaPrecision cpu_hw_prec = QUDA_DOUBLE_PRECISION; 00054 QudaPrecision mom_prec = QUDA_DOUBLE_PRECISION; 00055 00056 00057 00058 static void setPrecision(QudaPrecision precision) 00059 { 00060 link_prec = precision; 00061 hw_prec = precision; 00062 cpu_hw_prec = precision; 00063 mom_prec = precision; 00064 return; 00065 } 00066 00067 int Z[4]; 00068 int V; 00069 int Vh; 00070 00071 00072 void 00073 setDims(int *X){ 00074 V = 1; 00075 for(int dir=0; dir<4; ++dir){ 00076 V *= X[dir]; 00077 Z[dir] = X[dir]; 00078 } 00079 Vh = V/2; 00080 return; 00081 } 00082 00083 00084 void 00085 total_staple_io_flops(QudaPrecision prec, QudaReconstructType recon, double* io, double* flops) 00086 { 00087 //total IO counting for the middle/side/all link kernels 00088 //Explanation about these numbers can be founed in the corresnponding kernel functions in 00089 //the hisq kernel core file 00090 int linksize = prec*recon; 00091 int cmsize = prec*18; 00092 00093 int matrix_mul_flops = 198; 00094 int matrix_add_flops = 18; 00095 00096 int num_calls_middle_link[6] = {24, 24, 96, 96, 24, 24}; 00097 int middle_link_data_io[6][2] = { 00098 {3,6}, 00099 {3,4}, 00100 {3,7}, 00101 {3,5}, 00102 {3,5}, 00103 {3,2} 00104 }; 00105 int middle_link_data_flops[6][2] = { 00106 {3,1}, 00107 {2,0}, 00108 {4,1}, 00109 {3,0}, 00110 {4,1}, 00111 {2,0} 00112 }; 00113 00114 00115 int num_calls_side_link[2]= {192, 48}; 00116 int side_link_data_io[2][2] = { 00117 {1, 6}, 00118 {0, 3} 00119 }; 00120 int side_link_data_flops[2][2] = { 00121 {2, 2}, 00122 {0, 1} 00123 }; 00124 00125 00126 00127 int num_calls_all_link[2] ={192, 192}; 00128 int all_link_data_io[2][2] = { 00129 {3, 8}, 00130 {3, 6} 00131 }; 00132 int all_link_data_flops[2][2] = { 00133 {6, 3}, 00134 {4, 2} 00135 }; 00136 00137 00138 double total_io = 0; 00139 for(int i = 0;i < 6; i++){ 00140 total_io += num_calls_middle_link[i] 00141 *(middle_link_data_io[i][0]*linksize + middle_link_data_io[i][1]*cmsize); 00142 } 00143 00144 for(int i = 0;i < 2; i++){ 00145 total_io += num_calls_side_link[i] 00146 *(side_link_data_io[i][0]*linksize + side_link_data_io[i][1]*cmsize); 00147 } 00148 for(int i = 0;i < 2; i++){ 00149 total_io += num_calls_all_link[i] 00150 *(all_link_data_io[i][0]*linksize + all_link_data_io[i][1]*cmsize); 00151 } 00152 total_io *= V; 00153 00154 00155 double total_flops = 0; 00156 for(int i = 0;i < 6; i++){ 00157 total_flops += num_calls_middle_link[i] 00158 *(middle_link_data_flops[i][0]*matrix_mul_flops + middle_link_data_flops[i][1]*matrix_add_flops); 00159 } 00160 00161 for(int i = 0;i < 2; i++){ 00162 total_flops += num_calls_side_link[i] 00163 *(side_link_data_flops[i][0]*matrix_mul_flops + side_link_data_flops[i][1]*matrix_add_flops); 00164 } 00165 for(int i = 0;i < 2; i++){ 00166 total_flops += num_calls_all_link[i] 00167 *(all_link_data_flops[i][0]*matrix_mul_flops + all_link_data_flops[i][1]*matrix_add_flops); 00168 } 00169 total_flops *= V; 00170 00171 *io=total_io; 00172 *flops = total_flops; 00173 00174 printfQuda("flop/byte =%.1f\n", total_flops/total_io); 00175 return ; 00176 } 00177 00178 00179 void initDslashConstants(const cudaGaugeField &gauge, const int sp_stride); 00180 00181 00182 // allocate memory 00183 // set the layout, etc. 00184 static void 00185 hisq_force_init() 00186 { 00187 initQuda(device); 00188 00189 gaugeParam.X[0] = xdim; 00190 gaugeParam.X[1] = ydim; 00191 gaugeParam.X[2] = zdim; 00192 gaugeParam.X[3] = tdim; 00193 00194 setDims(gaugeParam.X); 00195 00196 00197 gaugeParam.cpu_prec = link_prec; 00198 gaugeParam.cuda_prec = link_prec; 00199 gaugeParam.reconstruct = link_recon; 00200 00201 gaugeParam.gauge_order = QUDA_MILC_GAUGE_ORDER; 00202 00203 GaugeFieldParam gParam(0, gaugeParam); 00204 gParam.create = QUDA_NULL_FIELD_CREATE; 00205 00206 cpuGauge = new cpuGaugeField(gParam); 00207 00208 // this is a hack to get the gauge field to appear as a void** rather than void* 00209 void* siteLink_2d[4]; 00210 for(int i=0;i < 4;i++){ 00211 siteLink_2d[i] = malloc(V*gaugeSiteSize* gaugeParam.cpu_prec); 00212 if(siteLink_2d[i] == NULL){ 00213 errorQuda("malloc failed for siteLink_2d\n"); 00214 } 00215 } 00216 00217 // fills the gauge field with random numbers 00218 createSiteLinkCPU(siteLink_2d, gaugeParam.cpu_prec, 1); 00219 00220 for(int dir = 0; dir < 4; dir++){ 00221 for(int i = 0;i < V; i++){ 00222 char* src = (char*)siteLink_2d[dir]; 00223 char* dst = (char*)cpuGauge->Gauge_p(); 00224 memcpy(dst + (4*i+dir)*gaugeSiteSize*link_prec, src + i*gaugeSiteSize*link_prec, gaugeSiteSize*link_prec); 00225 } 00226 } 00227 00228 gParam.precision = gaugeParam.cuda_prec; 00229 gParam.reconstruct = link_recon; 00230 cudaGauge = new cudaGaugeField(gParam); 00231 00232 // create the force matrix 00233 // cannot reconstruct, since the force matrix is not in SU(3) 00234 gParam.precision = gaugeParam.cpu_prec; 00235 gParam.reconstruct = QUDA_RECONSTRUCT_NO; 00236 cpuForce = new cpuGaugeField(gParam); 00237 memset(cpuForce->Gauge_p(), 0, 4*cpuForce->Volume()*gaugeSiteSize*gaugeParam.cpu_prec); 00238 00239 gParam.precision = gaugeParam.cuda_prec; 00240 gParam.reconstruct = QUDA_RECONSTRUCT_NO; 00241 cudaForce = new cudaGaugeField(gParam); 00242 cudaMemset((void**)(cudaForce->Gauge_p()), 0, cudaForce->Bytes()); 00243 00244 // create the momentum matrix 00245 gParam.reconstruct = QUDA_RECONSTRUCT_10; 00246 gParam.precision = gaugeParam.cpu_prec; 00247 cpuMom = new cpuGaugeField(gParam); 00248 refMom = new cpuGaugeField(gParam); 00249 00250 createMomCPU(cpuMom->Gauge_p(), mom_prec); 00251 00252 00253 memset(cpuMom->Gauge_p(), 0, cpuMom->Bytes()); 00254 memset(refMom->Gauge_p(), 0, refMom->Bytes()); 00255 00256 gParam.precision = gaugeParam.cuda_prec; 00257 cudaMom = new cudaGaugeField(gParam); // Are the elements initialised to zero? - No! 00258 00259 hw = malloc(4*cpuGauge->Volume()*hwSiteSize*gaugeParam.cpu_prec); 00260 if (hw == NULL){ 00261 fprintf(stderr, "ERROR: malloc failed for hw\n"); 00262 exit(1); 00263 } 00264 00265 createHwCPU(hw, hw_prec); 00266 cudaHw = createHwQuda(gaugeParam.X, hw_prec); 00267 00268 00269 00270 gParam.reconstruct = QUDA_RECONSTRUCT_NO; 00271 gParam.precision = gaugeParam.cpu_prec; 00272 cpuOprod = new cpuGaugeField(gParam); 00273 computeLinkOrderedOuterProduct(hw, cpuOprod->Gauge_p(), hw_prec, 1); 00274 00275 cpuLongLinkOprod = new cpuGaugeField(gParam); 00276 computeLinkOrderedOuterProduct(hw, cpuLongLinkOprod->Gauge_p(), hw_prec, 3); 00277 00278 00279 gParam.precision = hw_prec; 00280 cudaOprod = new cudaGaugeField(gParam); 00281 cudaLongLinkOprod = new cudaGaugeField(gParam); 00282 00283 for(int i = 0;i < 4; i++){ 00284 free(siteLink_2d[i]); 00285 } 00286 return; 00287 } 00288 00289 00290 static void 00291 hisq_force_end() 00292 { 00293 delete cudaMom; 00294 delete cudaForce; 00295 delete cudaGauge; 00296 delete cudaOprod; 00297 delete cudaLongLinkOprod; 00298 freeHwQuda(cudaHw); 00299 00300 delete cpuGauge; 00301 delete cpuForce; 00302 delete cpuMom; 00303 delete refMom; 00304 delete cpuOprod; 00305 delete cpuLongLinkOprod; 00306 free(hw); 00307 00308 endQuda(); 00309 00310 return; 00311 } 00312 00313 static int 00314 hisq_force_test(void) 00315 { 00316 hisq_force_init(); 00317 00318 initDslashConstants(*cudaGauge, cudaGauge->VolumeCB()); 00319 hisqForceInitCuda(&gaugeParam); 00320 00321 00322 00323 float weight = 1.0; 00324 float act_path_coeff[6]; 00325 00326 act_path_coeff[0] = 0.625000; 00327 act_path_coeff[1] = -0.058479; 00328 act_path_coeff[2] = -0.087719; 00329 act_path_coeff[3] = 0.030778; 00330 act_path_coeff[4] = -0.007200; 00331 act_path_coeff[5] = -0.123113; 00332 00333 00334 double d_weight = 1.0; 00335 double d_act_path_coeff[6]; 00336 for(int i=0; i<6; ++i){ 00337 d_act_path_coeff[i] = act_path_coeff[i]; 00338 } 00339 00340 00341 00342 00343 00344 // copy the momentum field to the GPU 00345 cudaMom->loadCPUField(*refMom, QUDA_CPU_FIELD_LOCATION); 00346 // copy the gauge field to the GPU 00347 cudaGauge->loadCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION); 00348 // copy the outer product field to the GPU 00349 cudaOprod->loadCPUField(*cpuOprod, QUDA_CPU_FIELD_LOCATION); 00350 // load the three-link outer product to the GPU 00351 cudaLongLinkOprod->loadCPUField(*cpuLongLinkOprod, QUDA_CPU_FIELD_LOCATION); 00352 00353 loadHwToGPU(cudaHw, hw, cpu_hw_prec); 00354 00355 struct timeval ht0, ht1; 00356 gettimeofday(&ht0, NULL); 00357 if (verify_results){ 00358 if(cpu_hw_prec == QUDA_SINGLE_PRECISION){ 00359 const float eps = 0.5; 00360 fermion_force_reference(eps, weight, 0, act_path_coeff, hw, cpuGauge->Gauge_p(), refMom->Gauge_p()); 00361 }else if(cpu_hw_prec == QUDA_DOUBLE_PRECISION){ 00362 const double eps = 0.5; 00363 fermion_force_reference(eps, d_weight, 0, d_act_path_coeff, hw, cpuGauge->Gauge_p(), refMom->Gauge_p()); 00364 } 00365 } 00366 gettimeofday(&ht1, NULL); 00367 00368 struct timeval t0, t1, t2, t3, t4, t5; 00369 00370 gettimeofday(&t0, NULL); 00371 hisqStaplesForceCuda(d_act_path_coeff, gaugeParam, *cudaOprod, *cudaGauge, cudaForce); 00372 cudaThreadSynchronize(); 00373 gettimeofday(&t1, NULL); 00374 checkCudaError(); 00375 00376 gettimeofday(&t2, NULL); 00377 hisqLongLinkForceCuda(d_act_path_coeff[1], gaugeParam, *cudaLongLinkOprod, *cudaGauge, cudaForce); 00378 cudaThreadSynchronize(); 00379 gettimeofday(&t3, NULL); 00380 checkCudaError(); 00381 gettimeofday(&t4, NULL); 00382 hisqCompleteForceCuda(gaugeParam, *cudaForce, *cudaGauge, cudaMom); 00383 cudaThreadSynchronize(); 00384 checkCudaError(); 00385 gettimeofday(&t5, NULL); 00386 00387 00388 00389 cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION); 00390 00391 int res; 00392 res = compare_floats(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume()*momSiteSize, 1e-5, gaugeParam.cpu_prec); 00393 00394 int accuracy_level = strong_check_mom(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume(), gaugeParam.cpu_prec); 00395 printf("Test %s\n",(1 == res) ? "PASSED" : "FAILED"); 00396 00397 double total_io; 00398 double total_flops; 00399 total_staple_io_flops(link_prec, link_recon, &total_io, &total_flops); 00400 00401 float perf_flops = total_flops / (TDIFF(t0, t1)) *1e-9; 00402 float perf = total_io / (TDIFF(t0, t1)) *1e-9; 00403 printf("Staples time: %.2f ms, perf =%.2f GFLOPS, achieved bandwidth= %.2f GB/s\n", TDIFF(t0,t1)*1000, perf_flops, perf); 00404 printf("Staples time : %g ms\t LongLink time : %g ms\t Completion time : %g ms\n", TDIFF(t0,t1)*1000, TDIFF(t2,t3)*1000, TDIFF(t4,t5)*1000); 00405 printf("Host time (half-wilson fermion force) : %g ms\n", TDIFF(ht0, ht1)*1000); 00406 00407 hisq_force_end(); 00408 00409 return accuracy_level; 00410 } 00411 00412 00413 static void 00414 display_test_info() 00415 { 00416 printf("running the following fermion force computation test:\n"); 00417 00418 printf("link_precision link_reconstruct space_dim(x/y/z) T_dimension\n"); 00419 printf("%s %s %d/%d/%d %d \n", 00420 get_prec_str(link_prec), 00421 get_recon_str(link_recon), 00422 xdim, ydim, zdim, tdim); 00423 return ; 00424 00425 } 00426 00427 void 00428 usage_extra(char** argv ) 00429 { 00430 printf("Extra options: \n"); 00431 printf(" --verify # Verify the GPU results using CPU results\n"); 00432 return ; 00433 } 00434 int 00435 main(int argc, char **argv) 00436 { 00437 int i; 00438 for (i =1;i < argc; i++){ 00439 00440 if(process_command_line_option(argc, argv, &i) == 0){ 00441 continue; 00442 } 00443 00444 if( strcmp(argv[i], "--verify") == 0){ 00445 verify_results=1; 00446 continue; 00447 } 00448 fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]); 00449 usage(argv); 00450 } 00451 00452 #ifdef MULTI_GPU 00453 initCommsQuda(argc, argv, gridsize_from_cmdline, 4); 00454 #endif 00455 00456 setPrecision(prec); 00457 00458 display_test_info(); 00459 00460 int accuracy_level = hisq_force_test(); 00461 00462 00463 #ifdef MULTI_GPU 00464 endCommsQuda(); 00465 #endif 00466 00467 if(accuracy_level >=3 ){ 00468 return EXIT_SUCCESS; 00469 }else{ 00470 return -1; 00471 } 00472 00473 } 00474 00475