QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 #include <string.h> 00004 #include <sys/time.h> 00005 #include <cuda.h> 00006 #include <cuda_runtime.h> 00007 00008 #include "quda.h" 00009 #include "test_util.h" 00010 #include "llfat_reference.h" 00011 #include "misc.h" 00012 #include "util_quda.h" 00013 00014 #ifdef MULTI_GPU 00015 #include "face_quda.h" 00016 #include "comm_quda.h" 00017 #endif 00018 00019 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec)) 00020 00021 extern void usage(char** argv); 00022 static int verify_results = 0; 00023 00024 extern int device; 00025 int Z[4]; 00026 int V; 00027 int Vh; 00028 int Vs[4]; 00029 int Vsh[4]; 00030 static int Vs_x, Vs_y, Vs_z, Vs_t; 00031 static int Vsh_x, Vsh_y, Vsh_z, Vsh_t; 00032 00033 static int V_ex; 00034 static int Vh_ex; 00035 00036 static int X1, X1h, X2, X3, X4; 00037 static int E1, E1h, E2, E3, E4; 00038 00039 00040 extern int xdim, ydim, zdim, tdim; 00041 extern int gridsize_from_cmdline[]; 00042 00043 extern QudaReconstructType link_recon; 00044 extern QudaPrecision prec; 00045 static QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; 00046 static QudaGaugeFieldOrder gauge_order = QUDA_QDP_GAUGE_ORDER; 00047 00048 static size_t gSize; 00049 00050 void 00051 setDims(int *X) { 00052 V = 1; 00053 for (int d=0; d< 4; d++) { 00054 V *= X[d]; 00055 Z[d] = X[d]; 00056 } 00057 Vh = V/2; 00058 00059 Vs[0] = Vs_x = X[1]*X[2]*X[3]; 00060 Vs[1] = Vs_y = X[0]*X[2]*X[3]; 00061 Vs[2] = Vs_z = X[0]*X[1]*X[3]; 00062 Vs[3] = Vs_t = X[0]*X[1]*X[2]; 00063 Vsh[0] = Vsh_x = Vs_x/2; 00064 Vsh[1] = Vsh_y = Vs_y/2; 00065 Vsh[2] = Vsh_z = Vs_z/2; 00066 Vsh[3] = Vsh_t = Vs_t/2; 00067 00068 00069 V_ex = 1; 00070 for (int d=0; d< 4; d++) { 00071 V_ex *= X[d]+4; 00072 } 00073 Vh_ex = V_ex/2; 00074 00075 X1=X[0]; X2 = X[1]; X3=X[2]; X4=X[3]; 00076 X1h=X1/2; 00077 E1=X1+4; E2=X2+4; E3=X3+4; E4=X4+4; 00078 E1h=E1/2; 00079 00080 00081 } 00082 00083 static int 00084 llfat_test(int test) 00085 { 00086 00087 QudaGaugeParam qudaGaugeParam; 00088 #ifdef MULTI_GPU 00089 void* ghost_sitelink[4]; 00090 void* ghost_sitelink_diag[16]; 00091 #endif 00092 00093 00094 initQuda(device); 00095 00096 cpu_prec = prec; 00097 gSize = cpu_prec; 00098 qudaGaugeParam = newQudaGaugeParam(); 00099 00100 qudaGaugeParam.anisotropy = 1.0; 00101 00102 qudaGaugeParam.X[0] = xdim; 00103 qudaGaugeParam.X[1] = ydim; 00104 qudaGaugeParam.X[2] = zdim; 00105 qudaGaugeParam.X[3] = tdim; 00106 00107 setDims(qudaGaugeParam.X); 00108 00109 qudaGaugeParam.cpu_prec = cpu_prec; 00110 qudaGaugeParam.cuda_prec = prec; 00111 qudaGaugeParam.gauge_order = gauge_order; 00112 qudaGaugeParam.type=QUDA_WILSON_LINKS; 00113 qudaGaugeParam.reconstruct = link_recon; 00114 /* 00115 qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE 00116 | QUDA_FAT_PRESERVE_GPU_GAUGE 00117 | QUDA_FAT_PRESERVE_COMM_MEM; 00118 */ 00119 qudaGaugeParam.preserve_gauge =0; 00120 void* fatlink; 00121 cudaMallocHost((void**)&fatlink, 4*V*gaugeSiteSize*gSize); 00122 if(fatlink == NULL){ 00123 errorQuda("ERROR: allocating fatlink failed\n"); 00124 } 00125 00126 void* sitelink[4]; 00127 for(int i=0;i < 4;i++){ 00128 cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize); 00129 if(sitelink[i] == NULL){ 00130 errorQuda("ERROR; allocate sitelink[%d] failed\n", i); 00131 } 00132 } 00133 00134 void* sitelink_ex[4]; 00135 for(int i=0;i < 4;i++){ 00136 cudaMallocHost((void**)&sitelink_ex[i], V_ex*gaugeSiteSize*gSize); 00137 if(sitelink_ex[i] == NULL){ 00138 errorQuda("ERROR; allocate sitelink_ex[%d] failed\n", i); 00139 } 00140 } 00141 00142 00143 void* milc_sitelink; 00144 milc_sitelink = (void*)malloc(4*V*gaugeSiteSize*gSize); 00145 if(milc_sitelink == NULL){ 00146 errorQuda("ERROR: allocating milc_sitelink failed\n"); 00147 } 00148 00149 void* milc_sitelink_ex; 00150 milc_sitelink_ex = (void*)malloc(4*V_ex*gaugeSiteSize*gSize); 00151 if(milc_sitelink_ex == NULL){ 00152 errorQuda("Error: allocating milc_sitelink failed\n"); 00153 } 00154 00155 00156 00157 createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1); 00158 00159 if(gauge_order == QUDA_MILC_GAUGE_ORDER){ 00160 for(int i=0; i<V; ++i){ 00161 for(int dir=0; dir<4; ++dir){ 00162 char* src = (char*)sitelink[dir]; 00163 memcpy((char*)milc_sitelink + (i*4 + dir)*gaugeSiteSize*gSize, src+i*gaugeSiteSize*gSize, gaugeSiteSize*gSize); 00164 } 00165 } 00166 } 00167 00168 00169 for(int i=0; i < V_ex; i++){ 00170 int sid = i; 00171 int oddBit=0; 00172 if(i >= Vh_ex){ 00173 sid = i - Vh_ex; 00174 oddBit = 1; 00175 } 00176 00177 int za = sid/E1h; 00178 int x1h = sid - za*E1h; 00179 int zb = za/E2; 00180 int x2 = za - zb*E2; 00181 int x4 = zb/E3; 00182 int x3 = zb - x4*E3; 00183 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 00184 int x1 = 2*x1h + x1odd; 00185 00186 00187 if( x1< 2 || x1 >= X1 +2 00188 || x2< 2 || x2 >= X2 +2 00189 || x3< 2 || x3 >= X3 +2 00190 || x4< 2 || x4 >= X4 +2){ 00191 #ifdef MULTI_GPU 00192 continue; 00193 #endif 00194 } 00195 00196 00197 00198 x1 = (x1 - 2 + X1) % X1; 00199 x2 = (x2 - 2 + X2) % X2; 00200 x3 = (x3 - 2 + X3) % X3; 00201 x4 = (x4 - 2 + X4) % X4; 00202 00203 int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1; 00204 if(oddBit){ 00205 idx += Vh; 00206 } 00207 for(int dir= 0; dir < 4; dir++){ 00208 char* src = (char*)sitelink[dir]; 00209 char* dst = (char*)sitelink_ex[dir]; 00210 memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize); 00211 00212 // milc ordering 00213 memcpy((char*)milc_sitelink_ex + (i*4 + dir)*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize); 00214 }//dir 00215 }//i 00216 00217 00218 double act_path_coeff[6]; 00219 for(int i=0;i < 6;i++){ 00220 act_path_coeff[i]= 0.1*i; 00221 } 00222 00223 00224 //only record the last call's performance 00225 //the first one is for creating the cpu/cuda data structures 00226 struct timeval t0, t1; 00227 00228 for(int i=0;i < 2;i++){ 00229 gettimeofday(&t0, NULL); 00230 if(gauge_order == QUDA_QDP_GAUGE_ORDER){ 00231 if(test == 0){ 00232 computeFatLinkQuda(fatlink, sitelink, act_path_coeff, &qudaGaugeParam, 00233 QUDA_COMPUTE_FAT_STANDARD); 00234 }else{ 00235 computeFatLinkQuda(fatlink, sitelink_ex, act_path_coeff, &qudaGaugeParam, 00236 QUDA_COMPUTE_FAT_EXTENDED_VOLUME); 00237 } 00238 }else if(gauge_order == QUDA_MILC_GAUGE_ORDER){ 00239 if(test == 0){ 00240 computeFatLinkQuda(fatlink, (void**)milc_sitelink, act_path_coeff, &qudaGaugeParam, 00241 QUDA_COMPUTE_FAT_STANDARD); 00242 }else{ 00243 computeFatLinkQuda(fatlink, (void**)milc_sitelink_ex, act_path_coeff, &qudaGaugeParam, 00244 QUDA_COMPUTE_FAT_EXTENDED_VOLUME); 00245 } 00246 } 00247 gettimeofday(&t1, NULL); 00248 } 00249 00250 double secs = TDIFF(t0,t1); 00251 00252 void* reflink[4]; 00253 for(int i=0;i < 4;i++){ 00254 reflink[i] = malloc(V*gaugeSiteSize*gSize); 00255 if(reflink[i] == NULL){ 00256 errorQuda("ERROR; allocate reflink[%d] failed\n", i); 00257 } 00258 } 00259 00260 if (verify_results){ 00261 00262 //FIXME: we have this compplication because references takes coeff as float/double 00263 // depending on the precision while the GPU code aways take coeff as double 00264 void* coeff; 00265 double coeff_dp[6]; 00266 float coeff_sp[6]; 00267 for(int i=0;i < 6;i++){ 00268 coeff_sp[i] = coeff_dp[i] = act_path_coeff[i]; 00269 } 00270 if(prec == QUDA_DOUBLE_PRECISION){ 00271 coeff = coeff_dp; 00272 }else{ 00273 coeff = coeff_sp; 00274 } 00275 #ifdef MULTI_GPU 00276 int optflag = 0; 00277 //we need x,y,z site links in the back and forward T slice 00278 // so it is 3*2*Vs_t 00279 for(int i=0;i < 4; i++){ 00280 ghost_sitelink[i] = malloc(8*Vs[i]*gaugeSiteSize*gSize); 00281 if (ghost_sitelink[i] == NULL){ 00282 printf("ERROR: malloc failed for ghost_sitelink[%d] \n",i); 00283 exit(1); 00284 } 00285 } 00286 00287 /* 00288 nu | | 00289 |_____| 00290 mu 00291 */ 00292 00293 for(int nu=0;nu < 4;nu++){ 00294 for(int mu=0; mu < 4;mu++){ 00295 if(nu == mu){ 00296 ghost_sitelink_diag[nu*4+mu] = NULL; 00297 }else{ 00298 //the other directions 00299 int dir1, dir2; 00300 for(dir1= 0; dir1 < 4; dir1++){ 00301 if(dir1 !=nu && dir1 != mu){ 00302 break; 00303 } 00304 } 00305 for(dir2=0; dir2 < 4; dir2++){ 00306 if(dir2 != nu && dir2 != mu && dir2 != dir1){ 00307 break; 00308 } 00309 } 00310 ghost_sitelink_diag[nu*4+mu] = malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*gSize); 00311 if(ghost_sitelink_diag[nu*4+mu] == NULL){ 00312 errorQuda("malloc failed for ghost_sitelink_diag\n"); 00313 } 00314 00315 memset(ghost_sitelink_diag[nu*4+mu], 0, Z[dir1]*Z[dir2]*gaugeSiteSize*gSize); 00316 } 00317 00318 } 00319 } 00320 00321 exchange_cpu_sitelink(qudaGaugeParam.X, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag); 00322 llfat_reference_mg(reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, coeff); 00323 #else 00324 llfat_reference(reflink, sitelink, qudaGaugeParam.cpu_prec, coeff); 00325 #endif 00326 00327 }//verify_results 00328 00329 //format change for fatlink 00330 void* myfatlink[4]; 00331 for(int i=0;i < 4;i++){ 00332 myfatlink[i] = malloc(V*gaugeSiteSize*gSize); 00333 if(myfatlink[i] == NULL){ 00334 printf("Error: malloc failed for myfatlink[%d]\n", i); 00335 exit(1); 00336 } 00337 memset(myfatlink[i], 0, V*gaugeSiteSize*gSize); 00338 } 00339 00340 for(int i=0;i < V; i++){ 00341 for(int dir=0; dir< 4; dir++){ 00342 char* src = ((char*)fatlink)+ (4*i+dir)*gaugeSiteSize*gSize; 00343 char* dst = ((char*)myfatlink[dir]) + i*gaugeSiteSize*gSize; 00344 memcpy(dst, src, gaugeSiteSize*gSize); 00345 } 00346 } 00347 00348 int res=1; 00349 for(int i=0;i < 4;i++){ 00350 res &= compare_floats(reflink[i], myfatlink[i], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec); 00351 } 00352 int accuracy_level; 00353 00354 accuracy_level = strong_check_link(myfatlink, "GPU results: ", 00355 reflink, "CPU reference results:", 00356 V, qudaGaugeParam.cpu_prec); 00357 00358 printfQuda("Test %s\n",(1 == res) ? "PASSED" : "FAILED"); 00359 int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3]; 00360 int flops= 61632; 00361 double perf = 1.0* flops*volume/(secs*1024*1024*1024); 00362 printfQuda("fatlink computation time =%.2f ms, flops= %.2f Gflops\n", secs*1000, perf); 00363 00364 00365 for(int i=0;i < 4;i++){ 00366 free(myfatlink[i]); 00367 } 00368 00369 if (res == 0){//failed 00370 printfQuda("\n"); 00371 printfQuda("Warning: your test failed. \n"); 00372 printfQuda(" Did you use --verify?\n"); 00373 printfQuda(" Did you check the GPU health by running cuda memtest?\n"); 00374 } 00375 00376 #ifdef MULTI_GPU 00377 if (verify_results){ 00378 int i; 00379 for(i=0;i < 4;i++){ 00380 free(ghost_sitelink[i]); 00381 } 00382 for(i=0;i <4; i++){ 00383 for(int j=0;j <4; j++){ 00384 if (i==j){ 00385 continue; 00386 } 00387 free(ghost_sitelink_diag[i*4+j]); 00388 } 00389 } 00390 } 00391 #endif 00392 00393 for(int i=0;i < 4; i++){ 00394 cudaFreeHost(sitelink[i]); 00395 cudaFreeHost(sitelink_ex[i]); 00396 free(reflink[i]); 00397 } 00398 cudaFreeHost(fatlink); 00399 if(milc_sitelink) free(milc_sitelink); 00400 if(milc_sitelink_ex) free(milc_sitelink_ex); 00401 #ifdef MULTI_GPU 00402 exchange_llfat_cleanup(); 00403 #endif 00404 endQuda(); 00405 00406 return accuracy_level; 00407 00408 } 00409 00410 static void 00411 display_test_info(int test) 00412 { 00413 printfQuda("running the following test:\n"); 00414 00415 printfQuda("link_precision link_reconstruct space_dimension T_dimension Test Ordering\n"); 00416 printfQuda("%s %s %d/%d/%d/ %d %d %s \n", 00417 get_prec_str(prec), 00418 get_recon_str(link_recon), 00419 xdim, ydim, zdim, tdim, test, 00420 get_gauge_order_str(gauge_order)); 00421 00422 #ifdef MULTI_GPU 00423 printfQuda("Grid partition info: X Y Z T\n"); 00424 printfQuda(" %d %d %d %d\n", 00425 commDimPartitioned(0), 00426 commDimPartitioned(1), 00427 commDimPartitioned(2), 00428 commDimPartitioned(3)); 00429 #endif 00430 00431 return ; 00432 00433 } 00434 00435 void 00436 usage_extra(char** argv ) 00437 { 00438 printfQuda("Extra options:\n"); 00439 printfQuda(" --test <0/1> # Test method\n"); 00440 printfQuda(" 0: standard method\n"); 00441 printfQuda(" 1: extended volume method\n"); 00442 printfQuda(" --verify # Verify the GPU results using CPU results\n"); 00443 printfQuda(" --gauge-order <qdp/milc> # ordering of the input gauge-field\n"); 00444 return ; 00445 } 00446 00447 int 00448 main(int argc, char **argv) 00449 { 00450 00451 int test = 0; 00452 00453 //default to 18 reconstruct, 8^3 x 8 00454 link_recon = QUDA_RECONSTRUCT_NO; 00455 xdim=ydim=zdim=tdim=8; 00456 cpu_prec = prec = QUDA_DOUBLE_PRECISION; 00457 00458 int i; 00459 for (i =1;i < argc; i++){ 00460 00461 if(process_command_line_option(argc, argv, &i) == 0){ 00462 continue; 00463 } 00464 00465 00466 if( strcmp(argv[i], "--test") == 0){ 00467 if (i+1 >= argc){ 00468 usage(argv); 00469 } 00470 test = atoi(argv[i+1]); 00471 i++; 00472 continue; 00473 } 00474 00475 00476 if( strcmp(argv[i], "--gauge-order") == 0){ 00477 if(i+1 >= argc){ 00478 usage(argv); 00479 } 00480 00481 if(strcmp(argv[i+1], "milc") == 0){ 00482 gauge_order = QUDA_MILC_GAUGE_ORDER; 00483 }else if(strcmp(argv[i+1], "qdp") == 0){ 00484 gauge_order = QUDA_QDP_GAUGE_ORDER; 00485 }else{ 00486 fprintf(stderr, "Error: unsupported gauge-field order\n"); 00487 exit(1); 00488 } 00489 i++; 00490 continue; 00491 } 00492 00493 00494 if( strcmp(argv[i], "--verify") == 0){ 00495 verify_results=1; 00496 continue; 00497 } 00498 00499 fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]); 00500 usage(argv); 00501 } 00502 00503 #ifdef MULTI_GPU 00504 if(gauge_order == QUDA_MILC_GAUGE_ORDER && test == 0){ 00505 errorQuda("ERROR: milc format for multi-gpu with test0 is not supported yet!\n"); 00506 } 00507 #endif 00508 00509 00510 initCommsQuda(argc, argv, gridsize_from_cmdline, 4); 00511 00512 00513 display_test_info(test); 00514 00515 00516 int accuracy_level = llfat_test(test); 00517 00518 printfQuda("accuracy_level=%d\n", accuracy_level); 00519 00520 endCommsQuda(); 00521 00522 int ret; 00523 if(accuracy_level >=3 ){ 00524 ret = 0; 00525 }else{ 00526 ret = 1; //we delclare the test failed 00527 } 00528 00529 return ret; 00530 } 00531 00532