QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 #include <string.h> 00004 00005 #include <quda.h> 00006 #include <test_util.h> 00007 #include <gauge_field.h> 00008 #include "misc.h" 00009 #include "gauge_force_reference.h" 00010 #include "gauge_force_quda.h" 00011 #include <sys/time.h> 00012 #include "fat_force_quda.h" 00013 00014 #ifdef MULTI_GPU 00015 #include <face_quda.h> 00016 #endif 00017 00018 #define GPU_DIRECT 00019 extern void initCommonConstants(const LatticeField &lat); 00020 00021 extern int device; 00022 00023 static QudaGaugeParam qudaGaugeParam; 00024 QudaGaugeFieldOrder gauge_order = QUDA_QDP_GAUGE_ORDER; 00025 static int verify_results = 0; 00026 extern int tdim; 00027 extern QudaPrecision prec; 00028 extern int xdim; 00029 extern int ydim; 00030 extern int zdim; 00031 extern int tdim; 00032 extern void usage(char** argv); 00033 int attempts = 1; 00034 00035 int Z[4]; 00036 int V; 00037 int Vh; 00038 00039 int V_ex; 00040 int Vh_ex; 00041 00042 static int X1, X1h, X2, X3, X4; 00043 static int E1, E1h, E2, E3, E4; 00044 00045 int E[4]; 00046 00047 extern QudaReconstructType link_recon; 00048 QudaPrecision link_prec = QUDA_SINGLE_PRECISION; 00049 00050 extern int gridsize_from_cmdline[]; 00051 00052 00053 int length[]={ 00054 3, 00055 3, 00056 3, 00057 3, 00058 3, 00059 3, 00060 5, 00061 5, 00062 5, 00063 5, 00064 5, 00065 5, 00066 5, 00067 5, 00068 5, 00069 5, 00070 5, 00071 5, 00072 5, 00073 5, 00074 5, 00075 5, 00076 5, 00077 5, 00078 5, 00079 5, 00080 5, 00081 5, 00082 5, 00083 5, 00084 5, 00085 5, 00086 5, 00087 5, 00088 5, 00089 5, 00090 5, 00091 5, 00092 5, 00093 5, 00094 5, 00095 5, 00096 5, 00097 5, 00098 5, 00099 5, 00100 5, 00101 5, 00102 }; 00103 00104 00105 float loop_coeff_f[]={ 00106 1.1, 00107 1.2, 00108 1.3, 00109 1.4, 00110 1.5, 00111 1.6, 00112 2.5, 00113 2.6, 00114 2.7, 00115 2.8, 00116 2.9, 00117 3.0, 00118 3.1, 00119 3.2, 00120 3.3, 00121 3.4, 00122 3.5, 00123 3.6, 00124 3.7, 00125 3.8, 00126 3.9, 00127 4.0, 00128 4.1, 00129 4.2, 00130 4.3, 00131 4.4, 00132 4.5, 00133 4.6, 00134 4.7, 00135 4.8, 00136 4.9, 00137 5.0, 00138 5.1, 00139 5.2, 00140 5.3, 00141 5.4, 00142 5.5, 00143 5.6, 00144 5.7, 00145 5.8, 00146 5.9, 00147 5.0, 00148 6.1, 00149 6.2, 00150 6.3, 00151 6.4, 00152 6.5, 00153 6.6, 00154 }; 00155 00156 int path_dir_x[][5] = { 00157 {1, 7, 6 }, 00158 {6, 7, 1 }, 00159 {2, 7, 5 }, 00160 {5, 7, 2 }, 00161 {3, 7, 4 }, 00162 {4, 7, 3 }, 00163 {0, 1, 7, 7, 6 }, 00164 {1, 7, 7, 6, 0 }, 00165 {6, 7, 7, 1, 0 }, 00166 {0, 6, 7, 7, 1 }, 00167 {0, 2, 7, 7, 5 }, 00168 {2, 7, 7, 5, 0 }, 00169 {5, 7, 7, 2, 0 }, 00170 {0, 5, 7, 7, 2 }, 00171 {0, 3, 7, 7, 4 }, 00172 {3, 7, 7, 4, 0 }, 00173 {4, 7, 7, 3, 0 }, 00174 {0, 4, 7, 7, 3 }, 00175 {6, 6, 7, 1, 1 }, 00176 {1, 1, 7, 6, 6 }, 00177 {5, 5, 7, 2, 2 }, 00178 {2, 2, 7, 5, 5 }, 00179 {4, 4, 7, 3, 3 }, 00180 {3, 3, 7, 4, 4 }, 00181 {1, 2, 7, 6, 5 }, 00182 {5, 6, 7, 2, 1 }, 00183 {1, 5, 7, 6, 2 }, 00184 {2, 6, 7, 5, 1 }, 00185 {6, 2, 7, 1, 5 }, 00186 {5, 1, 7, 2, 6 }, 00187 {6, 5, 7, 1, 2 }, 00188 {2, 1, 7, 5, 6 }, 00189 {1, 3, 7, 6, 4 }, 00190 {4, 6, 7, 3, 1 }, 00191 {1, 4, 7, 6, 3 }, 00192 {3, 6, 7, 4, 1 }, 00193 {6, 3, 7, 1, 4 }, 00194 {4, 1, 7, 3, 6 }, 00195 {6, 4, 7, 1, 3 }, 00196 {3, 1, 7, 4, 6 }, 00197 {2, 3, 7, 5, 4 }, 00198 {4, 5, 7, 3, 2 }, 00199 {2, 4, 7, 5, 3 }, 00200 {3, 5, 7, 4, 2 }, 00201 {5, 3, 7, 2, 4 }, 00202 {4, 2, 7, 3, 5 }, 00203 {5, 4, 7, 2, 3 }, 00204 {3, 2, 7, 4, 5 }, 00205 }; 00206 00207 00208 int path_dir_y[][5] = { 00209 { 2 ,6 ,5 }, 00210 { 5 ,6 ,2 }, 00211 { 3 ,6 ,4 }, 00212 { 4 ,6 ,3 }, 00213 { 0 ,6 ,7 }, 00214 { 7 ,6 ,0 }, 00215 { 1 ,2 ,6 ,6 ,5 }, 00216 { 2 ,6 ,6 ,5 ,1 }, 00217 { 5 ,6 ,6 ,2 ,1 }, 00218 { 1 ,5 ,6 ,6 ,2 }, 00219 { 1 ,3 ,6 ,6 ,4 }, 00220 { 3 ,6 ,6 ,4 ,1 }, 00221 { 4 ,6 ,6 ,3 ,1 }, 00222 { 1 ,4 ,6 ,6 ,3 }, 00223 { 1 ,0 ,6 ,6 ,7 }, 00224 { 0 ,6 ,6 ,7 ,1 }, 00225 { 7 ,6 ,6 ,0 ,1 }, 00226 { 1 ,7 ,6 ,6 ,0 }, 00227 { 5 ,5 ,6 ,2 ,2 }, 00228 { 2 ,2 ,6 ,5 ,5 }, 00229 { 4 ,4 ,6 ,3 ,3 }, 00230 { 3 ,3 ,6 ,4 ,4 }, 00231 { 7 ,7 ,6 ,0 ,0 }, 00232 { 0 ,0 ,6 ,7 ,7 }, 00233 { 2 ,3 ,6 ,5 ,4 }, 00234 { 4 ,5 ,6 ,3 ,2 }, 00235 { 2 ,4 ,6 ,5 ,3 }, 00236 { 3 ,5 ,6 ,4 ,2 }, 00237 { 5 ,3 ,6 ,2 ,4 }, 00238 { 4 ,2 ,6 ,3 ,5 }, 00239 { 5 ,4 ,6 ,2 ,3 }, 00240 { 3 ,2 ,6 ,4 ,5 }, 00241 { 2 ,0 ,6 ,5 ,7 }, 00242 { 7 ,5 ,6 ,0 ,2 }, 00243 { 2 ,7 ,6 ,5 ,0 }, 00244 { 0 ,5 ,6 ,7 ,2 }, 00245 { 5 ,0 ,6 ,2 ,7 }, 00246 { 7 ,2 ,6 ,0 ,5 }, 00247 { 5 ,7 ,6 ,2 ,0 }, 00248 { 0 ,2 ,6 ,7 ,5 }, 00249 { 3 ,0 ,6 ,4 ,7 }, 00250 { 7 ,4 ,6 ,0 ,3 }, 00251 { 3 ,7 ,6 ,4 ,0 }, 00252 { 0 ,4 ,6 ,7 ,3 }, 00253 { 4 ,0 ,6 ,3 ,7 }, 00254 { 7 ,3 ,6 ,0 ,4 }, 00255 { 4 ,7 ,6 ,3 ,0 }, 00256 { 0 ,3 ,6 ,7 ,4 } 00257 }; 00258 00259 int path_dir_z[][5] = { 00260 { 3 ,5 ,4 }, 00261 { 4 ,5 ,3 }, 00262 { 0 ,5 ,7 }, 00263 { 7 ,5 ,0 }, 00264 { 1 ,5 ,6 }, 00265 { 6 ,5 ,1 }, 00266 { 2 ,3 ,5 ,5 ,4 }, 00267 { 3 ,5 ,5 ,4 ,2 }, 00268 { 4 ,5 ,5 ,3 ,2 }, 00269 { 2 ,4 ,5 ,5 ,3 }, 00270 { 2 ,0 ,5 ,5 ,7 }, 00271 { 0 ,5 ,5 ,7 ,2 }, 00272 { 7 ,5 ,5 ,0 ,2 }, 00273 { 2 ,7 ,5 ,5 ,0 }, 00274 { 2 ,1 ,5 ,5 ,6 }, 00275 { 1 ,5 ,5 ,6 ,2 }, 00276 { 6 ,5 ,5 ,1 ,2 }, 00277 { 2 ,6 ,5 ,5 ,1 }, 00278 { 4 ,4 ,5 ,3 ,3 }, 00279 { 3 ,3 ,5 ,4 ,4 }, 00280 { 7 ,7 ,5 ,0 ,0 }, 00281 { 0 ,0 ,5 ,7 ,7 }, 00282 { 6 ,6 ,5 ,1 ,1 }, 00283 { 1 ,1 ,5 ,6 ,6 }, 00284 { 3 ,0 ,5 ,4 ,7 }, 00285 { 7 ,4 ,5 ,0 ,3 }, 00286 { 3 ,7 ,5 ,4 ,0 }, 00287 { 0 ,4 ,5 ,7 ,3 }, 00288 { 4 ,0 ,5 ,3 ,7 }, 00289 { 7 ,3 ,5 ,0 ,4 }, 00290 { 4 ,7 ,5 ,3 ,0 }, 00291 { 0 ,3 ,5 ,7 ,4 }, 00292 { 3 ,1 ,5 ,4 ,6 }, 00293 { 6 ,4 ,5 ,1 ,3 }, 00294 { 3 ,6 ,5 ,4 ,1 }, 00295 { 1 ,4 ,5 ,6 ,3 }, 00296 { 4 ,1 ,5 ,3 ,6 }, 00297 { 6 ,3 ,5 ,1 ,4 }, 00298 { 4 ,6 ,5 ,3 ,1 }, 00299 { 1 ,3 ,5 ,6 ,4 }, 00300 { 0 ,1 ,5 ,7 ,6 }, 00301 { 6 ,7 ,5 ,1 ,0 }, 00302 { 0 ,6 ,5 ,7 ,1 }, 00303 { 1 ,7 ,5 ,6 ,0 }, 00304 { 7 ,1 ,5 ,0 ,6 }, 00305 { 6 ,0 ,5 ,1 ,7 }, 00306 { 7 ,6 ,5 ,0 ,1 }, 00307 { 1 ,0 ,5 ,6 ,7 } 00308 }; 00309 00310 int path_dir_t[][5] = { 00311 { 0 ,4 ,7 }, 00312 { 7 ,4 ,0 }, 00313 { 1 ,4 ,6 }, 00314 { 6 ,4 ,1 }, 00315 { 2 ,4 ,5 }, 00316 { 5 ,4 ,2 }, 00317 { 3 ,0 ,4 ,4 ,7 }, 00318 { 0 ,4 ,4 ,7 ,3 }, 00319 { 7 ,4 ,4 ,0 ,3 }, 00320 { 3 ,7 ,4 ,4 ,0 }, 00321 { 3 ,1 ,4 ,4 ,6 }, 00322 { 1 ,4 ,4 ,6 ,3 }, 00323 { 6 ,4 ,4 ,1 ,3 }, 00324 { 3 ,6 ,4 ,4 ,1 }, 00325 { 3 ,2 ,4 ,4 ,5 }, 00326 { 2 ,4 ,4 ,5 ,3 }, 00327 { 5 ,4 ,4 ,2 ,3 }, 00328 { 3 ,5 ,4 ,4 ,2 }, 00329 { 7 ,7 ,4 ,0 ,0 }, 00330 { 0 ,0 ,4 ,7 ,7 }, 00331 { 6 ,6 ,4 ,1 ,1 }, 00332 { 1 ,1 ,4 ,6 ,6 }, 00333 { 5 ,5 ,4 ,2 ,2 }, 00334 { 2 ,2 ,4 ,5 ,5 }, 00335 { 0 ,1 ,4 ,7 ,6 }, 00336 { 6 ,7 ,4 ,1 ,0 }, 00337 { 0 ,6 ,4 ,7 ,1 }, 00338 { 1 ,7 ,4 ,6 ,0 }, 00339 { 7 ,1 ,4 ,0 ,6 }, 00340 { 6 ,0 ,4 ,1 ,7 }, 00341 { 7 ,6 ,4 ,0 ,1 }, 00342 { 1 ,0 ,4 ,6 ,7 }, 00343 { 0 ,2 ,4 ,7 ,5 }, 00344 { 5 ,7 ,4 ,2 ,0 }, 00345 { 0 ,5 ,4 ,7 ,2 }, 00346 { 2 ,7 ,4 ,5 ,0 }, 00347 { 7 ,2 ,4 ,0 ,5 }, 00348 { 5 ,0 ,4 ,2 ,7 }, 00349 { 7 ,5 ,4 ,0 ,2 }, 00350 { 2 ,0 ,4 ,5 ,7 }, 00351 { 1 ,2 ,4 ,6 ,5 }, 00352 { 5 ,6 ,4 ,2 ,1 }, 00353 { 1 ,5 ,4 ,6 ,2 }, 00354 { 2 ,6 ,4 ,5 ,1 }, 00355 { 6 ,2 ,4 ,1 ,5 }, 00356 { 5 ,1 ,4 ,2 ,6 }, 00357 { 6 ,5 ,4 ,1 ,2 }, 00358 { 2 ,1 ,4 ,5 ,6 } 00359 }; 00360 00361 00362 00363 void 00364 setDims(int *X) { 00365 V = 1; 00366 for (int d=0; d< 4; d++) { 00367 V *= X[d]; 00368 Z[d] = X[d]; 00369 } 00370 Vh = V/2; 00371 00372 V_ex = 1; 00373 for (int d=0; d< 4; d++) { 00374 V_ex *= X[d]+4; 00375 } 00376 Vh_ex = V_ex/2; 00377 00378 X1=X[0]; X2 = X[1]; X3=X[2]; X4=X[3]; 00379 X1h=X1/2; 00380 E1=X1+4; E2=X2+4; E3=X3+4; E4=X4+4; 00381 E1h=E1/2; 00382 00383 E[0] = E1; 00384 E[1] = E2; 00385 E[2] = E3; 00386 E[3] = E4; 00387 } 00388 00389 00390 00391 static int 00392 gauge_force_test(void) 00393 { 00394 int max_length = 6; 00395 00396 initQuda(device); 00397 00398 qudaGaugeParam = newQudaGaugeParam(); 00399 00400 qudaGaugeParam.X[0] = xdim; 00401 qudaGaugeParam.X[1] = ydim; 00402 qudaGaugeParam.X[2] = zdim; 00403 qudaGaugeParam.X[3] = tdim; 00404 00405 setDims(qudaGaugeParam.X); 00406 00407 qudaGaugeParam.anisotropy = 1.0; 00408 qudaGaugeParam.cpu_prec = link_prec; 00409 qudaGaugeParam.cuda_prec = link_prec; 00410 qudaGaugeParam.reconstruct = link_recon; 00411 qudaGaugeParam.type = QUDA_WILSON_LINKS; // in this context, just means these are site links 00412 00413 qudaGaugeParam.gauge_order = gauge_order; 00414 00415 int gSize = qudaGaugeParam.cpu_prec; 00416 00417 void* sitelink; 00418 void* sitelink_1d; 00419 00420 #ifdef GPU_DIRECT 00421 cudaMallocHost(&sitelink_1d, 4*V*gaugeSiteSize*gSize); 00422 #else 00423 sitelink_1d= malloc(4*V*gaugeSiteSize*gSize); 00424 #endif 00425 if(sitelink_1d == NULL){ 00426 printf("ERROR: malloc failed for sitelink_1d\n"); 00427 exit(1); 00428 } 00429 00430 // this is a hack to have site link generated in 2d 00431 // then copied to 1d array in "MILC" format 00432 void* sitelink_2d[4]; 00433 for(int i=0;i < 4;i++){ 00434 #ifdef GPU_DIRECT 00435 cudaMallocHost(&sitelink_2d[i], V*gaugeSiteSize*qudaGaugeParam.cpu_prec); 00436 #else 00437 sitelink_2d[i] = malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec); 00438 #endif 00439 } 00440 00441 // fills the gauge field with random numbers 00442 createSiteLinkCPU(sitelink_2d, qudaGaugeParam.cpu_prec, 0); 00443 00444 //copy the 2d sitelink to 1d milc format 00445 00446 for(int dir = 0; dir < 4; dir++){ 00447 for(int i=0; i < V; i++){ 00448 char* src = ((char*)sitelink_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec; 00449 char* dst = ((char*)sitelink_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ; 00450 memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec); 00451 } 00452 } 00453 if (qudaGaugeParam.gauge_order == QUDA_MILC_GAUGE_ORDER){ 00454 sitelink = sitelink_1d; 00455 }else{ //QUDA_QDP_GAUGE_ORDER 00456 sitelink = (void**)sitelink_2d; 00457 } 00458 00459 #ifdef MULTI_GPU 00460 void* sitelink_ex; 00461 void* sitelink_ex_2d[4]; 00462 void* sitelink_ex_1d; 00463 00464 cudaMallocHost((void**)&sitelink_ex_1d, 4*V_ex*gaugeSiteSize*gSize); 00465 for(int i=0;i < 4;i++){ 00466 cudaMallocHost((void**)&sitelink_ex_2d[i], V_ex*gaugeSiteSize*gSize); 00467 if(sitelink_ex_2d[i] == NULL){ 00468 errorQuda("ERROR; allocate sitelink_ex[%d] failed\n", i); 00469 } 00470 } 00471 00472 for(int i=0; i < V_ex; i++){ 00473 int sid = i; 00474 int oddBit=0; 00475 if(i >= Vh_ex){ 00476 sid = i - Vh_ex; 00477 oddBit = 1; 00478 } 00479 00480 int za = sid/E1h; 00481 int x1h = sid - za*E1h; 00482 int zb = za/E2; 00483 int x2 = za - zb*E2; 00484 int x4 = zb/E3; 00485 int x3 = zb - x4*E3; 00486 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 00487 int x1 = 2*x1h + x1odd; 00488 00489 00490 if( x1< 2 || x1 >= X1 +2 00491 || x2< 2 || x2 >= X2 +2 00492 || x3< 2 || x3 >= X3 +2 00493 || x4< 2 || x4 >= X4 +2){ 00494 continue; 00495 } 00496 00497 00498 00499 x1 = (x1 - 2 + X1) % X1; 00500 x2 = (x2 - 2 + X2) % X2; 00501 x3 = (x3 - 2 + X3) % X3; 00502 x4 = (x4 - 2 + X4) % X4; 00503 00504 int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1; 00505 if(oddBit){ 00506 idx += Vh; 00507 } 00508 for(int dir= 0; dir < 4; dir++){ 00509 char* src = (char*)sitelink_2d[dir]; 00510 char* dst = (char*)sitelink_ex_2d[dir]; 00511 memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize); 00512 }//dir 00513 }//i 00514 00515 00516 for(int dir = 0; dir < 4; dir++){ 00517 for(int i=0; i < V_ex; i++){ 00518 char* src = ((char*)sitelink_ex_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec; 00519 char* dst = ((char*)sitelink_ex_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ; 00520 memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec); 00521 } 00522 } 00523 00524 if(qudaGaugeParam.gauge_order == QUDA_QDP_GAUGE_ORDER){ 00525 sitelink_ex = sitelink_ex_2d; 00526 }else{ 00527 sitelink_ex = sitelink_ex_1d; 00528 } 00529 00530 #endif 00531 00532 00533 00534 void* mom = malloc(4*V*momSiteSize*gSize); 00535 void* refmom = malloc(4*V*momSiteSize*gSize); 00536 if(mom == NULL || refmom == NULL){ 00537 printf("ERROR: malloc failed for mom/refmom\n"); 00538 exit(1); 00539 } 00540 memset(mom, 0, 4*V*momSiteSize*gSize); 00541 //initiaze some data in cpuMom 00542 createMomCPU(mom, qudaGaugeParam.cpu_prec); 00543 memcpy(refmom, mom, 4*V*momSiteSize*qudaGaugeParam.cpu_prec); 00544 00545 00546 double loop_coeff_d[sizeof(loop_coeff_f)/sizeof(float)]; 00547 for(unsigned int i=0;i < sizeof(loop_coeff_f)/sizeof(float); i++){ 00548 loop_coeff_d[i] = loop_coeff_f[i]; 00549 } 00550 00551 void* loop_coeff; 00552 if(qudaGaugeParam.cuda_prec == QUDA_SINGLE_PRECISION){ 00553 loop_coeff = (void*)&loop_coeff_f[0]; 00554 }else{ 00555 loop_coeff = loop_coeff_d; 00556 } 00557 double eb3 = 0.3; 00558 int num_paths = sizeof(path_dir_x)/sizeof(path_dir_x[0]); 00559 00560 int** input_path_buf[4]; 00561 for(int dir =0; dir < 4; dir++){ 00562 input_path_buf[dir] = (int**)malloc(num_paths*sizeof(int*)); 00563 if (input_path_buf[dir] == NULL){ 00564 printf("ERORR: malloc failed for input path\n"); 00565 exit(1); 00566 } 00567 00568 for(int i=0;i < num_paths;i++){ 00569 input_path_buf[dir][i] = (int*)malloc(length[i]*sizeof(int)); 00570 if (input_path_buf[dir][i] == NULL){ 00571 printf("ERROR: malloc failed for input_path_buf[dir][%d]\n", i); 00572 exit(1); 00573 } 00574 if(dir == 0) memcpy(input_path_buf[dir][i], path_dir_x[i], length[i]*sizeof(int)); 00575 else if(dir ==1) memcpy(input_path_buf[dir][i], path_dir_y[i], length[i]*sizeof(int)); 00576 else if(dir ==2) memcpy(input_path_buf[dir][i], path_dir_z[i], length[i]*sizeof(int)); 00577 else if(dir ==3) memcpy(input_path_buf[dir][i], path_dir_t[i], length[i]*sizeof(int)); 00578 } 00579 } 00580 00581 00582 struct timeval t0, t1; 00583 double timeinfo[3]; 00584 /* Multiple execution to exclude warmup time in the first run*/ 00585 for (int i =0;i < attempts; i++){ 00586 gettimeofday(&t0, NULL); 00587 #ifdef MULTI_GPU 00588 computeGaugeForceQuda(mom, sitelink_ex, input_path_buf, length, 00589 loop_coeff, num_paths, max_length, eb3, 00590 &qudaGaugeParam, timeinfo); 00591 00592 #else 00593 computeGaugeForceQuda(mom, sitelink, input_path_buf, length, 00594 loop_coeff, num_paths, max_length, eb3, 00595 &qudaGaugeParam, timeinfo); 00596 #endif 00597 gettimeofday(&t1, NULL); 00598 } 00599 00600 double total_time = t1.tv_sec - t0.tv_sec + 0.000001*(t1.tv_usec - t0.tv_usec); 00601 //The number comes from CPU implementation in MILC, gauge_force_imp.c 00602 int flops=153004; 00603 00604 if (verify_results){ 00605 for(int i = 0;i < attempts;i++){ 00606 #ifdef MULTI_GPU 00607 //last arg=0 means no optimization for communication, i.e. exchange data in all directions 00608 //even they are not partitioned 00609 exchange_cpu_sitelink_ex(qudaGaugeParam.X, (void**)sitelink_ex_2d, 00610 QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0); 00611 gauge_force_reference(refmom, eb3, sitelink_2d, sitelink_ex_2d, qudaGaugeParam.cpu_prec, 00612 input_path_buf, length, loop_coeff, num_paths); 00613 #else 00614 gauge_force_reference(refmom, eb3, sitelink_2d, NULL, qudaGaugeParam.cpu_prec, 00615 input_path_buf, length, loop_coeff, num_paths); 00616 #endif 00617 } 00618 } 00619 00620 int res; 00621 res = compare_floats(mom, refmom, 4*V*momSiteSize, 1e-3, qudaGaugeParam.cpu_prec); 00622 00623 int accuracy_level; 00624 accuracy_level = strong_check_mom(mom, refmom, 4*V, qudaGaugeParam.cpu_prec); 00625 00626 printf("Test %s\n",(1 == res) ? "PASSED" : "FAILED"); 00627 00628 double perf = 1.0* flops*V/(total_time*1e+9); 00629 double kernel_perf = 1.0*flops*V/(timeinfo[1]*1e+9); 00630 printf("init and cpu->gpu time: %.2f ms, kernel time: %.2f ms, gpu->cpu and cleanup time: %.2f total time =%.2f ms\n", 00631 timeinfo[0]*1e+3, timeinfo[1]*1e+3, timeinfo[2]*1e+3, total_time*1e+3); 00632 printf("kernel performance: %.2f GFLOPS, overall performance : %.2f GFOPS\n", kernel_perf, perf); 00633 00634 for(int dir = 0; dir < 4; dir++){ 00635 for(int i=0;i < num_paths; i++){ 00636 free(input_path_buf[dir][i]); 00637 } 00638 free(input_path_buf[dir]); 00639 } 00640 00641 #ifdef GPU_DIRECT 00642 cudaFreeHost(sitelink_1d); 00643 #else 00644 free(sitelink_1d); 00645 #endif 00646 for(int dir=0;dir < 4;dir++){ 00647 #ifdef GPU_DIRECT 00648 cudaFreeHost(sitelink_2d[dir]); 00649 #else 00650 free(sitelink_2d[dir]); 00651 #endif 00652 } 00653 00654 #ifdef MULTI_GPU 00655 cudaFreeHost(sitelink_ex_1d); 00656 for(int dir=0; dir < 4; dir++){ 00657 cudaFreeHost(sitelink_ex_2d[dir]); 00658 } 00659 #endif 00660 00661 00662 free(mom); 00663 free(refmom); 00664 endQuda(); 00665 00666 if (res == 0){//failed 00667 printf("\n"); 00668 printf("Warning: you test failed. \n"); 00669 printf(" Did you use --verify?\n"); 00670 printf(" Did you check the GPU health by running cuda memtest?\n"); 00671 } 00672 00673 return accuracy_level; 00674 } 00675 00676 00677 static void 00678 display_test_info() 00679 { 00680 printf("running the following test:\n"); 00681 00682 printf("link_precision link_reconstruct space_dim(x/y/z) T_dimension Gauge_order Attempts\n"); 00683 printf("%s %s %d/%d/%d %d %s %d\n", 00684 get_prec_str(link_prec), 00685 get_recon_str(link_recon), 00686 xdim,ydim,zdim, tdim, 00687 get_gauge_order_str(gauge_order), 00688 attempts); 00689 return ; 00690 00691 } 00692 00693 void 00694 usage_extra(char** argv ) 00695 { 00696 printf("Extra options:\n"); 00697 printf(" --gauge-order <qdp/milc> # Gauge storing order in CPU\n"); 00698 printf(" --attempts <n> # Number of tests\n"); 00699 printf(" --verify # Verify the GPU results using CPU results\n"); 00700 return ; 00701 } 00702 00703 int 00704 main(int argc, char **argv) 00705 { 00706 int i; 00707 for (i =1;i < argc; i++){ 00708 00709 if(process_command_line_option(argc, argv, &i) == 0){ 00710 continue; 00711 } 00712 00713 if( strcmp(argv[i], "--gauge-order") == 0){ 00714 if(i+1 >= argc){ 00715 usage(argv); 00716 } 00717 00718 if(strcmp(argv[i+1], "milc") == 0){ 00719 gauge_order = QUDA_MILC_GAUGE_ORDER; 00720 }else if(strcmp(argv[i+1], "qdp") == 0){ 00721 gauge_order = QUDA_QDP_GAUGE_ORDER; 00722 }else{ 00723 fprintf(stderr, "Error: unsupported gauge-field order\n"); 00724 exit(1); 00725 } 00726 i++; 00727 continue; 00728 } 00729 if( strcmp(argv[i], "--attempts") == 0){ 00730 if(i+1 >= argc){ 00731 usage(argv); 00732 } 00733 00734 attempts = atoi(argv[i+1]); 00735 if(attempts <= 0){ 00736 printf("ERROR: invalid number of attempts(%d)\n", attempts); 00737 } 00738 i++; 00739 continue; 00740 } 00741 00742 if( strcmp(argv[i], "--verify") == 0){ 00743 verify_results=1; 00744 continue; 00745 } 00746 00747 fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]); 00748 usage(argv); 00749 } 00750 00751 00752 link_prec = prec; 00753 #ifdef MULTI_GPU 00754 initCommsQuda(argc, argv, gridsize_from_cmdline, 4); 00755 #endif 00756 00757 display_test_info(); 00758 00759 int accuracy_level = gauge_force_test(); 00760 printfQuda("accuracy_level=%d\n", accuracy_level); 00761 00762 #ifdef MULTI_GPU 00763 endCommsQuda(); 00764 #endif 00765 00766 int ret; 00767 if(accuracy_level >=3 ){ 00768 ret = 0; 00769 }else{ 00770 ret = 1; //we delclare the test failed 00771 } 00772 00773 return ret; 00774 }