QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 00004 #include <quda_internal.h> 00005 #include <color_spinor_field.h> 00006 #include <blas_quda.h> 00007 00008 #include <test_util.h> 00009 00010 00011 // volume per GPU (full lattice dimensions) 00012 const int LX = 16; 00013 const int LY = 16; 00014 const int LZ = 16; 00015 const int LT = 16; 00016 const int Nspin = 4; 00017 00018 // corresponds to 1 iterations for V=16^4, Nspin = 4, at half precision 00019 const int Niter = max(1, 1 * (16*16*16*16*4) / (LX * LY * LZ * LT * Nspin)); 00020 00021 const int Nkernels = 31; 00022 00023 cpuColorSpinorField *xH, *yH, *zH, *wH, *vH, *hH, *lH; 00024 cudaColorSpinorField *xD, *yD, *zD, *wD, *vD, *hD, *lD; 00025 00026 void setPrec(ColorSpinorParam ¶m, const QudaPrecision precision) 00027 { 00028 param.precision = precision; 00029 if (Nspin == 1 || precision == QUDA_DOUBLE_PRECISION) { 00030 param.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; 00031 } else { 00032 param.fieldOrder = QUDA_FLOAT4_FIELD_ORDER; 00033 } 00034 } 00035 00036 void initFields(int prec) 00037 { 00038 // precisions used for the source field in the copyCuda() benchmark 00039 QudaPrecision high_aux_prec; 00040 QudaPrecision low_aux_prec; 00041 00042 ColorSpinorParam param; 00043 param.fieldLocation = QUDA_CPU_FIELD_LOCATION; 00044 param.nColor = 3; 00045 param.nSpin = Nspin; // =1 for staggered, =2 for coarse Dslash, =4 for 4d spinor 00046 param.nDim = 4; // number of spacetime dimensions 00047 00048 param.pad = 0; // padding must be zero for cpu fields 00049 param.siteSubset = QUDA_PARITY_SITE_SUBSET; 00050 if (param.siteSubset == QUDA_PARITY_SITE_SUBSET) param.x[0] = LX/2; 00051 else param.x[0] = LX; 00052 param.x[1] = LY; 00053 param.x[2] = LZ; 00054 param.x[3] = LT; 00055 00056 param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; 00057 param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; 00058 param.precision = QUDA_DOUBLE_PRECISION; 00059 param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; 00060 00061 param.create = QUDA_ZERO_FIELD_CREATE; 00062 00063 vH = new cpuColorSpinorField(param); 00064 wH = new cpuColorSpinorField(param); 00065 xH = new cpuColorSpinorField(param); 00066 yH = new cpuColorSpinorField(param); 00067 zH = new cpuColorSpinorField(param); 00068 hH = new cpuColorSpinorField(param); 00069 lH = new cpuColorSpinorField(param); 00070 00071 vH->Source(QUDA_RANDOM_SOURCE, 0, 0, 0); 00072 wH->Source(QUDA_RANDOM_SOURCE, 0, 0, 0); 00073 xH->Source(QUDA_RANDOM_SOURCE, 0, 0, 0); 00074 yH->Source(QUDA_RANDOM_SOURCE, 0, 0, 0); 00075 zH->Source(QUDA_RANDOM_SOURCE, 0, 0, 0); 00076 hH->Source(QUDA_RANDOM_SOURCE, 0, 0, 0); 00077 lH->Source(QUDA_RANDOM_SOURCE, 0, 0, 0); 00078 00079 // Now set the parameters for the cuda fields 00080 param.pad = 0; //LX*LY*LZ/2; 00081 00082 if (param.nSpin == 4) param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; 00083 param.fieldLocation = QUDA_CUDA_FIELD_LOCATION; 00084 param.create = QUDA_ZERO_FIELD_CREATE; 00085 00086 switch(prec) { 00087 case 0: 00088 setPrec(param, QUDA_HALF_PRECISION); 00089 high_aux_prec = QUDA_DOUBLE_PRECISION; 00090 low_aux_prec = QUDA_SINGLE_PRECISION; 00091 break; 00092 case 1: 00093 setPrec(param, QUDA_SINGLE_PRECISION); 00094 high_aux_prec = QUDA_DOUBLE_PRECISION; 00095 low_aux_prec = QUDA_HALF_PRECISION; 00096 break; 00097 case 2: 00098 setPrec(param, QUDA_DOUBLE_PRECISION); 00099 high_aux_prec = QUDA_SINGLE_PRECISION; 00100 low_aux_prec = QUDA_HALF_PRECISION; 00101 break; 00102 } 00103 00104 checkCudaError(); 00105 00106 vD = new cudaColorSpinorField(param); 00107 wD = new cudaColorSpinorField(param); 00108 xD = new cudaColorSpinorField(param); 00109 yD = new cudaColorSpinorField(param); 00110 zD = new cudaColorSpinorField(param); 00111 00112 setPrec(param, high_aux_prec); 00113 hD = new cudaColorSpinorField(param); 00114 00115 setPrec(param, low_aux_prec); 00116 lD = new cudaColorSpinorField(param); 00117 00118 // check for successful allocation 00119 checkCudaError(); 00120 00121 *vD = *vH; 00122 *wD = *wH; 00123 *xD = *xH; 00124 *yD = *yH; 00125 *zD = *zH; 00126 *hD = *hH; 00127 *lD = *lH; 00128 } 00129 00130 00131 void freeFields() 00132 { 00133 00134 // release memory 00135 delete vD; 00136 delete wD; 00137 delete xD; 00138 delete yD; 00139 delete zD; 00140 delete hD; 00141 delete lD; 00142 00143 // release memory 00144 delete vH; 00145 delete wH; 00146 delete xH; 00147 delete yH; 00148 delete zH; 00149 delete hH; 00150 delete lH; 00151 } 00152 00153 00154 double benchmark(int kernel, const int niter) { 00155 00156 double a, b, c; 00157 quda::Complex a2, b2, c2; 00158 00159 cudaEvent_t start, end; 00160 cudaEventCreate(&start); 00161 cudaEventCreate(&end); 00162 cudaEventRecord(start, 0); 00163 00164 for (int i=0; i < niter; ++i) { 00165 00166 switch (kernel) { 00167 00168 case 0: 00169 copyCuda(*yD, *hD); 00170 break; 00171 00172 case 1: 00173 copyCuda(*yD, *lD); 00174 break; 00175 00176 case 2: 00177 axpbyCuda(a, *xD, b, *yD); 00178 break; 00179 00180 case 3: 00181 xpyCuda(*xD, *yD); 00182 break; 00183 00184 case 4: 00185 axpyCuda(a, *xD, *yD); 00186 break; 00187 00188 case 5: 00189 xpayCuda(*xD, a, *yD); 00190 break; 00191 00192 case 6: 00193 mxpyCuda(*xD, *yD); 00194 break; 00195 00196 case 7: 00197 axCuda(a, *xD); 00198 break; 00199 00200 case 8: 00201 caxpyCuda(a2, *xD, *yD); 00202 break; 00203 00204 case 9: 00205 caxpbyCuda(a2, *xD, b2, *yD); 00206 break; 00207 00208 case 10: 00209 cxpaypbzCuda(*xD, a2, *yD, b2, *zD); 00210 break; 00211 00212 case 11: 00213 axpyBzpcxCuda(a, *xD, *yD, b, *zD, c); 00214 break; 00215 00216 case 12: 00217 axpyZpbxCuda(a, *xD, *yD, *zD, b); 00218 break; 00219 00220 case 13: 00221 caxpbypzYmbwCuda(a2, *xD, b2, *yD, *zD, *wD); 00222 break; 00223 00224 case 14: 00225 cabxpyAxCuda(a, b2, *xD, *yD); 00226 break; 00227 00228 case 15: 00229 caxpbypzCuda(a2, *xD, b2, *yD, *zD); 00230 break; 00231 00232 case 16: 00233 caxpbypczpwCuda(a2, *xD, b2, *yD, c2, *zD, *wD); 00234 break; 00235 00236 case 17: 00237 caxpyXmazCuda(a2, *xD, *yD, *zD); 00238 break; 00239 00240 // double 00241 case 18: 00242 normCuda(*xD); 00243 break; 00244 00245 case 19: 00246 reDotProductCuda(*xD, *yD); 00247 break; 00248 00249 case 20: 00250 axpyNormCuda(a, *xD, *yD); 00251 break; 00252 00253 case 21: 00254 xmyNormCuda(*xD, *yD); 00255 break; 00256 00257 case 22: 00258 caxpyNormCuda(a2, *xD, *yD); 00259 break; 00260 00261 case 23: 00262 caxpyXmazNormXCuda(a2, *xD, *yD, *zD); 00263 break; 00264 00265 case 24: 00266 cabxpyAxNormCuda(a, b2, *xD, *yD); 00267 break; 00268 00269 // double2 00270 case 25: 00271 cDotProductCuda(*xD, *yD); 00272 break; 00273 00274 case 26: 00275 xpaycDotzyCuda(*xD, a, *yD, *zD); 00276 break; 00277 00278 case 27: 00279 caxpyDotzyCuda(a2, *xD, *yD, *zD); 00280 break; 00281 00282 // double3 00283 case 28: 00284 cDotProductNormACuda(*xD, *yD); 00285 break; 00286 00287 case 29: 00288 cDotProductNormBCuda(*xD, *yD); 00289 break; 00290 00291 case 30: 00292 caxpbypzYmbwcDotProductUYNormYCuda(a2, *xD, b2, *yD, *zD, *wD, *vD); 00293 break; 00294 00295 default: 00296 errorQuda("Undefined blas kernel %d\n", kernel); 00297 } 00298 } 00299 00300 cudaEventRecord(end, 0); 00301 cudaEventSynchronize(end); 00302 float runTime; 00303 cudaEventElapsedTime(&runTime, start, end); 00304 cudaEventDestroy(start); 00305 cudaEventDestroy(end); 00306 00307 double secs = runTime / 1000; 00308 return secs; 00309 } 00310 00311 #define ERROR(a) fabs(norm2(*a##D) - norm2(*a##H)) / norm2(*a##H) 00312 00313 double test(int kernel) { 00314 00315 double a = 1.5, b = 2.5, c = 3.5; 00316 quda::Complex a2(a, b), b2(b, -c), c2(a+b, c*a); 00317 double error = 0; 00318 00319 switch (kernel) { 00320 00321 case 0: 00322 *hD = *hH; 00323 copyCuda(*yD, *hD); 00324 yH->copy(*hH); 00325 error = ERROR(y); 00326 break; 00327 00328 case 1: 00329 *lD = *lH; 00330 copyCuda(*yD, *lD); 00331 yH->copy(*lH); 00332 error = ERROR(y); 00333 break; 00334 00335 case 2: 00336 *xD = *xH; 00337 *yD = *yH; 00338 axpbyCuda(a, *xD, b, *yD); 00339 axpbyCpu(a, *xH, b, *yH); 00340 error = ERROR(y); 00341 break; 00342 00343 case 3: 00344 *xD = *xH; 00345 *yD = *yH; 00346 xpyCuda(*xD, *yD); 00347 xpyCpu(*xH, *yH); 00348 error = ERROR(y); 00349 break; 00350 00351 case 4: 00352 *xD = *xH; 00353 *yD = *yH; 00354 axpyCuda(a, *xD, *yD); 00355 axpyCpu(a, *xH, *yH); 00356 error = ERROR(y); 00357 break; 00358 00359 case 5: 00360 *xD = *xH; 00361 *yD = *yH; 00362 xpayCuda(*xD, a, *yD); 00363 xpayCpu(*xH, a, *yH); 00364 error = ERROR(y); 00365 break; 00366 00367 case 6: 00368 *xD = *xH; 00369 *yD = *yH; 00370 mxpyCuda(*xD, *yD); 00371 mxpyCpu(*xH, *yH); 00372 error = ERROR(y); 00373 break; 00374 00375 case 7: 00376 *xD = *xH; 00377 axCuda(a, *xD); 00378 axCpu(a, *xH); 00379 error = ERROR(x); 00380 break; 00381 00382 case 8: 00383 *xD = *xH; 00384 *yD = *yH; 00385 caxpyCuda(a2, *xD, *yD); 00386 caxpyCpu(a2, *xH, *yH); 00387 error = ERROR(y); 00388 break; 00389 00390 case 9: 00391 *xD = *xH; 00392 *yD = *yH; 00393 caxpbyCuda(a2, *xD, b2, *yD); 00394 caxpbyCpu(a2, *xH, b2, *yH); 00395 error = ERROR(y); 00396 break; 00397 00398 case 10: 00399 *xD = *xH; 00400 *yD = *yH; 00401 *zD = *zH; 00402 cxpaypbzCuda(*xD, a2, *yD, b2, *zD); 00403 cxpaypbzCpu(*xH, a2, *yH, b2, *zH); 00404 error = ERROR(z); 00405 break; 00406 00407 case 11: 00408 *xD = *xH; 00409 *yD = *yH; 00410 *zD = *zH; 00411 axpyBzpcxCuda(a, *xD, *yD, b, *zD, c); 00412 axpyBzpcxCpu(a, *xH, *yH, b, *zH, c); 00413 error = ERROR(x) + ERROR(y); 00414 break; 00415 00416 case 12: 00417 *xD = *xH; 00418 *yD = *yH; 00419 *zD = *zH; 00420 axpyZpbxCuda(a, *xD, *yD, *zD, b); 00421 axpyZpbxCpu(a, *xH, *yH, *zH, b); 00422 error = ERROR(x) + ERROR(y); 00423 break; 00424 00425 case 13: 00426 *xD = *xH; 00427 *yD = *yH; 00428 *zD = *zH; 00429 *wD = *wH; 00430 caxpbypzYmbwCuda(a2, *xD, b2, *yD, *zD, *wD); 00431 caxpbypzYmbwCpu(a2, *xH, b2, *yH, *zH, *wH); 00432 error = ERROR(z) + ERROR(y); 00433 break; 00434 00435 case 14: 00436 *xD = *xH; 00437 *yD = *yH; 00438 cabxpyAxCuda(a, b2, *xD, *yD); 00439 cabxpyAxCpu(a, b2, *xH, *yH); 00440 error = ERROR(y) + ERROR(x); 00441 break; 00442 00443 case 15: 00444 *xD = *xH; 00445 *yD = *yH; 00446 *zD = *zH; 00447 {caxpbypzCuda(a2, *xD, b2, *yD, *zD); 00448 caxpbypzCpu(a2, *xH, b2, *yH, *zH); 00449 error = ERROR(z); } 00450 break; 00451 00452 case 16: 00453 *xD = *xH; 00454 *yD = *yH; 00455 *zD = *zH; 00456 *wD = *wH; 00457 {caxpbypczpwCuda(a2, *xD, b2, *yD, c2, *zD, *wD); 00458 caxpbypczpwCpu(a2, *xH, b2, *yH, c2, *zH, *wH); 00459 error = ERROR(w); } 00460 break; 00461 00462 case 17: 00463 *xD = *xH; 00464 *yD = *yH; 00465 *zD = *zH; 00466 {caxpyXmazCuda(a, *xD, *yD, *zD); 00467 caxpyXmazCpu(a, *xH, *yH, *zH); 00468 error = ERROR(y) + ERROR(x);} 00469 break; 00470 00471 // double 00472 case 18: 00473 *xD = *xH; 00474 error = fabs(normCuda(*xD) - normCpu(*xH)) / normCpu(*xH); 00475 break; 00476 00477 case 19: 00478 *xD = *xH; 00479 *yD = *yH; 00480 error = fabs(reDotProductCuda(*xD, *yD) - reDotProductCpu(*xH, *yH)) / fabs(reDotProductCpu(*xH, *yH)); 00481 break; 00482 00483 case 20: 00484 *xD = *xH; 00485 *yD = *yH; 00486 {double d = axpyNormCuda(a, *xD, *yD); 00487 double h = axpyNormCpu(a, *xH, *yH); 00488 error = ERROR(y) + fabs(d-h)/fabs(h);} 00489 break; 00490 00491 case 21: 00492 *xD = *xH; 00493 *yD = *yH; 00494 {double d = xmyNormCuda(*xD, *yD); 00495 double h = xmyNormCpu(*xH, *yH); 00496 error = ERROR(y) + fabs(d-h)/fabs(h);} 00497 break; 00498 00499 case 22: 00500 *xD = *xH; 00501 *yD = *yH; 00502 {double d = caxpyNormCuda(a, *xD, *yD); 00503 double h = caxpyNormCpu(a, *xH, *yH); 00504 error = ERROR(y) + fabs(d-h)/fabs(h);} 00505 break; 00506 00507 case 23: 00508 *xD = *xH; 00509 *yD = *yH; 00510 *zD = *zH; 00511 {double d = caxpyXmazNormXCuda(a, *xD, *yD, *zD); 00512 double h = caxpyXmazNormXCpu(a, *xH, *yH, *zH); 00513 error = ERROR(y) + ERROR(x) + fabs(d-h)/fabs(h);} 00514 break; 00515 00516 case 24: 00517 *xD = *xH; 00518 *yD = *yH; 00519 {double d = cabxpyAxNormCuda(a, b2, *xD, *yD); 00520 double h = cabxpyAxNormCpu(a, b2, *xH, *yH); 00521 error = ERROR(x) + ERROR(y) + fabs(d-h)/fabs(h);} 00522 break; 00523 00524 // double2 00525 case 25: 00526 *xD = *xH; 00527 *yD = *yH; 00528 error = abs(cDotProductCuda(*xD, *yD) - cDotProductCpu(*xH, *yH)) / abs(cDotProductCpu(*xH, *yH)); 00529 break; 00530 00531 case 26: 00532 *xD = *xH; 00533 *yD = *yH; 00534 *zD = *zH; 00535 { quda::Complex d = xpaycDotzyCuda(*xD, a, *yD, *zD); 00536 quda::Complex h = xpaycDotzyCpu(*xH, a, *yH, *zH); 00537 error = fabs(norm2(*yD) - norm2(*yH)) / norm2(*yH) + abs(d-h)/abs(h); 00538 } 00539 break; 00540 00541 case 27: 00542 *xD = *xH; 00543 *yD = *yH; 00544 *zD = *zH; 00545 {quda::Complex d = caxpyDotzyCuda(a, *xD, *yD, *zD); 00546 quda::Complex h = caxpyDotzyCpu(a, *xH, *yH, *zH); 00547 error = ERROR(y) + abs(d-h)/abs(h);} 00548 break; 00549 00550 // double3 00551 case 28: 00552 *xD = *xH; 00553 *yD = *yH; 00554 { double3 d = cDotProductNormACuda(*xD, *yD); 00555 double3 h = cDotProductNormACpu(*xH, *yH); 00556 error = fabs(d.x - h.x) / fabs(h.x) + fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); } 00557 break; 00558 00559 case 29: 00560 *xD = *xH; 00561 *yD = *yH; 00562 { double3 d = cDotProductNormBCuda(*xD, *yD); 00563 double3 h = cDotProductNormBCpu(*xH, *yH); 00564 error = fabs(d.x - h.x) / fabs(h.x) + fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); } 00565 break; 00566 00567 case 30: 00568 *xD = *xH; 00569 *yD = *yH; 00570 *zD = *zH; 00571 *wD = *wH; 00572 *vD = *vH; 00573 { double3 d = caxpbypzYmbwcDotProductUYNormYCuda(a2, *xD, b2, *yD, *zD, *wD, *vD); 00574 double3 h = caxpbypzYmbwcDotProductUYNormYCpu(a2, *xH, b2, *yH, *zH, *wH, *vH); 00575 error = ERROR(z) + ERROR(y) + fabs(d.x - h.x) / fabs(h.x) + 00576 fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); } 00577 break; 00578 00579 default: 00580 errorQuda("Undefined blas kernel %d\n", kernel); 00581 } 00582 00583 return error; 00584 } 00585 00586 int main(int argc, char** argv) 00587 { 00588 00589 int ndim=4, dims[] = {1, 1, 1, 1}; 00590 initCommsQuda(argc, argv, dims, ndim); 00591 00592 int dev = 0; 00593 if (argc == 2) dev = atoi(argv[1]); 00594 initQuda(dev); 00595 00596 char *names[] = { 00597 "copyHS", 00598 "copyLS", 00599 "axpby", 00600 "xpy", 00601 "axpy", 00602 "xpay", 00603 "mxpy", 00604 "ax", 00605 "caxpy", 00606 "caxpby", 00607 "cxpaypbz", 00608 "axpyBzpcx", 00609 "axpyZpbx", 00610 "caxpbypzYmbw", 00611 "cabxpyAx", 00612 "caxpbypz", 00613 "caxpbypczpw", 00614 "caxpyXmaz", 00615 "norm", 00616 "reDotProduct", 00617 "axpyNorm", 00618 "xmyNorm", 00619 "caxpyNorm", 00620 "caxpyXmazNormX", 00621 "cabxpyAxNorm", 00622 "cDotProduct", 00623 "xpaycDotzy", 00624 "caxpyDotzy", 00625 "cDotProductNormA", 00626 "cDotProductNormB", 00627 "caxpbypzYmbwcDotProductWYNormY" 00628 }; 00629 00630 char *prec_str[] = {"half", "single", "double"}; 00631 00632 // Only benchmark double precision if supported 00633 #if (__COMPUTE_CAPABILITY__ >= 130) 00634 int Nprec = 3; 00635 #else 00636 int Nprec = 2; 00637 #endif 00638 00639 int niter = Niter; 00640 00641 // enable the tuning 00642 quda::setBlasTuning(QUDA_TUNE_YES, QUDA_SILENT); 00643 00644 for (int prec = 0; prec < Nprec; prec++) { 00645 00646 printf("\nBenchmarking %s precision with %d iterations...\n\n", prec_str[prec], niter); 00647 initFields(prec); 00648 00649 for (int kernel = 0; kernel < Nkernels; kernel++) { 00650 // only benchmark "high precision" copyCuda() if double is supported 00651 if ((Nprec < 3) && (kernel == 0)) continue; 00652 00653 // do the initial tune 00654 benchmark(kernel, 1); 00655 00656 // now rerun with more iterations to get accurate speed measurements 00657 quda::blas_flops = 0; 00658 quda::blas_bytes = 0; 00659 00660 double secs = benchmark(kernel, 500*niter); 00661 00662 double gflops = (quda::blas_flops*1e-9)/(secs); 00663 double gbytes = quda::blas_bytes/(secs*1e9); 00664 00665 printf("%-31s: Gflop/s = %6.1f, GB/s = %6.1f\n", names[kernel], gflops, gbytes); 00666 } 00667 freeFields(); 00668 00669 // halve the number of iterations for the next precision 00670 niter /= 2; 00671 if (niter==0) niter = 1; 00672 } 00673 00674 // clear the error state 00675 cudaGetLastError(); 00676 00677 // lastly check for correctness 00678 for (int prec = 0; prec < Nprec; prec++) { 00679 printf("\nTesting %s precision...\n\n", prec_str[prec]); 00680 initFields(prec); 00681 00682 for (int kernel = 0; kernel < Nkernels; kernel++) { 00683 // only benchmark "high precision" copyCuda() if double is supported 00684 if ((Nprec < 3) && (kernel == 0)) continue; 00685 double error = test(kernel); 00686 printfQuda("%-35s error = %e, \n", names[kernel], error); 00687 } 00688 freeFields(); 00689 } 00690 00691 endQuda(); 00692 00693 endCommsQuda(); 00694 }