//
// QUDA v0.3.2
// A library for QCD on GPUs
//
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 00004 #include <quda_internal.h> 00005 #include <color_spinor_field.h> 00006 #include <blas_quda.h> 00007 00008 #include <test_util.h> 00009 00010 // volume per GPU 00011 const int LX = 24; 00012 const int LY = 24; 00013 const int LZ = 24; 00014 const int LT = 24; 00015 const int Nspin = 4; 00016 00017 // corresponds to 10 iterations for V=24^4, Nspin = 4, at half precision 00018 const int Niter = 10 * (24*24*24*24*4) / (LX * LY * LZ * LT * Nspin); 00019 00020 const int Nkernels = 24; 00021 const int ThreadMin = 32; 00022 const int ThreadMax = 1024; 00023 const int GridMin = 1; 00024 const int GridMax = 65536; 00025 00026 cudaColorSpinorField *x, *y, *z, *w, *v, *h, *l; 00027 00028 // defines blas_threads[][] and blas_blocks[][] 00029 #include "../lib/blas_param.h" 00030 00031 00032 void setPrec(ColorSpinorParam ¶m, const QudaPrecision precision) 00033 { 00034 param.precision = precision; 00035 if (precision == QUDA_DOUBLE_PRECISION) { 00036 param.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; 00037 } else { 00038 param.fieldOrder = QUDA_FLOAT4_FIELD_ORDER; 00039 } 00040 } 00041 00042 00043 // returns true if the specified kernel performs a reduction 00044 bool isReduction(int kernel) 00045 { 00046 return (kernel > 13); 00047 } 00048 00049 00050 void initFields(int prec) 00051 { 00052 // precisions used for the source field in the copyCuda() benchmark 00053 QudaPrecision high_aux_prec; 00054 QudaPrecision low_aux_prec; 00055 00056 ColorSpinorParam param; 00057 param.fieldLocation = QUDA_CUDA_FIELD_LOCATION; 00058 param.nColor = 3; 00059 param.nSpin = Nspin; // =1 for staggered, =2 for coarse Dslash, =4 for 4d spinor 00060 param.nDim = 4; // number of spacetime dimensions 00061 param.x[0] = LX; 00062 param.x[1] = LY; 00063 param.x[2] = LZ; 00064 param.x[3] = LT; 00065 00066 param.pad = LX*LY*LZ/2; 00067 param.siteSubset = QUDA_PARITY_SITE_SUBSET; 00068 param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; 00069 00070 
param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; 00071 param.create = QUDA_NULL_FIELD_CREATE; 00072 00073 switch(prec) { 00074 case 0: 00075 setPrec(param, QUDA_HALF_PRECISION); 00076 high_aux_prec = QUDA_DOUBLE_PRECISION; 00077 low_aux_prec = QUDA_SINGLE_PRECISION; 00078 break; 00079 case 1: 00080 setPrec(param, QUDA_SINGLE_PRECISION); 00081 high_aux_prec = QUDA_DOUBLE_PRECISION; 00082 low_aux_prec = QUDA_HALF_PRECISION; 00083 break; 00084 case 2: 00085 setPrec(param, QUDA_DOUBLE_PRECISION); 00086 high_aux_prec = QUDA_SINGLE_PRECISION; 00087 low_aux_prec = QUDA_HALF_PRECISION; 00088 break; 00089 } 00090 00091 v = new cudaColorSpinorField(param); 00092 checkCudaError(); 00093 00094 w = new cudaColorSpinorField(param); 00095 x = new cudaColorSpinorField(param); 00096 y = new cudaColorSpinorField(param); 00097 z = new cudaColorSpinorField(param); 00098 00099 setPrec(param, high_aux_prec); 00100 h = new cudaColorSpinorField(param); 00101 00102 setPrec(param, low_aux_prec); 00103 l = new cudaColorSpinorField(param); 00104 00105 // check for successful allocation 00106 checkCudaError(); 00107 00108 // turn off error checking in blas kernels 00109 setBlasTuning(1); 00110 } 00111 00112 00113 void freeFields() 00114 { 00115 // release memory 00116 delete v; 00117 delete w; 00118 delete x; 00119 delete y; 00120 delete z; 00121 delete h; 00122 delete l; 00123 } 00124 00125 00126 double benchmark(int kernel, int niter) { 00127 00128 double a, b, c; 00129 Complex a2, b2; 00130 00131 cudaEvent_t start, end; 00132 cudaEventCreate(&start); 00133 cudaEventRecord(start, 0); 00134 cudaEventSynchronize(start); 00135 00136 for (int i=0; i < niter; ++i) { 00137 00138 switch (kernel) { 00139 00140 case 0: 00141 copyCuda(*y, *h); 00142 break; 00143 00144 case 1: 00145 copyCuda(*y, *l); 00146 break; 00147 00148 case 2: 00149 axpbyCuda(a, *x, b, *y); 00150 break; 00151 00152 case 3: 00153 xpyCuda(*x, *y); 00154 break; 00155 00156 case 4: 00157 axpyCuda(a, *x, *y); 00158 break; 00159 00160 case 
5: 00161 xpayCuda(*x, a, *y); 00162 break; 00163 00164 case 6: 00165 mxpyCuda(*x, *y); 00166 break; 00167 00168 case 7: 00169 axCuda(a, *x); 00170 break; 00171 00172 case 8: 00173 caxpyCuda(a2, *x, *y); 00174 break; 00175 00176 case 9: 00177 caxpbyCuda(a2, *x, b2, *y); 00178 break; 00179 00180 case 10: 00181 cxpaypbzCuda(*x, a2, *y, b2, *z); 00182 break; 00183 00184 case 11: 00185 axpyBzpcxCuda(a, *x, *y, b, *z, c); 00186 break; 00187 00188 case 12: 00189 axpyZpbxCuda(a, *x, *y, *z, b); 00190 break; 00191 00192 case 13: 00193 caxpbypzYmbwCuda(a2, *x, b2, *y, *z, *w); 00194 break; 00195 00196 // double 00197 case 14: 00198 sumCuda(*x); 00199 break; 00200 00201 case 15: 00202 normCuda(*x); 00203 break; 00204 00205 case 16: 00206 reDotProductCuda(*x, *y); 00207 break; 00208 00209 case 17: 00210 axpyNormCuda(a, *x, *y); 00211 break; 00212 00213 case 18: 00214 xmyNormCuda(*x, *y); 00215 break; 00216 00217 // double2 00218 case 19: 00219 cDotProductCuda(*x, *y); 00220 break; 00221 00222 case 20: 00223 xpaycDotzyCuda(*x, a, *y, *z); 00224 break; 00225 00226 // double3 00227 case 21: 00228 cDotProductNormACuda(*x, *y); 00229 break; 00230 00231 case 22: 00232 cDotProductNormBCuda(*x, *y); 00233 break; 00234 00235 case 23: 00236 caxpbypzYmbwcDotProductWYNormYCuda(a2, *x, b2, *y, *z, *w, *v); 00237 break; 00238 00239 default: 00240 errorQuda("Undefined blas kernel %d\n", kernel); 00241 } 00242 } 00243 00244 cudaEventCreate(&end); 00245 cudaEventRecord(end, 0); 00246 cudaEventSynchronize(end); 00247 float runTime; 00248 cudaEventElapsedTime(&runTime, start, end); 00249 cudaEventDestroy(start); 00250 cudaEventDestroy(end); 00251 00252 double secs = runTime / 1000; 00253 return secs; 00254 } 00255 00256 00257 void write(char *names[], int threads[][3], int blocks[][3]) 00258 { 00259 printf("\nWriting optimal parameters to blas_param.h\n"); 00260 00261 FILE *fp = fopen("blas_param.h", "w"); 00262 fprintf(fp, "//\n// Auto-tuned blas CUDA parameters, generated by 
blas_test\n//\n\n"); 00263 00264 fprintf(fp, "static int blas_threads[%d][3] = {\n", Nkernels); 00265 00266 for (int i=0; i<Nkernels; i++) { 00267 fprintf(fp, " {%4d, %4d, %4d}%c // Kernel %2d: %s\n", threads[i][0], threads[i][1], threads[i][2], 00268 ((i == Nkernels-1) ? ' ' : ','), i, names[i]); 00269 } 00270 fprintf(fp, "};\n\n"); 00271 00272 fprintf(fp, "static int blas_blocks[%d][3] = {\n", Nkernels); 00273 00274 for (int i=0; i<Nkernels; i++) { 00275 fprintf(fp, " {%5d, %5d, %5d}%c // Kernel %2d: %s\n", blocks[i][0], blocks[i][1], blocks[i][2], 00276 ((i == Nkernels-1) ? ' ' : ','), i, names[i]); 00277 } 00278 fprintf(fp, "};\n"); 00279 00280 fclose(fp); 00281 } 00282 00283 00284 int main(int argc, char** argv) 00285 { 00286 int dev = 0; 00287 if (argc == 2) dev = atoi(argv[1]); 00288 initQuda(dev); 00289 00290 char *names[] = { 00291 "copyCuda (high source precision)", 00292 "copyCuda (low source precision)", 00293 "axpbyCuda", 00294 "xpyCuda", 00295 "axpyCuda", 00296 "xpayCuda", 00297 "mxpyCuda", 00298 "axCuda", 00299 "caxpyCuda", 00300 "caxpbyCuda", 00301 "cxpaypbzCuda", 00302 "axpyBzpcxCuda", 00303 "axpyZpbxCuda", 00304 "caxpbypzYmbwCuda", 00305 "sumCuda", 00306 "normCuda", 00307 "reDotProductCuda", 00308 "axpyNormCuda", 00309 "xmyNormCuda", 00310 "cDotProductCuda", 00311 "xpaycDotzyCuda", 00312 "cDotProductNormACuda", 00313 "cDotProductNormBCuda", 00314 "caxpbypzYmbwcDotProductWYNormYCuda" 00315 }; 00316 00317 char *prec_str[] = {"half", "single", "double"}; 00318 00319 // Only benchmark double precision if supported 00320 #if (__CUDA_ARCH__ >= 130) 00321 int Nprec = 3; 00322 #else 00323 int Nprec = 2; 00324 #endif 00325 00326 int niter = Niter; 00327 00328 for (int prec = 0; prec < Nprec; prec++) { 00329 00330 printf("\nBenchmarking %s precision with %d iterations...\n\n", prec_str[prec], niter); 00331 initFields(prec); 00332 00333 for (int kernel = 0; kernel < Nkernels; kernel++) { 00334 00335 double gflops_max = 0.0; 00336 double gbytes_max = 0.0; 
00337 int threads_max = 0; 00338 int blocks_max = 0; 00339 00340 cudaError_t error; 00341 00342 // only benchmark "high precision" copyCuda() if double is supported 00343 if ((Nprec < 3) && (kernel == 0)) continue; 00344 00345 for (unsigned int thread = ThreadMin; thread <= ThreadMax; thread+=32) { 00346 00347 // for reduction kernels, the number of threads must be a power of two 00348 if (isReduction(kernel) && (thread & (thread-1))) continue; 00349 00350 for (unsigned int grid = GridMin; grid <= GridMax; grid *= 2) { 00351 setBlasParam(kernel, prec, thread, grid); 00352 00353 // first do warmup run 00354 benchmark(kernel, 1); 00355 00356 blas_quda_flops = 0; 00357 blas_quda_bytes = 0; 00358 00359 double secs = benchmark(kernel, niter); 00360 error = cudaGetLastError(); 00361 double flops = blas_quda_flops; 00362 double bytes = blas_quda_bytes; 00363 00364 double gflops = (flops*1e-9)/(secs); 00365 double gbytes = bytes/(secs*(1<<30)); 00366 00367 // prevents selection of failed parameters 00368 if (gbytes > gbytes_max && error == cudaSuccess) { 00369 gflops_max = gflops; 00370 gbytes_max = gbytes; 00371 threads_max = thread; 00372 blocks_max = grid; 00373 } 00374 //printf("%d %d %-35s %f s, flops = %e, Gflop/s = %f, GiB/s = %f\n", 00375 // thread, grid, names[kernel], secs, flops, gflops, gbytes); 00376 } 00377 } 00378 00379 if (threads_max == 0) { 00380 errorQuda("Autotuning failed for %s kernel: %s", names[kernel], cudaGetErrorString(error)); 00381 } else { 00382 // now rerun with more iterations to get accurate speed measurements 00383 setBlasParam(kernel, prec, threads_max, blocks_max); 00384 benchmark(kernel, 1); 00385 blas_quda_flops = 0; 00386 blas_quda_bytes = 0; 00387 00388 double secs = benchmark(kernel, 100*niter); 00389 00390 gflops_max = (blas_quda_flops*1e-9)/(secs); 00391 gbytes_max = blas_quda_bytes/(secs*(1<<30)); 00392 } 00393 00394 printf("%-35s: %4d threads per block, %5d blocks per grid, Gflop/s = %8.4f, GiB/s = %8.4f\n", 00395 names[kernel], 
threads_max, blocks_max, gflops_max, gbytes_max); 00396 00397 blas_threads[kernel][prec] = threads_max; 00398 blas_blocks[kernel][prec] = blocks_max; 00399 } 00400 freeFields(); 00401 00402 // halve the number of iterations for the next precision 00403 niter /= 2; 00404 if (niter==0) niter = 1; 00405 } 00406 write(names, blas_threads, blas_blocks); 00407 endQuda(); 00408 }
1.7.3