//
// QUDA v0.3.2
// A library for QCD on GPUs
//
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 00004 #include <quda_internal.h> 00005 #include <color_spinor_field.h> 00006 #include <blas_quda.h> 00007 00008 #include <test_util.h> 00009 00010 // volume per GPU 00011 const int LX = 24; 00012 const int LY = 24; 00013 const int LZ = 24; 00014 const int LT = 24; 00015 const int Nspin = 4; 00016 00017 // corresponds to 10 iterations for V=24^4, Nspin = 4, at half precision 00018 const int Niter = 10 * (24*24*24*24*4) / (LX * LY * LZ * LT * Nspin); 00019 00020 const int Nkernels = 24; 00021 const int ThreadMin = 32; 00022 const int ThreadMax = 1024; 00023 const int GridMin = 1; 00024 const int GridMax = 65536; 00025 00026 cudaColorSpinorField *x, *y, *z, *w, *v, *h, *l; 00027 00028 // defines blas_threads[][] and blas_blocks[][] 00029 #include "../lib/blas_param.h" 00030 00031 00032 void setPrec(ColorSpinorParam ¶m, const QudaPrecision precision) 00033 { 00034 param.precision = precision; 00035 if (precision == QUDA_DOUBLE_PRECISION) { 00036 param.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; 00037 } else { 00038 param.fieldOrder = QUDA_FLOAT4_FIELD_ORDER; 00039 } 00040 } 00041 00042 00043 // returns true if the specified kernel performs a reduction 00044 bool isReduction(int kernel) 00045 { 00046 return (kernel > 13); 00047 } 00048 00049 00050 void initFields(int prec) 00051 { 00052 // precisions used for the source field in the copyCuda() benchmark 00053 QudaPrecision high_aux_prec; 00054 QudaPrecision low_aux_prec; 00055 00056 ColorSpinorParam param; 00057 param.fieldLocation = QUDA_CUDA_FIELD_LOCATION; 00058 param.nColor = 3; 00059 param.nSpin = Nspin; // =1 for staggered, =2 for coarse Dslash, =4 for 4d spinor 00060 param.nDim = 4; // number of spacetime dimensions 00061 param.x[0] = LX; 00062 param.x[1] = LY; 00063 param.x[2] = LZ; 00064 param.x[3] = LT; 00065 00066 param.pad = LX*LY*LZ/2; 00067 param.siteSubset = QUDA_PARITY_SITE_SUBSET; 00068 param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; 00069 00070 
param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; 00071 param.create = QUDA_NULL_FIELD_CREATE; 00072 00073 switch(prec) { 00074 case 0: 00075 setPrec(param, QUDA_HALF_PRECISION); 00076 high_aux_prec = QUDA_DOUBLE_PRECISION; 00077 low_aux_prec = QUDA_SINGLE_PRECISION; 00078 break; 00079 case 1: 00080 setPrec(param, QUDA_SINGLE_PRECISION); 00081 high_aux_prec = QUDA_DOUBLE_PRECISION; 00082 low_aux_prec = QUDA_HALF_PRECISION; 00083 break; 00084 case 2: 00085 setPrec(param, QUDA_DOUBLE_PRECISION); 00086 high_aux_prec = QUDA_SINGLE_PRECISION; 00087 low_aux_prec = QUDA_HALF_PRECISION; 00088 break; 00089 } 00090 00091 v = new cudaColorSpinorField(param); 00092 checkCudaError(); 00093 00094 w = new cudaColorSpinorField(param); 00095 x = new cudaColorSpinorField(param); 00096 y = new cudaColorSpinorField(param); 00097 z = new cudaColorSpinorField(param); 00098 00099 setPrec(param, high_aux_prec); 00100 h = new cudaColorSpinorField(param); 00101 00102 setPrec(param, low_aux_prec); 00103 l = new cudaColorSpinorField(param); 00104 00105 // check for successful allocation 00106 checkCudaError(); 00107 00108 // turn off error checking in blas kernels 00109 setBlasTuning(1); 00110 } 00111 00112 00113 void freeFields() 00114 { 00115 // release memory 00116 delete v; 00117 delete w; 00118 delete x; 00119 delete y; 00120 delete z; 00121 delete h; 00122 delete l; 00123 } 00124 00125 00126 double benchmark(int kernel, int niter) { 00127 00128 double a, b, c; 00129 Complex a2, b2; 00130 00131 cudaEvent_t start, end; 00132 cudaEventCreate(&start); 00133 cudaEventRecord(start, 0); 00134 cudaEventSynchronize(start); 00135 00136 for (int i=0; i < niter; ++i) { 00137 00138 switch (kernel) { 00139 00140 case 0: 00141 copyCuda(*y, *h); 00142 break; 00143 00144 case 1: 00145 copyCuda(*y, *l); 00146 break; 00147 00148 case 2: 00149 axpbyCuda(a, *x, b, *y); 00150 break; 00151 00152 case 3: 00153 xpyCuda(*x, *y); 00154 break; 00155 00156 case 4: 00157 axpyCuda(a, *x, *y); 00158 break; 00159 00160 case 
5: 00161 xpayCuda(*x, a, *y); 00162 break; 00163 00164 case 6: 00165 mxpyCuda(*x, *y); 00166 break; 00167 00168 case 7: 00169 axCuda(a, *x); 00170 break; 00171 00172 case 8: 00173 caxpyCuda(a2, *x, *y); 00174 break; 00175 00176 case 9: 00177 caxpbyCuda(a2, *x, b2, *y); 00178 break; 00179 00180 case 10: 00181 cxpaypbzCuda(*x, a2, *y, b2, *z); 00182 break; 00183 00184 case 11: 00185 axpyBzpcxCuda(a, *x, *y, b, *z, c); 00186 break; 00187 00188 case 12: 00189 axpyZpbxCuda(a, *x, *y, *z, b); 00190 break; 00191 00192 case 13: 00193 caxpbypzYmbwCuda(a2, *x, b2, *y, *z, *w); 00194 break; 00195 00196 // double 00197 case 14: 00198 sumCuda(*x); 00199 break; 00200 00201 case 15: 00202 normCuda(*x); 00203 break; 00204 00205 case 16: 00206 reDotProductCuda(*x, *y); 00207 break; 00208 00209 case 17: 00210 axpyNormCuda(a, *x, *y); 00211 break; 00212 00213 case 18: 00214 xmyNormCuda(*x, *y); 00215 break; 00216 00217 // double2 00218 case 19: 00219 cDotProductCuda(*x, *y); 00220 break; 00221 00222 case 20: 00223 xpaycDotzyCuda(*x, a, *y, *z); 00224 break; 00225 00226 // double3 00227 case 21: 00228 cDotProductNormACuda(*x, *y); 00229 break; 00230 00231 case 22: 00232 cDotProductNormBCuda(*x, *y); 00233 break; 00234 00235 case 23: 00236 caxpbypzYmbwcDotProductWYNormYCuda(a2, *x, b2, *y, *z, *w, *v); 00237 break; 00238 00239 default: 00240 errorQuda("Undefined blas kernel %d\n", kernel); 00241 } 00242 } 00243 00244 cudaEventCreate(&end); 00245 cudaEventRecord(end, 0); 00246 cudaEventSynchronize(end); 00247 float runTime; 00248 cudaEventElapsedTime(&runTime, start, end); 00249 cudaEventDestroy(start); 00250 cudaEventDestroy(end); 00251 00252 double secs = runTime / 1000; 00253 return secs; 00254 } 00255 00256 00257 void write(char *names[], int threads[][3], int blocks[][3]) 00258 { 00259 printf("\nWriting optimal parameters to blas_param.h\n"); 00260 00261 FILE *fp = fopen("blas_param.h", "w"); 00262 fprintf(fp, "//\n// Auto-tuned blas CUDA parameters, generated by 
blas_test\n//\n\n"); 00263 00264 fprintf(fp, "static int blas_threads[%d][3] = {\n", Nkernels); 00265 00266 for (int i=0; i<Nkernels; i++) { 00267 fprintf(fp, " {%4d, %4d, %4d}%c // Kernel %2d: %s\n", threads[i][0], threads[i][1], threads[i][2], 00268 ((i == Nkernels-1) ? ' ' : ','), i, names[i]); 00269 } 00270 fprintf(fp, "};\n\n"); 00271 00272 fprintf(fp, "static int blas_blocks[%d][3] = {\n", Nkernels); 00273 00274 for (int i=0; i<Nkernels; i++) { 00275 fprintf(fp, " {%5d, %5d, %5d}%c // Kernel %2d: %s\n", blocks[i][0], blocks[i][1], blocks[i][2], 00276 ((i == Nkernels-1) ? ' ' : ','), i, names[i]); 00277 } 00278 fprintf(fp, "};\n"); 00279 00280 fclose(fp); 00281 } 00282 00283 00284 int main(int argc, char** argv) 00285 { 00286 int dev = 0; 00287 if (argc == 2) dev = atoi(argv[1]); 00288 initQuda(dev); 00289 00290 char *names[] = { 00291 "copyCuda (high source precision)", 00292 "copyCuda (low source precision)", 00293 "axpbyCuda", 00294 "xpyCuda", 00295 "axpyCuda", 00296 "xpayCuda", 00297 "mxpyCuda", 00298 "axCuda", 00299 "caxpyCuda", 00300 "caxpbyCuda", 00301 "cxpaypbzCuda", 00302 "axpyBzpcxCuda", 00303 "axpyZpbxCuda", 00304 "caxpbypzYmbwCuda", 00305 "sumCuda", 00306 "normCuda", 00307 "reDotProductCuda", 00308 "axpyNormCuda", 00309 "xmyNormCuda", 00310 "cDotProductCuda", 00311 "xpaycDotzyCuda", 00312 "cDotProductNormACuda", 00313 "cDotProductNormBCuda", 00314 "caxpbypzYmbwcDotProductWYNormYCuda" 00315 }; 00316 00317 char *prec_str[] = {"half", "single", "double"}; 00318 00319 // Only benchmark double precision if supported 00320 #if (__CUDA_ARCH__ >= 130) 00321 int Nprec = 3; 00322 #else 00323 int Nprec = 2; 00324 #endif 00325 00326 int niter = Niter; 00327 00328 for (int prec = 0; prec < Nprec; prec++) { 00329 00330 printf("\nBenchmarking %s precision with %d iterations...\n\n", prec_str[prec], niter); 00331 initFields(prec); 00332 00333 for (int kernel = 0; kernel < Nkernels; kernel++) { 00334 00335 double gflops_max = 0.0; 00336 double gbytes_max = 0.0; 
00337 int threads_max = 0; 00338 int blocks_max = 0; 00339 00340 cudaError_t error; 00341 00342 // only benchmark "high precision" copyCuda() if double is supported 00343 if ((Nprec < 3) && (kernel == 0)) continue; 00344 00345 for (unsigned int thread = ThreadMin; thread <= ThreadMax; thread+=32) { 00346 00347 // for reduction kernels, the number of threads must be a power of two 00348 if (isReduction(kernel) && (thread & (thread-1))) continue; 00349 00350 for (unsigned int grid = GridMin; grid <= GridMax; grid *= 2) { 00351 setBlasParam(kernel, prec, thread, grid); 00352 00353 // first do warmup run 00354 benchmark(kernel, 1); 00355 00356 blas_quda_flops = 0; 00357 blas_quda_bytes = 0; 00358 00359 double secs = benchmark(kernel, niter); 00360 error = cudaGetLastError(); 00361 double flops = blas_quda_flops; 00362 double bytes = blas_quda_bytes; 00363 00364 double gflops = (flops*1e-9)/(secs); 00365 double gbytes = bytes/(secs*(1<<30)); 00366 00367 // prevents selection of failed parameters 00368 if (gbytes > gbytes_max && error == cudaSuccess) { 00369 gflops_max = gflops; 00370 gbytes_max = gbytes; 00371 threads_max = thread; 00372 blocks_max = grid; 00373 } 00374 //printf("%d %d %-35s %f s, flops = %e, Gflop/s = %f, GiB/s = %f\n", 00375 // thread, grid, names[kernel], secs, flops, gflops, gbytes); 00376 } 00377 } 00378 00379 if (threads_max == 0) { 00380 errorQuda("Autotuning failed for %s kernel: %s", names[kernel], cudaGetErrorString(error)); 00381 } else { 00382 // now rerun with more iterations to get accurate speed measurements 00383 setBlasParam(kernel, prec, threads_max, blocks_max); 00384 benchmark(kernel, 1); 00385 blas_quda_flops = 0; 00386 blas_quda_bytes = 0; 00387 00388 double secs = benchmark(kernel, 100*niter); 00389 00390 gflops_max = (blas_quda_flops*1e-9)/(secs); 00391 gbytes_max = blas_quda_bytes/(secs*(1<<30)); 00392 } 00393 00394 printf("%-35s: %4d threads per block, %5d blocks per grid, Gflop/s = %8.4f, GiB/s = %8.4f\n", 00395 names[kernel], 
threads_max, blocks_max, gflops_max, gbytes_max); 00396 00397 blas_threads[kernel][prec] = threads_max; 00398 blas_blocks[kernel][prec] = blocks_max; 00399 } 00400 freeFields(); 00401 00402 // halve the number of iterations for the next precision 00403 niter /= 2; 00404 if (niter==0) niter = 1; 00405 } 00406 write(names, blas_threads, blas_blocks); 00407 endQuda(); 00408 }
1.7.3