QUDA: quda/tests/domain_wall_dslash

QUDA v0.4.0
A library for QCD on GPUs
00001 #include <iostream>
00002 #include <stdio.h>
00003 #include <stdlib.h>
00004 
00005 #include <quda.h>
00006 #include <quda_internal.h>
00007 #include <dirac_quda.h>
00008 #include <dslash_quda.h>
00009 #include <invert_quda.h>
00010 #include <util_quda.h>
00011 #include <blas_quda.h>
00012 
00013 #include <test_util.h>
00014 #include <domain_wall_dslash_reference.h>
00015 
00016 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
00017 const int test_type = 1;
00018 
00019 const QudaParity parity = QUDA_EVEN_PARITY; // even or odd?
00020 const QudaDagType dagger = QUDA_DAG_NO;     // apply Dslash or Dslash dagger?
00021 const int transfer = 0; // include transfer time in the benchmark?
00022 
00023 const int loops = 100;
00024 
00025 const int Ls = 16;
00026 double kappa5;
00027 
00028 QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION;
00029 QudaPrecision cuda_prec = QUDA_DOUBLE_PRECISION;
00030 
00031 QudaGaugeParam gauge_param;
00032 QudaInvertParam inv_param;
00033 
00034 cpuColorSpinorField *spinor, *spinorOut, *spinorRef;
00035 cudaColorSpinorField *cudaSpinor, *cudaSpinorOut, *tmp=0, *tmp2=0;
00036 
00037 void *hostGauge[4];
00038 
00039 Dirac *dirac;
00040 
00041 void init() {
00042 
00043   gauge_param = newQudaGaugeParam();
00044   inv_param = newQudaInvertParam();
00045 
00046   gauge_param.X[0] = 16;
00047   gauge_param.X[1] = 16;
00048   gauge_param.X[2] = 16;
00049   gauge_param.X[3] = 16;
00050   
00051   setDims(gauge_param.X, Ls);
00052 
00053   gauge_param.anisotropy = 2.3;
00054 
00055   gauge_param.type = QUDA_WILSON_LINKS;
00056   gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
00057   gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
00058 
00059   gauge_param.cpu_prec = cpu_prec;
00060   gauge_param.cuda_prec = cuda_prec;
00061   gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
00062   gauge_param.reconstruct_sloppy = gauge_param.reconstruct;
00063   gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec;
00064   gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
00065   gauge_param.type = QUDA_WILSON_LINKS;
00066 
00067   inv_param.inv_type = QUDA_CG_INVERTER;
00068 
00069   inv_param.mass = 0.01;
00070   inv_param.m5 = -1.5;
00071   kappa5 = 0.5/(5 + inv_param.m5);
00072 
00073   inv_param.Ls = Ls;
00074   
00075   inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
00076   inv_param.dagger = dagger;
00077 
00078   inv_param.cpu_prec = cpu_prec;
00079   inv_param.cuda_prec = cuda_prec;
00080 
00081   gauge_param.ga_pad = 0;
00082   inv_param.sp_pad = 0;
00083   inv_param.cl_pad = 0;
00084 
00085   inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
00086   inv_param.dirac_order = QUDA_DIRAC_ORDER;
00087 
00088   if (test_type == 2) {
00089     inv_param.solution_type = QUDA_MAT_SOLUTION;
00090   } else {
00091     inv_param.solution_type = QUDA_MATPC_SOLUTION;
00092   }
00093 
00094   inv_param.dslash_type = QUDA_DOMAIN_WALL_DSLASH;
00095 
00096   inv_param.verbosity = QUDA_VERBOSE;
00097 
00098   // construct input fields
00099   for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec);
00100 
00101   ColorSpinorParam csParam;
00102   
00103   csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
00104   csParam.nColor = 3;
00105   csParam.nSpin = 4;
00106   csParam.nDim = 5;
00107   for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d];
00108   csParam.x[4] = Ls;
00109   csParam.precision = inv_param.cpu_prec;
00110   csParam.pad = 0;
00111   if (test_type < 2) {
00112     csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00113     csParam.x[0] /= 2;
00114   } else {
00115     csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
00116   }    
00117   csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
00118   csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
00119   csParam.gammaBasis = inv_param.gamma_basis;
00120   csParam.create = QUDA_ZERO_FIELD_CREATE;
00121   
00122   spinor = new cpuColorSpinorField(csParam);
00123   spinorOut = new cpuColorSpinorField(csParam);
00124   spinorRef = new cpuColorSpinorField(csParam);
00125 
00126   csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
00127   csParam.x[0] = gauge_param.X[0];
00128   
00129   printfQuda("Randomizing fields... ");
00130 
00131   construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param);
00132   spinor->Source(QUDA_RANDOM_SOURCE);
00133 
00134   printfQuda("done.\n"); fflush(stdout);
00135   
00136   int dev = 0;
00137   initQuda(dev);
00138 
00139   printfQuda("Sending gauge field to GPU\n");
00140 
00141   loadGaugeQuda(hostGauge, &gauge_param);
00142 
00143   if (!transfer) {
00144     csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION;
00145     csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
00146     csParam.pad = inv_param.sp_pad;
00147     csParam.precision = inv_param.cuda_prec;
00148     if (csParam.precision == QUDA_DOUBLE_PRECISION ) {
00149       csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
00150     } else {
00151       /* Single and half */
00152       csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER;
00153     }
00154  
00155     if (test_type < 2) {
00156       csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00157       csParam.x[0] /= 2;
00158     }
00159 
00160     printfQuda("Creating cudaSpinor\n");
00161     cudaSpinor = new cudaColorSpinorField(csParam);
00162     printfQuda("Creating cudaSpinorOut\n");
00163     cudaSpinorOut = new cudaColorSpinorField(csParam);
00164 
00165     if (test_type == 2) csParam.x[0] /= 2;
00166 
00167     csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00168     tmp = new cudaColorSpinorField(csParam);
00169 
00170     printfQuda("Sending spinor field to GPU\n");
00171     *cudaSpinor = *spinor;
00172 
00173     std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << 
00174       norm2(*cudaSpinor) << std::endl;
00175 
00176     bool pc = (test_type != 2);
00177     DiracParam diracParam;
00178     setDiracParam(diracParam, &inv_param, pc);
00179     diracParam.verbose = QUDA_VERBOSE;
00180     diracParam.tmp1 = tmp;
00181     diracParam.tmp2 = tmp2;
00182     
00183     dirac = Dirac::create(diracParam);
00184 
00185   } else {
00186     std::cout << "Source: CPU = " << norm2(*spinor) << std::endl;
00187   }
00188     
00189 }
00190 
00191 void end() {
00192   if (!transfer) {
00193     delete dirac;
00194     delete cudaSpinor;
00195     delete cudaSpinorOut;
00196     delete tmp;
00197   }
00198 
00199   // release memory
00200   delete spinor;
00201   delete spinorOut;
00202   delete spinorRef;
00203 
00204   for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
00205   endQuda();
00206 }
00207 
00208 // execute kernel
00209 double dslashCUDA() {
00210 
00211   printfQuda("Executing %d kernel loops...\n", loops);
00212   fflush(stdout);
00213 
00214   cudaEvent_t start, end;
00215   cudaEventCreate(&start);
00216   cudaEventRecord(start, 0);
00217   cudaEventSynchronize(start);
00218 
00219   for (int i = 0; i < loops; i++) {
00220     switch (test_type) {
00221     case 0:
00222       if (transfer) {
00223         dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity);
00224       } else {
00225         dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
00226       }
00227       break;
00228     case 1:
00229     case 2:
00230       if (transfer) {
00231         MatQuda(spinorOut->V(), spinor->V(), &inv_param);
00232       } else {
00233         dirac->M(*cudaSpinorOut, *cudaSpinor);
00234       }
00235       break;
00236     }
00237   }
00238     
00239   cudaEventCreate(&end);
00240   cudaEventRecord(end, 0);
00241   cudaEventSynchronize(end);
00242   float runTime;
00243   cudaEventElapsedTime(&runTime, start, end);
00244   cudaEventDestroy(start);
00245   cudaEventDestroy(end);
00246 
00247   double secs = runTime / 1000; //stopwatchReadSeconds();
00248 
00249   // check for errors
00250   cudaError_t stat = cudaGetLastError();
00251   if (stat != cudaSuccess)
00252     printf("with ERROR: %s\n", cudaGetErrorString(stat));
00253 
00254   printf("done.\n\n");
00255 
00256   return secs;
00257 }
00258 
00259 void dslashRef() {
00260 
00261   // FIXME: remove once reference clover is finished
00262   if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
00263     inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
00264   } else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
00265     inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
00266   }
00267 
00268   // compare to dslash reference implementation
00269   printf("Calculating reference implementation...");
00270   fflush(stdout);
00271   switch (test_type) {
00272   case 0:
00273     dslash(spinorRef->V(), hostGauge, spinor->V(), parity, dagger, 
00274            inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass);
00275     break;
00276   case 1:    
00277     matpc(spinorRef->V(), hostGauge, spinor->V(), kappa5, inv_param.matpc_type, dagger, 
00278           inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass);
00279     break;
00280   case 2:
00281     mat(spinorRef->V(), hostGauge, spinor->V(), kappa5, dagger, 
00282         inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass);
00283     break;
00284   default:
00285     printf("Test type not defined\n");
00286     exit(-1);
00287   }
00288 
00289   printf("done.\n");
00290     
00291 }
00292 
00293 int main(int argc, char **argv)
00294 {
00295   init();
00296 
00297   float spinorGiB = (float)Vh*Ls*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30);
00298   printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
00299   printf("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB);
00300   
00301   int attempts = 1;
00302   dslashRef();
00303 
00304   for (int i=0; i<attempts; i++) {
00305     
00306     if (tune) { // warm-up run
00307       printfQuda("Tuning...\n");
00308       setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE);      
00309       dslashCUDA(1);
00310     }
00311 
00312     double secs = dslashCUDA();
00313 
00314     if (!transfer) *spinorOut = *cudaSpinorOut;
00315 
00316     // print timing information
00317     printf("%fms per loop\n", 1000*secs);
00318     
00319     unsigned long long flops = 0;
00320     if (!transfer) flops = dirac->Flops();
00321 
00322     int spinor_floats = test_type ? 2*(9*24+24)+24 : 9*24+24;
00323     if (inv_param.cuda_prec == QUDA_HALF_PRECISION) 
00324       spinor_floats += test_type ? 2*(9*2 + 2) + 2 : 9*2 + 2; // relative size of norm is twice a short
00325     int gauge_floats = (test_type ? 2 : 1) * (gauge_param.gauge_fix ? 6 : 8) * gauge_param.reconstruct;
00326 
00327     printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs);
00328     printfQuda("GB/s = %f\n\n", 
00329                (float)Vh*Ls*(spinor_floats+gauge_floats)*inv_param.cuda_prec/((secs/loops)*1e+9));
00330 
00331 
00332     if (!transfer) {
00333       std::cout << "Results: CPU = " << norm2(*spinorRef) << ", CUDA = " << norm2(*cudaSpinorOut) << 
00334         ", CPU-CUDA = " << norm2(*spinorOut) << std::endl;
00335     } else {
00336       std::cout << "Result: CPU = " << norm2(*spinorRef) << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl;
00337     }
00338     
00339     cpuColorSpinorField::Compare(*spinorRef, *spinorOut);
00340   }    
00341   end();
00342 }