QUDA: quda/tests/wilson_dslash_test.cpp Source File

QUDA v0.3.2
A library for QCD on GPUs
00001 #include <iostream>
00002 #include <stdio.h>
00003 #include <stdlib.h>
00004 
00005 #include <quda.h>
00006 #include <quda_internal.h>
00007 #include <dirac_quda.h>
00008 #include <dslash_quda.h>
00009 #include <invert_quda.h>
00010 #include <util_quda.h>
00011 #include <blas_quda.h>
00012 
00013 #include <test_util.h>
00014 #include <wilson_dslash_reference.h>
00015 
00016 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
00017 const int test_type = 1;
00018 // clover-improved? (0 = plain Wilson, 1 = clover)
00019 const int clover_yes = 0;
00020 
00021 const QudaParity parity = QUDA_EVEN_PARITY; // even or odd?
00022 const QudaDagType dagger = QUDA_DAG_NO;     // apply Dslash or Dslash dagger?
00023 const int transfer = 0; // include transfer time in the benchmark?
00024 
00025 const int loops = 100;
00026 
00027 QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION;
00028 QudaPrecision cuda_prec = QUDA_SINGLE_PRECISION;
00029 
00030 QudaGaugeParam gauge_param;
00031 QudaInvertParam inv_param;
00032 
00033 FullGauge gauge;
00034 FullClover clover, cloverInv;
00035 
00036 cpuColorSpinorField *spinor, *spinorOut, *spinorRef;
00037 cudaColorSpinorField *cudaSpinor, *cudaSpinorOut, *tmp=0, *tmp2=0;
00038 
00039 void *hostGauge[4], *hostClover, *hostCloverInv;
00040 
00041 Dirac *dirac;
00042 
00043 void init() {
00044 
00045   gauge_param = newQudaGaugeParam();
00046   inv_param = newQudaInvertParam();
00047 
00048   gauge_param.X[0] = 24;
00049   gauge_param.X[1] = 24;
00050   gauge_param.X[2] = 24;
00051   gauge_param.X[3] = 24;
00052   setDims(gauge_param.X);
00053 
00054   gauge_param.anisotropy = 2.3;
00055 
00056   gauge_param.type = QUDA_WILSON_LINKS;
00057   gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
00058   gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
00059 
00060   gauge_param.cpu_prec = cpu_prec;
00061   gauge_param.cuda_prec = cuda_prec;
00062   gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
00063   gauge_param.reconstruct_sloppy = gauge_param.reconstruct;
00064   gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec;
00065   gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
00066 
00067   inv_param.kappa = 0.1;
00068 
00069   inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
00070   inv_param.dagger = dagger;
00071 
00072   inv_param.cpu_prec = cpu_prec;
00073   inv_param.cuda_prec = cuda_prec;
00074 
00075   gauge_param.ga_pad = 0;
00076   inv_param.sp_pad = 0;
00077   inv_param.cl_pad = 0;
00078 
00079   //gauge_param.ga_pad = 24*24*24;
00080   //inv_param.sp_pad = 24*24*24;
00081   //inv_param.cl_pad = 24*24*24;
00082 
00083   inv_param.dirac_order = QUDA_DIRAC_ORDER;
00084 
00085   if (test_type == 2) {
00086     inv_param.solution_type = QUDA_MAT_SOLUTION;
00087   } else {
00088     inv_param.solution_type = QUDA_MATPC_SOLUTION;
00089   }
00090 
00091   if (clover_yes) {
00092     inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
00093     inv_param.clover_cpu_prec = cpu_prec;
00094     inv_param.clover_cuda_prec = cuda_prec;
00095     inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec;
00096     inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
00097     if (test_type > 0) {
00098       hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
00099       hostCloverInv = hostClover; // fake it
00100     } else {
00101       hostClover = NULL;
00102       hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
00103     }
00104   } else {
00105     inv_param.dslash_type = QUDA_WILSON_DSLASH;
00106   }
00107 
00108   inv_param.verbosity = QUDA_VERBOSE;
00109 
00110   // construct input fields
00111   for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec);
00112 
00113   ColorSpinorParam csParam;
00114   
00115   csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
00116   csParam.nColor = 3;
00117   csParam.nSpin = 4;
00118   csParam.nDim = 4;
00119   for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d];
00120   csParam.precision = inv_param.cpu_prec;
00121   csParam.pad = 0;
00122   if (test_type < 2) {
00123     csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00124     csParam.x[0] /= 2;
00125   } else {
00126     csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
00127   }    
00128   csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
00129   csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
00130   csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
00131   csParam.create = QUDA_ZERO_FIELD_CREATE;
00132   
00133   spinor = new cpuColorSpinorField(csParam);
00134   spinorOut = new cpuColorSpinorField(csParam);
00135   spinorRef = new cpuColorSpinorField(csParam);
00136 
00137   csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
00138   csParam.x[0] = gauge_param.X[0];
00139   
00140   printfQuda("Randomizing fields... ");
00141 
00142   construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param);
00143   spinor->Source(QUDA_RANDOM_SOURCE);
00144 
00145   if (clover_yes) {
00146     double norm = 0.0; // clover components are random numbers in the range (-norm, norm)
00147     double diag = 1.0; // constant added to the diagonal
00148 
00149     if (test_type == 2) {
00150       construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec);
00151     } else {
00152       construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec);
00153     }
00154   }
00155   printfQuda("done.\n"); fflush(stdout);
00156   
00157   int dev = 0;
00158   initQuda(dev);
00159 
00160   printfQuda("Sending gauge field to GPU\n");
00161 
00162   loadGaugeQuda(hostGauge, &gauge_param);
00163   gauge = cudaGaugePrecise;
00164 
00165   if (clover_yes) {
00166     printfQuda("Sending clover field to GPU\n");
00167     loadCloverQuda(hostClover, hostCloverInv, &inv_param);
00168     clover = cudaCloverPrecise;
00169     cloverInv = cudaCloverInvPrecise;
00170   }
00171 
00172   if (!transfer) {
00173     csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION;
00174     csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
00175     csParam.pad = inv_param.sp_pad;
00176     csParam.precision = inv_param.cuda_prec;
00177     if (csParam.precision == QUDA_DOUBLE_PRECISION ) {
00178       csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
00179     } else {
00180       /* Single and half */
00181       csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER;
00182     }
00183  
00184     if (test_type < 2) {
00185       csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00186       csParam.x[0] /= 2;
00187     }
00188 
00189     printfQuda("Creating cudaSpinor\n");
00190     cudaSpinor = new cudaColorSpinorField(csParam);
00191     printfQuda("Creating cudaSpinorOut\n");
00192     cudaSpinorOut = new cudaColorSpinorField(csParam);
00193 
00194     if (test_type == 2) csParam.x[0] /= 2;
00195 
00196     csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00197     tmp = new cudaColorSpinorField(csParam);
00198     if (clover_yes) tmp2 = new cudaColorSpinorField(csParam);
00199 
00200     printfQuda("Sending spinor field to GPU\n");
00201     *cudaSpinor = *spinor;
00202 
00203     std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << 
00204       norm2(*cudaSpinor) << std::endl;
00205 
00206     bool pc = (test_type != 2);
00207     DiracParam diracParam;
00208     setDiracParam(diracParam, &inv_param, pc);
00209     diracParam.verbose = QUDA_VERBOSE;
00210     diracParam.tmp1 = tmp;
00211     diracParam.tmp2 = tmp2;
00212     
00213     dirac = Dirac::create(diracParam);
00214 
00215   } else {
00216     std::cout << "Source: CPU = " << norm2(*spinor) << std::endl;
00217   }
00218     
00219 }
00220 
00221 void end() {
00222   if (!transfer) {
00223     delete dirac;
00224     delete cudaSpinor;
00225     delete cudaSpinorOut;
00226     delete tmp;
00227     if (clover_yes) delete tmp2;
00228   }
00229 
00230   // release memory
00231   delete spinor;
00232   delete spinorOut;
00233   delete spinorRef;
00234 
00235   for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
00236   if (clover_yes) {
00237     if (test_type == 2) free(hostClover);
00238     else free(hostCloverInv);
00239   }
00240   endQuda();
00241 }
00242 
00243 // execute kernel
00244 double dslashCUDA() {
00245 
00246   printfQuda("Executing %d kernel loops...\n", loops);
00247   fflush(stdout);
00248   stopwatchStart();
00249   for (int i = 0; i < loops; i++) {
00250     switch (test_type) {
00251     case 0:
00252       if (transfer) {
00253         dslashQuda(spinorOut->v, spinor->v, &inv_param, parity);
00254       } else {
00255         dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
00256       }
00257       break;
00258     case 1:
00259     case 2:
00260       if (transfer) {
00261         MatQuda(spinorOut->v, spinor->v, &inv_param);
00262       } else {
00263         dirac->M(*cudaSpinorOut, *cudaSpinor);
00264       }
00265       break;
00266     }
00267   }
00268     
00269   // check for errors
00270   cudaError_t stat = cudaGetLastError();
00271   if (stat != cudaSuccess)
00272     printf("with ERROR: %s\n", cudaGetErrorString(stat));
00273 
00274   cudaThreadSynchronize();
00275   double secs = stopwatchReadSeconds();
00276   printf("done.\n\n");
00277 
00278   return secs;
00279 }
00280 
00281 void dslashRef() {
00282 
00283   // FIXME: remove once reference clover is finished
00284   if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
00285     inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
00286   } else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
00287     inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
00288   }
00289 
00290   // compare to dslash reference implementation
00291   printf("Calculating reference implementation...");
00292   fflush(stdout);
00293   switch (test_type) {
00294   case 0:
00295     dslash(spinorRef->v, hostGauge, spinor->v, parity, dagger, 
00296            inv_param.cpu_prec, gauge_param.cpu_prec);
00297     break;
00298   case 1:    
00299     matpc(spinorRef->v, hostGauge, spinor->v, inv_param.kappa, inv_param.matpc_type, dagger, 
00300           inv_param.cpu_prec, gauge_param.cpu_prec);
00301     break;
00302   case 2:
00303     mat(spinorRef->v, hostGauge, spinor->v, inv_param.kappa, dagger, 
00304         inv_param.cpu_prec, gauge_param.cpu_prec);
00305     break;
00306   default:
00307     printf("Test type not defined\n");
00308     exit(-1);
00309   }
00310 
00311   printf("done.\n");
00312     
00313 }
00314 
00315 int main(int argc, char **argv)
00316 {
00317   init();
00318 
00319   float spinorGiB = (float)Vh*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30);
00320   float sharedKB = 0;//(float)dslashCudaSharedBytes(inv_param.cuda_prec) / (1 << 10);
00321   printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
00322   printf("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB);
00323   printf("Shared mem: %.3f KB\n", sharedKB);
00324   
00325   int attempts = 1;
00326   dslashRef();
00327   for (int i=0; i<attempts; i++) {
00328     
00329     double secs = dslashCUDA();
00330 
00331     if (!transfer) *spinorOut = *cudaSpinorOut;
00332 
00333     // print timing information
00334     printf("%fms per loop\n", 1000*secs);
00335     
00336     unsigned long long flops = 0;
00337     if (!transfer) flops = dirac->Flops();
00338     int floats = test_type ? 2*(7*24+8*gauge_param.packed_size+24)+24 : 7*24+8*gauge_param.packed_size+24;
00339     if (clover_yes) {
00340       floats += test_type ? 72*2 : 72;
00341     }
00342     printf("GFLOPS = %f\n", 1.0e-9*flops/secs);
00343     printf("GiB/s = %f\n\n", Vh*floats*sizeof(float)/((secs/loops)*(1<<30)));
00344     
00345     if (!transfer) {
00346       std::cout << "Results: CPU = " << norm2(*spinorRef) << ", CUDA = " << norm2(*cudaSpinorOut) << 
00347         ", CPU-CUDA = " << norm2(*spinorOut) << std::endl;
00348     } else {
00349       std::cout << "Result: CPU = " << norm2(*spinorRef) << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl;
00350     }
00351     
00352     cpuColorSpinorField::Compare(*spinorRef, *spinorOut);
00353   }    
00354   end();
00355 }