QUDA: quda/tests/domain_wall_dslash

QUDA v0.3.2
A library for QCD on GPUs
00001 #include <iostream>
00002 #include <stdio.h>
00003 #include <stdlib.h>
00004 
00005 #include <quda.h>
00006 #include <quda_internal.h>
00007 #include <dirac_quda.h>
00008 #include <dslash_quda.h>
00009 #include <invert_quda.h>
00010 #include <util_quda.h>
00011 #include <blas_quda.h>
00012 
00013 #include <test_util.h>
00014 #include <domain_wall_dslash_reference.h>
00015 
00016 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
00017 const int test_type = 2;
00018 
00019 const QudaParity parity = QUDA_EVEN_PARITY; // even or odd?
00020 const QudaDagType dagger = QUDA_DAG_NO;     // apply Dslash or Dslash dagger?
00021 const int transfer = 0; // include transfer time in the benchmark?
00022 
00023 const int loops = 100;
00024 
00025 const int Ls = 16;
00026 double kappa5;
00027 
00028 QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION;
00029 QudaPrecision cuda_prec = QUDA_SINGLE_PRECISION;
00030 
00031 QudaGaugeParam gauge_param;
00032 QudaInvertParam inv_param;
00033 
00034 FullGauge gauge;
00035 
00036 cpuColorSpinorField *spinor, *spinorOut, *spinorRef;
00037 cudaColorSpinorField *cudaSpinor, *cudaSpinorOut, *tmp=0, *tmp2=0;
00038 
00039 void *hostGauge[4];
00040 
00041 Dirac *dirac;
00042 
00043 void init() {
00044 
00045   gauge_param = newQudaGaugeParam();
00046   inv_param = newQudaInvertParam();
00047 
00048   gauge_param.X[0] = 12;
00049   gauge_param.X[1] = 12;
00050   gauge_param.X[2] = 12;
00051   gauge_param.X[3] = 12;
00052   
00053   setDims(gauge_param.X, Ls);
00054 
00055   gauge_param.anisotropy = 2.3;
00056 
00057   gauge_param.type = QUDA_WILSON_LINKS;
00058   gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
00059   gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
00060 
00061   gauge_param.cpu_prec = cpu_prec;
00062   gauge_param.cuda_prec = cuda_prec;
00063   gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
00064   gauge_param.reconstruct_sloppy = gauge_param.reconstruct;
00065   gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec;
00066   gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
00067   gauge_param.type = QUDA_WILSON_LINKS;
00068 
00069   inv_param.inv_type = QUDA_CG_INVERTER;
00070 
00071   inv_param.mass = 0.01;
00072   inv_param.m5 = -1.5;
00073   kappa5 = 0.5/(5 + inv_param.m5);
00074 
00075   inv_param.Ls = Ls;
00076   
00077   inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
00078   inv_param.dagger = dagger;
00079 
00080   inv_param.cpu_prec = cpu_prec;
00081   inv_param.cuda_prec = cuda_prec;
00082 
00083   gauge_param.ga_pad = 0;
00084   inv_param.sp_pad = 0;
00085   inv_param.cl_pad = 0;
00086 
00087   inv_param.dirac_order = QUDA_DIRAC_ORDER;
00088 
00089   if (test_type == 2) {
00090     inv_param.solution_type = QUDA_MAT_SOLUTION;
00091   } else {
00092     inv_param.solution_type = QUDA_MATPC_SOLUTION;
00093   }
00094 
00095   inv_param.dslash_type = QUDA_DOMAIN_WALL_DSLASH;
00096 
00097   inv_param.verbosity = QUDA_VERBOSE;
00098 
00099   // construct input fields
00100   for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec);
00101 
00102   ColorSpinorParam csParam;
00103   
00104   csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
00105   csParam.nColor = 3;
00106   csParam.nSpin = 4;
00107   csParam.nDim = 5;
00108   for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d];
00109   csParam.x[4] = Ls;
00110   csParam.precision = inv_param.cpu_prec;
00111   csParam.pad = 0;
00112   if (test_type < 2) {
00113     csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00114     csParam.x[0] /= 2;
00115   } else {
00116     csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
00117   }    
00118   csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
00119   csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
00120   csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
00121   csParam.create = QUDA_ZERO_FIELD_CREATE;
00122   
00123   spinor = new cpuColorSpinorField(csParam);
00124   spinorOut = new cpuColorSpinorField(csParam);
00125   spinorRef = new cpuColorSpinorField(csParam);
00126 
00127   csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
00128   csParam.x[0] = gauge_param.X[0];
00129   
00130   printfQuda("Randomizing fields... ");
00131 
00132   construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param);
00133   spinor->Source(QUDA_RANDOM_SOURCE);
00134 
00135   printfQuda("done.\n"); fflush(stdout);
00136   
00137   int dev = 0;
00138   initQuda(dev);
00139 
00140   printfQuda("Sending gauge field to GPU\n");
00141 
00142   loadGaugeQuda(hostGauge, &gauge_param);
00143   gauge = cudaGaugePrecise;
00144 
00145   if (!transfer) {
00146     csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION;
00147     csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
00148     csParam.pad = inv_param.sp_pad;
00149     csParam.precision = inv_param.cuda_prec;
00150     if (csParam.precision == QUDA_DOUBLE_PRECISION ) {
00151       csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
00152     } else {
00153       /* Single and half */
00154       csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER;
00155     }
00156  
00157     if (test_type < 2) {
00158       csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00159       csParam.x[0] /= 2;
00160     }
00161 
00162     printfQuda("Creating cudaSpinor\n");
00163     cudaSpinor = new cudaColorSpinorField(csParam);
00164     printfQuda("Creating cudaSpinorOut\n");
00165     cudaSpinorOut = new cudaColorSpinorField(csParam);
00166 
00167     if (test_type == 2) csParam.x[0] /= 2;
00168 
00169     csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00170     tmp = new cudaColorSpinorField(csParam);
00171 
00172     printfQuda("Sending spinor field to GPU\n");
00173     *cudaSpinor = *spinor;
00174 
00175     std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << 
00176       norm2(*cudaSpinor) << std::endl;
00177 
00178     bool pc = (test_type != 2);
00179     DiracParam diracParam;
00180     setDiracParam(diracParam, &inv_param, pc);
00181     diracParam.verbose = QUDA_VERBOSE;
00182     diracParam.tmp1 = tmp;
00183     diracParam.tmp2 = tmp2;
00184     
00185     dirac = Dirac::create(diracParam);
00186 
00187   } else {
00188     std::cout << "Source: CPU = " << norm2(*spinor) << std::endl;
00189   }
00190     
00191 }
00192 
00193 void end() {
00194   if (!transfer) {
00195     delete dirac;
00196     delete cudaSpinor;
00197     delete cudaSpinorOut;
00198     delete tmp;
00199   }
00200 
00201   // release memory
00202   delete spinor;
00203   delete spinorOut;
00204   delete spinorRef;
00205 
00206   for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
00207   endQuda();
00208 }
00209 
00210 // execute kernel
00211 double dslashCUDA() {
00212 
00213   printfQuda("Executing %d kernel loops...\n", loops);
00214   fflush(stdout);
00215   stopwatchStart();
00216   for (int i = 0; i < loops; i++) {
00217     switch (test_type) {
00218     case 0:
00219       if (transfer) {
00220         dslashQuda(spinorOut->v, spinor->v, &inv_param, parity);
00221       } else {
00222         dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
00223       }
00224       break;
00225     case 1:
00226     case 2:
00227       if (transfer) {
00228         MatQuda(spinorOut->v, spinor->v, &inv_param);
00229       } else {
00230         dirac->M(*cudaSpinorOut, *cudaSpinor);
00231       }
00232       break;
00233     }
00234   }
00235     
00236   // check for errors
00237   cudaError_t stat = cudaGetLastError();
00238   if (stat != cudaSuccess)
00239     printf("with ERROR: %s\n", cudaGetErrorString(stat));
00240 
00241   cudaThreadSynchronize();
00242   double secs = stopwatchReadSeconds();
00243   printf("done.\n\n");
00244 
00245   return secs;
00246 }
00247 
00248 void dslashRef() {
00249 
00250   // FIXME: remove once reference clover is finished
00251   if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
00252     inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
00253   } else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
00254     inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
00255   }
00256 
00257   // compare to dslash reference implementation
00258   printf("Calculating reference implementation...");
00259   fflush(stdout);
00260   switch (test_type) {
00261   case 0:
00262     dslash(spinorRef->v, hostGauge, spinor->v, parity, dagger, 
00263            inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass);
00264     break;
00265   case 1:    
00266     matpc(spinorRef->v, hostGauge, spinor->v, kappa5, inv_param.matpc_type, dagger, 
00267           inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass);
00268     break;
00269   case 2:
00270     mat(spinorRef->v, hostGauge, spinor->v, kappa5, dagger, 
00271         inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass);
00272     break;
00273   default:
00274     printf("Test type not defined\n");
00275     exit(-1);
00276   }
00277 
00278   printf("done.\n");
00279     
00280 }
00281 
00282 int main(int argc, char **argv)
00283 {
00284   init();
00285 
00286   float spinorGiB = (float)Vh*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30);
00287   float sharedKB = 0;//(float)dslashCudaSharedBytes(inv_param.cuda_prec) / (1 << 10);
00288   printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
00289   printf("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB);
00290   printf("Shared mem: %.3f KB\n", sharedKB);
00291   
00292   int attempts = 1;
00293   dslashRef();
00294 
00295   for (int i=0; i<attempts; i++) {
00296     
00297     double secs = dslashCUDA();
00298 
00299     if (!transfer) *spinorOut = *cudaSpinorOut;
00300 
00301     // print timing information
00302     printf("%fms per loop\n", 1000*secs);
00303     
00304     unsigned long long flops = 0;
00305     if (!transfer) flops = dirac->Flops();
00306     int floats = test_type ? 2*(7*24+8*gauge_param.packed_size+24)+24 : 7*24+8*gauge_param.packed_size+24;
00307     printf("GFLOPS = %f\n", 1.0e-9*flops/secs);
00308     printf("GiB/s = %f\n\n", Vh*floats*sizeof(float)/((secs/loops)*(1<<30)));
00309     
00310     if (!transfer) {
00311       std::cout << "Results: CPU = " << norm2(*spinorRef) << ", CUDA = " << norm2(*cudaSpinorOut) << 
00312         ", CPU-CUDA = " << norm2(*spinorOut) << std::endl;
00313     } else {
00314       std::cout << "Result: CPU = " << norm2(*spinorRef) << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl;
00315     }
00316     
00317     cpuColorSpinorField::Compare(*spinorRef, *spinorOut);
00318   }    
00319   end();
00320 }
quda/tests/domain_wall_dslash_test.cpp