QUDA: quda/tests/wilson_dslash_test.cpp Source File

QUDA v0.4.0
A library for QCD on GPUs
00001 #include <iostream>
00002 #include <stdio.h>
00003 #include <stdlib.h>
00004 #include <string.h>
00005 
00006 #include <quda.h>
00007 #include <quda_internal.h>
00008 #include <dirac_quda.h>
00009 #include <dslash_quda.h>
00010 #include <invert_quda.h>
00011 #include <util_quda.h>
00012 #include <blas_quda.h>
00013 
00014 #include <test_util.h>
00015 #include <wilson_dslash_reference.h>
00016 #include "misc.h"
00017 
00018 #include <gauge_qio.h>
00019 
00020 #define MAX(a,b) ((a)>(b)?(a):(b))
00021 
00022 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
00023 const int test_type = 0;
00024 
00025 const QudaParity parity = QUDA_EVEN_PARITY; // even or odd?
00026 const int transfer = 0; // include transfer time in the benchmark?
00027 
00028 const int loops = 100;
00029 
00030 QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION;
00031 QudaPrecision cuda_prec;
00032 
00033 QudaGaugeParam gauge_param;
00034 QudaInvertParam inv_param;
00035 
00036 cpuColorSpinorField *spinor, *spinorOut, *spinorRef;
00037 cudaColorSpinorField *cudaSpinor, *cudaSpinorOut, *tmp1=0, *tmp2=0;
00038 
00039 void *hostGauge[4], *hostClover, *hostCloverInv;
00040 
00041 Dirac *dirac;
00042 
00043 // Dirac operator type
00044 extern QudaDslashType dslash_type;
00045 
00046 extern bool tune;
00047 
00048 extern int device;
00049 extern int xdim;
00050 extern int ydim;
00051 extern int zdim;
00052 extern int tdim;
00053 extern int gridsize_from_cmdline[];
00054 extern QudaReconstructType link_recon;
00055 extern QudaPrecision prec;
00056 extern bool kernelPackT;
00057 extern QudaDagType dagger;
00058 
00059 extern char latfile[];
00060 
00061 void init(int argc, char **argv) {
00062 
00063 
00064   kernelPackT = false; // Set true for kernel T face packing
00065   cuda_prec= prec;
00066 
00067   gauge_param = newQudaGaugeParam();
00068   inv_param = newQudaInvertParam();
00069 
00070   gauge_param.X[0] = xdim;
00071   gauge_param.X[1] = ydim;
00072   gauge_param.X[2] = zdim;
00073   gauge_param.X[3] = tdim;
00074   setDims(gauge_param.X);
00075 
00076   gauge_param.anisotropy = 1.0;
00077 
00078   gauge_param.type = QUDA_WILSON_LINKS;
00079   gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
00080   gauge_param.t_boundary = QUDA_PERIODIC_T;
00081 
00082   gauge_param.cpu_prec = cpu_prec;
00083   gauge_param.cuda_prec = cuda_prec;
00084   gauge_param.reconstruct = link_recon;
00085   gauge_param.reconstruct_sloppy = link_recon;
00086   gauge_param.cuda_prec_sloppy = cuda_prec;
00087   gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
00088 
00089   inv_param.kappa = 0.1;
00090 
00091   if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
00092     inv_param.mu = 0.01;
00093     inv_param.twist_flavor = QUDA_TWIST_MINUS;
00094   }
00095 
00096   inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
00097   inv_param.dagger = dagger;
00098 
00099   inv_param.cpu_prec = cpu_prec;
00100   if (inv_param.cpu_prec != gauge_param.cpu_prec) 
00101     errorQuda("Gauge and spinor cpu precisions must match");
00102 
00103   inv_param.cuda_prec = cuda_prec;
00104 
00105 #ifndef MULTI_GPU // free parameter for single GPU
00106   gauge_param.ga_pad = 0;
00107 #else // must be this one c/b face for multi gpu
00108   int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
00109   int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
00110   int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
00111   int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
00112   int pad_size =MAX(x_face_size, y_face_size);
00113   pad_size = MAX(pad_size, z_face_size);
00114   pad_size = MAX(pad_size, t_face_size);
00115   gauge_param.ga_pad = pad_size;    
00116 #endif
00117   inv_param.sp_pad = 0;
00118   inv_param.cl_pad = 0;
00119 
00120   //inv_param.sp_pad = 24*24*24;
00121   //inv_param.cl_pad = 24*24*24;
00122 
00123   inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // test code only supports DeGrand-Rossi Basis
00124   inv_param.dirac_order = QUDA_DIRAC_ORDER;
00125 
00126   if (test_type == 2) {
00127     inv_param.solution_type = QUDA_MAT_SOLUTION;
00128   } else {
00129     inv_param.solution_type = QUDA_MATPC_SOLUTION;
00130   }
00131 
00132   inv_param.dslash_type = dslash_type;
00133 
00134   if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
00135     inv_param.clover_cpu_prec = cpu_prec;
00136     inv_param.clover_cuda_prec = cuda_prec;
00137     inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec;
00138     inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
00139     //if (test_type > 0) {
00140       hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
00141       hostCloverInv = hostClover; // fake it
00142       /*} else {
00143       hostClover = NULL;
00144       hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
00145       }*/
00146   } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
00147 
00148   }
00149 
00150   //inv_param.verbosity = QUDA_VERBOSE;
00151 
00152   // construct input fields
00153   for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec);
00154 
00155   ColorSpinorParam csParam;
00156   
00157   csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
00158   csParam.nColor = 3;
00159   csParam.nSpin = 4;
00160   if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
00161     csParam.twistFlavor = inv_param.twist_flavor;
00162   }
00163   csParam.nDim = 4;
00164   for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d];
00165   csParam.precision = inv_param.cpu_prec;
00166   csParam.pad = 0;
00167   if (test_type < 2) {
00168     csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00169     csParam.x[0] /= 2;
00170   } else {
00171     csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
00172   }    
00173   csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
00174   csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
00175   csParam.gammaBasis = inv_param.gamma_basis; 
00176   csParam.create = QUDA_ZERO_FIELD_CREATE;
00177 
00178   //csParam.verbose = QUDA_DEBUG_VERBOSE;
00179 
00180   spinor = new cpuColorSpinorField(csParam);
00181   spinorOut = new cpuColorSpinorField(csParam);
00182   spinorRef = new cpuColorSpinorField(csParam);
00183 
00184   csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
00185   csParam.x[0] = gauge_param.X[0];
00186   
00187   printfQuda("Randomizing fields... ");
00188 
00189   if (strcmp(latfile,"")) {  // load in the command line supplied gauge field
00190     read_gauge_field(latfile, hostGauge, gauge_param.cpu_prec, gauge_param.X, argc, argv);
00191     construct_gauge_field(hostGauge, 2, gauge_param.cpu_prec, &gauge_param);
00192   } else { // else generate a random SU(3) field
00193     construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param);
00194   }
00195 
00196   spinor->Source(QUDA_RANDOM_SOURCE);
00197 
00198   if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
00199     double norm = 0.0; // clover components are random numbers in the range (-norm, norm)
00200     double diag = 1.0; // constant added to the diagonal
00201 
00202     if (test_type == 2) {
00203       construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec);
00204     } else {
00205       construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec);
00206     }
00207   }
00208   printfQuda("done.\n"); fflush(stdout);
00209   
00210   initQuda(device);
00211 
00212   printfQuda("Sending gauge field to GPU\n");
00213   loadGaugeQuda(hostGauge, &gauge_param);
00214 
00215   if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
00216     printfQuda("Sending clover field to GPU\n");
00217     loadCloverQuda(hostClover, hostCloverInv, &inv_param);
00218     //clover = cudaCloverPrecise;
00219   }
00220 
00221   if (!transfer) {
00222     csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION;
00223     csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
00224     csParam.pad = inv_param.sp_pad;
00225     csParam.precision = inv_param.cuda_prec;
00226     if (csParam.precision == QUDA_DOUBLE_PRECISION ) {
00227       csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
00228     } else {
00229       /* Single and half */
00230       csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER;
00231     }
00232  
00233     if (test_type < 2) {
00234       csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00235       csParam.x[0] /= 2;
00236     }
00237 
00238     printfQuda("Creating cudaSpinor\n");
00239     cudaSpinor = new cudaColorSpinorField(csParam);
00240     printfQuda("Creating cudaSpinorOut\n");
00241     cudaSpinorOut = new cudaColorSpinorField(csParam);
00242 
00243     if (test_type == 2) csParam.x[0] /= 2;
00244 
00245     csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00246     tmp1 = new cudaColorSpinorField(csParam);
00247     if (dslash_type == QUDA_CLOVER_WILSON_DSLASH ||
00248         dslash_type == QUDA_TWISTED_MASS_DSLASH) {
00249       tmp2 = new cudaColorSpinorField(csParam);
00250     }
00251 
00252     printfQuda("Sending spinor field to GPU\n");
00253     *cudaSpinor = *spinor;
00254 
00255     std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << 
00256       norm2(*cudaSpinor) << std::endl;
00257 
00258     bool pc = (test_type != 2);
00259     DiracParam diracParam;
00260     setDiracParam(diracParam, &inv_param, pc);
00261     diracParam.verbose = QUDA_VERBOSE;
00262     diracParam.tmp1 = tmp1;
00263     diracParam.tmp2 = tmp2;
00264     
00265     dirac = Dirac::create(diracParam);
00266   } else {
00267     std::cout << "Source: CPU = " << norm2(*spinor) << std::endl;
00268   }
00269     
00270 }
00271 
00272 void end() {
00273   if (!transfer) {
00274     delete dirac;
00275     delete cudaSpinor;
00276     delete cudaSpinorOut;
00277     delete tmp1;
00278     delete tmp2;
00279   }
00280 
00281   // release memory
00282   delete spinor;
00283   delete spinorOut;
00284   delete spinorRef;
00285 
00286   for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
00287   if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
00288     if (test_type == 2) free(hostClover);
00289     else free(hostCloverInv);
00290   }
00291   endQuda();
00292 
00293 }
00294 
00295 // execute kernel
00296 double dslashCUDA(int niter) {
00297 
00298   cudaEvent_t start, end;
00299   cudaEventCreate(&start);
00300   cudaEventCreate(&end);
00301   cudaEventRecord(start, 0);
00302 
00303   for (int i = 0; i < niter; i++) {
00304     switch (test_type) {
00305     case 0:
00306       if (transfer) {
00307         dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity);
00308       } else {
00309         dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
00310       }
00311       break;
00312     case 1:
00313     case 2:
00314       if (transfer) {
00315         MatQuda(spinorOut->V(), spinor->V(), &inv_param);
00316       } else {
00317         dirac->M(*cudaSpinorOut, *cudaSpinor);
00318       }
00319       break;
00320     }
00321   }
00322     
00323   cudaEventRecord(end, 0);
00324   cudaEventSynchronize(end);
00325   float runTime;
00326   cudaEventElapsedTime(&runTime, start, end);
00327   cudaEventDestroy(start);
00328   cudaEventDestroy(end);
00329 
00330   double secs = runTime / 1000; //stopwatchReadSeconds();
00331 
00332   // check for errors
00333   cudaError_t stat = cudaGetLastError();
00334   if (stat != cudaSuccess)
00335     printfQuda("with ERROR: %s\n", cudaGetErrorString(stat));
00336 
00337   return secs;
00338 }
00339 
00340 void dslashRef() {
00341 
00342   // FIXME: remove once reference clover is finished
00343   if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
00344     if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
00345       inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
00346     } else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
00347       inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
00348     }
00349   }
00350 
00351   // compare to dslash reference implementation
00352   printfQuda("Calculating reference implementation...");
00353   fflush(stdout);
00354 
00355   if (dslash_type == QUDA_CLOVER_WILSON_DSLASH ||
00356       dslash_type == QUDA_WILSON_DSLASH) {
00357     switch (test_type) {
00358     case 0:
00359       wil_dslash(spinorRef->V(), hostGauge, spinor->V(), parity, dagger, inv_param.cpu_prec, gauge_param);
00360       break;
00361     case 1:    
00362       wil_matpc(spinorRef->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.matpc_type, dagger, 
00363                 inv_param.cpu_prec, gauge_param);
00364       break;
00365     case 2:
00366       wil_mat(spinorRef->V(), hostGauge, spinor->V(), inv_param.kappa, dagger, inv_param.cpu_prec, gauge_param);
00367       break;
00368     default:
00369       printfQuda("Test type not defined\n");
00370       exit(-1);
00371     }
00372   } else { // twisted mass
00373     switch (test_type) {
00374     case 0:
00375       tm_dslash(spinorRef->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
00376                 parity, dagger, inv_param.cpu_prec, gauge_param);
00377       break;
00378     case 1:    
00379       tm_matpc(spinorRef->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
00380                inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param);
00381       break;
00382     case 2:
00383       tm_mat(spinorRef->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
00384              dagger, inv_param.cpu_prec, gauge_param);
00385       break;
00386     default:
00387       printfQuda("Test type not defined\n");
00388       exit(-1);
00389     }
00390   }
00391 
00392   printfQuda("done.\n");
00393 }
00394 
00395 
00396 void display_test_info()
00397 {
00398   printfQuda("running the following test:\n");
00399  
00400   printfQuda("prec recon   test_type     dagger   S_dim         T_dimension   dslash_type\n");
00401   printfQuda("%s   %s       %d           %d       %d/%d/%d        %d            %s\n", 
00402              get_prec_str(prec), get_recon_str(link_recon), 
00403              test_type, dagger, xdim, ydim, zdim, tdim, 
00404              get_dslash_type_str(dslash_type));
00405   printfQuda("Grid partition info:     X  Y  Z  T\n"); 
00406   printfQuda("                         %d  %d  %d  %d\n", 
00407              commDimPartitioned(0),
00408              commDimPartitioned(1),
00409              commDimPartitioned(2),
00410              commDimPartitioned(3));
00411 
00412   return ;
00413     
00414 }
00415 
00416 extern void usage(char**);
00417 
00418 
00419 int main(int argc, char **argv)
00420 {
00421 
00422   for (int i =1;i < argc; i++){    
00423     if(process_command_line_option(argc, argv, &i) == 0){
00424       continue;
00425     }  
00426     
00427     fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
00428     usage(argv);
00429   }
00430 
00431 
00432   initCommsQuda(argc, argv, gridsize_from_cmdline, 4);
00433 
00434   display_test_info();
00435 
00436   init(argc, argv);
00437 
00438   float spinorGiB = (float)Vh*spinorSiteSize*inv_param.cuda_prec / (1 << 30);
00439   printfQuda("\nSpinor mem: %.3f GiB\n", spinorGiB);
00440   printfQuda("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB);
00441   
00442   int attempts = 1;
00443   dslashRef();
00444   for (int i=0; i<attempts; i++) {
00445 
00446     if (tune) { // warm-up run
00447       printfQuda("Tuning...\n");
00448       setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE);
00449       dslashCUDA(1);
00450     }
00451     printfQuda("Executing %d kernel loops...\n", loops);
00452     dirac->Flops();
00453     double secs = dslashCUDA(loops);
00454     printfQuda("done.\n\n");
00455 
00456 #ifdef DSLASH_PROFILING
00457     printDslashProfile();
00458 #endif
00459 
00460     if (!transfer) *spinorOut = *cudaSpinorOut;
00461 
00462     // print timing information
00463     printfQuda("%fms per loop\n", 1000*secs);
00464     
00465     unsigned long long flops = 0;
00466     if (!transfer) flops = dirac->Flops();
00467     int spinor_floats = test_type ? 2*(7*24+24)+24 : 7*24+24;
00468     if (inv_param.cuda_prec == QUDA_HALF_PRECISION) 
00469       spinor_floats += test_type ? 2*(7*2 + 2) + 2 : 7*2 + 2; // relative size of norm is twice a short
00470     int gauge_floats = (test_type ? 2 : 1) * (gauge_param.gauge_fix ? 6 : 8) * gauge_param.reconstruct;
00471     if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
00472       gauge_floats += test_type ? 72*2 : 72;
00473     }
00474     printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs);
00475     printfQuda("GB/s = %f\n\n", 
00476                Vh*(spinor_floats+gauge_floats)*inv_param.cuda_prec/((secs/loops)*1e+9));
00477     
00478     if (!transfer) {
00479       double norm2_cpu = norm2(*spinorRef);
00480       double norm2_cuda= norm2(*cudaSpinorOut);
00481       double norm2_cpu_cuda= norm2(*spinorOut);
00482       printfQuda("Results: CPU = %f, CUDA=%f, CPU-CUDA = %f\n", norm2_cpu, norm2_cuda, norm2_cpu_cuda);
00483     } else {
00484       double norm2_cpu = norm2(*spinorRef);
00485       double norm2_cpu_cuda= norm2(*spinorOut);
00486       printfQuda("Result: CPU = %f, CPU-QUDA = %f\n",  norm2_cpu, norm2_cpu_cuda);
00487     }
00488     
00489     cpuColorSpinorField::Compare(*spinorRef, *spinorOut);
00490   }    
00491   end();
00492 
00493   endCommsQuda();
00494 }