QUDA: quda/tests/staggered_dslash

QUDA v0.4.0
A library for QCD on GPUs
00001 #include <iostream>
00002 #include <stdio.h>
00003 #include <stdlib.h>
00004 #include <string.h>
00005 
00006 #include <quda.h>
00007 #include <quda_internal.h>
00008 #include <dirac_quda.h>
00009 #include <dslash_quda.h>
00010 #include <invert_quda.h>
00011 #include <util_quda.h>
00012 #include <blas_quda.h>
00013 
00014 #include <misc.h>
00015 #include <test_util.h>
00016 #include <staggered_dslash_reference.h>
00017 #include <gauge_field.h>
00018 
00019 #include <face_quda.h>
00020 
00021 #include <assert.h>
00022 
00023 #define MAX(a,b) ((a)>(b)?(a):(b))
00024 #define staggeredSpinorSiteSize 6
00025 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
00026 
00027 extern void usage(char** argv );
00028 
00029 int test_type = 0;
00030 
00031 extern bool tune;
00032 
00033 QudaGaugeParam gaugeParam;
00034 QudaInvertParam inv_param;
00035 
00036 cpuGaugeField *cpuFat = NULL;
00037 cpuGaugeField *cpuLong = NULL;
00038 
00039 cpuColorSpinorField *spinor, *spinorOut, *spinorRef;
00040 cudaColorSpinorField *cudaSpinor, *cudaSpinorOut;
00041 
00042 cudaColorSpinorField* tmp;
00043 
00044 void *hostGauge[4];
00045 void *fatlink[4], *longlink[4];
00046 
00047 #ifdef MULTI_GPU
00048 const void **ghost_fatlink, **ghost_longlink;
00049 #endif
00050 
00051 const int loops = 100;
00052 
00053 QudaParity parity;
00054 extern QudaDagType dagger;
00055 int transfer = 0; // include transfer time in the benchmark?
00056 extern int xdim;
00057 extern int ydim;
00058 extern int zdim;
00059 extern int tdim;
00060 extern int gridsize_from_cmdline[];
00061 extern QudaReconstructType link_recon;
00062 extern QudaPrecision prec;
00063 
00064 extern int device;
00065 
00066 int X[4];
00067 
00068 
00069 Dirac* dirac;
00070 extern int Z[4];
00071 extern int V;
00072 extern int Vh;
00073 static int Vs_x, Vs_y, Vs_z, Vs_t;
00074 extern int Vsh_x, Vsh_y, Vsh_z, Vsh_t;
00075 static int Vsh[4];
00076 
00077 void
00078 setDimConstants(int *X)
00079 {
00080   V = 1;
00081   for (int d=0; d< 4; d++) {
00082     V *= X[d];
00083     Z[d] = X[d];
00084   }
00085   Vh = V/2;
00086 
00087   Vs_x = X[1]*X[2]*X[3];
00088   Vs_y = X[0]*X[2]*X[3];
00089   Vs_z = X[0]*X[1]*X[3];
00090   Vs_t = X[0]*X[1]*X[2];
00091 
00092 
00093   Vsh_x = Vs_x/2;
00094   Vsh_y = Vs_y/2;
00095   Vsh_z = Vs_z/2;
00096   Vsh_t = Vs_t/2;
00097 
00098   Vsh[0] = Vsh_x;
00099   Vsh[1] = Vsh_y;
00100   Vsh[2] = Vsh_z;
00101   Vsh[3] = Vsh_t;
00102 }
00103 
00104 void init()
00105 {    
00106 
00107   initQuda(device);
00108 
00109   gaugeParam = newQudaGaugeParam();
00110   inv_param = newQudaInvertParam();
00111   
00112   gaugeParam.X[0] = X[0] = xdim;
00113   gaugeParam.X[1] = X[1] = ydim;
00114   gaugeParam.X[2] = X[2] = zdim;
00115   gaugeParam.X[3] = X[3] = tdim;
00116 
00117   setDims(gaugeParam.X);
00118 
00119   setDimConstants(gaugeParam.X);
00120 
00121   gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
00122   gaugeParam.cuda_prec = prec;
00123   gaugeParam.reconstruct = link_recon;
00124   gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
00125   gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
00126     
00127   gaugeParam.anisotropy = 1.0;
00128   gaugeParam.tadpole_coeff = 0.8;
00129   gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
00130   gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
00131   gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
00132   gaugeParam.gaugeGiB = 0;
00133     
00134   inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
00135   inv_param.cuda_prec = prec;
00136   inv_param.dirac_order = QUDA_DIRAC_ORDER;
00137   inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
00138   inv_param.dagger = dagger;
00139   inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
00140   inv_param.dslash_type = QUDA_ASQTAD_DSLASH;
00141 
00142   int tmpint = MAX(X[1]*X[2]*X[3], X[0]*X[2]*X[3]);
00143   tmpint = MAX(tmpint, X[0]*X[1]*X[3]);
00144   tmpint = MAX(tmpint, X[0]*X[1]*X[2]);
00145   
00146   
00147   gaugeParam.ga_pad = tmpint;
00148   inv_param.sp_pad = tmpint;
00149 
00150   ColorSpinorParam csParam;
00151   csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
00152   csParam.nColor=3;
00153   csParam.nSpin=1;
00154   csParam.nDim=4;
00155   for(int d = 0; d < 4; d++) {
00156     csParam.x[d] = gaugeParam.X[d];
00157   }
00158   csParam.precision = inv_param.cpu_prec;
00159   csParam.pad = 0;
00160   if (test_type < 2) {
00161     inv_param.solution_type = QUDA_MATPC_SOLUTION;
00162     csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00163     csParam.x[0] /= 2;
00164   } else {
00165     inv_param.solution_type = QUDA_MAT_SOLUTION;
00166     csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 
00167   }
00168 
00169   csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
00170   csParam.fieldOrder  = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
00171   csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered
00172   csParam.create = QUDA_ZERO_FIELD_CREATE;    
00173 
00174   spinor = new cpuColorSpinorField(csParam);
00175   spinorOut = new cpuColorSpinorField(csParam);
00176   spinorRef = new cpuColorSpinorField(csParam);
00177 
00178   csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
00179   csParam.x[0] = gaugeParam.X[0];
00180     
00181   printfQuda("Randomizing fields ...\n");
00182     
00183   spinor->Source(QUDA_RANDOM_SOURCE);
00184 
00185   size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
00186     
00187   for (int dir = 0; dir < 4; dir++) {
00188     fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
00189     longlink[dir] = malloc(V*gaugeSiteSize*gSize);
00190   }
00191   if (fatlink == NULL || longlink == NULL){
00192     errorQuda("ERROR: malloc failed for fatlink/longlink");
00193   }
00194   construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam);
00195   
00196 #ifdef MULTI_GPU
00197   gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
00198   gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
00199   GaugeFieldParam cpuFatParam(fatlink, gaugeParam);
00200   cpuFat = new cpuGaugeField(cpuFatParam);
00201   cpuFat->exchangeGhost();
00202   ghost_fatlink = cpuFat->Ghost();
00203 
00204   gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
00205   GaugeFieldParam cpuLongParam(longlink, gaugeParam);
00206   cpuLong = new cpuGaugeField(cpuLongParam);
00207   cpuLong->exchangeGhost();
00208   ghost_longlink = cpuLong->Ghost();
00209 
00210   int x_face_size = X[1]*X[2]*X[3]/2;
00211   int y_face_size = X[0]*X[2]*X[3]/2;
00212   int z_face_size = X[0]*X[1]*X[3]/2;
00213   int t_face_size = X[0]*X[1]*X[2]/2;
00214   int pad_size =MAX(x_face_size, y_face_size);
00215   pad_size = MAX(pad_size, z_face_size);
00216   pad_size = MAX(pad_size, t_face_size);
00217   gaugeParam.ga_pad = pad_size;    
00218 #endif
00219 
00220   gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
00221   gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
00222 
00223   printfQuda("Fat links sending..."); 
00224   loadGaugeQuda(fatlink, &gaugeParam);
00225   printfQuda("Fat links sent\n"); 
00226   
00227   gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;  
00228 
00229 #ifdef MULTI_GPU
00230   gaugeParam.ga_pad = 3*pad_size;
00231 #endif
00232 
00233   gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = link_recon;
00234   printfQuda("Long links sending..."); 
00235   loadGaugeQuda(longlink, &gaugeParam);
00236   printfQuda("Long links sent...\n"); 
00237 
00238     printfQuda("Sending fields to GPU..."); 
00239     
00240   if (!transfer) {
00241 
00242     //csParam.verbose = QUDA_DEBUG_VERBOSE;
00243         
00244     csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION;
00245     csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
00246     csParam.pad = inv_param.sp_pad;
00247     csParam.precision = inv_param.cuda_prec;
00248     if (test_type < 2){
00249       csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00250       csParam.x[0] /=2;
00251     }
00252 
00253     printfQuda("Creating cudaSpinor\n");
00254     cudaSpinor = new cudaColorSpinorField(csParam);
00255 
00256     printfQuda("Creating cudaSpinorOut\n");
00257     cudaSpinorOut = new cudaColorSpinorField(csParam);
00258         
00259     printfQuda("Sending spinor field to GPU\n");
00260     *cudaSpinor = *spinor;
00261         
00262     cudaThreadSynchronize();
00263     checkCudaError();
00264         
00265     double spinor_norm2 = norm2(*spinor);
00266     double cuda_spinor_norm2=  norm2(*cudaSpinor);
00267     printfQuda("Source CPU = %f, CUDA=%f\n", spinor_norm2, cuda_spinor_norm2);
00268         
00269     if(test_type == 2){
00270       csParam.x[0] /=2;
00271     }
00272     csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00273     tmp = new cudaColorSpinorField(csParam);
00274 
00275     bool pc = (test_type != 2);
00276     DiracParam diracParam;
00277     setDiracParam(diracParam, &inv_param, pc);
00278 
00279     diracParam.verbose = QUDA_VERBOSE;
00280     diracParam.tmp1=tmp;
00281 
00282     dirac = Dirac::create(diracParam);
00283         
00284   } else {
00285     errorQuda("Error not suppported");
00286   }
00287     
00288   return;
00289 }
00290 
00291 void end(void) 
00292 {
00293   for (int dir = 0; dir < 4; dir++) {
00294     free(fatlink[dir]);
00295     free(longlink[dir]);
00296   }
00297 
00298   if (!transfer){
00299     delete dirac;
00300     delete cudaSpinor;
00301     delete cudaSpinorOut;
00302     delete tmp;
00303   }
00304     
00305   delete spinor;
00306   delete spinorOut;
00307   delete spinorRef;
00308 
00309   if (cpuFat) delete cpuFat;
00310   if (cpuLong) delete cpuLong;
00311     
00312   endQuda();
00313 }
00314 
00315 double dslashCUDA(int niter) {
00316     
00317   cudaEvent_t start, end;
00318   cudaEventCreate(&start);
00319   cudaEventRecord(start, 0);
00320   cudaEventSynchronize(start);
00321 
00322   for (int i = 0; i < niter; i++) {
00323     switch (test_type) {
00324     case 0:
00325       parity = QUDA_EVEN_PARITY;
00326       if (transfer){
00327         //dslashQuda(spinorOdd, spinorEven, &inv_param, parity);
00328       } else {
00329         dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
00330       }    
00331       break;
00332     case 1:
00333       parity = QUDA_ODD_PARITY;
00334       if (transfer){
00335         //MatPCQuda(spinorOdd, spinorEven, &inv_param);
00336       } else {
00337         dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
00338       }
00339       break;
00340     case 2:
00341       if (transfer){
00342         //MatQuda(spinorGPU, spinor, &inv_param);
00343       } else {
00344         dirac->M(*cudaSpinorOut, *cudaSpinor);
00345       }
00346     }
00347   }
00348     
00349   cudaEventCreate(&end);
00350   cudaEventRecord(end, 0);
00351   cudaEventSynchronize(end);
00352   float runTime;
00353   cudaEventElapsedTime(&runTime, start, end);
00354   cudaEventDestroy(start);
00355   cudaEventDestroy(end);
00356 
00357   double secs = runTime / 1000; //stopwatchReadSeconds();
00358 
00359   // check for errors
00360   cudaError_t stat = cudaGetLastError();
00361   if (stat != cudaSuccess)
00362     errorQuda("with ERROR: %s\n", cudaGetErrorString(stat));
00363     
00364   return secs;
00365 }
00366 
00367 void staggeredDslashRef()
00368 {
00369 #ifndef MULTI_GPU
00370   int cpu_parity = 0;
00371 #endif
00372 
00373   // compare to dslash reference implementation
00374   printfQuda("Calculating reference implementation...");
00375   fflush(stdout);
00376   switch (test_type) {
00377   case 0:    
00378 #ifdef MULTI_GPU
00379 
00380     staggered_dslash_mg4dir(spinorRef, fatlink, longlink, (void**)ghost_fatlink, (void**)ghost_longlink, 
00381                             spinor, parity, dagger, inv_param.cpu_prec, gaugeParam.cpu_prec);
00382 #else
00383     cpu_parity = 0; //EVEN
00384     staggered_dslash(spinorRef->V(), fatlink, longlink, spinor->V(), cpu_parity, dagger, 
00385                      inv_param.cpu_prec, gaugeParam.cpu_prec);
00386     
00387 #endif    
00388 
00389 
00390     break;
00391   case 1: 
00392 #ifdef MULTI_GPU
00393     staggered_dslash_mg4dir(spinorRef, fatlink, longlink, (void**)ghost_fatlink, (void**)ghost_longlink, 
00394                             spinor, parity, dagger, inv_param.cpu_prec, gaugeParam.cpu_prec);    
00395     
00396 #else
00397     cpu_parity=1; //ODD
00398     staggered_dslash(spinorRef->V(), fatlink, longlink, spinor->V(), cpu_parity, dagger, 
00399                      inv_param.cpu_prec, gaugeParam.cpu_prec);
00400 #endif
00401     break;
00402   case 2:
00403     //mat(spinorRef->V(), fatlink, longlink, spinor->V(), kappa, dagger, 
00404     //inv_param.cpu_prec, gaugeParam.cpu_prec);
00405     break;
00406   default:
00407     errorQuda("Test type not defined");
00408   }
00409     
00410   printfQuda("done.\n");
00411     
00412 }
00413 
00414 static int dslashTest() 
00415 {
00416   int accuracy_level = 0;
00417   
00418   init();
00419     
00420   int attempts = 1;
00421     
00422   for (int i=0; i<attempts; i++) {
00423 
00424     if (tune) { // warm-up run
00425       printfQuda("Tuning...\n");
00426       setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE);      
00427       dslashCUDA(1);
00428     }
00429     printfQuda("Executing %d kernel loops...", loops);  
00430     double secs = dslashCUDA(loops);
00431 
00432 #ifdef DSLASH_PROFILING
00433     printDslashProfile();
00434 #endif
00435     
00436     if (!transfer) *spinorOut = *cudaSpinorOut;
00437       
00438     printfQuda("\n%fms per loop\n", 1000*secs);
00439     staggeredDslashRef();
00440         
00441     unsigned long long flops = dirac->Flops();
00442     int link_floats = 8*gaugeParam.reconstruct+8*18;
00443     int spinor_floats = 8*6*2 + 6;
00444     int link_float_size = prec;
00445     int spinor_float_size = 0;
00446     
00447     link_floats = test_type ? (2*link_floats) : link_floats;
00448     spinor_floats = test_type ? (2*spinor_floats) : spinor_floats;
00449 
00450     int bytes_for_one_site = link_floats * link_float_size + spinor_floats * spinor_float_size;
00451     if (prec == QUDA_HALF_PRECISION) bytes_for_one_site += (8*2 + 1)*4; 
00452 
00453     printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs);
00454     printfQuda("GB/s = %f\n\n", 1.0*Vh*bytes_for_one_site/((secs/loops)*1e+9));
00455         
00456     if (!transfer) {
00457       double spinor_ref_norm2 = norm2(*spinorRef);
00458       double cuda_spinor_out_norm2 =  norm2(*cudaSpinorOut);
00459       double spinor_out_norm2 =  norm2(*spinorOut);
00460       printfQuda("Results: CPU=%f, CUDA=%f, CPU-CUDA=%f\n",  spinor_ref_norm2, cuda_spinor_out_norm2,
00461                  spinor_out_norm2);
00462     } else {
00463       double spinor_ref_norm2 = norm2(*spinorRef);
00464       double spinor_out_norm2 =  norm2(*spinorOut);
00465       printfQuda("Result: CPU=%f , CPU-CUDA=%f", spinor_ref_norm2, spinor_out_norm2);
00466     }
00467     
00468     accuracy_level = cpuColorSpinorField::Compare(*spinorRef, *spinorOut);      
00469   }
00470   end();
00471   
00472   return accuracy_level;
00473 }
00474 
00475 
00476 void display_test_info()
00477 {
00478   printfQuda("running the following test:\n");
00479  
00480   printfQuda("prec recon   test_type     dagger   S_dim         T_dimension\n");
00481   printfQuda("%s   %s       %d           %d       %d/%d/%d        %d \n", 
00482              get_prec_str(prec), get_recon_str(link_recon), 
00483              test_type, dagger, xdim, ydim, zdim, tdim);
00484   printfQuda("Grid partition info:     X  Y  Z  T\n"); 
00485   printfQuda("                         %d  %d  %d  %d\n", 
00486              commDimPartitioned(0),
00487              commDimPartitioned(1),
00488              commDimPartitioned(2),
00489              commDimPartitioned(3));
00490 
00491   return ;
00492     
00493 }
00494 
00495 
00496 void
00497 usage_extra(char** argv )
00498 {
00499   printfQuda("Extra options:\n");
00500   printfQuda("    --test <0/1>                             # Test method\n");
00501   printfQuda("                                                0: Even destination spinor\n");
00502   printfQuda("                                                1: Odd destination spinor\n");
00503   return ;
00504 }
00505 
00506 int main(int argc, char **argv) 
00507 {
00508 
00509   int i;
00510   for (i =1;i < argc; i++){
00511     
00512     if(process_command_line_option(argc, argv, &i) == 0){
00513       continue;
00514     }    
00515     
00516     if( strcmp(argv[i], "--test") == 0){
00517       if (i+1 >= argc){
00518         usage(argv);
00519       }     
00520       test_type =  atoi(argv[i+1]);
00521       if (test_type < 0 || test_type > 2){
00522         errorQuda("Error: invalid test type");
00523       }
00524       i++;
00525       continue;     
00526     }
00527     
00528     fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
00529     usage(argv);
00530   }
00531   
00532   initCommsQuda(argc, argv, gridsize_from_cmdline, 4);
00533   
00534   display_test_info();
00535 
00536   int ret =1;
00537   int accuracy_level = dslashTest();
00538 
00539   printfQuda("accuracy_level =%d\n", accuracy_level);
00540 
00541   if (accuracy_level >= 1) ret = 0;    //probably no error, -1 means no matching  
00542   endCommsQuda();
00543   return ret;
00544 }
00545