QUDA v0.4.0
A library for QCD on GPUs
quda/tests/hisq_paths_force_test.cpp
Go to the documentation of this file.
00001 #include <cstdio>
00002 #include <cstdlib>
00003 #include <cstring>
00004 
00005 #include <quda.h>
00006 #include "test_util.h"
00007 #include "gauge_field.h"
00008 #include "fat_force_quda.h"
00009 #include "misc.h"
00010 #include "hisq_force_reference.h"
00011 #include "hisq_force_quda.h"
00012 #include "hisq_force_utils.h"
00013 #include "hw_quda.h"
00014 #include <sys/time.h>
00015 
00016 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
00017 
00018 
00019 
00020 #include "fermion_force_reference.h"
00021 using namespace hisq::fermion_force;
00022 
00023 extern void usage(char** argv);
00024 static int device = 0;
00025 cudaGaugeField *cudaGauge = NULL;
00026 cpuGaugeField  *cpuGauge  = NULL;
00027 
00028 cudaGaugeField *cudaForce = NULL;
00029 cpuGaugeField  *cpuForce = NULL;
00030 
00031 cudaGaugeField *cudaMom = NULL;
00032 cpuGaugeField *cpuMom  = NULL;
00033 cpuGaugeField *refMom  = NULL;
00034 
00035 static FullHw cudaHw;
00036 static QudaGaugeParam gaugeParam;
00037 static void* hw; // the array of half_wilson_vector
00038 
00039 
00040 cpuGaugeField *cpuOprod = NULL;
00041 cudaGaugeField *cudaOprod = NULL;
00042 cpuGaugeField *cpuLongLinkOprod = NULL;
00043 cudaGaugeField *cudaLongLinkOprod = NULL;
00044 
00045 int verify_results = 0;
00046 int ODD_BIT = 1;
00047 extern int xdim, ydim, zdim, tdim;
00048 
00049 extern QudaPrecision prec;
00050 extern QudaReconstructType link_recon;
00051 QudaPrecision link_prec = QUDA_DOUBLE_PRECISION;
00052 QudaPrecision hw_prec = QUDA_DOUBLE_PRECISION;
00053 QudaPrecision cpu_hw_prec = QUDA_DOUBLE_PRECISION;
00054 QudaPrecision mom_prec = QUDA_DOUBLE_PRECISION;
00055 
00056 
00057 
00058 static void setPrecision(QudaPrecision precision)
00059 {
00060   link_prec = precision;
00061   hw_prec = precision;
00062   cpu_hw_prec = precision;
00063   mom_prec = precision;
00064   return;
00065 }
00066 
00067 int Z[4];
00068 int V;
00069 int Vh;
00070 
00071 
00072 void
00073 setDims(int *X){
00074   V = 1;
00075   for(int dir=0; dir<4; ++dir){
00076     V *= X[dir];
00077     Z[dir] = X[dir];
00078   }
00079   Vh = V/2;
00080   return;
00081 }
00082 
00083 
00084 void
00085 total_staple_io_flops(QudaPrecision prec, QudaReconstructType recon, double* io, double* flops)
00086 {
00087   //total IO counting for the middle/side/all link kernels
00088   //Explanation about these numbers can be founed in the corresnponding kernel functions in
00089   //the hisq kernel core file
00090   int linksize = prec*recon;
00091   int cmsize = prec*18;
00092   
00093   int matrix_mul_flops = 198;
00094   int matrix_add_flops = 18;
00095 
00096   int num_calls_middle_link[6] = {24, 24, 96, 96, 24, 24};
00097   int middle_link_data_io[6][2] = {
00098     {3,6},
00099     {3,4},
00100     {3,7},
00101     {3,5},
00102     {3,5},
00103     {3,2}
00104   };
00105   int middle_link_data_flops[6][2] = {
00106     {3,1},
00107     {2,0},
00108     {4,1},
00109     {3,0},
00110     {4,1},
00111     {2,0}
00112   };
00113 
00114 
00115   int num_calls_side_link[2]= {192, 48};
00116   int side_link_data_io[2][2] = {
00117     {1, 6},
00118     {0, 3}
00119   };
00120   int side_link_data_flops[2][2] = {
00121     {2, 2},
00122     {0, 1}
00123   };
00124 
00125 
00126 
00127   int num_calls_all_link[2] ={192, 192};
00128   int all_link_data_io[2][2] = {
00129     {3, 8},
00130     {3, 6}
00131   };
00132   int all_link_data_flops[2][2] = {
00133     {6, 3},
00134     {4, 2}
00135   };
00136 
00137   
00138   double total_io = 0;
00139   for(int i = 0;i < 6; i++){
00140     total_io += num_calls_middle_link[i]
00141       *(middle_link_data_io[i][0]*linksize + middle_link_data_io[i][1]*cmsize);
00142   }
00143   
00144   for(int i = 0;i < 2; i++){
00145     total_io += num_calls_side_link[i]
00146       *(side_link_data_io[i][0]*linksize + side_link_data_io[i][1]*cmsize);
00147   }
00148   for(int i = 0;i < 2; i++){
00149     total_io += num_calls_all_link[i]
00150       *(all_link_data_io[i][0]*linksize + all_link_data_io[i][1]*cmsize);
00151   }     
00152   total_io *= V;
00153 
00154 
00155   double total_flops = 0;
00156   for(int i = 0;i < 6; i++){
00157     total_flops += num_calls_middle_link[i]
00158       *(middle_link_data_flops[i][0]*matrix_mul_flops + middle_link_data_flops[i][1]*matrix_add_flops);
00159   }
00160   
00161   for(int i = 0;i < 2; i++){
00162     total_flops += num_calls_side_link[i]
00163       *(side_link_data_flops[i][0]*matrix_mul_flops + side_link_data_flops[i][1]*matrix_add_flops);
00164   }
00165   for(int i = 0;i < 2; i++){
00166     total_flops += num_calls_all_link[i]
00167       *(all_link_data_flops[i][0]*matrix_mul_flops + all_link_data_flops[i][1]*matrix_add_flops);
00168   }     
00169   total_flops *= V;
00170 
00171   *io=total_io;
00172   *flops = total_flops;
00173 
00174   printfQuda("flop/byte =%.1f\n", total_flops/total_io);
00175   return ;  
00176 }
00177 
00178 
00179 void initDslashConstants(const cudaGaugeField &gauge, const int sp_stride);
00180 
00181 
00182 // allocate memory
00183 // set the layout, etc.
00184 static void
00185 hisq_force_init()
00186 {
00187   initQuda(device);
00188 
00189   gaugeParam.X[0] = xdim;
00190   gaugeParam.X[1] = ydim;
00191   gaugeParam.X[2] = zdim;
00192   gaugeParam.X[3] = tdim;
00193 
00194   setDims(gaugeParam.X);
00195 
00196 
00197   gaugeParam.cpu_prec = link_prec;
00198   gaugeParam.cuda_prec = link_prec;
00199   gaugeParam.reconstruct = link_recon;
00200 
00201   gaugeParam.gauge_order = QUDA_MILC_GAUGE_ORDER;
00202 
00203   GaugeFieldParam gParam(0, gaugeParam);
00204   gParam.create = QUDA_NULL_FIELD_CREATE;
00205 
00206   cpuGauge = new cpuGaugeField(gParam);
00207 
00208   // this is a hack to get the gauge field to appear as a void** rather than void*
00209   void* siteLink_2d[4];
00210   for(int i=0;i < 4;i++){
00211     siteLink_2d[i] = malloc(V*gaugeSiteSize* gaugeParam.cpu_prec);
00212     if(siteLink_2d[i] == NULL){
00213       errorQuda("malloc failed for siteLink_2d\n");
00214     }
00215   }
00216   
00217   // fills the gauge field with random numbers
00218   createSiteLinkCPU(siteLink_2d, gaugeParam.cpu_prec, 1);
00219   
00220   for(int dir = 0; dir < 4; dir++){
00221     for(int i = 0;i < V; i++){
00222       char* src = (char*)siteLink_2d[dir];
00223       char* dst = (char*)cpuGauge->Gauge_p();
00224       memcpy(dst + (4*i+dir)*gaugeSiteSize*link_prec, src + i*gaugeSiteSize*link_prec, gaugeSiteSize*link_prec);   
00225     }
00226   }
00227 
00228   gParam.precision = gaugeParam.cuda_prec;
00229   gParam.reconstruct = link_recon;
00230   cudaGauge = new cudaGaugeField(gParam);
00231 
00232   // create the force matrix
00233   // cannot reconstruct, since the force matrix is not in SU(3)
00234   gParam.precision = gaugeParam.cpu_prec;
00235   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
00236   cpuForce = new cpuGaugeField(gParam); 
00237   memset(cpuForce->Gauge_p(), 0, 4*cpuForce->Volume()*gaugeSiteSize*gaugeParam.cpu_prec);
00238 
00239   gParam.precision = gaugeParam.cuda_prec;
00240   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
00241   cudaForce = new cudaGaugeField(gParam); 
00242   cudaMemset((void**)(cudaForce->Gauge_p()), 0, cudaForce->Bytes());
00243 
00244   // create the momentum matrix
00245   gParam.reconstruct = QUDA_RECONSTRUCT_10;
00246   gParam.precision = gaugeParam.cpu_prec;
00247   cpuMom = new cpuGaugeField(gParam);
00248   refMom = new cpuGaugeField(gParam);
00249 
00250   createMomCPU(cpuMom->Gauge_p(), mom_prec);
00251 
00252 
00253   memset(cpuMom->Gauge_p(), 0, cpuMom->Bytes());
00254   memset(refMom->Gauge_p(), 0, refMom->Bytes());
00255 
00256   gParam.precision = gaugeParam.cuda_prec;
00257   cudaMom = new cudaGaugeField(gParam); // Are the elements initialised to zero? - No!
00258 
00259   hw = malloc(4*cpuGauge->Volume()*hwSiteSize*gaugeParam.cpu_prec);
00260   if (hw == NULL){
00261     fprintf(stderr, "ERROR: malloc failed for hw\n");
00262     exit(1);
00263   }
00264 
00265   createHwCPU(hw, hw_prec);
00266   cudaHw = createHwQuda(gaugeParam.X, hw_prec);
00267 
00268 
00269 
00270   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
00271   gParam.precision = gaugeParam.cpu_prec;
00272   cpuOprod = new cpuGaugeField(gParam);
00273   computeLinkOrderedOuterProduct(hw, cpuOprod->Gauge_p(), hw_prec, 1);
00274 
00275   cpuLongLinkOprod = new cpuGaugeField(gParam);
00276   computeLinkOrderedOuterProduct(hw, cpuLongLinkOprod->Gauge_p(), hw_prec, 3);
00277 
00278 
00279   gParam.precision = hw_prec;
00280   cudaOprod = new cudaGaugeField(gParam);
00281   cudaLongLinkOprod = new cudaGaugeField(gParam);
00282 
00283   for(int i = 0;i < 4; i++){
00284     free(siteLink_2d[i]);
00285   }
00286   return;
00287 }
00288 
00289 
00290 static void 
00291 hisq_force_end()
00292 {
00293   delete cudaMom;
00294   delete cudaForce;
00295   delete cudaGauge;
00296   delete cudaOprod;
00297   delete cudaLongLinkOprod;
00298   freeHwQuda(cudaHw);
00299 
00300   delete cpuGauge;
00301   delete cpuForce;
00302   delete cpuMom;
00303   delete refMom;
00304   delete cpuOprod;  
00305   delete cpuLongLinkOprod;
00306   free(hw);
00307 
00308   endQuda();
00309 
00310   return;
00311 }
00312 
00313 static int 
00314 hisq_force_test(void)
00315 {
00316   hisq_force_init();
00317 
00318   initDslashConstants(*cudaGauge, cudaGauge->VolumeCB());
00319   hisqForceInitCuda(&gaugeParam);
00320 
00321 
00322    
00323   float weight = 1.0;
00324   float act_path_coeff[6];
00325 
00326   act_path_coeff[0] = 0.625000;
00327   act_path_coeff[1] = -0.058479;
00328   act_path_coeff[2] = -0.087719;
00329   act_path_coeff[3] = 0.030778;
00330   act_path_coeff[4] = -0.007200;
00331   act_path_coeff[5] = -0.123113;
00332 
00333 
00334   double d_weight = 1.0;
00335   double d_act_path_coeff[6];
00336   for(int i=0; i<6; ++i){
00337     d_act_path_coeff[i] = act_path_coeff[i];
00338   }
00339 
00340 
00341 
00342 
00343 
00344   // copy the momentum field to the GPU
00345   cudaMom->loadCPUField(*refMom, QUDA_CPU_FIELD_LOCATION);
00346   // copy the gauge field to the GPU
00347   cudaGauge->loadCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
00348   // copy the outer product field to the GPU
00349   cudaOprod->loadCPUField(*cpuOprod, QUDA_CPU_FIELD_LOCATION);
00350   // load the three-link outer product to the GPU
00351   cudaLongLinkOprod->loadCPUField(*cpuLongLinkOprod, QUDA_CPU_FIELD_LOCATION);
00352 
00353   loadHwToGPU(cudaHw, hw, cpu_hw_prec);
00354 
00355   struct timeval ht0, ht1;
00356   gettimeofday(&ht0, NULL);
00357   if (verify_results){
00358     if(cpu_hw_prec == QUDA_SINGLE_PRECISION){
00359       const float eps = 0.5;
00360       fermion_force_reference(eps, weight, 0, act_path_coeff, hw, cpuGauge->Gauge_p(), refMom->Gauge_p());
00361     }else if(cpu_hw_prec == QUDA_DOUBLE_PRECISION){
00362       const double eps = 0.5;
00363       fermion_force_reference(eps, d_weight, 0, d_act_path_coeff, hw, cpuGauge->Gauge_p(), refMom->Gauge_p());
00364     }
00365   }
00366   gettimeofday(&ht1, NULL);
00367 
00368   struct timeval t0, t1, t2, t3, t4, t5;
00369 
00370   gettimeofday(&t0, NULL);
00371   hisqStaplesForceCuda(d_act_path_coeff, gaugeParam, *cudaOprod, *cudaGauge, cudaForce);
00372   cudaThreadSynchronize();
00373   gettimeofday(&t1, NULL);
00374   checkCudaError();
00375  
00376   gettimeofday(&t2, NULL);
00377   hisqLongLinkForceCuda(d_act_path_coeff[1], gaugeParam, *cudaLongLinkOprod, *cudaGauge, cudaForce);
00378   cudaThreadSynchronize();
00379   gettimeofday(&t3, NULL);
00380   checkCudaError();
00381   gettimeofday(&t4, NULL);
00382   hisqCompleteForceCuda(gaugeParam, *cudaForce, *cudaGauge, cudaMom);
00383   cudaThreadSynchronize();
00384   checkCudaError();
00385   gettimeofday(&t5, NULL);
00386 
00387 
00388 
00389   cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
00390 
00391   int res;
00392   res = compare_floats(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume()*momSiteSize, 1e-5, gaugeParam.cpu_prec);
00393 
00394   int accuracy_level = strong_check_mom(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume(), gaugeParam.cpu_prec);
00395   printf("Test %s\n",(1 == res) ? "PASSED" : "FAILED");
00396 
00397   double total_io;
00398   double total_flops;
00399   total_staple_io_flops(link_prec, link_recon, &total_io, &total_flops);
00400   
00401   float perf_flops = total_flops / (TDIFF(t0, t1)) *1e-9;
00402   float perf = total_io / (TDIFF(t0, t1)) *1e-9;
00403   printf("Staples time: %.2f ms, perf =%.2f GFLOPS, achieved bandwidth= %.2f GB/s\n", TDIFF(t0,t1)*1000, perf_flops, perf);
00404   printf("Staples time : %g ms\t LongLink time : %g ms\t Completion time : %g ms\n", TDIFF(t0,t1)*1000, TDIFF(t2,t3)*1000, TDIFF(t4,t5)*1000);
00405   printf("Host time (half-wilson fermion force) : %g ms\n", TDIFF(ht0, ht1)*1000);
00406 
00407   hisq_force_end();
00408 
00409   return accuracy_level;
00410 }
00411 
00412 
00413 static void
00414 display_test_info()
00415 {
00416   printf("running the following fermion force computation test:\n");
00417     
00418   printf("link_precision           link_reconstruct           space_dim(x/y/z)         T_dimension\n");
00419   printf("%s                       %s                         %d/%d/%d                  %d \n", 
00420          get_prec_str(link_prec),
00421          get_recon_str(link_recon), 
00422          xdim, ydim, zdim, tdim);
00423   return ;
00424     
00425 }
00426 
00427 void
00428 usage_extra(char** argv )
00429 {
00430   printf("Extra options: \n");
00431   printf("    --verify                                  # Verify the GPU results using CPU results\n");
00432   return ;
00433 }
00434 int 
00435 main(int argc, char **argv) 
00436 {
00437   int i;
00438   for (i =1;i < argc; i++){
00439         
00440     if(process_command_line_option(argc, argv, &i) == 0){
00441       continue;
00442     }    
00443 
00444     if( strcmp(argv[i], "--verify") == 0){
00445       verify_results=1;
00446       continue;     
00447     }   
00448     fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
00449     usage(argv);
00450   }
00451 
00452 #ifdef MULTI_GPU
00453     initCommsQuda(argc, argv, gridsize_from_cmdline, 4);
00454 #endif
00455 
00456   setPrecision(prec);
00457 
00458   display_test_info();
00459     
00460   int accuracy_level = hisq_force_test();
00461 
00462 
00463 #ifdef MULTI_GPU
00464   endCommsQuda();
00465 #endif
00466 
00467   if(accuracy_level >=3 ){
00468     return EXIT_SUCCESS;
00469   }else{
00470     return -1;
00471   }
00472   
00473 }
00474 
00475 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines