QUDA: quda/tests/llfat_test.cpp Source File

QUDA v0.4.0
A library for QCD on GPUs
00001 #include <stdio.h>
00002 #include <stdlib.h>
00003 #include <string.h>
00004 #include <sys/time.h>
00005 #include <cuda.h>
00006 #include <cuda_runtime.h>
00007 
00008 #include "quda.h"
00009 #include "test_util.h"
00010 #include "llfat_reference.h"
00011 #include "misc.h"
00012 #include "util_quda.h"
00013 
00014 #ifdef MULTI_GPU
00015 #include "face_quda.h"
00016 #include "comm_quda.h"
00017 #endif
00018 
00019 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
00020 
00021 extern void usage(char** argv);
00022 static int verify_results = 0;
00023 
00024 extern int device;
00025 int Z[4];
00026 int V;
00027 int Vh;
00028 int Vs[4];
00029 int Vsh[4];
00030 static int Vs_x, Vs_y, Vs_z, Vs_t;
00031 static int Vsh_x, Vsh_y, Vsh_z, Vsh_t;
00032 
00033 static int V_ex;
00034 static int Vh_ex;
00035 
00036 static int X1, X1h, X2, X3, X4;
00037 static int E1, E1h, E2, E3, E4;
00038 
00039 
00040 extern int xdim, ydim, zdim, tdim;
00041 extern int gridsize_from_cmdline[];
00042 
00043 extern QudaReconstructType link_recon;
00044 extern QudaPrecision prec;
00045 static QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION;
00046 static QudaGaugeFieldOrder gauge_order = QUDA_QDP_GAUGE_ORDER;
00047 
00048 static size_t gSize;
00049 
00050 void
00051 setDims(int *X) {
00052   V = 1;
00053   for (int d=0; d< 4; d++) {
00054     V *= X[d];
00055     Z[d] = X[d];
00056   }
00057   Vh = V/2;
00058 
00059   Vs[0] = Vs_x = X[1]*X[2]*X[3];
00060   Vs[1] = Vs_y = X[0]*X[2]*X[3];
00061   Vs[2] = Vs_z = X[0]*X[1]*X[3];
00062   Vs[3] = Vs_t = X[0]*X[1]*X[2];
00063   Vsh[0] = Vsh_x = Vs_x/2;
00064   Vsh[1] = Vsh_y = Vs_y/2;
00065   Vsh[2] = Vsh_z = Vs_z/2;
00066   Vsh[3] = Vsh_t = Vs_t/2;
00067 
00068 
00069   V_ex = 1;
00070   for (int d=0; d< 4; d++) {
00071     V_ex *= X[d]+4;
00072   }
00073   Vh_ex = V_ex/2;
00074 
00075   X1=X[0]; X2 = X[1]; X3=X[2]; X4=X[3];
00076   X1h=X1/2;
00077   E1=X1+4; E2=X2+4; E3=X3+4; E4=X4+4;
00078   E1h=E1/2;
00079   
00080   
00081 }
00082 
00083 static int
00084 llfat_test(int test)
00085 {
00086 
00087   QudaGaugeParam qudaGaugeParam;
00088 #ifdef MULTI_GPU
00089   void* ghost_sitelink[4];
00090   void* ghost_sitelink_diag[16];
00091 #endif
00092   
00093 
00094   initQuda(device);
00095 
00096   cpu_prec = prec;
00097   gSize = cpu_prec;  
00098   qudaGaugeParam = newQudaGaugeParam();
00099   
00100   qudaGaugeParam.anisotropy = 1.0;
00101   
00102   qudaGaugeParam.X[0] = xdim;
00103   qudaGaugeParam.X[1] = ydim;
00104   qudaGaugeParam.X[2] = zdim;
00105   qudaGaugeParam.X[3] = tdim;
00106 
00107   setDims(qudaGaugeParam.X);
00108    
00109   qudaGaugeParam.cpu_prec = cpu_prec;
00110   qudaGaugeParam.cuda_prec = prec;
00111   qudaGaugeParam.gauge_order = gauge_order;
00112   qudaGaugeParam.type=QUDA_WILSON_LINKS;
00113   qudaGaugeParam.reconstruct = link_recon;
00114   /*
00115   qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE
00116     | QUDA_FAT_PRESERVE_GPU_GAUGE
00117     | QUDA_FAT_PRESERVE_COMM_MEM;
00118   */
00119   qudaGaugeParam.preserve_gauge =0;
00120   void* fatlink;
00121   cudaMallocHost((void**)&fatlink, 4*V*gaugeSiteSize*gSize);
00122   if(fatlink == NULL){
00123     errorQuda("ERROR: allocating fatlink failed\n");
00124   }
00125   
00126   void* sitelink[4];
00127   for(int i=0;i < 4;i++){
00128     cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize);
00129     if(sitelink[i] == NULL){
00130       errorQuda("ERROR; allocate sitelink[%d] failed\n", i);
00131     }
00132   }
00133   
00134   void* sitelink_ex[4];
00135   for(int i=0;i < 4;i++){
00136     cudaMallocHost((void**)&sitelink_ex[i], V_ex*gaugeSiteSize*gSize);
00137     if(sitelink_ex[i] == NULL){
00138       errorQuda("ERROR; allocate sitelink_ex[%d] failed\n", i);
00139     }
00140   }
00141 
00142 
00143   void* milc_sitelink;
00144   milc_sitelink = (void*)malloc(4*V*gaugeSiteSize*gSize);
00145   if(milc_sitelink == NULL){
00146     errorQuda("ERROR: allocating milc_sitelink failed\n");
00147   }
00148 
00149   void* milc_sitelink_ex;
00150   milc_sitelink_ex = (void*)malloc(4*V_ex*gaugeSiteSize*gSize);
00151   if(milc_sitelink_ex == NULL){
00152     errorQuda("Error: allocating milc_sitelink failed\n");
00153   }
00154 
00155 
00156 
00157   createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1);
00158 
00159   if(gauge_order == QUDA_MILC_GAUGE_ORDER){
00160           for(int i=0; i<V; ++i){
00161                   for(int dir=0; dir<4; ++dir){
00162                           char* src = (char*)sitelink[dir];
00163                           memcpy((char*)milc_sitelink + (i*4 + dir)*gaugeSiteSize*gSize, src+i*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
00164                   }     
00165           }
00166   }
00167 
00168 
00169   for(int i=0; i < V_ex; i++){
00170     int sid = i;
00171     int oddBit=0;
00172     if(i >= Vh_ex){
00173       sid = i - Vh_ex;
00174       oddBit = 1;
00175     }
00176     
00177     int za = sid/E1h;
00178     int x1h = sid - za*E1h;
00179     int zb = za/E2;
00180     int x2 = za - zb*E2;
00181     int x4 = zb/E3;
00182     int x3 = zb - x4*E3;
00183     int x1odd = (x2 + x3 + x4 + oddBit) & 1;
00184     int x1 = 2*x1h + x1odd;
00185     
00186     
00187     if( x1< 2 || x1 >= X1 +2
00188         || x2< 2 || x2 >= X2 +2
00189         || x3< 2 || x3 >= X3 +2
00190         || x4< 2 || x4 >= X4 +2){
00191 #ifdef MULTI_GPU
00192       continue;
00193 #endif
00194     }
00195     
00196     
00197     
00198     x1 = (x1 - 2 + X1) % X1;
00199     x2 = (x2 - 2 + X2) % X2;
00200     x3 = (x3 - 2 + X3) % X3;
00201     x4 = (x4 - 2 + X4) % X4;
00202     
00203     int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
00204     if(oddBit){
00205       idx += Vh;
00206     }
00207     for(int dir= 0; dir < 4; dir++){
00208       char* src = (char*)sitelink[dir];
00209       char* dst = (char*)sitelink_ex[dir];
00210       memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
00211 
00212       // milc ordering 
00213       memcpy((char*)milc_sitelink_ex + (i*4 + dir)*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
00214     }//dir
00215   }//i
00216   
00217 
00218   double act_path_coeff[6];
00219   for(int i=0;i < 6;i++){
00220     act_path_coeff[i]= 0.1*i;
00221   }
00222 
00223   
00224   //only record the last call's performance
00225   //the first one is for creating the cpu/cuda data structures
00226   struct timeval t0, t1;
00227   
00228   for(int i=0;i < 2;i++){
00229     gettimeofday(&t0, NULL);
00230     if(gauge_order == QUDA_QDP_GAUGE_ORDER){
00231       if(test == 0){
00232         computeFatLinkQuda(fatlink, sitelink, act_path_coeff, &qudaGaugeParam,
00233                            QUDA_COMPUTE_FAT_STANDARD);
00234       }else{
00235         computeFatLinkQuda(fatlink, sitelink_ex, act_path_coeff, &qudaGaugeParam,
00236                            QUDA_COMPUTE_FAT_EXTENDED_VOLUME);
00237       }
00238     }else if(gauge_order == QUDA_MILC_GAUGE_ORDER){
00239       if(test == 0){
00240         computeFatLinkQuda(fatlink, (void**)milc_sitelink, act_path_coeff, &qudaGaugeParam,
00241                            QUDA_COMPUTE_FAT_STANDARD);
00242       }else{
00243         computeFatLinkQuda(fatlink, (void**)milc_sitelink_ex, act_path_coeff, &qudaGaugeParam,
00244                            QUDA_COMPUTE_FAT_EXTENDED_VOLUME);
00245       }
00246     }
00247     gettimeofday(&t1, NULL);
00248   }
00249   
00250   double secs = TDIFF(t0,t1);
00251   
00252   void* reflink[4];
00253   for(int i=0;i < 4;i++){
00254     reflink[i] = malloc(V*gaugeSiteSize*gSize);
00255     if(reflink[i] == NULL){
00256       errorQuda("ERROR; allocate reflink[%d] failed\n", i);
00257     }
00258   }
00259   
00260   if (verify_results){
00261     
00262     //FIXME: we have this compplication because references takes coeff as float/double 
00263     //        depending on the precision while the GPU code aways take coeff as double
00264     void* coeff;
00265     double coeff_dp[6];
00266     float  coeff_sp[6];
00267     for(int i=0;i < 6;i++){
00268       coeff_sp[i] = coeff_dp[i] = act_path_coeff[i];
00269     }
00270     if(prec == QUDA_DOUBLE_PRECISION){
00271       coeff = coeff_dp;
00272     }else{
00273       coeff = coeff_sp;
00274     }
00275 #ifdef MULTI_GPU
00276     int optflag = 0;
00277     //we need x,y,z site links in the back and forward T slice
00278     // so it is 3*2*Vs_t
00279     for(int i=0;i < 4; i++){
00280     ghost_sitelink[i] = malloc(8*Vs[i]*gaugeSiteSize*gSize);
00281     if (ghost_sitelink[i] == NULL){
00282       printf("ERROR: malloc failed for ghost_sitelink[%d] \n",i);
00283       exit(1);
00284     }
00285     }
00286     
00287     /*
00288       nu |     |
00289          |_____|
00290            mu
00291     */
00292     
00293     for(int nu=0;nu < 4;nu++){
00294       for(int mu=0; mu < 4;mu++){
00295         if(nu == mu){
00296           ghost_sitelink_diag[nu*4+mu] = NULL;
00297         }else{
00298           //the other directions
00299           int dir1, dir2;
00300           for(dir1= 0; dir1 < 4; dir1++){
00301             if(dir1 !=nu && dir1 != mu){
00302               break;
00303             }
00304           }
00305           for(dir2=0; dir2 < 4; dir2++){
00306             if(dir2 != nu && dir2 != mu && dir2 != dir1){
00307               break;
00308             }
00309           }
00310           ghost_sitelink_diag[nu*4+mu] = malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
00311           if(ghost_sitelink_diag[nu*4+mu] == NULL){
00312             errorQuda("malloc failed for ghost_sitelink_diag\n");
00313           }
00314           
00315           memset(ghost_sitelink_diag[nu*4+mu], 0, Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
00316         }
00317         
00318       }
00319     }
00320     
00321     exchange_cpu_sitelink(qudaGaugeParam.X, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag);
00322     llfat_reference_mg(reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, coeff);
00323 #else
00324     llfat_reference(reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
00325 #endif
00326     
00327   }//verify_results
00328   
00329   //format change for fatlink
00330   void* myfatlink[4];
00331   for(int i=0;i < 4;i++){
00332     myfatlink[i] = malloc(V*gaugeSiteSize*gSize);
00333     if(myfatlink[i] == NULL){
00334       printf("Error: malloc failed for myfatlink[%d]\n", i);
00335       exit(1);
00336     }
00337     memset(myfatlink[i], 0, V*gaugeSiteSize*gSize);
00338   }
00339   
00340   for(int i=0;i < V; i++){
00341     for(int dir=0; dir< 4; dir++){
00342       char* src = ((char*)fatlink)+ (4*i+dir)*gaugeSiteSize*gSize;
00343       char* dst = ((char*)myfatlink[dir]) + i*gaugeSiteSize*gSize;
00344       memcpy(dst, src, gaugeSiteSize*gSize);
00345     }
00346   }
00347   
00348     int res=1;
00349     for(int i=0;i < 4;i++){
00350       res &= compare_floats(reflink[i], myfatlink[i], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
00351     }
00352     int accuracy_level;
00353     
00354     accuracy_level = strong_check_link(myfatlink, "GPU results: ",
00355                                        reflink, "CPU reference results:",
00356                                        V, qudaGaugeParam.cpu_prec);
00357     
00358     printfQuda("Test %s\n",(1 == res) ? "PASSED" : "FAILED");
00359     int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3];
00360     int flops= 61632;
00361     double perf = 1.0* flops*volume/(secs*1024*1024*1024);
00362     printfQuda("fatlink computation time =%.2f ms, flops= %.2f Gflops\n", secs*1000, perf);
00363     
00364     
00365     for(int i=0;i < 4;i++){
00366       free(myfatlink[i]);
00367     }
00368     
00369     if (res == 0){//failed
00370       printfQuda("\n");
00371       printfQuda("Warning: your test failed. \n");
00372       printfQuda(" Did you use --verify?\n");
00373       printfQuda(" Did you check the GPU health by running cuda memtest?\n");
00374     }
00375     
00376 #ifdef MULTI_GPU
00377   if (verify_results){
00378     int i;
00379     for(i=0;i < 4;i++){
00380       free(ghost_sitelink[i]);
00381     }
00382     for(i=0;i <4; i++){
00383       for(int j=0;j <4; j++){
00384         if (i==j){
00385           continue;
00386         }
00387         free(ghost_sitelink_diag[i*4+j]);
00388       }
00389     }
00390    }
00391 #endif
00392     
00393     for(int i=0;i < 4; i++){
00394       cudaFreeHost(sitelink[i]);
00395       cudaFreeHost(sitelink_ex[i]);
00396       free(reflink[i]);
00397     }
00398     cudaFreeHost(fatlink);
00399     if(milc_sitelink) free(milc_sitelink);
00400     if(milc_sitelink_ex) free(milc_sitelink_ex);
00401 #ifdef MULTI_GPU
00402     exchange_llfat_cleanup();
00403 #endif
00404     endQuda();
00405     
00406     return accuracy_level;
00407     
00408 }
00409 
00410 static void
00411 display_test_info(int test)
00412 {
00413   printfQuda("running the following test:\n");
00414     
00415   printfQuda("link_precision           link_reconstruct           space_dimension        T_dimension       Test          Ordering\n");
00416   printfQuda("%s                       %s                         %d/%d/%d/                  %d             %d              %s \n", 
00417              get_prec_str(prec),
00418              get_recon_str(link_recon), 
00419              xdim, ydim, zdim, tdim, test, 
00420              get_gauge_order_str(gauge_order));
00421 
00422 #ifdef MULTI_GPU
00423   printfQuda("Grid partition info:     X  Y  Z  T\n");
00424   printfQuda("                         %d  %d  %d  %d\n",
00425              commDimPartitioned(0),
00426              commDimPartitioned(1),
00427              commDimPartitioned(2),
00428              commDimPartitioned(3));
00429 #endif
00430 
00431   return ;
00432   
00433 }
00434 
00435 void
00436 usage_extra(char** argv )
00437 {
00438   printfQuda("Extra options:\n");
00439   printfQuda("    --test <0/1>                             # Test method\n");
00440   printfQuda("                                                0: standard method\n");
00441   printfQuda("                                                1: extended volume method\n");
00442   printfQuda("    --verify                                 # Verify the GPU results using CPU results\n");
00443   printfQuda("    --gauge-order <qdp/milc>                 # ordering of the input gauge-field\n");
00444   return ;
00445 }
00446 
00447 int
00448 main(int argc, char **argv)
00449 {
00450 
00451   int test = 0;
00452   
00453   //default to 18 reconstruct, 8^3 x 8
00454   link_recon = QUDA_RECONSTRUCT_NO;
00455   xdim=ydim=zdim=tdim=8;
00456   cpu_prec = prec = QUDA_DOUBLE_PRECISION;
00457 
00458   int i;
00459   for (i =1;i < argc; i++){
00460 
00461     if(process_command_line_option(argc, argv, &i) == 0){
00462       continue;
00463     }
00464 
00465     
00466     if( strcmp(argv[i], "--test") == 0){
00467       if (i+1 >= argc){
00468         usage(argv);
00469       }
00470       test = atoi(argv[i+1]);
00471       i++;
00472       continue;
00473     }
00474 
00475 
00476     if( strcmp(argv[i], "--gauge-order") == 0){
00477       if(i+1 >= argc){
00478         usage(argv);
00479       }
00480 
00481       if(strcmp(argv[i+1], "milc") == 0){
00482         gauge_order = QUDA_MILC_GAUGE_ORDER;
00483       }else if(strcmp(argv[i+1], "qdp") == 0){
00484         gauge_order = QUDA_QDP_GAUGE_ORDER;
00485       }else{
00486         fprintf(stderr, "Error: unsupported gauge-field order\n");
00487         exit(1);
00488       }
00489       i++;      
00490       continue;
00491     }
00492     
00493 
00494     if( strcmp(argv[i], "--verify") == 0){
00495       verify_results=1;
00496       continue;
00497     }
00498 
00499     fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
00500     usage(argv);
00501   }
00502 
00503 #ifdef MULTI_GPU
00504   if(gauge_order == QUDA_MILC_GAUGE_ORDER && test == 0){
00505     errorQuda("ERROR: milc format for multi-gpu with test0 is not supported yet!\n");
00506   }
00507 #endif
00508 
00509 
00510   initCommsQuda(argc, argv, gridsize_from_cmdline, 4);
00511 
00512 
00513   display_test_info(test);
00514   
00515     
00516   int accuracy_level = llfat_test(test);
00517   
00518   printfQuda("accuracy_level=%d\n", accuracy_level);
00519   
00520   endCommsQuda();
00521 
00522   int ret;
00523   if(accuracy_level >=3 ){
00524     ret = 0;
00525   }else{
00526     ret = 1; //we delclare the test failed
00527   }
00528 
00529   return ret;
00530 }
00531 
00532