QUDA v0.4.0
A library for QCD on GPUs
quda/tests/gauge_force_test.cpp
Go to the documentation of this file.
00001 #include <stdio.h>
00002 #include <stdlib.h>
00003 #include <string.h>
00004 
00005 #include <quda.h>
00006 #include <test_util.h>
00007 #include <gauge_field.h>
00008 #include "misc.h"
00009 #include "gauge_force_reference.h"
00010 #include "gauge_force_quda.h"
00011 #include <sys/time.h>
00012 #include "fat_force_quda.h"
00013 
00014 #ifdef MULTI_GPU
00015 #include <face_quda.h>
00016 #endif
00017 
00018 #define GPU_DIRECT
00019 extern void initCommonConstants(const LatticeField &lat);
00020 
00021 extern int device;
00022 
00023 static QudaGaugeParam qudaGaugeParam;
00024 QudaGaugeFieldOrder gauge_order =  QUDA_QDP_GAUGE_ORDER;
00025 static int verify_results = 0;
00026 extern int tdim;
00027 extern QudaPrecision prec;
00028 extern int xdim;
00029 extern int ydim;
00030 extern int zdim;
00031 extern int tdim;
00032 extern void usage(char** argv);
00033 int attempts = 1;
00034 
00035 int Z[4];
00036 int V;
00037 int Vh;
00038 
00039 int V_ex;
00040 int Vh_ex;
00041 
00042 static int X1, X1h, X2, X3, X4;
00043 static int E1, E1h, E2, E3, E4;
00044 
00045 int E[4];
00046 
00047 extern QudaReconstructType link_recon;
00048 QudaPrecision  link_prec = QUDA_SINGLE_PRECISION;
00049 
00050 extern int gridsize_from_cmdline[];
00051 
00052 
00053 int length[]={
00054   3, 
00055   3, 
00056   3, 
00057   3, 
00058   3, 
00059   3, 
00060   5, 
00061   5, 
00062   5, 
00063   5, 
00064   5, 
00065   5, 
00066   5, 
00067   5, 
00068   5, 
00069   5, 
00070   5, 
00071   5, 
00072   5, 
00073   5, 
00074   5, 
00075   5, 
00076   5, 
00077   5, 
00078   5, 
00079   5, 
00080   5, 
00081   5, 
00082   5, 
00083   5, 
00084   5, 
00085   5, 
00086   5, 
00087   5, 
00088   5, 
00089   5, 
00090   5, 
00091   5, 
00092   5, 
00093   5, 
00094   5, 
00095   5, 
00096   5, 
00097   5, 
00098   5, 
00099   5, 
00100   5, 
00101   5, 
00102 };
00103     
00104 
00105 float loop_coeff_f[]={
00106   1.1,
00107   1.2,
00108   1.3,
00109   1.4,
00110   1.5,
00111   1.6,
00112   2.5,
00113   2.6,
00114   2.7,
00115   2.8,
00116   2.9,
00117   3.0,
00118   3.1,
00119   3.2,
00120   3.3,
00121   3.4,
00122   3.5,
00123   3.6,
00124   3.7,
00125   3.8,
00126   3.9,
00127   4.0,
00128   4.1,
00129   4.2,
00130   4.3,
00131   4.4,
00132   4.5,
00133   4.6,
00134   4.7,
00135   4.8,
00136   4.9,
00137   5.0,
00138   5.1,
00139   5.2,
00140   5.3,
00141   5.4,
00142   5.5,
00143   5.6,
00144   5.7,
00145   5.8,
00146   5.9,
00147   5.0,
00148   6.1,
00149   6.2,
00150   6.3,
00151   6.4,
00152   6.5,
00153   6.6,
00154 };
00155 
00156 int path_dir_x[][5] = {
00157   {1, 7, 6 },
00158   {6, 7, 1 },
00159   {2, 7, 5 },
00160   {5, 7, 2 },
00161   {3, 7, 4 },
00162   {4, 7, 3 },
00163   {0, 1, 7, 7, 6 },
00164   {1, 7, 7, 6, 0 },
00165   {6, 7, 7, 1, 0 },
00166   {0, 6, 7, 7, 1 },
00167   {0, 2, 7, 7, 5 },
00168   {2, 7, 7, 5, 0 },
00169   {5, 7, 7, 2, 0 },
00170   {0, 5, 7, 7, 2 },
00171   {0, 3, 7, 7, 4 },
00172   {3, 7, 7, 4, 0 },
00173   {4, 7, 7, 3, 0 },
00174   {0, 4, 7, 7, 3 },
00175   {6, 6, 7, 1, 1 },
00176   {1, 1, 7, 6, 6 },
00177   {5, 5, 7, 2, 2 },
00178   {2, 2, 7, 5, 5 },
00179   {4, 4, 7, 3, 3 },
00180   {3, 3, 7, 4, 4 },
00181   {1, 2, 7, 6, 5 },
00182   {5, 6, 7, 2, 1 },
00183   {1, 5, 7, 6, 2 },
00184   {2, 6, 7, 5, 1 },
00185   {6, 2, 7, 1, 5 },
00186   {5, 1, 7, 2, 6 },
00187   {6, 5, 7, 1, 2 },
00188   {2, 1, 7, 5, 6 },
00189   {1, 3, 7, 6, 4 },
00190   {4, 6, 7, 3, 1 },
00191   {1, 4, 7, 6, 3 },
00192   {3, 6, 7, 4, 1 },
00193   {6, 3, 7, 1, 4 },
00194   {4, 1, 7, 3, 6 },
00195   {6, 4, 7, 1, 3 },
00196   {3, 1, 7, 4, 6 },
00197   {2, 3, 7, 5, 4 },
00198   {4, 5, 7, 3, 2 },
00199   {2, 4, 7, 5, 3 },
00200   {3, 5, 7, 4, 2 },
00201   {5, 3, 7, 2, 4 },
00202   {4, 2, 7, 3, 5 },
00203   {5, 4, 7, 2, 3 },
00204   {3, 2, 7, 4, 5 },
00205 };
00206 
00207 
00208 int path_dir_y[][5] = {
00209   { 2 ,6 ,5 },
00210   { 5 ,6 ,2 },
00211   { 3 ,6 ,4 },
00212   { 4 ,6 ,3 },
00213   { 0 ,6 ,7 },
00214   { 7 ,6 ,0 },
00215   { 1 ,2 ,6 ,6 ,5 },
00216   { 2 ,6 ,6 ,5 ,1 },
00217   { 5 ,6 ,6 ,2 ,1 },
00218   { 1 ,5 ,6 ,6 ,2 },
00219   { 1 ,3 ,6 ,6 ,4 },
00220   { 3 ,6 ,6 ,4 ,1 },
00221   { 4 ,6 ,6 ,3 ,1 },
00222   { 1 ,4 ,6 ,6 ,3 },
00223   { 1 ,0 ,6 ,6 ,7 },
00224   { 0 ,6 ,6 ,7 ,1 },
00225   { 7 ,6 ,6 ,0 ,1 },
00226   { 1 ,7 ,6 ,6 ,0 },
00227   { 5 ,5 ,6 ,2 ,2 },
00228   { 2 ,2 ,6 ,5 ,5 },
00229   { 4 ,4 ,6 ,3 ,3 },
00230   { 3 ,3 ,6 ,4 ,4 },
00231   { 7 ,7 ,6 ,0 ,0 },
00232   { 0 ,0 ,6 ,7 ,7 },
00233   { 2 ,3 ,6 ,5 ,4 },
00234   { 4 ,5 ,6 ,3 ,2 },
00235   { 2 ,4 ,6 ,5 ,3 },
00236   { 3 ,5 ,6 ,4 ,2 },
00237   { 5 ,3 ,6 ,2 ,4 },
00238   { 4 ,2 ,6 ,3 ,5 },
00239   { 5 ,4 ,6 ,2 ,3 },
00240   { 3 ,2 ,6 ,4 ,5 },
00241   { 2 ,0 ,6 ,5 ,7 },
00242   { 7 ,5 ,6 ,0 ,2 },
00243   { 2 ,7 ,6 ,5 ,0 },
00244   { 0 ,5 ,6 ,7 ,2 },
00245   { 5 ,0 ,6 ,2 ,7 },
00246   { 7 ,2 ,6 ,0 ,5 },
00247   { 5 ,7 ,6 ,2 ,0 },
00248   { 0 ,2 ,6 ,7 ,5 },
00249   { 3 ,0 ,6 ,4 ,7 },
00250   { 7 ,4 ,6 ,0 ,3 },
00251   { 3 ,7 ,6 ,4 ,0 },
00252   { 0 ,4 ,6 ,7 ,3 },
00253   { 4 ,0 ,6 ,3 ,7 },
00254   { 7 ,3 ,6 ,0 ,4 },
00255   { 4 ,7 ,6 ,3 ,0 },
00256   { 0 ,3 ,6 ,7 ,4 }
00257 };
00258 
00259 int path_dir_z[][5] = { 
00260   { 3 ,5 ,4 },
00261   { 4 ,5 ,3 },
00262   { 0 ,5 ,7 },
00263   { 7 ,5 ,0 },
00264   { 1 ,5 ,6 },
00265   { 6 ,5 ,1 },
00266   { 2 ,3 ,5 ,5 ,4 },
00267   { 3 ,5 ,5 ,4 ,2 },
00268   { 4 ,5 ,5 ,3 ,2 },
00269   { 2 ,4 ,5 ,5 ,3 },
00270   { 2 ,0 ,5 ,5 ,7 },
00271   { 0 ,5 ,5 ,7 ,2 },
00272   { 7 ,5 ,5 ,0 ,2 },
00273   { 2 ,7 ,5 ,5 ,0 },
00274   { 2 ,1 ,5 ,5 ,6 },
00275   { 1 ,5 ,5 ,6 ,2 },
00276   { 6 ,5 ,5 ,1 ,2 },
00277   { 2 ,6 ,5 ,5 ,1 },
00278   { 4 ,4 ,5 ,3 ,3 },
00279   { 3 ,3 ,5 ,4 ,4 },
00280   { 7 ,7 ,5 ,0 ,0 },
00281   { 0 ,0 ,5 ,7 ,7 },
00282   { 6 ,6 ,5 ,1 ,1 },
00283   { 1 ,1 ,5 ,6 ,6 },
00284   { 3 ,0 ,5 ,4 ,7 },
00285   { 7 ,4 ,5 ,0 ,3 },
00286   { 3 ,7 ,5 ,4 ,0 },
00287   { 0 ,4 ,5 ,7 ,3 },
00288   { 4 ,0 ,5 ,3 ,7 },
00289   { 7 ,3 ,5 ,0 ,4 },
00290   { 4 ,7 ,5 ,3 ,0 },
00291   { 0 ,3 ,5 ,7 ,4 },
00292   { 3 ,1 ,5 ,4 ,6 },
00293   { 6 ,4 ,5 ,1 ,3 },
00294   { 3 ,6 ,5 ,4 ,1 },
00295   { 1 ,4 ,5 ,6 ,3 },
00296   { 4 ,1 ,5 ,3 ,6 },
00297   { 6 ,3 ,5 ,1 ,4 },
00298   { 4 ,6 ,5 ,3 ,1 },
00299   { 1 ,3 ,5 ,6 ,4 },
00300   { 0 ,1 ,5 ,7 ,6 },
00301   { 6 ,7 ,5 ,1 ,0 },
00302   { 0 ,6 ,5 ,7 ,1 },
00303   { 1 ,7 ,5 ,6 ,0 },
00304   { 7 ,1 ,5 ,0 ,6 },
00305   { 6 ,0 ,5 ,1 ,7 },
00306   { 7 ,6 ,5 ,0 ,1 },
00307   { 1 ,0 ,5 ,6 ,7 }
00308 };
00309 
00310 int path_dir_t[][5] = {
00311   { 0 ,4 ,7 },
00312   { 7 ,4 ,0 },
00313   { 1 ,4 ,6 },
00314   { 6 ,4 ,1 },
00315   { 2 ,4 ,5 },
00316   { 5 ,4 ,2 },
00317   { 3 ,0 ,4 ,4 ,7 },
00318   { 0 ,4 ,4 ,7 ,3 },
00319   { 7 ,4 ,4 ,0 ,3 },
00320   { 3 ,7 ,4 ,4 ,0 },
00321   { 3 ,1 ,4 ,4 ,6 },
00322   { 1 ,4 ,4 ,6 ,3 },
00323   { 6 ,4 ,4 ,1 ,3 },
00324   { 3 ,6 ,4 ,4 ,1 },
00325   { 3 ,2 ,4 ,4 ,5 },
00326   { 2 ,4 ,4 ,5 ,3 },
00327   { 5 ,4 ,4 ,2 ,3 },
00328   { 3 ,5 ,4 ,4 ,2 },
00329   { 7 ,7 ,4 ,0 ,0 },
00330   { 0 ,0 ,4 ,7 ,7 },
00331   { 6 ,6 ,4 ,1 ,1 },
00332   { 1 ,1 ,4 ,6 ,6 },
00333   { 5 ,5 ,4 ,2 ,2 },
00334   { 2 ,2 ,4 ,5 ,5 },
00335   { 0 ,1 ,4 ,7 ,6 },
00336   { 6 ,7 ,4 ,1 ,0 },
00337   { 0 ,6 ,4 ,7 ,1 },
00338   { 1 ,7 ,4 ,6 ,0 },
00339   { 7 ,1 ,4 ,0 ,6 },
00340   { 6 ,0 ,4 ,1 ,7 },
00341   { 7 ,6 ,4 ,0 ,1 },
00342   { 1 ,0 ,4 ,6 ,7 },
00343   { 0 ,2 ,4 ,7 ,5 },
00344   { 5 ,7 ,4 ,2 ,0 },
00345   { 0 ,5 ,4 ,7 ,2 },
00346   { 2 ,7 ,4 ,5 ,0 },
00347   { 7 ,2 ,4 ,0 ,5 },
00348   { 5 ,0 ,4 ,2 ,7 },
00349   { 7 ,5 ,4 ,0 ,2 },
00350   { 2 ,0 ,4 ,5 ,7 },
00351   { 1 ,2 ,4 ,6 ,5 },
00352   { 5 ,6 ,4 ,2 ,1 },
00353   { 1 ,5 ,4 ,6 ,2 },
00354   { 2 ,6 ,4 ,5 ,1 },
00355   { 6 ,2 ,4 ,1 ,5 },
00356   { 5 ,1 ,4 ,2 ,6 },
00357   { 6 ,5 ,4 ,1 ,2 },
00358   { 2 ,1 ,4 ,5 ,6 }
00359 };
00360 
00361 
00362 
00363 void
00364 setDims(int *X) {
00365   V = 1;
00366   for (int d=0; d< 4; d++) {
00367     V *= X[d];
00368     Z[d] = X[d];
00369   }
00370   Vh = V/2;
00371 
00372   V_ex = 1;
00373   for (int d=0; d< 4; d++) {
00374     V_ex *= X[d]+4;
00375   }
00376   Vh_ex = V_ex/2;
00377   
00378   X1=X[0]; X2 = X[1]; X3=X[2]; X4=X[3];
00379   X1h=X1/2;
00380   E1=X1+4; E2=X2+4; E3=X3+4; E4=X4+4;
00381   E1h=E1/2;
00382   
00383   E[0] = E1;
00384   E[1] = E2;
00385   E[2] = E3;
00386   E[3] = E4;
00387 }
00388 
00389 
00390 
00391 static int
00392 gauge_force_test(void) 
00393 {
00394   int max_length = 6;    
00395   
00396   initQuda(device);
00397   
00398   qudaGaugeParam = newQudaGaugeParam();
00399   
00400   qudaGaugeParam.X[0] = xdim;
00401   qudaGaugeParam.X[1] = ydim;
00402   qudaGaugeParam.X[2] = zdim;
00403   qudaGaugeParam.X[3] = tdim;
00404   
00405   setDims(qudaGaugeParam.X);
00406   
00407   qudaGaugeParam.anisotropy = 1.0;
00408   qudaGaugeParam.cpu_prec = link_prec;
00409   qudaGaugeParam.cuda_prec = link_prec;
00410   qudaGaugeParam.reconstruct = link_recon;  
00411   qudaGaugeParam.type = QUDA_WILSON_LINKS; // in this context, just means these are site links   
00412   
00413   qudaGaugeParam.gauge_order = gauge_order;
00414   
00415   int gSize = qudaGaugeParam.cpu_prec;
00416     
00417   void* sitelink;  
00418   void* sitelink_1d;
00419   
00420 #ifdef GPU_DIRECT
00421   cudaMallocHost(&sitelink_1d, 4*V*gaugeSiteSize*gSize);
00422 #else
00423   sitelink_1d= malloc(4*V*gaugeSiteSize*gSize);
00424 #endif
00425   if(sitelink_1d == NULL){
00426     printf("ERROR: malloc failed for sitelink_1d\n");
00427     exit(1);
00428   }
00429   
00430   // this is a hack to have site link generated in 2d 
00431   // then copied to 1d array in "MILC" format
00432   void* sitelink_2d[4];
00433   for(int i=0;i < 4;i++){
00434 #ifdef GPU_DIRECT
00435     cudaMallocHost(&sitelink_2d[i], V*gaugeSiteSize*qudaGaugeParam.cpu_prec);
00436 #else
00437     sitelink_2d[i] = malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec);
00438 #endif
00439   }
00440   
00441   // fills the gauge field with random numbers
00442   createSiteLinkCPU(sitelink_2d, qudaGaugeParam.cpu_prec, 0);
00443   
00444   //copy the 2d sitelink to 1d milc format 
00445   
00446   for(int dir = 0; dir < 4; dir++){
00447     for(int i=0; i < V; i++){
00448       char* src =  ((char*)sitelink_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec;
00449       char* dst =  ((char*)sitelink_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ;
00450       memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
00451     }
00452   }
00453   if (qudaGaugeParam.gauge_order ==  QUDA_MILC_GAUGE_ORDER){ 
00454     sitelink =  sitelink_1d;    
00455   }else{ //QUDA_QDP_GAUGE_ORDER
00456     sitelink = (void**)sitelink_2d;
00457   }  
00458   
00459 #ifdef MULTI_GPU
00460   void* sitelink_ex;
00461   void* sitelink_ex_2d[4];
00462   void* sitelink_ex_1d;
00463 
00464   cudaMallocHost((void**)&sitelink_ex_1d, 4*V_ex*gaugeSiteSize*gSize);  
00465   for(int i=0;i < 4;i++){
00466     cudaMallocHost((void**)&sitelink_ex_2d[i], V_ex*gaugeSiteSize*gSize);
00467     if(sitelink_ex_2d[i] == NULL){
00468       errorQuda("ERROR; allocate sitelink_ex[%d] failed\n", i);
00469     }
00470   }
00471 
00472   for(int i=0; i < V_ex; i++){
00473     int sid = i;
00474     int oddBit=0;
00475     if(i >= Vh_ex){
00476       sid = i - Vh_ex;
00477       oddBit = 1;
00478     }
00479     
00480     int za = sid/E1h;
00481     int x1h = sid - za*E1h;
00482     int zb = za/E2;
00483     int x2 = za - zb*E2;
00484     int x4 = zb/E3;
00485     int x3 = zb - x4*E3;
00486     int x1odd = (x2 + x3 + x4 + oddBit) & 1;
00487     int x1 = 2*x1h + x1odd;
00488 
00489 
00490     if( x1< 2 || x1 >= X1 +2
00491         || x2< 2 || x2 >= X2 +2
00492         || x3< 2 || x3 >= X3 +2
00493         || x4< 2 || x4 >= X4 +2){
00494       continue;
00495     }
00496 
00497 
00498     
00499     x1 = (x1 - 2 + X1) % X1;
00500     x2 = (x2 - 2 + X2) % X2;
00501     x3 = (x3 - 2 + X3) % X3;
00502     x4 = (x4 - 2 + X4) % X4;
00503     
00504     int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
00505     if(oddBit){
00506       idx += Vh;
00507     }
00508     for(int dir= 0; dir < 4; dir++){
00509       char* src = (char*)sitelink_2d[dir];
00510       char* dst = (char*)sitelink_ex_2d[dir];
00511       memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
00512     }//dir
00513   }//i
00514   
00515   
00516   for(int dir = 0; dir < 4; dir++){
00517     for(int i=0; i < V_ex; i++){
00518       char* src =  ((char*)sitelink_ex_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec;
00519       char* dst =  ((char*)sitelink_ex_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ;
00520       memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
00521     }
00522   }
00523   
00524   if(qudaGaugeParam.gauge_order == QUDA_QDP_GAUGE_ORDER){
00525     sitelink_ex = sitelink_ex_2d;
00526   }else{
00527     sitelink_ex = sitelink_ex_1d;
00528   }
00529 
00530 #endif
00531   
00532 
00533 
00534   void* mom = malloc(4*V*momSiteSize*gSize);
00535   void* refmom = malloc(4*V*momSiteSize*gSize);
00536   if(mom == NULL || refmom == NULL){
00537     printf("ERROR: malloc failed for mom/refmom\n");
00538     exit(1);
00539   }    
00540   memset(mom, 0, 4*V*momSiteSize*gSize);
00541   //initiaze some data in cpuMom
00542   createMomCPU(mom,  qudaGaugeParam.cpu_prec);      
00543   memcpy(refmom, mom, 4*V*momSiteSize*qudaGaugeParam.cpu_prec);
00544   
00545   
00546   double loop_coeff_d[sizeof(loop_coeff_f)/sizeof(float)];
00547   for(unsigned int i=0;i < sizeof(loop_coeff_f)/sizeof(float); i++){
00548     loop_coeff_d[i] = loop_coeff_f[i];
00549   }
00550     
00551   void* loop_coeff;
00552   if(qudaGaugeParam.cuda_prec == QUDA_SINGLE_PRECISION){
00553     loop_coeff = (void*)&loop_coeff_f[0];
00554   }else{
00555     loop_coeff = loop_coeff_d;
00556   }
00557   double eb3 = 0.3;
00558   int num_paths = sizeof(path_dir_x)/sizeof(path_dir_x[0]);
00559   
00560   int** input_path_buf[4];
00561   for(int dir =0; dir < 4; dir++){
00562     input_path_buf[dir] = (int**)malloc(num_paths*sizeof(int*));
00563     if (input_path_buf[dir] == NULL){
00564       printf("ERORR: malloc failed for input path\n");
00565       exit(1);
00566     }
00567     
00568     for(int i=0;i < num_paths;i++){
00569       input_path_buf[dir][i] = (int*)malloc(length[i]*sizeof(int));
00570       if (input_path_buf[dir][i] == NULL){
00571         printf("ERROR: malloc failed for input_path_buf[dir][%d]\n", i);
00572         exit(1);
00573       }
00574       if(dir == 0) memcpy(input_path_buf[dir][i], path_dir_x[i], length[i]*sizeof(int));
00575       else if(dir ==1) memcpy(input_path_buf[dir][i], path_dir_y[i], length[i]*sizeof(int));
00576       else if(dir ==2) memcpy(input_path_buf[dir][i], path_dir_z[i], length[i]*sizeof(int));
00577       else if(dir ==3) memcpy(input_path_buf[dir][i], path_dir_t[i], length[i]*sizeof(int));
00578     }
00579   }
00580   
00581   
00582   struct timeval t0, t1;
00583   double timeinfo[3];
00584   /* Multiple execution to exclude warmup time in the first run*/
00585   for (int i =0;i < attempts; i++){
00586     gettimeofday(&t0, NULL);
00587 #ifdef MULTI_GPU
00588     computeGaugeForceQuda(mom, sitelink_ex,  input_path_buf, length,
00589                           loop_coeff, num_paths, max_length, eb3,
00590                           &qudaGaugeParam, timeinfo);
00591     
00592 #else
00593     computeGaugeForceQuda(mom, sitelink,  input_path_buf, length,
00594                           loop_coeff, num_paths, max_length, eb3,
00595                         &qudaGaugeParam, timeinfo);
00596 #endif  
00597     gettimeofday(&t1, NULL);
00598   }
00599   
00600   double total_time = t1.tv_sec - t0.tv_sec + 0.000001*(t1.tv_usec - t0.tv_usec);
00601   //The number comes from CPU implementation in MILC, gauge_force_imp.c
00602   int flops=153004;
00603     
00604   if (verify_results){  
00605     for(int i = 0;i < attempts;i++){
00606 #ifdef MULTI_GPU
00607       //last arg=0 means no optimization for communication, i.e. exchange data in all directions 
00608       //even they are not partitioned
00609       exchange_cpu_sitelink_ex(qudaGaugeParam.X, (void**)sitelink_ex_2d,
00610                                QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0);    
00611       gauge_force_reference(refmom, eb3, sitelink_2d, sitelink_ex_2d, qudaGaugeParam.cpu_prec,
00612                             input_path_buf, length, loop_coeff, num_paths);
00613 #else
00614       gauge_force_reference(refmom, eb3, sitelink_2d, NULL, qudaGaugeParam.cpu_prec,
00615                             input_path_buf, length, loop_coeff, num_paths);
00616 #endif
00617     }
00618   }
00619   
00620   int res;
00621   res = compare_floats(mom, refmom, 4*V*momSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
00622   
00623   int accuracy_level;
00624   accuracy_level = strong_check_mom(mom, refmom, 4*V, qudaGaugeParam.cpu_prec);
00625   
00626   printf("Test %s\n",(1 == res) ? "PASSED" : "FAILED");     
00627   
00628   double perf = 1.0* flops*V/(total_time*1e+9);
00629   double kernel_perf = 1.0*flops*V/(timeinfo[1]*1e+9);
00630   printf("init and cpu->gpu time: %.2f ms, kernel time: %.2f ms, gpu->cpu and cleanup time: %.2f  total time =%.2f ms\n", 
00631          timeinfo[0]*1e+3, timeinfo[1]*1e+3, timeinfo[2]*1e+3, total_time*1e+3);
00632   printf("kernel performance: %.2f GFLOPS, overall performance : %.2f GFOPS\n", kernel_perf, perf);
00633   
00634   for(int dir = 0; dir < 4; dir++){
00635     for(int i=0;i < num_paths; i++){
00636       free(input_path_buf[dir][i]);
00637     }
00638     free(input_path_buf[dir]);
00639   }
00640   
00641 #ifdef GPU_DIRECT  
00642   cudaFreeHost(sitelink_1d);
00643 #else
00644   free(sitelink_1d);
00645 #endif
00646   for(int dir=0;dir < 4;dir++){
00647 #ifdef GPU_DIRECT
00648     cudaFreeHost(sitelink_2d[dir]);
00649 #else
00650     free(sitelink_2d[dir]);
00651 #endif
00652   }
00653   
00654 #ifdef MULTI_GPU  
00655   cudaFreeHost(sitelink_ex_1d);
00656   for(int dir=0; dir < 4; dir++){
00657     cudaFreeHost(sitelink_ex_2d[dir]);
00658   }
00659 #endif
00660 
00661 
00662   free(mom);
00663   free(refmom);
00664   endQuda();
00665   
00666   if (res == 0){//failed
00667     printf("\n");
00668     printf("Warning: you test failed. \n");
00669     printf("        Did you use --verify?\n");
00670     printf("        Did you check the GPU health by running cuda memtest?\n");
00671     }
00672   
00673   return accuracy_level;
00674 }            
00675 
00676 
00677 static void
00678 display_test_info()
00679 {
00680     printf("running the following test:\n");
00681     
00682     printf("link_precision           link_reconstruct           space_dim(x/y/z)              T_dimension        Gauge_order    Attempts\n");
00683     printf("%s                       %s                         %d/%d/%d                       %d                  %s           %d\n",  
00684            get_prec_str(link_prec),
00685            get_recon_str(link_recon), 
00686            xdim,ydim,zdim, tdim, 
00687            get_gauge_order_str(gauge_order),
00688            attempts);
00689     return ;
00690     
00691 }
00692 
00693 void
00694 usage_extra(char** argv )
00695 {
00696   printf("Extra options:\n");
00697   printf("    --gauge-order  <qdp/milc>                 # Gauge storing order in CPU\n");
00698   printf("    --attempts  <n>                           # Number of tests\n");
00699   printf("    --verify                                  # Verify the GPU results using CPU results\n");
00700   return ;
00701 }
00702 
00703 int 
00704 main(int argc, char **argv) 
00705 {
00706     int i;
00707     for (i =1;i < argc; i++){
00708         
00709       if(process_command_line_option(argc, argv, &i) == 0){
00710         continue;
00711       }
00712 
00713       if( strcmp(argv[i], "--gauge-order") == 0){
00714         if(i+1 >= argc){
00715           usage(argv);
00716         }
00717         
00718         if(strcmp(argv[i+1], "milc") == 0){
00719           gauge_order = QUDA_MILC_GAUGE_ORDER;
00720         }else if(strcmp(argv[i+1], "qdp") == 0){
00721           gauge_order = QUDA_QDP_GAUGE_ORDER;
00722         }else{
00723           fprintf(stderr, "Error: unsupported gauge-field order\n");
00724           exit(1);
00725         }
00726         i++;
00727         continue;
00728       }
00729       if( strcmp(argv[i], "--attempts") == 0){
00730         if(i+1 >= argc){
00731           usage(argv);
00732         }
00733         
00734         attempts = atoi(argv[i+1]);
00735         if(attempts <= 0){
00736           printf("ERROR: invalid number of attempts(%d)\n", attempts);
00737         }
00738         i++;
00739         continue;
00740       }
00741      
00742       if( strcmp(argv[i], "--verify") == 0){
00743         verify_results=1;
00744         continue;           
00745       } 
00746       
00747       fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
00748       usage(argv);
00749     }
00750     
00751     
00752     link_prec = prec;
00753 #ifdef MULTI_GPU
00754     initCommsQuda(argc, argv, gridsize_from_cmdline, 4);
00755 #endif
00756 
00757     display_test_info();
00758     
00759     int accuracy_level = gauge_force_test();
00760     printfQuda("accuracy_level=%d\n", accuracy_level);
00761 
00762 #ifdef MULTI_GPU
00763     endCommsQuda();
00764 #endif
00765     
00766     int ret;
00767     if(accuracy_level >=3 ){
00768       ret = 0; 
00769     }else{
00770       ret = 1; //we delclare the test failed
00771     }
00772 
00773     return ret;
00774 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines