QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 #include <string.h> 00004 00005 #include <quda.h> 00006 #include "test_util.h" 00007 #include "gauge_field.h" 00008 #include "fat_force_quda.h" 00009 #include "misc.h" 00010 #include "fermion_force_reference.h" 00011 #include "fermion_force_quda.h" 00012 #include "hw_quda.h" 00013 #include <sys/time.h> 00014 00015 extern void usage(char** argv); 00016 extern int device; 00017 cudaGaugeField *cudaGauge = NULL; 00018 cpuGaugeField *cpuGauge = NULL; 00019 00020 cudaGaugeField *cudaMom = NULL; 00021 cpuGaugeField *cpuMom = NULL; 00022 cpuGaugeField *refMom = NULL; 00023 00024 static FullHw cudaHw; 00025 static QudaGaugeParam gaugeParam; 00026 static void* hw; //the array of half_wilson_vector 00027 00028 extern int gridsize_from_cmdline[]; 00029 00030 int verify_results = 0; 00031 00032 int ODD_BIT = 1; 00033 extern int xdim, ydim, zdim, tdim; 00034 00035 extern QudaReconstructType link_recon; 00036 QudaPrecision link_prec = QUDA_SINGLE_PRECISION; 00037 extern QudaPrecision prec; 00038 QudaPrecision hw_prec = QUDA_SINGLE_PRECISION; 00039 QudaPrecision mom_prec = QUDA_SINGLE_PRECISION; 00040 00041 QudaPrecision cpu_hw_prec = QUDA_SINGLE_PRECISION; 00042 00043 int Z[4]; 00044 int V; 00045 int Vh; 00046 void 00047 setDims(int *X) { 00048 V = 1; 00049 for (int d=0; d< 4; d++) { 00050 V *= X[d]; 00051 Z[d] = X[d]; 00052 } 00053 Vh = V/2; 00054 } 00055 00056 extern void initCommonConstants(const LatticeField &gauge); 00057 00058 static void 00059 fermion_force_init() 00060 { 00061 initQuda(device); 00062 //cudaSetDevice(dev); CUERR; 00063 00064 gaugeParam.X[0] = xdim; 00065 gaugeParam.X[1] = ydim; 00066 gaugeParam.X[2] = zdim; 00067 gaugeParam.X[3] = tdim; 00068 setDims(gaugeParam.X); 00069 00070 gaugeParam.cpu_prec = link_prec; 00071 gaugeParam.cuda_prec = link_prec; 00072 gaugeParam.reconstruct = link_recon; 00073 00074 gaugeParam.gauge_order = QUDA_MILC_GAUGE_ORDER; 00075 00076 GaugeFieldParam gParam(0, gaugeParam); 00077 gParam.create = QUDA_NULL_FIELD_CREATE; 00078 00079 cpuGauge = new cpuGaugeField(gParam); 00080 00081 // this is a hack to have site link generated in 2d 00082 // then copied to 1d array in "MILC" format 00083 void* siteLink_2d[4]; 00084 for(int i=0;i < 4;i++){ 00085 siteLink_2d[i] = malloc(cpuGauge->Volume()*gaugeSiteSize*gaugeParam.cpu_prec); 00086 if (siteLink_2d[i] == NULL){ 00087 errorQuda("ERROR: malloc failed for siteLink_2d\n"); 00088 } 00089 } 00090 00091 // fills the gauge field with random numbers 00092 createSiteLinkCPU(siteLink_2d, gaugeParam.cpu_prec, 1); 00093 00094 //copy the 2d sitelink to 1d milc format 00095 for(int dir=0;dir < 4; dir++){ 00096 for(int i=0;i < cpuGauge->Volume(); i++){ 00097 char* src = ((char*)siteLink_2d[dir]) + i * gaugeSiteSize* gaugeParam.cpu_prec; 00098 char* dst = ((char*)cpuGauge->Gauge_p()) + (4*i+dir)*gaugeSiteSize*gaugeParam.cpu_prec ; 00099 memcpy(dst, src, gaugeSiteSize*gaugeParam.cpu_prec); 00100 } 00101 } 00102 00103 for(int i=0;i < 4;i++){ 00104 free(siteLink_2d[i]); 00105 } 00106 00107 #if 0 00108 site_link_sanity_check(siteLink, V, gaugeParam.cpu_prec, &gaugeParam); 00109 #endif 00110 00111 //gaugeParam.site_ga_pad = gaugeParam.ga_pad = 0; 00112 //gaugeParam.reconstruct = link_recon; 00113 00114 gParam.precision = gaugeParam.cuda_prec; 00115 gParam.reconstruct = link_recon; 00116 cudaGauge = new cudaGaugeField(gParam); 00117 00118 gParam.reconstruct = QUDA_RECONSTRUCT_10; 00119 gParam.precision = gaugeParam.cpu_prec; 00120 cpuMom = new cpuGaugeField(gParam); 00121 refMom = new cpuGaugeField(gParam); 00122 00123 createMomCPU(cpuMom->Gauge_p(), mom_prec); 00124 //memset(cpuMom->Gauge_p(), 0, 4*cpuMom->Volume()*momSiteSize*gaugeParam.cpu_prec); 00125 00126 memcpy(refMom->Gauge_p(), cpuMom->Gauge_p(), 4*cpuMom->Volume()*momSiteSize*gaugeParam.cpu_prec); 00127 00128 gParam.precision = gaugeParam.cuda_prec; 00129 cudaMom = new cudaGaugeField(gParam); 00130 00131 hw = malloc(4*cpuGauge->Volume()*hwSiteSize*gaugeParam.cpu_prec); 00132 if (hw == NULL){ 00133 fprintf(stderr, "ERROR: malloc failed for hw\n"); 00134 exit(1); 00135 } 00136 createHwCPU(hw, hw_prec); 00137 00138 cudaHw = createHwQuda(gaugeParam.X, hw_prec); 00139 00140 return; 00141 } 00142 00143 static void 00144 fermion_force_end() 00145 { 00146 delete cudaMom; 00147 delete cudaGauge; 00148 00149 delete cpuGauge; 00150 delete cpuMom; 00151 delete refMom; 00152 00153 freeHwQuda(cudaHw); 00154 free(hw); 00155 00156 endQuda(); 00157 } 00158 00159 00160 static int 00161 fermion_force_test(void) 00162 { 00163 00164 fermion_force_init(); 00165 initCommonConstants(*cudaGauge); 00166 fermion_force_init_cuda(&gaugeParam); 00167 00168 00169 float eps= 0.02; 00170 float weight1 =1.0; 00171 float weight2 =1.0; 00172 float act_path_coeff[6]; 00173 00174 act_path_coeff[0] = 0.625000; 00175 act_path_coeff[1] = -0.058479; 00176 act_path_coeff[2] = -0.087719; 00177 act_path_coeff[3] = 0.030778; 00178 act_path_coeff[4] = -0.007200; 00179 act_path_coeff[5] = -0.123113; 00180 00181 // download the momentum field to the GPU 00182 cudaMom->loadCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION); 00183 00184 // download the gauge field to the GPU 00185 cudaGauge->loadCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION); 00186 00187 loadHwToGPU(cudaHw, hw, cpu_hw_prec); 00188 00189 00190 if (verify_results){ 00191 fermion_force_reference(eps, weight1, weight2, act_path_coeff, hw, cpuGauge->Gauge_p(), refMom->Gauge_p()); 00192 } 00193 00194 00195 /* 00196 * The flops number comes from CPU implementation in MILC 00197 * function eo_fermion_force_twoterms_field(), fermion_force_asqtad.c 00198 * 00199 */ 00200 int flops = 433968; 00201 00202 struct timeval t0, t1; 00203 cudaThreadSynchronize(); 00204 00205 gettimeofday(&t0, NULL); 00206 fermion_force_cuda(eps, weight1, weight2, act_path_coeff, cudaHw, *cudaGauge, *cudaMom, &gaugeParam); 00207 cudaThreadSynchronize(); 00208 gettimeofday(&t1, NULL); 00209 double secs = t1.tv_sec - t0.tv_sec + 0.000001*(t1.tv_usec - t0.tv_usec); 00210 00211 // copy the new momentum back on the CPU 00212 cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION); 00213 00214 int res; 00215 res = compare_floats(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume()*momSiteSize, 1e-5, gaugeParam.cpu_prec); 00216 00217 int accuracy_level; 00218 accuracy_level = strong_check_mom(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume(), gaugeParam.cpu_prec); 00219 00220 printf("Test %s\n",(1 == res) ? "PASSED" : "FAILED"); 00221 00222 int volume = gaugeParam.X[0]*gaugeParam.X[1]*gaugeParam.X[2]*gaugeParam.X[3]; 00223 double perf = 1.0* flops*volume/(secs*1024*1024*1024); 00224 printf("GPU runtime =%.2f ms, kernel performance= %.2f GFLOPS\n", secs*1000, perf); 00225 00226 fermion_force_end(); 00227 00228 if (res == 0){//failed 00229 printf("\n"); 00230 printf("Warning: you test failed. \n"); 00231 printf(" Did you use --verify?\n"); 00232 printf(" Did you check the GPU health by running cuda memtest?\n"); 00233 } 00234 00235 return accuracy_level; 00236 } 00237 00238 00239 static void 00240 display_test_info() 00241 { 00242 printf("running the following fermion force computation test:\n"); 00243 00244 printf("link_precision link_reconstruct space_dim(x/y/z) T_dimension\n"); 00245 printf("%s %s %d/%d/%d %d \n", 00246 get_prec_str(link_prec), 00247 get_recon_str(link_recon), 00248 xdim, ydim, zdim, tdim); 00249 return ; 00250 00251 } 00252 00253 void 00254 usage_extra(char** argv ) 00255 { 00256 printf("Extra options: \n"); 00257 printf(" --verify # Verify the GPU results using CPU results\n"); 00258 return ; 00259 } 00260 00261 int 00262 main(int argc, char **argv) 00263 { 00264 int i; 00265 for (i =1;i < argc; i++){ 00266 if(process_command_line_option(argc, argv, &i) == 0){ 00267 continue; 00268 } 00269 00270 if( strcmp(argv[i], "--verify") == 0){ 00271 verify_results=1; 00272 continue; 00273 } 00274 fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]); 00275 usage(argv); 00276 } 00277 00278 #ifdef MULTI_GPU 00279 initCommsQuda(argc, argv, gridsize_from_cmdline, 4); 00280 #endif 00281 00282 link_prec = prec; 00283 00284 display_test_info(); 00285 00286 int accuracy_level = fermion_force_test(); 00287 printfQuda("accuracy_level=%d\n", accuracy_level); 00288 00289 00290 #ifdef MULTI_GPU 00291 endCommsQuda(); 00292 #endif 00293 00294 int ret; 00295 if(accuracy_level >=3 ){ 00296 ret = 0; 00297 }else{ 00298 ret = 1; //we delclare the test failed 00299 } 00300 00301 00302 return ret; 00303 }