QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 #include <string.h> 00004 #include <sys/time.h> 00005 #include <cuda.h> 00006 #include <cuda_runtime.h> 00007 00008 #include "quda.h" 00009 #include "gauge_field.h" 00010 #include "test_util.h" 00011 #include "llfat_reference.h" 00012 #include "misc.h" 00013 #include "util_quda.h" 00014 #include "llfat_quda.h" 00015 #include "fat_force_quda.h" 00016 #include "hisq_links_quda.h" 00017 #include "dslash_quda.h" 00018 #include "hisq_force_quda.h" 00019 00020 #ifdef MULTI_GPU 00021 #include "face_quda.h" 00022 #include "comm_quda.h" 00023 #endif 00024 00025 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec)) 00026 00027 extern void usage(char** argv); 00028 static int verify_results = 0; 00029 00030 extern int device; 00031 int Z[4]; 00032 int V; 00033 int Vh; 00034 int Vs[4]; 00035 int Vsh[4]; 00036 00037 static int V_ex; 00038 static int Vh_ex; 00039 00040 00041 static double unitarize_eps = 1e-6; 00042 static bool reunit_allow_svd = true; 00043 static bool reunit_svd_only = false; 00044 static double svd_rel_error = 1e-4; 00045 static double svd_abs_error = 1e-5; 00046 static double max_allowed_error = 1e-12; 00047 static bool check_unitarization = true; 00048 00049 00050 extern int xdim, ydim, zdim, tdim; 00051 extern int gridsize_from_cmdline[]; 00052 00053 extern QudaReconstructType link_recon; 00054 extern QudaPrecision prec; 00055 static QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; 00056 static QudaGaugeFieldOrder gauge_order = QUDA_QDP_GAUGE_ORDER; 00057 00058 static size_t gSize; 00059 00060 void 00061 setDims(const int *X) { 00062 V = 1; 00063 for (int dir=0; dir<4; ++dir) { 00064 V *= X[dir]; 00065 Z[dir] = X[dir]; 00066 } 00067 Vh = V/2; 00068 00069 Vs[0] = X[1]*X[2]*X[3]; 00070 Vs[1] = X[0]*X[2]*X[3]; 00071 Vs[2] = X[0]*X[1]*X[3]; 00072 Vs[3] = X[0]*X[1]*X[2]; 00073 for(int dir=0; dir<4; ++dir) Vsh[dir] = Vs[dir]/2; 00074 V_ex = 1; 00075 for (int d=0; d< 4; d++) { 00076 V_ex *= X[d]+4; 00077 } 00078 Vh_ex = V_ex/2; 00079 return; 00080 } 00081 00082 static int 00083 unitarize_link_test() 00084 { 00085 00086 QudaGaugeParam qudaGaugeParam = newQudaGaugeParam(); 00087 00088 initQuda(0); 00089 00090 cpu_prec = prec; 00091 gSize = cpu_prec; 00092 qudaGaugeParam.anisotropy = 1.0; 00093 00094 qudaGaugeParam.X[0] = xdim; 00095 qudaGaugeParam.X[1] = ydim; 00096 qudaGaugeParam.X[2] = zdim; 00097 qudaGaugeParam.X[3] = tdim; 00098 00099 setDims(qudaGaugeParam.X); 00100 00101 QudaPrecision link_prec = QUDA_SINGLE_PRECISION; 00102 QudaReconstructType link_recon = QUDA_RECONSTRUCT_NO; 00103 00104 qudaGaugeParam.cpu_prec = link_prec; 00105 qudaGaugeParam.cuda_prec = link_prec; 00106 qudaGaugeParam.reconstruct = link_recon; 00107 qudaGaugeParam.type = QUDA_WILSON_LINKS; 00108 00109 00110 hisq::fermion_force::hisqForceInitCuda(&qudaGaugeParam); 00111 00112 qudaGaugeParam.t_boundary = QUDA_PERIODIC_T; 00113 qudaGaugeParam.anisotropy = 1.0; 00114 qudaGaugeParam.cuda_prec_sloppy = prec; 00115 qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; 00116 qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; 00117 qudaGaugeParam.ga_pad = 0; 00118 qudaGaugeParam.packed_size = 0; 00119 qudaGaugeParam.gaugeGiB = 0; 00120 qudaGaugeParam.flag = false; 00121 00122 00123 qudaGaugeParam.cpu_prec = cpu_prec; 00124 qudaGaugeParam.cuda_prec = prec; 00125 qudaGaugeParam.gauge_order = gauge_order; 00126 qudaGaugeParam.type=QUDA_WILSON_LINKS; 00127 qudaGaugeParam.reconstruct = link_recon; 00128 qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE 00129 | QUDA_FAT_PRESERVE_GPU_GAUGE 00130 | QUDA_FAT_PRESERVE_COMM_MEM; 00131 00132 setFatLinkPadding(QUDA_COMPUTE_FAT_STANDARD, &qudaGaugeParam); 00133 00134 GaugeFieldParam gParam(0, qudaGaugeParam); 00135 gParam.pad = 0; 00136 gParam.create = QUDA_REFERENCE_FIELD_CREATE; 00137 gParam.link_type = QUDA_WILSON_LINKS; 00138 gParam.order = QUDA_MILC_GAUGE_ORDER; 00139 cpuGaugeField *cpuOutLink = new cpuGaugeField(gParam); 00140 00141 gParam.pad = 0; 00142 gParam.create = QUDA_NULL_FIELD_CREATE; 00143 gParam.link_type = QUDA_WILSON_LINKS; 00144 gParam.order = QUDA_QDP_GAUGE_ORDER; 00145 gParam.reconstruct = QUDA_RECONSTRUCT_NO; 00146 cudaGaugeField *cudaFatLink = new cudaGaugeField(gParam); 00147 cudaGaugeField *cudaULink = new cudaGaugeField(gParam); 00148 00149 initCommonConstants(*cudaFatLink); 00150 00151 void* fatlink = (void*)malloc(4*V*gaugeSiteSize*gSize); 00152 if(fatlink == NULL){ 00153 errorQuda("ERROR: allocating fatlink failed\n"); 00154 } 00155 00156 void* sitelink[4]; 00157 for(int i=0;i < 4;i++){ 00158 cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize); 00159 if(sitelink[i] == NULL){ 00160 errorQuda("ERROR; allocate sitelink[%d] failed\n", i); 00161 } 00162 } 00163 00164 createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1); 00165 00166 double act_path_coeff[6]; 00167 act_path_coeff[0] = 0.625000; 00168 act_path_coeff[1] = -0.058479; 00169 act_path_coeff[2] = -0.087719; 00170 act_path_coeff[3] = 0.030778; 00171 act_path_coeff[4] = -0.007200; 00172 act_path_coeff[5] = -0.123113; 00173 00174 00175 //only record the last call's performance 00176 //the first one is for creating the cpu/cuda data structures 00177 00178 if(gauge_order == QUDA_QDP_GAUGE_ORDER){ 00179 computeFatLinkQuda(fatlink, sitelink, act_path_coeff, &qudaGaugeParam, 00180 QUDA_COMPUTE_FAT_STANDARD); 00181 } // gauge order is QDP_GAUGE_ORDER 00182 00183 cpuOutLink->setGauge((void**)fatlink); 00184 cudaFatLink->loadCPUField(*cpuOutLink, QUDA_CPU_FIELD_LOCATION); 00185 00186 00187 00188 hisq::setUnitarizeLinksConstants(unitarize_eps, 00189 max_allowed_error, 00190 reunit_allow_svd, 00191 reunit_svd_only, 00192 svd_rel_error, 00193 svd_abs_error); 00194 00195 hisq::setUnitarizeLinksPadding(0,0); 00196 00197 int* num_failures_dev; 00198 if(cudaMalloc(&num_failures_dev, sizeof(int)) != cudaSuccess){ 00199 errorQuda("cudaMallo failed for num_failures_dev\n"); 00200 } 00201 cudaMemset(num_failures_dev, 0, sizeof(int)); 00202 00203 struct timeval t0, t1; 00204 00205 gettimeofday(&t0,NULL); 00206 hisq::unitarizeLinksCuda(qudaGaugeParam,*cudaFatLink, cudaULink, num_failures_dev); 00207 cudaThreadSynchronize(); 00208 gettimeofday(&t1,NULL); 00209 00210 int num_failures=0; 00211 cudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost); 00212 00213 delete cudaFatLink; 00214 delete cudaULink; 00215 for(int dir=0; dir<4; ++dir) cudaFreeHost(sitelink[dir]); 00216 cudaFree(num_failures_dev); 00217 #ifdef MULTI_GPU 00218 exchange_llfat_cleanup(); 00219 #endif 00220 endQuda(); 00221 00222 printfQuda("Unitarization time: %g ms\n", TDIFF(t0,t1)*1000); 00223 return num_failures; 00224 } 00225 00226 static void 00227 display_test_info() 00228 { 00229 printfQuda("running the following test:\n"); 00230 00231 printfQuda("link_precision link_reconstruct space_dimension T_dimension algorithm max allowed error\n"); 00232 printfQuda("%s %s %d/%d/%d/ %d %s %g \n", 00233 get_prec_str(prec), 00234 get_recon_str(link_recon), 00235 xdim, ydim, zdim, tdim, 00236 get_unitarization_str(reunit_svd_only), 00237 max_allowed_error); 00238 00239 #ifdef MULTI_GPU 00240 printfQuda("Grid partition info: X Y Z T\n"); 00241 printfQuda(" %d %d %d %d\n", 00242 commDimPartitioned(0), 00243 commDimPartitioned(1), 00244 commDimPartitioned(2), 00245 commDimPartitioned(3)); 00246 #endif 00247 00248 return ; 00249 00250 } 00251 00252 00253 int 00254 main(int argc, char **argv) 00255 { 00256 //default to 18 reconstruct, 8^3 x 8 00257 link_recon = QUDA_RECONSTRUCT_NO; 00258 xdim=ydim=zdim=tdim=8; 00259 cpu_prec = prec = QUDA_DOUBLE_PRECISION; 00260 00261 int i; 00262 for (i=1; i<argc; i++){ 00263 if(process_command_line_option(argc, argv, &i) == 0){ 00264 continue; 00265 } 00266 00267 fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]); 00268 usage(argv); 00269 } 00270 00271 initCommsQuda(argc, argv, gridsize_from_cmdline, 4); 00272 00273 display_test_info(); 00274 int num_failures = unitarize_link_test(); 00275 printfQuda("Number of failures = %d\n", num_failures); 00276 if(num_failures > 0){ 00277 printfQuda("Failure rate = %lf%\n", num_failures/(4.0*V)); 00278 printfQuda("You may want to increase your error tolerance or vary the unitarization parameters\n"); 00279 } 00280 endCommsQuda(); 00281 00282 return EXIT_SUCCESS; 00283 } 00284 00285