|
QUDA v0.3.2
A library for QCD on GPUs
|
00001 #include <iostream> 00002 #include <stdio.h> 00003 #include <stdlib.h> 00004 00005 #include <quda.h> 00006 #include <quda_internal.h> 00007 #include <dirac_quda.h> 00008 #include <dslash_quda.h> 00009 #include <invert_quda.h> 00010 #include <util_quda.h> 00011 #include <blas_quda.h> 00012 00013 #include <test_util.h> 00014 #include <wilson_dslash_reference.h> 00015 00016 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat) 00017 const int test_type = 1; 00018 // clover-improved? (0 = plain Wilson, 1 = clover) 00019 const int clover_yes = 0; 00020 00021 const QudaParity parity = QUDA_EVEN_PARITY; // even or odd? 00022 const QudaDagType dagger = QUDA_DAG_NO; // apply Dslash or Dslash dagger? 00023 const int transfer = 0; // include transfer time in the benchmark? 00024 00025 const int loops = 100; 00026 00027 QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; 00028 QudaPrecision cuda_prec = QUDA_SINGLE_PRECISION; 00029 00030 QudaGaugeParam gauge_param; 00031 QudaInvertParam inv_param; 00032 00033 FullGauge gauge; 00034 FullClover clover, cloverInv; 00035 00036 cpuColorSpinorField *spinor, *spinorOut, *spinorRef; 00037 cudaColorSpinorField *cudaSpinor, *cudaSpinorOut, *tmp=0, *tmp2=0; 00038 00039 void *hostGauge[4], *hostClover, *hostCloverInv; 00040 00041 Dirac *dirac; 00042 00043 void init() { 00044 00045 gauge_param = newQudaGaugeParam(); 00046 inv_param = newQudaInvertParam(); 00047 00048 gauge_param.X[0] = 24; 00049 gauge_param.X[1] = 24; 00050 gauge_param.X[2] = 24; 00051 gauge_param.X[3] = 24; 00052 setDims(gauge_param.X); 00053 00054 gauge_param.anisotropy = 2.3; 00055 00056 gauge_param.type = QUDA_WILSON_LINKS; 00057 gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; 00058 gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; 00059 00060 gauge_param.cpu_prec = cpu_prec; 00061 gauge_param.cuda_prec = cuda_prec; 00062 gauge_param.reconstruct = QUDA_RECONSTRUCT_12; 00063 gauge_param.reconstruct_sloppy = gauge_param.reconstruct; 00064 gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec; 00065 gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; 00066 00067 inv_param.kappa = 0.1; 00068 00069 inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; 00070 inv_param.dagger = dagger; 00071 00072 inv_param.cpu_prec = cpu_prec; 00073 inv_param.cuda_prec = cuda_prec; 00074 00075 gauge_param.ga_pad = 0; 00076 inv_param.sp_pad = 0; 00077 inv_param.cl_pad = 0; 00078 00079 //gauge_param.ga_pad = 24*24*24; 00080 //inv_param.sp_pad = 24*24*24; 00081 //inv_param.cl_pad = 24*24*24; 00082 00083 inv_param.dirac_order = QUDA_DIRAC_ORDER; 00084 00085 if (test_type == 2) { 00086 inv_param.solution_type = QUDA_MAT_SOLUTION; 00087 } else { 00088 inv_param.solution_type = QUDA_MATPC_SOLUTION; 00089 } 00090 00091 if (clover_yes) { 00092 inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; 00093 inv_param.clover_cpu_prec = cpu_prec; 00094 inv_param.clover_cuda_prec = cuda_prec; 00095 inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec; 00096 inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; 00097 if (test_type > 0) { 00098 hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); 00099 hostCloverInv = hostClover; // fake it 00100 } else { 00101 hostClover = NULL; 00102 hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); 00103 } 00104 } else { 00105 inv_param.dslash_type = QUDA_WILSON_DSLASH; 00106 } 00107 00108 inv_param.verbosity = QUDA_VERBOSE; 00109 00110 // construct input fields 00111 for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec); 00112 00113 ColorSpinorParam csParam; 00114 00115 csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; 00116 csParam.nColor = 3; 00117 csParam.nSpin = 4; 00118 csParam.nDim = 4; 00119 for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d]; 00120 csParam.precision = inv_param.cpu_prec; 00121 csParam.pad = 0; 00122 if (test_type < 2) { 00123 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00124 csParam.x[0] /= 2; 00125 } else { 00126 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00127 } 00128 csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; 00129 csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; 00130 csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; 00131 csParam.create = QUDA_ZERO_FIELD_CREATE; 00132 00133 spinor = new cpuColorSpinorField(csParam); 00134 spinorOut = new cpuColorSpinorField(csParam); 00135 spinorRef = new cpuColorSpinorField(csParam); 00136 00137 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00138 csParam.x[0] = gauge_param.X[0]; 00139 00140 printfQuda("Randomizing fields... "); 00141 00142 construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param); 00143 spinor->Source(QUDA_RANDOM_SOURCE); 00144 00145 if (clover_yes) { 00146 double norm = 0.0; // clover components are random numbers in the range (-norm, norm) 00147 double diag = 1.0; // constant added to the diagonal 00148 00149 if (test_type == 2) { 00150 construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec); 00151 } else { 00152 construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec); 00153 } 00154 } 00155 printfQuda("done.\n"); fflush(stdout); 00156 00157 int dev = 0; 00158 initQuda(dev); 00159 00160 printfQuda("Sending gauge field to GPU\n"); 00161 00162 loadGaugeQuda(hostGauge, &gauge_param); 00163 gauge = cudaGaugePrecise; 00164 00165 if (clover_yes) { 00166 printfQuda("Sending clover field to GPU\n"); 00167 loadCloverQuda(hostClover, hostCloverInv, &inv_param); 00168 clover = cudaCloverPrecise; 00169 cloverInv = cudaCloverInvPrecise; 00170 } 00171 00172 if (!transfer) { 00173 csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; 00174 csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; 00175 csParam.pad = inv_param.sp_pad; 00176 csParam.precision = inv_param.cuda_prec; 00177 if (csParam.precision == QUDA_DOUBLE_PRECISION ) { 00178 csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; 00179 } else { 00180 /* Single and half */ 00181 csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER; 00182 } 00183 00184 if (test_type < 2) { 00185 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00186 csParam.x[0] /= 2; 00187 } 00188 00189 printfQuda("Creating cudaSpinor\n"); 00190 cudaSpinor = new cudaColorSpinorField(csParam); 00191 printfQuda("Creating cudaSpinorOut\n"); 00192 cudaSpinorOut = new cudaColorSpinorField(csParam); 00193 00194 if (test_type == 2) csParam.x[0] /= 2; 00195 00196 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00197 tmp = new cudaColorSpinorField(csParam); 00198 if (clover_yes) tmp2 = new cudaColorSpinorField(csParam); 00199 00200 printfQuda("Sending spinor field to GPU\n"); 00201 *cudaSpinor = *spinor; 00202 00203 std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << 00204 norm2(*cudaSpinor) << std::endl; 00205 00206 bool pc = (test_type != 2); 00207 DiracParam diracParam; 00208 setDiracParam(diracParam, &inv_param, pc); 00209 diracParam.verbose = QUDA_VERBOSE; 00210 diracParam.tmp1 = tmp; 00211 diracParam.tmp2 = tmp2; 00212 00213 dirac = Dirac::create(diracParam); 00214 00215 } else { 00216 std::cout << "Source: CPU = " << norm2(*spinor) << std::endl; 00217 } 00218 00219 } 00220 00221 void end() { 00222 if (!transfer) { 00223 delete dirac; 00224 delete cudaSpinor; 00225 delete cudaSpinorOut; 00226 delete tmp; 00227 if (clover_yes) delete tmp2; 00228 } 00229 00230 // release memory 00231 delete spinor; 00232 delete spinorOut; 00233 delete spinorRef; 00234 00235 for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]); 00236 if (clover_yes) { 00237 if (test_type == 2) free(hostClover); 00238 else free(hostCloverInv); 00239 } 00240 endQuda(); 00241 } 00242 00243 // execute kernel 00244 double dslashCUDA() { 00245 00246 printfQuda("Executing %d kernel loops...\n", loops); 00247 fflush(stdout); 00248 stopwatchStart(); 00249 for (int i = 0; i < loops; i++) { 00250 switch (test_type) { 00251 case 0: 00252 if (transfer) { 00253 dslashQuda(spinorOut->v, spinor->v, &inv_param, parity); 00254 } else { 00255 dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); 00256 } 00257 break; 00258 case 1: 00259 case 2: 00260 if (transfer) { 00261 MatQuda(spinorOut->v, spinor->v, &inv_param); 00262 } else { 00263 dirac->M(*cudaSpinorOut, *cudaSpinor); 00264 } 00265 break; 00266 } 00267 } 00268 00269 // check for errors 00270 cudaError_t stat = cudaGetLastError(); 00271 if (stat != cudaSuccess) 00272 printf("with ERROR: %s\n", cudaGetErrorString(stat)); 00273 00274 cudaThreadSynchronize(); 00275 double secs = stopwatchReadSeconds(); 00276 printf("done.\n\n"); 00277 00278 return secs; 00279 } 00280 00281 void dslashRef() { 00282 00283 // FIXME: remove once reference clover is finished 00284 if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) { 00285 inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; 00286 } else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { 00287 inv_param.matpc_type = QUDA_MATPC_ODD_ODD; 00288 } 00289 00290 // compare to dslash reference implementation 00291 printf("Calculating reference implementation..."); 00292 fflush(stdout); 00293 switch (test_type) { 00294 case 0: 00295 dslash(spinorRef->v, hostGauge, spinor->v, parity, dagger, 00296 inv_param.cpu_prec, gauge_param.cpu_prec); 00297 break; 00298 case 1: 00299 matpc(spinorRef->v, hostGauge, spinor->v, inv_param.kappa, inv_param.matpc_type, dagger, 00300 inv_param.cpu_prec, gauge_param.cpu_prec); 00301 break; 00302 case 2: 00303 mat(spinorRef->v, hostGauge, spinor->v, inv_param.kappa, dagger, 00304 inv_param.cpu_prec, gauge_param.cpu_prec); 00305 break; 00306 default: 00307 printf("Test type not defined\n"); 00308 exit(-1); 00309 } 00310 00311 printf("done.\n"); 00312 00313 } 00314 00315 int main(int argc, char **argv) 00316 { 00317 init(); 00318 00319 float spinorGiB = (float)Vh*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30); 00320 float sharedKB = 0;//(float)dslashCudaSharedBytes(inv_param.cuda_prec) / (1 << 10); 00321 printf("\nSpinor mem: %.3f GiB\n", spinorGiB); 00322 printf("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB); 00323 printf("Shared mem: %.3f KB\n", sharedKB); 00324 00325 int attempts = 1; 00326 dslashRef(); 00327 for (int i=0; i<attempts; i++) { 00328 00329 double secs = dslashCUDA(); 00330 00331 if (!transfer) *spinorOut = *cudaSpinorOut; 00332 00333 // print timing information 00334 printf("%fms per loop\n", 1000*secs); 00335 00336 unsigned long long flops = 0; 00337 if (!transfer) flops = dirac->Flops(); 00338 int floats = test_type ? 2*(7*24+8*gauge_param.packed_size+24)+24 : 7*24+8*gauge_param.packed_size+24; 00339 if (clover_yes) { 00340 floats += test_type ? 72*2 : 72; 00341 } 00342 printf("GFLOPS = %f\n", 1.0e-9*flops/secs); 00343 printf("GiB/s = %f\n\n", Vh*floats*sizeof(float)/((secs/loops)*(1<<30))); 00344 00345 if (!transfer) { 00346 std::cout << "Results: CPU = " << norm2(*spinorRef) << ", CUDA = " << norm2(*cudaSpinorOut) << 00347 ", CPU-CUDA = " << norm2(*spinorOut) << std::endl; 00348 } else { 00349 std::cout << "Result: CPU = " << norm2(*spinorRef) << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl; 00350 } 00351 00352 cpuColorSpinorField::Compare(*spinorRef, *spinorOut); 00353 } 00354 end(); 00355 }
1.7.3