QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <iostream> 00002 #include <stdio.h> 00003 #include <stdlib.h> 00004 00005 #include <quda.h> 00006 #include <quda_internal.h> 00007 #include <dirac_quda.h> 00008 #include <dslash_quda.h> 00009 #include <invert_quda.h> 00010 #include <util_quda.h> 00011 #include <blas_quda.h> 00012 00013 #include <test_util.h> 00014 #include <domain_wall_dslash_reference.h> 00015 00016 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat) 00017 const int test_type = 1; 00018 00019 const QudaParity parity = QUDA_EVEN_PARITY; // even or odd? 00020 const QudaDagType dagger = QUDA_DAG_NO; // apply Dslash or Dslash dagger? 00021 const int transfer = 0; // include transfer time in the benchmark? 00022 00023 const int loops = 100; 00024 00025 const int Ls = 16; 00026 double kappa5; 00027 00028 QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; 00029 QudaPrecision cuda_prec = QUDA_DOUBLE_PRECISION; 00030 00031 QudaGaugeParam gauge_param; 00032 QudaInvertParam inv_param; 00033 00034 cpuColorSpinorField *spinor, *spinorOut, *spinorRef; 00035 cudaColorSpinorField *cudaSpinor, *cudaSpinorOut, *tmp=0, *tmp2=0; 00036 00037 void *hostGauge[4]; 00038 00039 Dirac *dirac; 00040 00041 void init() { 00042 00043 gauge_param = newQudaGaugeParam(); 00044 inv_param = newQudaInvertParam(); 00045 00046 gauge_param.X[0] = 16; 00047 gauge_param.X[1] = 16; 00048 gauge_param.X[2] = 16; 00049 gauge_param.X[3] = 16; 00050 00051 setDims(gauge_param.X, Ls); 00052 00053 gauge_param.anisotropy = 2.3; 00054 00055 gauge_param.type = QUDA_WILSON_LINKS; 00056 gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; 00057 gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; 00058 00059 gauge_param.cpu_prec = cpu_prec; 00060 gauge_param.cuda_prec = cuda_prec; 00061 gauge_param.reconstruct = QUDA_RECONSTRUCT_12; 00062 gauge_param.reconstruct_sloppy = gauge_param.reconstruct; 00063 gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec; 00064 gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; 00065 gauge_param.type = QUDA_WILSON_LINKS; 00066 00067 inv_param.inv_type = QUDA_CG_INVERTER; 00068 00069 inv_param.mass = 0.01; 00070 inv_param.m5 = -1.5; 00071 kappa5 = 0.5/(5 + inv_param.m5); 00072 00073 inv_param.Ls = Ls; 00074 00075 inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; 00076 inv_param.dagger = dagger; 00077 00078 inv_param.cpu_prec = cpu_prec; 00079 inv_param.cuda_prec = cuda_prec; 00080 00081 gauge_param.ga_pad = 0; 00082 inv_param.sp_pad = 0; 00083 inv_param.cl_pad = 0; 00084 00085 inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; 00086 inv_param.dirac_order = QUDA_DIRAC_ORDER; 00087 00088 if (test_type == 2) { 00089 inv_param.solution_type = QUDA_MAT_SOLUTION; 00090 } else { 00091 inv_param.solution_type = QUDA_MATPC_SOLUTION; 00092 } 00093 00094 inv_param.dslash_type = QUDA_DOMAIN_WALL_DSLASH; 00095 00096 inv_param.verbosity = QUDA_VERBOSE; 00097 00098 // construct input fields 00099 for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec); 00100 00101 ColorSpinorParam csParam; 00102 00103 csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; 00104 csParam.nColor = 3; 00105 csParam.nSpin = 4; 00106 csParam.nDim = 5; 00107 for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d]; 00108 csParam.x[4] = Ls; 00109 csParam.precision = inv_param.cpu_prec; 00110 csParam.pad = 0; 00111 if (test_type < 2) { 00112 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00113 csParam.x[0] /= 2; 00114 } else { 00115 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00116 } 00117 csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; 00118 csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; 00119 csParam.gammaBasis = inv_param.gamma_basis; 00120 csParam.create = QUDA_ZERO_FIELD_CREATE; 00121 00122 spinor = new cpuColorSpinorField(csParam); 00123 spinorOut = new cpuColorSpinorField(csParam); 00124 spinorRef = new cpuColorSpinorField(csParam); 00125 00126 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00127 csParam.x[0] = gauge_param.X[0]; 00128 00129 printfQuda("Randomizing fields... "); 00130 00131 construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param); 00132 spinor->Source(QUDA_RANDOM_SOURCE); 00133 00134 printfQuda("done.\n"); fflush(stdout); 00135 00136 int dev = 0; 00137 initQuda(dev); 00138 00139 printfQuda("Sending gauge field to GPU\n"); 00140 00141 loadGaugeQuda(hostGauge, &gauge_param); 00142 00143 if (!transfer) { 00144 csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; 00145 csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; 00146 csParam.pad = inv_param.sp_pad; 00147 csParam.precision = inv_param.cuda_prec; 00148 if (csParam.precision == QUDA_DOUBLE_PRECISION ) { 00149 csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; 00150 } else { 00151 /* Single and half */ 00152 csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER; 00153 } 00154 00155 if (test_type < 2) { 00156 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00157 csParam.x[0] /= 2; 00158 } 00159 00160 printfQuda("Creating cudaSpinor\n"); 00161 cudaSpinor = new cudaColorSpinorField(csParam); 00162 printfQuda("Creating cudaSpinorOut\n"); 00163 cudaSpinorOut = new cudaColorSpinorField(csParam); 00164 00165 if (test_type == 2) csParam.x[0] /= 2; 00166 00167 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00168 tmp = new cudaColorSpinorField(csParam); 00169 00170 printfQuda("Sending spinor field to GPU\n"); 00171 *cudaSpinor = *spinor; 00172 00173 std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << 00174 norm2(*cudaSpinor) << std::endl; 00175 00176 bool pc = (test_type != 2); 00177 DiracParam diracParam; 00178 setDiracParam(diracParam, &inv_param, pc); 00179 diracParam.verbose = QUDA_VERBOSE; 00180 diracParam.tmp1 = tmp; 00181 diracParam.tmp2 = tmp2; 00182 00183 dirac = Dirac::create(diracParam); 00184 00185 } else { 00186 std::cout << "Source: CPU = " << norm2(*spinor) << std::endl; 00187 } 00188 00189 } 00190 00191 void end() { 00192 if (!transfer) { 00193 delete dirac; 00194 delete cudaSpinor; 00195 delete cudaSpinorOut; 00196 delete tmp; 00197 } 00198 00199 // release memory 00200 delete spinor; 00201 delete spinorOut; 00202 delete spinorRef; 00203 00204 for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]); 00205 endQuda(); 00206 } 00207 00208 // execute kernel 00209 double dslashCUDA() { 00210 00211 printfQuda("Executing %d kernel loops...\n", loops); 00212 fflush(stdout); 00213 00214 cudaEvent_t start, end; 00215 cudaEventCreate(&start); 00216 cudaEventRecord(start, 0); 00217 cudaEventSynchronize(start); 00218 00219 for (int i = 0; i < loops; i++) { 00220 switch (test_type) { 00221 case 0: 00222 if (transfer) { 00223 dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity); 00224 } else { 00225 dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); 00226 } 00227 break; 00228 case 1: 00229 case 2: 00230 if (transfer) { 00231 MatQuda(spinorOut->V(), spinor->V(), &inv_param); 00232 } else { 00233 dirac->M(*cudaSpinorOut, *cudaSpinor); 00234 } 00235 break; 00236 } 00237 } 00238 00239 cudaEventCreate(&end); 00240 cudaEventRecord(end, 0); 00241 cudaEventSynchronize(end); 00242 float runTime; 00243 cudaEventElapsedTime(&runTime, start, end); 00244 cudaEventDestroy(start); 00245 cudaEventDestroy(end); 00246 00247 double secs = runTime / 1000; //stopwatchReadSeconds(); 00248 00249 // check for errors 00250 cudaError_t stat = cudaGetLastError(); 00251 if (stat != cudaSuccess) 00252 printf("with ERROR: %s\n", cudaGetErrorString(stat)); 00253 00254 printf("done.\n\n"); 00255 00256 return secs; 00257 } 00258 00259 void dslashRef() { 00260 00261 // FIXME: remove once reference clover is finished 00262 if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) { 00263 inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; 00264 } else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { 00265 inv_param.matpc_type = QUDA_MATPC_ODD_ODD; 00266 } 00267 00268 // compare to dslash reference implementation 00269 printf("Calculating reference implementation..."); 00270 fflush(stdout); 00271 switch (test_type) { 00272 case 0: 00273 dslash(spinorRef->V(), hostGauge, spinor->V(), parity, dagger, 00274 inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass); 00275 break; 00276 case 1: 00277 matpc(spinorRef->V(), hostGauge, spinor->V(), kappa5, inv_param.matpc_type, dagger, 00278 inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass); 00279 break; 00280 case 2: 00281 mat(spinorRef->V(), hostGauge, spinor->V(), kappa5, dagger, 00282 inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass); 00283 break; 00284 default: 00285 printf("Test type not defined\n"); 00286 exit(-1); 00287 } 00288 00289 printf("done.\n"); 00290 00291 } 00292 00293 int main(int argc, char **argv) 00294 { 00295 init(); 00296 00297 float spinorGiB = (float)Vh*Ls*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30); 00298 printf("\nSpinor mem: %.3f GiB\n", spinorGiB); 00299 printf("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB); 00300 00301 int attempts = 1; 00302 dslashRef(); 00303 00304 for (int i=0; i<attempts; i++) { 00305 00306 if (tune) { // warm-up run 00307 printfQuda("Tuning...\n"); 00308 setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE); 00309 dslashCUDA(1); 00310 } 00311 00312 double secs = dslashCUDA(); 00313 00314 if (!transfer) *spinorOut = *cudaSpinorOut; 00315 00316 // print timing information 00317 printf("%fms per loop\n", 1000*secs); 00318 00319 unsigned long long flops = 0; 00320 if (!transfer) flops = dirac->Flops(); 00321 00322 int spinor_floats = test_type ? 2*(9*24+24)+24 : 9*24+24; 00323 if (inv_param.cuda_prec == QUDA_HALF_PRECISION) 00324 spinor_floats += test_type ? 2*(9*2 + 2) + 2 : 9*2 + 2; // relative size of norm is twice a short 00325 int gauge_floats = (test_type ? 2 : 1) * (gauge_param.gauge_fix ? 6 : 8) * gauge_param.reconstruct; 00326 00327 printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs); 00328 printfQuda("GB/s = %f\n\n", 00329 (float)Vh*Ls*(spinor_floats+gauge_floats)*inv_param.cuda_prec/((secs/loops)*1e+9)); 00330 00331 00332 if (!transfer) { 00333 std::cout << "Results: CPU = " << norm2(*spinorRef) << ", CUDA = " << norm2(*cudaSpinorOut) << 00334 ", CPU-CUDA = " << norm2(*spinorOut) << std::endl; 00335 } else { 00336 std::cout << "Result: CPU = " << norm2(*spinorRef) << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl; 00337 } 00338 00339 cpuColorSpinorField::Compare(*spinorRef, *spinorOut); 00340 } 00341 end(); 00342 }