|
QUDA v0.3.2
A library for QCD on GPUs
|
00001 #include <iostream> 00002 #include <stdio.h> 00003 #include <stdlib.h> 00004 00005 #include <quda.h> 00006 #include <quda_internal.h> 00007 #include <dirac_quda.h> 00008 #include <dslash_quda.h> 00009 #include <invert_quda.h> 00010 #include <util_quda.h> 00011 #include <blas_quda.h> 00012 00013 #include <test_util.h> 00014 #include <domain_wall_dslash_reference.h> 00015 00016 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat) 00017 const int test_type = 2; 00018 00019 const QudaParity parity = QUDA_EVEN_PARITY; // even or odd? 00020 const QudaDagType dagger = QUDA_DAG_NO; // apply Dslash or Dslash dagger? 00021 const int transfer = 0; // include transfer time in the benchmark? 00022 00023 const int loops = 100; 00024 00025 const int Ls = 16; 00026 double kappa5; 00027 00028 QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; 00029 QudaPrecision cuda_prec = QUDA_SINGLE_PRECISION; 00030 00031 QudaGaugeParam gauge_param; 00032 QudaInvertParam inv_param; 00033 00034 FullGauge gauge; 00035 00036 cpuColorSpinorField *spinor, *spinorOut, *spinorRef; 00037 cudaColorSpinorField *cudaSpinor, *cudaSpinorOut, *tmp=0, *tmp2=0; 00038 00039 void *hostGauge[4]; 00040 00041 Dirac *dirac; 00042 00043 void init() { 00044 00045 gauge_param = newQudaGaugeParam(); 00046 inv_param = newQudaInvertParam(); 00047 00048 gauge_param.X[0] = 12; 00049 gauge_param.X[1] = 12; 00050 gauge_param.X[2] = 12; 00051 gauge_param.X[3] = 12; 00052 00053 setDims(gauge_param.X, Ls); 00054 00055 gauge_param.anisotropy = 2.3; 00056 00057 gauge_param.type = QUDA_WILSON_LINKS; 00058 gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; 00059 gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; 00060 00061 gauge_param.cpu_prec = cpu_prec; 00062 gauge_param.cuda_prec = cuda_prec; 00063 gauge_param.reconstruct = QUDA_RECONSTRUCT_12; 00064 gauge_param.reconstruct_sloppy = gauge_param.reconstruct; 00065 gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec; 00066 gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; 00067 gauge_param.type = QUDA_WILSON_LINKS; 00068 00069 inv_param.inv_type = QUDA_CG_INVERTER; 00070 00071 inv_param.mass = 0.01; 00072 inv_param.m5 = -1.5; 00073 kappa5 = 0.5/(5 + inv_param.m5); 00074 00075 inv_param.Ls = Ls; 00076 00077 inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; 00078 inv_param.dagger = dagger; 00079 00080 inv_param.cpu_prec = cpu_prec; 00081 inv_param.cuda_prec = cuda_prec; 00082 00083 gauge_param.ga_pad = 0; 00084 inv_param.sp_pad = 0; 00085 inv_param.cl_pad = 0; 00086 00087 inv_param.dirac_order = QUDA_DIRAC_ORDER; 00088 00089 if (test_type == 2) { 00090 inv_param.solution_type = QUDA_MAT_SOLUTION; 00091 } else { 00092 inv_param.solution_type = QUDA_MATPC_SOLUTION; 00093 } 00094 00095 inv_param.dslash_type = QUDA_DOMAIN_WALL_DSLASH; 00096 00097 inv_param.verbosity = QUDA_VERBOSE; 00098 00099 // construct input fields 00100 for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec); 00101 00102 ColorSpinorParam csParam; 00103 00104 csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; 00105 csParam.nColor = 3; 00106 csParam.nSpin = 4; 00107 csParam.nDim = 5; 00108 for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d]; 00109 csParam.x[4] = Ls; 00110 csParam.precision = inv_param.cpu_prec; 00111 csParam.pad = 0; 00112 if (test_type < 2) { 00113 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00114 csParam.x[0] /= 2; 00115 } else { 00116 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00117 } 00118 csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; 00119 csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; 00120 csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; 00121 csParam.create = QUDA_ZERO_FIELD_CREATE; 00122 00123 spinor = new cpuColorSpinorField(csParam); 00124 spinorOut = new cpuColorSpinorField(csParam); 00125 spinorRef = new cpuColorSpinorField(csParam); 00126 00127 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00128 csParam.x[0] = gauge_param.X[0]; 00129 00130 printfQuda("Randomizing fields... "); 00131 00132 construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param); 00133 spinor->Source(QUDA_RANDOM_SOURCE); 00134 00135 printfQuda("done.\n"); fflush(stdout); 00136 00137 int dev = 0; 00138 initQuda(dev); 00139 00140 printfQuda("Sending gauge field to GPU\n"); 00141 00142 loadGaugeQuda(hostGauge, &gauge_param); 00143 gauge = cudaGaugePrecise; 00144 00145 if (!transfer) { 00146 csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; 00147 csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; 00148 csParam.pad = inv_param.sp_pad; 00149 csParam.precision = inv_param.cuda_prec; 00150 if (csParam.precision == QUDA_DOUBLE_PRECISION ) { 00151 csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; 00152 } else { 00153 /* Single and half */ 00154 csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER; 00155 } 00156 00157 if (test_type < 2) { 00158 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00159 csParam.x[0] /= 2; 00160 } 00161 00162 printfQuda("Creating cudaSpinor\n"); 00163 cudaSpinor = new cudaColorSpinorField(csParam); 00164 printfQuda("Creating cudaSpinorOut\n"); 00165 cudaSpinorOut = new cudaColorSpinorField(csParam); 00166 00167 if (test_type == 2) csParam.x[0] /= 2; 00168 00169 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00170 tmp = new cudaColorSpinorField(csParam); 00171 00172 printfQuda("Sending spinor field to GPU\n"); 00173 *cudaSpinor = *spinor; 00174 00175 std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << 00176 norm2(*cudaSpinor) << std::endl; 00177 00178 bool pc = (test_type != 2); 00179 DiracParam diracParam; 00180 setDiracParam(diracParam, &inv_param, pc); 00181 diracParam.verbose = QUDA_VERBOSE; 00182 diracParam.tmp1 = tmp; 00183 diracParam.tmp2 = tmp2; 00184 00185 dirac = Dirac::create(diracParam); 00186 00187 } else { 00188 std::cout << "Source: CPU = " << norm2(*spinor) << std::endl; 00189 } 00190 00191 } 00192 00193 void end() { 00194 if (!transfer) { 00195 delete dirac; 00196 delete cudaSpinor; 00197 delete cudaSpinorOut; 00198 delete tmp; 00199 } 00200 00201 // release memory 00202 delete spinor; 00203 delete spinorOut; 00204 delete spinorRef; 00205 00206 for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]); 00207 endQuda(); 00208 } 00209 00210 // execute kernel 00211 double dslashCUDA() { 00212 00213 printfQuda("Executing %d kernel loops...\n", loops); 00214 fflush(stdout); 00215 stopwatchStart(); 00216 for (int i = 0; i < loops; i++) { 00217 switch (test_type) { 00218 case 0: 00219 if (transfer) { 00220 dslashQuda(spinorOut->v, spinor->v, &inv_param, parity); 00221 } else { 00222 dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); 00223 } 00224 break; 00225 case 1: 00226 case 2: 00227 if (transfer) { 00228 MatQuda(spinorOut->v, spinor->v, &inv_param); 00229 } else { 00230 dirac->M(*cudaSpinorOut, *cudaSpinor); 00231 } 00232 break; 00233 } 00234 } 00235 00236 // check for errors 00237 cudaError_t stat = cudaGetLastError(); 00238 if (stat != cudaSuccess) 00239 printf("with ERROR: %s\n", cudaGetErrorString(stat)); 00240 00241 cudaThreadSynchronize(); 00242 double secs = stopwatchReadSeconds(); 00243 printf("done.\n\n"); 00244 00245 return secs; 00246 } 00247 00248 void dslashRef() { 00249 00250 // FIXME: remove once reference clover is finished 00251 if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) { 00252 inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; 00253 } else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { 00254 inv_param.matpc_type = QUDA_MATPC_ODD_ODD; 00255 } 00256 00257 // compare to dslash reference implementation 00258 printf("Calculating reference implementation..."); 00259 fflush(stdout); 00260 switch (test_type) { 00261 case 0: 00262 dslash(spinorRef->v, hostGauge, spinor->v, parity, dagger, 00263 inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass); 00264 break; 00265 case 1: 00266 matpc(spinorRef->v, hostGauge, spinor->v, kappa5, inv_param.matpc_type, dagger, 00267 inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass); 00268 break; 00269 case 2: 00270 mat(spinorRef->v, hostGauge, spinor->v, kappa5, dagger, 00271 inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass); 00272 break; 00273 default: 00274 printf("Test type not defined\n"); 00275 exit(-1); 00276 } 00277 00278 printf("done.\n"); 00279 00280 } 00281 00282 int main(int argc, char **argv) 00283 { 00284 init(); 00285 00286 float spinorGiB = (float)Vh*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30); 00287 float sharedKB = 0;//(float)dslashCudaSharedBytes(inv_param.cuda_prec) / (1 << 10); 00288 printf("\nSpinor mem: %.3f GiB\n", spinorGiB); 00289 printf("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB); 00290 printf("Shared mem: %.3f KB\n", sharedKB); 00291 00292 int attempts = 1; 00293 dslashRef(); 00294 00295 for (int i=0; i<attempts; i++) { 00296 00297 double secs = dslashCUDA(); 00298 00299 if (!transfer) *spinorOut = *cudaSpinorOut; 00300 00301 // print timing information 00302 printf("%fms per loop\n", 1000*secs); 00303 00304 unsigned long long flops = 0; 00305 if (!transfer) flops = dirac->Flops(); 00306 int floats = test_type ? 2*(7*24+8*gauge_param.packed_size+24)+24 : 7*24+8*gauge_param.packed_size+24; 00307 printf("GFLOPS = %f\n", 1.0e-9*flops/secs); 00308 printf("GiB/s = %f\n\n", Vh*floats*sizeof(float)/((secs/loops)*(1<<30))); 00309 00310 if (!transfer) { 00311 std::cout << "Results: CPU = " << norm2(*spinorRef) << ", CUDA = " << norm2(*cudaSpinorOut) << 00312 ", CPU-CUDA = " << norm2(*spinorOut) << std::endl; 00313 } else { 00314 std::cout << "Result: CPU = " << norm2(*spinorRef) << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl; 00315 } 00316 00317 cpuColorSpinorField::Compare(*spinorRef, *spinorOut); 00318 } 00319 end(); 00320 }
1.7.3