|
QUDA v0.3.2
A library for QCD on GPUs
|
00001 #include <iostream> 00002 #include <stdio.h> 00003 #include <stdlib.h> 00004 00005 #include <quda.h> 00006 #include <quda_internal.h> 00007 #include <dirac_quda.h> 00008 #include <dslash_quda.h> 00009 #include <invert_quda.h> 00010 #include <util_quda.h> 00011 #include <blas_quda.h> 00012 00013 #include <test_util.h> 00014 #include <twisted_mass_dslash_reference.h> 00015 00016 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat) 00017 const int test_type = 1; 00018 00019 const QudaParity parity = QUDA_EVEN_PARITY; // even or odd? 00020 const QudaDagType dagger = QUDA_DAG_NO; // apply Dslash or Dslash dagger? 00021 const int transfer = 0; // include transfer time in the benchmark? 00022 00023 const int loops = 100; 00024 00025 QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; 00026 QudaPrecision cuda_prec = QUDA_SINGLE_PRECISION; 00027 00028 QudaGaugeParam gauge_param; 00029 QudaInvertParam inv_param; 00030 00031 FullGauge gauge; 00032 00033 cpuColorSpinorField *spinor, *spinorOut, *spinorRef; 00034 cudaColorSpinorField *cudaSpinor, *cudaSpinorOut, *tmp1=0, *tmp2=0; 00035 00036 void *hostGauge[4]; 00037 00038 Dirac *dirac; 00039 00040 void init() { 00041 00042 gauge_param = newQudaGaugeParam(); 00043 inv_param = newQudaInvertParam(); 00044 00045 gauge_param.X[0] = 24; 00046 gauge_param.X[1] = 24; 00047 gauge_param.X[2] = 24; 00048 gauge_param.X[3] = 24; 00049 setDims(gauge_param.X); 00050 00051 gauge_param.anisotropy = 2.3; 00052 00053 gauge_param.type = QUDA_WILSON_LINKS; 00054 gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; 00055 gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; 00056 00057 gauge_param.cpu_prec = cpu_prec; 00058 gauge_param.cuda_prec = cuda_prec; 00059 gauge_param.reconstruct = QUDA_RECONSTRUCT_12; 00060 gauge_param.reconstruct_sloppy = gauge_param.reconstruct; 00061 gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec; 00062 gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; 00063 gauge_param.type = QUDA_WILSON_LINKS; 00064 00065 inv_param.kappa = 0.1; 00066 inv_param.mu = 0.01; 00067 inv_param.twist_flavor = QUDA_TWIST_MINUS; 00068 00069 inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; 00070 inv_param.dagger = dagger; 00071 00072 inv_param.cpu_prec = cpu_prec; 00073 inv_param.cuda_prec = cuda_prec; 00074 00075 gauge_param.ga_pad = 0; 00076 inv_param.sp_pad = 0; 00077 inv_param.cl_pad = 0; 00078 00079 //gauge_param.ga_pad = 24*24*24; 00080 //inv_param.sp_pad = 24*24*24; 00081 //inv_param.cl_pad = 24*24*24; 00082 00083 inv_param.dirac_order = QUDA_DIRAC_ORDER; 00084 00085 if (test_type == 2) { 00086 inv_param.solution_type = QUDA_MAT_SOLUTION; 00087 } else { 00088 inv_param.solution_type = QUDA_MATPC_SOLUTION; 00089 } 00090 00091 inv_param.dslash_type = QUDA_TWISTED_MASS_DSLASH; 00092 00093 inv_param.verbosity = QUDA_VERBOSE; 00094 00095 // construct input fields 00096 for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec); 00097 00098 ColorSpinorParam csParam; 00099 00100 csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; 00101 csParam.nColor = 3; 00102 csParam.nSpin = 4; 00103 csParam.twistFlavor = inv_param.twist_flavor; 00104 csParam.nDim = 4; 00105 for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d]; 00106 csParam.precision = inv_param.cpu_prec; 00107 csParam.pad = 0; 00108 if (test_type < 2) { 00109 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00110 csParam.x[0] /= 2; 00111 } else { 00112 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00113 } 00114 csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; 00115 csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; 00116 csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; 00117 csParam.create = QUDA_ZERO_FIELD_CREATE; 00118 00119 spinor = new cpuColorSpinorField(csParam); 00120 spinorOut = new cpuColorSpinorField(csParam); 00121 spinorRef = new cpuColorSpinorField(csParam); 00122 00123 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00124 csParam.x[0] = gauge_param.X[0]; 00125 00126 printfQuda("Randomizing fields... "); 00127 00128 construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param); 00129 spinor->Source(QUDA_RANDOM_SOURCE); 00130 00131 printfQuda("done.\n"); fflush(stdout); 00132 00133 int dev = 0; 00134 initQuda(dev); 00135 00136 printfQuda("Sending gauge field to GPU\n"); 00137 00138 loadGaugeQuda(hostGauge, &gauge_param); 00139 gauge = cudaGaugePrecise; 00140 00141 if (!transfer) { 00142 csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; 00143 csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; 00144 csParam.pad = inv_param.sp_pad; 00145 csParam.precision = inv_param.cuda_prec; 00146 if (csParam.precision == QUDA_DOUBLE_PRECISION ) { 00147 csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; 00148 } else { 00149 /* Single and half */ 00150 csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER; 00151 } 00152 00153 if (test_type < 2) { 00154 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00155 csParam.x[0] /= 2; 00156 } 00157 00158 printfQuda("Creating cudaSpinor\n"); 00159 cudaSpinor = new cudaColorSpinorField(csParam); 00160 printfQuda("Creating cudaSpinorOut\n"); 00161 cudaSpinorOut = new cudaColorSpinorField(csParam); 00162 00163 if (test_type == 2) csParam.x[0] /= 2; 00164 00165 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00166 tmp1 = new cudaColorSpinorField(csParam); 00167 tmp2 = new cudaColorSpinorField(csParam); 00168 00169 printfQuda("Sending spinor field to GPU\n"); 00170 *cudaSpinor = *spinor; 00171 00172 std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << 00173 norm2(*cudaSpinor) << std::endl; 00174 00175 bool pc = (test_type != 2); 00176 DiracParam diracParam; 00177 setDiracParam(diracParam, &inv_param, pc); 00178 diracParam.verbose = QUDA_VERBOSE; 00179 diracParam.tmp1 = tmp1; 00180 diracParam.tmp2 = tmp2; 00181 00182 dirac = Dirac::create(diracParam); 00183 } else { 00184 std::cout << "Source: CPU = " << norm2(*spinor) << std::endl; 00185 } 00186 00187 } 00188 00189 void end() { 00190 if (!transfer) { 00191 delete dirac; 00192 delete cudaSpinor; 00193 delete cudaSpinorOut; 00194 delete tmp1; 00195 delete tmp2; 00196 } 00197 00198 // release memory 00199 delete spinor; 00200 delete spinorOut; 00201 delete spinorRef; 00202 00203 for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]); 00204 endQuda(); 00205 } 00206 00207 // execute kernel 00208 double dslashCUDA() { 00209 00210 printfQuda("Executing %d kernel loops...\n", loops); 00211 fflush(stdout); 00212 stopwatchStart(); 00213 for (int i = 0; i < loops; i++) { 00214 switch (test_type) { 00215 case 0: 00216 if (transfer) { 00217 dslashQuda(spinorOut->v, spinor->v, &inv_param, parity); 00218 } else { 00219 dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); 00220 } 00221 break; 00222 case 1: 00223 case 2: 00224 if (transfer) { 00225 MatQuda(spinorOut->v, spinor->v, &inv_param); 00226 } else { 00227 dirac->M(*cudaSpinorOut, *cudaSpinor); 00228 } 00229 break; 00230 } 00231 } 00232 00233 // check for errors 00234 cudaError_t stat = cudaGetLastError(); 00235 if (stat != cudaSuccess) 00236 printf("with ERROR: %s\n", cudaGetErrorString(stat)); 00237 00238 cudaThreadSynchronize(); 00239 double secs = stopwatchReadSeconds(); 00240 printf("done.\n\n"); 00241 00242 return secs; 00243 } 00244 00245 void dslashRef() { 00246 00247 // compare to dslash reference implementation 00248 printf("Calculating reference implementation..."); 00249 fflush(stdout); 00250 switch (test_type) { 00251 case 0: 00252 dslash(spinorRef->v, hostGauge, spinor->v, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 00253 parity, dagger, inv_param.cpu_prec, gauge_param.cpu_prec); 00254 break; 00255 case 1: 00256 matpc(spinorRef->v, hostGauge, spinor->v, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 00257 inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param.cpu_prec); 00258 break; 00259 case 2: 00260 mat(spinorRef->v, hostGauge, spinor->v, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 00261 dagger, inv_param.cpu_prec, gauge_param.cpu_prec); 00262 break; 00263 default: 00264 printf("Test type not defined\n"); 00265 exit(-1); 00266 } 00267 00268 printf("done.\n"); 00269 00270 } 00271 00272 int main(int argc, char **argv) 00273 { 00274 init(); 00275 00276 float spinorGiB = (float)Vh*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30); 00277 float sharedKB = 0;//(float)dslashCudaSharedBytes(inv_param.cuda_prec) / (1 << 10); 00278 printf("\nSpinor mem: %.3f GiB\n", spinorGiB); 00279 printf("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB); 00280 printf("Shared mem: %.3f KB\n", sharedKB); 00281 00282 int attempts = 1; 00283 dslashRef(); 00284 for (int i=0; i<attempts; i++) { 00285 00286 double secs = dslashCUDA(); 00287 00288 if (!transfer) *spinorOut = *cudaSpinorOut; 00289 00290 // print timing information 00291 printf("%fms per loop\n", 1000*secs); 00292 00293 unsigned long long flops = 0; 00294 if (!transfer) flops = dirac->Flops(); 00295 int floats = test_type ? 2*(7*24+8*gauge_param.packed_size+24)+24 : 7*24+8*gauge_param.packed_size+24; 00296 00297 printf("GFLOPS = %f\n", 1.0e-9*flops/secs); 00298 printf("GiB/s = %f\n\n", Vh*floats*sizeof(float)/((secs/loops)*(1<<30))); 00299 00300 if (!transfer) { 00301 std::cout << "Results: CPU = " << norm2(*spinorRef) << ", CUDA = " << norm2(*cudaSpinorOut) << 00302 ", CPU-CUDA = " << norm2(*spinorOut) << std::endl; 00303 } else { 00304 std::cout << "Result: CPU = " << norm2(*spinorRef) << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl; 00305 } 00306 00307 cpuColorSpinorField::Compare(*spinorRef, *spinorOut); 00308 } 00309 end(); 00310 }
1.7.3