QUDA v0.4.0
A library for QCD on GPUs
|
00001 #define MAX(a,b) ((a) > (b)?(a): (b)) 00002 __constant__ int X1h; 00003 __constant__ int X2h; 00004 __constant__ int X1; 00005 __constant__ int X2; 00006 __constant__ int X3; 00007 __constant__ int X4; 00008 00009 __constant__ int X1_3; 00010 __constant__ int X2_3; 00011 __constant__ int X3_3; 00012 __constant__ int X4_3; 00013 00014 __constant__ int X1m1; 00015 __constant__ int X2m1; 00016 __constant__ int X3m1; 00017 __constant__ int X4m1; 00018 00019 __constant__ int X1m3; 00020 __constant__ int X2m3; 00021 __constant__ int X3m3; 00022 __constant__ int X4m3; 00023 00024 __constant__ int X2X1mX1; 00025 __constant__ int X3X2X1mX2X1; 00026 __constant__ int X4X3X2X1mX3X2X1; 00027 __constant__ int X4X3X2X1hmX3X2X1h; 00028 00029 __constant__ int X2X1m3X1; 00030 __constant__ int X3X2X1m3X2X1; 00031 __constant__ int X4X3X2X1m3X3X2X1; 00032 __constant__ int X4X3X2X1hm3X3X2X1h; 00033 00034 __constant__ int X2X1; 00035 __constant__ int X3X1; 00036 __constant__ int X3X2; 00037 __constant__ int X3X2X1; 00038 __constant__ int X4X2X1; 00039 __constant__ int X4X2X1h; 00040 __constant__ int X4X3X1; 00041 __constant__ int X4X3X1h; 00042 __constant__ int X4X3X2; 00043 __constant__ int X4X3X2h; 00044 00045 __constant__ int Vh_2d_max; 00046 00047 __constant__ int X2X1_3; 00048 __constant__ int X3X2X1_3; 00049 00050 __constant__ int Vh; 00051 __constant__ int Vs; 00052 __constant__ int Vsh; 00053 __constant__ int sp_stride; 00054 __constant__ int ga_stride; 00055 __constant__ int cl_stride; 00056 __constant__ int ghostFace[QUDA_MAX_DIM]; 00057 00058 __constant__ int fat_ga_stride; 00059 __constant__ int long_ga_stride; 00060 __constant__ float fat_ga_max; 00061 00062 __constant__ int gauge_fixed; 00063 00064 // domain wall constants 00065 __constant__ int Ls; 00066 00067 // single precision constants 00068 __constant__ float anisotropy_f; 00069 __constant__ float coeff_f; 00070 __constant__ float t_boundary_f; 00071 __constant__ float pi_f; 00072 00073 // double precision constants 00074 __constant__ double anisotropy; 00075 __constant__ double t_boundary; 00076 __constant__ double coeff; 00077 00078 __constant__ float2 An2; 00079 __constant__ float2 TB2; 00080 __constant__ float2 No2; 00081 00082 // Are we processor 0 in time? 00083 __constant__ bool Pt0; 00084 00085 // Are we processor Nt-1 in time? 00086 __constant__ bool PtNm1; 00087 00088 // factor of 2 (or 1) for T-dimensional spin projection 00089 __constant__ double tProjScale; 00090 __constant__ float tProjScale_f; 00091 00092 //for link fattening/gauge force/fermion force code 00093 __constant__ int site_ga_stride; 00094 __constant__ int staple_stride; 00095 __constant__ int llfat_ga_stride; 00096 __constant__ int mom_ga_stride; 00097 __constant__ int E1, E2, E3, E4, E1h; 00098 __constant__ int Vh_ex; 00099 __constant__ int E2E1; 00100 __constant__ int E3E2E1; 00101 00102 int initDslash = 0; 00103 int initClover = 0; 00104 int initDomainWall = 0; 00105 int initStaggered = 0; 00106 00107 bool qudaPt0 = true; // Single core versions always to Boundary 00108 bool qudaPtNm1 = true; 00109 00110 void initCommonConstants(const LatticeField &lat) { 00111 int Vh = lat.VolumeCB(); 00112 cudaMemcpyToSymbol("Vh", &Vh, sizeof(int)); 00113 00114 Vspatial = lat.X()[0]*lat.X()[1]*lat.X()[2]/2; // FIXME - this should not be called Vs, rather Vsh 00115 cudaMemcpyToSymbol("Vs", &Vspatial, sizeof(int)); 00116 00117 int half_Vspatial = Vspatial; 00118 cudaMemcpyToSymbol("Vsh", &half_Vspatial, sizeof(int)); 00119 00120 int X1 = lat.X()[0]; 00121 cudaMemcpyToSymbol("X1", &X1, sizeof(int)); 00122 00123 int X2 = lat.X()[1]; 00124 cudaMemcpyToSymbol("X2", &X2, sizeof(int)); 00125 00126 int X3 = lat.X()[2]; 00127 cudaMemcpyToSymbol("X3", &X3, sizeof(int)); 00128 00129 int X4 = lat.X()[3]; 00130 cudaMemcpyToSymbol("X4", &X4, sizeof(int)); 00131 00132 int ghostFace[4]; 00133 ghostFace[0] = X2*X3*X4/2; 00134 ghostFace[1] = X1*X3*X4/2; 00135 ghostFace[2] = X1*X2*X4/2; 00136 ghostFace[3] = X1*X2*X3/2; 00137 cudaMemcpyToSymbol("ghostFace", ghostFace, 4*sizeof(int)); 00138 00139 int X1_3 = 3*X1; 00140 cudaMemcpyToSymbol("X1_3", &X1_3, sizeof(int)); 00141 00142 int X2_3 = 3*X2; 00143 cudaMemcpyToSymbol("X2_3", &X2_3, sizeof(int)); 00144 00145 int X3_3 = 3*X3; 00146 cudaMemcpyToSymbol("X3_3", &X3_3, sizeof(int)); 00147 00148 int X4_3 = 3*X4; 00149 cudaMemcpyToSymbol("X4_3", &X4_3, sizeof(int)); 00150 00151 00152 int X2X1 = X2*X1; 00153 cudaMemcpyToSymbol("X2X1", &X2X1, sizeof(int)); 00154 00155 int X3X1 = X3*X1; 00156 cudaMemcpyToSymbol("X3X1", &X3X1, sizeof(int)); 00157 00158 int X3X2 = X3*X2; 00159 cudaMemcpyToSymbol("X3X2", &X3X2, sizeof(int)); 00160 00161 00162 int X3X2X1 = X3*X2*X1; 00163 cudaMemcpyToSymbol("X3X2X1", &X3X2X1, sizeof(int)); 00164 00165 int X4X2X1 = X4*X2*X1; 00166 cudaMemcpyToSymbol("X4X2X1", &X4X2X1, sizeof(int)); 00167 00168 int X4X2X1h = X4*X2*X1/2; 00169 cudaMemcpyToSymbol("X4X2X1h", &X4X2X1h, sizeof(int)); 00170 00171 int X4X3X1 = X4*X3*X1; 00172 cudaMemcpyToSymbol("X4X3X1", &X4X3X1, sizeof(int)); 00173 00174 int X4X3X1h = X4*X3*X1/2; 00175 cudaMemcpyToSymbol("X4X3X1h", &X4X3X1h, sizeof(int)); 00176 00177 int X4X3X2 = X4*X3*X2; 00178 cudaMemcpyToSymbol("X4X3X2", &X4X3X2, sizeof(int)); 00179 00180 int X4X3X2h = X4*X3*X2/2; 00181 cudaMemcpyToSymbol("X4X3X2h", &X4X3X2h, sizeof(int)); 00182 00183 int X2X1_3 = 3*X2*X1; 00184 cudaMemcpyToSymbol("X2X1_3", &X2X1_3, sizeof(int)); 00185 00186 int X3X2X1_3 = 3*X3*X2*X1; 00187 cudaMemcpyToSymbol("X3X2X1_3", &X3X2X1_3, sizeof(int)); 00188 00189 00190 int X1h = X1/2; 00191 cudaMemcpyToSymbol("X1h", &X1h, sizeof(int)); 00192 00193 int X2h = X2/2; 00194 cudaMemcpyToSymbol("X2h", &X2h, sizeof(int)); 00195 00196 int X1m1 = X1 - 1; 00197 cudaMemcpyToSymbol("X1m1", &X1m1, sizeof(int)); 00198 00199 int X2m1 = X2 - 1; 00200 cudaMemcpyToSymbol("X2m1", &X2m1, sizeof(int)); 00201 00202 int X3m1 = X3 - 1; 00203 cudaMemcpyToSymbol("X3m1", &X3m1, sizeof(int)); 00204 00205 int X4m1 = X4 - 1; 00206 cudaMemcpyToSymbol("X4m1", &X4m1, sizeof(int)); 00207 00208 int X1m3 = X1 - 3; 00209 cudaMemcpyToSymbol("X1m3", &X1m3, sizeof(int)); 00210 00211 int X2m3 = X2 - 3; 00212 cudaMemcpyToSymbol("X2m3", &X2m3, sizeof(int)); 00213 00214 int X3m3 = X3 - 3; 00215 cudaMemcpyToSymbol("X3m3", &X3m3, sizeof(int)); 00216 00217 int X4m3 = X4 - 3; 00218 cudaMemcpyToSymbol("X4m3", &X4m3, sizeof(int)); 00219 00220 00221 int X2X1mX1 = X2X1 - X1; 00222 cudaMemcpyToSymbol("X2X1mX1", &X2X1mX1, sizeof(int)); 00223 00224 int X3X2X1mX2X1 = X3X2X1 - X2X1; 00225 cudaMemcpyToSymbol("X3X2X1mX2X1", &X3X2X1mX2X1, sizeof(int)); 00226 00227 int X4X3X2X1mX3X2X1 = (X4-1)*X3X2X1; 00228 cudaMemcpyToSymbol("X4X3X2X1mX3X2X1", &X4X3X2X1mX3X2X1, sizeof(int)); 00229 00230 int X4X3X2X1hmX3X2X1h = (X4-1)*X3*X2*X1h; 00231 cudaMemcpyToSymbol("X4X3X2X1hmX3X2X1h", &X4X3X2X1hmX3X2X1h, sizeof(int)); 00232 00233 int X2X1m3X1 = X2X1 - 3*X1; 00234 cudaMemcpyToSymbol("X2X1m3X1", &X2X1m3X1, sizeof(int)); 00235 00236 int X3X2X1m3X2X1 = X3X2X1 - 3*X2X1; 00237 cudaMemcpyToSymbol("X3X2X1m3X2X1", &X3X2X1m3X2X1, sizeof(int)); 00238 00239 int X4X3X2X1m3X3X2X1 = (X4-3)*X3X2X1; 00240 cudaMemcpyToSymbol("X4X3X2X1m3X3X2X1", &X4X3X2X1m3X3X2X1, sizeof(int)); 00241 00242 int X4X3X2X1hm3X3X2X1h = (X4-3)*X3*X2*X1h; 00243 cudaMemcpyToSymbol("X4X3X2X1hm3X3X2X1h", &X4X3X2X1hm3X3X2X1h, sizeof(int)); 00244 00245 int Vh_2d_max = MAX(X1*X2/2, X1*X3/2); 00246 Vh_2d_max = MAX(Vh_2d_max, X1*X4/2); 00247 Vh_2d_max = MAX(Vh_2d_max, X2*X3/2); 00248 Vh_2d_max = MAX(Vh_2d_max, X2*X4/2); 00249 Vh_2d_max = MAX(Vh_2d_max, X3*X4/2); 00250 cudaMemcpyToSymbol("Vh_2d_max", &Vh_2d_max, sizeof(int)); 00251 00252 cudaMemcpyToSymbol("Pt0", &(qudaPt0), sizeof(bool)); 00253 cudaMemcpyToSymbol("PtNm1", &(qudaPtNm1), sizeof(bool)); 00254 00255 checkCudaError(); 00256 00257 // copy a few of the constants needed by tuneLaunch() 00258 dslashConstants.x[0] = X1; 00259 dslashConstants.x[1] = X2; 00260 dslashConstants.x[2] = X3; 00261 dslashConstants.x[3] = X4; 00262 } 00263 00264 void initGaugeFieldConstants(const cudaGaugeField &gauge) 00265 { 00266 initCommonConstants(gauge); 00267 00268 int ga_stride = gauge.Stride(); 00269 cudaMemcpyToSymbol("ga_stride", &ga_stride, sizeof(int)); 00270 00271 int gf = (gauge.GaugeFixed() == QUDA_GAUGE_FIXED_YES) ? 1 : 0; 00272 cudaMemcpyToSymbol("gauge_fixed", &(gf), sizeof(int)); 00273 00274 double anisotropy_ = gauge.Anisotropy(); 00275 cudaMemcpyToSymbol("anisotropy", &(anisotropy_), sizeof(double)); 00276 00277 double t_bc = (gauge.TBoundary() == QUDA_PERIODIC_T) ? 1.0 : -1.0; 00278 cudaMemcpyToSymbol("t_boundary", &(t_bc), sizeof(double)); 00279 00280 double coeff = -24.0*gauge.Tadpole()*gauge.Tadpole(); 00281 cudaMemcpyToSymbol("coeff", &(coeff), sizeof(double)); 00282 00283 return; 00284 } 00285 00286 00287 00288 void initDslashConstants(const cudaGaugeField &gauge, const int sp_stride) 00289 { 00290 initCommonConstants(gauge); 00291 00292 cudaMemcpyToSymbol("sp_stride", &sp_stride, sizeof(int)); 00293 00294 int ga_stride = gauge.Stride(); 00295 cudaMemcpyToSymbol("ga_stride", &ga_stride, sizeof(int)); 00296 00297 int gf = (gauge.GaugeFixed() == QUDA_GAUGE_FIXED_YES) ? 1 : 0; 00298 cudaMemcpyToSymbol("gauge_fixed", &(gf), sizeof(int)); 00299 00300 double anisotropy_ = gauge.Anisotropy(); 00301 cudaMemcpyToSymbol("anisotropy", &(anisotropy_), sizeof(double)); 00302 00303 double t_bc = (gauge.TBoundary() == QUDA_PERIODIC_T) ? 1.0 : -1.0; 00304 cudaMemcpyToSymbol("t_boundary", &(t_bc), sizeof(double)); 00305 00306 double coeff = -24.0*gauge.Tadpole()*gauge.Tadpole(); 00307 cudaMemcpyToSymbol("coeff", &(coeff), sizeof(double)); 00308 00309 00310 00311 float anisotropy_f = gauge.Anisotropy(); 00312 cudaMemcpyToSymbol("anisotropy_f", &(anisotropy_f), sizeof(float)); 00313 00314 float t_bc_f = (gauge.TBoundary() == QUDA_PERIODIC_T) ? 1.0 : -1.0; 00315 cudaMemcpyToSymbol("t_boundary_f", &(t_bc_f), sizeof(float)); 00316 00317 float coeff_f = -24.0*gauge.Tadpole()*gauge.Tadpole(); 00318 cudaMemcpyToSymbol("coeff_f", &(coeff_f), sizeof(float)); 00319 00320 00321 float2 An2 = make_float2(gauge.Anisotropy(), 1.0 / (gauge.Anisotropy()*gauge.Anisotropy())); 00322 cudaMemcpyToSymbol("An2", &(An2), sizeof(float2)); 00323 float2 TB2 = make_float2(t_bc_f, 1.0 / (t_bc_f * t_bc_f)); 00324 cudaMemcpyToSymbol("TB2", &(TB2), sizeof(float2)); 00325 float2 No2 = make_float2(1.0, 1.0); 00326 cudaMemcpyToSymbol("No2", &(No2), sizeof(float2)); 00327 00328 float h_pi_f = M_PI; 00329 cudaMemcpyToSymbol("pi_f", &(h_pi_f), sizeof(float)); 00330 00331 double TProjScale = (kernelPackT ? 1.0 : 2.0); 00332 // temporary additions (?) for checking Ron's T-packing kernel with old multi-gpu kernel 00333 cudaMemcpyToSymbol("tProjScale", &(TProjScale), sizeof(double)); 00334 00335 float TProjScale_f = (float)TProjScale; 00336 cudaMemcpyToSymbol("tProjScale_f", &(TProjScale_f), sizeof(float)); 00337 00338 checkCudaError(); 00339 00340 initDslash = 1; 00341 00342 // create events 00343 #ifndef DSLASH_PROFILING 00344 // add cudaEventDisableTiming for lower sync overhead 00345 for (int i=0; i<Nstream; i++) { 00346 cudaEventCreate(&packEnd[i], cudaEventDisableTiming); 00347 cudaEventCreate(&gatherStart[i], cudaEventDisableTiming); 00348 cudaEventCreate(&gatherEnd[i], cudaEventDisableTiming); 00349 cudaEventCreateWithFlags(&scatterStart[i], cudaEventDisableTiming); 00350 cudaEventCreateWithFlags(&scatterEnd[i], cudaEventDisableTiming); 00351 } 00352 #else 00353 cudaEventCreate(&dslashStart); 00354 cudaEventCreate(&dslashEnd); 00355 for (int i=0; i<Nstream; i++) { 00356 cudaEventCreate(&packStart[i]); 00357 cudaEventCreate(&packEnd[i]); 00358 00359 cudaEventCreate(&gatherStart[i]); 00360 cudaEventCreate(&gatherEnd[i]); 00361 00362 cudaEventCreate(&scatterStart[i]); 00363 cudaEventCreate(&scatterEnd[i]); 00364 00365 cudaEventCreate(&kernelStart[i]); 00366 cudaEventCreate(&kernelEnd[i]); 00367 00368 kernelTime[i][0] = 0.0; 00369 kernelTime[i][1] = 0.0; 00370 00371 gatherTime[i][0] = 0.0; 00372 gatherTime[i][1] = 0.0; 00373 00374 commsTime[i][0] = 0.0; 00375 commsTime[i][1] = 0.0; 00376 00377 scatterTime[i][0] = 0.0; 00378 scatterTime[i][1] = 0.0; 00379 } 00380 #endif 00381 } 00382 00383 void initCloverConstants (const int cl_stride) { 00384 cudaMemcpyToSymbol("cl_stride", &cl_stride, sizeof(int)); 00385 00386 initClover = 1; 00387 } 00388 00389 void initDomainWallConstants(const int Ls) { 00390 cudaMemcpyToSymbol("Ls", &Ls, sizeof(int)); 00391 dslashConstants.Ls = Ls; // needed by tuneLaunch() 00392 00393 initDomainWall = 1; 00394 } 00395 00396 void 00397 initStaggeredConstants(const cudaGaugeField &fatgauge, const cudaGaugeField &longgauge) 00398 { 00399 00400 int fat_ga_stride = fatgauge.Stride(); 00401 int long_ga_stride = longgauge.Stride(); 00402 float fat_link_max = fatgauge.LinkMax(); 00403 00404 cudaMemcpyToSymbol("fat_ga_stride", &fat_ga_stride, sizeof(int)); 00405 cudaMemcpyToSymbol("long_ga_stride", &long_ga_stride, sizeof(int)); 00406 00407 cudaMemcpyToSymbol("fat_ga_max", &fat_link_max, sizeof(float)); 00408 initStaggered = 1; 00409 return; 00410 } 00411