11 #ifndef STAGGERED_TESLA_HACK
17 #ifdef GPU_DOMAIN_WALL_DIRAC
21 #ifndef STAGGERED_TESLA_HACK
28 #ifdef GPU_CLOVER_DIRAC
31 #if (defined GPU_TWISTED_MASS_DIRAC) || (defined GPU_NDEG_TWISTED_MASS_DIRAC)
34 #ifdef GPU_STAGGERED_DIRAC
36 int long_gauge_stride;
40 int threadDimMapLower[4];
41 int threadDimMapUpper[4];
44 #ifdef USE_TEXTURE_OBJECTS
45 cudaTextureObject_t inTex;
46 cudaTextureObject_t inTexNorm;
47 cudaTextureObject_t xTex;
48 cudaTextureObject_t xTexNorm;
49 cudaTextureObject_t outTex;
50 cudaTextureObject_t outTexNorm;
51 cudaTextureObject_t gauge0Tex;
52 cudaTextureObject_t gauge1Tex;
53 cudaTextureObject_t longGauge0Tex;
54 cudaTextureObject_t longGauge1Tex;
55 cudaTextureObject_t longPhase0Tex;
56 cudaTextureObject_t longPhase1Tex;
57 cudaTextureObject_t cloverTex;
58 cudaTextureObject_t cloverNormTex;
59 cudaTextureObject_t cloverInvTex;
60 cudaTextureObject_t cloverInvNormTex;
67 #ifdef GPU_DOMAIN_WALL_DIRAC
76 #ifdef GPU_CLOVER_DIRAC
87 static double twist_a = 0.0;
88 static double twist_b = 0.0;
92 #define MAX(a,b) ((a)>(b) ? (a):(b))
212 int volumeCB = lat.VolumeCB();
213 cudaMemcpyToSymbol(
Vh, &volumeCB,
sizeof(
int));
215 int Vspatial = lat.X()[0]*lat.X()[1]*lat.X()[2]/2;
216 cudaMemcpyToSymbol(
Vs, &Vspatial,
sizeof(
int));
218 int half_Vspatial = Vspatial;
219 cudaMemcpyToSymbol(
Vsh, &half_Vspatial,
sizeof(
int));
222 cudaMemcpyToSymbol(
X1, &L1,
sizeof(
int));
225 cudaMemcpyToSymbol(
X2, &L2,
sizeof(
int));
228 cudaMemcpyToSymbol(
X3, &L3,
sizeof(
int));
231 cudaMemcpyToSymbol(
X4, &L4,
sizeof(
int));
234 ghostFace_h[0] = L2*L3*L4/2;
235 ghostFace_h[1] = L1*L3*L4/2;
236 ghostFace_h[2] = L1*L2*L4/2;
237 ghostFace_h[3] = L1*L2*L3/2;
238 cudaMemcpyToSymbol(
ghostFace, ghostFace_h, 4*
sizeof(
int));
241 cudaMemcpyToSymbol(
X1_3, &L1_3,
sizeof(
int));
244 cudaMemcpyToSymbol(
X2_3, &L2_3,
sizeof(
int));
247 cudaMemcpyToSymbol(
X3_3, &L3_3,
sizeof(
int));
250 cudaMemcpyToSymbol(
X4_3, &L4_3,
sizeof(
int));
253 cudaMemcpyToSymbol(
X2X1, &L2L1,
sizeof(
int));
256 cudaMemcpyToSymbol(
X3X1, &L3L1,
sizeof(
int));
259 cudaMemcpyToSymbol(
X3X2, &L3L2,
sizeof(
int));
261 int L3L2L1 = L3*L2*L1;
262 cudaMemcpyToSymbol(
X3X2X1, &L3L2L1,
sizeof(
int));
264 int L4L2L1 = L4*L2*L1;
265 cudaMemcpyToSymbol(
X4X2X1, &L4L2L1,
sizeof(
int));
267 int L4L2L1h = L4*L2*L1/2;
268 cudaMemcpyToSymbol(
X4X2X1h, &L4L2L1h,
sizeof(
int));
270 int L4L3L1 = L4*L3*L1;
271 cudaMemcpyToSymbol(
X4X3X1, &L4L3L1,
sizeof(
int));
273 int L4L3L1h = L4*L3*L1/2;
274 cudaMemcpyToSymbol(
X4X3X1h, &L4L3L1h,
sizeof(
int));
276 int L4L3L2 = L4*L3*L2;
277 cudaMemcpyToSymbol(
X4X3X2, &L4L3L2,
sizeof(
int));
279 int L4L3L2h = L4*L3*L2/2;
280 cudaMemcpyToSymbol(
X4X3X2h, &L4L3L2h,
sizeof(
int));
282 int L2L1_3 = 3*L2*L1;
283 cudaMemcpyToSymbol(
X2X1_3, &L2L1_3,
sizeof(
int));
285 int L3L2L1_3 = 3*L3*L2*L1;
286 cudaMemcpyToSymbol(
X3X2X1_3, &L3L2L1_3,
sizeof(
int));
289 cudaMemcpyToSymbol(
X1h, &L1h,
sizeof(
int));
292 cudaMemcpyToSymbol(
X2h, &L2h,
sizeof(
int));
295 cudaMemcpyToSymbol(
X1m1, &L1m1,
sizeof(
int));
298 cudaMemcpyToSymbol(
X2m1, &L2m1,
sizeof(
int));
301 cudaMemcpyToSymbol(
X3m1, &L3m1,
sizeof(
int));
304 cudaMemcpyToSymbol(
X4m1, &L4m1,
sizeof(
int));
307 cudaMemcpyToSymbol(
X1m3, &L1m3,
sizeof(
int));
310 cudaMemcpyToSymbol(
X2m3, &L2m3,
sizeof(
int));
313 cudaMemcpyToSymbol(
X3m3, &L3m3,
sizeof(
int));
316 cudaMemcpyToSymbol(
X4m3, &L4m3,
sizeof(
int));
318 int L2L1mL1 = L2L1 - L1;
319 cudaMemcpyToSymbol(
X2X1mX1, &L2L1mL1,
sizeof(
int));
321 int L3L2L1mL2L1 = L3L2L1 - L2L1;
322 cudaMemcpyToSymbol(
X3X2X1mX2X1, &L3L2L1mL2L1,
sizeof(
int));
324 int L4L3L2L1mL3L2L1 = (L4-1)*L3L2L1;
327 int L4L3L2L1hmL3L2L1h = (L4-1)*L3*L2*L1h;
330 int L2L1m3L1 = L2L1 - 3*L1;
331 cudaMemcpyToSymbol(
X2X1m3X1, &L2L1m3L1,
sizeof(
int));
333 int L3L2L1m3L2L1 = L3L2L1 - 3*L2L1;
334 cudaMemcpyToSymbol(
X3X2X1m3X2X1, &L3L2L1m3L2L1,
sizeof(
int));
336 int L4L3L2L1m3L3L2L1 = (L4-3)*L3L2L1;
339 int L4L3L2L1hm3L3L2L1h = (L4-3)*L3*L2*L1h;
341 int Vh_2d_max_h =
MAX(L1*L2/2, L1*L3/2);
342 Vh_2d_max_h =
MAX(Vh_2d_max_h, L1*L4/2);
343 Vh_2d_max_h =
MAX(Vh_2d_max_h, L2*L3/2);
344 Vh_2d_max_h =
MAX(Vh_2d_max_h, L2*L4/2);
345 Vh_2d_max_h =
MAX(Vh_2d_max_h, L3*L4/2);
346 cudaMemcpyToSymbol(
Vh_2d_max, &Vh_2d_max_h,
sizeof(
int));
352 bool first_node_in_t =
true;
353 bool last_node_in_t =
true;
356 cudaMemcpyToSymbol(
Pt0, &(first_node_in_t),
sizeof(
bool));
357 cudaMemcpyToSymbol(
PtNm1, &(last_node_in_t),
sizeof(
bool));
365 int E2E1_h = E2_h*E1_h;
366 int E3E2E1_h = E3_h*E2_h*E1_h;
367 int Vh_ex_h = E1_h*E2_h*E3_h*E4_h/2;
369 cudaMemcpyToSymbol(
E1, &E1_h,
sizeof(
int));
370 cudaMemcpyToSymbol(
E1h, &E1h_h,
sizeof(
int));
371 cudaMemcpyToSymbol(
E2, &E2_h,
sizeof(
int));
372 cudaMemcpyToSymbol(
E3, &E3_h,
sizeof(
int));
373 cudaMemcpyToSymbol(
E4, &E4_h,
sizeof(
int));
374 cudaMemcpyToSymbol(
E2E1, &E2E1_h,
sizeof(
int));
375 cudaMemcpyToSymbol(
E3E2E1, &E3E2E1_h,
sizeof(
int));
376 cudaMemcpyToSymbol(
Vh_ex, &Vh_ex_h,
sizeof(
int));
388 int ga_stride_h = gauge.Stride();
389 cudaMemcpyToSymbol(
ga_stride, &ga_stride_h,
sizeof(
int));
392 cudaMemcpyToSymbol(
fat_ga_stride, &ga_stride_h,
sizeof(
int));
393 float link_max_h = gauge.LinkMax();
394 cudaMemcpyToSymbol(
fat_ga_max, &link_max_h,
sizeof(
float));
397 cudaMemcpyToSymbol(
gauge_fixed, &(gf),
sizeof(
int));
399 double anisotropy_ = gauge.Anisotropy();
400 cudaMemcpyToSymbol(
anisotropy, &(anisotropy_),
sizeof(
double));
403 cudaMemcpyToSymbol(
t_boundary, &(t_bc),
sizeof(
double));
405 float anisotropy_fh = gauge.Anisotropy();
406 cudaMemcpyToSymbol(
anisotropy_f, &(anisotropy_fh),
sizeof(
float));
409 cudaMemcpyToSymbol(
t_boundary_f, &(t_bc_f),
sizeof(
float));
413 float2 An2_h = make_float2(gauge.Anisotropy(), 1.0 / (gauge.Anisotropy()*gauge.Anisotropy()));
414 cudaMemcpyToSymbol(
An2, &(An2_h),
sizeof(float2));
415 float2 TB2_h = make_float2(t_bc_f, 1.0 / (t_bc_f * t_bc_f));
416 cudaMemcpyToSymbol(
TB2, &(TB2_h),
sizeof(float2));
417 float2 No2_h = make_float2(1.0, 1.0);
418 cudaMemcpyToSymbol(
No2, &(No2_h),
sizeof(float2));
430 cudaMemcpyToSymbol(
pi_f, &pi_f_h,
sizeof(
float));
435 cudaMemcpyToSymbol(
tProjScale, &tProjScale_h,
sizeof(
double));
437 float tProjScale_fh = (float)tProjScale_h;
438 cudaMemcpyToSymbol(
tProjScale_f, &tProjScale_fh,
sizeof(
float));
446 TimeProfile &profile)
450 int fat_ga_stride_h = fatgauge.Stride();
451 int long_ga_stride_h = longgauge.Stride();
452 float fat_link_max_h = fatgauge.LinkMax();
454 float coeff_fh = 1.0/longgauge.Scale();
455 cudaMemcpyToSymbol(
coeff_f, &(coeff_fh),
sizeof(
float));
457 double coeff_h = 1.0/longgauge.Scale();
458 cudaMemcpyToSymbol(
coeff, &(coeff_h),
sizeof(
double));
460 cudaMemcpyToSymbol(
fat_ga_stride, &fat_ga_stride_h,
sizeof(
int));
461 cudaMemcpyToSymbol(
long_ga_stride, &long_ga_stride_h,
sizeof(
int));
462 cudaMemcpyToSymbol(
fat_ga_max, &fat_link_max_h,
sizeof(
float));
476 void initMDWFConstants(
const double *b_5,
const double *c_5,
int dim_s,
const double m5h, TimeProfile &profile)
480 static int last_Ls = -1;
481 if (dim_s != last_Ls) {
484 for (
int i=0; i<dim_s; i++) {
485 b_5_f[i] = (float)b_5[i];
486 c_5_f[i] = (float)c_5[i];
489 cudaMemcpyToSymbol(
mdwf_b5_d, b_5, dim_s*
sizeof(
double));
490 cudaMemcpyToSymbol(
mdwf_c5_d, c_5, dim_s*
sizeof(
double));
491 cudaMemcpyToSymbol(
mdwf_b5_f, b_5_f, dim_s*
sizeof(
float));
492 cudaMemcpyToSymbol(
mdwf_c5_f, c_5_f, dim_s*
sizeof(
float));
497 static double last_m5 = 99999;
498 if (m5h != last_m5) {
499 float m5h_f = (float)m5h;
500 cudaMemcpyToSymbol(
m5_d, &m5h,
sizeof(
double));
501 cudaMemcpyToSymbol(
m5_f, &m5h_f,
sizeof(
float));
512 a = 2.0 * kappa *
mu;
515 a = -2.0 * kappa *
mu;
516 b = 1.0 / (1.0 + a*a);
518 errorQuda(
"Twist type %d not defined\n", twist);
520 if (dagger) a *= -1.0;
__constant__ int X4X3X2X1hm3X3X2X1h
__constant__ float coeff_f
void initStaggeredConstants(const cudaGaugeField &fatgauge, const cudaGaugeField &longgauge, TimeProfile &profile)
__constant__ float t_boundary_f
__constant__ int X3X2X1m3X2X1
__constant__ fat_force_const_t hf
struct fat_force_stride_s fat_force_const_t
void initMDWFConstants(const double *b_5, const double *c_5, int dim_s, const double m5h, TimeProfile &profile)
__constant__ fat_force_const_t gf
__constant__ int X3X2X1mX2X1
__constant__ float fat_ga_max
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
__constant__ int X4X3X2X1m3X3X2X1
int ghostOffset[QUDA_MAX_DIM]
__constant__ float mdwf_b5_f[QUDA_MAX_DWF_LS]
int ghostNormOffset[QUDA_MAX_DIM]
__constant__ int fat_ga_stride
__constant__ int ghostFace[QUDA_MAX_DIM+1]
__constant__ int long_ga_stride
__constant__ int X3X2X1_3
VOLATILE spinorFloat kappa
__constant__ int X2X1m3X1
__constant__ double anisotropy
__constant__ double coeff
__constant__ float anisotropy_f
void setTwistParam(double &a, double &b, const double &kappa, const double &mu, const int dagger, const QudaTwistGamma5Type twist)
void initGaugeConstants(const cudaGaugeField &gauge, TimeProfile &profile)
__constant__ int gauge_fixed
__constant__ int X4X3X2X1mX3X2X1
__constant__ double mdwf_b5_d[QUDA_MAX_DWF_LS]
__constant__ int ga_stride
__constant__ fat_force_const_t fl
__constant__ int Vh_2d_max
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
__constant__ float tProjScale_f
__constant__ double tProjScale
__constant__ double t_boundary
enum QudaTwistGamma5Type_s QudaTwistGamma5Type
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
__constant__ float mdwf_c5_f[QUDA_MAX_DWF_LS]
void initLatticeConstants(const LatticeField &lat, TimeProfile &profile)
__constant__ int X4X3X2X1hmX3X2X1h
__constant__ double mdwf_c5_d[QUDA_MAX_DWF_LS]
void initDslashConstants(TimeProfile &profile)