#define DSLASH_SHARED_FLOATS_PER_THREAD 0

#if (CUDA_VERSION >= 4010)
#define VOLATILE
#else
#define VOLATILE volatile
#endif
#ifdef SPINOR_DOUBLE
#define spinorFloat double
#define mdwf_b5 mdwf_b5_d
#define mdwf_c5 mdwf_c5_d
#else
#define spinorFloat float
#define mdwf_b5 mdwf_b5_f
#define mdwf_c5 mdwf_c5_f
#endif // SPINOR_DOUBLE
#endif // GAUGE_DOUBLE
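
// The gT macros give the Hermitian conjugate (dagger) of the gauge link g,
// used for hops in the negative direction: gT_ij = conj(g_ji).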
#define gT00_re (+g00_re)
#define gT00_im (-g00_im)
#define gT01_re (+g10_re)
#define gT01_im (-g10_im)
#define gT02_re (+g20_re)
#define gT02_im (-g20_im)
#define gT10_re (+g01_re)
#define gT10_im (-g01_im)
#define gT11_re (+g11_re)
#define gT11_im (-g11_im)
#define gT12_re (+g21_re)
#define gT12_im (-g21_im)
#define gT20_re (+g02_re)
#define gT20_im (-g02_im)
#define gT21_re (+g12_re)
#define gT21_im (-g12_im)
#define gT22_re (+g22_re)
#define gT22_im (-g22_im)
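
// For illustration only (not part of the generated kernel): with these macros,
// the real part of one row of the daggered multiply U^dag * a expands as
//   out0_re = gT00_re*a0_re - gT00_im*a0_im
//           + gT01_re*a1_re - gT01_im*a1_im
//           + gT02_re*a2_re - gT02_im*a2_im;
// i.e. each complex product contributes the usual re*re - im*im cross terms.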
#ifdef SPINOR_DOUBLE
#if (__COMPUTE_CAPABILITY__ >= 200)
#define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
#else
#define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
#endif
#else
#if (__COMPUTE_CAPABILITY__ >= 200)
#define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
#else
#define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
#endif
#endif // SPINOR_DOUBLE
#if (defined MULTI_GPU) && (DD_PREC==2) // half precision
#endif // MULTI_GPU half precision
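
// Linear thread index for this site, computed from the 2D grid / 2D block launch: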
int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
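
// Recover the full lattice index X from the checkerboard index sid; the parity
// term selects the correct site within each even/odd pair.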
X = 2*sid + (boundaryCrossing + param.parity) % 2;
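
// Work out which halo face this thread services and its index within that face: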
const int face_volume = (param.threads*param.Ls >> 1);
const int face_num = (sid >= face_volume);
face_idx = sid - face_num*face_volume;
#if (DD_PREC==2) // half precision
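// Reconstruct the 5D coordinates (x1, x2, x3, x4, xs) belonging to this face index: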
coordsFromDW4DFaceIndex<1>(sid, x1, x2, x3, x4, xs, face_idx, face_volume, dim, face_num, param.parity, dims);
X = 2*sid + (boundaryCrossing + param.parity) % 2;
#if (DD_PREC==0) // temporary hack
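
// Forward hop in X: neighbor index with wraparound at x1 == X1-1; the final >> 1
// converts the full lattice index to a checkerboard (half-volume) index.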
const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
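// Backward hop in X, wrapping at x1 == 0: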
const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
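// Gauge links live on the 4D volume only, so the 5D spinor index is reduced mod Vh: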
const int ga_idx = sp_idx % Vh;
const int ga_idx = sid % Vh;
const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
const int ga_idx = sp_idx % Vh;
const int ga_idx = sid % Vh;
const int ga_idx = sp_idx % Vh;
const int ga_idx = sid % Vh;
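
// T-face ghost spinor: the halo arrives already spin-projected from the sending
// GPU, so only a rescale by t_proj_scale is needed here.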
a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
const int ga_idx = sp_idx % Vh;
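
// Same rescaling for the ghost spinor on the opposite T face: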
a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
#if defined MULTI_GPU && defined DSLASH_XPAY
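// With multi-GPU xpay, sites on a partitioned boundary are "incomplete": their
// exterior-neighbor contributions arrive in a later kernel, so flag them here.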
incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
READ_ACCUM(ACCUMTEX, param.sp_stride)

#ifdef SPINOR_DOUBLE
#endif // SPINOR_DOUBLE
#endif // DSLASH_XPAY
#undef SHARED_STRIDE
// Output spinor accumulators, held in registers:
VOLATILE spinorFloat o00_re;
VOLATILE spinorFloat o00_im;
VOLATILE spinorFloat o01_re;
VOLATILE spinorFloat o01_im;
VOLATILE spinorFloat o02_re;
VOLATILE spinorFloat o02_im;
VOLATILE spinorFloat o10_re;
VOLATILE spinorFloat o10_im;
VOLATILE spinorFloat o11_re;
VOLATILE spinorFloat o11_im;
VOLATILE spinorFloat o12_re;
VOLATILE spinorFloat o12_im;
VOLATILE spinorFloat o20_re;
VOLATILE spinorFloat o20_im;
VOLATILE spinorFloat o21_re;
VOLATILE spinorFloat o21_im;
VOLATILE spinorFloat o22_re;
VOLATILE spinorFloat o22_im;
VOLATILE spinorFloat o30_re;
VOLATILE spinorFloat o30_im;
VOLATILE spinorFloat o31_re;
VOLATILE spinorFloat o31_im;
VOLATILE spinorFloat o32_re;
VOLATILE spinorFloat o32_im;

// Constant-memory geometry and parameter declarations:
__constant__ int X3X2X1mX2X1;
__constant__ int X4X3X2X1mX3X2X1;
__constant__ int X4X3X2X1hmX3X2X1h;
__constant__ int ghostFace[QUDA_MAX_DIM+1];
__constant__ int ga_stride;
__constant__ int gauge_fixed;
__constant__ double coeff;

#define READ_INTERMEDIATE_SPINOR

// Gauge-link and neighbor-spinor loads for the current hop:
ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)
RECONSTRUCT_GAUGE_MATRIX(0)
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)