3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
7 #if (CUDA_VERSION >= 4010)
10 #define VOLATILE volatile
14 #define spinorFloat double
40 #define spinorFloat float
65 #endif // SPINOR_DOUBLE
108 #endif // GAUGE_DOUBLE
111 #define gT00_re (+g00_re)
112 #define gT00_im (-g00_im)
113 #define gT01_re (+g10_re)
114 #define gT01_im (-g10_im)
115 #define gT02_re (+g20_re)
116 #define gT02_im (-g20_im)
117 #define gT10_re (+g01_re)
118 #define gT10_im (-g01_im)
119 #define gT11_re (+g11_re)
120 #define gT11_im (-g11_im)
121 #define gT12_re (+g21_re)
122 #define gT12_im (-g21_im)
123 #define gT20_re (+g02_re)
124 #define gT20_im (-g02_im)
125 #define gT21_re (+g12_re)
126 #define gT21_im (-g12_im)
127 #define gT22_re (+g22_re)
128 #define gT22_im (-g22_im)
157 #if (__COMPUTE_CAPABILITY__ >= 200)
158 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
160 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
163 #if (__COMPUTE_CAPABILITY__ >= 200)
164 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
166 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
173 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
175 #endif // MULTI_GPU half precision
177 int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
195 X = 2*
sid + (boundaryCrossing +
param.parity) % 2;
218 const int dim =
static_cast<int>(kernel_type);
219 const int face_volume = (
param.threads*
Ls >> 1);
220 const int face_num = (
sid >= face_volume);
221 face_idx =
sid - face_num*face_volume;
227 #if (DD_PREC==2) // half precision
228 sp_norm_idx =
sid +
param.ghostNormOffset[
static_cast<int>(kernel_type)];
231 coordsFromDWFaceIndex<1>(
sid,
x1,
x2,
x3,
x4,
xs, face_idx, face_volume, dim, face_num,
param.parity);
236 X = 2*sid + (boundaryCrossing +
param.parity) % 2;
257 #if (DD_PREC==0) //temporal hack
301 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
303 const int sp_idx = (x1==
X1m1 ? X-
X1m1 : X+1) >> 1;
343 const int sp_stride_pad =
Ls*
ghostFace[
static_cast<int>(kernel_type)];
492 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
494 const int sp_idx = (x1==0 ? X+
X1m1 : X-1) >> 1;
500 const int ga_idx = sp_idx %
Vh;
538 const int sp_stride_pad =
Ls*
ghostFace[
static_cast<int>(kernel_type)];
687 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
692 const int ga_idx = sid %
Vh;
729 const int sp_stride_pad =
Ls*
ghostFace[
static_cast<int>(kernel_type)];
878 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
880 const int sp_idx = (x2==0 ? X+
X2X1mX1 : X-
X1) >> 1;
886 const int ga_idx = sp_idx %
Vh;
924 const int sp_stride_pad =
Ls*
ghostFace[
static_cast<int>(kernel_type)];
1073 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1078 const int ga_idx = sid %
Vh;
1115 const int sp_stride_pad =
Ls*
ghostFace[
static_cast<int>(kernel_type)];
1264 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1272 const int ga_idx = sp_idx %
Vh;
1310 const int sp_stride_pad =
Ls*
ghostFace[
static_cast<int>(kernel_type)];
1459 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1464 const int ga_idx = sid %
Vh;
1499 const int sp_stride_pad =
Ls*
ghostFace[
static_cast<int>(kernel_type)];
1505 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
1506 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
1507 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
1508 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
1509 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
1510 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
1573 const int sp_stride_pad =
Ls*
ghostFace[
static_cast<int>(kernel_type)];
1579 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
1580 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
1581 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
1582 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
1583 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
1584 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
1712 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1720 const int ga_idx = sp_idx %
Vh;
1756 const int sp_stride_pad =
Ls*
ghostFace[
static_cast<int>(kernel_type)];
1762 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
1763 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
1764 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
1765 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
1766 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
1767 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
1830 const int sp_stride_pad =
Ls*
ghostFace[
static_cast<int>(kernel_type)];
1836 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
1837 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
1838 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
1839 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
1840 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
1841 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
1965 int sp_idx = ( xs == 0 ? X+(
Ls-1)*2*
Vh : X-2*
Vh ) / 2;
2010 int sp_idx = ( xs ==
Ls-1 ? X-(
Ls-1)*2*
Vh : X+2*
Vh ) / 2;
2055 #if defined MULTI_GPU && defined DSLASH_XPAY
2059 switch(kernel_type) {
2061 incomplete = incomplete || (
param.commDim[3] && (x4==0 || x4==
X4m1));
2063 incomplete = incomplete || (
param.commDim[2] && (x3==0 || x3==
X3m1));
2065 incomplete = incomplete || (
param.commDim[1] && (x2==0 || x2==
X2m1));
2067 incomplete = incomplete || (
param.commDim[0] && (x1==0 || x1==
X1m1));
2078 #ifdef SPINOR_DOUBLE
2079 o00_re = a*o00_re + accum0.x;
2104 o00_re = a*o00_re + accum0.x;
2128 #endif // SPINOR_DOUBLE
2130 #endif // DSLASH_XPAY
2138 #undef SHARED_STRIDE