3 #define DSLASH_SHARED_FLOATS_PER_THREAD 24
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
8 #else // Open64 compiler
9 #define VOLATILE volatile
13 #define spinorFloat double
14 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2
15 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
65 #define spinorFloat float
66 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4
67 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4
92 #define acc00_re accum0.x
93 #define acc00_im accum0.y
94 #define acc01_re accum0.z
95 #define acc01_im accum0.w
96 #define acc02_re accum1.x
97 #define acc02_im accum1.y
98 #define acc10_re accum1.z
99 #define acc10_im accum1.w
100 #define acc11_re accum2.x
101 #define acc11_im accum2.y
102 #define acc12_re accum2.z
103 #define acc12_im accum2.w
104 #define acc20_re accum3.x
105 #define acc20_im accum3.y
106 #define acc21_re accum3.z
107 #define acc21_im accum3.w
108 #define acc22_re accum4.x
109 #define acc22_im accum4.y
110 #define acc30_re accum4.z
111 #define acc30_im accum4.w
112 #define acc31_re accum5.x
113 #define acc31_im accum5.y
114 #define acc32_re accum5.z
115 #define acc32_im accum5.w
116 #endif // SPINOR_DOUBLE
159 #endif // GAUGE_DOUBLE
162 #define gT00_re (+g00_re)
163 #define gT00_im (-g00_im)
164 #define gT01_re (+g10_re)
165 #define gT01_im (-g10_im)
166 #define gT02_re (+g20_re)
167 #define gT02_im (-g20_im)
168 #define gT10_re (+g01_re)
169 #define gT10_im (-g01_im)
170 #define gT11_re (+g11_re)
171 #define gT11_im (-g11_im)
172 #define gT12_re (+g21_re)
173 #define gT12_im (-g21_im)
174 #define gT20_re (+g02_re)
175 #define gT20_im (-g02_im)
176 #define gT21_re (+g12_re)
177 #define gT21_im (-g12_im)
178 #define gT22_re (+g22_re)
179 #define gT22_im (-g22_im)
208 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
210 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
220 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
222 #endif // MULTI_GPU half precision
235 if (x2 >=
X2)
return;
236 if (x3 >=
X3)
return;
254 sid = blockIdx.x*blockDim.x + threadIdx.x;
257 const int dim =
static_cast<int>(kernel_type);
258 const int face_volume = (
param.threads >> 1);
259 const int face_num = (
sid >= face_volume);
260 face_idx =
sid - face_num*face_volume;
266 #if (DD_PREC==2) // half precision
267 sp_norm_idx =
sid +
param.ghostNormOffset[
static_cast<int>(kernel_type)];
270 coordsFromFaceIndex<1>(
X,
sid,
x1,
x2,
x3,
x4, face_idx, face_volume, dim, face_num,
param.parity);
303 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
344 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
497 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
499 const int sp_idx = (
x1==0 ?
X+
X1m1 :
X-1) >> 1;
505 const int ga_idx = sp_idx;
520 int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1;
541 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
694 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
699 const int ga_idx =
sid;
712 if (threadIdx.y == blockDim.y-1 && blockDim.y <
X2 ) {
731 int tx = (threadIdx.x + blockDim.x - ((
x1+1)&1) ) % blockDim.x;
732 int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0;
753 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
906 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
914 const int ga_idx = sp_idx;
928 if (threadIdx.y == 0 && blockDim.y <
X2) {
947 int tx = (threadIdx.x + blockDim.x - ((
x1+1)&1)) % blockDim.x;
948 int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1;
969 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
1122 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1127 const int ga_idx =
sid;
1140 if (threadIdx.z == blockDim.z-1 && blockDim.z <
X3) {
1159 int tx = (threadIdx.x + blockDim.x - ((
x1+1)&1) ) % blockDim.x;
1160 int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0;
1181 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
1334 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1342 const int ga_idx = sp_idx;
1356 if (threadIdx.z == 0 && blockDim.z <
X3) {
1375 int tx = (threadIdx.x + blockDim.x - ((
x1+1)&1)) % blockDim.x;
1376 int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1;
1397 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
1550 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1555 const int ga_idx =
sid;
1590 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
1596 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
1597 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
1598 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
1599 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
1600 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
1601 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
1661 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
1667 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
1668 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
1669 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
1670 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
1671 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
1672 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
1804 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1812 const int ga_idx = sp_idx;
1848 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
1854 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
1855 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
1856 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
1857 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
1858 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
1859 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
1919 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
1925 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
1926 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
1927 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
1928 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
1929 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
1930 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
2053 switch(kernel_type) {
2055 incomplete = incomplete || (
param.commDim[3] && (
x4==0 ||
x4==
X4m1));
2057 incomplete = incomplete || (
param.commDim[2] && (
x3==0 ||
x3==
X3m1));
2059 incomplete = incomplete || (
param.commDim[1] && (
x2==0 ||
x2==
X2m1));
2061 incomplete = incomplete || (
param.commDim[0] && (
x1==0 ||
x1==
X1m1));
2112 o20_re = b*tmp20_re;
2113 o20_im = b*tmp20_im;
2114 o21_re = b*tmp21_re;
2115 o21_im = b*tmp21_im;
2116 o22_re = b*tmp22_re;
2117 o22_im = b*tmp22_im;
2118 o30_re = b*tmp30_re;
2119 o30_im = b*tmp30_im;
2120 o31_re = b*tmp31_re;
2121 o31_im = b*tmp31_im;
2122 o32_re = b*tmp32_re;
2123 o32_im = b*tmp32_im;
2149 #endif // DSLASH_XPAY
2180 #endif // DSLASH_XPAY
2188 #undef WRITE_SPINOR_SHARED
2189 #undef READ_SPINOR_SHARED
2190 #undef SHARED_STRIDE