3 #define DSLASH_SHARED_FLOATS_PER_THREAD 19
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
8 #else // Open64 compiler
9 #define VOLATILE volatile
13 #define spinorFloat double
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
63 #define spinorFloat float
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
155 #endif // GAUGE_DOUBLE
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
178 #define o00_re s[0*SHARED_STRIDE]
179 #define o00_im s[1*SHARED_STRIDE]
180 #define o01_re s[2*SHARED_STRIDE]
181 #define o01_im s[3*SHARED_STRIDE]
182 #define o02_re s[4*SHARED_STRIDE]
183 #define o02_im s[5*SHARED_STRIDE]
184 #define o10_re s[6*SHARED_STRIDE]
185 #define o10_im s[7*SHARED_STRIDE]
186 #define o11_re s[8*SHARED_STRIDE]
187 #define o11_im s[9*SHARED_STRIDE]
188 #define o12_re s[10*SHARED_STRIDE]
189 #define o12_im s[11*SHARED_STRIDE]
190 #define o20_re s[12*SHARED_STRIDE]
191 #define o20_im s[13*SHARED_STRIDE]
192 #define o21_re s[14*SHARED_STRIDE]
193 #define o21_im s[15*SHARED_STRIDE]
194 #define o22_re s[16*SHARED_STRIDE]
195 #define o22_im s[17*SHARED_STRIDE]
196 #define o30_re s[18*SHARED_STRIDE]
204 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
206 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
209 extern __shared__
char s_data[];
221 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
223 #endif // MULTI_GPU half precision
232 sid = blockIdx.x*blockDim.x + threadIdx.x;
233 if (sid >=
param.threads)
return;
254 sid = blockIdx.x*blockDim.x + threadIdx.x;
255 if (sid >=
param.threads)
return;
257 const int dim =
static_cast<int>(kernel_type);
258 const int face_volume = (
param.threads >> 1);
259 const int face_num = (sid >= face_volume);
260 face_idx = sid - face_num*face_volume;
266 #if (DD_PREC==2) // half precision
267 sp_norm_idx = sid +
param.ghostNormOffset[
static_cast<int>(kernel_type)];
270 coordsFromFaceIndex<1>(
X,
sid,
x1,
x2,
x3,
x4, face_idx, face_volume, dim, face_num,
param.parity);
303 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
305 const int sp_idx = (x1==
X1m1 ? X-
X1m1 : X+1) >> 1;
341 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
494 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
496 const int sp_idx = (x1==0 ? X+
X1m1 : X-1) >> 1;
502 const int ga_idx = sp_idx;
536 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
689 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
694 const int ga_idx =
sid;
727 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
880 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
882 const int sp_idx = (x2==0 ? X+
X2X1mX1 : X-
X1) >> 1;
888 const int ga_idx = sp_idx;
922 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
1075 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1080 const int ga_idx =
sid;
1113 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
1266 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1274 const int ga_idx = sp_idx;
1308 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
1461 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1466 const int ga_idx =
sid;
1501 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
1507 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
1508 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
1509 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
1510 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
1511 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
1512 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
1572 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
1578 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
1579 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
1580 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
1581 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
1582 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
1583 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
1715 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1723 const int ga_idx = sp_idx;
1759 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
1765 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
1766 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
1767 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
1768 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
1769 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
1770 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
1830 const int sp_stride_pad =
ghostFace[
static_cast<int>(kernel_type)];
1836 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
1837 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
1838 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
1839 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
1840 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
1841 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
1964 switch(kernel_type) {
1966 incomplete = incomplete || (
param.commDim[3] && (x4==0 || x4==
X4m1));
1968 incomplete = incomplete || (
param.commDim[2] && (x3==0 || x3==
X3m1));
1970 incomplete = incomplete || (
param.commDim[1] && (x2==0 || x2==
X2m1));
1972 incomplete = incomplete || (
param.commDim[0] && (x1==0 || x1==
X1m1));
2023 o20_re = b*tmp20_re;
2024 o20_im = b*tmp20_im;
2025 o21_re = b*tmp21_re;
2026 o21_im = b*tmp21_im;
2027 o22_re = b*tmp22_re;
2028 o22_im = b*tmp22_im;
2029 o30_re = b*tmp30_re;
2030 o30_im = b*tmp30_im;
2031 o31_re = b*tmp31_re;
2032 o31_im = b*tmp31_im;
2033 o32_re = b*tmp32_re;
2034 o32_im = b*tmp32_im;
2060 #endif // DSLASH_XPAY
2091 #endif // DSLASH_XPAY
2099 #undef SHARED_STRIDE