3 #define DSLASH_SHARED_FLOATS_PER_THREAD 24
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
8 #else // Open64 compiler
9 #define VOLATILE volatile
13 #define spinorFloat double
14 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2
15 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
65 #define spinorFloat float
66 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4
67 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4
92 #define acc00_re accum0.x
93 #define acc00_im accum0.y
94 #define acc01_re accum0.z
95 #define acc01_im accum0.w
96 #define acc02_re accum1.x
97 #define acc02_im accum1.y
98 #define acc10_re accum1.z
99 #define acc10_im accum1.w
100 #define acc11_re accum2.x
101 #define acc11_im accum2.y
102 #define acc12_re accum2.z
103 #define acc12_im accum2.w
104 #define acc20_re accum3.x
105 #define acc20_im accum3.y
106 #define acc21_re accum3.z
107 #define acc21_im accum3.w
108 #define acc22_re accum4.x
109 #define acc22_im accum4.y
110 #define acc30_re accum4.z
111 #define acc30_im accum4.w
112 #define acc31_re accum5.x
113 #define acc31_im accum5.y
114 #define acc32_re accum5.z
115 #define acc32_im accum5.w
116 #endif // SPINOR_DOUBLE
159 #endif // GAUGE_DOUBLE
162 #define gT00_re (+g00_re)
163 #define gT00_im (-g00_im)
164 #define gT01_re (+g10_re)
165 #define gT01_im (-g10_im)
166 #define gT02_re (+g20_re)
167 #define gT02_im (-g20_im)
168 #define gT10_re (+g01_re)
169 #define gT10_im (-g01_im)
170 #define gT11_re (+g11_re)
171 #define gT11_im (-g11_im)
172 #define gT12_re (+g21_re)
173 #define gT12_im (-g21_im)
174 #define gT20_re (+g02_re)
175 #define gT20_im (-g02_im)
176 #define gT21_re (+g12_re)
177 #define gT21_im (-g12_im)
178 #define gT22_re (+g22_re)
179 #define gT22_im (-g22_im)
183 #define c00_00_re C0.x
184 #define c01_01_re C0.y
185 #define c02_02_re C1.x
186 #define c10_10_re C1.y
187 #define c11_11_re C2.x
188 #define c12_12_re C2.y
189 #define c01_00_re C3.x
190 #define c01_00_im C3.y
191 #define c02_00_re C4.x
192 #define c02_00_im C4.y
193 #define c10_00_re C5.x
194 #define c10_00_im C5.y
195 #define c11_00_re C6.x
196 #define c11_00_im C6.y
197 #define c12_00_re C7.x
198 #define c12_00_im C7.y
199 #define c02_01_re C8.x
200 #define c02_01_im C8.y
201 #define c10_01_re C9.x
202 #define c10_01_im C9.y
203 #define c11_01_re C10.x
204 #define c11_01_im C10.y
205 #define c12_01_re C11.x
206 #define c12_01_im C11.y
207 #define c10_02_re C12.x
208 #define c10_02_im C12.y
209 #define c11_02_re C13.x
210 #define c11_02_im C13.y
211 #define c12_02_re C14.x
212 #define c12_02_im C14.y
213 #define c11_10_re C15.x
214 #define c11_10_im C15.y
215 #define c12_10_re C16.x
216 #define c12_10_im C16.y
217 #define c12_11_re C17.x
218 #define c12_11_im C17.y
220 #define c00_00_re C0.x
221 #define c01_01_re C0.y
222 #define c02_02_re C0.z
223 #define c10_10_re C0.w
224 #define c11_11_re C1.x
225 #define c12_12_re C1.y
226 #define c01_00_re C1.z
227 #define c01_00_im C1.w
228 #define c02_00_re C2.x
229 #define c02_00_im C2.y
230 #define c10_00_re C2.z
231 #define c10_00_im C2.w
232 #define c11_00_re C3.x
233 #define c11_00_im C3.y
234 #define c12_00_re C3.z
235 #define c12_00_im C3.w
236 #define c02_01_re C4.x
237 #define c02_01_im C4.y
238 #define c10_01_re C4.z
239 #define c10_01_im C4.w
240 #define c11_01_re C5.x
241 #define c11_01_im C5.y
242 #define c12_01_re C5.z
243 #define c12_01_im C5.w
244 #define c10_02_re C6.x
245 #define c10_02_im C6.y
246 #define c11_02_re C6.z
247 #define c11_02_im C6.w
248 #define c12_02_re C7.x
249 #define c12_02_im C7.y
250 #define c11_10_re C7.z
251 #define c11_10_im C7.w
252 #define c12_10_re C8.x
253 #define c12_10_im C8.y
254 #define c12_11_re C8.z
255 #define c12_11_im C8.w
256 #endif // CLOVER_DOUBLE
258 #define c00_01_re (+c01_00_re)
259 #define c00_01_im (-c01_00_im)
260 #define c00_02_re (+c02_00_re)
261 #define c00_02_im (-c02_00_im)
262 #define c01_02_re (+c02_01_re)
263 #define c01_02_im (-c02_01_im)
264 #define c00_10_re (+c10_00_re)
265 #define c00_10_im (-c10_00_im)
266 #define c01_10_re (+c10_01_re)
267 #define c01_10_im (-c10_01_im)
268 #define c02_10_re (+c10_02_re)
269 #define c02_10_im (-c10_02_im)
270 #define c00_11_re (+c11_00_re)
271 #define c00_11_im (-c11_00_im)
272 #define c01_11_re (+c11_01_re)
273 #define c01_11_im (-c11_01_im)
274 #define c02_11_re (+c11_02_re)
275 #define c02_11_im (-c11_02_im)
276 #define c10_11_re (+c11_10_re)
277 #define c10_11_im (-c11_10_im)
278 #define c00_12_re (+c12_00_re)
279 #define c00_12_im (-c12_00_im)
280 #define c01_12_re (+c12_01_re)
281 #define c01_12_im (-c12_01_im)
282 #define c02_12_re (+c12_02_re)
283 #define c02_12_im (-c12_02_im)
284 #define c10_12_re (+c12_10_re)
285 #define c10_12_im (-c12_10_im)
286 #define c11_12_re (+c12_11_re)
287 #define c11_12_im (-c12_11_im)
290 #define c20_20_re c00_00_re
291 #define c21_20_re c01_00_re
292 #define c21_20_im c01_00_im
293 #define c22_20_re c02_00_re
294 #define c22_20_im c02_00_im
295 #define c30_20_re c10_00_re
296 #define c30_20_im c10_00_im
297 #define c31_20_re c11_00_re
298 #define c31_20_im c11_00_im
299 #define c32_20_re c12_00_re
300 #define c32_20_im c12_00_im
301 #define c20_21_re c00_01_re
302 #define c20_21_im c00_01_im
303 #define c21_21_re c01_01_re
304 #define c22_21_re c02_01_re
305 #define c22_21_im c02_01_im
306 #define c30_21_re c10_01_re
307 #define c30_21_im c10_01_im
308 #define c31_21_re c11_01_re
309 #define c31_21_im c11_01_im
310 #define c32_21_re c12_01_re
311 #define c32_21_im c12_01_im
312 #define c20_22_re c00_02_re
313 #define c20_22_im c00_02_im
314 #define c21_22_re c01_02_re
315 #define c21_22_im c01_02_im
316 #define c22_22_re c02_02_re
317 #define c30_22_re c10_02_re
318 #define c30_22_im c10_02_im
319 #define c31_22_re c11_02_re
320 #define c31_22_im c11_02_im
321 #define c32_22_re c12_02_re
322 #define c32_22_im c12_02_im
323 #define c20_30_re c00_10_re
324 #define c20_30_im c00_10_im
325 #define c21_30_re c01_10_re
326 #define c21_30_im c01_10_im
327 #define c22_30_re c02_10_re
328 #define c22_30_im c02_10_im
329 #define c30_30_re c10_10_re
330 #define c31_30_re c11_10_re
331 #define c31_30_im c11_10_im
332 #define c32_30_re c12_10_re
333 #define c32_30_im c12_10_im
334 #define c20_31_re c00_11_re
335 #define c20_31_im c00_11_im
336 #define c21_31_re c01_11_re
337 #define c21_31_im c01_11_im
338 #define c22_31_re c02_11_re
339 #define c22_31_im c02_11_im
340 #define c30_31_re c10_11_re
341 #define c30_31_im c10_11_im
342 #define c31_31_re c11_11_re
343 #define c32_31_re c12_11_re
344 #define c32_31_im c12_11_im
345 #define c20_32_re c00_12_re
346 #define c20_32_im c00_12_im
347 #define c21_32_re c01_12_re
348 #define c21_32_im c01_12_im
349 #define c22_32_re c02_12_re
350 #define c22_32_im c02_12_im
351 #define c30_32_re c10_12_re
352 #define c30_32_im c10_12_im
353 #define c31_32_re c11_12_re
354 #define c31_32_im c11_12_im
355 #define c32_32_re c12_12_re
384 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
386 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
396 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
398 #endif // MULTI_GPU half precision
412 if (x2 >=
X2)
return;
413 if (x3 >=
X3)
return;
431 sid = blockIdx.x*blockDim.x + threadIdx.x;
432 if (sid >=
param.threads)
return;
435 const int face_volume = (
param.threads >> 1);
436 const int face_num = (sid >= face_volume);
437 face_idx = sid - face_num*face_volume;
443 #if (DD_PREC==2) // half precision
447 const int dims[] = {
X1,
X2,
X3,
X4};
448 coordsFromFaceIndex<1>(
X,
sid,
x1,
x2,
x3,
x4,
face_idx, face_volume,
dim, face_num,
param.parity,
dims);
483 const int sp_idx = (x1==
X1m1 ? X-
X1m1 : X+1) >> 1;
677 const int sp_idx = (x1==0 ? X+
X1m1 : X-1) >> 1;
683 const int ga_idx =
sp_idx;
698 int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1;
877 const int ga_idx =
sid;
890 if (threadIdx.y == blockDim.y-1 && blockDim.y <
X2 ) {
909 int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
910 int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0;
1086 const int sp_idx = (x2==0 ? X+
X2X1mX1 : X-
X1) >> 1;
1092 const int ga_idx =
sp_idx;
1106 if (threadIdx.y == 0 && blockDim.y <
X2) {
1125 int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x;
1126 int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1;
1305 const int ga_idx =
sid;
1318 if (threadIdx.z == blockDim.z-1 && blockDim.z <
X3) {
1337 int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
1338 int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0;
1520 const int ga_idx =
sp_idx;
1534 if (threadIdx.z == 0 && blockDim.z <
X3) {
1553 int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x;
1554 int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1;
1733 const int ga_idx =
sid;
1774 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
1775 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
1776 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
1777 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
1778 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
1779 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
1845 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
1846 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
1847 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
1848 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
1849 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
1850 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
1990 const int ga_idx =
sp_idx;
2032 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
2033 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
2034 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
2035 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
2036 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
2037 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
2103 a0_re = t_proj_scale*
i00_re; a0_im = t_proj_scale*
i00_im;
2104 a1_re = t_proj_scale*
i01_re; a1_im = t_proj_scale*
i01_im;
2105 a2_re = t_proj_scale*
i02_re; a2_im = t_proj_scale*
i02_im;
2106 b0_re = t_proj_scale*
i10_re; b0_im = t_proj_scale*
i10_im;
2107 b1_re = t_proj_scale*
i11_re; b1_im = t_proj_scale*
i11_im;
2108 b2_re = t_proj_scale*
i12_re; b2_im = t_proj_scale*
i12_im;
2227 #if defined MULTI_GPU && (defined DSLASH_XPAY || defined DSLASH_CLOVER)
2233 incomplete = incomplete || (
param.commDim[3] && (x4==0 || x4==
X4m1));
2235 incomplete = incomplete || (
param.commDim[2] && (x3==0 || x3==
X3m1));
2237 incomplete = incomplete || (
param.commDim[1] && (x2==0 || x2==
X2m1));
2239 incomplete = incomplete || (
param.commDim[0] && (x1==0 || x1==
X1m1));
2245 #ifdef DSLASH_CLOVER
2261 o30_re = a30_re; o30_im = a30_im;
2277 o31_re = a31_re; o31_im = a31_im;
2293 o32_re = a32_re; o32_im = a32_im;
2445 o00_re = a00_re; o00_im = a00_im;
2446 o01_re = a01_re; o01_im = a01_im;
2447 o02_re = a02_re; o02_im = a02_im;
2448 o10_re = a10_re; o10_im = a10_im;
2449 o11_re = a11_re; o11_im = a11_im;
2450 o12_re = a12_re; o12_im = a12_im;
2603 o20_re = a20_re; o20_im = a20_im;
2604 o21_re = a21_re; o21_im = a21_im;
2605 o22_re = a22_re; o22_im = a22_im;
2606 o30_re = a30_re; o30_im = a30_im;
2607 o31_re = a31_re; o31_im = a31_im;
2608 o32_re = a32_re; o32_im = a32_im;
2627 o30_re = a30_re; o30_im = a30_im;
2643 o31_re = a31_re; o31_im = a31_im;
2659 o32_re = a32_re; o32_im = a32_im;
2662 #endif // DSLASH_CLOVER
2666 READ_ACCUM(ACCUMTEX,
param.sp_stride)
2692 #endif // DSLASH_XPAY
2700 #undef WRITE_SPINOR_SHARED
2701 #undef READ_SPINOR_SHARED
2702 #undef SHARED_STRIDE
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
RECONSTRUCT_GAUGE_MATRIX(0)
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o10_im
__constant__ int X3X2X1mX2X1
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o21_re
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o11_re
VOLATILE spinorFloat o01_im
#define READ_SPINOR_SHARED
VOLATILE spinorFloat o20_im
__constant__ int ghostFace[QUDA_MAX_DIM+1]
VOLATILE spinorFloat o02_re
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o30_im
VOLATILE spinorFloat o12_im
VOLATILE spinorFloat o02_im
VOLATILE spinorFloat o00_re
VOLATILE spinorFloat o11_im
VOLATILE spinorFloat o32_im
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o20_re
__constant__ int gauge_fixed
VOLATILE spinorFloat o01_re
__constant__ int X4X3X2X1mX3X2X1
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o21_im
#define WRITE_SPINOR_SHARED
VOLATILE spinorFloat o30_re
__constant__ int ga_stride
VOLATILE spinorFloat o31_re
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o32_re
coordsFromIndex3D< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
VOLATILE spinorFloat o00_im
__constant__ int X4X3X2X1hmX3X2X1h
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)