3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
8 #else // Open64 compiler
9 #define VOLATILE volatile
13 #define spinorFloat double
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
63 #define spinorFloat float
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
155 #endif // GAUGE_DOUBLE
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
209 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
211 #endif // MULTI_GPU half precision
220 sid = blockIdx.x*blockDim.x + threadIdx.x;
243 sid = blockIdx.x*blockDim.x + threadIdx.x;
247 const int face_volume = (
param.threads >> 1);
248 const int face_num = (
sid >= face_volume);
255 #if (DD_PREC==2) // half precision
260 coordsFromFaceIndex<1>(
X,
sid,
x1,
x2,
x3,
x4,
face_idx, face_volume,
dim, face_num,
param.parity,
dims);
313 #ifdef TWIST_INV_DSLASH
489 const int sp_idx = (
x1==0 ?
X+
X1m1 :
X-1) >> 1;
495 const int ga_idx =
sp_idx;
511 #ifdef TWIST_INV_DSLASH
690 const int ga_idx =
sid;
705 #ifdef TWIST_INV_DSLASH
887 const int ga_idx =
sp_idx;
903 #ifdef TWIST_INV_DSLASH
1082 const int ga_idx =
sid;
1097 #ifdef TWIST_INV_DSLASH
1279 const int ga_idx =
sp_idx;
1295 #ifdef TWIST_INV_DSLASH
1474 const int ga_idx =
sid;
1490 #ifndef TWIST_INV_DSLASH
1520 #ifdef TWIST_INV_DSLASH
1575 #ifndef TWIST_INV_DSLASH
1605 #ifdef TWIST_INV_DSLASH
1759 const int ga_idx =
sp_idx;
1776 #ifndef TWIST_INV_DSLASH
1806 #ifdef TWIST_INV_DSLASH
1861 #ifndef TWIST_INV_DSLASH
1891 #ifdef TWIST_INV_DSLASH
2030 incomplete = incomplete || (
param.commDim[3] && (
x4==0 ||
x4==
X4m1));
2032 incomplete = incomplete || (
param.commDim[2] && (
x3==0 ||
x3==
X3m1));
2034 incomplete = incomplete || (
param.commDim[1] && (
x2==0 ||
x2==
X2m1));
2036 incomplete = incomplete || (
param.commDim[0] && (
x1==0 ||
x1==
X1m1));
2043 READ_ACCUM(ACCUMTEX,
param.sp_stride)
2046 #ifndef TWIST_INV_DSLASH
2104 #ifndef TWIST_INV_DSLASH
VOLATILE spinorFloat o32_re
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)
#define APPLY_TWIST(a, reg)
RECONSTRUCT_GAUGE_MATRIX(0)
VOLATILE spinorFloat o20_im
VOLATILE spinorFloat o30_re
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o12_im
#define APPLY_TWIST_INV(a, b, reg)
**************************only for deg tm:*******************************
__constant__ int X3X2X1mX2X1
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o21_im
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o02_im
WRITE_SPINOR(param.sp_stride)
__constant__ int ghostFace[QUDA_MAX_DIM+1]
VOLATILE spinorFloat o01_re
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o00_im
VOLATILE spinorFloat o31_re
coordsFromIndex< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
VOLATILE spinorFloat o00_re
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o11_im
__constant__ int gauge_fixed
__constant__ int X4X3X2X1mX3X2X1
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o11_re
VOLATILE spinorFloat o02_re
__constant__ int ga_stride
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o32_im
VOLATILE spinorFloat o10_im
VOLATILE spinorFloat o21_re
VOLATILE spinorFloat o01_im
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o20_re
__constant__ int X4X3X2X1hmX3X2X1h
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o30_im