5 #define DSLASH_SHARED_FLOATS_PER_THREAD 24
8 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
10 #else // Open64 compiler
11 #define VOLATILE volatile
15 #define spinorFloat double
16 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2
17 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2
42 #define acc00_re accum0.x
43 #define acc00_im accum0.y
44 #define acc01_re accum1.x
45 #define acc01_im accum1.y
46 #define acc02_re accum2.x
47 #define acc02_im accum2.y
48 #define acc10_re accum3.x
49 #define acc10_im accum3.y
50 #define acc11_re accum4.x
51 #define acc11_im accum4.y
52 #define acc12_re accum5.x
53 #define acc12_im accum5.y
54 #define acc20_re accum6.x
55 #define acc20_im accum6.y
56 #define acc21_re accum7.x
57 #define acc21_im accum7.y
58 #define acc22_re accum8.x
59 #define acc22_im accum8.y
60 #define acc30_re accum9.x
61 #define acc30_im accum9.y
62 #define acc31_re accum10.x
63 #define acc31_im accum10.y
64 #define acc32_re accum11.x
65 #define acc32_im accum11.y
67 #define spinorFloat float
68 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4
69 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4
94 #define acc00_re accum0.x
95 #define acc00_im accum0.y
96 #define acc01_re accum0.z
97 #define acc01_im accum0.w
98 #define acc02_re accum1.x
99 #define acc02_im accum1.y
100 #define acc10_re accum1.z
101 #define acc10_im accum1.w
102 #define acc11_re accum2.x
103 #define acc11_im accum2.y
104 #define acc12_re accum2.z
105 #define acc12_im accum2.w
106 #define acc20_re accum3.x
107 #define acc20_im accum3.y
108 #define acc21_re accum3.z
109 #define acc21_im accum3.w
110 #define acc22_re accum4.x
111 #define acc22_im accum4.y
112 #define acc30_re accum4.z
113 #define acc30_im accum4.w
114 #define acc31_re accum5.x
115 #define acc31_im accum5.y
116 #define acc32_re accum5.z
117 #define acc32_im accum5.w
118 #endif // SPINOR_DOUBLE
161 #endif // GAUGE_DOUBLE
164 #define gT00_re (+g00_re)
165 #define gT00_im (-g00_im)
166 #define gT01_re (+g10_re)
167 #define gT01_im (-g10_im)
168 #define gT02_re (+g20_re)
169 #define gT02_im (-g20_im)
170 #define gT10_re (+g01_re)
171 #define gT10_im (-g01_im)
172 #define gT11_re (+g11_re)
173 #define gT11_im (-g11_im)
174 #define gT12_re (+g21_re)
175 #define gT12_im (-g21_im)
176 #define gT20_re (+g02_re)
177 #define gT20_im (-g02_im)
178 #define gT21_re (+g12_re)
179 #define gT21_im (-g12_im)
180 #define gT22_re (+g22_re)
181 #define gT22_im (-g22_im)
210 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
212 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
221 #if (DD_PREC==2) // half precision
223 #endif // half precision
236 sid = blockIdx.x*blockDim.x + threadIdx.x;
242 const int face_volume = ((
param.threadDimMapUpper[
dim] -
param.threadDimMapLower[
dim]) >> 1);
243 const int face_num = (
sid >= face_volume);
248 coordsFromFaceIndex<1>(
X,
sid,
x1,
x2,
x3,
x4,
face_idx, face_volume,
dim, face_num,
param.parity,
dims);
251 for(
int dir=0; dir<4; ++dir){
252 active = active || isActive(dim,dir,+1,x1,x2,x3,x4,
param.commDim,
param.
X);
280 const int sp_idx = face_idx +
param.ghostOffset[0];
432 if (isActive(dim,0,-1,x1,x2,x3,x4,
param.commDim,
param.
X) && x1==0 )
441 const int sp_idx = face_idx +
param.ghostOffset[0];
443 sp_norm_idx = face_idx +
param.ghostNormOffset[0];
602 const int sp_idx = face_idx +
param.ghostOffset[1];
607 const int ga_idx =
sid;
754 if (isActive(dim,1,-1,x1,x2,x3,x4,
param.commDim,
param.
X) && x2==0 )
763 const int sp_idx = face_idx +
param.ghostOffset[1];
765 sp_norm_idx = face_idx +
param.ghostNormOffset[1];
924 const int sp_idx = face_idx +
param.ghostOffset[2];
929 const int ga_idx =
sid;
1076 if (isActive(dim,2,-1,x1,x2,x3,x4,
param.commDim,
param.
X) && x3==0 )
1085 const int sp_idx = face_idx +
param.ghostOffset[2];
1087 sp_norm_idx = face_idx +
param.ghostNormOffset[2];
1237 if (isActive(dim,3,+1,x1,x2,x3,x4,
param.commDim,
param.
X) && x4==
X4m1 )
1246 const int sp_idx = face_idx +
param.ghostOffset[3];
1251 const int ga_idx =
sid;
1269 #ifdef TWIST_INV_DSLASH
1323 #ifdef TWIST_INV_DSLASH
1453 if (isActive(dim,3,-1,x1,x2,x3,x4,
param.commDim,
param.
X) && x4==0 )
1462 const int sp_idx = face_idx +
param.ghostOffset[3];
1464 sp_norm_idx = face_idx +
param.ghostNormOffset[3];
1485 #ifdef TWIST_INV_DSLASH
1539 #ifdef TWIST_INV_DSLASH
1671 READ_ACCUM(ACCUMTEX,
param.sp_stride)
1674 #ifndef TWIST_INV_DSLASH
1732 #ifndef TWIST_INV_DSLASH
1743 #undef WRITE_SPINOR_SHARED
1744 #undef READ_SPINOR_SHARED
1745 #undef SHARED_STRIDE
#define APPLY_TWIST(a, reg)
#define APPLY_TWIST_INV(a, b, reg)
**************************only for deg tm:*******************************
#define READ_INTERMEDIATE_SPINOR
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define RECONSTRUCT_GAUGE_MATRIX
__constant__ int gauge_fixed
__constant__ int ga_stride
#define READ_GAUGE_MATRIX
__constant__ int X4X3X2X1hmX3X2X1h