3 #define DSLASH_SHARED_FLOATS_PER_THREAD 24 6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler 8 #else // Open64 compiler 9 #define VOLATILE volatile 13 #define spinorFloat double 14 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2 15 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2 40 #define acc00_re accum0.x 41 #define acc00_im accum0.y 42 #define acc01_re accum1.x 43 #define acc01_im accum1.y 44 #define acc02_re accum2.x 45 #define acc02_im accum2.y 46 #define acc10_re accum3.x 47 #define acc10_im accum3.y 48 #define acc11_re accum4.x 49 #define acc11_im accum4.y 50 #define acc12_re accum5.x 51 #define acc12_im accum5.y 52 #define acc20_re accum6.x 53 #define acc20_im accum6.y 54 #define acc21_re accum7.x 55 #define acc21_im accum7.y 56 #define acc22_re accum8.x 57 #define acc22_im accum8.y 58 #define acc30_re accum9.x 59 #define acc30_im accum9.y 60 #define acc31_re accum10.x 61 #define acc31_im accum10.y 62 #define acc32_re accum11.x 63 #define acc32_im accum11.y 65 #define spinorFloat float 66 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4 67 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4 92 #define acc00_re accum0.x 93 #define acc00_im accum0.y 94 #define acc01_re accum0.z 95 #define acc01_im accum0.w 96 #define acc02_re accum1.x 97 #define acc02_im accum1.y 98 #define acc10_re accum1.z 99 #define acc10_im accum1.w 100 #define acc11_re accum2.x 101 #define acc11_im accum2.y 102 #define acc12_re accum2.z 103 #define acc12_im accum2.w 104 #define acc20_re accum3.x 105 #define acc20_im accum3.y 106 #define acc21_re accum3.z 107 #define acc21_im accum3.w 108 #define acc22_re accum4.x 109 #define acc22_im accum4.y 110 #define acc30_re accum4.z 111 #define acc30_im accum4.w 112 #define acc31_re accum5.x 113 #define acc31_im accum5.y 114 #define acc32_re accum5.z 115 #define acc32_im accum5.w 116 #endif // SPINOR_DOUBLE 159 #endif // GAUGE_DOUBLE 162 #define gT00_re (+g00_re) 163 #define gT00_im (-g00_im) 164 #define gT01_re (+g10_re) 165 #define gT01_im (-g10_im) 166 #define gT02_re (+g20_re) 167 #define gT02_im (-g20_im) 168 #define gT10_re (+g01_re) 169 #define gT10_im (-g01_im) 170 #define gT11_re (+g11_re) 171 #define gT11_im (-g11_im) 172 #define gT12_re (+g21_re) 173 #define gT12_im (-g21_im) 174 #define gT20_re (+g02_re) 175 #define gT20_im (-g02_im) 176 #define gT21_re (+g12_re) 177 #define gT21_im (-g12_im) 178 #define gT22_re (+g22_re) 179 #define gT22_im (-g22_im) 208 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi 210 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi 231 if (cpprd[2] >=
param.dc.
X[2])
return;
293 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][1];
294 #if (DD_PREC==2) // half precision 295 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
315 #ifdef TWIST_INV_DSLASH 325 #ifdef TWIST_INV_DSLASH 349 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
502 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][0];
503 #if (DD_PREC==2) // half precision 504 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
549 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
702 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
703 #if (DD_PREC==2) // half precision 704 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
725 #ifdef TWIST_INV_DSLASH 735 #ifdef TWIST_INV_DSLASH 755 int ty = (threadIdx.y <
blockDim.y - 1) ? threadIdx.y + 1 : 0;
776 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
929 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
930 #if (DD_PREC==2) // half precision 931 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
956 #ifdef TWIST_INV_DSLASH 966 #ifdef TWIST_INV_DSLASH 986 int ty = (threadIdx.y > 0) ? threadIdx.y - 1 :
blockDim.y - 1;
1007 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1160 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
1161 #if (DD_PREC==2) // half precision 1162 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
1183 #ifdef TWIST_INV_DSLASH 1184 #ifdef SPINOR_DOUBLE 1193 #ifdef TWIST_INV_DSLASH 1234 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1387 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
1388 #if (DD_PREC==2) // half precision 1389 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
1412 if (threadIdx.z == 0 &&
blockDim.z < X3) {
1414 #ifdef TWIST_INV_DSLASH 1415 #ifdef SPINOR_DOUBLE 1424 #ifdef TWIST_INV_DSLASH 1444 int tz = (threadIdx.z > 0) ? threadIdx.z - 1 :
blockDim.z - 1;
1465 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1618 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
1619 #if (DD_PREC==2) // half precision 1620 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
1642 #ifdef TWIST_INV_DSLASH 1643 #ifdef SPINOR_DOUBLE 1651 #ifndef TWIST_INV_DSLASH 1675 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1727 #ifdef TWIST_INV_DSLASH 1728 #ifdef SPINOR_DOUBLE 1736 #ifndef TWIST_INV_DSLASH 1760 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1903 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
1904 #if (DD_PREC==2) // half precision 1905 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
1931 #ifdef TWIST_INV_DSLASH 1932 #ifdef SPINOR_DOUBLE 1940 #ifndef TWIST_INV_DSLASH 1964 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2016 #ifdef TWIST_INV_DSLASH 2017 #ifdef SPINOR_DOUBLE 2025 #ifndef TWIST_INV_DSLASH 2049 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2183 switch(kernel_type) {
2197 #ifndef TWIST_INV_DSLASH 2198 #ifdef SPINOR_DOUBLE 2207 READ_ACCUM(ACCUMTEX,
param.sp_stride)
2210 #ifndef TWIST_INV_DSLASH 2268 #ifndef TWIST_INV_DSLASH 2279 #undef WRITE_SPINOR_SHARED 2280 #undef READ_SPINOR_SHARED 2281 #undef SHARED_STRIDE VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o30_re
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o30_im
VOLATILE spinorFloat o31_re
VOLATILE spinorFloat o21_im
#define APPLY_TWIST(a, reg)
#define READ_SPINOR_SHARED
VOLATILE spinorFloat o00_re
VOLATILE spinorFloat o01_im
VOLATILE spinorFloat o32_re
VOLATILE spinorFloat o21_re
VOLATILE spinorFloat o01_re
VOLATILE spinorFloat o20_re
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o10_im
VOLATILE spinorFloat o32_im
VOLATILE spinorFloat o20_im
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o10_re
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define READ_SPINOR_GHOST
VOLATILE spinorFloat o02_im
VOLATILE spinorFloat o11_im
#define WRITE_SPINOR_SHARED
VOLATILE spinorFloat o11_re
WRITE_SPINOR(param.sp_stride)
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
coordsFromIndex3D< EVEN_X >(X, coord, sid, param)
VOLATILE spinorFloat o12_im
VOLATILE spinorFloat o12_re
RECONSTRUCT_GAUGE_MATRIX(0)
VOLATILE spinorFloat o02_re
VOLATILE spinorFloat o22_re
APPLY_TWIST_INV(-a, b, o)
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)
VOLATILE spinorFloat o00_im