3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0 7 #if (CUDA_VERSION >= 4010) 10 #define VOLATILE volatile 14 #define spinorFloat double 40 #define spinorFloat float 65 #endif // SPINOR_DOUBLE 108 #endif // GAUGE_DOUBLE 111 #define gT00_re (+g00_re) 112 #define gT00_im (-g00_im) 113 #define gT01_re (+g10_re) 114 #define gT01_im (-g10_im) 115 #define gT02_re (+g20_re) 116 #define gT02_im (-g20_im) 117 #define gT10_re (+g01_re) 118 #define gT10_im (-g01_im) 119 #define gT11_re (+g11_re) 120 #define gT11_im (-g11_im) 121 #define gT12_re (+g21_re) 122 #define gT12_im (-g21_im) 123 #define gT20_re (+g02_re) 124 #define gT20_im (-g02_im) 125 #define gT21_re (+g12_re) 126 #define gT21_im (-g12_im) 127 #define gT22_re (+g22_re) 128 #define gT22_im (-g22_im) 157 #if (__COMPUTE_CAPABILITY__ >= 200) 158 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi 160 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200 163 #if (__COMPUTE_CAPABILITY__ >= 200) 164 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi 166 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200 236 #if (DD_PREC==0) //temporal hack 280 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][1];
281 #if (DD_PREC==2) // half precision 282 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
325 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[
static_cast<int>(kernel_type)];
474 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][0];
475 #if (DD_PREC==2) // half precision 476 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
523 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[
static_cast<int>(kernel_type)];
672 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
673 #if (DD_PREC==2) // half precision 674 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
717 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[
static_cast<int>(kernel_type)];
866 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
867 #if (DD_PREC==2) // half precision 868 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
915 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1064 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
1065 #if (DD_PREC==2) // half precision 1066 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
1109 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1258 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
1259 #if (DD_PREC==2) // half precision 1260 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
1307 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1456 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
1457 #if (DD_PREC==2) // half precision 1458 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
1499 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1573 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1712 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
1713 #if (DD_PREC==2) // half precision 1714 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
1759 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1833 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1973 if (
coord[4] != 0 )
2058 #if defined MULTI_GPU && defined DSLASH_XPAY 2063 READ_ACCUM(ACCUMTEX,
param.sp_stride)
2064 #ifdef SPINOR_DOUBLE 2069 #ifdef SPINOR_DOUBLE 2119 #endif // SPINOR_DOUBLE 2121 #endif // DSLASH_XPAY 2124 #if defined MULTI_GPU && defined DSLASH_XPAY 2128 switch(kernel_type) {
2144 #ifdef SPINOR_DOUBLE 2149 #ifdef SPINOR_DOUBLE 2199 #endif // SPINOR_DOUBLE 2201 #endif // DSLASH_XPAY 2209 #undef SHARED_STRIDE RECONSTRUCT_GAUGE_MATRIX(0)
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o00_re
VOLATILE spinorFloat o30_re
VOLATILE spinorFloat o32_re
VOLATILE spinorFloat o20_re
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o01_re
VOLATILE spinorFloat o10_im
coordsFromIndex< 5, QUDA_5D_PC, EVEN_X >(X, coord, sid, param)
VOLATILE spinorFloat o21_re
VOLATILE spinorFloat o01_im
VOLATILE spinorFloat o31_re
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o02_im
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o11_im
#define READ_SPINOR_GHOST
#define ASSN_GAUGE_MATRIX
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o00_im
VOLATILE spinorFloat o20_im
VOLATILE spinorFloat o11_re
VOLATILE spinorFloat o02_re
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o32_im
VOLATILE spinorFloat o12_im
VOLATILE spinorFloat o30_im
VOLATILE spinorFloat o22_re