#define DSLASH_SHARED_FLOATS_PER_THREAD 0

#if (CUDA_VERSION >= 4010)
#define VOLATILE
#else
#define VOLATILE volatile
#endif

// ...

#ifdef SPINOR_DOUBLE
#define spinorFloat double
#if (CUDA_VERSION >= 6050 && CUDA_VERSION < 7050)
#define POW(a, b) pow(a, static_cast<spinorFloat>(b))
#else
#define POW(a, b) pow(a, b)
#endif
// ...
#define mdwf_b5 param.mdwf_b5_d
#define mdwf_c5 param.mdwf_c5_d
#define mferm param.mferm
#else
#define spinorFloat float
#define POW(a, b) __fast_pow(a, b)
// ...
#define mdwf_b5 param.mdwf_b5_f
#define mdwf_c5 param.mdwf_c5_f
#define mferm param.mferm_f
#endif // SPINOR_DOUBLE

// ...
#endif // GAUGE_DOUBLE

// gT is the Hermitian conjugate (dagger) of the gauge link: gT_ij = conj(g_ji)
#define gT00_re (+g00_re)
#define gT00_im (-g00_im)
#define gT01_re (+g10_re)
#define gT01_im (-g10_im)
#define gT02_re (+g20_re)
#define gT02_im (-g20_im)
#define gT10_re (+g01_re)
#define gT10_im (-g01_im)
#define gT11_re (+g11_re)
#define gT11_im (-g11_im)
#define gT12_re (+g21_re)
#define gT12_im (-g21_im)
#define gT20_re (+g02_re)
#define gT20_im (-g02_im)
#define gT21_re (+g12_re)
#define gT21_im (-g12_im)
#define gT22_re (+g22_re)
#define gT22_im (-g22_im)

// ...

#ifdef SPINOR_DOUBLE
#if (__COMPUTE_CAPABILITY__ >= 200)
#define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
#else
#define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
#endif
#else
#if (__COMPUTE_CAPABILITY__ >= 200)
#define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
#else
#define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
#endif
#endif

// ...

#if (DD_PREC==0) // temporary hack
// ...

const int sp_idx = face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
#if (DD_PREC==2) // half precision
const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
#endif
// ...
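// Reading of the indexing above (assumption about the ghost-buffer layout, not
// stated in this excerpt): for an exterior kernel the neighbouring spinor lives
// in a ghost buffer appended to the local field, so the load index is formed as
//
//   sp_idx      = face_idx + param.ghostOffset[kernel_type][dir]
//   sp_norm_idx = face_idx + param.ghostNormOffset[kernel_type][dir]   // half precision only
//
// where face_idx runs over the sites of the boundary face, kernel_type selects
// the dimension being exchanged, and the trailing [0]/[1] index appears to pick
// the backward/forward ghost zone. The same pattern recurs below once per face
// direction handled by this kernel.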
const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
// ...
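// sp_stride_pad: the 4-d ghost face of this dimension replicated over the Ls
// slices of the fifth dimension, i.e. the total number of sites in the 5-d
// ghost block. Presumably it is passed as the stride argument of the ghost
// spinor read macros (interpretation of the expression above; the layout
// itself is defined elsewhere in QUDA).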
const int sp_idx = face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
#if (DD_PREC==2) // half precision
const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
#endif
// ...
const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
// ...
const int sp_idx = face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
#if (DD_PREC==2) // half precision
const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
#endif
// ...
const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
// ...
const int sp_idx = face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
#if (DD_PREC==2) // half precision
const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
#endif
// ...
const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
// ...
const int sp_idx = face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
#if (DD_PREC==2) // half precision
const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
#endif
// ...
const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
// ...
const int sp_idx = face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
#if (DD_PREC==2) // half precision
const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
#endif
// ...
const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
// ...
const int sp_idx = face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
#if (DD_PREC==2) // half precision
const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
#endif
// ...
const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
// ...
const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
// ...
const int sp_idx = face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
#if (DD_PREC==2) // half precision
const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
#endif
// ...
const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
// ...
const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
// ...
#if defined MULTI_GPU && defined DSLASH_XPAY
// ...
switch(kernel_type) {
// ...
READ_ACCUM(ACCUMTEX, param.sp_stride)
// ...
#ifdef SPINOR_DOUBLE
// ...
#endif // SPINOR_DOUBLE
#endif // DSLASH_XPAY
// ...
#undef SHARED_STRIDE
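// Sketch of the DSLASH_XPAY accumulation elided above (assumed convention,
// not verified against the omitted lines; the x-component names below are
// illustrative only): READ_ACCUM loads an accumulator spinor x, and each of
// the 24 output components is then combined as out = x + a*out, with separate
// double/single code paths selected by SPINOR_DOUBLE, e.g.
//
//   o00_re = x00_re + a*o00_re;
//   o00_im = x00_im + a*o00_im;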
// output spinor (4 spin x 3 colour complex components)
VOLATILE spinorFloat o00_re;
VOLATILE spinorFloat o00_im;
VOLATILE spinorFloat o01_re;
VOLATILE spinorFloat o01_im;
VOLATILE spinorFloat o02_re;
VOLATILE spinorFloat o02_im;
VOLATILE spinorFloat o10_re;
VOLATILE spinorFloat o10_im;
VOLATILE spinorFloat o11_re;
VOLATILE spinorFloat o11_im;
VOLATILE spinorFloat o12_re;
VOLATILE spinorFloat o12_im;
VOLATILE spinorFloat o20_re;
VOLATILE spinorFloat o20_im;
VOLATILE spinorFloat o21_re;
VOLATILE spinorFloat o21_im;
VOLATILE spinorFloat o22_re;
VOLATILE spinorFloat o22_im;
VOLATILE spinorFloat o30_re;
VOLATILE spinorFloat o30_im;
VOLATILE spinorFloat o31_re;
VOLATILE spinorFloat o31_im;
VOLATILE spinorFloat o32_re;
VOLATILE spinorFloat o32_im;

// ...
#define READ_SPINOR_GHOST

// coordinate and face-index setup
coordsFromFaceIndex< 5, QUDA_4D_PC, kernel_type, 1 >(X, sid, coord, face_idx, face_num, param);

// spinor and gauge loads
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride);
RECONSTRUCT_GAUGE_MATRIX(0);
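// Naming convention for the registers above (inferred from the identifiers,
// not stated in this excerpt): oSC_re/oSC_im hold the real and imaginary
// parts of spin component S (0..3) and colour component C (0..2) of the
// output spinor, i.e. 12 complex numbers = 24 spinorFloat registers per site.
// The READ_SPINOR* macros load the neighbouring spinor (or its upper/lower
// spin projection) at sp_idx, while ASSN_GAUGE_MATRIX and
// RECONSTRUCT_GAUGE_MATRIX load and rebuild the 3x3 SU(3) link that is
// applied to it.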