5 #define DSLASH_SHARED_FLOATS_PER_THREAD 0 8 #if (CUDA_VERSION >= 4010) 11 #define VOLATILE volatile 15 #define spinorFloat double 41 #define mdwf_b5 param.mdwf_b5_d 42 #define mdwf_c5 param.mdwf_c5_d 43 #define mferm param.mferm 47 #define spinorFloat float 73 #define mdwf_b5 param.mdwf_b5_f 74 #define mdwf_c5 param.mdwf_c5_f 75 #define mferm param.mferm_f 78 #endif // SPINOR_DOUBLE 121 #endif // GAUGE_DOUBLE 124 #define gT00_re (+g00_re) 125 #define gT00_im (-g00_im) 126 #define gT01_re (+g10_re) 127 #define gT01_im (-g10_im) 128 #define gT02_re (+g20_re) 129 #define gT02_im (-g20_im) 130 #define gT10_re (+g01_re) 131 #define gT10_im (-g01_im) 132 #define gT11_re (+g11_re) 133 #define gT11_im (-g11_im) 134 #define gT12_re (+g21_re) 135 #define gT12_im (-g21_im) 136 #define gT20_re (+g02_re) 137 #define gT20_im (-g02_im) 138 #define gT21_re (+g12_re) 139 #define gT21_im (-g12_im) 140 #define gT22_re (+g22_re) 141 #define gT22_im (-g22_im) 170 #if (__COMPUTE_CAPABILITY__ >= 200) 171 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi 173 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200 176 #if (__COMPUTE_CAPABILITY__ >= 200) 177 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi 179 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200 186 #if (DD_PREC==2) // half precision 188 #endif // half precision 232 for(
int dir=0; dir<4; ++dir){
257 #if (DD_PREC==0) //temporal hack 298 #if (DD_PREC==2) // half precision 317 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[0];
461 #if (DD_PREC==2) // half precision 480 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[0];
624 #if (DD_PREC==2) // half precision 643 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[1];
787 #if (DD_PREC==2) // half precision 806 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[1];
950 #if (DD_PREC==2) // half precision 969 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[2];
1113 #if (DD_PREC==2) // half precision 1132 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[2];
1276 #if (DD_PREC==2) // half precision 1294 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[3];
1343 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[3];
1477 #if (DD_PREC==2) // half precision 1495 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[3];
1544 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[3];
1671 READ_ACCUM(ACCUMTEX,
param.sp_stride)
1680 #ifdef SPINOR_DOUBLE 1730 #endif // SPINOR_DOUBLE 1731 #endif // DSLASH_XPAY 1745 #undef SHARED_STRIDE
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o01_re
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
static __device__ bool isActive(const int threadDim, int offsetDim, int offset, const int y[], const int partitioned[], const T X[])
Compute whether this thread should be active for updating a given offsetDim halo. This is used by the fused halo region update kernels: here every thread has a prescribed dimension it is tasked with updating, but for the edges and vertices, the thread responsible for the entire update is the "greatest" one. Hence some threads may be labeled as a given dimension, but they have to update other dimensions too. Conversely, a given thread may be labeled for a given dimension, but if that thread lies at an edge or vertex, and we have partitioned a higher dimension, then that thread will cede to the higher thread.
VOLATILE spinorFloat o01_im
VOLATILE spinorFloat o20_re
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o02_re
VOLATILE spinorFloat o02_im
VOLATILE spinorFloat o00_re
VOLATILE spinorFloat o20_im
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o32_im
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o10_im
#define READ_SPINOR_GHOST
#define ASSN_GAUGE_MATRIX
VOLATILE spinorFloat o11_im
VOLATILE spinorFloat o31_re
#define RECONSTRUCT_GAUGE_MATRIX
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o11_re
VOLATILE spinorFloat o21_re
VOLATILE spinorFloat o00_im
VOLATILE spinorFloat o12_im
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o30_im
VOLATILE spinorFloat o32_re
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o30_re