3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0 8 #if (CUDA_VERSION >= 4010) 11 #define VOLATILE volatile 15 #define spinorFloat double 41 #define spinorFloat float 66 #endif // SPINOR_DOUBLE 109 #endif // GAUGE_DOUBLE 112 #define gT00_re (+g00_re) 113 #define gT00_im (-g00_im) 114 #define gT01_re (+g10_re) 115 #define gT01_im (-g10_im) 116 #define gT02_re (+g20_re) 117 #define gT02_im (-g20_im) 118 #define gT10_re (+g01_re) 119 #define gT10_im (-g01_im) 120 #define gT11_re (+g11_re) 121 #define gT11_im (-g11_im) 122 #define gT12_re (+g21_re) 123 #define gT12_im (-g21_im) 124 #define gT20_re (+g02_re) 125 #define gT20_im (-g02_im) 126 #define gT21_re (+g12_re) 127 #define gT21_im (-g12_im) 128 #define gT22_re (+g22_re) 129 #define gT22_im (-g22_im) 158 #if (__COMPUTE_CAPABILITY__ >= 200) 159 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi 161 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200 164 #if (__COMPUTE_CAPABILITY__ >= 200) 165 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi 167 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200 174 #if (DD_PREC==2) // half precision 176 #endif // half precision 217 for(
int dir=0; dir<4; ++dir){
243 #if (DD_PREC==0) //temporal hack 284 #if (DD_PREC==2) // half precision 304 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[0];
449 #if (DD_PREC==2) // half precision 469 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[0];
614 #if (DD_PREC==2) // half precision 634 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[1];
779 #if (DD_PREC==2) // half precision 799 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[1];
944 #if (DD_PREC==2) // half precision 964 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[2];
1109 #if (DD_PREC==2) // half precision 1129 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[2];
1274 #if (DD_PREC==2) // half precision 1292 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[3];
1343 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[3];
1478 #if (DD_PREC==2) // half precision 1496 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[3];
1547 const int sp_stride_pad =
param.dc.Ls*
param.dc.ghostFace[3];
1675 #ifdef SPINOR_DOUBLE 1680 #ifdef SPINOR_DOUBLE 1730 #endif // SPINOR_DOUBLE 1732 #endif // DSLASH_XPAY 1740 #undef SHARED_STRIDE
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o01_re
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
static __device__ bool isActive(const int threadDim, int offsetDim, int offset, const int y[], const int partitioned[], const T X[])
Compute whether this thread should be active for updating a given offsetDim halo. This is used by the fused halo region update kernels: here every thread has a prescribed dimension it is tasked with updating, but for the edges and vertices, the thread responsible for the entire update is the "greatest" one. Hence some threads may be labeled as a given dimension, but they have to update other dimensions too. Conversely, a given thread may be labeled for a given dimension, but if that thread lies at an edge or vertex, and we have partitioned a higher dimension, then that thread will cede to the higher thread.
VOLATILE spinorFloat o01_im
VOLATILE spinorFloat o20_re
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o02_re
VOLATILE spinorFloat o02_im
VOLATILE spinorFloat o00_re
VOLATILE spinorFloat o20_im
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o32_im
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o10_im
#define READ_SPINOR_GHOST
#define ASSN_GAUGE_MATRIX
VOLATILE spinorFloat o11_im
VOLATILE spinorFloat o31_re
#define RECONSTRUCT_GAUGE_MATRIX
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o11_re
VOLATILE spinorFloat o21_re
VOLATILE spinorFloat o00_im
VOLATILE spinorFloat o12_im
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o30_im
VOLATILE spinorFloat o32_re
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o30_re