13 template <
int nDim, QudaDWFPCType type,
int dim,
int nLayers,
int face_num,
typename Param>
31 int aux2 = aux1 /
param.dc.face_Y[
dim];
32 int aux3 = aux2 /
param.dc.face_Z[
dim];
33 int aux4 = (nDim == 5 ? aux3 /
param.dc.face_T[
dim] : 0);
35 int y = aux1 - aux2 *
param.dc.face_Y[
dim];
36 int z = aux2 - aux3 *
param.dc.face_Z[
dim];
37 int t = aux3 - aux4 *
param.dc.face_T[
dim];
43 }
else if (!(
param.dc.face_Y[
dim] & 1)) {
51 }
else if (!(
param.dc.face_Z[
dim] & 1)) {
58 }
else if (!(
param.dc.face_T[
dim]) && nDim == 5) {
109 template <
int dim,
int nLayers,
int face_num,
typename Param>
115 int face_X =
X[0], face_Y =
X[1], face_Z =
X[2];
130 int face_XYZ = face_X * face_Y * face_Z;
131 int face_XY = face_X * face_Y;
147 int aux2 = aux1 / face_Y;
148 int y = aux1 - aux2 * face_Y;
149 int t = aux2 / face_Z;
150 int z = aux2 -
t * face_Z;
152 }
else if (!(face_Y & 1)) {
156 }
else if (!(face_Z & 1)) {
168 int gap =
X[
dim] - nLayers;
206 template <
int dim,
int nLayers,
int face_num,
typename Param>
210 const auto *dims =
param.dc.dims[
dim];
211 const auto &V4 =
param.dc.volume_4d;
220 int s = face_idx_in /
param.dc.face_XYZT[
dim];
225 int aux2 = aux1 / dims[1];
226 int y = aux1 - aux2 * dims[1];
227 int t = aux2 / dims[2];
228 int z = aux2 -
t * dims[2];
232 int gap =
X[
dim] - nLayers;
257 return (
idx +
s*V4) >> 1;
275 template <
int dim,
int nLayers,
int face_num,
typename Param>
283 int V =
X[0]*
X[1]*
X[2]*
X[3];
284 int face_X =
X[0], face_Y =
X[1], face_Z =
X[2];
288 dims[0]=
X[1]; dims[1]=
X[2]; dims[2]=
X[3];
292 dims[0]=
X[0];dims[1]=
X[2]; dims[2]=
X[3];
296 dims[0]=
X[0]; dims[1]=
X[1]; dims[2]=
X[3];
300 dims[0]=
X[0]; dims[1]=
X[1]; dims[2]=
X[3];
303 int face_XYZ = face_X * face_Y * face_Z;
304 int face_XY = face_X * face_Y;
313 int aux2 = aux1 / dims[1];
314 int y = aux1 - aux2 * dims[1];
315 int t = aux2 / dims[2];
316 int z = aux2 -
t * dims[2];
322 int gap =
X[
dim] - nLayers - 2*
R[
dim];
361 template<KernelType dim,
int nLayers,
int Dir,
typename Param>
375 x[3] =
zb -
x[0]*
X[3];
377 x[0] += ((
x[0] >= nLayers) ? (
X[0] - 2*nLayers) : 0);
379 x[0] += (
X[0] - nLayers);
381 x[1] = 2*x1h + ((
x[0] +
x[2] +
x[3] +
param.parity) & 1);
389 x[3] =
zb -
x[1]*
X[3];
391 x[1] += ((
x[1] >= nLayers) ? (
X[1] - 2*nLayers) : 0);
393 x[1] += (
X[1] - nLayers);
395 x[0] = 2*
x0h + ((
x[1] +
x[2] +
x[3] +
param.parity) & 1);
403 x[3] =
zb -
x[2]*
X[3];
405 x[2] += ((
x[2] >= nLayers) ? (
X[2] - 2*nLayers) : 0);
407 x[2] += (
X[2] - nLayers);
409 x[0] = 2*
x0h + ((
x[1] +
x[2] +
x[3] +
param.parity) & 1);
417 x[2] =
zb -
x[3]*
X[2];
419 x[3] += ((
x[3] >= nLayers) ? (
X[3] - 2*nLayers) : 0);
421 x[3] += (
X[3] - nLayers);
423 x[0] = 2*
x0h + ((
x[1] +
x[2] +
x[3] +
param.parity) & 1);
441 template <
int nDim, QudaDWFPCType type,
int dim_,
int nLayers,
typename Int,
typename Param>
448 const auto &face_X =
param.dc.face_X[
dim];
449 const auto &face_Y =
param.dc.face_Y[
dim];
450 const auto &face_Z =
param.dc.face_Z[
dim];
451 const auto &face_T =
param.dc.face_T[
dim];
452 const auto &face_XYZT =
param.dc.face_XYZT[
dim];
453 const auto &face_XYZ =
param.dc.face_XYZ[
dim];
454 const auto &face_XY =
param.dc.face_XY[
dim];
471 int aux2 = aux1 / face_Y;
472 int aux3 = aux2 / face_Z;
473 int aux4 = (nDim == 5 ? aux3 / face_T : 0);
476 x[1] = aux1 - aux2 * face_Y;
477 x[2] = aux2 - aux3 * face_Z;
478 x[3] = aux3 - aux4 * face_T;
481 if (type ==
QUDA_5D_PC)
x[0] += (face_parity +
x[4] +
x[3] +
x[2] +
x[1]) & 1;
482 else x[0] += (face_parity +
x[3] +
x[2] +
x[1]) & 1;
484 }
else if (!(face_Y & 1)) {
486 x[4] = (nDim == 5 ?
face_idx / face_XYZT : 0);
491 else face_idx += (face_parity +
x[3] +
x[2]) & 1;
496 }
else if (!(face_Z & 1)) {
498 x[4] = (nDim == 5 ?
face_idx / face_XYZT : 0);
510 x[4] = (nDim == 5 ?
face_idx / face_XYZT : 0);
523 idx =
X[0]*(
X[1]*(
X[2]*(
X[3]*
x[4] +
x[3]) +
x[2]) +
x[1]) +
x[0];
549 template <
int nDim, QudaDWFPCType pc_type, IndexType
idxType,
typename T,
typename Param>
614 int XYZT =
param.dc.Vh << 1;
615 int XYZ =
param.dc.X3X2X1;
616 int XY =
param.dc.X2X1;
626 #if DSLASH_TUNE_TILE // tiled indexing - experimental and disabled for now 628 const auto *grid =
param.grid;
632 for (
int i=0;
i<4;
i++) aux[
i+1] = aux[
i] /
block[
i];
633 for (
int i=4;
i<8;
i++) aux[
i+1] = aux[
i] / grid[
i];
636 for (
int i=0;
i<4;
i++)
x[
i] +=
block[
i]*(aux[
i+4] - aux[
i+5] * grid[
i]);
637 x[4] = (nDim == 5) ? aux[8] : 0;
643 cb_idx = (nDim == 5 ? (((
x[4]*
X[3]+
x[3])*
X[2]+
x[2])*
X[1]+
x[1])*
X[0]+
x[0] : ((
x[3]*
X[2]+
x[2])*
X[1]+
x[1])*
X[0]+
x[0]) >> 1;
644 idx = 2*cb_idx + oddbit;
648 for (
int i=0;
i<4;
i++) aux[
i+1] = aux[
i] /
X[
i];
650 x[0] = aux[0] - aux[1] *
X[0];
651 x[1] = aux[1] - aux[2] *
X[1];
652 x[2] = aux[2] - aux[3] *
X[2];
653 x[3] = aux[3] - (nDim == 5 ? aux[4] *
X[3] : 0);
654 x[4] = (nDim == 5) ? aux[4] : 0;
661 }
else if (idxType ==
EVEN_Y ) {
663 x[3] = (
idx / XYZ) %
X[3];
664 x[2] = (
idx / XY) %
X[2];
666 x[1] = (
idx /
X[0]) %
X[1];
668 }
else if (idxType ==
EVEN_Z ) {
670 x[3] = (
idx / XYZ) %
X[3];
672 x[2] = (
idx / XY) %
X[2];
673 x[1] = (
idx /
X[0]) %
X[1];
679 x[2] = (
idx / XY) %
X[2];
680 x[1] = (
idx /
X[0]) %
X[1];
696 template <IndexType
idxType,
typename Int,
typename Param>
701 int xt = blockIdx.x*
blockDim.x + threadIdx.x;
704 x[0] = aux -
x[3]*
X[0];
705 x[1] = blockIdx.y*
blockDim.y + threadIdx.y;
706 x[2] = blockIdx.z*
blockDim.z + threadIdx.z;
707 x[0] += (
param.parity +
x[3] +
x[2] +
x[1]) &1;
708 idx = ((
x[3]*
X[2] +
x[2])*
X[1] +
x[1])*
X[0] +
x[0];
726 template <
int dim,
typename T>
752 template <
typename T>
753 static inline __device__
bool isActive(
const int threadDim,
int offsetDim,
int offset,
const int y[],
const int partitioned[],
const T
X[])
760 if(!partitioned[offsetDim])
return false;
762 if(threadDim < offsetDim)
return false;
770 if(!partitioned[3])
break;
771 if(partitioned[3] && inBoundary<3>(
width,
y,
X))
return false;
775 if((!partitioned[3]) && (!partitioned[2]))
break;
776 if(partitioned[3] && inBoundary<3>(
width,
y,
X))
return false;
777 if(partitioned[2] && inBoundary<2>(
width,
y,
X))
return false;
781 if((!partitioned[3]) && (!partitioned[2]) && (!partitioned[1]))
break;
782 if(partitioned[3] && inBoundary<3>(
width,
y,
X))
return false;
783 if(partitioned[2] && inBoundary<2>(
width,
y,
X))
return false;
784 if(partitioned[1] && inBoundary<1>(
width,
y,
X))
return false;
807 template <
int nDim=4,
typename Param>
811 const int s = (nDim == 5 ? tid/
param.threads : 0);
833 template <
int nDim=4,
typename Param>
845 template<
int nDim,
int nLayers,
typename I,
typename Param>
849 int y[5] = {
x[0],
x[1],
x[2],
x[3], (nDim==5 ?
x[4] : 0)};
851 y[face_dim] = (
y[face_dim] < nLayers) ?
y[face_dim] :
y[face_dim] - (D[face_dim] - nLayers);
852 D[face_dim] = nLayers;
854 if (nDim == 5)
face_idx = (((((D[3]*
y[4] +
y[3])*D[2] +
y[2])*D[1] +
y[1])*D[0] +
y[0]) >> 1);
855 else if (nDim == 4)
face_idx = ((((D[2]*
y[3] +
y[2])*D[1] +
y[1])*D[0] +
y[0]) >> 1);
882 template <
typename T>
889 if (blockIdx.x < gridp) {
891 const int i = blockIdx.x % swizzle;
892 const int j = blockIdx.x / swizzle;
912 float sign = signbit(
a) ? -1.0f : 1.0f;
913 float power = __powf(
fabsf(
a),
b);
914 return b&1 ?
sign * power : power;
static __device__ void coordsFromFaceIndexStaggered(int x[], int idx, const Param ¶m)
Compute the full-lattice coordinates from the input face index. This is used by the staggered halo up...
size_t const void size_t size_t width
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
static __device__ bool isActive(const int threadDim, int offsetDim, int offset, const int y[], const int partitioned[], const T X[])
Compute whether this thread should be active for updating a given offsetDim halo. This is used by the fused halo region update kernels: here every thread has a prescribed dimension it is tasked with updating, but for the edges and vertices, the thread responsible for the entire update is the "greatest" one. Hence some threads may be labeled as a given dimension, but they have to update other dimensions too. Conversely, a given thread may be labeled for a given dimension, but if that thread lies at an edge or vertex, and we have partitioned a higher dimension, then that thread will cede to the higher thread.
static __device__ __forceinline__ void coordsFromIndex3D(int &idx, Int *const x, int &cb_idx, const Param ¶m)
Compute coordinates from index into the checkerboard (used by the interior Dslash kernels)...
static __device__ int indexFromFaceIndexExtended(int face_idx, const Param ¶m)
Compute global extended checkerboard index from face index. The following indexing routines work for ...
__device__ int dimFromFaceIndex(int &face_idx, const int tid, const Param ¶m)
Determines which face a given thread is computing. Also rescale face_idx so that is relative to a giv...
static __device__ int indexFromFaceIndexStaggered(int face_idx_in, const Param ¶m)
Compute global checkerboard index from face index. The following indexing routines work for arbitrary...
static __device__ void coordsFromFaceIndex(int &idx, int &cb_idx, Int *const x, int face_idx, const int &face_num, const Param ¶m)
Compute the full-lattice coordinates from the input face index. This is used by the Wilson-like halo ...
static __device__ __forceinline__ void coordsFromIndex(int &idx, T *x, int &cb_idx, const Param ¶m)
Compute coordinates from index into the checkerboard (used by the interior Dslash kernels)...
static __device__ void faceIndexFromCoords(int &face_idx, I *const x, int face_dim, const Param ¶m)
Compute the face index from the lattice coordinates.
static __device__ bool inBoundary(const int depth, const int coord[], const T X[])
Compute whether the provided coordinate is within the halo region boundary of a given dimension...
__device__ float __fast_pow(float a, int b)
static __device__ int indexFromFaceIndex(int face_idx, const Param ¶m)
Compute global checkerboard index from face index. The following indexing routines work for arbitrary...
static __device__ int indexFromFaceIndexExtendedStaggered(int face_idx, const Param ¶m)
Compute global extended checkerboard index from face index. The following indexing routines work for ...
__device__ int block_idx(const T &swizzle)
Swizzler for reordering the (x) thread block indices - use on conjunction with swizzle-factor autotun...