template <int dim, int nLayers, int face_num, typename Param>
static __device__ int indexFromFaceIndexExtended(int face_idx, const Param &param)
{
  const auto *X = param.dc.X;
  const auto *R = param.R;

  // dimensions of the face: the face is nLayers deep in the face dimension
  int face_X = X[0], face_Y = X[1], face_Z = X[2];
  switch (dim) {
  case 0: face_X = nLayers; break;
  case 1: face_Y = nLayers; break;
  case 2: face_Z = nLayers; break;
  }
  int face_XYZ = face_X * face_Y * face_Z;
  int face_XY = face_X * face_Y;

  // intrinsic parity of the face depends on the offset of its first element
  int face_parity = (param.parity + face_num * (X[dim] - nLayers)) & 1;

  // reconstruct the full face index from the checkerboard face index
  face_idx *= 2;

  if (!(face_X & 1)) { // face_X even
    int aux1 = face_idx / face_X;
    int aux2 = aux1 / face_Y;
    int y = aux1 - aux2 * face_Y;
    int t = aux2 / face_Z;
    int z = aux2 - t * face_Z;
    face_idx += (face_parity + t + z + y) & 1;
  } else if (!(face_Y & 1)) { // face_Y even
    int t = face_idx / face_XYZ;
    int z = (face_idx / face_XY) % face_Z;
    face_idx += (face_parity + t + z) & 1;
  } else if (!(face_Z & 1)) { // face_Z even
    int t = face_idx / face_XYZ;
    face_idx += (face_parity + t) & 1;
  } else {
    face_idx += face_parity;
  }

  // compute the index into the full local (extended) volume
  int idx = face_idx;
  int aux;
  int gap = X[dim] - nLayers;
  switch (dim) {
  case 0:
    aux = face_idx / face_X;
    idx += (aux + face_num)*gap + (1 - 2*face_num)*R[0];
    break;
  case 1:
    aux = face_idx / face_XY;
    idx += ((aux + face_num)*gap + (1 - 2*face_num)*R[1])*face_X;
    break;
  case 2:
    aux = face_idx / face_XYZ;
    idx += ((aux + face_num)*gap + (1 - 2*face_num)*R[2])*face_XY;
    break;
  case 3:
    idx += (face_num*gap + (1 - 2*face_num)*R[3])*face_XYZ;
    break;
  }

  // return the index into the checkerboard
  return idx >> 1;
}
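To see what the dim == 3 branch does in isolation, the following host-side sketch (not part of the library code; the lattice dimensions, border radius and face thickness are made-up values) checks that adding (face_num*gap + (1 - 2*face_num)*R[3]) * X[0]*X[1]*X[2] to a face-local lexicographic index reproduces the full-lattice index of the corresponding site: the backward face starts at t = R[3] and the forward face ends R[3] sites inside the extended boundary. The checkerboard packing (the final >> 1) is ignored here.

#include <cassert>

int main()
{
  const int X[4] = {8, 8, 8, 16}; // extended local dimensions (made-up)
  const int R3 = 2;               // border width in T (made-up)
  const int nLayers = 1;          // face thickness
  const int XYZ = X[0] * X[1] * X[2];
  const int gap = X[3] - nLayers;

  for (int face_num = 0; face_num <= 1; face_num++)
    for (int layer = 0; layer < nLayers; layer++)
      for (int z = 0; z < X[2]; z++)
        for (int y = 0; y < X[1]; y++)
          for (int x = 0; x < X[0]; x++) {
            // T coordinate this face site corresponds to on the full (extended) lattice
            int t = (face_num == 0) ? R3 + layer : X[3] - R3 - nLayers + layer;
            int full_idx = ((t * X[2] + z) * X[1] + y) * X[0] + x;
            // face-local lexicographic index plus the dim == 3 offset used above
            int face_idx = ((layer * X[2] + z) * X[1] + y) * X[0] + x;
            int idx = face_idx + (face_num * gap + (1 - 2 * face_num) * R3) * XYZ;
            assert(idx == full_idx);
          }
  return 0;
}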
template <int dim, int nLayers, int face_num, typename Param>
static __device__ int indexFromFaceIndexStaggered(int face_idx_in, const Param &param)
{
  const auto *X = param.dc.X;             // lattice dimensions
  const auto *dims = param.dc.dims[dim];  // dimensions of the face
  const auto &V4 = param.dc.volume_4d;    // 4-d volume

  // intrinsic parity of the face depends on the offset of its first element
  int face_parity = (param.parity + face_num * (X[dim] - nLayers)) & 1;

  // reconstruct the full face index from the checkerboard face index
  face_idx_in *= 2;

  // split off the part of the index above the 4-d face volume, then find the 4-d face index
  int s = face_idx_in / param.dc.face_XYZT[dim];
  int face_idx = face_idx_in - s*param.dc.face_XYZT[dim];

  // y, z, t here are face coordinates in the permuted (face-local) order
  int aux1 = face_idx / dims[0];
  int aux2 = aux1 / dims[1];
  int y = aux1 - aux2 * dims[1];
  int t = aux2 / dims[2];
  int z = aux2 - t * dims[2];
  face_idx += (face_parity + t + z + y) & 1;

  // compute the index into the full local volume
  int gap = X[dim] - nLayers;
  int idx = face_idx;
  int aux;
  switch (dim) {
  case 0:
    aux = face_idx;
    idx += face_num*gap + aux*(X[0]-1);
    idx += (idx/V4)*(1-V4);
    break;
  case 1:
    aux = face_idx / param.dc.face_X[dim];
    idx += face_num * gap * param.dc.face_X[dim] + aux*(X[1]-1)*param.dc.face_X[dim];
    idx += (idx/V4)*(X[0]-V4);
    break;
  case 2:
    aux = face_idx / param.dc.face_XY[dim];
    idx += face_num * gap * param.dc.face_XY[dim] + aux*(X[2]-1)*param.dc.face_XY[dim];
    idx += (idx/V4)*((X[1]*X[0])-V4);
    break;
  case 3:
    idx += face_num * gap * param.dc.face_XYZ[dim];
    break;
  }

  // return the index into the checkerboard, with the s index stacked above the 4-d volume
  return (idx + s*V4) >> 1;
}
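The aux1/aux2 chain above extracts three coordinates with two divisions instead of the more obvious mix of / and %. A small host check of that equivalence, using arbitrary made-up face extents:

#include <cassert>

int main()
{
  const int d0 = 6, d1 = 5, d2 = 4, d3 = 3; // arbitrary face extents (made-up)

  for (int i = 0; i < d0 * d1 * d2 * d3; i++) {
    // chained form, as used in the kernel
    int aux1 = i / d0;
    int aux2 = aux1 / d1;
    int y = aux1 - aux2 * d1;
    int t = aux2 / d2;
    int z = aux2 - t * d2;

    // direct form with explicit modulo operations
    assert(y == (i / d0) % d1);
    assert(z == (i / (d0 * d1)) % d2);
    assert(t == i / (d0 * d1 * d2));
  }
  return 0;
}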
template <int dim, int nLayers, int face_num, typename Param>
static __device__ int indexFromFaceIndexExtendedStaggered(int face_idx, const Param &param)
{
  const auto *X = param.dc.X;
  const auto *R = param.R;

  // dimensions of the face and of the full local volume
  int dims[3];
  int V = X[0]*X[1]*X[2]*X[3];
  int face_X = X[0], face_Y = X[1], face_Z = X[2];
  switch (dim) {
  case 0:
    face_X = nLayers;
    dims[0]=X[1]; dims[1]=X[2]; dims[2]=X[3];
    break;
  case 1:
    face_Y = nLayers;
    dims[0]=X[0]; dims[1]=X[2]; dims[2]=X[3];
    break;
  case 2:
    face_Z = nLayers;
    dims[0]=X[0]; dims[1]=X[1]; dims[2]=X[3];
    break;
  case 3:
    dims[0]=X[0]; dims[1]=X[1]; dims[2]=X[3];
    break;
  }
  int face_XYZ = face_X * face_Y * face_Z;
  int face_XY = face_X * face_Y;

  // intrinsic parity of the face depends on the offset of its first element
  int face_parity = (param.parity + face_num *(X[dim] - nLayers)) & 1;

  // reconstruct the full face index from the checkerboard face index
  face_idx *= 2;

  int aux1 = face_idx / dims[0];
  int aux2 = aux1 / dims[1];
  int y = aux1 - aux2 * dims[1];
  int t = aux2 / dims[2];
  int z = aux2 - t * dims[2];
  face_idx += (face_parity + t + z + y) & 1;

  // compute the index into the full local volume, skipping the border of width R[dim]
  int gap = X[dim] - nLayers - 2*R[dim];
  int idx = face_idx;
  int aux;
  switch (dim) {
  case 0:
    aux = face_idx;
    idx += face_num*gap + aux*(X[0]-1);
    idx += (idx/V)*(1-V);
    break;
  case 1:
    aux = face_idx / face_X;
    idx += face_num * gap * face_X + aux*(X[1]-1)*face_X;
    idx += idx/V*(X[0]-V);
    break;
  case 2:
    aux = face_idx / face_XY;
    idx += face_num * gap * face_XY + aux*(X[2]-1)*face_XY;
    idx += idx/V*(face_XY-V);
    break;
  case 3:
    idx += ((face_num*gap) + R[3])*face_XYZ;
    break;
  }

  // return the index into the checkerboard
  return idx >> 1;
}
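The face_idx += (face_parity + t + z + y) & 1 fix-up in these routines relies on the innermost face extent being even: face indices 2k and 2k+1 then differ only in the innermost coordinate, so exactly one of the pair has the requested parity. A brute-force host check of that claim with made-up extents (innermost one even):

#include <cassert>

int main()
{
  // made-up face extents; the innermost one (d0) must be even for the trick to work
  const int d0 = 4, d1 = 3, d2 = 5, d3 = 3;
  const int face_vol = d0 * d1 * d2 * d3;

  for (int face_parity = 0; face_parity < 2; face_parity++)
    for (int cb = 0; cb < face_vol / 2; cb++) {
      int face_idx = 2 * cb;
      // same decomposition as in the kernel (coordinates of the even member of the pair)
      int aux1 = face_idx / d0;
      int aux2 = aux1 / d1;
      int y = aux1 - aux2 * d1;
      int t = aux2 / d2;
      int z = aux2 - t * d2;
      face_idx += (face_parity + t + z + y) & 1;

      // the selected site's coordinate sum has the requested parity
      int c0 = face_idx % d0;
      int c1 = (face_idx / d0) % d1;
      int c2 = (face_idx / (d0 * d1)) % d2;
      int c3 = face_idx / (d0 * d1 * d2);
      assert(((c0 + c1 + c2 + c3) & 1) == face_parity);
    }
  return 0;
}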
template <KernelType dim, int nLayers, int Dir, typename Param>
static __device__ void coordsFromFaceIndexStaggered(int x[], int idx, const Param &param)
{
  const auto *X = param.dc.X;
  const auto *Xh = param.dc.Xh;

  int za, x1h, x0h, zb;
  switch (dim) {
  case EXTERIOR_KERNEL_X:
    za = idx / Xh[1];
    x1h = idx - za*Xh[1];
    zb = za / X[2];
    x[2] = za - zb*X[2];
    x[0] = zb / X[3];
    x[3] = zb - x[0]*X[3];
    if (Dir == 2) x[0] += ((x[0] >= nLayers) ? (X[0] - 2*nLayers) : 0); // faces on both ends
    else if (Dir == 1) x[0] += (X[0] - nLayers);                        // forward face only
    x[1] = 2*x1h + ((x[0] + x[2] + x[3] + param.parity) & 1);
    break;
  case EXTERIOR_KERNEL_Y:
    za = idx / Xh[0];
    x0h = idx - za*Xh[0];
    zb = za / X[2];
    x[2] = za - zb*X[2];
    x[1] = zb / X[3];
    x[3] = zb - x[1]*X[3];
    if (Dir == 2) x[1] += ((x[1] >= nLayers) ? (X[1] - 2*nLayers) : 0);
    else if (Dir == 1) x[1] += (X[1] - nLayers);
    x[0] = 2*x0h + ((x[1] + x[2] + x[3] + param.parity) & 1);
    break;
  case EXTERIOR_KERNEL_Z:
    za = idx / Xh[0];
    x0h = idx - za*Xh[0];
    zb = za / X[1];
    x[1] = za - zb*X[1];
    x[2] = zb / X[3];
    x[3] = zb - x[2]*X[3];
    if (Dir == 2) x[2] += ((x[2] >= nLayers) ? (X[2] - 2*nLayers) : 0);
    else if (Dir == 1) x[2] += (X[2] - nLayers);
    x[0] = 2*x0h + ((x[1] + x[2] + x[3] + param.parity) & 1);
    break;
  case EXTERIOR_KERNEL_T:
    za = idx / Xh[0];
    x0h = idx - za*Xh[0];
    zb = za / X[1];
    x[1] = za - zb*X[1];
    x[3] = zb / X[2];
    x[2] = zb - x[3]*X[2];
    if (Dir == 2) x[3] += ((x[3] >= nLayers) ? (X[3] - 2*nLayers) : 0);
    else if (Dir == 1) x[3] += (X[3] - nLayers);
    x[0] = 2*x0h + ((x[1] + x[2] + x[3] + param.parity) & 1);
    break;
  }
}
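In the EXTERIOR_KERNEL_X branch the divisions imply the packing order x1h (checkerboarded Y), then Z, then T, then the layer index, and x[1] = 2*x1h + parity bit completes the Y coordinate so the site lands on the requested parity. A host round-trip check of that unpacking, with made-up dimensions and the Dir-dependent shift of x[0] left out:

#include <cassert>

int main()
{
  const int X[4] = {6, 8, 4, 10}; // made-up local dimensions, X[1] even
  const int Xh1 = X[1] / 2;
  const int nLayers = 3;

  for (int parity = 0; parity < 2; parity++)
    for (int layer = 0; layer < nLayers; layer++)
      for (int x3 = 0; x3 < X[3]; x3++)
        for (int x2 = 0; x2 < X[2]; x2++)
          for (int x1h = 0; x1h < Xh1; x1h++) {
            // pack as ((layer*X[3] + x3)*X[2] + x2)*Xh[1] + x1h, the order the kernel assumes
            int idx = ((layer * X[3] + x3) * X[2] + x2) * Xh1 + x1h;

            // unpack exactly as in the EXTERIOR_KERNEL_X branch
            int za = idx / Xh1;
            int r1h = idx - za * Xh1;
            int zb = za / X[2];
            int r2 = za - zb * X[2];
            int r0 = zb / X[3];         // layer index (before the Dir-dependent shift)
            int r3 = zb - r0 * X[3];
            int r1 = 2 * r1h + ((r0 + r2 + r3 + parity) & 1);

            assert(r0 == layer && r2 == x2 && r3 == x3 && r1 / 2 == x1h);
            assert(((r0 + r1 + r2 + r3) & 1) == parity); // site has the requested parity
          }
  return 0;
}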
template <int nDim, QudaPCType pc_type, IndexType idxType, typename T, typename Param>
static __device__ __forceinline__ void coordsFromIndex(int &idx, T *x, int &cb_idx, const Param &param)
{
  const auto *X = param.dc.X;

  int XYZT = param.dc.Vh << 1;
  int XYZ = param.dc.X3X2X1;
  int XY = param.dc.X2X1;

  // reconstruct the full lexicographic index from the checkerboard index;
  // the parity bit is added once enough coordinates are known
  idx = 2*cb_idx;

  if (idxType == EVEN_X) { // X even
#if DSLASH_TUNE_TILE // tiled indexing - experimental and disabled for now
    const auto *block = param.block;
    const auto *grid = param.grid;

    int aux[9];
    aux[0] = idx;
    for (int i=0; i<4; i++) aux[i+1] = aux[i] / block[i];
    for (int i=4; i<8; i++) aux[i+1] = aux[i] / grid[i];

    for (int i=0; i<4; i++) x[i] = aux[i] - aux[i+1] * block[i];
    for (int i=0; i<4; i++) x[i] += block[i]*(aux[i+4] - aux[i+5] * grid[i]);
    x[4] = (nDim == 5) ? aux[8] : 0;

    int y = x[1], z = x[2], t = x[3], s = x[4];
    int oddbit = (pc_type == QUDA_4D_PC ? (param.parity + t + z + y) : (param.parity + s + t + z + y)) & 1;

    cb_idx = (nDim == 5 ? (((x[4]*X[3]+x[3])*X[2]+x[2])*X[1]+x[1])*X[0]+x[0] :
                           ((x[3]*X[2]+x[2])*X[1]+x[1])*X[0]+x[0]) >> 1;
    idx = 2*cb_idx + oddbit;
#else
    int aux[5];
    aux[0] = idx;
    for (int i=0; i<4; i++) aux[i+1] = aux[i] / X[i];

    x[0] = aux[0] - aux[1] * X[0];
    x[1] = aux[1] - aux[2] * X[1];
    x[2] = aux[2] - aux[3] * X[2];
    x[3] = aux[3] - (nDim == 5 ? aux[4] * X[3] : 0);
    x[4] = (nDim == 5) ? aux[4] : 0;

    int oddbit = (pc_type == QUDA_4D_PC ? (param.parity + x[3] + x[2] + x[1]) :
                                          (param.parity + x[4] + x[3] + x[2] + x[1])) & 1;
    x[0] += oddbit;
    idx += oddbit;
#endif
  } else if (idxType == EVEN_Y) { // Y even
    x[3] = (idx / XYZ) % X[3];
    x[2] = (idx / XY) % X[2];
    idx += (param.parity + x[3] + x[2]) & 1;
    x[1] = (idx / X[0]) % X[1];
    x[0] = idx % X[0];
  } else if (idxType == EVEN_Z) { // Z even
    x[3] = (idx / XYZ) % X[3];
    idx += (param.parity + x[3]) & 1;
    x[2] = (idx / XY) % X[2];
    x[1] = (idx / X[0]) % X[1];
    x[0] = idx % X[0];
  } else { // T even
    x[4] = idx / XYZT;
    idx += (param.parity + x[4]) & 1;
    x[3] = (idx / XYZ) % X[3];
    x[2] = (idx / XY) % X[2];
    x[1] = (idx / X[0]) % X[1];
    x[0] = idx % X[0];
  }
}
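A quick host round trip for the EVEN_X (non-tiled) branch: start from 4-d coordinates of a given parity, halve the lexicographic index to get the checkerboard index, and recover the coordinates with the same division chain and parity bit. Dimensions are made up, with X[0] even as EVEN_X assumes:

#include <cassert>

int main()
{
  const int X[4] = {4, 3, 5, 3}; // made-up dimensions; X[0] even

  for (int parity = 0; parity < 2; parity++)
    for (int t = 0; t < X[3]; t++)
      for (int z = 0; z < X[2]; z++)
        for (int y = 0; y < X[1]; y++)
          for (int xc = 0; xc < X[0]; xc++) {
            if (((xc + y + z + t) & 1) != parity) continue; // keep sites of this parity
            int full = ((t * X[2] + z) * X[1] + y) * X[0] + xc;
            int cb_idx = full >> 1;

            // reconstruction, as in the EVEN_X (non-tiled) branch above
            int idx = 2 * cb_idx;
            int aux1 = idx / X[0], aux2 = aux1 / X[1], aux3 = aux2 / X[2];
            int x0 = idx - aux1 * X[0];
            int x1 = aux1 - aux2 * X[1];
            int x2 = aux2 - aux3 * X[2];
            int x3 = aux3;
            int oddbit = (parity + x3 + x2 + x1) & 1;
            x0 += oddbit;
            idx += oddbit;

            assert(x0 == xc && x1 == y && x2 == z && x3 == t && idx == full);
          }
  return 0;
}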
template <IndexType idxType, typename Int, typename Param>
static __device__ __forceinline__ void coordsFromIndex3D(int &idx, Int *const x, int &cb_idx, const Param &param)
{
  const auto *X = param.x;

  if (idxType == EVEN_X) { // X even
    // the x thread dimension covers the (checkerboarded) X-T plane
    int xt = blockIdx.x*blockDim.x + threadIdx.x;
    int aux = xt + xt;
    x[3] = aux / X[0];
    x[0] = aux - x[3]*X[0];
    x[1] = blockIdx.y*blockDim.y + threadIdx.y;
    x[2] = blockIdx.z*blockDim.z + threadIdx.z;
    x[0] += (param.parity + x[3] + x[2] + x[1]) & 1;
    idx = ((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0];
    cb_idx = idx >> 1;
  }
}
template <int dim, typename T>
static inline __device__ bool inBoundary(const int depth, const int coord[], const T X[]) {
  return ((coord[dim] >= X[dim] - depth) || (coord[dim] < depth));
}
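A trivial host analogue (hypothetical name inBoundaryHost) showing which coordinates count as boundary for a halo depth of 1:

#include <cassert>

// host analogue of inBoundary<dim>(depth, coord, X) (hypothetical name)
static bool inBoundaryHost(int dim, int depth, const int coord[], const int X[])
{
  return (coord[dim] >= X[dim] - depth) || (coord[dim] < depth);
}

int main()
{
  const int X[4] = {8, 8, 8, 8};
  int site[4] = {3, 5, 0, 7};
  assert(inBoundaryHost(2, 1, site, X));  // z = 0 lies in the backward z halo
  assert(inBoundaryHost(3, 1, site, X));  // t = 7 lies in the forward t halo
  assert(!inBoundaryHost(0, 1, site, X)); // x = 3 is in the interior
  return 0;
}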
template <typename T>
static inline __device__ bool isActive(const int threadDim, int offsetDim, int offset,
                                       const int y[], const int partitioned[], const T X[])
{
  // Threads with threadDim = t can handle t, z, y, x offsets
  // Threads with threadDim = z can handle z, y, x offsets
  // Threads with threadDim = y can handle y, x offsets
  // Threads with threadDim = x can handle x offsets
  if (!partitioned[offsetDim]) return false;

  if (threadDim < offsetDim) return false;
  int width = (offset > 0) ? offset : -offset;

  switch (threadDim) {
  case 3: // threadDim = T: nothing higher to cede to
    break;
  case 2: // threadDim = Z
    if (!partitioned[3]) break;
    if (partitioned[3] && inBoundary<3>(width, y, X)) return false;
    break;
  case 1: // threadDim = Y
    if ((!partitioned[3]) && (!partitioned[2])) break;
    if (partitioned[3] && inBoundary<3>(width, y, X)) return false;
    if (partitioned[2] && inBoundary<2>(width, y, X)) return false;
    break;
  case 0: // threadDim = X
    if ((!partitioned[3]) && (!partitioned[2]) && (!partitioned[1])) break;
    if (partitioned[3] && inBoundary<3>(width, y, X)) return false;
    if (partitioned[2] && inBoundary<2>(width, y, X)) return false;
    if (partitioned[1] && inBoundary<1>(width, y, X)) return false;
    break;
  }

  return true;
}
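A condensed host restatement of the rule (hypothetical names, with the per-case switch collapsed into a loop over the higher dimensions): for a site sitting in both the Z and T halos, the thread labelled Z cedes the update to the thread labelled T whenever T is partitioned.

#include <cassert>
#include <cstdlib>

static bool inBoundaryHost(int dim, int depth, const int coord[], const int X[])
{
  return (coord[dim] >= X[dim] - depth) || (coord[dim] < depth);
}

// condensed restatement of isActive: a thread labelled threadDim cedes to any
// partitioned higher dimension whose halo also contains the site
static bool isActiveHost(int threadDim, int offsetDim, int offset, const int y[],
                         const int partitioned[], const int X[])
{
  if (!partitioned[offsetDim]) return false;
  if (threadDim < offsetDim) return false;
  int width = std::abs(offset);
  for (int d = 3; d > threadDim; d--)
    if (partitioned[d] && inBoundaryHost(d, width, y, X)) return false;
  return true;
}

int main()
{
  const int X[4] = {8, 8, 8, 8};
  const int partitioned[4] = {0, 0, 1, 1}; // Z and T are partitioned
  const int corner[4] = {4, 4, 7, 7};      // in the forward Z and forward T halos (depth 1)

  // updating the Z halo (offsetDim = 2) at this corner site:
  assert(!isActiveHost(2, 2, +1, corner, partitioned, X)); // the Z-labelled thread cedes
  assert(isActiveHost(3, 2, +1, corner, partitioned, X));  // the T-labelled thread updates it
  return 0;
}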
template <int nDim, int nLayers, typename I, typename Param>
static __device__ void faceIndexFromCoords(int &face_idx, I *const x, int face_dim, const Param &param)
{
  int D[4] = {param.dc.X[0], param.dc.X[1], param.dc.X[2], param.dc.X[3]};
  int y[5] = {x[0], x[1], x[2], x[3], (nDim == 5 ? x[4] : 0)};

  // fold the coordinate in the face dimension onto the nLayers-deep face
  y[face_dim] = (y[face_dim] < nLayers) ? y[face_dim] : y[face_dim] - (D[face_dim] - nLayers);
  D[face_dim] = nLayers;

  if (nDim == 5)      face_idx = (((((D[3]*y[4] + y[3])*D[2] + y[2])*D[1] + y[1])*D[0] + y[0]) >> 1);
  else if (nDim == 4) face_idx = ((((D[2]*y[3] + y[2])*D[1] + y[1])*D[0] + y[0]) >> 1);
}
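A worked host example of the 4-d branch with made-up values: on an 8^4 local lattice with nLayers = 1 and face_dim = 3, the site (3, 2, 5, 7) lies on the forward T face, its T coordinate folds to 0, and the resulting checkerboard face index is 169.

#include <cassert>

int main()
{
  const int Xdim[4] = {8, 8, 8, 8};
  const int nLayers = 1, face_dim = 3;
  const int x[4] = {3, 2, 5, 7}; // a site on the forward T face

  int D[4] = {Xdim[0], Xdim[1], Xdim[2], Xdim[3]};
  int y[4] = {x[0], x[1], x[2], x[3]};

  // fold the coordinate in the face dimension onto the face, as in faceIndexFromCoords
  y[face_dim] = (y[face_dim] < nLayers) ? y[face_dim] : y[face_dim] - (D[face_dim] - nLayers);
  D[face_dim] = nLayers;

  int face_idx = (((D[2] * y[3] + y[2]) * D[1] + y[1]) * D[0] + y[0]) >> 1;

  assert(y[3] == 0);
  assert(face_idx == (((5 * 8 + 2) * 8 + 3) >> 1)); // = 339 >> 1 = 169
  return 0;
}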
__device__ float __fast_pow(float a, int b)
{
  float sign = signbit(a) ? -1.0f : 1.0f;
  float power = __powf(fabsf(a), b);
  return b&1 ? sign * power : power;
}
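The helper computes |a|^b with the fast intrinsic and restores the sign only for odd integer exponents. A plain host reference (hypothetical name fast_pow_ref) for spot-checking values:

#include <cassert>
#include <cmath>

// host reference for the sign handling: |a|^b, negated when b is odd and a is negative
static float fast_pow_ref(float a, int b)
{
  float sign = std::signbit(a) ? -1.0f : 1.0f;
  float power = std::pow(std::fabs(a), static_cast<float>(b));
  return (b & 1) ? sign * power : power;
}

int main()
{
  assert(std::fabs(fast_pow_ref(-2.0f, 3) - (-8.0f)) < 1e-5f);
  assert(std::fabs(fast_pow_ref(-2.0f, 2) - 4.0f) < 1e-5f);
  assert(std::fabs(fast_pow_ref(3.0f, 2) - 9.0f) < 1e-5f);
  return 0;
}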
static __device__ void coordsFromFaceIndexStaggered(int x[], int idx, const Param &param)
Compute the full-lattice coordinates from the input face index. This is used by the staggered halo up...
static __device__ bool isActive(const int threadDim, int offsetDim, int offset, const int y[], const int partitioned[], const T X[])
Compute whether this thread should be active for updating a given offsetDim halo. This is used by the fused halo region update kernels: every thread has a prescribed dimension it is tasked with updating, but for the edges and vertices the thread responsible for the entire update is the "greatest" one. Hence some threads may be labelled with a given dimension yet have to update other dimensions too. Conversely, a thread labelled with a given dimension that lies at an edge or vertex will cede to the higher thread if a higher dimension is partitioned.
static __device__ __forceinline__ void coordsFromIndex3D(int &idx, Int *const x, int &cb_idx, const Param &param)
Compute coordinates from index into the checkerboard (used by the interior Dslash kernels)...
static __device__ int indexFromFaceIndexExtended(int face_idx, const Param &param)
Compute global extended checkerboard index from face index. The following indexing routines work for ...
static __device__ int indexFromFaceIndexStaggered(int face_idx_in, const Param &param)
Compute global checkerboard index from face index. The following indexing routines work for arbitrary...
static __device__ __forceinline__ void coordsFromIndex(int &idx, T *x, int &cb_idx, const Param &param)
Compute coordinates from index into the checkerboard (used by the interior Dslash kernels)...
static __device__ void faceIndexFromCoords(int &face_idx, I *const x, int face_dim, const Param &param)
Compute the face index from the lattice coordinates.
static __device__ bool inBoundary(const int depth, const int coord[], const T X[])
Compute whether the provided coordinate is within the halo region boundary of a given dimension...
__device__ float __fast_pow(float a, int b)
static __device__ int indexFromFaceIndexExtendedStaggered(int face_idx, const Param &param)
Compute global extended checkerboard index from face index. The following indexing routines work for ...