12 template <
typename I,
typename J,
typename K>
13 static __device__ __host__
inline int linkIndexShift(
const I x[],
const J dx[],
const K
X[4]) {
16 for (
int i = 0; i < 4; i++ ) y[i] = (x[i] + dx[i] + X[i]) % X[i];
17 int idx = (((y[3] * X[2] + y[2]) * X[1] + y[1]) * X[0] + y[0]) >> 1;
30 template <
typename I,
typename J,
typename K>
31 static __device__ __host__
inline int linkIndexShift(I y[],
const I x[],
const J dx[],
const K
X[4]) {
33 for (
int i = 0; i < 4; i++ ) y[i] = (x[i] + dx[i] + X[i]) % X[i];
34 int idx = (((y[3] * X[2] + y[2]) * X[1] + y[1]) * X[0] + y[0]) >> 1;
46 static __device__ __host__
inline int linkIndex(
const int x[],
const I
X[4]) {
47 int idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
60 static __device__ __host__
inline int linkIndex(
int y[],
const int x[],
const I
X[4]) {
61 int idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
62 y[0] = x[0]; y[1] = x[1]; y[2] = x[2]; y[3] = x[3];
75 template <
typename I,
int n>
76 static __device__ __host__
inline int linkIndexDn(
const int x[],
const I
X[4],
const int mu)
80 for (
int i = 0; i < 4; i++ ) y[i] = x[i];
81 y[
mu] = (y[
mu] + n + X[
mu]) % X[mu];
82 int idx = (((y[3] * X[2] + y[2]) * X[1] + y[1]) * X[0] + y[0]) >> 1;
94 template <
typename I>
static __device__ __host__
inline int linkIndexM1(
const int x[],
const I
X[4],
const int mu)
107 template <
typename I>
static __device__ __host__
inline int linkIndexM3(
const int x[],
const I
X[4],
const int mu)
120 template <
typename I>
124 for (
int i = 0; i < 4; i++ ) y[i] = x[i];
125 y[
mu] = (y[
mu] + 1 + X[
mu]) % X[mu];
126 int idx = ((y[3] * X[2] + y[2]) * X[1] + y[1]) * X[0] + y[0];
138 template <
typename I>
139 static __device__ __host__
inline int linkIndexP1(
const int x[],
const I
X[4],
const int mu) {
140 return linkIndexDn<I, 1>(x,
X,
mu);
151 template <
typename I>
static __device__ __host__
inline int linkIndexP3(
const int x[],
const I
X[4],
const int mu)
153 return linkIndexDn<I, 3>(x,
X,
mu);
165 template <
int nDim = 4,
typename Arg>
168 int idx = nDim == 4 ? ((x[3] * arg.X[2] + x[2]) * arg.X[1] + x[1]) * arg.X[0] + x[0] :
169 (((x[4] * arg.X[3] + x[3]) * arg.X[2] + x[2]) * arg.X[1] + x[1]) * arg.X[0] + x[0];
173 case 0:
return (x[0] == arg.X[0] - 1 ? idx - (arg.X[0] - 1) : idx + 1) >> 1;
174 case 1:
return (x[1] == arg.X[1] - 1 ? idx - arg.X2X1mX1 : idx + arg.X[0]) >> 1;
175 case 2:
return (x[2] == arg.X[2] - 1 ? idx - arg.X3X2X1mX2X1 : idx + arg.X2X1) >> 1;
176 case 3:
return (x[3] == arg.X[3] - 1 ? idx - arg.X4X3X2X1mX3X2X1 : idx + arg.X3X2X1) >> 1;
177 case 4:
return (x[4] == arg.X[4] - 1 ? idx - arg.X5X4X3X2X1mX4X3X2X1 : idx + arg.X4X3X2X1) >> 1;
181 case 0:
return (x[0] == 0 ? idx + (arg.X[0] - 1) : idx - 1) >> 1;
182 case 1:
return (x[1] == 0 ? idx + arg.X2X1mX1 : idx - arg.X[0]) >> 1;
183 case 2:
return (x[2] == 0 ? idx + arg.X3X2X1mX2X1 : idx - arg.X2X1) >> 1;
184 case 3:
return (x[3] == 0 ? idx + arg.X4X3X2X1mX3X2X1 : idx - arg.X3X2X1) >> 1;
185 case 4:
return (x[4] == 0 ? idx + arg.X5X4X3X2X1mX4X3X2X1 : idx - arg.X4X3X2X1) >> 1;
200 template <
typename I,
typename J>
201 static __device__ __host__
inline void getCoordsCB(
int x[],
int cb_index,
const I
X[], J X0h,
int parity)
208 int za = (cb_index / X0h);
209 int zb = (za / X[1]);
210 x[1] = (za - zb * X[1]);
212 x[2] = (zb - x[3] * X[2]);
213 int x1odd = (x[1] + x[2] + x[3] +
parity) & 1;
214 x[0] = (2 * cb_index + x1odd - za * X[0]);
228 template <
typename I>
static __device__ __host__
inline void getCoords(
int x[],
int cb_index,
const I
X[],
int parity)
241 template <
typename I,
typename J>
248 int za = (cb_index / (X[0] >> 1));
249 int zb = (za / X[1]);
250 x[1] = (za - zb * X[1]);
252 x[2] = (zb - x[3] * X[2]);
253 int x1odd = (x[1] + x[2] + x[3] +
parity) & 1;
254 x[0] = (2 * cb_index + x1odd - za * X[0]);
256 for (
int d=0; d<4; d++) x[d] += R[d];
269 template <
typename I,
typename J>
279 int za = (cb_index / X0h);
280 int zb = (za / X[1]);
281 x[1] = za - zb * X[1];
285 x[3] = zc - x[4] * X[3];
286 int x1odd = (x[1] + x[2] + x[3] + (pc_type==
QUDA_5D_PC ? x[4] : 0) + parity) & 1;
287 x[0] = (2 * cb_index + x1odd) - za * X[0];
300 template <
typename I>
303 getCoords5CB(x, cb_index, X, X[0] >> 1, parity, pc_type);
315 template <
typename I>
317 int za = (cb_index / (X[0] / 2));
318 int zb = (za / X[1]);
319 int x1 = za - zb * X[1];
320 int x3 = (zb / X[2]);
321 int x2 = zb - x3 * X[2];
322 int x1odd = (x1 + x2 + x3 +
parity) & 1;
323 return 2 * cb_index + x1odd;
334 template <
int dir,
int nDim = 4,
typename I>
335 __device__ __host__
inline int ghostFaceIndex(
const int x_[],
const I X_[],
int dim,
int nFace)
337 static_assert((nDim == 4 || nDim == 5),
"Number of dimensions must be 4 or 5");
339 const int x[] = {x_[0], x_[1], x_[2], x_[3], nDim == 5 ? x_[4] : 0};
340 const int X[] = {(int)X_[0], (
int)X_[1], (int)X_[2], (
int)X_[3], nDim == 5 ? (int)X_[4] : 1};
346 index = (x[0]*X[4]*X[3]*X[2]*X[1] + x[4]*X[3]*X[2]*X[1] + x[3]*(X[2]*X[1])+x[2]*X[1] + x[1])>>1;
349 index = ((x[0]-X[0]+nFace)*X[4]*X[3]*X[2]*X[1] + x[4]*X[3]*X[2]*X[1] + x[3]*(X[2]*X[1]) + x[2]*X[1] + x[1])>>1;
356 index = (x[1]*X[4]*X[3]*X[2]*X[0] + x[4]*X[3]*X[2]*X[0] + x[3]*X[2]*X[0]+x[2]*X[0]+x[0])>>1;
359 index = ((x[1]-X[1]+nFace)*X[4]*X[3]*X[2]*X[0] +x[4]*X[3]*X[2]*X[0]+ x[3]*X[2]*X[0] + x[2]*X[0] + x[0])>>1;
366 index = (x[2]*X[4]*X[3]*X[1]*X[0] + x[4]*X[3]*X[1]*X[0] + x[3]*X[1]*X[0]+x[1]*X[0]+x[0])>>1;
369 index = ((x[2]-X[2]+nFace)*X[4]*X[3]*X[1]*X[0] + x[4]*X[3]*X[1]*X[0] + x[3]*X[1]*X[0] + x[1]*X[0] + x[0])>>1;
376 index = (x[3]*X[4]*X[2]*X[1]*X[0] + x[4]*X[2]*X[1]*X[0] + x[2]*X[1]*X[0]+x[1]*X[0]+x[0])>>1;
379 index = ((x[3]-X[3]+nFace)*X[4]*X[2]*X[1]*X[0] + x[4]*X[2]*X[1]*X[0] + x[2]*X[1]*X[0]+x[1]*X[0] + x[0])>>1;
395 template <
int dir,
int nDim = 4,
typename I>
398 static_assert((nDim == 4 || nDim == 5),
"Number of dimensions must be 4 or 5");
400 const int x[] = {x_[0], x_[1], x_[2], x_[3], nDim == 5 ? x_[4] : 0};
401 const int X[] = {(int)X_[0], (
int)X_[1], (int)X_[2], (
int)X_[3], nDim == 5 ? (int)X_[4] : 1};
407 index = ((x[0] + nFace - 1) * X[4] * X[3] * X[2] * X[1] + x[4] * X[3] * X[2] * X[1] + x[3] * (X[2] * X[1])
408 + x[2] * X[1] + x[1])
412 index = ((x[0] - X[0] + nFace) * X[4] * X[3] * X[2] * X[1] + x[4] * X[3] * X[2] * X[1] + x[3] * (X[2] * X[1])
413 + x[2] * X[1] + x[1])
421 index = ((x[1] + nFace - 1) * X[4] * X[3] * X[2] * X[0] + x[4] * X[3] * X[2] * X[0] + x[3] * X[2] * X[0]
422 + x[2] * X[0] + x[0])
426 index = ((x[1] - X[1] + nFace) * X[4] * X[3] * X[2] * X[0] + x[4] * X[3] * X[2] * X[0] + x[3] * X[2] * X[0]
427 + x[2] * X[0] + x[0])
435 index = ((x[2] + nFace - 1) * X[4] * X[3] * X[1] * X[0] + x[4] * X[3] * X[1] * X[0] + x[3] * X[1] * X[0]
436 + x[1] * X[0] + x[0])
440 index = ((x[2] - X[2] + nFace) * X[4] * X[3] * X[1] * X[0] + x[4] * X[3] * X[1] * X[0] + x[3] * X[1] * X[0]
441 + x[1] * X[0] + x[0])
449 index = ((x[3] + nFace - 1) * X[4] * X[2] * X[1] * X[0] + x[4] * X[2] * X[1] * X[0] + x[2] * X[1] * X[0]
450 + x[1] * X[0] + x[0])
454 index = ((x[3] - X[3] + nFace) * X[4] * X[2] * X[1] * X[0] + x[4] * X[2] * X[1] * X[0] + x[2] * X[1] * X[0]
455 + x[1] * X[0] + x[0])
487 template <
int nDim, QudaPCType type,
int dim_,
int nLayers,
typename Int,
typename Arg>
489 int &idx,
int &cb_idx, Int *
const x,
int face_idx,
const int &face_num,
int parity,
const Arg &
arg)
493 const auto *
X = arg.dc.X;
494 const auto &face_X = arg.dc.face_X[dim];
495 const auto &face_Y = arg.dc.face_Y[dim];
496 const auto &face_Z = arg.dc.face_Z[dim];
497 const auto &face_T = arg.dc.face_T[dim];
498 const auto &face_XYZT = arg.dc.face_XYZT[dim];
499 const auto &face_XYZ = arg.dc.face_XYZ[dim];
500 const auto &face_XY = arg.dc.face_XY[dim];
503 int face_parity = (parity + face_num * (arg.dc.X[dim] - nLayers)) & 1;
516 int aux1 = face_idx / face_X;
517 int aux2 = aux1 / face_Y;
518 int aux3 = aux2 / face_Z;
519 int aux4 = (nDim == 5 ? aux3 / face_T : 0);
521 x[0] = face_idx - aux1 * face_X;
522 x[1] = aux1 - aux2 * face_Y;
523 x[2] = aux2 - aux3 * face_Z;
524 x[3] = aux3 - aux4 * face_T;
528 x[0] += (face_parity + x[4] + x[3] + x[2] + x[1]) & 1;
530 x[0] += (face_parity + x[3] + x[2] + x[1]) & 1;
532 }
else if (!(face_Y & 1)) {
534 x[4] = (nDim == 5 ? face_idx / face_XYZT : 0);
535 x[3] = (nDim == 5 ? (face_idx / face_XYZ) % face_T : (face_idx / face_XYZ));
536 x[2] = (face_idx / face_XY) % face_Z;
539 face_idx += (face_parity + x[4] + x[3] + x[2]) & 1;
541 face_idx += (face_parity + x[3] + x[2]) & 1;
543 x[1] = (face_idx / face_X) % face_Y;
544 x[0] = face_idx % face_X;
546 }
else if (!(face_Z & 1)) {
548 x[4] = (nDim == 5 ? face_idx / face_XYZT : 0);
549 x[3] = (nDim == 5 ? (face_idx / face_XYZ) % face_T : (face_idx / face_XYZ));
552 face_idx += (face_parity + x[4] + x[3]) & 1;
554 face_idx += (face_parity + x[3]) & 1;
556 x[2] = (face_idx / face_XY) % face_Z;
557 x[1] = (face_idx / face_X) % face_Y;
558 x[0] = face_idx % face_X;
562 x[4] = (nDim == 5 ? face_idx / face_XYZT : 0);
563 face_idx += face_parity;
564 x[3] = (nDim == 5 ? (face_idx / face_XYZ) % face_T : (face_idx / face_XYZ));
565 x[2] = (face_idx / face_XY) % face_Z;
566 x[1] = (face_idx / face_X) % face_Y;
567 x[0] = face_idx % face_X;
571 x[dim] += face_num * (
X[dim] - nLayers);
574 idx =
X[0] * (
X[1] * (
X[2] * (
X[3] * x[4] + x[3]) + x[2]) + x[1]) + x[0];
584 template <
int nDim, QudaPCType type,
int dim_,
int nLayers,
typename Int,
typename Arg>
586 int &idx,
int &cb_idx, Int *
const x,
int face_idx,
const int &face_num,
const Arg &
arg)
588 coordsFromFaceIndex<nDim, type, dim_, nLayers>(idx, cb_idx, x, face_idx, face_num, arg.parity,
arg);
600 template <
int nDim, QudaPCType type,
int dim,
int nLayers,
int face_num,
typename Arg>
604 int face_parity = (parity + face_num * (arg.dc.X[dim] - nLayers)) & 1;
609 if (!(arg.dc.face_X[dim] & 1)) {
616 int aux1 = face_idx / arg.dc.face_X[dim];
617 int aux2 = aux1 / arg.dc.face_Y[dim];
618 int aux3 = aux2 / arg.dc.face_Z[dim];
619 int aux4 = (nDim == 5 ? aux3 / arg.dc.face_T[dim] : 0);
621 int y = aux1 - aux2 * arg.dc.face_Y[dim];
622 int z = aux2 - aux3 * arg.dc.face_Z[dim];
623 int t = aux3 - aux4 * arg.dc.face_T[dim];
627 face_idx += (face_parity + s + t + z + y) & 1;
629 face_idx += (face_parity + t + z + y) & 1;
631 }
else if (!(arg.dc.face_Y[dim] & 1)) {
633 int s = face_idx / arg.dc.face_XYZT[dim];
634 int t = (nDim == 5 ? (face_idx / arg.dc.face_XYZ[dim]) % arg.dc.face_T[dim] : (face_idx / arg.dc.face_XYZ[dim]));
635 int z = (face_idx / arg.dc.face_XY[dim]) % arg.dc.face_Z[dim];
637 face_idx += (face_parity + s + t + z) & 1;
639 face_idx += (face_parity + t + z) & 1;
641 }
else if (!(arg.dc.face_Z[dim] & 1)) {
643 int s = face_idx / arg.dc.face_XYZT[dim];
644 int t = (nDim == 5 ? (face_idx / arg.dc.face_XYZ[dim]) % arg.dc.face_T[dim] : (face_idx / arg.dc.face_XYZ[dim]));
646 face_idx += (face_parity + s + t) & 1;
648 face_idx += (face_parity + t) & 1;
650 }
else if (!(arg.dc.face_T[dim]) && nDim == 5) {
652 int s = face_idx / arg.dc.face_XYZT[dim];
654 face_idx += (face_parity +
s) & 1;
656 face_idx += face_parity;
659 face_idx += face_parity;
663 int gap = arg.dc.X[dim] - nLayers;
668 aux = face_idx / arg.dc.face_X[dim];
669 idx += (aux + face_num) * gap;
672 aux = face_idx / arg.dc.face_XY[dim];
673 idx += (aux + face_num) * gap * arg.dc.face_X[dim];
676 aux = face_idx / arg.dc.face_XYZ[dim];
677 idx += (aux + face_num) * gap * arg.dc.face_XY[dim];
680 aux = (nDim == 5 ? face_idx / arg.dc.face_XYZT[dim] : 0);
681 idx += (aux + face_num) * gap * arg.dc.face_XYZ[dim];
693 template <
int nDim, QudaPCType type,
int dim,
int nLayers,
int face_num,
typename Arg>
696 return indexFromFaceIndex<nDim, type, dim, nLayers, face_num>(face_idx, arg.parity,
arg);
716 template <
int nDim, QudaPCType type,
int dim,
int nLayers,
int face_num,
typename Arg>
719 const auto *
X = arg.dc.X;
720 const auto *
dims = arg.dc.dims[dim];
721 const auto &V4 = arg.dc.volume_4d;
724 int face_parity = (parity + face_num * (
X[dim] - nLayers)) & 1;
730 int s = face_idx_in / arg.dc.face_XYZT[dim];
731 int face_idx = face_idx_in - s * arg.dc.face_XYZT[dim];
734 int aux1 = face_idx /
dims[0];
735 int aux2 = aux1 /
dims[1];
736 int y = aux1 - aux2 *
dims[1];
737 int t = aux2 / dims[2];
738 int z = aux2 - t * dims[2];
739 face_idx += (face_parity + t + z + y) & 1;
742 int gap =
X[dim] - nLayers;
748 idx += face_num * gap + aux * (
X[0] - 1);
749 idx += (idx / V4) * (1 - V4);
752 aux = face_idx / arg.dc.face_X[dim];
753 idx += face_num * gap * arg.dc.face_X[dim] + aux * (
X[1] - 1) * arg.dc.face_X[dim];
754 idx += (idx / V4) * (
X[0] - V4);
757 aux = face_idx / arg.dc.face_XY[dim];
758 idx += face_num * gap * arg.dc.face_XY[dim] + aux * (
X[2] - 1) * arg.dc.face_XY[dim];
759 idx += (idx / V4) * ((
X[1] *
X[0]) - V4);
761 case 3: idx += face_num * gap * arg.dc.face_XYZ[dim];
break;
765 return (idx + s * V4) >> 1;
782 template <
int nDim = 4,
typename Arg>
787 const int s = (nDim == 5 ? tid / arg.threads : 0);
789 face_idx = tid - s * arg.threads;
791 if (face_idx < arg.threadDimMapUpper[0]) {
792 face_idx += s * arg.threadDimMapUpper[0];
794 }
else if (face_idx < arg.threadDimMapUpper[1]) {
795 face_idx -= arg.threadDimMapLower[1];
796 face_idx += s * (arg.threadDimMapUpper[1] - arg.threadDimMapLower[1]);
798 }
else if (face_idx < arg.threadDimMapUpper[2]) {
799 face_idx -= arg.threadDimMapLower[2];
800 face_idx += s * (arg.threadDimMapUpper[2] - arg.threadDimMapLower[2]);
803 face_idx -= arg.threadDimMapLower[3];
804 face_idx += s * (arg.threadDimMapUpper[3] - arg.threadDimMapLower[3]);
811 return dimFromFaceIndex<nDim>(face_idx, face_idx,
arg);
834 template <
typename T> __device__
inline int block_idx(
const T &swizzle)
838 const int gridp = gridDim.x - gridDim.x % swizzle;
841 if (blockIdx.x < gridp) {
843 const int i = blockIdx.x % swizzle;
844 const int j = blockIdx.x / swizzle;
847 block_idx = i * (gridp / swizzle) + j;
867 template <
typename Arg>
868 __device__ __host__
inline auto StaggeredPhase(
const int coords[],
int dim,
int dir,
const Arg &
arg) ->
typename Arg::real
870 using real =
typename Arg::real;
871 constexpr
auto phase = Arg::phase;
876 const auto *
X =
arg.dim;
879 case 0: sign = (coords[3]) % 2 == 0 ? static_cast<real>(1.0) :
static_cast<real
>(-1.0);
break;
880 case 1: sign = (coords[3] + coords[0]) % 2 == 0 ? static_cast<real>(1.0) :
static_cast<real
>(-1.0);
break;
882 sign = (coords[3] + coords[1] + coords[0]) % 2 == 0 ? static_cast<real>(1.0) :
static_cast<real
>(-1.0);
884 case 3: sign = (coords[3] + dir >=
X[3] &&
arg.is_last_time_slice) || (coords[3] + dir < 0 &&
arg.is_first_time_slice) ?
885 arg.tboundary :
static_cast<real
>(1.0);
break;
886 default: sign =
static_cast<real
>(1.0);
890 case 0: sign = (coords[3] + coords[2] + coords[1]) % 2 == 0 ? -1 : 1;
break;
891 case 1: sign = ((coords[3] + coords[2]) % 2 == 1) ? -1 : 1;
break;
892 case 2: sign = (coords[3] % 2 == 0) ? -1 : 1;
break;
893 case 3: sign = (coords[3] + dir >=
X[3] &&
arg.is_last_time_slice) || (coords[3] + dir < 0 &&
arg.is_first_time_slice) ?
894 arg.tboundary :
static_cast<real
>(1.0);
break;
895 default: sign =
static_cast<real
>(1.0);
static __device__ __host__ int getIndexFull(int cb_index, const I X[4], int parity)
static __device__ __host__ void getCoordsExtended(I x[], int cb_index, const J X[], int parity, const int R[])
__device__ int block_idx(const T &swizzle)
Swizzler for reordering the (x) thread block indices - use on conjunction with swizzle-factor autotun...
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
static __device__ __host__ int linkIndex(const int x[], const I X[4])
static __device__ __host__ void getCoords5CB(int x[5], int cb_index, const I X[5], J X0h, int parity, QudaPCType pc_type)
__host__ __device__ int dimFromFaceIndex(int &face_idx, int tid, const Arg &arg)
Determines which face a given thread is computing. Also rescale face_idx so that is relative to a giv...
__device__ __host__ void coordsFromFaceIndex(int &idx, int &cb_idx, Int *const x, int face_idx, const int &face_num, int parity, const Arg &arg)
Compute the full-lattice coordinates from the input face index. This is used by the Wilson-like halo ...
enum QudaPCType_s QudaPCType
static __device__ __host__ void getCoordsCB(int x[], int cb_index, const I X[], J X0h, int parity)
__device__ __host__ int ghostFaceIndexStaggered(const int x_[], const I X_[], int dim, int nFace)
__device__ __host__ int indexFromFaceIndex(int face_idx, int parity, const Arg &arg)
Compute the checkerboard lattice index from the input face index. This is used by the Wilson-like hal...
static __device__ __host__ int linkIndexP3(const int x[], const I X[4], const int mu)
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
static __device__ __host__ int linkIndexM3(const int x[], const I X[4], const int mu)
static __device__ __host__ void getCoords5(int x[5], int cb_index, const I X[5], int parity, QudaPCType pc_type)
static int index(int ndim, const int *dims, const int *x)
static __device__ int indexFromFaceIndexStaggered(int face_idx_in, int parity, const Arg &arg)
Compute global checkerboard index from face index. The following indexing routines work for arbitrary...
static __device__ __host__ int getNeighborIndexCB(const int x[], int mu, int dir, const Arg &arg)
Compute the checkerboard 1-d index for the nearest neighbor.
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__device__ __host__ int ghostFaceIndex(const int x_[], const I X_[], int dim, int nFace)
static __device__ __host__ int linkIndexDn(const int x[], const I X[4], const int mu)
__device__ __host__ auto StaggeredPhase(const int coords[], int dim, int dir, const Arg &arg) -> typename Arg::real
Compute the staggered phase factor at unit shift from the current lattice coordinates. The routine below optimizes out the shift where possible, hence is only visible where we need to consider the boundary condition.
static __device__ __host__ int linkIndexP1(const int x[], const I X[4], const int mu)
static __device__ __host__ int linkNormalIndexP1(const int x[], const I X[4], const int mu)
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.