10 template <
typename Float,
int nColor_,
typename Order,
int nDim>
13 static constexpr
int nColor = nColor_;
16 unsigned short X[nDim];
17 unsigned short A[nDim];
18 unsigned short B[nDim];
19 unsigned short C[nDim];
21 bool localParity[nDim];
22 int faceVolumeCB[nDim];
26 const int *B_,
const int *C_,
const int f_[nDim][nDim],
const int *localParity_,
int offset)
27 : order(order), nFace(u.Nface()), offset(offset) {
28 for (
int d=0; d<nDim; d++) {
33 for (
int e=0; e<nDim; e++) f[d][e] = f_[d][e];
34 localParity[d] = localParity_[d];
45 template <
int nDim,
bool extract,
typename Arg>
48 using real =
typename Arg::real;
53 for (
int dim=0; dim<nDim; dim++) {
56 if (!arg.commDim[dim] && !extract)
continue;
63 for (
int d=arg.X[dim]-arg.nFace; d<arg.X[dim]; d++) {
64 for (
int a=0; a<arg.A[dim]; a++) {
65 for (
int b=0; b<arg.B[dim]; b++) {
66 for (
int c=0; c<arg.C[dim]; c++) {
68 int indexCB = (a*arg.f[dim][0] + b*arg.f[dim][1] + c*arg.f[dim][2] + d*arg.f[dim][3]) >> 1;
70 int oddness = (a+b+c+d) & 1;
72 #ifdef FINE_GRAINED_ACCESS 73 for (
int i=0; i<
nColor; i++) {
74 for (
int j=0; j<
nColor; j++) {
76 arg.order.Ghost(dim, (
parity+arg.localParity[dim])&1, indexGhost, i, j)
77 = arg.order(dim+arg.offset,
parity, indexCB, i, j);
79 arg.order(dim+arg.offset,
parity, indexCB, i, j)
80 = arg.order.Ghost(dim, (
parity+arg.localParity[dim])&1, indexGhost, i, j);
88 arg.order.Ghost(dim, indexGhost, (
parity+arg.localParity[dim])&1) = u;
91 arg.order(dim+arg.offset, indexCB,
parity) = u;
101 assert(indexGhost == arg.faceVolumeCB[dim]);
113 template <
int nDim,
bool extract,
typename Arg>
116 using real =
typename Arg::real;
119 int parity_dim = blockIdx.z * blockDim.z + threadIdx.z;
120 int parity = parity_dim / nDim;
121 int dim = parity_dim % nDim;
122 if (parity_dim >= 2 * nDim)
return;
125 if (!arg.commDim[dim] && !extract)
return;
128 int X = blockIdx.x * blockDim.x + threadIdx.x;
130 if (X >= 2*arg.faceVolumeCB[dim])
return;
132 int dab = X/arg.C[dim];
133 int c = X - dab*arg.C[dim];
134 int da = dab/arg.B[dim];
135 int b = dab - da*arg.B[dim];
136 int d = da / arg.A[dim];
137 int a = da - d * arg.A[dim];
138 d += arg.X[dim]-arg.nFace;
141 int indexCB = (a*arg.f[dim][0] + b*arg.f[dim][1] + c*arg.f[dim][2] + d*arg.f[dim][3]) >> 1;
143 int oddness = (a+b+c+d)&1;
144 if (oddness == parity) {
145 #ifdef FINE_GRAINED_ACCESS 146 int i = blockIdx.y * blockDim.y + threadIdx.y;
147 if (i >= nColor)
return;
148 for (
int j=0; j<
nColor; j++) {
150 arg.order.Ghost(dim, (parity+arg.localParity[dim])&1, X>>1, i, j)
151 = arg.order(dim+arg.offset, parity, indexCB, i, j);
153 arg.order(dim+arg.offset, parity, indexCB, i, j)
154 = arg.order.Ghost(dim, (parity+arg.localParity[dim])&1, X>>1, i, j);
161 arg.order.Ghost(dim, X>>1, (parity+arg.localParity[dim])&1) = u;
164 arg.order(dim+arg.offset, indexCB, parity) = u;
170 template <
int nDim,
typename Arg>
187 #ifndef FINE_GRAINED_ACCESS 188 :
TunableVectorYZ(1, 2*nDim),
arg(arg), meta(meta), location(location), extract(extract) {
193 for (
int d=0; d<nDim; d++)
194 faceMax = (arg.faceVolumeCB[d] > faceMax ) ? arg.faceVolumeCB[d] : faceMax;
197 #ifndef FINE_GRAINED_ACCESS 198 writeAuxString(
"stride=%d", arg.order.stride);
200 writeAuxString(
"fine-grained");
208 if (extract) extractGhost<nDim,true>(
arg);
209 else extractGhost<nDim,false>(
arg);
222 long long flops()
const {
return 0; }
225 for (
int d=0; d<nDim; d++) sites += arg.faceVolumeCB[d];
226 return 2 * sites * 2 * arg.order.Bytes();
235 template <
typename Float,
int length,
typename Order>
237 const int *
X = u.
X();
238 constexpr
int nDim = 4;
242 int A[nDim], B[nDim], C[nDim];
243 A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
244 A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
245 A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
246 A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
250 {X[0]*X[1]*X[2], X[0]*X[1], X[0], 1},
251 {X[0]*X[1]*X[2], X[0]*X[1], 1, X[0]},
252 {X[0]*X[1]*X[2], X[0], 1, X[0]*X[1]},
253 { X[0]*X[1], X[0], 1, X[0]*X[1]*X[2]}
260 int localParity[nDim];
261 for (
int dim=0; dim<nDim; dim++)
263 localParity[dim] = ((X[dim] % 2 ==1) && (
commDim(dim) > 1)) ? 1 : 0;
265 ExtractGhostArg<Float, gauge::Ncolor(length), Order, nDim> arg(order, u, A, B, C, f, localParity, offset);
QudaVerbosity getVerbosity()
const char * VolString() const
const int * SurfaceCB() const
void extractGhost(const GaugeField &u, Float **Ghost, bool extract, int offset)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
enum QudaFieldLocation_s QudaFieldLocation
static int commDim[QUDA_MAX_DIM]
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
__global__ void extractGhostKernel(Arg arg)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
__device__ __host__ void extractor(Arg &arg, int dir, int a, int b, int c, int d, int g, int parity)
int comm_dim_partitioned(int dim)