5 template <
typename Order,
int nDim>
9 unsigned short X[nDim];
10 unsigned short A[nDim];
11 unsigned short B[nDim];
12 unsigned short C[nDim];
19 const int *B_,
const int *C_,
const int f_[nDim][nDim],
const int *localParity_,
int offset)
21 for (
int d=0;
d<nDim;
d++) {
26 for (
int e=0;
e<nDim;
e++)
f[
d][
e] = f_[
d][
e];
38 template <
typename Float,
int length,
int nDim,
typename Order,
bool extract>
47 if (!
arg.commDim[
dim] && !extract)
continue;
61 int oddness = (
a+
b+
c+
d) & 1;
63 #ifdef FINE_GRAINED_ACCESS 93 assert(indexGhost ==
arg.faceVolumeCB[
dim]);
105 template <
typename Float,
int length,
int nDim,
typename Order,
bool extract>
113 if (!
arg.commDim[
dim] && !extract)
continue;
116 int X = blockIdx.x *
blockDim.x + threadIdx.x;
118 if (
X >= 2*
arg.faceVolumeCB[
dim])
continue;
131 int oddness = (
a+
b+
c+
d)&1;
133 #ifdef FINE_GRAINED_ACCESS 164 template <
typename Float,
int length,
int nDim,
typename Order>
184 for (
int d=0;
d<nDim;
d++)
185 faceMax = (
arg.faceVolumeCB[
d] > faceMax ) ?
arg.faceVolumeCB[
d] : faceMax;
188 #ifndef FINE_GRAINED_ACCESS 199 if (
extract) extractGhost<Float,length,nDim,Order,true>(
arg);
200 else extractGhost<Float,length,nDim,Order,false>(
arg);
204 extractGhostKernel<Float, length, nDim, Order, true>
207 extractGhostKernel<Float, length, nDim, Order, false>
215 long long flops()
const {
return 0; }
218 for (
int d=0;
d<nDim;
d++) sites +=
arg.faceVolumeCB[
d];
219 return 2 * sites * 2 *
arg.order.Bytes();
228 template <
typename Float,
int length,
typename Order>
230 const int *
X = u.
X();
231 const int nFace = u.
Nface();
236 int A[nDim], B[nDim], C[nDim];
237 A[0] =
X[3]; B[0] =
X[2]; C[0] =
X[1];
238 A[1] =
X[3]; B[1] =
X[2]; C[1] =
X[0];
239 A[2] =
X[3]; B[2] =
X[1]; C[2] =
X[0];
240 A[3] =
X[2]; B[3] =
X[1]; C[3] =
X[0];
244 {
X[0]*
X[1]*
X[2],
X[0]*
X[1],
X[0], 1},
245 {
X[0]*
X[1]*
X[2],
X[0]*
X[1], 1,
X[0]},
246 {
X[0]*
X[1]*
X[2],
X[0], 1,
X[0]*
X[1]},
247 {
X[0]*
X[1],
X[0], 1,
X[0]*
X[1]*
X[2]}
254 int localParity[nDim];
__host__ __device__ constexpr int Ncolor(int length)
Return the number of colors of the accessor based on the length of the field.
QudaVerbosity getVerbosity()
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
const char * VolString() const
const int * SurfaceCB() const
void extractGhost(const GaugeField &u, Float **Ghost, bool extract, int offset)
__global__ void extractGhostKernel(ExtractGhostArg< Order, nDim > arg)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
int int int enum cudaChannelFormatKind f
enum QudaFieldLocation_s QudaFieldLocation
int writeAuxString(const char *format,...)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
static __inline__ size_t size_t d
__device__ __host__ void extractor(Arg &arg, int dir, int a, int b, int c, int d, int g, int parity)
int comm_dim_partitioned(int dim)