6 template <
typename Order,
int nDim>
23 const int *surfaceCB_,
24 const int *A0_,
const int *A1_,
const int *B0_,
const int *B1_,
25 const int *C0_,
const int *C1_,
const int fBody_[nDim][nDim],
26 const int fBuf_[nDim][nDim],
const int *localParity_)
27 : order(order), dim(dim) {
28 for (
int d=0; d<nDim; d++) {
38 for (
int e=0; e<nDim; e++) {
39 fBody[d][e] = fBody_[d][e];
40 fBuf[d][e] = fBuf_[d][e];
48 template <
typename Float,
int length,
typename Arg>
49 __device__ __host__
void extractor(Arg &
arg,
int dir,
int a,
int b,
50 int c,
int d,
int g,
int parity) {
53 int srcIdx = (a*arg.fBody[
dim][0] + b*arg.fBody[
dim][1] +
54 c*arg.fBody[
dim][2] + d*arg.fBody[
dim][3]) >> 1;
56 int dstIdx = (a*arg.fBuf[
dim][0] + b*arg.fBuf[
dim][1] +
57 c*arg.fBuf[
dim][2] + (d-(dir?arg.X[
dim]:arg.R[
dim]))*arg.fBuf[
dim][3]) >> 1;
60 arg.order.load(u, srcIdx, g, parity);
64 arg.order.saveGhostEx(u, dstIdx, srcIdx, dir, dim, g,
65 (parity+arg.localParity[dim])&1, arg.R);
69 template <
typename Float,
int length,
typename Arg>
70 __device__ __host__
void injector(Arg &
arg,
int dir,
int a,
int b,
71 int c,
int d,
int g,
int parity) {
74 int srcIdx = (a*arg.fBuf[
dim][0] + b*arg.fBuf[
dim][1] +
75 c*arg.fBuf[
dim][2] + (d-dir*(arg.X[
dim]+arg.R[
dim]))*arg.fBuf[
dim][3]) >> 1;
77 int dstIdx = (a*arg.fBody[
dim][0] + b*arg.fBody[
dim][1] +
78 c*arg.fBody[
dim][2] + d*arg.fBody[
dim][3]) >> 1;
82 arg.order.loadGhostEx(u, srcIdx, dstIdx, dir, dim, g,
83 (parity+arg.localParity[dim])&1, arg.R);
85 arg.order.save(u, dstIdx, g, parity);
92 template <
typename Float,
int length,
int nDim,
typename Order,
bool extract>
102 for (
int dir = 0; dir<2; dir++) {
104 int D0 = extract ? dir*arg.
X[
dim] + (1-dir)*arg.
R[dim] : dir*(arg.
X[dim] + arg.
R[dim]);
106 for (
int d=D0; d<D0+arg.
R[
dim]; d++) {
107 for (
int a=arg.
A0[dim]; a<arg.
A1[dim]; a++) {
108 for (
int b=arg.
B0[dim]; b<arg.
B1[dim]; b++) {
109 for (
int c=arg.
C0[dim]; c<arg.
C1[dim]; c++) {
110 for (
int g=0; g<arg.
order.geometry; g++) {
113 int oddness = (a+b+c+d) & 1;
115 if (extract) extractor<Float,length>(
arg, dir, a, b, c, d, g,
parity);
116 else injector<Float,length>(
arg, dir, a, b, c, d, g,
parity);
139 template <
typename Float,
int length,
int nDim,
typename Order,
bool extract>
154 int dir = blockIdx.y;
159 int X = blockIdx.x * blockDim.x + threadIdx.x;
164 int D0 = extract ? dir*arg.
X[
dim] + (1-dir)*arg.
R[dim] : dir*(arg.
X[dim] + arg.
R[dim]);
166 if (X >= arg.
R[dim]*dA*dB*dC*arg.
order.geometry)
return;
171 int c = arg.
C0[
dim] + X - gdab*dC;
173 int b = arg.
B0[
dim] + gdab - gda *dB;
175 int a = arg.
A0[
dim] + gda - gd *dA;
176 int g = gd / arg.
R[
dim];
177 int d = D0 + gd - g *arg.
R[
dim];
180 int oddness = (a+b+c+d) & 1;
181 if (oddness == parity) {
182 if (extract) extractor<Float,length>(
arg, dir, a, b, c, d, g,
parity);
183 else injector<Float,length>(
arg, dir, a, b, c, d, g,
parity);
191 template <
typename Float,
int length,
int nDim,
typename Order>
200 unsigned int sharedBytesPerThread()
const {
return 0; }
201 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0 ;}
203 bool tuneGridDim()
const {
return false; }
204 unsigned int minThreads()
const {
return size; }
209 : arg(arg), extract(extract), meta(meta), location(location) {
213 size = arg.
R[arg.
dim]*dA*dB*dC*arg.
order.geometry;
222 extractGhostEx<Float,length,nDim,Order,true>(arg);
224 #if (__COMPUTE_CAPABILITY__ >= 200)
228 extractGhostExKernel<Float,length,nDim,Order,true>
231 errorQuda(
"extractGhostEx not supported on pre-Fermi architecture");
237 extractGhostEx<Float,length,nDim,Order,false>(arg);
239 #if (__COMPUTE_CAPABILITY__ >= 200)
243 extractGhostExKernel<Float,length,nDim,Order,false>
246 errorQuda(
"extractGhostEx not supported on pre-Fermi architecture");
255 std::stringstream ps;
256 ps <<
"block=(" << param.
block.x <<
"," << param.
block.y <<
"," << param.
block.z <<
"), ";
261 long long flops()
const {
return 0; }
262 long long bytes()
const {
return 2 * 2 * 2 * size * arg.order.Bytes(); }
273 template <
typename Float,
int length,
typename Order>
282 for (
int d=0; d<nDim; d++) X[d] = E[d] - 2*R[d];
285 int A0[nDim] = {R[3], R[3], R[3], 0};
286 int A1[nDim] = {X[3]+R[3], X[3]+R[3], X[3]+R[3], X[2]+2*R[2]};
288 int B0[nDim] = {R[2], R[2], 0, 0};
289 int B1[nDim] = {X[2]+R[2], X[2]+R[2], X[1]+2*R[1], X[1]+2*R[1]};
291 int C0[nDim] = {R[1], 0, 0, 0};
292 int C1[nDim] = {X[1]+R[1], X[0]+2*R[0], X[0]+2*R[0], X[0]+2*R[0]};
294 int fSrc[nDim][nDim] = {
295 {E[2]*E[1]*E[0], E[1]*E[0], E[0], 1},
296 {E[2]*E[1]*E[0], E[1]*E[0], 1, E[0]},
297 {E[2]*E[1]*E[0], E[0], 1, E[1]*E[0]},
298 {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
301 int fBuf[nDim][nDim]={
302 {E[2]*E[1], E[1], 1, E[3]*E[2]*E[1]},
303 {E[2]*E[0], E[0], 1, E[3]*E[2]*E[0]},
304 {E[1]*E[0], E[0], 1, E[3]*E[1]*E[0]},
305 {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
312 int localParity[nDim];
313 for (
int d=0; d<nDim; d++)
314 localParity[dim] = ((X[dim] % 2 ==1) && (
commDim(dim) > 1)) ? 1 : 0;
317 ExtractGhostExArg<Order, nDim> arg(order, dim, X, R, surfaceCB, A0, A1, B0, B1,
318 C0, C1, fSrc, fBuf, localParity);
322 cudaDeviceSynchronize();
328 template <
typename Float>
340 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
343 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
347 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
350 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
353 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
356 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
362 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
365 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
369 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
372 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
375 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
378 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
382 #ifdef BUILD_QDP_INTERFACE
384 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
386 errorQuda(
"QDP interface has not been built\n");
391 #ifdef BUILD_QDPJIT_INTERFACE
393 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
395 errorQuda(
"QDPJIT interface has not been built\n");
400 #ifdef BUILD_CPS_INTERFACE
402 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
404 errorQuda(
"CPS interface has not been built\n");
409 #ifdef BUILD_MILC_INTERFACE
411 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
413 errorQuda(
"MILC interface has not been built\n");
418 #ifdef BUILD_BQCD_INTERFACE
420 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
422 errorQuda(
"BQCD interface has not been built\n");
427 #ifdef BUILD_TIFR_INTERFACE
429 dim, u.SurfaceCB(), u.X(), R, extract, u,
location);
431 errorQuda(
"TIFR interface has not been built\n");
441 void **ghost,
bool extract) {
QudaVerbosity getVerbosity()
QudaGaugeFieldOrder Order() const
QudaPrecision Precision() const
void writeAuxString(const char *format,...)
__global__ void extractGhostExKernel(ExtractGhostExArg< Order, nDim > arg)
const QudaFieldLocation location
__device__ __host__ void extractor(Arg &arg, int dir, int a, int b, int c, int d, int g, int parity)
void extractGhostEx(ExtractGhostExArg< Order, nDim > arg)
FloatingPoint< float > Float
void extractExtendedGaugeGhost(const GaugeField &u, int dim, const int *R, void **ghost, bool extract)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
QudaReconstructType Reconstruct() const
const char * VolString() const
enum QudaFieldLocation_s QudaFieldLocation
QudaLinkType LinkType() const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__device__ __host__ void injector(Arg &arg, int dir, int a, int b, int c, int d, int g, int parity)