// QUDA 0.9.0 — extract_gauge_ghost_extended.cu
// Extraction/injection of ghost zones for extended (border-carrying) gauge fields.
1 #include <quda_internal.h>
2 #include <gauge_field_order.h>
3 
4 namespace quda {
5 
6  using namespace gauge;
7 
8  template <typename Order, int nDim, int dim>
10  Order order;
11  int X[nDim];
12  int R[nDim];
13  int surfaceCB[nDim];
14  int A0[nDim];
15  int A1[nDim];
16  int B0[nDim];
17  int B1[nDim];
18  int C0[nDim];
19  int C1[nDim];
20  int fBody[nDim][nDim];
21  int fBuf[nDim][nDim];
22  int localParity[nDim];
23  int threads;
24  ExtractGhostExArg(const Order &order, const int *X_, const int *R_,
25  const int *surfaceCB_,
26  const int *A0_, const int *A1_, const int *B0_, const int *B1_,
27  const int *C0_, const int *C1_, const int fBody_[nDim][nDim],
28  const int fBuf_[nDim][nDim], const int *localParity_)
29  : order(order), threads(0) {
30 
31  threads = R_[dim]*(A1_[dim]-A0_[dim])*(B1_[dim]-B0_[dim])*(C1_[dim]-C0_[dim])*order.geometry;
32 
33  for (int d=0; d<nDim; d++) {
34  X[d] = X_[d];
35  R[d] = R_[d];
36  surfaceCB[d] = surfaceCB_[d];
37  A0[d] = A0_[d];
38  A1[d] = A1_[d];
39  B0[d] = B0_[d];
40  B1[d] = B1_[d];
41  C0[d] = C0_[d];
42  C1[d] = C1_[d];
43  for (int e=0; e<nDim; e++) {
44  fBody[d][e] = fBody_[d][e];
45  fBuf[d][e] = fBuf_[d][e];
46  }
47  localParity[d] = localParity_[d];
48  }
49  }
50 
51  };
52 
53  template <typename Float, int length, int dim, typename Arg>
54  __device__ __host__ void extractor(Arg &arg, int dir, int a, int b,
55  int c, int d, int g, int parity) {
56  typename mapper<Float>::type u[length];
57  int srcIdx = (a*arg.fBody[dim][0] + b*arg.fBody[dim][1] +
58  c*arg.fBody[dim][2] + d*arg.fBody[dim][3]) >> 1;
59 
60  int dstIdx = (a*arg.fBuf[dim][0] + b*arg.fBuf[dim][1] +
61  c*arg.fBuf[dim][2] + (d-(dir?arg.X[dim]:arg.R[dim]))*arg.fBuf[dim][3]) >> 1;
62 
63  // load the ghost element from the bulk
64  arg.order.load(u, srcIdx, g, parity);
65 
66  // need dir dependence in write
67  // srcIdx is used here to determine boundary condition
68  arg.order.saveGhostEx(u, dstIdx, srcIdx, dir, dim, g, (parity+arg.localParity[dim])&1, arg.R);
69  }
70 
71 
72  template <typename Float, int length, int dim, typename Arg>
73  __device__ __host__ void injector(Arg &arg, int dir, int a, int b,
74  int c, int d, int g, int parity) {
75  typename mapper<Float>::type u[length];
76  int srcIdx = (a*arg.fBuf[dim][0] + b*arg.fBuf[dim][1] +
77  c*arg.fBuf[dim][2] + (d-dir*(arg.X[dim]+arg.R[dim]))*arg.fBuf[dim][3]) >> 1;
78 
79  int dstIdx = (a*arg.fBody[dim][0] + b*arg.fBody[dim][1] +
80  c*arg.fBody[dim][2] + d*arg.fBody[dim][3]) >> 1;
81 
82  int oddness = (parity+arg.localParity[dim])&1;
83 
84  // need dir dependence in read
85  // dstIdx is used here to determine boundary condition
86  arg.order.loadGhostEx(u, srcIdx, dstIdx, dir, dim, g, oddness, arg.R);
87 
88  arg.order.save(u, dstIdx, g, parity); // save the ghost element into the bulk
89  }
90 
95  template <typename Float, int length, int nDim, int dim, typename Order, bool extract>
97  typedef typename mapper<Float>::type RegType;
98 
99  for (int parity=0; parity<2; parity++) {
100 
101  // the following 4-way loop means this is specialized for 4 dimensions
102  // dir = 0 backwards, dir = 1 forwards
103  for (int dir = 0; dir<2; dir++) {
104 
105  int D0 = extract ? dir*arg.X[dim] + (1-dir)*arg.R[dim] : dir*(arg.X[dim] + arg.R[dim]);
106 
107  for (int d=D0; d<D0+arg.R[dim]; d++) {
108  for (int a=arg.A0[dim]; a<arg.A1[dim]; a++) { // loop over the interior surface
109  for (int b=arg.B0[dim]; b<arg.B1[dim]; b++) { // loop over the interior surface
110  for (int c=arg.C0[dim]; c<arg.C1[dim]; c++) { // loop over the interior surface
111  for (int g=0; g<arg.order.geometry; g++) {
112 
113  // we only do the extraction for parity we are currently working on
114  int oddness = (a+b+c+d) & 1;
115  if (oddness == parity) {
116  if (extract) extractor<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
117  else injector<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
118  } // oddness == parity
119  } // g
120  } // c
121  } // b
122  } // a
123  } // d
124  } // dir
125 
126  } // parity
127 
128  }
129 
140  template <typename Float, int length, int nDim, int dim, typename Order, bool extract>
142  typedef typename mapper<Float>::type RegType;
143 
144  // parallelize over parity and dir using block or grid
145  /*for (int parity=0; parity<2; parity++) {*/
146  {
147  int parity = blockIdx.z;
148 
149  // the following 4-way loop means this is specialized for 4 dimensions
150  // dir = 0 backwards, dir = 1 forwards
151  //for (int dir = 0; dir<2; dir++) {
152  {
153  int dir = blockIdx.y;
154 
155  // this will have two-warp divergence since we only do work on
156  // one parity but parity alternates between threads
157  // linear index used for writing into ghost buffer
158  int X = blockIdx.x * blockDim.x + threadIdx.x;
159  if (X >= arg.threads) return;
160 
161  int dA = arg.A1[dim]-arg.A0[dim];
162  int dB = arg.B1[dim]-arg.B0[dim];
163  int dC = arg.C1[dim]-arg.C0[dim];
164  int D0 = extract ? dir*arg.X[dim] + (1-dir)*arg.R[dim] : dir*(arg.X[dim] + arg.R[dim]);
165 
166  // thread order is optimized to maximize coalescing
167  // X = (((g*R + d) * dA + a)*dB + b)*dC + c
168  int gdab = X / dC;
169  int c = arg.C0[dim] + X - gdab*dC;
170  int gda = gdab / dB;
171  int b = arg.B0[dim] + gdab - gda *dB;
172  int gd = gda / dA;
173  int a = arg.A0[dim] + gda - gd *dA;
174  int g = gd / arg.R[dim];
175  int d = D0 + gd - g *arg.R[dim];
176 
177  // we only do the extraction for parity we are currently working on
178  int oddness = (a+b+c+d) & 1;
179  if (oddness == parity) {
180  if (extract) extractor<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
181  else injector<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
182  } // oddness == parity
183  } // dir
184 
185  } // parity
186 
187  }
188 
189  template <typename Float, int length, int nDim, int dim, typename Order>
192  int size;
193  bool extract;
194  const GaugeField &meta;
196 
197  private:
198  unsigned int sharedBytesPerThread() const { return 0; }
199  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0 ;}
200 
201  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
202  unsigned int minThreads() const { return size; }
203 
204  public:
206  const GaugeField &meta, QudaFieldLocation location)
207  : arg(arg), extract(extract), meta(meta), location(location) {
208  int dA = arg.A1[dim]-arg.A0[dim];
209  int dB = arg.B1[dim]-arg.B0[dim];
210  int dC = arg.C1[dim]-arg.C0[dim];
211  size = arg.R[dim]*dA*dB*dC*arg.order.geometry;
212  writeAuxString("prec=%lu,stride=%d,extract=%d,dimension=%d,geometry=%d",
213  sizeof(Float),arg.order.stride, extract, dim, arg.order.geometry);
214  }
215  virtual ~ExtractGhostEx() { ; }
216 
217  void apply(const cudaStream_t &stream) {
218  if (extract) {
219  if (location==QUDA_CPU_FIELD_LOCATION) {
220  extractGhostEx<Float,length,nDim,dim,Order,true>(arg);
221  } else {
222  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
223  tp.grid.y = 2;
224  tp.grid.z = 2;
225  extractGhostExKernel<Float,length,nDim,dim,Order,true>
226  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
227  }
228  } else { // we are injecting
229  if (location==QUDA_CPU_FIELD_LOCATION) {
230  extractGhostEx<Float,length,nDim,dim,Order,false>(arg);
231  } else {
232  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
233  tp.grid.y = 2;
234  tp.grid.z = 2;
235  extractGhostExKernel<Float,length,nDim,dim,Order,false>
236  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
237  }
238  }
239  }
240 
241  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
242 
243  long long flops() const { return 0; }
244  long long bytes() const { return 2 * 2 * 2 * size * arg.order.Bytes(); } // 2 for i/o
245  };
246 
247 
255  template <typename Float, int length, typename Order>
256  void extractGhostEx(Order order, const int dim, const int *surfaceCB, const int *E,
257  const int *R, bool extract, const GaugeField &u, QudaFieldLocation location) {
258  const int nDim = 4;
259  //loop variables: a, b, c with a the most signifcant and c the least significant
260  //A0, B0, C0 the minimum value
261  //A0, B0, C0 the maximum value
262 
263  int X[nDim]; // compute interior dimensions
264  for (int d=0; d<nDim; d++) X[d] = E[d] - 2*R[d];
265 
266  //..........x..........y............z.............t
267  int A0[nDim] = {R[3], R[3], R[3], 0};
268  int A1[nDim] = {X[3]+R[3], X[3]+R[3], X[3]+R[3], X[2]+2*R[2]};
269 
270  int B0[nDim] = {R[2], R[2], 0, 0};
271  int B1[nDim] = {X[2]+R[2], X[2]+R[2], X[1]+2*R[1], X[1]+2*R[1]};
272 
273  int C0[nDim] = {R[1], 0, 0, 0};
274  int C1[nDim] = {X[1]+R[1], X[0]+2*R[0], X[0]+2*R[0], X[0]+2*R[0]};
275 
276  int fSrc[nDim][nDim] = {
277  {E[2]*E[1]*E[0], E[1]*E[0], E[0], 1},
278  {E[2]*E[1]*E[0], E[1]*E[0], 1, E[0]},
279  {E[2]*E[1]*E[0], E[0], 1, E[1]*E[0]},
280  {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
281  };
282 
283  int fBuf[nDim][nDim]={
284  {E[2]*E[1], E[1], 1, E[3]*E[2]*E[1]},
285  {E[2]*E[0], E[0], 1, E[3]*E[2]*E[0]},
286  {E[1]*E[0], E[0], 1, E[3]*E[1]*E[0]},
287  {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
288  };
289 
290  //set the local processor parity
291  //switching odd and even ghost gauge when that dimension size is odd
292  //only switch if X[dir] is odd and the gridsize in that dimension is greater than 1
293  // FIXME - I don't understand this, shouldn't it be commDim(dim) == 0 ?
294  int localParity[nDim];
295  for (int d=0; d<nDim; d++)
296  localParity[dim] = ((X[dim] % 2 ==1) && (commDim(dim) > 1)) ? 1 : 0;
297  // localParity[dim] = (X[dim]%2==0 || commDim(dim)) ? 0 : 1;
298 
299  if (dim==0) {
300  ExtractGhostExArg<Order,nDim,0> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
301  C0, C1, fSrc, fBuf, localParity);
303  extractor.apply(0);
304  } else if (dim==1) {
305  ExtractGhostExArg<Order,nDim,1> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
306  C0, C1, fSrc, fBuf, localParity);
308  extractor.apply(0);
309  } else if (dim==2) {
310  ExtractGhostExArg<Order,nDim,2> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
311  C0, C1, fSrc, fBuf, localParity);
313  extractor.apply(0);
314  } else if (dim==3) {
315  ExtractGhostExArg<Order,nDim,3> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
316  C0, C1, fSrc, fBuf, localParity);
318  extractor.apply(0);
319  } else {
320  errorQuda("Invalid dim=%d", dim);
321  }
322 
323  checkCudaError();
324  }
325 
327  template <typename Float>
328  void extractGhostEx(const GaugeField &u, int dim, const int *R, Float **Ghost, bool extract) {
329 
330  const int length = 18;
331 
332  QudaFieldLocation location =
334 
335  if (u.isNative()) {
336  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
337  if (typeid(Float)==typeid(short) && u.LinkType() == QUDA_ASQTAD_FAT_LINKS) {
338  extractGhostEx<short,length>(FloatNOrder<short,length,2,19>(u, 0, (short**)Ghost),
339  dim, u.SurfaceCB(), u.X(), R, extract, u, location);
340  } else {
342  extractGhostEx<Float,length>(G(u, 0, Ghost),
343  dim, u.SurfaceCB(), u.X(), R, extract, u, location);
344  }
345  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
347  extractGhostEx<Float,length>(G(u, 0, Ghost),
348  dim, u.SurfaceCB(), u.X(), R, extract, u, location);
349  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_8) {
351  extractGhostEx<Float,length>(G(u, 0, Ghost),
352  dim, u.SurfaceCB(), u.X(), R, extract, u, location);
353  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_13) {
355  extractGhostEx<Float,length>(G(u, 0, Ghost),
356  dim, u.SurfaceCB(), u.X(), R, extract, u, location);
357  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_9) {
359  extractGhostEx<Float,length>(G(u, 0, Ghost),
360  dim, u.SurfaceCB(), u.X(), R, extract, u, location);
361  }
362  } else if (u.Order() == QUDA_QDP_GAUGE_ORDER) {
363 
364 #ifdef BUILD_QDP_INTERFACE
365  extractGhostEx<Float,length>(QDPOrder<Float,length>(u, 0, Ghost),
366  dim, u.SurfaceCB(), u.X(), R, extract, u, location);
367 #else
368  errorQuda("QDP interface has not been built\n");
369 #endif
370 
371  } else if (u.Order() == QUDA_QDPJIT_GAUGE_ORDER) {
372 
373 #ifdef BUILD_QDPJIT_INTERFACE
374  extractGhostEx<Float,length>(QDPJITOrder<Float,length>(u, 0, Ghost),
375  dim, u.SurfaceCB(), u.X(), R, extract, u, location);
376 #else
377  errorQuda("QDPJIT interface has not been built\n");
378 #endif
379 
380  } else if (u.Order() == QUDA_CPS_WILSON_GAUGE_ORDER) {
381 
382 #ifdef BUILD_CPS_INTERFACE
383  extractGhostEx<Float,length>(CPSOrder<Float,length>(u, 0, Ghost),
384  dim, u.SurfaceCB(), u.X(), R, extract, u, location);
385 #else
386  errorQuda("CPS interface has not been built\n");
387 #endif
388 
389  } else if (u.Order() == QUDA_MILC_GAUGE_ORDER) {
390 
391 #ifdef BUILD_MILC_INTERFACE
392  extractGhostEx<Float,length>(MILCOrder<Float,length>(u, 0, Ghost),
393  dim, u.SurfaceCB(), u.X(), R, extract, u, location);
394 #else
395  errorQuda("MILC interface has not been built\n");
396 #endif
397 
398  } else if (u.Order() == QUDA_BQCD_GAUGE_ORDER) {
399 
400 #ifdef BUILD_BQCD_INTERFACE
401  extractGhostEx<Float,length>(BQCDOrder<Float,length>(u, 0, Ghost),
402  dim, u.SurfaceCB(), u.X(), R, extract, u, location);
403 #else
404  errorQuda("BQCD interface has not been built\n");
405 #endif
406 
407  } else if (u.Order() == QUDA_TIFR_GAUGE_ORDER) {
408 
409 #ifdef BUILD_TIFR_INTERFACE
410  extractGhostEx<Float,length>(TIFROrder<Float,length>(u, 0, Ghost),
411  dim, u.SurfaceCB(), u.X(), R, extract, u, location);
412 #else
413  errorQuda("TIFR interface has not been built\n");
414 #endif
415 
416  } else {
417  errorQuda("Gauge field %d order not supported", u.Order());
418  }
419 
420  }
421 
422  void extractExtendedGaugeGhost(const GaugeField &u, int dim, const int *R,
423  void **ghost, bool extract) {
424 
425  if (u.Precision() == QUDA_DOUBLE_PRECISION) {
426  extractGhostEx(u, dim, R, (double**)ghost, extract);
427  } else if (u.Precision() == QUDA_SINGLE_PRECISION) {
428  extractGhostEx(u, dim, R, (float**)ghost, extract);
429  } else if (u.Precision() == QUDA_HALF_PRECISION) {
430  extractGhostEx(u, dim, R, (short**)ghost, extract);
431  } else {
432  errorQuda("Unknown precision type %d", u.Precision());
433  }
434 
435  }
436 
437 } // namespace quda
dim3 dim3 blockDim
unsigned int sharedBytesPerBlock(const TuneParam &param) const
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
QudaLinkType LinkType() const
Definition: gauge_field.h:209
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
const char * VolString() const
static int R[4]
const int * SurfaceCB() const
ExtractGhostExArg(const Order &order, const int *X_, const int *R_, const int *surfaceCB_, const int *A0_, const int *A1_, const int *B0_, const int *B1_, const int *C0_, const int *C1_, const int fBody_[nDim][nDim], const int fBuf_[nDim][nDim], const int *localParity_)
__global__ void extractGhostExKernel(ExtractGhostExArg< Order, nDim, dim > arg)
int E[4]
Definition: test_util.cpp:36
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
void extractExtendedGaugeGhost(const GaugeField &u, int dim, const int *R, void **ghost, bool extract)
for(int s=0;s< param.dc.Ls;s++)
int commDim(int)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
void extractGhostEx(ExtractGhostExArg< Order, nDim, dim > arg)
Main header file for host and device accessors to GaugeFields.
unsigned int sharedBytesPerThread() const
enum QudaFieldLocation_s QudaFieldLocation
void apply(const cudaStream_t &stream)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
void size_t length
Accessor routine for CloverFields in native field order.
ExtractGhostEx(ExtractGhostExArg< Order, nDim, dim > &arg, bool extract, const GaugeField &meta, QudaFieldLocation location)
const void * c
ExtractGhostExArg< Order, nDim, dim > arg
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:204
#define checkCudaError()
Definition: util_quda.h:129
__device__ __host__ void injector(Arg &arg, int dir, int a, int b, int c, int d, int g, int parity)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaPrecision Precision() const
bool isNative() const
QudaParity parity
Definition: covdev_test.cpp:53
__device__ __host__ void extractor(Arg &arg, int dir, int a, int b, int c, int d, int g, int parity)
#define a
const int * X() const