extract_gauge_ghost_extended.cu
#include <quda_internal.h>
#include <tune_quda.h>
#include <gauge_field_order.h>
#include <quda_matrix.h>

namespace quda {

  using namespace gauge;

  template <typename Order, int nDim, int dim>
  struct ExtractGhostExArg {
    Order order;
    int X[nDim];
    int R[nDim];
    int surfaceCB[nDim];
    int A0[nDim];
    int A1[nDim];
    int B0[nDim];
    int B1[nDim];
    int C0[nDim];
    int C1[nDim];
    int fBody[nDim][nDim];
    int fBuf[nDim][nDim];
    int localParity[nDim];
    int threads;
    ExtractGhostExArg(const Order &order, const int *X_, const int *R_,
                      const int *surfaceCB_,
                      const int *A0_, const int *A1_, const int *B0_, const int *B1_,
                      const int *C0_, const int *C1_, const int fBody_[nDim][nDim],
                      const int fBuf_[nDim][nDim], const int *localParity_)
      : order(order), threads(0) {

      threads = R_[dim]*(A1_[dim]-A0_[dim])*(B1_[dim]-B0_[dim])*(C1_[dim]-C0_[dim])*order.geometry;

      for (int d=0; d<nDim; d++) {
        X[d] = X_[d];
        R[d] = R_[d];
        surfaceCB[d] = surfaceCB_[d];
        A0[d] = A0_[d];
        A1[d] = A1_[d];
        B0[d] = B0_[d];
        B1[d] = B1_[d];
        C0[d] = C0_[d];
        C1[d] = C1_[d];
        for (int e=0; e<nDim; e++) {
          fBody[d][e] = fBody_[d][e];
          fBuf[d][e] = fBuf_[d][e];
        }
        localParity[d] = localParity_[d];
      }
    }

  };
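
  // Illustrative sketch (hypothetical helper, not used below): fBody and fBuf
  // are per-dimension stride tables, so the index arithmetic in extractor()
  // and injector() is a dot product of the coordinate (a,b,c,d) with a stride
  // row, with the final >>1 converting the full-lattice offset into a
  // checkerboard (even/odd) index.
  static inline __host__ __device__ int exampleCBIndex(const int f[4], int a, int b, int c, int d)
  {
    // dot the coordinate with the stride row, then drop the parity bit
    return (a*f[0] + b*f[1] + c*f[2] + d*f[3]) >> 1;
  }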

  template <typename Float, int length, int dim, typename Arg>
  __device__ __host__ void extractor(Arg &arg, int dir, int a, int b,
                                     int c, int d, int g, int parity) {
    int srcIdx = (a*arg.fBody[dim][0] + b*arg.fBody[dim][1] +
                  c*arg.fBody[dim][2] + d*arg.fBody[dim][3]) >> 1;

    int dstIdx = (a*arg.fBuf[dim][0] + b*arg.fBuf[dim][1] +
                  c*arg.fBuf[dim][2] + (d-(dir?arg.X[dim]:arg.R[dim]))*arg.fBuf[dim][3]) >> 1;

    Matrix<complex<Float>, Ncolor(length)> u;

    // load the ghost element from the bulk
    u = arg.order(g, srcIdx, parity);

    // need dir dependence in write
    // srcIdx is used here to determine boundary condition
    arg.order.saveGhostEx(u.data, dstIdx, srcIdx, dir, dim, g,
                          (parity+arg.localParity[dim])&1, arg.R);
  }


  template <typename Float, int length, int dim, typename Arg>
  __device__ __host__ void injector(Arg &arg, int dir, int a, int b,
                                    int c, int d, int g, int parity) {
    int srcIdx = (a*arg.fBuf[dim][0] + b*arg.fBuf[dim][1] +
                  c*arg.fBuf[dim][2] + (d-dir*(arg.X[dim]+arg.R[dim]))*arg.fBuf[dim][3]) >> 1;

    int dstIdx = (a*arg.fBody[dim][0] + b*arg.fBody[dim][1] +
                  c*arg.fBody[dim][2] + d*arg.fBody[dim][3]) >> 1;

    int oddness = (parity+arg.localParity[dim])&1;

    Matrix<complex<Float>, Ncolor(length)> u;

    // need dir dependence in read
    // dstIdx is used here to determine boundary condition
    arg.order.loadGhostEx(u.data, srcIdx, dstIdx, dir, dim, g, oddness, arg.R);

    arg.order(g, dstIdx, parity) = u; // save the ghost element into the bulk
  }
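
  // Worked example (assumed sizes): take dim==3 with interior extent X[3]=8
  // and halo depth R[3]=2.  For extraction, dir==1 gives D0=X[3]=8, so the
  // extractor reads bulk slices d=8,9 into buffer slices d-X[3]=0,1, while
  // dir==0 gives D0=R[3]=2, reading slices d=2,3 into buffer slices 0,1.
  // For injection, dir==0 gives D0=0 (halo slices d=0,1) and dir==1 gives
  // D0=X[3]+R[3]=10 (halo slices d=10,11), mirroring the offsets above.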

  /**
     Generic CPU gauge ghost extraction and packing.
     NB This routine is specialized to four dimensions.
  */
  template <typename Float, int length, int nDim, int dim, typename Order, bool extract>
  void extractGhostEx(ExtractGhostExArg<Order,nDim,dim> arg)
  {
    for (int parity=0; parity<2; parity++) {

      // the following 4-way loop means this is specialized for 4 dimensions
      // dir = 0 backwards, dir = 1 forwards
      for (int dir = 0; dir<2; dir++) {

        int D0 = extract ? dir*arg.X[dim] + (1-dir)*arg.R[dim] : dir*(arg.X[dim] + arg.R[dim]);

        for (int d=D0; d<D0+arg.R[dim]; d++) {
          for (int a=arg.A0[dim]; a<arg.A1[dim]; a++) { // loop over the interior surface
            for (int b=arg.B0[dim]; b<arg.B1[dim]; b++) { // loop over the interior surface
              for (int c=arg.C0[dim]; c<arg.C1[dim]; c++) { // loop over the interior surface
                for (int g=0; g<arg.order.geometry; g++) {

                  // we only do the extraction for the parity we are currently working on
                  int oddness = (a+b+c+d) & 1;
                  if (oddness == parity) {
                    if (extract) extractor<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
                    else injector<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
                  } // oddness == parity
                } // g
              } // c
            } // b
          } // a
        } // d
      } // dir

    } // parity

  }
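
  // Illustrative sketch (hypothetical helper, not used below): per (parity,
  // dir) pair the five nested loops above visit exactly the count stored in
  // arg.threads, which the GPU kernel below uses as its x-dimension bound.
  template <int dim, typename Arg>
  inline int exampleWorkCount(const Arg &arg)
  {
    return arg.R[dim] * (arg.A1[dim]-arg.A0[dim]) * (arg.B1[dim]-arg.B0[dim])
      * (arg.C1[dim]-arg.C0[dim]) * arg.order.geometry; // == arg.threads
  }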

  /**
     Generic GPU gauge ghost extraction and packing.
     NB This routine is specialized to four dimensions.
  */
  template <typename Float, int length, int nDim, int dim, typename Order, bool extract>
  __global__ void extractGhostExKernel(ExtractGhostExArg<Order,nDim,dim> arg)
  {
    // parallelize over parity and dir using the grid
    /*for (int parity=0; parity<2; parity++) {*/
    {
      int parity = blockIdx.z;

      // the following 4-way loop means this is specialized for 4 dimensions
      // dir = 0 backwards, dir = 1 forwards
      //for (int dir = 0; dir<2; dir++) {
      {
        int dir = blockIdx.y;

        // this will have two-warp divergence since we only do work on
        // one parity but parity alternates between threads
        // linear index used for writing into ghost buffer
        int X = blockIdx.x * blockDim.x + threadIdx.x;
        if (X >= arg.threads) return;

        int dA = arg.A1[dim]-arg.A0[dim];
        int dB = arg.B1[dim]-arg.B0[dim];
        int dC = arg.C1[dim]-arg.C0[dim];
        int D0 = extract ? dir*arg.X[dim] + (1-dir)*arg.R[dim] : dir*(arg.X[dim] + arg.R[dim]);

        // thread order is optimized to maximize coalescing
        // X = (((g*R + d) * dA + a)*dB + b)*dC + c
        int gdab = X / dC;
        int c    = arg.C0[dim] + X    - gdab*dC;
        int gda  = gdab / dB;
        int b    = arg.B0[dim] + gdab - gda*dB;
        int gd   = gda / dA;
        int a    = arg.A0[dim] + gda  - gd*dA;
        int g    = gd / arg.R[dim];
        int d    = D0 + gd - g*arg.R[dim];

        // we only do the extraction for the parity we are currently working on
        int oddness = (a+b+c+d) & 1;
        if (oddness == parity) {
          if (extract) extractor<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
          else injector<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
        } // oddness == parity
      } // dir

    } // parity

  }
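
  // Illustrative self-check (hypothetical helper, host-side): the division
  // chain above inverts the linearization X = (((g*R + d')*dA + a')*dB + b')*dC + c',
  // where the primed values are zero-based offsets from A0/B0/C0/D0.  A round
  // trip under assumed extents makes the inversion explicit.
  static inline bool exampleDecomposeRoundTrip(int R, int dA, int dB, int dC,
                                               int g, int d, int a, int b, int c)
  {
    int X = (((g*R + d)*dA + a)*dB + b)*dC + c; // linearize zero-based offsets
    int gdab = X / dC;    int c2 = X - gdab*dC;    // peel off c
    int gda  = gdab / dB; int b2 = gdab - gda*dB;  // peel off b
    int gd   = gda / dA;  int a2 = gda - gd*dA;    // peel off a
    int g2   = gd / R;    int d2 = gd - g2*R;      // peel off d and g
    return g2==g && d2==d && a2==a && b2==b && c2==c;
  }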

  template <typename Float, int length, int nDim, int dim, typename Order>
  class ExtractGhostEx : Tunable {
    ExtractGhostExArg<Order,nDim,dim> arg;
    int size;
    bool extract;
    const GaugeField &meta;
    QudaFieldLocation location;

  private:
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    bool tuneGridDim() const { return false; } // don't tune the grid dimensions
    unsigned int minThreads() const { return size; }

  public:
    ExtractGhostEx(ExtractGhostExArg<Order,nDim,dim> &arg, bool extract,
                   const GaugeField &meta, QudaFieldLocation location)
      : arg(arg), extract(extract), meta(meta), location(location) {
      int dA = arg.A1[dim]-arg.A0[dim];
      int dB = arg.B1[dim]-arg.B0[dim];
      int dC = arg.C1[dim]-arg.C0[dim];
      size = arg.R[dim]*dA*dB*dC*arg.order.geometry;
      writeAuxString("prec=%lu,stride=%d,extract=%d,dimension=%d,geometry=%d",
                     sizeof(Float), arg.order.stride, extract, dim, arg.order.geometry);
    }
    virtual ~ExtractGhostEx() { ; }

    void apply(const cudaStream_t &stream) {
      if (extract) {
        if (location==QUDA_CPU_FIELD_LOCATION) {
          extractGhostEx<Float,length,nDim,dim,Order,true>(arg);
        } else {
          TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
          tp.grid.y = 2; // forwards and backwards
          tp.grid.z = 2; // even and odd parity
          extractGhostExKernel<Float,length,nDim,dim,Order,true>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        }
      } else { // we are injecting
        if (location==QUDA_CPU_FIELD_LOCATION) {
          extractGhostEx<Float,length,nDim,dim,Order,false>(arg);
        } else {
          TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
          tp.grid.y = 2; // forwards and backwards
          tp.grid.z = 2; // even and odd parity
          extractGhostExKernel<Float,length,nDim,dim,Order,false>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        }
      }
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }

    long long flops() const { return 0; }
    long long bytes() const { return 2 * 2 * 2 * size * arg.order.Bytes(); } // 2 for i/o
  };
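
  // Usage sketch (hypothetical order type): apply() autotunes only the block
  // size, since tuneGridDim() returns false, and then pins grid.y/grid.z to
  // cover the two directions and two parities:
  //
  //   ExtractGhostEx<float, 18, 4, 3, SomeOrder> worker(arg, /*extract=*/true, u, location);
  //   worker.apply(stream); // GPU path launches <<<{gx,2,2}, block>>>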

  /**
     Host driver for extended-gauge ghost extraction and packing (or the
     reverse injection), specialized to four dimensions.  E gives the
     extended local lattice dimensions and R the radius of the extended
     region in each dimension, so the interior extent is X[d] = E[d] - 2*R[d].
  */
  template <typename Float, int length, typename Order>
  void extractGhostEx(Order order, const int dim, const int *surfaceCB, const int *E,
                      const int *R, bool extract, const GaugeField &u, QudaFieldLocation location) {
    const int nDim = 4;
    //loop variables: a, b, c with a the most significant and c the least significant
    //A0, B0, C0 the minimum values
    //A1, B1, C1 the maximum values

    int X[nDim]; // compute interior dimensions
    for (int d=0; d<nDim; d++) X[d] = E[d] - 2*R[d];

    //..........x..........y............z.............t
    int A0[nDim] = {R[3], R[3], R[3], 0};
    int A1[nDim] = {X[3]+R[3], X[3]+R[3], X[3]+R[3], X[2]+2*R[2]};

    int B0[nDim] = {R[2], R[2], 0, 0};
    int B1[nDim] = {X[2]+R[2], X[2]+R[2], X[1]+2*R[1], X[1]+2*R[1]};

    int C0[nDim] = {R[1], 0, 0, 0};
    int C1[nDim] = {X[1]+R[1], X[0]+2*R[0], X[0]+2*R[0], X[0]+2*R[0]};

    int fSrc[nDim][nDim] = {
      {E[2]*E[1]*E[0], E[1]*E[0], E[0], 1},
      {E[2]*E[1]*E[0], E[1]*E[0], 1, E[0]},
      {E[2]*E[1]*E[0], E[0], 1, E[1]*E[0]},
      {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
    };

    int fBuf[nDim][nDim] = {
      {E[2]*E[1], E[1], 1, E[3]*E[2]*E[1]},
      {E[2]*E[0], E[0], 1, E[3]*E[2]*E[0]},
      {E[1]*E[0], E[0], 1, E[3]*E[1]*E[0]},
      {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
    };

    //set the local processor parity
    //switching odd and even ghost gauge when that dimension size is odd
    //only switch if X[d] is odd and the grid size in that dimension is greater than 1
    // FIXME - I don't understand this, shouldn't it be commDim(dim) == 0 ?
    int localParity[nDim];
    for (int d=0; d<nDim; d++) // fill every entry; only localParity[dim] is read below
      localParity[d] = ((X[d] % 2 == 1) && (commDim(d) > 1)) ? 1 : 0;
    // localParity[dim] = (X[dim]%2==0 || commDim(dim)) ? 0 : 1;

    if (dim==0) {
      ExtractGhostExArg<Order,nDim,0> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
                                          C0, C1, fSrc, fBuf, localParity);
      ExtractGhostEx<Float,length,nDim,0,Order> extractor(arg, extract, u, location);
      extractor.apply(0);
    } else if (dim==1) {
      ExtractGhostExArg<Order,nDim,1> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
                                          C0, C1, fSrc, fBuf, localParity);
      ExtractGhostEx<Float,length,nDim,1,Order> extractor(arg, extract, u, location);
      extractor.apply(0);
    } else if (dim==2) {
      ExtractGhostExArg<Order,nDim,2> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
                                          C0, C1, fSrc, fBuf, localParity);
      ExtractGhostEx<Float,length,nDim,2,Order> extractor(arg, extract, u, location);
      extractor.apply(0);
    } else if (dim==3) {
      ExtractGhostExArg<Order,nDim,3> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
                                          C0, C1, fSrc, fBuf, localParity);
      ExtractGhostEx<Float,length,nDim,3,Order> extractor(arg, extract, u, location);
      extractor.apply(0);
    } else {
      errorQuda("Invalid dim=%d", dim);
    }

    checkCudaError();
  }
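
  // Note (illustrative): dim is dispatched through the if/else chain above
  // because it is a compile-time template parameter of the argument struct
  // and kernels.  A hypothetical caller packing every dimension would loop
  // over the runtime value:
  //
  //   for (int d = 0; d < 4; d++)
  //     extractGhostEx<Float, length>(order, d, surfaceCB, E, R, extract, u, location);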

  template <typename Float>
  void extractGhostEx(const GaugeField &u, int dim, const int *R, Float **Ghost, bool extract) {

    const int length = 18;

    QudaFieldLocation location =
      (typeid(u) == typeid(cudaGaugeField)) ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION;

    if (u.isNative()) {
      if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
        typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
        extractGhostEx<Float,length>(G(u, 0, Ghost), dim, u.SurfaceCB(), u.X(), R, extract, u, location);
      } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
        typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
        extractGhostEx<Float,length>(G(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
      } else if (u.Reconstruct() == QUDA_RECONSTRUCT_8) {
        typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type G;
        extractGhostEx<Float,length>(G(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
      } else if (u.Reconstruct() == QUDA_RECONSTRUCT_13) {
        typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_13>::type G;
        extractGhostEx<Float,length>(G(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
      } else if (u.Reconstruct() == QUDA_RECONSTRUCT_9) {
        typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_9>::type G;
        extractGhostEx<Float,length>(G(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
      }
    } else if (u.Order() == QUDA_QDP_GAUGE_ORDER) {

#ifdef BUILD_QDP_INTERFACE
      extractGhostEx<Float,length>(QDPOrder<Float,length>(u, 0, Ghost),
                                   dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
      errorQuda("QDP interface has not been built\n");
#endif

    } else if (u.Order() == QUDA_QDPJIT_GAUGE_ORDER) {

#ifdef BUILD_QDPJIT_INTERFACE
      extractGhostEx<Float,length>(QDPJITOrder<Float,length>(u, 0, Ghost),
                                   dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
      errorQuda("QDPJIT interface has not been built\n");
#endif

    } else if (u.Order() == QUDA_CPS_WILSON_GAUGE_ORDER) {

#ifdef BUILD_CPS_INTERFACE
      extractGhostEx<Float,length>(CPSOrder<Float,length>(u, 0, Ghost),
                                   dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
      errorQuda("CPS interface has not been built\n");
#endif

    } else if (u.Order() == QUDA_MILC_GAUGE_ORDER) {

#ifdef BUILD_MILC_INTERFACE
      extractGhostEx<Float,length>(MILCOrder<Float,length>(u, 0, Ghost),
                                   dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
      errorQuda("MILC interface has not been built\n");
#endif

    } else if (u.Order() == QUDA_BQCD_GAUGE_ORDER) {

#ifdef BUILD_BQCD_INTERFACE
      extractGhostEx<Float,length>(BQCDOrder<Float,length>(u, 0, Ghost),
                                   dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
      errorQuda("BQCD interface has not been built\n");
#endif

    } else if (u.Order() == QUDA_TIFR_GAUGE_ORDER) {

#ifdef BUILD_TIFR_INTERFACE
      extractGhostEx<Float,length>(TIFROrder<Float,length>(u, 0, Ghost),
                                   dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
      errorQuda("TIFR interface has not been built\n");
#endif

    } else {
      errorQuda("Gauge field order %d not supported", u.Order());
    }

  }

  void extractExtendedGaugeGhost(const GaugeField &u, int dim, const int *R,
                                 void **ghost, bool extract) {

    if (u.Precision() == QUDA_DOUBLE_PRECISION) {
      extractGhostEx(u, dim, R, (double**)ghost, extract);
    } else if (u.Precision() == QUDA_SINGLE_PRECISION) {
      extractGhostEx(u, dim, R, (float**)ghost, extract);
    } else if (u.Precision() == QUDA_HALF_PRECISION) {
      extractGhostEx(u, dim, R, (short**)ghost, extract);
    } else {
      errorQuda("Unknown precision type %d", u.Precision());
    }

  }
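
  // Usage sketch (hypothetical buffer setup): pack the ghost zones of an
  // extended gauge field before communication, then inject the received
  // halos afterwards:
  //
  //   int R[4] = {2, 2, 2, 2}; // halo depth per dimension
  //   void *ghost[4];          // assumed allocated to the ghost-zone sizes
  //   for (int d = 0; d < 4; d++)
  //     extractExtendedGaugeGhost(u, d, R, ghost, true);  // extract/pack
  //   // ... exchange buffers between neighboring processes ...
  //   for (int d = 0; d < 4; d++)
  //     extractExtendedGaugeGhost(u, d, R, ghost, false); // inject/unpack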

} // namespace quda