extract_gauge_ghost_extended.cu (QUDA v1.1.0, a library for QCD on GPUs)
#include <quda_internal.h>
#include <tune_quda.h>
#include <gauge_field_order.h>
#include <quda_matrix.h>
#include <instantiate.h>

namespace quda {

  using namespace gauge;

  template <typename Order, int nDim, int dim>
  struct ExtractGhostExArg {
    Order order;
    int X[nDim];           // interior lattice dimensions
    int R[nDim];           // radius of the extended region in each dimension
    int surfaceCB[nDim];   // checkerboarded surface volumes
    int A0[nDim];          // loop bounds: A0 <= a < A1, etc.
    int A1[nDim];
    int B0[nDim];
    int B1[nDim];
    int C0[nDim];
    int C1[nDim];
    int fBody[nDim][nDim]; // strides for indexing into the bulk field
    int fBuf[nDim][nDim];  // strides for indexing into the ghost buffer
    int localParity[nDim];
    int threads;           // number of ghost-zone sites (per parity/dir) to process
    ExtractGhostExArg(const Order &order, const int *X_, const int *R_,
                      const int *surfaceCB_,
                      const int *A0_, const int *A1_, const int *B0_, const int *B1_,
                      const int *C0_, const int *C1_, const int fBody_[nDim][nDim],
                      const int fBuf_[nDim][nDim], const int *localParity_)
      : order(order), threads(0) {

      threads = R_[dim]*(A1_[dim]-A0_[dim])*(B1_[dim]-B0_[dim])*(C1_[dim]-C0_[dim])*order.geometry;

      for (int d=0; d<nDim; d++) {
        X[d] = X_[d];
        R[d] = R_[d];
        surfaceCB[d] = surfaceCB_[d];
        A0[d] = A0_[d];
        A1[d] = A1_[d];
        B0[d] = B0_[d];
        B1[d] = B1_[d];
        C0[d] = C0_[d];
        C1[d] = C1_[d];
        for (int e=0; e<nDim; e++) {
          fBody[d][e] = fBody_[d][e];
          fBuf[d][e] = fBuf_[d][e];
        }
        localParity[d] = localParity_[d];
      }
    }

  };

  template <typename Float, int length, int dim, typename Arg>
  __device__ __host__ void extractor(Arg &arg, int dir, int a, int b,
                                     int c, int d, int g, int parity) {
    int srcIdx = (a*arg.fBody[dim][0] + b*arg.fBody[dim][1] +
                  c*arg.fBody[dim][2] + d*arg.fBody[dim][3]) >> 1;

    int dstIdx = (a*arg.fBuf[dim][0] + b*arg.fBuf[dim][1] +
                  c*arg.fBuf[dim][2] + (d-(dir?arg.X[dim]:arg.R[dim]))*arg.fBuf[dim][3]) >> 1;

    Matrix<complex<typename mapper<Float>::type>, Ncolor(length)> u;

    // load the ghost element from the bulk
    u = arg.order(g, srcIdx, parity);

    // need dir dependence in write
    // srcIdx is used here to determine boundary condition
    arg.order.saveGhostEx(u.data, dstIdx, srcIdx, dir, dim, g, (parity+arg.localParity[dim])&1, arg.R);
  }

  template <typename Float, int length, int dim, typename Arg>
  __device__ __host__ void injector(Arg &arg, int dir, int a, int b,
                                    int c, int d, int g, int parity) {
    int srcIdx = (a*arg.fBuf[dim][0] + b*arg.fBuf[dim][1] +
                  c*arg.fBuf[dim][2] + (d-dir*(arg.X[dim]+arg.R[dim]))*arg.fBuf[dim][3]) >> 1;

    int dstIdx = (a*arg.fBody[dim][0] + b*arg.fBody[dim][1] +
                  c*arg.fBody[dim][2] + d*arg.fBody[dim][3]) >> 1;

    int oddness = (parity+arg.localParity[dim])&1;

    Matrix<complex<typename mapper<Float>::type>, Ncolor(length)> u;

    // need dir dependence in read
    // dstIdx is used here to determine boundary condition
    arg.order.loadGhostEx(u.data, srcIdx, dstIdx, dir, dim, g, oddness, arg.R);

    arg.order(g, dstIdx, parity) = u; // save the ghost element into the bulk
  }

  /**
     Generic CPU gauge ghost extraction and packing
     NB This routine is specialized to four dimensions
  */
  template <typename Float, int length, int dim, bool extract, typename Arg>
  void extractGhostEx(Arg &arg)
  {
    for (int parity=0; parity<2; parity++) {

      // the following 4-way loop means this is specialized for 4 dimensions
      // dir = 0 backwards, dir = 1 forwards
      for (int dir = 0; dir<2; dir++) {

        int D0 = extract ? dir*arg.X[dim] + (1-dir)*arg.R[dim] : dir*(arg.X[dim] + arg.R[dim]);

        for (int d=D0; d<D0+arg.R[dim]; d++) {
          for (int a=arg.A0[dim]; a<arg.A1[dim]; a++) { // loop over the interior surface
            for (int b=arg.B0[dim]; b<arg.B1[dim]; b++) { // loop over the interior surface
              for (int c=arg.C0[dim]; c<arg.C1[dim]; c++) { // loop over the interior surface
                for (int g=0; g<arg.order.geometry; g++) {

                  // we only do the extraction for the parity we are currently working on
                  int oddness = (a+b+c+d) & 1;
                  if (oddness == parity) {
                    if (extract) extractor<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
                    else injector<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
                  } // oddness == parity
                } // g
              } // c
            } // b
          } // a
        } // d
      } // dir

    } // parity

  }

  /**
     Generic GPU gauge ghost extraction and packing
     NB This routine is specialized to four dimensions
     FIXME this implementation will have two-way warp divergence
  */
  template <typename Float, int length, int dim, bool extract, typename Arg>
  __global__ void extractGhostExKernel(Arg arg)
  {
    // parallelize over parity and dir using block or grid
    /*for (int parity=0; parity<2; parity++) {*/
    {
      int parity = blockIdx.z;

      // the following 4-way loop means this is specialized for 4 dimensions
      // dir = 0 backwards, dir = 1 forwards
      //for (int dir = 0; dir<2; dir++) {
      {
        int dir = blockIdx.y;

        // this will have two-way warp divergence since we only do work on
        // one parity but parity alternates between threads
        // linear index used for writing into ghost buffer
        int X = blockIdx.x * blockDim.x + threadIdx.x;
        if (X >= arg.threads) return;

        int dA = arg.A1[dim]-arg.A0[dim];
        int dB = arg.B1[dim]-arg.B0[dim];
        int dC = arg.C1[dim]-arg.C0[dim];
        int D0 = extract ? dir*arg.X[dim] + (1-dir)*arg.R[dim] : dir*(arg.X[dim] + arg.R[dim]);

        // thread order is optimized to maximize coalescing
        // X = (((g*R + d) * dA + a)*dB + b)*dC + c
        int gdab = X / dC;
        int c = arg.C0[dim] + X - gdab*dC;
        int gda = gdab / dB;
        int b = arg.B0[dim] + gdab - gda*dB;
        int gd = gda / dA;
        int a = arg.A0[dim] + gda - gd*dA;
        int g = gd / arg.R[dim];
        int d = D0 + gd - g*arg.R[dim];
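        // Worked example of the inverse mapping above (illustrative numbers only,
        // not taken from any real launch): take R = 2, dA = 3, dB = 4, dC = 5 and
        // offsets (g, d-D0, a-A0, b-B0, c-C0) = (1, 0, 1, 2, 2). Packing gives
        //   X = (((1*2 + 0)*3 + 1)*4 + 2)*5 + 2 = 152,
        // and the divisions recover the offsets: gdab = 152/5 = 30 (so c-C0 = 2),
        // gda = 30/4 = 7 (b-B0 = 2), gd = 7/3 = 2 (a-A0 = 1), g = 2/2 = 1 (d-D0 = 0).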

        // we only do the extraction for the parity we are currently working on
        int oddness = (a+b+c+d) & 1;
        if (oddness == parity) {
          if (extract) extractor<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
          else injector<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
        } // oddness == parity
      } // dir

    } // parity

  }

  template <typename Float, int length, int nDim, int dim, typename Order>
  class ExtractGhostEx : Tunable {
    ExtractGhostExArg<Order,nDim,dim> arg;
    int size;
    bool extract;
    const GaugeField &meta;
    QudaFieldLocation location;

  private:
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    bool tuneGridDim() const { return false; } // don't tune the grid dimensions
    unsigned int minThreads() const { return size; }

  public:
    ExtractGhostEx(ExtractGhostExArg<Order,nDim,dim> &arg, bool extract,
                   const GaugeField &meta, QudaFieldLocation location)
      : arg(arg), extract(extract), meta(meta), location(location) {
      int dA = arg.A1[dim]-arg.A0[dim];
      int dB = arg.B1[dim]-arg.B0[dim];
      int dC = arg.C1[dim]-arg.C0[dim];
      size = arg.R[dim]*dA*dB*dC*arg.order.geometry;
      writeAuxString("prec=%lu,stride=%d,extract=%d,dimension=%d,geometry=%d",
                     sizeof(Float), arg.order.stride, extract, dim, arg.order.geometry);
    }

    void apply(const qudaStream_t &stream) {
      if (extract) {
        if (location==QUDA_CPU_FIELD_LOCATION) {
          extractGhostEx<Float,length,dim,true>(arg);
        } else {
          TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
          tp.grid.y = 2; // forwards and backwards directions
          tp.grid.z = 2; // both parities
          qudaLaunchKernel(extractGhostExKernel<Float,length,dim,true,decltype(arg)>, tp, stream, arg);
        }
      } else { // we are injecting
        if (location==QUDA_CPU_FIELD_LOCATION) {
          extractGhostEx<Float,length,dim,false>(arg);
        } else {
          TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
          tp.grid.y = 2; // forwards and backwards directions
          tp.grid.z = 2; // both parities
          qudaLaunchKernel(extractGhostExKernel<Float,length,dim,false,decltype(arg)>, tp, stream, arg);
        }
      }
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }

    long long flops() const { return 0; }
    long long bytes() const { return 2 * 2 * 2 * size * arg.order.Bytes(); } // 2 for parity, 2 for direction, 2 for i/o
  };

  /**
     Generic gauge ghost extraction and packing (dispatches to the CPU or
     GPU implementation based on location)
     NB This routine is specialized to four dimensions
     @param E the extended gauge dimensions
     @param R array holding the radius of the extended region
     @param extract Whether we are extracting or injecting the ghost zone
  */
  template <typename Float, int length, typename Order>
  void extractGhostEx(Order order, const int dim, const int *surfaceCB, const int *E,
                      const int *R, bool extract, const GaugeField &u, QudaFieldLocation location) {
    const int nDim = 4;
    //loop variables: a, b, c with a the most significant and c the least significant
    //A0, B0, C0 the minimum values
    //A1, B1, C1 the maximum values

    int X[nDim]; // compute interior dimensions
    for (int d=0; d<nDim; d++) X[d] = E[d] - 2*R[d];

    //..........x..........y............z.............t
    int A0[nDim] = {R[3], R[3], R[3], 0};
    int A1[nDim] = {X[3]+R[3], X[3]+R[3], X[3]+R[3], X[2]+2*R[2]};

    int B0[nDim] = {R[2], R[2], 0, 0};
    int B1[nDim] = {X[2]+R[2], X[2]+R[2], X[1]+2*R[1], X[1]+2*R[1]};

    int C0[nDim] = {R[1], 0, 0, 0};
    int C1[nDim] = {X[1]+R[1], X[0]+2*R[0], X[0]+2*R[0], X[0]+2*R[0]};

    int fSrc[nDim][nDim] = {
      {E[2]*E[1]*E[0], E[1]*E[0], E[0], 1},
      {E[2]*E[1]*E[0], E[1]*E[0], 1, E[0]},
      {E[2]*E[1]*E[0], E[0], 1, E[1]*E[0]},
      {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
    };

    int fBuf[nDim][nDim] = {
      {E[2]*E[1], E[1], 1, E[3]*E[2]*E[1]},
      {E[2]*E[0], E[0], 1, E[3]*E[2]*E[0]},
      {E[1]*E[0], E[0], 1, E[3]*E[1]*E[0]},
      {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
    };

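    // Reading the stride tables: for extraction dimension d, row fSrc[d] holds the
    // strides that multiply the loop coordinates (a, b, c, d) to form a lexicographic
    // site index in the extended bulk, and fBuf[d] holds the corresponding strides
    // into the ghost buffer. For example, for dim==3 the loops run a over z, b over y,
    // c over x and d over t, so fSrc[3] = {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]} is just
    // the usual z/y/x/t stride set of an E[0]*E[1]*E[2]*E[3] volume (the >>1 in
    // extractor/injector then converts the full index to a checkerboard index).
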
    //set the local processor parity
    //switching odd and even ghost gauge when that dimension size is odd
    //only switch if X[dir] is odd and the gridsize in that dimension is greater than 1
    // FIXME - I don't understand this, shouldn't it be commDim(dim) == 0 ?
    // NB the original wrote localParity[dim] inside the loop over d, leaving the
    // other entries uninitialized when copied into the Arg struct; index with d instead
    int localParity[nDim];
    for (int d=0; d<nDim; d++)
      localParity[d] = ((X[d] % 2 == 1) && (commDim(d) > 1)) ? 1 : 0;
    // localParity[dim] = (X[dim]%2==0 || commDim(dim)) ? 0 : 1;

    if (dim==0) {
      ExtractGhostExArg<Order,nDim,0> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
                                          C0, C1, fSrc, fBuf, localParity);
      ExtractGhostEx<Float,length,nDim,0,Order> extractor(arg, extract, u, location);
      extractor.apply(0);
    } else if (dim==1) {
      ExtractGhostExArg<Order,nDim,1> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
                                          C0, C1, fSrc, fBuf, localParity);
      ExtractGhostEx<Float,length,nDim,1,Order> extractor(arg, extract, u, location);
      extractor.apply(0);
    } else if (dim==2) {
      ExtractGhostExArg<Order,nDim,2> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
                                          C0, C1, fSrc, fBuf, localParity);
      ExtractGhostEx<Float,length,nDim,2,Order> extractor(arg, extract, u, location);
      extractor.apply(0);
    } else if (dim==3) {
      ExtractGhostExArg<Order,nDim,3> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
                                          C0, C1, fSrc, fBuf, localParity);
      ExtractGhostEx<Float,length,nDim,3,Order> extractor(arg, extract, u, location);
      extractor.apply(0);
    } else {
      errorQuda("Invalid dim=%d", dim);
    }
  }

  /** This is the template driver for extractGhostEx */
  template <typename Float> struct GhostExtractEx {
    GhostExtractEx(const GaugeField &u, int dim, const int *R, void **Ghost_, bool extract)
    {
      Float **Ghost = reinterpret_cast<Float**>(Ghost_);
      const int length = 18;

      QudaFieldLocation location =
        (typeid(u)==typeid(cudaGaugeField)) ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION;

      if (u.isNative()) {
        if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
          typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO>::type G;
          extractGhostEx<Float, length>(G(u, 0, Ghost), dim, u.SurfaceCB(), u.X(), R, extract, u, location);
        } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
#if QUDA_RECONSTRUCT & 2
          typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
          extractGhostEx<Float,length>(G(u, 0, Ghost),
                                       dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
          errorQuda("QUDA_RECONSTRUCT = %d does not enable QUDA_RECONSTRUCT_12", QUDA_RECONSTRUCT);
#endif
        } else if (u.Reconstruct() == QUDA_RECONSTRUCT_8) {
#if QUDA_RECONSTRUCT & 1
          typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type G;
          extractGhostEx<Float,length>(G(u, 0, Ghost),
                                       dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
          errorQuda("QUDA_RECONSTRUCT = %d does not enable QUDA_RECONSTRUCT_8", QUDA_RECONSTRUCT);
#endif
        } else if (u.Reconstruct() == QUDA_RECONSTRUCT_13) {
#if QUDA_RECONSTRUCT & 2
          typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_13>::type G;
          extractGhostEx<Float,length>(G(u, 0, Ghost),
                                       dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
          errorQuda("QUDA_RECONSTRUCT = %d does not enable QUDA_RECONSTRUCT_13", QUDA_RECONSTRUCT);
#endif
        } else if (u.Reconstruct() == QUDA_RECONSTRUCT_9) {
#if QUDA_RECONSTRUCT & 1
          typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_9>::type G;
          extractGhostEx<Float,length>(G(u, 0, Ghost),
                                       dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
          errorQuda("QUDA_RECONSTRUCT = %d does not enable QUDA_RECONSTRUCT_9", QUDA_RECONSTRUCT);
#endif
        }
      } else if (u.Order() == QUDA_QDP_GAUGE_ORDER) {

#ifdef BUILD_QDP_INTERFACE
        extractGhostEx<Float,length>(QDPOrder<Float,length>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
        errorQuda("QDP interface has not been built\n");
#endif

      } else if (u.Order() == QUDA_QDPJIT_GAUGE_ORDER) {

#ifdef BUILD_QDPJIT_INTERFACE
        extractGhostEx<Float,length>(QDPJITOrder<Float,length>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
        errorQuda("QDPJIT interface has not been built\n");
#endif

      } else if (u.Order() == QUDA_CPS_WILSON_GAUGE_ORDER) {

#ifdef BUILD_CPS_INTERFACE
        extractGhostEx<Float,length>(CPSOrder<Float,length>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
        errorQuda("CPS interface has not been built\n");
#endif

      } else if (u.Order() == QUDA_MILC_GAUGE_ORDER) {

#ifdef BUILD_MILC_INTERFACE
        extractGhostEx<Float,length>(MILCOrder<Float,length>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
        errorQuda("MILC interface has not been built\n");
#endif

      } else if (u.Order() == QUDA_BQCD_GAUGE_ORDER) {

#ifdef BUILD_BQCD_INTERFACE
        extractGhostEx<Float,length>(BQCDOrder<Float,length>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
        errorQuda("BQCD interface has not been built\n");
#endif

      } else if (u.Order() == QUDA_TIFR_GAUGE_ORDER) {

#ifdef BUILD_TIFR_INTERFACE
        extractGhostEx<Float,length>(TIFROrder<Float,length>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
        errorQuda("TIFR interface has not been built\n");
#endif

      } else {
        errorQuda("Gauge field %d order not supported", u.Order());
      }

    }
  };

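  /**
     Entry point: extract (or inject) the ghost zone of an extended gauge
     field for the given dimension into (or from) the supplied ghost buffers.

     A minimal usage sketch, assuming an extended gauge field `u` with
     extension radius R[4] and caller-allocated per-dimension ghost buffers
     (the buffer names below are hypothetical, not part of this file):

     @code
     int R[4] = {2, 2, 2, 2};                        // extension radius in x,y,z,t
     void *ghost[4] = {buf_x, buf_y, buf_z, buf_t};  // caller-allocated buffers
     for (int d = 0; d < 4; d++)
       extractExtendedGaugeGhost(u, d, R, ghost, true); // true = extract, false = inject
     @endcode
  */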
  void extractExtendedGaugeGhost(const GaugeField &u, int dim, const int *R,
                                 void **ghost, bool extract)
  {
    instantiatePrecision<GhostExtractEx>(u, dim, R, ghost, extract);
  }

} // namespace quda