QUDA v0.7.0
A library for QCD on GPUs
extract_gauge_ghost_extended.cu
#include <quda_internal.h>
#include <gauge_field_order.h>

namespace quda {

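  /** Parameter structure for extracting/injecting the ghost zone of an
      extended gauge field. Carries the loop bounds (A0..A1, B0..B1, C0..C1),
      the interior dimensions X, the extension radius R, and the stride
      tables used for indexing the bulk (fBody) and the ghost buffer (fBuf). */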
  template <typename Order, int nDim>
  struct ExtractGhostExArg {
    Order order;
    int dim;
    int X[nDim];
    int R[nDim];
    int surfaceCB[nDim];
    int A0[nDim];
    int A1[nDim];
    int B0[nDim];
    int B1[nDim];
    int C0[nDim];
    int C1[nDim];
    int fBody[nDim][nDim];
    int fBuf[nDim][nDim];
    int localParity[nDim];

    ExtractGhostExArg(const Order &order, int dim, const int *X_, const int *R_,
                      const int *surfaceCB_,
                      const int *A0_, const int *A1_, const int *B0_, const int *B1_,
                      const int *C0_, const int *C1_, const int fBody_[nDim][nDim],
                      const int fBuf_[nDim][nDim], const int *localParity_)
      : order(order), dim(dim) {
      for (int d=0; d<nDim; d++) {
        X[d] = X_[d];
        R[d] = R_[d];
        surfaceCB[d] = surfaceCB_[d];
        A0[d] = A0_[d];
        A1[d] = A1_[d];
        B0[d] = B0_[d];
        B1[d] = B1_[d];
        C0[d] = C0_[d];
        C1[d] = C1_[d];
        for (int e=0; e<nDim; e++) {
          fBody[d][e] = fBody_[d][e];
          fBuf[d][e] = fBuf_[d][e];
        }
        localParity[d] = localParity_[d];
      }
    }
  };

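  /** Load a single site from the interior boundary of the bulk field and
      store it in the ghost buffer for the given direction and dimension. */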
  template <typename Float, int length, typename Arg>
  __device__ __host__ void extractor(Arg &arg, int dir, int a, int b,
                                     int c, int d, int g, int parity) {
    typename mapper<Float>::type u[length];
    int &dim = arg.dim;
    int srcIdx = (a*arg.fBody[dim][0] + b*arg.fBody[dim][1] +
                  c*arg.fBody[dim][2] + d*arg.fBody[dim][3]) >> 1;

    int dstIdx = (a*arg.fBuf[dim][0] + b*arg.fBuf[dim][1] +
                  c*arg.fBuf[dim][2] + (d-(dir?arg.X[dim]:arg.R[dim]))*arg.fBuf[dim][3]) >> 1;

    // load the ghost element from the bulk
    arg.order.load(u, srcIdx, g, parity);

    // need dir dependence in write
    // srcIdx is used here to determine boundary condition
    arg.order.saveGhostEx(u, dstIdx, srcIdx, dir, dim, g,
                          (parity+arg.localParity[dim])&1, arg.R);
  }

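  /** Read a single site from the ghost buffer and store it into the halo
      region of the bulk field (the inverse of extractor). */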
  template <typename Float, int length, typename Arg>
  __device__ __host__ void injector(Arg &arg, int dir, int a, int b,
                                    int c, int d, int g, int parity) {
    typename mapper<Float>::type u[length];
    int &dim = arg.dim;
    int srcIdx = (a*arg.fBuf[dim][0] + b*arg.fBuf[dim][1] +
                  c*arg.fBuf[dim][2] + (d-dir*(arg.X[dim]+arg.R[dim]))*arg.fBuf[dim][3]) >> 1;

    int dstIdx = (a*arg.fBody[dim][0] + b*arg.fBody[dim][1] +
                  c*arg.fBody[dim][2] + d*arg.fBody[dim][3]) >> 1;

    // need dir dependence in read
    // dstIdx is used here to determine boundary condition
    arg.order.loadGhostEx(u, srcIdx, dstIdx, dir, dim, g,
                          (parity+arg.localParity[dim])&1, arg.R);

    arg.order.save(u, dstIdx, g, parity); // save the ghost element into the bulk
  }
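
  // In the routines below, for a dimension dim with interior extent X[dim]
  // and radius R[dim], the slab coordinate d runs over R[dim] slices:
  //   extract, dir=0:  d in [R[dim],        2*R[dim])         (interior, low side)
  //   extract, dir=1:  d in [X[dim],        X[dim]+R[dim])    (interior, high side)
  //   inject,  dir=0:  d in [0,             R[dim])           (halo, low side)
  //   inject,  dir=1:  d in [X[dim]+R[dim], X[dim]+2*R[dim])  (halo, high side)
  // which is what D0 = extract ? dir*X + (1-dir)*R : dir*(X + R) encodes.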
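
  /** Host-side ghost extraction/injection: serial loops over parity,
      direction, the R-deep slab, the transverse (a,b,c) surface and the
      geometry index. */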
  template <typename Float, int length, int nDim, typename Order, bool extract>
  void extractGhostEx(ExtractGhostExArg<Order,nDim> arg) {
    typedef typename mapper<Float>::type RegType;

    int dim = arg.dim;

    for (int parity=0; parity<2; parity++) {

      // the following 4-way loop means this is specialized for 4 dimensions
      // dir = 0 backwards, dir = 1 forwards
      for (int dir = 0; dir<2; dir++) {

        int D0 = extract ? dir*arg.X[dim] + (1-dir)*arg.R[dim] : dir*(arg.X[dim] + arg.R[dim]);

        for (int d=D0; d<D0+arg.R[dim]; d++) {
          for (int a=arg.A0[dim]; a<arg.A1[dim]; a++) { // loop over the interior surface
            for (int b=arg.B0[dim]; b<arg.B1[dim]; b++) { // loop over the interior surface
              for (int c=arg.C0[dim]; c<arg.C1[dim]; c++) { // loop over the interior surface
                for (int g=0; g<arg.order.geometry; g++) {

                  // we only do the extraction for the parity we are currently working on
                  int oddness = (a+b+c+d) & 1;
                  if (oddness == parity) {
                    if (extract) extractor<Float,length>(arg, dir, a, b, c, d, g, parity);
                    else injector<Float,length>(arg, dir, a, b, c, d, g, parity);
                  } // oddness == parity
                } // g
              } // c
            } // b
          } // a
        } // d
      } // dir

    } // parity

  }

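  /** Device kernel: the (g,d,a,b,c) loops are flattened into the linear
      thread index, while parity and direction are mapped onto blockIdx.z
      and blockIdx.y respectively (the launch sets grid.y = grid.z = 2). */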
  template <typename Float, int length, int nDim, typename Order, bool extract>
  __global__ void extractGhostExKernel(ExtractGhostExArg<Order,nDim> arg) {
    typedef typename mapper<Float>::type RegType;

    int dim = arg.dim;

    // parallelize over parity and dir using block or grid
    /*for (int parity=0; parity<2; parity++) {*/
    {
      int parity = blockIdx.z;

      // the following 4-way loop means this is specialized for 4 dimensions
      // dir = 0 backwards, dir = 1 forwards
      //for (int dir = 0; dir<2; dir++) {
      {
        int dir = blockIdx.y;

        // this will have two-warp divergence since we only do work on
        // one parity, but parity alternates between threads
        // linear index used for writing into ghost buffer
        int X = blockIdx.x * blockDim.x + threadIdx.x;

        int dA = arg.A1[dim]-arg.A0[dim];
        int dB = arg.B1[dim]-arg.B0[dim];
        int dC = arg.C1[dim]-arg.C0[dim];
        int D0 = extract ? dir*arg.X[dim] + (1-dir)*arg.R[dim] : dir*(arg.X[dim] + arg.R[dim]);

        if (X >= arg.R[dim]*dA*dB*dC*arg.order.geometry) return;

        // thread order is optimized to maximize coalescing
        // X = (((g*R + d) * dA + a)*dB + b)*dC + c
        int gdab = X / dC;
        int c = arg.C0[dim] + X    - gdab*dC;
        int gda = gdab / dB;
        int b = arg.B0[dim] + gdab - gda*dB;
        int gd = gda / dA;
        int a = arg.A0[dim] + gda  - gd*dA;
        int g = gd / arg.R[dim];
        int d = D0           + gd  - g*arg.R[dim];

        // we only do the extraction for the parity we are currently working on
        int oddness = (a+b+c+d) & 1;
        if (oddness == parity) {
          if (extract) extractor<Float,length>(arg, dir, a, b, c, d, g, parity);
          else injector<Float,length>(arg, dir, a, b, c, d, g, parity);
        } // oddness == parity
      } // dir

    } // parity

  }
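
  // A minimal host-side sketch (not part of the original source) of the
  // inverse index mapping used in the kernel above: given
  //   X = (((g*R + dd)*dA + aa)*dB + bb)*dC + cc
  // the division/remainder chain recovers (g, dd, aa, bb, cc); the kernel
  // then adds the offsets arg.A0[dim], arg.B0[dim], arg.C0[dim] and D0.
  // The function name is hypothetical and serves only as illustration.
  inline void decodeGhostIndexSketch(int X, int R, int dA, int dB, int dC,
                                     int &g, int &dd, int &aa, int &bb, int &cc) {
    int gdab = X / dC;    cc = X    - gdab*dC;
    int gda  = gdab / dB; bb = gdab - gda*dB;
    int gd   = gda / dA;  aa = gda  - gd*dA;
    g = gd / R;           dd = gd   - g*R;
  }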
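
  /** Tunable wrapper that dispatches the ghost extraction/injection to the
      CPU routine or the GPU kernel. */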
  template <typename Float, int length, int nDim, typename Order>
  class ExtractGhostEx : Tunable {
    ExtractGhostExArg<Order,nDim> arg;
    int size;
    bool extract;
    const GaugeField &meta;
    QudaFieldLocation location;

  private:
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    bool tuneGridDim() const { return false; } // don't tune the grid dimensions
    unsigned int minThreads() const { return size; }

  public:
    ExtractGhostEx(ExtractGhostExArg<Order,nDim> &arg, bool extract,
                   const GaugeField &meta, QudaFieldLocation location)
      : arg(arg), extract(extract), meta(meta), location(location) {
      int dA = arg.A1[arg.dim]-arg.A0[arg.dim];
      int dB = arg.B1[arg.dim]-arg.B0[arg.dim];
      int dC = arg.C1[arg.dim]-arg.C0[arg.dim];
      size = arg.R[arg.dim]*dA*dB*dC*arg.order.geometry;
      writeAuxString("prec=%lu,stride=%d,extract=%d,dimension=%d",
                     sizeof(Float), arg.order.stride, extract, arg.dim);
    }
    virtual ~ExtractGhostEx() { ; }

    void apply(const cudaStream_t &stream) {
      if (extract) {
        if (location==QUDA_CPU_FIELD_LOCATION) {
          extractGhostEx<Float,length,nDim,Order,true>(arg);
        } else {
#if (__COMPUTE_CAPABILITY__ >= 200)
          TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
          tp.grid.y = 2;
          tp.grid.z = 2;
          extractGhostExKernel<Float,length,nDim,Order,true>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#else
          errorQuda("extractGhostEx not supported on pre-Fermi architecture");
#endif
        }
      } else { // we are injecting
        if (location==QUDA_CPU_FIELD_LOCATION) {
          extractGhostEx<Float,length,nDim,Order,false>(arg);
        } else {
#if (__COMPUTE_CAPABILITY__ >= 200)
          TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
          tp.grid.y = 2;
          tp.grid.z = 2;
          extractGhostExKernel<Float,length,nDim,Order,false>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#else
          errorQuda("extractGhostEx not supported on pre-Fermi architecture");
#endif
        }
      }
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }

    std::string paramString(const TuneParam &param) const { // don't bother printing the grid dim
      std::stringstream ps;
      ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
      ps << "shared=" << param.shared_bytes;
      return ps.str();
    }

    long long flops() const { return 0; }
    long long bytes() const { return 2 * 2 * 2 * size * arg.order.Bytes(); } // 2 for i/o
  };
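
  /** Generic driver: sets up the loop bounds and stride tables for the
      given dimension and launches the extraction or injection. */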
  template <typename Float, int length, typename Order>
  void extractGhostEx(Order order, const int dim, const int *surfaceCB, const int *E,
                      const int *R, bool extract, const GaugeField &u, QudaFieldLocation location) {
    const int nDim = 4;
    //loop variables: a, b, c with a the most significant and c the least significant
    //A0, B0, C0 the minimum values
    //A1, B1, C1 the maximum values

    int X[nDim]; // compute interior dimensions
    for (int d=0; d<nDim; d++) X[d] = E[d] - 2*R[d];

    //..........x..........y............z.............t
    int A0[nDim] = {R[3], R[3], R[3], 0};
    int A1[nDim] = {X[3]+R[3], X[3]+R[3], X[3]+R[3], X[2]+2*R[2]};

    int B0[nDim] = {R[2], R[2], 0, 0};
    int B1[nDim] = {X[2]+R[2], X[2]+R[2], X[1]+2*R[1], X[1]+2*R[1]};

    int C0[nDim] = {R[1], 0, 0, 0};
    int C1[nDim] = {X[1]+R[1], X[0]+2*R[0], X[0]+2*R[0], X[0]+2*R[0]};

    int fSrc[nDim][nDim] = {
      {E[2]*E[1]*E[0], E[1]*E[0], E[0], 1},
      {E[2]*E[1]*E[0], E[1]*E[0], 1, E[0]},
      {E[2]*E[1]*E[0], E[0], 1, E[1]*E[0]},
      {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
    };

    int fBuf[nDim][nDim] = {
      {E[2]*E[1], E[1], 1, E[3]*E[2]*E[1]},
      {E[2]*E[0], E[0], 1, E[3]*E[2]*E[0]},
      {E[1]*E[0], E[0], 1, E[3]*E[1]*E[0]},
      {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
    };
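
    // Each row of fSrc/fBuf gives the strides of the loop coordinates
    // (a,b,c,d) for one choice of dim. For example, for dim==3 the source
    // index is a*E[1]*E[0] + b*E[0] + c + d*E[2]*E[1]*E[0], i.e. the usual
    // lexicographical site index ((d*E[2] + a)*E[1] + b)*E[0] + c with
    // (a,b,c,d) = (z,y,x,t) on the extended lattice.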

    //set the local processor parity
    //switching odd and even ghost gauge when that dimension size is odd
    //only switch if X[dir] is odd and the gridsize in that dimension is greater than 1
    // FIXME - I don't understand this, shouldn't it be commDim(dim) == 0 ?
    int localParity[nDim];
    for (int d=0; d<nDim; d++)
      localParity[d] = ((X[d] % 2 == 1) && (commDim(d) > 1)) ? 1 : 0;
    //  localParity[dim] = (X[dim]%2==0 || commDim(dim)) ? 0 : 1;

    ExtractGhostExArg<Order, nDim> arg(order, dim, X, R, surfaceCB, A0, A1, B0, B1,
                                       C0, C1, fSrc, fBuf, localParity);
    ExtractGhostEx<Float,length,nDim,Order> extractor(arg, extract, u, location);
    extractor.apply(0);
    if (location == QUDA_CUDA_FIELD_LOCATION) {
      cudaDeviceSynchronize(); // need to sync before we commence any communication
      checkCudaError();
    }
  }
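
  /** Dispatch ghost extraction on the gauge field order, reconstruction
      and link type. */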
  template <typename Float>
  void extractGhostEx(const GaugeField &u, int dim, const int *R, Float **Ghost, bool extract) {

    const int length = 18;

    QudaFieldLocation location =
      (typeid(u)==typeid(cudaGaugeField)) ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION;

    if (u.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
      if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
        if (typeid(Float)==typeid(short) && u.LinkType() == QUDA_ASQTAD_FAT_LINKS) {
          extractGhostEx<Float,length>(FloatNOrder<Float,length,2,19>(u, 0, Ghost),
                                       dim, u.SurfaceCB(), u.X(), R, extract, u, location);
        } else {
          extractGhostEx<Float,length>(FloatNOrder<Float,length,2,18>(u, 0, Ghost),
                                       dim, u.SurfaceCB(), u.X(), R, extract, u, location);
        }
      } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
        extractGhostEx<Float,length>(FloatNOrder<Float,length,2,12>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
      } else if (u.Reconstruct() == QUDA_RECONSTRUCT_8) {
        extractGhostEx<Float,length>(FloatNOrder<Float,length,2,8>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
      } else if (u.Reconstruct() == QUDA_RECONSTRUCT_13) {
        extractGhostEx<Float,length>(FloatNOrder<Float,length,2,13>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
      } else if (u.Reconstruct() == QUDA_RECONSTRUCT_9) {
        extractGhostEx<Float,length>(FloatNOrder<Float,length,2,9>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
      }
    } else if (u.Order() == QUDA_FLOAT4_GAUGE_ORDER) {
      if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
        if (typeid(Float)==typeid(short) && u.LinkType() == QUDA_ASQTAD_FAT_LINKS) {
          extractGhostEx<Float,length>(FloatNOrder<Float,length,1,19>(u, 0, Ghost),
                                       dim, u.SurfaceCB(), u.X(), R, extract, u, location);
        } else {
          extractGhostEx<Float,length>(FloatNOrder<Float,length,1,18>(u, 0, Ghost),
                                       dim, u.SurfaceCB(), u.X(), R, extract, u, location);
        }
      } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
        extractGhostEx<Float,length>(FloatNOrder<Float,length,4,12>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
      } else if (u.Reconstruct() == QUDA_RECONSTRUCT_8) {
        extractGhostEx<Float,length>(FloatNOrder<Float,length,4,8>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
      } else if (u.Reconstruct() == QUDA_RECONSTRUCT_13) {
        extractGhostEx<Float,length>(FloatNOrder<Float,length,4,13>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
      } else if (u.Reconstruct() == QUDA_RECONSTRUCT_9) {
        extractGhostEx<Float,length>(FloatNOrder<Float,length,4,9>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
      }
    } else if (u.Order() == QUDA_QDP_GAUGE_ORDER) {

#ifdef BUILD_QDP_INTERFACE
      extractGhostEx<Float,length>(QDPOrder<Float,length>(u, 0, Ghost),
                                   dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
      errorQuda("QDP interface has not been built\n");
#endif

    } else if (u.Order() == QUDA_QDPJIT_GAUGE_ORDER) {

#ifdef BUILD_QDPJIT_INTERFACE
      extractGhostEx<Float,length>(QDPJITOrder<Float,length>(u, 0, Ghost),
                                   dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
      errorQuda("QDPJIT interface has not been built\n");
#endif

    } else if (u.Order() == QUDA_CPS_WILSON_GAUGE_ORDER) {

#ifdef BUILD_CPS_INTERFACE
      extractGhostEx<Float,length>(CPSOrder<Float,length>(u, 0, Ghost),
                                   dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
      errorQuda("CPS interface has not been built\n");
#endif

    } else if (u.Order() == QUDA_MILC_GAUGE_ORDER) {

#ifdef BUILD_MILC_INTERFACE
      extractGhostEx<Float,length>(MILCOrder<Float,length>(u, 0, Ghost),
                                   dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
      errorQuda("MILC interface has not been built\n");
#endif

    } else if (u.Order() == QUDA_BQCD_GAUGE_ORDER) {

#ifdef BUILD_BQCD_INTERFACE
      extractGhostEx<Float,length>(BQCDOrder<Float,length>(u, 0, Ghost),
                                   dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
      errorQuda("BQCD interface has not been built\n");
#endif

    } else if (u.Order() == QUDA_TIFR_GAUGE_ORDER) {

#ifdef BUILD_TIFR_INTERFACE
      extractGhostEx<Float,length>(TIFROrder<Float,length>(u, 0, Ghost),
                                   dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
      errorQuda("TIFR interface has not been built\n");
#endif

    } else {
      errorQuda("Gauge field order %d not supported", u.Order());
    }

  }

  void extractExtendedGaugeGhost(const GaugeField &u, int dim, const int *R,
                                 void **ghost, bool extract) {

    if (u.Precision() == QUDA_DOUBLE_PRECISION) {
      extractGhostEx(u, dim, R, (double**)ghost, extract);
    } else if (u.Precision() == QUDA_SINGLE_PRECISION) {
      extractGhostEx(u, dim, R, (float**)ghost, extract);
    } else if (u.Precision() == QUDA_HALF_PRECISION) {
      extractGhostEx(u, dim, R, (short**)ghost, extract);
    } else {
      errorQuda("Unknown precision type %d", u.Precision());
    }

  }
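
  // A minimal usage sketch (not from the original file): extract the
  // boundary of an extended field into caller-allocated buffers, exchange
  // them with the neighboring processes, then inject the received data
  // into the halo. The buffer layout and the communication step are
  // placeholders; only extractExtendedGaugeGhost is the real entry point.
  //
  //   void exchangeExtendedGhostSketch(const GaugeField &u, void **ghost, const int R[4]) {
  //     for (int d=0; d<4; d++) extractExtendedGaugeGhost(u, d, R, ghost, true);
  //     // ... send ghost[d] to the neighbors and receive their data ...
  //     for (int d=0; d<4; d++) extractExtendedGaugeGhost(u, d, R, ghost, false);
  //   }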

} // namespace quda