QUDA  0.9.0
extract_gauge_ghost_helper.cuh
Go to the documentation of this file.
1 #pragma once
2 
3 namespace quda {
4 
5  template <typename Order, int nDim>
6  struct ExtractGhostArg {
7  Order order;
8  const unsigned char nFace;
9  unsigned short X[nDim];
10  unsigned short A[nDim];
11  unsigned short B[nDim];
12  unsigned short C[nDim];
13  int f[nDim][nDim];
14  bool localParity[nDim];
15  int faceVolumeCB[nDim];
17  const int offset;
18  ExtractGhostArg(const Order &order, const GaugeField &u, const int *A_,
19  const int *B_, const int *C_, const int f_[nDim][nDim], const int *localParity_, int offset)
20  : order(order), nFace(u.Nface()), offset(offset) {
21  for (int d=0; d<nDim; d++) {
22  X[d] = u.X()[d];
23  A[d] = A_[d];
24  B[d] = B_[d];
25  C[d] = C_[d];
26  for (int e=0; e<nDim; e++) f[d][e] = f_[d][e];
27  localParity[d] = localParity_[d];
29  faceVolumeCB[d] = u.SurfaceCB(d)*u.Nface();
30  }
31  }
32  };
33 
38  template <typename Float, int length, int nDim, typename Order, bool extract>
40  typedef typename mapper<Float>::type RegType;
41 
42  for (int parity=0; parity<2; parity++) {
43 
44  for (int dim=0; dim<nDim; dim++) {
45 
46  // for now we never inject unless we have partitioned in that dimension
47  if (!arg.commDim[dim] && !extract) continue;
48 
49  // linear index used for reading/writing into ghost buffer
50  int indexGhost = 0;
51  // the following 4-way loop means this is specialized for 4 dimensions
52 
53  // FIXME redefine a, b, c, d such that we always optimize for locality
54  for (int d=arg.X[dim]-arg.nFace; d<arg.X[dim]; d++) { // loop over last nFace faces in this dimension
55  for (int a=0; a<arg.A[dim]; a++) { // loop over the surface elements of this face
56  for (int b=0; b<arg.B[dim]; b++) { // loop over the surface elements of this face
57  for (int c=0; c<arg.C[dim]; c++) { // loop over the surface elements of this face
58  // index is a checkboarded spacetime coordinate
59  int indexCB = (a*arg.f[dim][0] + b*arg.f[dim][1] + c*arg.f[dim][2] + d*arg.f[dim][3]) >> 1;
60  // we only do the extraction for parity we are currently working on
61  int oddness = (a+b+c+d) & 1;
62  if (oddness == parity) {
63 #ifdef FINE_GRAINED_ACCESS
64  for (int i=0; i<gauge::Ncolor(length); i++) {
65  for (int j=0; j<gauge::Ncolor(length); j++) {
66  if (extract) {
67  arg.order.Ghost(dim, (parity+arg.localParity[dim])&1, indexGhost, i, j)
68  = arg.order(dim+arg.offset, parity, indexCB, i, j);
69  } else { // injection
70  arg.order(dim+arg.offset, parity, indexCB, i, j)
71  = arg.order.Ghost(dim, (parity+arg.localParity[dim])&1, indexGhost, i, j);
72  }
73  }
74  }
75 #else
76  if (extract) {
77  RegType u[length];
78  arg.order.load(u, indexCB, dim+arg.offset, parity); // load the ghost element from the bulk
79  arg.order.saveGhost(u, indexGhost, dim, (parity+arg.localParity[dim])&1);
80  } else { // injection
81  RegType u[length];
82  arg.order.loadGhost(u, indexGhost, dim, (parity+arg.localParity[dim])&1);
83  arg.order.save(u, indexCB, dim+arg.offset, parity); // save the ghost element to the bulk
84  }
85 #endif
86  indexGhost++;
87  } // oddness == parity
88  } // c
89  } // b
90  } // a
91  } // d
92 
93  assert(indexGhost == arg.faceVolumeCB[dim]);
94  } // dim
95 
96  } // parity
97 
98  }
99 
105  template <typename Float, int length, int nDim, typename Order, bool extract>
107  typedef typename mapper<Float>::type RegType;
108 
109  for (int parity=0; parity<2; parity++) {
110  for (int dim=0; dim<nDim; dim++) {
111 
112  // for now we never inject unless we have partitioned in that dimension
113  if (!arg.commDim[dim] && !extract) continue;
114 
115  // linear index used for writing into ghost buffer
116  int X = blockIdx.x * blockDim.x + threadIdx.x;
117  //if (X >= 2*arg.nFace*arg.surfaceCB[dim]) continue;
118  if (X >= 2*arg.faceVolumeCB[dim]) continue;
119  // X = ((d * A + a)*B + b)*C + c
120  int dab = X/arg.C[dim];
121  int c = X - dab*arg.C[dim];
122  int da = dab/arg.B[dim];
123  int b = dab - da*arg.B[dim];
124  int d = da / arg.A[dim];
125  int a = da - d * arg.A[dim];
126  d += arg.X[dim]-arg.nFace;
127 
128  // index is a checkboarded spacetime coordinate
129  int indexCB = (a*arg.f[dim][0] + b*arg.f[dim][1] + c*arg.f[dim][2] + d*arg.f[dim][3]) >> 1;
130  // we only do the extraction for parity we are currently working on
131  int oddness = (a+b+c+d)&1;
132  if (oddness == parity) {
133 #ifdef FINE_GRAINED_ACCESS
134  for (int i=0; i<gauge::Ncolor(length); i++) {
135  for (int j=0; j<gauge::Ncolor(length); j++) {
136  if (extract) {
137  arg.order.Ghost(dim, (parity+arg.localParity[dim])&1, X>>1, i, j)
138  = arg.order(dim+arg.offset, parity, indexCB, i, j);
139  } else { // injection
140  arg.order(dim+arg.offset, parity, indexCB, i, j)
141  = arg.order.Ghost(dim, (parity+arg.localParity[dim])&1, X>>1, i, j);
142  }
143  }
144  }
145 #else
146  if (extract) {
147  RegType u[length];
148  arg.order.load(u, indexCB, dim+arg.offset, parity); // load the ghost element from the bulk
149  arg.order.saveGhost(u, X>>1, dim, (parity+arg.localParity[dim])&1);
150  } else {
151  RegType u[length];
152  arg.order.loadGhost(u, X>>1, dim, (parity+arg.localParity[dim])&1);
153  arg.order.save(u, indexCB, dim+arg.offset, parity); // save the ghost element to the bulk
154  }
155 #endif
156  } // oddness == parity
157 
158  } // dim
159 
160  } // parity
161 
162  }
163 
164  template <typename Float, int length, int nDim, typename Order>
167  int size;
168  const GaugeField &meta;
170  bool extract;
171 
172  private:
173  unsigned int sharedBytesPerThread() const { return 0; }
174  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0 ;}
175 
176  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
177  unsigned int minThreads() const { return size; }
178 
179  public:
183  int faceMax = 0;
184  for (int d=0; d<nDim; d++)
185  faceMax = (arg.faceVolumeCB[d] > faceMax ) ? arg.faceVolumeCB[d] : faceMax;
186  size = 2 * faceMax; // factor of comes from parity
187 
188 #ifndef FINE_GRAINED_ACCESS
189  writeAuxString("stride=%d", arg.order.stride);
190 #else
191  writeAuxString("fine-grained");
192 #endif
193  }
194 
195  virtual ~ExtractGhost() { ; }
196 
197  void apply(const cudaStream_t &stream) {
199  if (extract) extractGhost<Float,length,nDim,Order,true>(arg);
200  else extractGhost<Float,length,nDim,Order,false>(arg);
201  } else {
202  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
203  if (extract) {
204  extractGhostKernel<Float, length, nDim, Order, true>
205  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
206  } else {
207  extractGhostKernel<Float, length, nDim, Order, false>
208  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
209  }
210  }
211  }
212 
213  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
214 
215  long long flops() const { return 0; }
216  long long bytes() const {
217  int sites = 0;
218  for (int d=0; d<nDim; d++) sites += arg.faceVolumeCB[d];
219  return 2 * sites * 2 * arg.order.Bytes(); // parity * sites * i/o * vec size
220  }
221  };
222 
223 
228  template <typename Float, int length, typename Order>
229  void extractGhost(Order order, const GaugeField &u, QudaFieldLocation location, bool extract, int offset) {
230  const int *X = u.X();
231  const int nFace = u.Nface();
232  const int nDim = 4;
233  //loop variables: a, b, c with a the most signifcant and c the least significant
234  //A, B, C the maximum value
235  //we need to loop in d as well, d's vlaue dims[dir]-3, dims[dir]-2, dims[dir]-1
236  int A[nDim], B[nDim], C[nDim];
237  A[0] = X[3]; B[0] = X[2]; C[0] = X[1]; // X dimension face
238  A[1] = X[3]; B[1] = X[2]; C[1] = X[0]; // Y dimension face
239  A[2] = X[3]; B[2] = X[1]; C[2] = X[0]; // Z dimension face
240  A[3] = X[2]; B[3] = X[1]; C[3] = X[0]; // T dimension face
241 
242  //multiplication factor to compute index in original cpu memory
243  int f[nDim][nDim]={
244  {X[0]*X[1]*X[2], X[0]*X[1], X[0], 1},
245  {X[0]*X[1]*X[2], X[0]*X[1], 1, X[0]},
246  {X[0]*X[1]*X[2], X[0], 1, X[0]*X[1]},
247  { X[0]*X[1], X[0], 1, X[0]*X[1]*X[2]}
248  };
249 
250  //set the local processor parity
251  //switching odd and even ghost gauge when that dimension size is odd
252  //only switch if X[dir] is odd and the gridsize in that dimension is greater than 1
253  // FIXME - I don't understand this, shouldn't it be commDim(dim) == 0 ?
254  int localParity[nDim];
255  for (int dim=0; dim<nDim; dim++)
256  //localParity[dim] = (X[dim]%2==0 || commDim(dim)) ? 0 : 1;
257  localParity[dim] = ((X[dim] % 2 ==1) && (commDim(dim) > 1)) ? 1 : 0;
258 
259  ExtractGhostArg<Order, nDim> arg(order, u, A, B, C, f, localParity, offset);
260  ExtractGhost<Float,length,nDim,Order> extractor(arg, u, location, extract);
261  extractor.apply(0);
262 
263  }
264 
265 } // namespace quda
__host__ __device__ constexpr int Ncolor(int length)
Return the number of colors of the accessor based on the length of the field.
dim3 dim3 blockDim
ExtractGhostArg(const Order &order, const GaugeField &u, const int *A_, const int *B_, const int *C_, const int f_[nDim][nDim], const int *localParity_, int offset)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
cudaStream_t * stream
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
const char * VolString() const
unsigned int minThreads() const
const int * SurfaceCB() const
void extractGhost(const GaugeField &u, Float **Ghost, bool extract, int offset)
__global__ void extractGhostKernel(ExtractGhostArg< Order, nDim > arg)
int Nface() const
Definition: gauge_field.h:232
unsigned int sharedBytesPerThread() const
size_t size_t offset
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
void apply(const cudaStream_t &stream)
ExtractGhost(ExtractGhostArg< Order, nDim > &arg, const GaugeField &meta, QudaFieldLocation location, bool extract)
int commDim(int)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
int int int enum cudaChannelFormatKind f
enum QudaFieldLocation_s QudaFieldLocation
unsigned int sharedBytesPerBlock(const TuneParam &param) const
ExtractGhostArg< Order, nDim > arg
int writeAuxString(const char *format,...)
Definition: tune_quda.h:191
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
void size_t length
const void * c
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaParity parity
Definition: covdev_test.cpp:53
__device__ __host__ void extractor(Arg &arg, int dir, int a, int b, int c, int d, int g, int parity)
#define a
char aux[TuneKey::aux_n]
Definition: tune_quda.h:189
int comm_dim_partitioned(int dim)
const int * X() const