QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
extract_gauge_ghost_helper.cuh
Go to the documentation of this file.
1 #include <tune_quda.h>
2 #include <quda_matrix.h>
3 
4 #pragma once
5 
6 namespace quda {
7 
8  using namespace gauge;
9 
10  template <typename Float, int nColor_, typename Order, int nDim>
11  struct ExtractGhostArg {
12  using real = typename mapper<Float>::type;
13  static constexpr int nColor = nColor_;
14  Order order;
15  const unsigned char nFace;
16  unsigned short X[nDim];
17  unsigned short A[nDim];
18  unsigned short B[nDim];
19  unsigned short C[nDim];
20  int f[nDim][nDim];
21  bool localParity[nDim];
22  int faceVolumeCB[nDim];
24  const int offset;
25  ExtractGhostArg(const Order &order, const GaugeField &u, const int *A_,
26  const int *B_, const int *C_, const int f_[nDim][nDim], const int *localParity_, int offset)
27  : order(order), nFace(u.Nface()), offset(offset) {
28  for (int d=0; d<nDim; d++) {
29  X[d] = u.X()[d];
30  A[d] = A_[d];
31  B[d] = B_[d];
32  C[d] = C_[d];
33  for (int e=0; e<nDim; e++) f[d][e] = f_[d][e];
34  localParity[d] = localParity_[d];
35  commDim[d] = comm_dim_partitioned(d);
36  faceVolumeCB[d] = u.SurfaceCB(d)*u.Nface();
37  }
38  }
39  };
40 
45  template <int nDim, bool extract, typename Arg>
47  {
48  using real = typename Arg::real;
49  constexpr int nColor = Arg::nColor;
50 
51  for (int parity=0; parity<2; parity++) {
52 
53  for (int dim=0; dim<nDim; dim++) {
54 
55  // for now we never inject unless we have partitioned in that dimension
56  if (!arg.commDim[dim] && !extract) continue;
57 
58  // linear index used for reading/writing into ghost buffer
59  int indexGhost = 0;
60  // the following 4-way loop means this is specialized for 4 dimensions
61 
62  // FIXME redefine a, b, c, d such that we always optimize for locality
63  for (int d=arg.X[dim]-arg.nFace; d<arg.X[dim]; d++) { // loop over last nFace faces in this dimension
64  for (int a=0; a<arg.A[dim]; a++) { // loop over the surface elements of this face
65  for (int b=0; b<arg.B[dim]; b++) { // loop over the surface elements of this face
66  for (int c=0; c<arg.C[dim]; c++) { // loop over the surface elements of this face
67  // index is a checkboarded spacetime coordinate
68  int indexCB = (a*arg.f[dim][0] + b*arg.f[dim][1] + c*arg.f[dim][2] + d*arg.f[dim][3]) >> 1;
69  // we only do the extraction for parity we are currently working on
70  int oddness = (a+b+c+d) & 1;
71  if (oddness == parity) {
72 #ifdef FINE_GRAINED_ACCESS
73  for (int i=0; i<nColor; i++) {
74  for (int j=0; j<nColor; j++) {
75  if (extract) {
76  arg.order.Ghost(dim, (parity+arg.localParity[dim])&1, indexGhost, i, j)
77  = arg.order(dim+arg.offset, parity, indexCB, i, j);
78  } else { // injection
79  arg.order(dim+arg.offset, parity, indexCB, i, j)
80  = arg.order.Ghost(dim, (parity+arg.localParity[dim])&1, indexGhost, i, j);
81  }
82  }
83  }
84 #else
85  if (extract) {
86  // load the ghost element from the bulk
87  Matrix<complex<real>, nColor> u = arg.order(dim+arg.offset, indexCB, parity);
88  arg.order.Ghost(dim, indexGhost, (parity+arg.localParity[dim])&1) = u;
89  } else { // injection
90  Matrix <complex<real>, nColor> u = arg.order.Ghost(dim, indexGhost, (parity+arg.localParity[dim])&1);
91  arg.order(dim+arg.offset, indexCB, parity) = u; // save the ghost element to the bulk
92  }
93 #endif
94  indexGhost++;
95  } // oddness == parity
96  } // c
97  } // b
98  } // a
99  } // d
100 
101  assert(indexGhost == arg.faceVolumeCB[dim]);
102  } // dim
103 
104  } // parity
105 
106  }
107 
113  template <int nDim, bool extract, typename Arg>
114  __global__ void extractGhostKernel(Arg arg)
115  {
116  using real = typename Arg::real;
117  constexpr int nColor = Arg::nColor;
118 
119  int parity_dim = blockIdx.z * blockDim.z + threadIdx.z; //parity_dim = parity*nDim + dim
120  int parity = parity_dim / nDim;
121  int dim = parity_dim % nDim;
122  if (parity_dim >= 2 * nDim) return;
123 
124  // for now we never inject unless we have partitioned in that dimension
125  if (!arg.commDim[dim] && !extract) return;
126 
127  // linear index used for writing into ghost buffer
128  int X = blockIdx.x * blockDim.x + threadIdx.x;
129  //if (X >= 2*arg.nFace*arg.surfaceCB[dim]) continue;
130  if (X >= 2*arg.faceVolumeCB[dim]) return;
131  // X = ((d * A + a)*B + b)*C + c
132  int dab = X/arg.C[dim];
133  int c = X - dab*arg.C[dim];
134  int da = dab/arg.B[dim];
135  int b = dab - da*arg.B[dim];
136  int d = da / arg.A[dim];
137  int a = da - d * arg.A[dim];
138  d += arg.X[dim]-arg.nFace;
139 
140  // index is a checkboarded spacetime coordinate
141  int indexCB = (a*arg.f[dim][0] + b*arg.f[dim][1] + c*arg.f[dim][2] + d*arg.f[dim][3]) >> 1;
142  // we only do the extraction for parity we are currently working on
143  int oddness = (a+b+c+d)&1;
144  if (oddness == parity) {
145 #ifdef FINE_GRAINED_ACCESS
146  int i = blockIdx.y * blockDim.y + threadIdx.y;
147  if (i >= nColor) return;
148  for (int j=0; j<nColor; j++) {
149  if (extract) {
150  arg.order.Ghost(dim, (parity+arg.localParity[dim])&1, X>>1, i, j)
151  = arg.order(dim+arg.offset, parity, indexCB, i, j);
152  } else { // injection
153  arg.order(dim+arg.offset, parity, indexCB, i, j)
154  = arg.order.Ghost(dim, (parity+arg.localParity[dim])&1, X>>1, i, j);
155  }
156  }
157 #else
158  if (extract) {
159  // load the ghost element from the bulk
160  Matrix<complex<real>, nColor> u = arg.order(dim+arg.offset, indexCB, parity);
161  arg.order.Ghost(dim, X>>1, (parity+arg.localParity[dim])&1) = u;
162  } else { // injection
163  Matrix <complex<real>, nColor> u = arg.order.Ghost(dim, X>>1, (parity+arg.localParity[dim])&1);
164  arg.order(dim+arg.offset, indexCB, parity) = u; // save the ghost element to the bulk
165  }
166 #endif
167  } // oddness == parity
168  }
169 
170  template <int nDim, typename Arg>
173  int size;
174  const GaugeField &meta;
176  bool extract;
177 
178  private:
179  unsigned int sharedBytesPerThread() const { return 0; }
180  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0 ;}
181 
182  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
183  unsigned int minThreads() const { return size; }
184 
185  public:
186  ExtractGhost(Arg &arg, const GaugeField &meta, QudaFieldLocation location, bool extract)
187 #ifndef FINE_GRAINED_ACCESS
188  : TunableVectorYZ(1, 2*nDim), arg(arg), meta(meta), location(location), extract(extract) {
189 #else
190  : TunableVectorYZ(Arg::nColor, 2*nDim), arg(arg), meta(meta), location(location), extract(extract) {
191 #endif
192  int faceMax = 0;
193  for (int d=0; d<nDim; d++)
194  faceMax = (arg.faceVolumeCB[d] > faceMax ) ? arg.faceVolumeCB[d] : faceMax;
195  size = 2 * faceMax; // factor of comes from parity
196 
197 #ifndef FINE_GRAINED_ACCESS
198  writeAuxString("stride=%d", arg.order.stride);
199 #else
200  writeAuxString("fine-grained");
201 #endif
202  }
203 
204  virtual ~ExtractGhost() { ; }
205 
206  void apply(const cudaStream_t &stream) {
207  if (location==QUDA_CPU_FIELD_LOCATION) {
208  if (extract) extractGhost<nDim,true>(arg);
209  else extractGhost<nDim,false>(arg);
210  } else {
211  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
212  if (extract) {
213  extractGhostKernel<nDim, true> <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
214  } else {
215  extractGhostKernel<nDim, false> <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
216  }
217  }
218  }
219 
220  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
221 
222  long long flops() const { return 0; }
223  long long bytes() const {
224  int sites = 0;
225  for (int d=0; d<nDim; d++) sites += arg.faceVolumeCB[d];
226  return 2 * sites * 2 * arg.order.Bytes(); // parity * sites * i/o * vec size
227  }
228  };
229 
230 
235  template <typename Float, int length, typename Order>
236  void extractGhost(Order order, const GaugeField &u, QudaFieldLocation location, bool extract, int offset) {
237  const int *X = u.X();
238  constexpr int nDim = 4;
239  //loop variables: a, b, c with a the most signifcant and c the least significant
240  //A, B, C the maximum value
241  //we need to loop in d as well, d's vlaue dims[dir]-3, dims[dir]-2, dims[dir]-1
242  int A[nDim], B[nDim], C[nDim];
243  A[0] = X[3]; B[0] = X[2]; C[0] = X[1]; // X dimension face
244  A[1] = X[3]; B[1] = X[2]; C[1] = X[0]; // Y dimension face
245  A[2] = X[3]; B[2] = X[1]; C[2] = X[0]; // Z dimension face
246  A[3] = X[2]; B[3] = X[1]; C[3] = X[0]; // T dimension face
247 
248  //multiplication factor to compute index in original cpu memory
249  int f[nDim][nDim]={
250  {X[0]*X[1]*X[2], X[0]*X[1], X[0], 1},
251  {X[0]*X[1]*X[2], X[0]*X[1], 1, X[0]},
252  {X[0]*X[1]*X[2], X[0], 1, X[0]*X[1]},
253  { X[0]*X[1], X[0], 1, X[0]*X[1]*X[2]}
254  };
255 
256  //set the local processor parity
257  //switching odd and even ghost gauge when that dimension size is odd
258  //only switch if X[dir] is odd and the gridsize in that dimension is greater than 1
259  // FIXME - I don't understand this, shouldn't it be commDim(dim) == 0 ?
260  int localParity[nDim];
261  for (int dim=0; dim<nDim; dim++)
262  //localParity[dim] = (X[dim]%2==0 || commDim(dim)) ? 0 : 1;
263  localParity[dim] = ((X[dim] % 2 ==1) && (commDim(dim) > 1)) ? 1 : 0;
264 
265  ExtractGhostArg<Float, gauge::Ncolor(length), Order, nDim> arg(order, u, A, B, C, f, localParity, offset);
266  ExtractGhost<nDim, decltype(arg)> extractor(arg, u, location, extract);
267  extractor.apply(0);
268  }
269 
270 } // namespace quda
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
cudaStream_t * stream
const char * VolString() const
const int * SurfaceCB() const
unsigned int minThreads() const
void extractGhost(const GaugeField &u, Float **Ghost, bool extract, int offset)
typename mapper< Float >::type real
ExtractGhostArg(const Order &order, const GaugeField &u, const int *A_, const int *B_, const int *C_, const int f_[nDim][nDim], const int *localParity_, int offset)
int Nface() const
Definition: gauge_field.h:281
QudaGaugeParam param
Definition: pack_test.cpp:17
unsigned int sharedBytesPerThread() const
const int nColor
Definition: covdev_test.cpp:75
constexpr int size
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
int X[4]
Definition: covdev_test.cpp:70
enum QudaFieldLocation_s QudaFieldLocation
static int commDim[QUDA_MAX_DIM]
Definition: dslash_pack.cuh:9
unsigned int sharedBytesPerBlock(const TuneParam &param) const
ExtractGhost(Arg &arg, const GaugeField &meta, QudaFieldLocation location, bool extract)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
__global__ void extractGhostKernel(Arg arg)
void apply(const cudaStream_t &stream)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaParity parity
Definition: covdev_test.cpp:54
__device__ __host__ void extractor(Arg &arg, int dir, int a, int b, int c, int d, int g, int parity)
int comm_dim_partitioned(int dim)
const int * X() const