QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
extract_gauge_ghost.cu
Go to the documentation of this file.
1 #include <gauge_field_order.h>
2 
3 namespace quda {
4  template <typename Order, int nDim>
5  struct ExtractGhostArg {
6  Order order;
7  const unsigned char nFace;
8  unsigned short X[nDim];
9  unsigned short A[nDim];
10  unsigned short B[nDim];
11  unsigned short C[nDim];
12  int f[nDim][nDim];
13  bool localParity[nDim];
14  ExtractGhostArg(const Order &order, int nFace, const int *X_, const int *A_,
15  const int *B_, const int *C_, const int f_[nDim][nDim], const int *localParity_)
16  : order(order), nFace(nFace) {
17  for (int d=0; d<nDim; d++) {
18  X[d] = X_[d];
19  A[d] = A_[d];
20  B[d] = B_[d];
21  C[d] = C_[d];
22  for (int e=0; e<nDim; e++) f[d][e] = f_[d][e];
23  localParity[d] = localParity_[d];
24  }
25  }
26  };
27 
32  template <typename Float, int length, int nDim, typename Order>
34  typedef typename mapper<Float>::type RegType;
35 
36  for (int parity=0; parity<2; parity++) {
37 
38  for (int dim=0; dim<nDim; dim++) {
39 
40  // linear index used for writing into ghost buffer
41  int indexDst = 0;
42  // the following 4-way loop means this is specialized for 4 dimensions
43 
44  // FIXME redefine a, b, c, d such that we always optimize for locality
45  for (int d=arg.X[dim]-arg.nFace; d<arg.X[dim]; d++) { // loop over last nFace faces in this dimension
46  for (int a=0; a<arg.A[dim]; a++) { // loop over the surface elements of this face
47  for (int b=0; b<arg.B[dim]; b++) { // loop over the surface elements of this face
48  for (int c=0; c<arg.C[dim]; c++) { // loop over the surface elements of this face
49  // index is a checkboarded spacetime coordinate
50  int indexCB = (a*arg.f[dim][0] + b*arg.f[dim][1] + c*arg.f[dim][2] + d*arg.f[dim][3]) >> 1;
51  // we only do the extraction for parity we are currently working on
52  int oddness = (a+b+c+d) & 1;
53  if (oddness == parity) {
54  RegType u[length];
55  arg.order.load(u, indexCB, dim, parity); // load the ghost element from the bulk
56  arg.order.saveGhost(u, indexDst, dim, (parity+arg.localParity[dim])&1);
57  indexDst++;
58  } // oddness == parity
59  } // c
60  } // b
61  } // a
62  } // d
63 
64  //assert(indexDst == arg.nFace*arg.surfaceCB[dim]);
65  assert(indexDst == arg.order.faceVolumeCB[dim]);
66  } // dim
67 
68  } // parity
69 
70  }
71 
77  template <typename Float, int length, int nDim, typename Order>
79  typedef typename mapper<Float>::type RegType;
80 
81  for (int parity=0; parity<2; parity++) {
82  for (int dim=0; dim<nDim; dim++) {
83 
84  // linear index used for writing into ghost buffer
85  int X = blockIdx.x * blockDim.x + threadIdx.x;
86  //if (X >= 2*arg.nFace*arg.surfaceCB[dim]) continue;
87  if (X >= 2*arg.order.faceVolumeCB[dim]) continue;
88  // X = ((d * A + a)*B + b)*C + c
89  int dab = X/arg.C[dim];
90  int c = X - dab*arg.C[dim];
91  int da = dab/arg.B[dim];
92  int b = dab - da*arg.B[dim];
93  int d = da / arg.A[dim];
94  int a = da - d * arg.A[dim];
95  d += arg.X[dim]-arg.nFace;
96 
97  // index is a checkboarded spacetime coordinate
98  int indexCB = (a*arg.f[dim][0] + b*arg.f[dim][1] + c*arg.f[dim][2] + d*arg.f[dim][3]) >> 1;
99  // we only do the extraction for parity we are currently working on
100  int oddness = (a+b+c+d)&1;
101  if (oddness == parity) {
102  RegType u[length];
103  arg.order.load(u, indexCB, dim, parity); // load the ghost element from the bulk
104  arg.order.saveGhost(u, X>>1, dim, (parity+arg.localParity[dim])&1);
105  } // oddness == parity
106 
107  } // dim
108 
109  } // parity
110 
111  }
112 
113  template <typename Float, int length, int nDim, typename Order>
116  int size;
117  const GaugeField &meta;
118 
119  private:
120  unsigned int sharedBytesPerThread() const { return 0; }
121  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0 ;}
122 
123  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
124  unsigned int minThreads() const { return size; }
125 
126  public:
127  ExtractGhost(ExtractGhostArg<Order,nDim> &arg, const GaugeField &meta) : arg(arg), meta(meta) {
128  int faceMax = 0;
129  for (int d=0; d<nDim; d++)
130  faceMax = (arg.order.faceVolumeCB[d] > faceMax )
131  ? arg.order.faceVolumeCB[d] : faceMax;
132  size = 2 * faceMax; // factor of comes from parity
133 
134  writeAuxString("stride=%d", arg.order.stride);
135  }
136 
137  virtual ~ExtractGhost() { ; }
138 
139  void apply(const cudaStream_t &stream) {
140 #if (__COMPUTE_CAPABILITY__ >= 200)
141  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
142  extractGhostKernel<Float, length, nDim, Order>
143  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
144 #else
145  errorQuda("extractGhost not supported on pre-Fermi architecture");
146 #endif
147  }
148 
149  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
150 
151  std::string paramString(const TuneParam &param) const { // Don't bother printing the grid dim.
152  std::stringstream ps;
153  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
154  ps << "shared=" << param.shared_bytes;
155  return ps.str();
156  }
157 
158  long long flops() const { return 0; }
159  long long bytes() const {
160  int sites = 0;
161  for (int d=0; d<nDim; d++) sites += arg.order.faceVolumeCB[d];
162  return 2 * sites * 2 * arg.order.Bytes(); // parity * sites * i/o * vec size
163  }
164  };
165 
166 
171  template <typename Float, int length, typename Order>
172  void extractGhost(Order order, const GaugeField &u, QudaFieldLocation location) {
173  const int *X = u.X();
174  const int nFace = u.Nface();
175  const int nDim = 4;
176  //loop variables: a, b, c with a the most signifcant and c the least significant
177  //A, B, C the maximum value
178  //we need to loop in d as well, d's vlaue dims[dir]-3, dims[dir]-2, dims[dir]-1
179  int A[nDim], B[nDim], C[nDim];
180  A[0] = X[3]; B[0] = X[2]; C[0] = X[1]; // X dimension face
181  A[1] = X[3]; B[1] = X[2]; C[1] = X[0]; // Y dimension face
182  A[2] = X[3]; B[2] = X[1]; C[2] = X[0]; // Z dimension face
183  A[3] = X[2]; B[3] = X[1]; C[3] = X[0]; // T dimension face
184 
185  //multiplication factor to compute index in original cpu memory
186  int f[nDim][nDim]={
187  {X[0]*X[1]*X[2], X[0]*X[1], X[0], 1},
188  {X[0]*X[1]*X[2], X[0]*X[1], 1, X[0]},
189  {X[0]*X[1]*X[2], X[0], 1, X[0]*X[1]},
190  { X[0]*X[1], X[0], 1, X[0]*X[1]*X[2]}
191  };
192 
193  //set the local processor parity
194  //switching odd and even ghost gauge when that dimension size is odd
195  //only switch if X[dir] is odd and the gridsize in that dimension is greater than 1
196  // FIXME - I don't understand this, shouldn't it be commDim(dim) == 0 ?
197  int localParity[nDim];
198  for (int dim=0; dim<nDim; dim++)
199  //localParity[dim] = (X[dim]%2==0 || commDim(dim)) ? 0 : 1;
200  localParity[dim] = ((X[dim] % 2 ==1) && (commDim(dim) > 1)) ? 1 : 0;
201 
202  ExtractGhostArg<Order, nDim> arg(order, nFace, X, A, B, C, f, localParity);
203  if (location==QUDA_CPU_FIELD_LOCATION) {
204  extractGhost<Float,length,nDim,Order>(arg);
205  } else {
207  extract.apply(0);
208  }
209 
210  }
211 
213  template <typename Float>
214  void extractGhost(const GaugeField &u, Float **Ghost) {
215 
216  const int length = 18;
217 
220 
221  if (u.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
222  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
223  if (typeid(Float)==typeid(short) && u.LinkType() == QUDA_ASQTAD_FAT_LINKS) {
224  extractGhost<Float,length>(FloatNOrder<Float,length,2,19>(u, 0, Ghost), u, location);
225  } else {
226  extractGhost<Float,length>(FloatNOrder<Float,length,2,18>(u, 0, Ghost), u, location);
227  }
228  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
229  extractGhost<Float,length>(FloatNOrder<Float,length,2,12>(u, 0, Ghost), u, location);
230  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_8) {
231  extractGhost<Float,length>(FloatNOrder<Float,length,2,8>(u, 0, Ghost), u, location);
232  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_13) {
233  extractGhost<Float,length>(FloatNOrder<Float,length,2,13>(u, 0, Ghost), u, location);
234  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_9) {
235  extractGhost<Float,length>(FloatNOrder<Float,length,2,9>(u, 0, Ghost), u, location);
236  }
237  } else if (u.Order() == QUDA_FLOAT4_GAUGE_ORDER) {
238  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
239  if (typeid(Float)==typeid(short) && u.LinkType() == QUDA_ASQTAD_FAT_LINKS) {
240  extractGhost<Float,length>(FloatNOrder<Float,length,1,19>(u, 0, Ghost), u, location);
241  } else {
242  extractGhost<Float,length>(FloatNOrder<Float,length,1,18>(u, 0, Ghost), u, location);
243  }
244  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
245  extractGhost<Float,length>(FloatNOrder<Float,length,4,12>(u, 0, Ghost), u, location);
246  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_8) {
247  extractGhost<Float,length>(FloatNOrder<Float,length,4,8>(u, 0, Ghost), u, location);
248  } else if(u.Reconstruct() == QUDA_RECONSTRUCT_13){
249  extractGhost<Float,length>(FloatNOrder<Float,length,4,13>(u, 0, Ghost), u, location);
250  } else if(u.Reconstruct() == QUDA_RECONSTRUCT_9){
251  extractGhost<Float,length>(FloatNOrder<Float,length,4,9>(u, 0, Ghost), u, location);
252  }
253  } else if (u.Order() == QUDA_QDP_GAUGE_ORDER) {
254 
255 #ifdef BUILD_QDP_INTERFACE
256  extractGhost<Float,length>(QDPOrder<Float,length>(u, 0, Ghost), u, location);
257 #else
258  errorQuda("QDP interface has not been built\n");
259 #endif
260 
261  } else if (u.Order() == QUDA_QDPJIT_GAUGE_ORDER) {
262 
263 #ifdef BUILD_QDPJIT_INTERFACE
264  extractGhost<Float,length>(QDPJITOrder<Float,length>(u, 0, Ghost), u, location);
265 #else
266  errorQuda("QDPJIT interface has not been built\n");
267 #endif
268 
269  } else if (u.Order() == QUDA_CPS_WILSON_GAUGE_ORDER) {
270 
271 #ifdef BUILD_CPS_INTERFACE
272  extractGhost<Float,length>(CPSOrder<Float,length>(u, 0, Ghost), u, location);
273 #else
274  errorQuda("CPS interface has not been built\n");
275 #endif
276 
277  } else if (u.Order() == QUDA_MILC_GAUGE_ORDER) {
278 
279 #ifdef BUILD_MILC_INTERFACE
280  extractGhost<Float,length>(MILCOrder<Float,length>(u, 0, Ghost), u, location);
281 #else
282  errorQuda("MILC interface has not been built\n");
283 #endif
284 
285  } else if (u.Order() == QUDA_BQCD_GAUGE_ORDER) {
286 
287 #ifdef BUILD_BQCD_INTERFACE
288  extractGhost<Float,length>(BQCDOrder<Float,length>(u, 0, Ghost), u, location);
289 #else
290  errorQuda("BQCD interface has not been built\n");
291 #endif
292 
293  } else if (u.Order() == QUDA_TIFR_GAUGE_ORDER) {
294 
295 #ifdef BUILD_TIFR_INTERFACE
296  extractGhost<Float,length>(TIFROrder<Float,length>(u, 0, Ghost), u, location);
297 #else
298  errorQuda("TIFR interface has not been built\n");
299 #endif
300 
301  } else {
302  errorQuda("Gauge field %d order not supported", u.Order());
303  }
304 
305  }
306 
307  void extractGaugeGhost(const GaugeField &u, void **ghost) {
308 
309 #if __COMPUTE_CAPABILITY__ < 200
311  errorQuda("Reconstruct 9/13 not supported on pre-Fermi architecture");
312 #endif
313 
314  if (u.Precision() == QUDA_DOUBLE_PRECISION) {
315  extractGhost(u, (double**)ghost);
316  } else if (u.Precision() == QUDA_SINGLE_PRECISION) {
317  extractGhost(u, (float**)ghost);
318  } else if (u.Precision() == QUDA_HALF_PRECISION) {
319  extractGhost(u, (short**)ghost);
320  } else {
321  errorQuda("Unknown precision type %d", u.Precision());
322  }
323  }
324 
325 } // namespace quda
int commDim(int)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
long long bytes() const
#define errorQuda(...)
Definition: util_quda.h:73
const int * X() const
void extractGaugeGhost(const GaugeField &u, void **ghost)
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:169
cudaStream_t * stream
::std::string string
Definition: gtest.h:1979
std::string paramString(const TuneParam &param) const
int Nface() const
Definition: gauge_field.h:193
void extractGhost(ExtractGhostArg< Order, nDim > arg)
int length[]
QudaGaugeParam param
Definition: pack_test.cpp:17
QudaPrecision Precision() const
void apply(const cudaStream_t &stream)
void writeAuxString(const char *format,...)
Definition: tune_quda.h:138
const QudaFieldLocation location
Definition: pack_test.cpp:46
FloatingPoint< float > Float
Definition: gtest.h:7350
unsigned short B[nDim]
const unsigned char nFace
unsigned short A[nDim]
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:168
const char * VolString() const
ExtractGhostArg(const Order &order, int nFace, const int *X_, const int *A_, const int *B_, const int *C_, const int f_[nDim][nDim], const int *localParity_)
__global__ void extractGhostKernel(ExtractGhostArg< Order, nDim > arg)
TuneKey tuneKey() const
enum QudaFieldLocation_s QudaFieldLocation
QudaLinkType LinkType() const
Definition: gauge_field.h:174
long long flops() const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:843
unsigned short X[nDim]
unsigned short C[nDim]
QudaTune getTuning()
Definition: util_quda.cpp:32
const QudaParity parity
Definition: dslash_test.cpp:29
char aux[TuneKey::aux_n]
Definition: tune_quda.h:136
ExtractGhost(ExtractGhostArg< Order, nDim > &arg, const GaugeField &meta)