QUDA  1.0.0
staggered_oprod.cu
#include <cstdio>
#include <cstdlib>
#include <staggered_oprod.h>

#include <tune_quda.h>
#include <quda_internal.h>
#include <gauge_field_order.h>
#include <quda_matrix.h>
#include <dslash_quda.h>

namespace quda {

#ifdef GPU_STAGGERED_DIRAC

  namespace { // anonymous
#include <texture.h>
  }

  enum OprodKernelType { OPROD_INTERIOR_KERNEL, OPROD_EXTERIOR_KERNEL };

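  // Argument struct for the outer-product kernels: bundles the input spinors,
  // the one-hop (outA) and three-hop (outB) output fields, the coefficients,
  // and the local lattice geometry into a single kernel parameter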
  template <typename Float, typename Output, typename InputA, typename InputB> struct StaggeredOprodArg {
    unsigned int length;
    const int parity;
    int dir;
    int displacement;
    OprodKernelType kernelType;
    const int nFace;
    const InputA inA;
    const InputB inB;
    Output outA;
    Output outB;
    Float coeff[2];
    int X[4];
    unsigned int ghostOffset[4];
    bool partitioned[4];

    StaggeredOprodArg(int parity, int dir, const unsigned int *ghostOffset, int displacement,
                      const OprodKernelType &kernelType, int nFace, const double coeff[2], InputA &inA, InputB &inB,
                      Output &outA, Output &outB, GaugeField &meta) :
      length(meta.VolumeCB()),
      parity(parity),
      dir(dir),
      displacement(displacement),
      kernelType(kernelType),
      nFace(nFace),
      inA(inA),
      inB(inB),
      outA(outA),
      outB(outB)
    {
      this->coeff[0] = coeff[0];
      this->coeff[1] = coeff[1];
      for (int i = 0; i < 4; ++i) this->X[i] = meta.X()[i];
      for (int i = 0; i < 4; ++i) this->ghostOffset[i] = ghostOffset[i];
      for (int i = 0; i < 4; ++i) this->partitioned[i] = commDimPartitioned(i) ? true : false;
    }
  };

  enum IndexType {
    EVEN_X = 0,
    EVEN_Y = 1,
    EVEN_Z = 2,
    EVEN_T = 3
  };

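  // Reconstruct the full-lattice index and coordinates c[4] from a checkerboard
  // index and parity; the IndexType template parameter selects how the parity
  // offset is folded in, trading divisions/modulos for speed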
  template <IndexType idxType>
  __device__ inline void coordsFromIndex(int &idx, int c[4], unsigned int cb_idx, int parity, const int X[4])
  {
    const int &LX = X[0];
    const int &LY = X[1];
    const int &LZ = X[2];
    const int XYZ = X[2]*X[1]*X[0];
    const int XY = X[1]*X[0];

    idx = 2*cb_idx;

    int x, y, z, t;

    if (idxType == EVEN_X) { // X even
      //   t = idx / XYZ;
      //   z = (idx / XY) % Z;
      //   y = (idx / X) % Y;
      //   idx += (parity + t + z + y) & 1;
      //   x = idx % X;
      // equivalent to the above, but with fewer divisions/mods:
      int aux1 = idx / LX;
      x = idx - aux1 * LX;
      int aux2 = aux1 / LY;
      y = aux1 - aux2 * LY;
      t = aux2 / LZ;
      z = aux2 - t * LZ;
      aux1 = (parity + t + z + y) & 1;
      x += aux1;
      idx += aux1;
    } else if (idxType == EVEN_Y) { // Y even
      t = idx / XYZ;
      z = (idx / XY) % LZ;
      idx += (parity + t + z) & 1;
      y = (idx / LX) % LY;
      x = idx % LX;
    } else if (idxType == EVEN_Z) { // Z even
      t = idx / XYZ;
      idx += (parity + t) & 1;
      z = (idx / XY) % LZ;
      y = (idx / LX) % LY;
      x = idx % LX;
    } else {
      idx += parity;
      t = idx / XYZ;
      z = (idx / XY) % LZ;
      y = (idx / LX) % LY;
      x = idx % LX;
    }

    c[0] = x;
    c[1] = y;
    c[2] = z;
    c[3] = t;
  }

  // Get the coordinates for the exterior kernels
  __device__ inline void coordsFromIndex(int x[4], unsigned int cb_idx, const int X[4], int dir, int displacement,
                                         int parity)
  {
    int Xh[2] = {X[0] / 2, X[1] / 2};
    switch (dir) {
    case 0:
      x[2] = cb_idx / Xh[1] % X[2];
      x[3] = cb_idx / (Xh[1] * X[2]) % X[3];
      x[0] = cb_idx / (Xh[1] * X[2] * X[3]);
      x[0] += (X[0] - displacement);
      x[1] = 2 * (cb_idx % Xh[1]) + ((x[0] + x[2] + x[3] + parity) & 1);
      break;

    case 1:
      x[2] = cb_idx / Xh[0] % X[2];
      x[3] = cb_idx / (Xh[0] * X[2]) % X[3];
      x[1] = cb_idx / (Xh[0] * X[2] * X[3]);
      x[1] += (X[1] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;

    case 2:
      x[1] = cb_idx / Xh[0] % X[1];
      x[3] = cb_idx / (Xh[0] * X[1]) % X[3];
      x[2] = cb_idx / (Xh[0] * X[1] * X[3]);
      x[2] += (X[2] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;

    case 3:
      x[1] = cb_idx / Xh[0] % X[1];
      x[2] = cb_idx / (Xh[0] * X[1]) % X[2];
      x[3] = cb_idx / (Xh[0] * X[1] * X[2]);
      x[3] += (X[3] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;
    }
    return;
  }

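  // Return the checkerboard index of the site displaced by shift[] from cb_idx,
  // or -1 if the displaced site falls outside the local lattice in a partitioned
  // direction (those contributions are handled by the exterior kernels)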
  __device__ inline int neighborIndex(unsigned int cb_idx, const int shift[4], const bool partitioned[4], int parity,
                                      const int X[4])
  {
    int full_idx;
    int x[4];

    coordsFromIndex<EVEN_X>(full_idx, x, cb_idx, parity, X);

    for (int dim=0; dim<4; ++dim) {
      if (partitioned[dim])
        if ((x[dim]+shift[dim]) < 0 || (x[dim]+shift[dim]) >= X[dim]) return -1;
    }

    for (int dim=0; dim<4; ++dim) {
      x[dim] = shift[dim] ? (x[dim]+shift[dim] + X[dim]) % X[dim] : x[dim];
    }
    return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1;
  }

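  // Interior kernel: for each site, accumulate the outer product of the local
  // spinor with its one-hop (and, for nFace=3, three-hop) forward neighbors
  // into outA (and outB), scaled by coeff[0] (coeff[1])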
  template<typename real, typename Output, typename InputA, typename InputB>
  __global__ void interiorOprodKernel(StaggeredOprodArg<real, Output, InputA, InputB> arg)
  {
    using complex = complex<real>;
    using matrix = Matrix<complex,3>;

    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int gridSize = gridDim.x*blockDim.x;

    complex x[3], y[3], z[3];
    matrix result;

    while (idx < arg.length) {
      arg.inA.load(x, idx);

#pragma unroll
      for (int dim=0; dim<4; ++dim) {
        int shift[4] = {0,0,0,0};
        shift[dim] = 1;
        const int first_nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);
        if (first_nbr_idx >= 0) {
          arg.inB.load(y, first_nbr_idx);
          outerProd(y, x, &result);
          matrix tempA = arg.outA(dim, idx, arg.parity);
          result = tempA + result*arg.coeff[0];

          arg.outA(dim, idx, arg.parity) = result;

          if (arg.nFace == 3) {
            shift[dim] = 3;
            const int third_nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);
            if (third_nbr_idx >= 0) {
              arg.inB.load(z, third_nbr_idx);
              outerProd(z, x, &result);
              matrix tempB = arg.outB(dim, idx, arg.parity);
              result = tempB + result*arg.coeff[1];
              arg.outB(dim, idx, arg.parity) = result;
            }
          }
        }
      } // dim

      idx += gridSize;
    }
    return;
  } // interiorOprodKernel

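  // Exterior kernel: accumulate the boundary contribution in direction dim,
  // reading the neighbor spinor from the ghost zone received from the
  // neighboring process; displacement selects the one-hop or three-hop term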
  template<int dim, typename real, typename Output, typename InputA, typename InputB>
  __global__ void exteriorOprodKernel(StaggeredOprodArg<real, Output, InputA, InputB> arg)
  {
    using complex = complex<real>;
    using matrix = Matrix<complex,3>;

    unsigned int cb_idx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int gridSize = gridDim.x*blockDim.x;

    complex a[3], b[3];
    matrix result;

    Output &out = (arg.displacement == 1) ? arg.outA : arg.outB;
    real coeff = (arg.displacement == 1) ? arg.coeff[0] : arg.coeff[1];

    int x[4];
    while (cb_idx < arg.length) {
      coordsFromIndex(x, cb_idx, arg.X, arg.dir, arg.displacement, arg.parity);
      const unsigned int bulk_cb_idx = ((((x[3]*arg.X[2] + x[2])*arg.X[1] + x[1])*arg.X[0] + x[0]) >> 1);

      matrix inmatrix = out(arg.dir, bulk_cb_idx, arg.parity);
      arg.inA.load(a, bulk_cb_idx);

      const unsigned int ghost_idx = arg.ghostOffset[dim] + cb_idx;
      arg.inB.loadGhost(b, ghost_idx, arg.dir);

      outerProd(b, a, &result);
      result = inmatrix + result*coeff;
      out(arg.dir, bulk_cb_idx, arg.parity) = result;

      cb_idx += gridSize;
    }
    return;
  }

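  // Tunable wrapper that launches the interior or exterior kernel selected by
  // arg.kernelType; the output fields are saved and restored around tuning so
  // that the accumulation is not corrupted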
  template<typename Float, typename Output, typename InputA, typename InputB>
  class StaggeredOprodField : public Tunable {

  private:
    StaggeredOprodArg<Float,Output,InputA,InputB> &arg;
    const GaugeField &meta;

    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

    unsigned int minThreads() const { return arg.outA.volumeCB; }
    bool tuneGridDim() const { return false; }

  public:
    StaggeredOprodField(StaggeredOprodArg<Float,Output,InputA,InputB> &arg, const GaugeField &meta)
      : arg(arg), meta(meta) {
      writeAuxString("threads=%d,prec=%lu,stride=%d", arg.length, sizeof(Complex)/2, arg.inA.Stride());
      // this sets the communications pattern for the packing kernel
      int comms[QUDA_MAX_DIM] = { commDimPartitioned(0), commDimPartitioned(1), commDimPartitioned(2), commDimPartitioned(3) };
      setPackComms(comms);
    }

    virtual ~StaggeredOprodField() {}

    void apply(const cudaStream_t &stream) {
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
        // Disable tuning for the time being
        TuneParam tp = tuneLaunch(*this, QUDA_TUNE_NO, getVerbosity());
        if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
          interiorOprodKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (arg.kernelType == OPROD_EXTERIOR_KERNEL) {
          if (arg.dir == 0) exteriorOprodKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 1) exteriorOprodKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 2) exteriorOprodKernel<2><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 3) exteriorOprodKernel<3><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else {
          errorQuda("Kernel type not supported\n");
        }
      } else { // run the CPU code
        errorQuda("No CPU support for staggered outer-product calculation\n");
      }
    } // apply

    void preTune() { this->arg.outA.save(); this->arg.outB.save(); }
    void postTune() { this->arg.outA.load(); this->arg.outB.load(); }

    long long flops() const { return 0; } // FIXME
    long long bytes() const { return 0; } // FIXME
    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
  }; // StaggeredOprodField

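  // Exchange the ghost zones of the source spinor with neighboring processes:
  // pack the faces on the device, gather them for the host transfer, start and
  // complete the communication, then scatter the received faces into the ghost
  // buffers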
  void exchangeGhost(int nFace, cudaColorSpinorField &a, int parity, int dag) {
    // need to enable packing in temporal direction to get spin-projector correct
    pushKernelPackT(true);

    // first transfer src1
    qudaDeviceSynchronize();

    MemoryLocation location[2*QUDA_MAX_DIM] = {Device, Device, Device, Device, Device, Device, Device, Device};
    a.pack(nFace, 1-parity, dag, Nstream-1, location, Device);

    qudaDeviceSynchronize();

    for (int i=3; i>=0; i--) {
      if (commDimPartitioned(i)) {
        // Initialize the host transfer from the source spinor
        a.gather(nFace, dag, 2*i);
      } // commDim(i)
    } // i=3,..,0

    qudaDeviceSynchronize();

    for (int i=3; i>=0; i--) {
      if (commDimPartitioned(i)) {
        a.commsStart(nFace, 2*i, dag);
      }
    }

    for (int i=3; i>=0; i--) {
      if (commDimPartitioned(i)) {
        a.commsWait(nFace, 2*i, dag);
        a.scatter(nFace, dag, 2*i);
      }
    }

    qudaDeviceSynchronize();
    popKernelPackT(); // restore packing state

    a.bufferIndex = (1 - a.bufferIndex);
    comm_barrier();
  }

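  // Launch the interior kernel over the local volume, then one exterior kernel
  // per partitioned direction for the one-hop and (for nFace=3) three-hop
  // boundary terms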
  template <typename Float, typename Output, typename InputA, typename InputB>
  void computeStaggeredOprodCuda(Output outA, Output outB, GaugeField &outFieldA, GaugeField &outFieldB, InputA &inA,
                                 InputB &inB, cudaColorSpinorField &src, int parity, const int faceVolumeCB[4],
                                 const double coeff[2], int nFace)
  {
    unsigned int ghostOffset[4] = {0, 0, 0, 0};
    for (int dir = 0; dir < 4; ++dir)
      ghostOffset[dir] = src.GhostOffset(dir, 1) / src.FieldOrder(); // offset we want is the forwards one

    // Create the arguments for the interior kernel
    StaggeredOprodArg<Float, Output, InputA, InputB> arg(parity, 0, ghostOffset, 1, OPROD_INTERIOR_KERNEL, nFace, coeff,
                                                         inA, inB, outA, outB, outFieldA);
    StaggeredOprodField<Float, Output, InputA, InputB> oprod(arg, outFieldA);

    arg.kernelType = OPROD_INTERIOR_KERNEL;
    arg.length = src.VolumeCB();
    oprod.apply(streams[Nstream - 1]);

    for (int i = 3; i >= 0; i--) {
      if (commDimPartitioned(i)) {
        // update parameters for this exterior kernel
        arg.kernelType = OPROD_EXTERIOR_KERNEL;
        arg.dir = i;

        // First, do the one hop term
        {
          arg.displacement = 1;
          arg.length = faceVolumeCB[i];
          oprod.apply(streams[Nstream - 1]);
        }

        // Now do the 3 hop term
        if (nFace == 3) {
          arg.displacement = 3;
          arg.length = arg.displacement * faceVolumeCB[i];
          oprod.apply(streams[Nstream - 1]);
        }
      }
    } // i=3,..,0

    checkCudaError();
  } // computeStaggeredOprodCuda

#endif // GPU_STAGGERED_DIRAC

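  // Compute the outer product for a single parity, dispatching on the field
  // precision; the ghost exchange for the neighbor spinor happens here before
  // the kernels are launched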
  void computeStaggeredOprod(GaugeField &outA, GaugeField &outB, ColorSpinorField &inEven, ColorSpinorField &inOdd,
                             int parity, const double coeff[2], int nFace)
  {
#ifdef GPU_STAGGERED_DIRAC
    if (outA.Order() != QUDA_FLOAT2_GAUGE_ORDER)
      errorQuda("Unsupported output ordering: %d\n", outA.Order());

    if (outB.Order() != QUDA_FLOAT2_GAUGE_ORDER)
      errorQuda("Unsupported output ordering: %d\n", outB.Order());

    if (inEven.Precision() != outA.Precision())
      errorQuda("Mixed precision not supported: %d %d\n", inEven.Precision(), outA.Precision());

    cudaColorSpinorField &inA = (parity&1) ? static_cast<cudaColorSpinorField&>(inOdd) : static_cast<cudaColorSpinorField&>(inEven);
    cudaColorSpinorField &inB = (parity&1) ? static_cast<cudaColorSpinorField&>(inEven) : static_cast<cudaColorSpinorField&>(inOdd);

    inA.allocateGhostBuffer(nFace);
    inB.allocateGhostBuffer(nFace);

    if (inEven.Precision() == QUDA_DOUBLE_PRECISION) {
      Spinor<double2, double2, 3, 0> spinorA(inA, nFace);
      Spinor<double2, double2, 3, 0> spinorB(inB, nFace);
      exchangeGhost(nFace, static_cast<cudaColorSpinorField&>(inB), parity, 0);

      computeStaggeredOprodCuda<double>(gauge::FloatNOrder<double, 18, 2, 18>(outA), gauge::FloatNOrder<double, 18, 2, 18>(outB),
                                        outA, outB, spinorA, spinorB, inB, parity, inB.GhostFace(), coeff, nFace);
    } else if (inEven.Precision() == QUDA_SINGLE_PRECISION) {
      Spinor<float2, float2, 3, 0> spinorA(inA, nFace);
      Spinor<float2, float2, 3, 0> spinorB(inB, nFace);
      exchangeGhost(nFace, static_cast<cudaColorSpinorField&>(inB), parity, 0);

      computeStaggeredOprodCuda<float>(gauge::FloatNOrder<float, 18, 2, 18>(outA), gauge::FloatNOrder<float, 18, 2, 18>(outB),
                                       outA, outB, spinorA, spinorB, inB, parity, inB.GhostFace(), coeff, nFace);
    } else {
      errorQuda("Unsupported precision: %d\n", inEven.Precision());
    }

#else // GPU_STAGGERED_DIRAC not defined
    errorQuda("Staggered Outer Product has not been built!");
#endif

    return;
  } // computeStaggeredOprod

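  // Public entry point: accumulates the outer product for both parities. For
  // nFace=1 the coefficient is negated on the odd parity (the -1 on odd sites
  // noted below); for nFace=3 the one-hop and three-hop products go into
  // out[0] and out[1] respectively. A hypothetical call from a fermion-force
  // routine might look like:
  //
  //   GaugeField *oprod[2] = {oneHopOprod, threeHopOprod}; // hypothetical fields
  //   computeStaggeredOprod(oprod, quarkField, coeff, 3);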
  void computeStaggeredOprod(GaugeField *out[], ColorSpinorField &in, const double coeff[], int nFace)
  {
    if (nFace == 1) {
      computeStaggeredOprod(*out[0], *out[0], in.Even(), in.Odd(), 0, coeff, nFace);
      double coeff_[2] = {-coeff[0], 0.0}; // need to multiply by -1 on odd sites
      computeStaggeredOprod(*out[0], *out[0], in.Even(), in.Odd(), 1, coeff_, nFace);
    } else if (nFace == 3) {
      computeStaggeredOprod(*out[0], *out[1], in.Even(), in.Odd(), 0, coeff, nFace);
      computeStaggeredOprod(*out[0], *out[1], in.Even(), in.Odd(), 1, coeff, nFace);
    } else {
      errorQuda("Invalid nFace=%d", nFace);
    }
  }

} // namespace quda