QUDA  0.9.0
staggered_oprod.cu
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <staggered_oprod.h>
4 
5 #include <tune_quda.h>
6 #include <quda_internal.h>
7 #include <gauge_field_order.h>
8 #include <quda_matrix.h>
9 
10 namespace quda {
11 
12 #ifdef GPU_STAGGERED_DIRAC
13 
14  namespace { // anonymous
15 #include <texture.h>
16  }
17 
  // Distinguishes the bulk kernel (sites whose neighbors are all local) from
  // the boundary kernels that consume ghost-zone data from other partitions.
  enum KernelType {OPROD_INTERIOR_KERNEL, OPROD_EXTERIOR_KERNEL};
19 
20  template<typename Float, typename Output, typename InputA, typename InputB>
21  struct StaggeredOprodArg {
22  unsigned int length;
23  int X[4];
24  unsigned int parity;
25  unsigned int dir;
26  unsigned int ghostOffset[4];
27  unsigned int displacement;
28  KernelType kernelType;
29  int nFace;
30  bool partitioned[4];
31  InputA inA;
32  InputB inB;
33  Output outA;
34  Output outB;
35  Float coeff[2];
36 
37  StaggeredOprodArg(const unsigned int parity,
38  const unsigned int dir,
39  const unsigned int *ghostOffset,
40  const unsigned int displacement,
41  const KernelType& kernelType,
42  const int nFace,
43  const double coeff[2],
44  InputA& inA,
45  InputB& inB,
46  Output& outA,
47  Output& outB,
48  GaugeField& meta) :
49  length(meta.VolumeCB()), parity(parity), dir(dir),
50  displacement(displacement), kernelType(kernelType), nFace(nFace),
51  inA(inA), inB(inB), outA(outA), outB(outB)
52  {
53  this->coeff[0] = coeff[0];
54  this->coeff[1] = coeff[1];
55  for(int i=0; i<4; ++i) this->X[i] = meta.X()[i];
56  for(int i=0; i<4; ++i) this->ghostOffset[i] = ghostOffset[i];
57  for(int i=0; i<4; ++i) this->partitioned[i] = commDimPartitioned(i) ? true : false;
58  }
59  };
60 
  // Selects the coordsFromIndex specialization.  Each tag names the dimension
  // whose local extent is assumed even, which determines where the parity
  // correction can be folded in; this file uses the EVEN_X path (see
  // neighborIndex below).
  enum IndexType {
    EVEN_X = 0,
    EVEN_Y = 1,
    EVEN_Z = 2,
    EVEN_T = 3
  };
67 
  // Reconstruct the full lexicographical site index and the 4-d coordinates
  // from a checkerboard site index.
  //
  //   idx     (out) full-lattice linear index of the site
  //   c       (out) coordinates c[0..3] = (x, y, z, t)
  //   cb_idx  (in)  checkerboard (single-parity) site index
  //   parity  (in)  0 = even sublattice, 1 = odd
  //   X       (in)  local lattice dimensions
  //
  // idxType selects which dimension's evenness is exploited so the parity
  // offset can be applied with the fewest integer divisions/mods.
  template <IndexType idxType>
  static __device__ __forceinline__ void coordsFromIndex(int& idx, int c[4],
      const unsigned int cb_idx, const unsigned int parity, const int X[4])
  {
    const int &LX = X[0];
    const int &LY = X[1];
    const int &LZ = X[2];
    const int XYZ = X[2]*X[1]*X[0];
    const int XY = X[1]*X[0];

    // provisional full index before the parity correction
    idx = 2*cb_idx;

    int x, y, z, t;

    if (idxType == EVEN_X ) { // X even
      // t = idx / XYZ;
      // z = (idx / XY) % Z;
      // y = (idx / X) % Y;
      // idx += (parity + t + z + y) & 1;
      // x = idx % X;
      // equivalent to the above, but with fewer divisions/mods:
      int aux1 = idx / LX;
      x = idx - aux1 * LX;
      int aux2 = aux1 / LY;
      y = aux1 - aux2 * LY;
      t = aux2 / LZ;
      z = aux2 - t * LZ;
      aux1 = (parity + t + z + y) & 1;  // parity correction for x and idx
      x += aux1;
      idx += aux1;
    } else if (idxType == EVEN_Y ) { // Y even
      t = idx / XYZ;
      z = (idx / XY) % LZ;
      idx += (parity + t + z) & 1;
      y = (idx / LX) % LY;
      x = idx % LX;
    } else if (idxType == EVEN_Z ) { // Z even
      t = idx / XYZ;
      idx += (parity + t) & 1;
      z = (idx / XY) % LZ;
      y = (idx / LX) % LY;
      x = idx % LX;
    } else {
      // T even: the parity offset can be applied up front
      idx += parity;
      t = idx / XYZ;
      z = (idx / XY) % LZ;
      y = (idx / LX) % LY;
      x = idx % LX;
    }

    c[0] = x;
    c[1] = y;
    c[2] = z;
    c[3] = t;
  }
123 
124 
  // Get the coordinates for the exterior kernels: map a boundary-slab
  // checkerboard index cb_idx to full 4-d coordinates x[4].  The slab lies
  // in dimension 'dir' and is 'displacement' sites deep, so the coordinate
  // in that dimension is offset by (X[dir] - displacement).  The remaining
  // checkerboarded coordinate is doubled and parity-corrected from the sum
  // of the other coordinates plus 'parity'.
  __device__ static void coordsFromIndex(int x[4], const unsigned int cb_idx, const int X[4], const unsigned int dir, const int displacement, const unsigned int parity)
  {
    // halved extents of the dimension that stays checkerboarded per case
    int Xh[2] = {X[0]/2, X[1]/2};
    switch(dir){
      case 0:
        // y runs checkerboarded; decompose cb_idx as (x, t, z, y/2)
        x[2] = cb_idx/Xh[1] % X[2];
        x[3] = cb_idx/(Xh[1]*X[2]) % X[3];
        x[0] = cb_idx/(Xh[1]*X[2]*X[3]);
        x[0] += (X[0] - displacement);
        x[1] = 2*(cb_idx % Xh[1]) + ((x[0]+x[2]+x[3]+parity)&1);
        break;

      case 1:
        x[2] = cb_idx/Xh[0] % X[2];
        x[3] = cb_idx/(Xh[0]*X[2]) % X[3];
        x[1] = cb_idx/(Xh[0]*X[2]) / X[3] == 0 ? cb_idx/(Xh[0]*X[2]*X[3]) : cb_idx/(Xh[0]*X[2]*X[3]); // NOTE(review): kept as plain division below
        x[1] = cb_idx/(Xh[0]*X[2]*X[3]);
        x[1] += (X[1] - displacement);
        x[0] = 2*(cb_idx % Xh[0]) + ((x[1]+x[2]+x[3]+parity)&1);
        break;

      case 2:
        x[1] = cb_idx/Xh[0] % X[1];
        x[3] = cb_idx/(Xh[0]*X[1]) % X[3];
        x[2] = cb_idx/(Xh[0]*X[1]*X[3]);
        x[2] += (X[2] - displacement);
        x[0] = 2*(cb_idx % Xh[0]) + ((x[1]+x[2]+x[3]+parity)&1);
        break;

      case 3:
        x[1] = cb_idx/Xh[0] % X[1];
        x[2] = cb_idx/(Xh[0]*X[1]) % X[2];
        x[3] = cb_idx/(Xh[0]*X[1]*X[2]);
        x[3] += (X[3] - displacement);
        x[0] = 2*(cb_idx % Xh[0]) + ((x[1]+x[2]+x[3]+parity)&1);
        break;
    }
    return;
  }
164 
165 
  // Compute the checkerboard index of the site displaced from cb_idx by
  // shift[4] (output sites have the given parity).  Returns -1 when the
  // displaced site would cross the boundary of any comm-partitioned
  // dimension — that contribution is produced by the exterior kernel
  // instead.  Otherwise coordinates wrap periodically within the local
  // lattice.
  __device__ __forceinline__
  int neighborIndex(const unsigned int cb_idx, const int shift[4], const bool partitioned[4], const unsigned int parity, const int X[4]){
    int full_idx;
    int x[4];

    coordsFromIndex<EVEN_X>(full_idx, x, cb_idx, parity, X);

    // reject shifts that leave the local volume in a partitioned dimension;
    // this check must use the un-wrapped coordinates, hence the two loops
    for(int dim = 0; dim<4; ++dim){
      if( partitioned[dim] )
        if( (x[dim]+shift[dim])<0 || (x[dim]+shift[dim])>=X[dim]) return -1;
    }

    // apply the shift with periodic wrapping
    for(int dim=0; dim<4; ++dim){
      x[dim] = shift[dim] ? (x[dim]+shift[dim] + X[dim]) % X[dim] : x[dim];
    }
    // fold the coordinates back into a checkerboard index
    return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1;
  }
184 
185 
  // Interior (bulk) outer-product kernel.  Grid-stride loop over the local
  // checkerboard sites of the given parity: for each site and each of the
  // four dimensions, load the color vector at the +1 neighbor (and, when
  // nFace == 3, the +3 neighbor) and accumulate
  //   out += coeff * outerProd(neighbor, site)
  // into outA (one-hop) / outB (three-hop).  Sites whose neighbor lies
  // across a partitioned boundary are skipped here (neighborIndex returns
  // -1) and contributed by the exterior kernel instead.
  template<typename real, typename Output, typename InputA, typename InputB>
  __global__ void interiorOprodKernel(StaggeredOprodArg<real, Output, InputA, InputB> arg)
  {
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int gridSize = gridDim.x*blockDim.x;

    typedef complex<real> Complex;
    Complex x[3];  // color vector at this site (from inA)
    Complex y[3];  // color vector at the +1 neighbor (from inB)
    Complex z[3];  // color vector at the +3 neighbor (from inB)
    Matrix<Complex,3> result;
    Matrix<Complex,3> tempA, tempB; // input

    while(idx<arg.length){
      arg.inA.load(x, idx);

#pragma unroll
      for(int dim=0; dim<4; ++dim){
        int shift[4] = {0,0,0,0};
        shift[dim] = 1;
        const int first_nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);
        if(first_nbr_idx >= 0){
          arg.inB.load(y, first_nbr_idx);
          outerProd(y,x,&result);
          // read-modify-write accumulation into the one-hop field
          arg.outA.load(reinterpret_cast<real*>(tempA.data), idx, dim, arg.parity);
          result = tempA + result*arg.coeff[0];

          arg.outA.save(reinterpret_cast<real*>(result.data), idx, dim, arg.parity);

          if (arg.nFace == 3) {
            shift[dim] = 3;
            const int third_nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);
            if(third_nbr_idx >= 0){
              arg.inB.load(z, third_nbr_idx);
              outerProd(z, x, &result);
              // read-modify-write accumulation into the three-hop field
              arg.outB.load(reinterpret_cast<real*>(tempB.data), idx, dim, arg.parity);
              result = tempB + result*arg.coeff[1];
              arg.outB.save(reinterpret_cast<real*>(result.data), idx, dim, arg.parity);
            }
          }
        }
      } // dim

      idx += gridSize;
    }
    return;
  } // interiorOprodKernel
233 
234 
  // Exterior (boundary) outer-product kernel, specialized per dimension so
  // 'dim' is a compile-time constant.  Each thread walks the boundary slab
  // with a grid-stride loop: the bulk site coordinates come from the
  // exterior coordsFromIndex overload, the neighbor color vector is read
  // from the ghost zone (inB.loadGhost), and
  //   out += coeff * outerProd(ghost, site)
  // is accumulated into outA or outB according to arg.displacement (1 or 3).
  // Assumes dim == arg.dir — guaranteed by StaggeredOprodField::apply().
  template<int dim, typename real, typename Output, typename InputA, typename InputB>
  __global__ void exteriorOprodKernel(StaggeredOprodArg<real, Output, InputA, InputB> arg)
  {
    typedef complex<real> Complex;

    unsigned int cb_idx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int gridSize = gridDim.x*blockDim.x;

    Complex a[3];  // color vector at the bulk site
    Complex b[3];  // color vector from the ghost zone
    Matrix<Complex,3> result;
    Matrix<Complex,3> inmatrix; // input

    // displacement selects which accumulator/coefficient pair this launch updates
    Output& out = (arg.displacement == 1) ? arg.outA : arg.outB;
    real coeff = (arg.displacement == 1) ? arg.coeff[0] : arg.coeff[1];

    int x[4];
    while(cb_idx<arg.length){
      coordsFromIndex(x, cb_idx, arg.X, arg.dir, arg.displacement, arg.parity);
      const unsigned int bulk_cb_idx = ((((x[3]*arg.X[2] + x[2])*arg.X[1] + x[1])*arg.X[0] + x[0]) >> 1);

      out.load(reinterpret_cast<real*>(inmatrix.data), bulk_cb_idx, arg.dir, arg.parity);
      arg.inA.load(a, bulk_cb_idx);

      const unsigned int ghost_idx = arg.ghostOffset[dim] + cb_idx;
      arg.inB.loadGhost(b, ghost_idx, arg.dir);

      outerProd(b,a,&result);
      result = inmatrix + result*coeff;
      out.save(reinterpret_cast<real*>(result.data), bulk_cb_idx, arg.dir, arg.parity);

      cb_idx += gridSize;
    }
    return;
  }
270 
271 
  // Tunable launcher for the staggered outer-product kernels.  Dispatches
  // either the interior kernel or one of the four direction-specialized
  // exterior kernels based on arg.kernelType / arg.dir, which the caller
  // updates between successive apply() calls.
  template<typename Float, typename Output, typename InputA, typename InputB>
  class StaggeredOprodField : public Tunable {

  private:
    StaggeredOprodArg<Float,Output,InputA,InputB> &arg;  // mutated by the caller between launches
    const GaugeField &meta;                              // supplies tuning-key metadata

    unsigned int sharedBytesPerThread() const { return 0; }  // kernels use no shared memory
    unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

    unsigned int minThreads() const { return arg.outA.volumeCB; }
    bool tunedGridDim() const { return false; }

  public:
    StaggeredOprodField(StaggeredOprodArg<Float,Output,InputA,InputB> &arg, const GaugeField &meta)
      : arg(arg), meta(meta) {
      writeAuxString("threads=%d,prec=%lu,stride=%d",arg.length,sizeof(Complex)/2,arg.inA.Stride());
      // this sets the communications pattern for the packing kernel
      // NOTE(review): 'comms' is declared on a source line (orig. 290) missing
      // from this excerpt — confirm its definition against the full file.
      setPackComms(comms);
    }

    virtual ~StaggeredOprodField() {}

    // Launch the kernel selected by arg.kernelType (and arg.dir for the
    // exterior case) on the given stream.
    void apply(const cudaStream_t &stream){
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
        // Disable tuning for the time being
        TuneParam tp = tuneLaunch(*this, QUDA_TUNE_NO, getVerbosity());
        if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
          interiorOprodKernel<<<tp.grid,tp.block,tp.shared_bytes, stream>>>(arg);
        } else if (arg.kernelType == OPROD_EXTERIOR_KERNEL) {
          // arg.dir picks the compile-time dimension specialization
          if (arg.dir == 0) exteriorOprodKernel<0><<<tp.grid,tp.block,tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 1) exteriorOprodKernel<1><<<tp.grid,tp.block,tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 2) exteriorOprodKernel<2><<<tp.grid,tp.block,tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 3) exteriorOprodKernel<3><<<tp.grid,tp.block,tp.shared_bytes, stream>>>(arg);
        } else {
          errorQuda("Kernel type not supported\n");
        }
      } else { // run the CPU code
        errorQuda("No CPU support for staggered outer-product calculation\n");
      }
    } // apply

    // Save/restore the accumulated output fields around auto-tuning so
    // trial launches do not corrupt the result.
    void preTune(){ this->arg.outA.save(); this->arg.outB.save(); }
    void postTune(){ this->arg.outA.load(); this->arg.outB.load(); }

    long long flops() const { return 0; } // FIXME
    long long bytes() const { return 0; } // FIXME
    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux);}
  }; // StaggeredOprodField
322 
323 
  // Exchange the ghost (halo) zones of spinor field 'a' in all partitioned
  // dimensions: pack faces on the device, gather to the host, run the
  // communications, then scatter the received data into the ghost buffers.
  //
  // NOTE(review): this excerpt is missing several original source lines
  // (orig. 330-332, 335, 344, 359); the declaration of 'location' used by
  // a.pack() and any intervening stream-synchronization calls are among
  // them — consult the full source before modifying this routine.
  void exchangeGhost(int nFace, cudaColorSpinorField &a, int parity, int dag) {
    // need to enable packing in temporal direction to get spin-projector correct
    bool pack_old = getKernelPackT();
    setKernelPackT(true);

    // first transfer src1
    // pack the faces of the opposite parity (1-parity) on stream Nstream-1
    a.pack(nFace, 1-parity, dag, Nstream-1, location, Device);

    for(int i=3; i>=0; i--){
      if(commDimPartitioned(i)){
        // Initialize the host transfer from the source spinor
        a.gather(nFace, dag, 2*i);
      } // commDim(i)
    } // i=3,..,0

    // start communications in all partitioned dimensions
    for (int i=3; i>=0; i--) {
      if(commDimPartitioned(i)) {
        a.commsStart(nFace, 2*i, dag);
      }
    }

    // complete communications and scatter into the ghost buffers
    for (int i=3; i>=0; i--) {
      if(commDimPartitioned(i)) {
        a.commsWait(nFace, 2*i, dag);
        a.scatter(nFace, dag, 2*i);
      }
    }

    setKernelPackT(pack_old); // restore packing state

    // flip the double buffer and keep all ranks in step
    a.bufferIndex = (1 - a.bufferIndex);
    comm_barrier();
  }
365 
366  template<typename Float, typename Output, typename InputA, typename InputB>
367  void computeStaggeredOprodCuda(Output outA, Output outB, GaugeField& outFieldA, GaugeField& outFieldB, InputA& inA, InputB& inB, cudaColorSpinorField& src,
368  const unsigned int parity, const int faceVolumeCB[4], const double coeff[2], int nFace)
369  {
370  unsigned int ghostOffset[4] = {0,0,0,0};
371  for(int dir=0; dir<4; ++dir) ghostOffset[dir] = src.GhostOffset(dir,1)/src.FieldOrder(); // offset we want is the forwards one
372 
373  // Create the arguments for the interior kernel
374  StaggeredOprodArg<Float,Output,InputA,InputB> arg(parity, 0, ghostOffset, 1, OPROD_INTERIOR_KERNEL, nFace, coeff, inA, inB, outA, outB, outFieldA);
375  StaggeredOprodField<Float,Output,InputA,InputB> oprod(arg, outFieldA);
376 
377  arg.kernelType = OPROD_INTERIOR_KERNEL;
378  arg.length = src.VolumeCB();
379  oprod.apply(streams[Nstream-1]);
380 
381  for(int i=3; i>=0; i--){
382  if (commDimPartitioned(i)) {
383  // update parameters for this exterior kernel
384  arg.kernelType = OPROD_EXTERIOR_KERNEL;
385  arg.dir = i;
386 
387  // First, do the one hop term
388  {
389  arg.displacement = 1;
390  arg.length = faceVolumeCB[i];
391  oprod.apply(streams[Nstream-1]);
392  }
393 
394  // Now do the 3 hop term
395  if (nFace == 3) {
396  arg.displacement = 3;
397  arg.length = arg.displacement*faceVolumeCB[i];
398  oprod.apply(streams[Nstream-1]);
399  }
400  }
401  } // i=3,..,0
402 
403  checkCudaError();
404  } // computeStaggeredOprodCuda
405 
406 #endif // GPU_STAGGERED_DIRAC
407 
409  const unsigned int parity, const double coeff[2], int nFace)
410  {
411 #ifdef GPU_STAGGERED_DIRAC
412  if(outA.Order() != QUDA_FLOAT2_GAUGE_ORDER)
413  errorQuda("Unsupported output ordering: %d\n", outA.Order());
414 
415  if(outB.Order() != QUDA_FLOAT2_GAUGE_ORDER)
416  errorQuda("Unsupported output ordering: %d\n", outB.Order());
417 
418  if(inEven.Precision() != outA.Precision()) errorQuda("Mixed precision not supported: %d %d\n", inEven.Precision(), outA.Precision());
419 
420  cudaColorSpinorField &inA = (parity&1) ? static_cast<cudaColorSpinorField&>(inOdd) : static_cast<cudaColorSpinorField&>(inEven);
421  cudaColorSpinorField &inB = (parity&1) ? static_cast<cudaColorSpinorField&>(inEven) : static_cast<cudaColorSpinorField&>(inOdd);
422 
423  inA.allocateGhostBuffer(nFace);
424  inB.allocateGhostBuffer(nFace);
425 
426  if (inEven.Precision() == QUDA_DOUBLE_PRECISION) {
427  Spinor<double2, double2, 3, 0, 0> spinorA(inA, nFace);
428  Spinor<double2, double2, 3, 0, 1> spinorB(inB, nFace);
429  exchangeGhost(nFace,static_cast<cudaColorSpinorField&>(inB), parity, 0);
430 
431  computeStaggeredOprodCuda<double>(gauge::FloatNOrder<double, 18, 2, 18>(outA), gauge::FloatNOrder<double, 18, 2, 18>(outB),
432  outA, outB, spinorA, spinorB, inB, parity, inB.GhostFace(), coeff, nFace);
433  } else if (inEven.Precision() == QUDA_SINGLE_PRECISION) {
434  Spinor<float2, float2, 3, 0, 0> spinorA(inA, nFace);
435  Spinor<float2, float2, 3, 0, 1> spinorB(inB, nFace);
436  exchangeGhost(nFace,static_cast<cudaColorSpinorField&>(inB), parity, 0);
437 
438  computeStaggeredOprodCuda<float>(gauge::FloatNOrder<float, 18, 2, 18>(outA), gauge::FloatNOrder<float, 18, 2, 18>(outB),
439  outA, outB, spinorA, spinorB, inB, parity, inB.GhostFace(), coeff, nFace);
440  } else {
441  errorQuda("Unsupported precision: %d\n", inEven.Precision());
442  }
443 
444 #else // GPU_STAGGERED_DIRAC not defined
445  errorQuda("Staggered Outer Product has not been built!");
446 #endif
447 
448  return;
449  } // computeStaggeredOprod
450 
451  void computeStaggeredOprod(GaugeField *out[], ColorSpinorField& in, const double coeff[], int nFace)
452  {
453  if (nFace == 1) {
454  computeStaggeredOprod(*out[0], *out[0], in.Even(), in.Odd(), 0, coeff, nFace);
455  double coeff_[2] = {-coeff[0],0.0}; // need to multiply by -1 on odd sites
456  computeStaggeredOprod(*out[0], *out[0], in.Even(), in.Odd(), 1, coeff_, nFace);
457  } else if (nFace == 3) {
458  computeStaggeredOprod(*out[0], *out[1], in.Even(), in.Odd(), 0, coeff, nFace);
459  computeStaggeredOprod(*out[0], *out[1], in.Even(), in.Odd(), 1, coeff, nFace);
460  } else {
461  errorQuda("Invalid nFace=%d", nFace);
462  }
463  }
464 
465 } // namespace quda
__device__ __forceinline__ int neighborIndex(const unsigned int &cb_idx, const int(&shift)[4], const bool(&partitioned)[4], const unsigned int &parity)
dim3 dim3 blockDim
int commDimPartitioned(int dir)
void allocateGhostBuffer(int nFace, bool spin_project=true) const
Allocate the ghost buffers.
bool getKernelPackT()
Definition: dslash_quda.cu:61
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
const void * src
#define errorQuda(...)
Definition: util_quda.h:90
void setPackComms(const int *commDim)
Definition: dslash_pack.cu:41
std::complex< double > Complex
Definition: eig_variables.h:13
cudaStream_t * streams
cudaStream_t * stream
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
const ColorSpinorField & Even() const
const ColorSpinorField & Odd() const
const int Nstream
KernelType
#define b
__device__ __host__ void outerProd(const Array< T, N > &a, const Array< T, N > &b, Matrix< T, N > *m)
Definition: quda_matrix.h:695
IndexType
cpuColorSpinorField * in
static __device__ __forceinline__ void coordsFromIndex(int &idx, T *x, int &cb_idx, const Param &param)
Compute coordinates from index into the checkerboard (used by the interior Dslash kernels)...
const auto * Xh
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
static unsigned int unsigned int shift
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
cpuColorSpinorField * out
unsigned long long flops
Definition: blas_quda.cu:42
int full_idx
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
void size_t length
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:59
const void * c
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:204
void computeStaggeredOprod(GaugeField *out[], ColorSpinorField &in, const double coeff[], int nFace)
Compute the outer-product field between the staggered quark field&#39;s one and (for HISQ and ASQTAD) thr...
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
#define checkCudaError()
Definition: util_quda.h:129
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:53
#define a
unsigned long long bytes
Definition: blas_quda.cu:43
void comm_barrier(void)
Definition: comm_mpi.cpp:328