// QUDA 0.9.0 — gauge_field_order.h
// (code listing extracted from the generated Doxygen documentation)
1 #ifndef _GAUGE_ORDER_H
2 #define _GAUGE_ORDER_H
3 
10 // trove requires the warp shuffle instructions introduced with Kepler
11 #if __COMPUTE_CAPABILITY__ >= 300
12 #include <trove/ptr.h>
13 #else
14 #define DISABLE_TROVE
15 #endif
16 #include <tune_quda.h>
17 #include <assert.h>
18 #include <register_traits.h>
19 #include <complex_quda.h>
20 #include <quda_matrix.h>
21 #include <index_helper.cuh>
22 #include <fast_intdiv.h>
23 #include <type_traits>
24 #include <atomic.cuh>
25 #include <thrust_helper.cuh>
26 
27 namespace quda {
28 
 40  template <typename Float, typename T>
 41  struct gauge_wrapper {
 // Lightweight site reference into a gauge field: binds the accessor `gauge`
 // to a (dim, x_cb, parity) site so that link matrices can be loaded/stored
 // with natural assignment syntax (see the Matrix operators defined below).
 42  const int dim;    // direction index of the link
 43  const int x_cb;   // checkerboarded site index
 44  const int parity; // site parity (0 or 1)
 45  T &gauge;         // reference to the underlying gauge-field accessor
 46 
 // Capture the accessor and site coordinates; no field data is touched here.
 54  __device__ __host__ inline gauge_wrapper<Float,T>(T &gauge, int dim, int x_cb, int parity)
 55  : gauge(gauge), dim(dim), x_cb(x_cb), parity(parity) { }
 56 
 // Assigning a matrix-like object (anything exposing a .data member) stores
 // it into the gauge field at the captured site via the accessor's save().
 61  template<typename M>
 62  __device__ __host__ inline void operator=(const M &a) {
 63  gauge.save((Float*)a.data, x_cb, dim, parity);
 64  }
 65  };
66 
 // Fill this matrix by loading from the gauge field referenced by the
 // wrapper, at the wrapper's captured (x_cb, dim, parity) site.
 71  template <typename T, int N>
 72  template <typename S>
 73  __device__ __host__ inline void Matrix<T,N>::operator=(const gauge_wrapper<typename RealType<T>::type,S> &a) {
 74  a.gauge.load((typename RealType<T>::type*)data, a.x_cb, a.dim, a.parity);
 75  }
76 
 // Construct a matrix directly from a gauge_wrapper: loads the link matrix
 // from the wrapped gauge field at the wrapper's captured site.
 81  template <typename T, int N>
 82  template <typename S>
 83  __device__ __host__ inline Matrix<T,N>::Matrix(const gauge_wrapper<typename RealType<T>::type,S> &a) {
 84  a.gauge.load((typename RealType<T>::type*)data, a.x_cb, a.dim, a.parity);
 85  }
86 
 98  template <typename Float, typename T>
 // NOTE(review): the struct declaration line (`struct gauge_ghost_wrapper {`)
 // appears to have been lost when this listing was extracted — restore it
 // before compiling.  Mirrors gauge_wrapper above, but addresses the ghost
 // (halo) zone via saveGhost/loadGhost rather than the body of the field.
 100  const int dim;       // direction index of the ghost zone
 101  const int ghost_idx; // index within the ghost buffer
 102  const int parity;    // site parity (0 or 1)
 103  T &gauge;            // reference to the underlying gauge-field accessor
 104 
 // NOTE(review): the member-initializer list of this constructor is also
 // missing from the extracted listing — presumably it mirrors gauge_wrapper's.
 112  __device__ __host__ inline gauge_ghost_wrapper<Float,T>(T &gauge, int dim, int ghost_idx, int parity)
 114 
 // Assigning a matrix-like object stores it into the ghost zone at the
 // captured (ghost_idx, dim, parity) location.
 119  template<typename M>
 120  __device__ __host__ inline void operator=(const M &a) {
 121  gauge.saveGhost((Float*)a.data, ghost_idx, dim, parity);
 122  }
 123  };
124 
 // Fill this matrix by loading from the ghost zone of the gauge field
 // referenced by the wrapper, at its captured (ghost_idx, dim, parity).
 129  template <typename T, int N>
 130  template <typename S>
 131  __device__ __host__ inline void Matrix<T,N>::operator=(const gauge_ghost_wrapper<typename RealType<T>::type,S> &a) {
 132  a.gauge.loadGhost((typename RealType<T>::type*)data, a.ghost_idx, a.dim, a.parity);
 133  }
134 
 // Construct a matrix directly from a gauge_ghost_wrapper: loads the link
 // matrix from the ghost zone at the wrapper's captured location.
 139  template <typename T, int N>
 140  template <typename S>
 141  __device__ __host__ inline Matrix<T,N>::Matrix(const gauge_ghost_wrapper<typename RealType<T>::type,S> &a) {
 142  a.gauge.loadGhost((typename RealType<T>::type*)data, a.ghost_idx, a.dim, a.parity);
 143  }
144 
145  namespace gauge {
146 
147  template<typename ReduceType, typename Float> struct square { __host__ __device__ ReduceType operator()(quda::complex<Float> x) { return static_cast<ReduceType>(norm(x)); } };
148 
 // Catch-all accessor for gauge-field orders without a specialization below:
 // construction aborts with an error, and operator() returns a dummy element
 // only so that the signature remains well-formed.
 149  template<typename Float, int nColor, QudaGaugeFieldOrder order> struct Accessor {
 150  mutable complex<Float> dummy;
 151  Accessor(const GaugeField &, void *gauge_=0, void **ghost_=0) {
 152  errorQuda("Not implemented for order=%d", order);
 153  }
 154  __device__ __host__ complex<Float>& operator()(int d, int parity, int x, int row, int col) const {
 155  return dummy;
 156  }
 157  };
158 
 // Catch-all ghost-zone accessor for orders without a specialization below:
 // construction aborts with an error; operator() returns a dummy element.
 159  template<typename Float, int nColor, QudaGaugeFieldOrder order, bool native_ghost>
 160  struct GhostAccessor {
 161  mutable complex<Float> dummy;
 162  GhostAccessor(const GaugeField &, void *gauge_=0, void **ghost_=0) {
 163  errorQuda("Not implemented for order=%d", order);
 164  }
 165  __device__ __host__ complex<Float>& operator()(int d, int parity, int x, int row, int col) const {
 166  return dummy;
 167  }
 168  };
169 
 170  template<typename Float, int nColor>
 // NOTE(review): the specialization header line
 // (`struct Accessor<Float,nColor,QUDA_QDP_GAUGE_ORDER> {`) is missing from
 // this extracted listing.  QDP order keeps one pointer per geometry
 // dimension, with even/odd halves separated by cb_offset complex elements.
 172  complex <Float> *u[QUDA_MAX_GEOMETRY];
 173  const int cb_offset; // complex elements per parity, per dimension
 174  Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
 175  : cb_offset((U.Bytes()>>1) / (sizeof(complex<Float>)*U.Geometry())) {
 176  for (int d=0; d<U.Geometry(); d++)
 177  u[d] = gauge_ ? static_cast<complex<Float>**>(gauge_)[d] :
 178  static_cast<complex<Float>**>(const_cast<void*>(U.Gauge_p()))[d];
 179  }
 180  Accessor(const Accessor<Float,nColor,QUDA_QDP_GAUGE_ORDER> &a) : cb_offset(a.cb_offset) {
 181  for (int d=0; d<QUDA_MAX_GEOMETRY; d++)
 182  u[d] = a.u[d];
 183  }
 // Element (row,col) of link d at checkerboard site x: row-major within the
 // color matrix, matrices stored site-contiguously.
 184  __device__ __host__ inline complex<Float>& operator()(int d, int parity, int x, int row, int col) const
 185  { return u[d][ parity*cb_offset + (x*nColor + row)*nColor + col]; }
 186 
 // Atomically accumulate val: a vectorized 2-float atomicAdd on the device,
 // a plain (non-atomic) += on the host.
 187  __device__ __host__ inline void atomic_add(int dim, int parity, int x_cb, int row, int col, complex<Float> &val) const {
 188 #ifdef __CUDA_ARCH__
 189  typedef typename vector<Float,2>::type vec2;
 190  vec2 *u2 = reinterpret_cast<vec2*>(u[dim] + parity*cb_offset + (x_cb*nColor + row)*nColor + col);
 191  atomicAdd(u2, (vec2&)val);
 192 #else
 193  u[dim][ parity*cb_offset + (x_cb*nColor + row)*nColor + col] += val;
 194 #endif
 195  }
 196 
 // Device-side norm2 is only implemented for the FLOAT2 specialization.
 197  __host__ double device_norm2(int dim) const {
 198  errorQuda("Not implemented");
 199  return 0.0;
 200  }
 201  };
202 
 // Ghost-zone accessor for QDP-ordered fields.  Slots 0-3 are the forward
 // ghost zones; slots 4-7 are only populated for coarse-link geometry.
 203  template<typename Float, int nColor, bool native_ghost>
 204  struct GhostAccessor<Float,nColor,QUDA_QDP_GAUGE_ORDER,native_ghost> {
 205  complex<Float> *ghost[8];
 206  int ghostOffset[8]; // complex elements per parity in each ghost buffer
 207  GhostAccessor(const GaugeField &U, void *gauge_=0, void **ghost_=0) {
 208  for (int d=0; d<4; d++) {
 209  ghost[d] = ghost_ ? static_cast<complex<Float>*>(ghost_[d]) :
 210  static_cast<complex<Float>*>(const_cast<void*>(U.Ghost()[d]));
 211  ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
 212 
 213  ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
 214  ghost_ ? static_cast<complex<Float>*>(ghost_[d+4]) :
 215  static_cast<complex<Float>*>(const_cast<void*>(U.Ghost()[d+4]));
 216  ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
 217  }
 218  }
 // NOTE(review): the copy-constructor signature line is missing from this
 // extracted listing; the body below copies all eight ghost slots.
 220  for (int d=0; d<8; d++) {
 221  ghost[d] = a.ghost[d];
 222  ghostOffset[d] = a.ghostOffset[d];
 223  }
 224  }
 225  __device__ __host__ inline complex<Float>& operator()(int d, int parity, int x, int row, int col) const
 226  { return ghost[d][ parity*ghostOffset[d] + (x*nColor + row)*nColor + col]; }
 227  };
228 
 229  template<typename Float, int nColor>
 // NOTE(review): the specialization header line
 // (`struct Accessor<Float,nColor,QUDA_MILC_GAUGE_ORDER> {`) is missing from
 // this extracted listing.  MILC order stores all dimensions in one buffer,
 // indexed as ((parity, site), dimension, row, col).
 231  complex<Float> *u;
 232  const int volumeCB; // checkerboarded volume
 233  const int geometry; // number of dimensions per site
 234  Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
 235  : u(gauge_ ? static_cast<complex<Float>*>(gauge_) :
 236  static_cast<complex<Float>*>(const_cast<void *>(U.Gauge_p()))),
 237  volumeCB(U.VolumeCB()), geometry(U.Geometry()) { }
 // NOTE(review): the copy-constructor signature line is missing here.
 239  : u(a.u), volumeCB(a.volumeCB), geometry(a.geometry) { }
 240  __device__ __host__ inline complex<Float>& operator()(int d, int parity, int x, int row, int col) const
 241  { return u[(((parity*volumeCB+x)*geometry + d)*nColor + row)*nColor + col]; }
 242 
 // Atomically accumulate val: a vectorized 2-float atomicAdd on the device,
 // a plain (non-atomic) += on the host.
 243  __device__ __host__ inline void atomic_add(int dim, int parity, int x_cb, int row, int col, complex<Float> &val) const {
 244 #ifdef __CUDA_ARCH__
 245  typedef typename vector<Float,2>::type vec2;
 246  vec2 *u2 = reinterpret_cast<vec2*>(u + (((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col);
 247  atomicAdd(u2, (vec2&)val);
 248 #else
 249  u[(((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col] += val;
 250 #endif
 251  }
 252 
 // Device-side norm2 is only implemented for the FLOAT2 specialization.
 253  __host__ double device_norm2(int dim) const {
 254  errorQuda("Not implemented");
 255  return 0.0;
 256  }
 257  };
258 
 // Ghost-zone accessor for MILC-ordered fields; identical layout logic to the
 // QDP ghost accessor above (slots 4-7 only for coarse-link geometry).
 259  template<typename Float, int nColor, bool native_ghost>
 260  struct GhostAccessor<Float,nColor,QUDA_MILC_GAUGE_ORDER,native_ghost> {
 261  complex<Float> *ghost[8];
 262  int ghostOffset[8]; // complex elements per parity in each ghost buffer
 263  GhostAccessor(const GaugeField &U, void *gauge_=0, void **ghost_=0) {
 264  for (int d=0; d<4; d++) {
 265  ghost[d] = ghost_ ? static_cast<complex<Float>*>(ghost_[d]) :
 266  static_cast<complex<Float>*>(const_cast<void*>(U.Ghost()[d]));
 267  ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
 268 
 269  ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
 270  ghost_ ? static_cast<complex<Float>*>(ghost_[d+4]) :
 271  static_cast<complex<Float>*>(const_cast<void*>(U.Ghost()[d+4]));
 272  ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
 273  }
 274  }
 // NOTE(review): the copy-constructor signature line is missing from this
 // extracted listing; the body below copies all eight ghost slots.
 276  for (int d=0; d<8; d++) {
 277  ghost[d] = a.ghost[d];
 278  ghostOffset[d] = a.ghostOffset[d];
 279  }
 280  }
 281  __device__ __host__ inline complex<Float>& operator()(int d, int parity, int x, int row, int col) const
 282  { return ghost[d][ parity*ghostOffset[d] + (x*nColor + row)*nColor + col]; }
 283  };
284 
285  template<int nColor, int N>
286  __device__ __host__ inline int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb) {
287  constexpr int M = (2*nColor*nColor) / N;
288  int j = ((row*nColor+col)*2) / N; // factor of two for complexity
289  int i = ((row*nColor+col)*2) % N;
290  int index = ((x_cb + dim*stride*M + j*stride)*2+i) / 2; // back to a complex offset
291  index += parity*offset_cb;
292  return index;
293  };
294 
 295  template<typename Float, int nColor>
 // NOTE(review): the specialization header line
 // (`struct Accessor<Float,nColor,QUDA_FLOAT2_GAUGE_ORDER> {`) is missing
 // from this extracted listing.  Native FLOAT2 order: element (row,col) of
 // each link is strided across the field (stride-major), parities separated
 // by offset_cb complex elements.
 297  complex<Float> *u;
 298  const int offset_cb; // complex elements per parity
 299  const int stride;    // field stride (padded site count)
 300  const int geometry;  // number of dimensions per site
 301  Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
 302  : u(gauge_ ? static_cast<complex<Float>*>(gauge_) :
 303  static_cast<complex<Float>*>(const_cast<void*>(U.Gauge_p()))),
 304  offset_cb( (U.Bytes()>>1) / sizeof(complex<Float>)), stride(U.Stride()), geometry(U.Geometry())
 305  { }
 // NOTE(review): the copy-constructor signature line is missing here.
 307  : u(a.u), offset_cb(a.offset_cb), stride(a.stride), geometry(a.geometry) { }
 308 
 309  __device__ __host__ inline complex<Float>& operator()(int dim, int parity, int x_cb, int row, int col) const
 310  { return u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb]; }
 311 
 // Atomically accumulate val: a vectorized 2-float atomicAdd on the device,
 // a plain (non-atomic) += on the host.
 312  __device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, complex<Float> &val) const {
 313 #ifdef __CUDA_ARCH__
 314  typedef typename vector<Float,2>::type vec2;
 315  vec2 *u2 = reinterpret_cast<vec2*>(u + parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb);
 316  atomicAdd(u2, (vec2&)val);
 317 #else
 318  u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb] += val;
 319 #endif
 320  }
 321 
 // L2 norm-squared of dimension dim, computed on the device via thrust
 // transform_reduce over the even and odd halves separately.
 322  __host__ double device_norm2(int dim) const {
 323  if (dim >= geometry) errorQuda("Request dimension %d exceeds dimensionality of the field %d", dim, geometry);
 // NOTE(review): the declaration of `alloc` (the thrust allocator used below)
 // is missing from this extracted listing.
 325  thrust::device_ptr<complex<Float> > ptr(u);
 326  double even = thrust::transform_reduce(thrust::cuda::par(alloc),
 327  ptr+0*offset_cb+(dim+0)*stride*nColor*nColor,
 328  ptr+0*offset_cb+(dim+1)*stride*nColor*nColor,
 329  square<double,Float>(), 0.0, thrust::plus<double>());
 330  double odd = thrust::transform_reduce(thrust::cuda::par(alloc),
 331  ptr+1*offset_cb+(dim+0)*stride*nColor*nColor,
 332  ptr+1*offset_cb+(dim+1)*stride*nColor*nColor,
 333  square<double,Float>(), 0.0, thrust::plus<double>());
 334  return even + odd;
 335  }
 336 
 337  };
338 
 // Ghost-zone accessor for native FLOAT2 fields.  When native_ghost is true
 // the ghost zones live past the end of the body (accessed through the
 // embedded body accessor); otherwise separate ghost buffers must be given.
 339  template<typename Float, int nColor, bool native_ghost>
 340  struct GhostAccessor<Float,nColor,QUDA_FLOAT2_GAUGE_ORDER,native_ghost> {
 341  complex<Float> *ghost[8];
 342  const int volumeCB;
 343  int ghostVolumeCB[8]; // sites per ghost zone (surface * depth)
 // NOTE(review): the declaration of the embedded body accessor member
 // (used in the initializer lists and operator() below) is missing from
 // this extracted listing.
 345  GhostAccessor(const GaugeField &U, void *gauge_, void **ghost_=0)
 346  : volumeCB(U.VolumeCB()), accessor(U, gauge_, ghost_)
 347  {
 348  if (!native_ghost) assert(ghost_ != nullptr);
 349  for (int d=0; d<4; d++) {
 350  ghost[d] = !native_ghost ? static_cast<complex<Float>*>(ghost_[d]) : nullptr;
 351  ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d);
 352  ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast<complex<Float>*>(ghost_[d+4]) : nullptr;
 353  ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d);
 354  }
 355  }
 // NOTE(review): the copy-constructor signature line is missing here.
 357  : volumeCB(a.volumeCB), accessor(a.accessor)
 358  {
 359  for (int d=0; d<8; d++) {
 360  ghost[d] = a.ghost[d];
 361  ghostVolumeCB[d] = a.ghostVolumeCB[d];
 362  }
 363  }
 364  __device__ __host__ inline complex<Float>& operator()(int d, int parity, int x_cb, int row, int col) const
 365  {
 366  if (native_ghost)
 // native ghosts are appended after the body, so index past volumeCB
 367  return accessor(d%4, parity, x_cb+(d/4)*ghostVolumeCB[d]+volumeCB, row, col);
 368  else
 369  return ghost[d][ ((parity*nColor + row)*nColor+col)*ghostVolumeCB[d] + x_cb ];
 370  }
 371  };
372 
373 
 // Generic gauge-field accessor wrapper combining a body accessor and a
 // ghost accessor for the given order.  Only valid for fields stored without
 // reconstruction (checked in the constructor).  The fine color index can
 // also be addressed as a (coarse-spin, coarse-color) pair.
 379  template <typename Float, int nColor, int nSpinCoarse, QudaGaugeFieldOrder order, bool native_ghost=true>
 380  struct FieldOrder {
 381 
 383  const int volumeCB;
 384  const int nDim;
 385  const int geometry;
 386  static constexpr int nColorCoarse = nColor / nSpinCoarse;
 388 
 // NOTE(review): the declarations of the `location`, `accessor` and
 // `ghostAccessor` members (referenced throughout this class) are missing
 // from this extracted listing.
 391 
 392  public:
 397  FieldOrder(GaugeField &U, void *gauge_=0, void **ghost_=0)
 398  : volumeCB(U.VolumeCB()), nDim(U.Ndim()), geometry(U.Geometry()),
 399  location(U.Location()),
 400  accessor(U, gauge_, ghost_), ghostAccessor(U, gauge_, ghost_)
 401  {
 402  if (U.Reconstruct() != QUDA_RECONSTRUCT_NO)
 403  errorQuda("GaugeField ordering not supported with reconstruction");
 404  }
 405 
 // NOTE(review): the copy-constructor signature and part of its initializer
 // list are missing from this extracted listing.
 407  nDim(o.nDim), geometry(o.geometry),
 409  { }
 410 
 411  virtual ~FieldOrder() { ; }
 412 
 // Fine-grained element access: (dimension, parity, site, row, col).
 421  __device__ __host__ const complex<Float>& operator()(int d, int parity, int x, int row, int col) const
 422  { return accessor(d,parity,x,row,col); }
 423 
 432  __device__ __host__ complex<Float>& operator() (int d, int parity, int x, int row, int col)
 433  { return accessor(d,parity,x,row,col); }
 434 
 // Ghost-zone element access with the same index convention.
 443  __device__ __host__ const complex<Float>& Ghost(int d, int parity, int x, int row, int col) const
 444  { return ghostAccessor(d,parity,x,row,col); }
 445 
 454  __device__ __host__ complex<Float>& Ghost(int d, int parity, int x, int row, int col)
 455  { return ghostAccessor(d,parity,x,row,col); }
 456 
 // Coarse-grid overloads: the fine color index is split into
 // (spin, color) blocks of size nColorCoarse.
 467  __device__ __host__ inline const complex<Float>& operator()(int d, int parity, int x, int s_row,
 468  int s_col, int c_row, int c_col) const {
 469  return (*this)(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col);
 470  }
 471 
 482  __device__ __host__ inline complex<Float>& operator()(int d, int parity, int x, int s_row,
 483  int s_col, int c_row, int c_col) {
 484  return (*this)(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col);
 485  }
 486 
 497  __device__ __host__ inline const complex<Float>& Ghost(int d, int parity, int x, int s_row,
 498  int s_col, int c_row, int c_col) const {
 499  return Ghost(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col);
 500  }
 501 
 512  __device__ __host__ inline complex<Float>& Ghost(int d, int parity, int x, int s_row,
 513  int s_col, int c_row, int c_col) {
 514  return Ghost(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col);
 515  }
 516 
 // Atomic accumulation at a coarse-indexed element (delegates to the
 // order-specific accessor's atomic_add).
 517  __device__ __host__ inline void atomicAdd(int d, int parity, int x, int s_row, int s_col,
 518  int c_row, int c_col, complex<Float> &val) {
 519  accessor.atomic_add(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col, val);
 520  }
 521 
 523  __device__ __host__ inline int Ncolor() const { return nColor; }
 524 
 526  __device__ __host__ inline int Volume() const { return 2*volumeCB; }
 527 
 529  __device__ __host__ inline int VolumeCB() const { return volumeCB; }
 530 
 532  __device__ __host__ inline int Ndim() const { return nDim; }
 533 
 535  __device__ __host__ inline int Geometry() const { return geometry; }
 536 
 538  __device__ __host__ inline int NspinCoarse() const { return nSpinCoarse; }
 539 
 541  __device__ __host__ inline int NcolorCoarse() const { return nColorCoarse; }
 542 
 // Globally-reduced L2 norm-squared of dimension dim.
 // NOTE(review): the branch condition (presumably a test on `location`)
 // preceding the device path is missing from this extracted listing.
 548  __host__ double norm2(int dim) const {
 549  double nrm2 = 0;
 551  // call device version - specialized for ordering
 552  nrm2 = accessor.device_norm2(dim);
 553  } else {
 554  // do simple norm on host memory
 555  for (int parity=0; parity<2; parity++)
 556  for (int x_cb=0; x_cb<volumeCB; x_cb++) {
 557  for (int row=0; row<nColor; row++)
 558  for (int col=0; col<nColor; col++)
 559  nrm2 += norm((*this)(dim,parity,x_cb,row,col));
 560  }
 561  }
 562  comm_allreduce(&nrm2);
 563  return nrm2;
 564  }
 565 
 // Bytes per parity-dimension slice (one color matrix per site).
 567  size_t Bytes() const { return static_cast<size_t>(volumeCB) * nColor * nColor * 2ll * sizeof(Float); }
 568  };
569 
570 
 // Generic no-op reconstruction: all N real numbers of the link are stored,
 // so Pack/Unpack are straight copies and there is no phase.
 572  template <int N, typename Float>
 573  struct Reconstruct {
 574  typedef typename mapper<Float>::type RegType;
 575  Reconstruct(const GaugeField &u) { }
 // NOTE(review): a copy-constructor line appears to be missing from this
 // extracted listing.
 577 
 578  __device__ __host__ inline void Pack(RegType out[N], const RegType in[N], int idx ) const {
 579 #pragma unroll
 580  for (int i=0; i<N; i++) out[i] = in[i];
 581  }
 582  template<typename I>
 583  __device__ __host__ inline void Unpack(RegType out[N], const RegType in[N], int idx, int dir,
 584  const RegType phase, const I *X, const int *R) const {
 585 #pragma unroll
 586  for (int i=0; i<N; i++) out[i] = in[i];
 587  }
 588  __device__ __host__ inline RegType getPhase(const RegType in[N]) const { return 0; }
 589  };
590 
 // "Reconstruct 19": all 18 reals stored, but normalized by the field's
 // link maximum on Pack and rescaled on Unpack (fixed-point-style storage).
 593  template <typename Float>
 594  struct Reconstruct<19,Float> {
 595  typedef typename mapper<Float>::type RegType;
 // NOTE(review): the declaration of the `scale` member (initialized from
 // u.LinkMax() below) is missing from this extracted listing.
 597  Reconstruct(const GaugeField &u) : scale(u.LinkMax()) { }
 598  Reconstruct(const Reconstruct<19,Float> &recon) : scale(recon.scale) { }
 599 
 600  __device__ __host__ inline void Pack(RegType out[18], const RegType in[18], int idx) const {
 601 #pragma unroll
 602  for (int i=0; i<18; i++) out[i] = in[i] / scale;
 603  }
 604  template<typename I>
 605  __device__ __host__ inline void Unpack(RegType out[18], const RegType in[18], int idx, int dir,
 606  const RegType phase, const I *X, const int *R) const {
 607 #pragma unroll
 608  for (int i=0; i<18; i++) out[i] = scale * in[i];
 609  }
 610  __device__ __host__ inline RegType getPhase(const RegType in[18]) const { return 0; }
 611  };
612 
 // Return the temporal boundary-condition factor for the link at index idx:
 // tBoundary on the relevant time slice of the first/last time-slice rank,
 // 1.0 everywhere else.  Extended (halo-carrying) fields place the boundary
 // inside the border region R[3].
 623  template <typename Float, typename I>
 624  __device__ __host__ inline Float timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM],
 625  QudaTboundary tBoundary, bool isFirstTimeSlice, bool isLastTimeSlice,
 // NOTE(review): the final parameter line (the ghostExchange argument and
 // closing of the signature) is missing from this extracted listing.
 627  if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED) {
 628  if ( idx >= X[3]*X[2]*X[1]*X[0]/2 ) { // halo region on the first time slice
 629  return isFirstTimeSlice ? static_cast<Float>(tBoundary) : static_cast<Float>(1.0);
 630  } else if ( idx >= (X[3]-1)*X[0]*X[1]*X[2]/2 ) { // last link on the last time slice
 631  return isLastTimeSlice ? static_cast<Float>(tBoundary) : static_cast<Float>(1.0);
 632  } else {
 633  return static_cast<Float>(1.0);
 634  }
 635  } else {
 636  if ( idx >= (R[3]-1)*X[0]*X[1]*X[2]/2 && idx < R[3]*X[0]*X[1]*X[2]/2 ) {
 637  // the boundary condition is on the R[3]-1 time slice
 638  return isFirstTimeSlice ? static_cast<Float>(tBoundary) : static_cast<Float>(1.0);
 639  } else if ( idx >= (X[3]-R[3]-1)*X[0]*X[1]*X[2]/2 && idx < (X[3]-R[3])*X[0]*X[1]*X[2]/2 ) {
 640  // the boundary condition lies on the X[3]-R[3]-1 time slice
 641  return isLastTimeSlice ? static_cast<Float>(tBoundary) : static_cast<Float>(1.0);
 642  } else {
 643  return static_cast<Float>(1.0);
 644  }
 645  }
 646  }
647 
648  // not actually used - here for reference
649  template <typename Float, typename I>
650  __device__ __host__ inline Float milcStaggeredPhase(int dim, const int x[], const I R[]) {
651  // could consider non-extended varient too?
652  Float sign = static_cast<Float>(1.0);
653  switch (dim) {
654  case 0: if ( ((x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
655  case 1: if ( ((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
656  case 2: if ( ((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
657  }
658  return sign;
659  }
660 
 // 12-real reconstruction for SU(3): store the first two rows; rebuild the
 // third row as u0 * conj(row0 x row1), where u0 carries the anisotropy
 // (spatial links) or the temporal boundary factor (time links).
 661  template <typename Float>
 662  struct Reconstruct<12,Float> {
 663  typedef typename mapper<Float>::type RegType;
 664  typedef complex<RegType> Complex;
 // NOTE(review): the declarations of the members used below (anisotropy,
 // tBoundary, isFirstTimeSlice, isLastTimeSlice, ghostExchange) are missing
 // from this extracted listing.
 670 
 671  Reconstruct(const GaugeField &u) : anisotropy(u.Anisotropy()), tBoundary(u.TBoundary()),
 672  isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
 673  isLastTimeSlice(comm_coord(3) == comm_dim(3)-1 ? true : false),
 674  ghostExchange(u.GhostExchange()) { }
 675 
 // NOTE(review): the copy-constructor signature line is missing here.
 677  tBoundary(recon.tBoundary), isFirstTimeSlice(recon.isFirstTimeSlice),
 678  isLastTimeSlice(recon.isLastTimeSlice), ghostExchange(recon.ghostExchange) { }
 679 
 // Packing simply keeps the first 12 reals (rows 0 and 1).
 680  __device__ __host__ inline void Pack(RegType out[12], const RegType in[18], int idx) const {
 681 #pragma unroll
 682  for (int i=0; i<12; i++) out[i] = in[i];
 683  }
 684 
 685  template<typename I>
 686  __device__ __host__ inline void Unpack(RegType out[18], const RegType in[12], int idx, int dir,
 687  const RegType phase, const I *X, const int *R) const {
 688  const Complex *In = reinterpret_cast<const Complex*>(in);
 689  Complex *Out = reinterpret_cast<Complex*>(out);
 690 
 691  const RegType u0 = dir < 3 ? anisotropy :
 692  timeBoundary<RegType>(idx, X, R, tBoundary,isFirstTimeSlice, isLastTimeSlice, ghostExchange);
 693 
 694 #pragma unroll
 695  for(int i=0; i<6; ++i) Out[i] = In[i];
 696 
 // third row = u0 * conj(cross product of the first two rows)
 697  Out[6] = u0*conj(Out[1]*Out[5] - Out[2]*Out[4]);
 698  Out[7] = u0*conj(Out[2]*Out[3] - Out[0]*Out[5]);
 699  Out[8] = u0*conj(Out[0]*Out[4] - Out[1]*Out[3]);
 700  }
 701 
 702  __device__ __host__ inline RegType getPhase(const RegType in[18]) { return 0; }
 703  };
704 
 705  // FIX ME - 11 is a misnomer to avoid confusion in template instantiation
 // Reconstruction for anti-Hermitian (momentum-like) matrices: only the
 // upper-triangle complex entries and the (purely imaginary) diagonal are
 // stored; the lower triangle is rebuilt as minus the conjugate, and the
 // real diagonal entries are identically zero.
 706  template <typename Float>
 707  struct Reconstruct<11,Float> {
 708  typedef typename mapper<Float>::type RegType;
 709 
 710  Reconstruct(const GaugeField &u) { ; }
 // NOTE(review): a copy-constructor line appears to be missing from this
 // extracted listing.
 712 
 713  __device__ __host__ inline void Pack(RegType out[10], const RegType in[18], int idx) const {
 714 #pragma unroll
 715  for (int i=0; i<4; i++) out[i] = in[i+2]; // elements (0,1) and (0,2)
 716  out[4] = in[10]; // element (1,2), real part
 717  out[5] = in[11]; // element (1,2), imag part
 718  out[6] = in[1];  // imag part of diagonal (0,0)
 719  out[7] = in[9];  // imag part of diagonal (1,1)
 720  out[8] = in[17]; // imag part of diagonal (2,2)
 721  out[9] = 0.0;    // padding
 722  }
 723 
 724  template<typename I>
 725  __device__ __host__ inline void Unpack(RegType out[18], const RegType in[10], int idx, int dir,
 726  const RegType phase, const I *X, const int *R) const {
 727  out[0] = 0.0;
 728  out[1] = in[6];
 729 #pragma unroll
 730  for (int i=0; i<4; i++) out[i+2] = in[i];
 // lower triangle is minus the conjugate of the upper triangle
 731  out[6] = -out[2];
 732  out[7] = out[3];
 733  out[8] = 0.0;
 734  out[9] = in[7];
 735  out[10] = in[4];
 736  out[11] = in[5];
 737  out[12] = -out[4];
 738  out[13] = out[5];
 739  out[14] = -out[10];
 740  out[15] = out[11];
 741  out[16] = 0.0;
 742  out[17] = in[8];
 743  }
 744 
 745  __device__ __host__ inline RegType getPhase(const RegType in[18]) { return 0; }
 746 
 747  };
748 
 // 12-real reconstruction for U(3) fields (e.g. HISQ long links): delegates
 // row storage to Reconstruct<12> and carries the U(1) phase separately,
 // restoring it on the third row during Unpack.
 749  template <typename Float>
 750  struct Reconstruct<13,Float> {
 751  typedef typename mapper<Float>::type RegType;
 752  typedef complex<RegType> Complex;
 // NOTE(review): the declaration of the embedded `reconstruct_12` member is
 // missing from this extracted listing.
 754  const RegType scale;
 755 
 756  Reconstruct(const GaugeField &u) : reconstruct_12(u), scale(u.Scale()) { }
 757  Reconstruct(const Reconstruct<13,Float> &recon) : reconstruct_12(recon.reconstruct_12),
 758  scale(recon.scale) { }
 759 
 760  __device__ __host__ inline void Pack(RegType out[12], const RegType in[18], int idx) const {
 761  reconstruct_12.Pack(out, in, idx);
 762  }
 763 
 764  template<typename I>
 765  __device__ __host__ inline void Unpack(RegType out[18], const RegType in[12], int idx, int dir,
 766  const RegType phase, const I *X, const int *R) const {
 767  const Complex *In = reinterpret_cast<const Complex*>(in);
 768  Complex *Out = reinterpret_cast<Complex*>(out);
 769  const RegType coeff = static_cast<RegType>(1.0)/scale;
 770 
 771 #pragma unroll
 772  for(int i=0; i<6; ++i) Out[i] = In[i];
 773 
 // third row from the cross product of the first two, scaled by 1/scale
 774  Out[6] = coeff*conj(Out[1]*Out[5] - Out[2]*Out[4]);
 775  Out[7] = coeff*conj(Out[2]*Out[3] - Out[0]*Out[5]);
 776  Out[8] = coeff*conj(Out[0]*Out[4] - Out[1]*Out[3]);
 777 
 778  // Multiply the third row by exp(I*3*phase), since the cross product will end up in a scale factor of exp(-I*2*phase)
 779  RegType cos_sin[2];
 780  Trig<isHalf<RegType>::value,RegType>::SinCos(static_cast<RegType>(3.*phase), &cos_sin[1], &cos_sin[0]);
 781  Complex A(cos_sin[0], cos_sin[1]);
 782 
 783  Out[6] *= A;
 784  Out[7] *= A;
 785  Out[8] *= A;
 786  }
 787 
 // Extract the U(1) phase (one third of the arg of exp(I*3*phase)).
 788  __device__ __host__ inline RegType getPhase(const RegType in[18]) const {
 789 #if 1 // phase from cross product
 790  const Complex *In = reinterpret_cast<const Complex*>(in);
 791  // denominator = (U[0][0]*U[1][1] - U[0][1]*U[1][0])*
 792  Complex denom = conj(In[0]*In[4] - In[1]*In[3]) / scale;
 793  Complex expI3Phase = In[8] / denom; // numerator = U[2][2]
 794  RegType phase = arg(expI3Phase)/static_cast<RegType>(3.0);
 795 #else // phase from determinant
 // NOTE(review): the declaration of the local matrix `a` used below is
 // missing from this extracted listing (this branch is disabled anyway).
 797 #pragma unroll
 798  for (int i=0; i<9; i++) a(i) = Complex(in[2*i]/scale, in[2*i+1]/scale);
 799  const Complex det = getDeterminant( a );
 800  RegType phase = arg(det)/3;
 801 #endif
 802  return phase;
 803  }
 804 
 805  };
806 
807 
 // 8-real reconstruction for SU(3): store the phases of U00 and U20 plus the
 // complex elements U01, U02, U10; rebuild the remaining elements from
 // unitarity (row/column normalization and an SU(2) rotation).
 808  template <typename Float>
 809  struct Reconstruct<8,Float> {
 810  typedef typename mapper<Float>::type RegType;
 811  typedef complex<RegType> Complex;
 // NOTE(review): the declarations of the members used below (anisotropy,
 // tBoundary, isFirstTimeSlice, isLastTimeSlice, ghostExchange) are missing
 // from this extracted listing.
 817 
 818  Reconstruct(const GaugeField &u) : anisotropy(u.Anisotropy()), tBoundary(u.TBoundary()),
 819  isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
 820  isLastTimeSlice(comm_coord(3) == comm_dim(3)-1 ? true : false),
 821  ghostExchange(u.GhostExchange()) { }
 822 
 // NOTE(review): the copy-constructor signature line is missing here.
 824  tBoundary(recon.tBoundary), isFirstTimeSlice(recon.isFirstTimeSlice),
 825  isLastTimeSlice(recon.isLastTimeSlice), ghostExchange(recon.ghostExchange) { }
 826 
 // Pack: phases of U00 (in[0],in[1]) and U20 (in[12],in[13]), followed by
 // the complex elements U01, U02, U10 verbatim.
 827  __device__ __host__ inline void Pack(RegType out[8], const RegType in[18], int idx) const {
 828  out[0] = Trig<isHalf<Float>::value,RegType>::Atan2(in[1], in[0]);
 829  out[1] = Trig<isHalf<Float>::value,RegType>::Atan2(in[13], in[12]);
 830 #pragma unroll
 831  for (int i=2; i<8; i++) out[i] = in[i];
 832  }
 833 
 834  template<typename I>
 835  __device__ __host__ inline void Unpack(RegType out[18], const RegType in[8], int idx, int dir, const RegType phase,
 836  const I *X, const int *R, const RegType scale=1.0) const {
 837  const Complex *In = reinterpret_cast<const Complex*>(in);
 838  Complex *Out = reinterpret_cast<Complex*>(out);
 839 
 840  // First reconstruct first row
 841  Out[1] = In[1];
 842  Out[2] = In[2];
 843  RegType row_sum = norm(Out[1]) + norm(Out[2]);
 844 
 845  RegType u0 = dir < 3 ? anisotropy :
 846  timeBoundary<RegType>(idx, X, R, tBoundary,isFirstTimeSlice, isLastTimeSlice, ghostExchange);
 847  u0 *= scale;
 848 
 // |U00| from row normalization (clamped at zero for numerical safety)
 849  RegType diff = static_cast<RegType>(1.0)/(u0*u0) - row_sum;
 850  RegType U00_mag = sqrt(diff >= static_cast<RegType>(0.0) ? diff : static_cast<RegType>(0.0));
 851 
 852  Out[0] = U00_mag * Complex(Trig<isHalf<Float>::value,RegType>::Cos(in[0]), Trig<isHalf<Float>::value,RegType>::Sin(in[0]));
 853 
 854  // Now reconstruct first column
 855  Out[3] = In[3];
 856  RegType column_sum = norm(Out[0]) + norm(Out[3]);
 857 
 858  diff = static_cast<RegType>(1.0)/(u0*u0) - column_sum;
 859  RegType U20_mag = sqrt(diff >= static_cast<RegType>(0.0) ? diff : static_cast<RegType>(0.0));
 860 
 861  Out[6] = U20_mag * Complex( Trig<isHalf<Float>::value,RegType>::Cos(in[1]), Trig<isHalf<Float>::value,RegType>::Sin(in[1]));
 862  // First column now restored
 863 
 864  // finally reconstruct last elements from SU(2) rotation
 865  RegType r_inv2 = static_cast<RegType>(1.0)/(u0*row_sum);
 866 
 867  Complex A = conj(Out[0])*Out[3];
 868  Out[4] = -(conj(Out[6])*conj(Out[2]) + u0*A*Out[1])*r_inv2; // U11
 869  Out[5] = (conj(Out[6])*conj(Out[1]) - u0*A*Out[2])*r_inv2; // U12
 870 
 871  A = conj(Out[0])*Out[6];
 872  Out[7] = (conj(Out[3])*conj(Out[2]) - u0*A*Out[1])*r_inv2; // U21
 873  Out[8] = -(conj(Out[3])*conj(Out[1]) + u0*A*Out[2])*r_inv2; // U22
 874  }
 875 
 876  __device__ __host__ inline RegType getPhase(const RegType in[18]){ return 0; }
 877  };
878 
879 
 // 8-real reconstruction for U(3) fields: strip the U(1) phase to obtain an
 // (SU(3)-proportional) matrix, delegate to Reconstruct<8>, and restore the
 // phase on Unpack.
 880  template <typename Float>
 881  struct Reconstruct<9,Float> {
 882  typedef typename mapper<Float>::type RegType;
 883  typedef complex<RegType> Complex;
 // NOTE(review): the declaration of the embedded `reconstruct_8` member is
 // missing from this extracted listing.
 885  const RegType scale;
 886 
 887  Reconstruct(const GaugeField &u) : reconstruct_8(u), scale(u.Scale()) {}
 888 
 889  Reconstruct(const Reconstruct<9,Float> &recon) : reconstruct_8(recon.reconstruct_8),
 890  scale(recon.scale) { }
 891 
 // Extract the U(1) phase (one third of the arg of exp(I*3*phase)).
 892  __device__ __host__ inline RegType getPhase(const RegType in[18]) const {
 893 #if 1 // phase from cross product
 894  const Complex *In = reinterpret_cast<const Complex*>(in);
 895  // denominator = (U[0][0]*U[1][1] - U[0][1]*U[1][0])*
 896  Complex denom = conj(In[0]*In[4] - In[1]*In[3]) / scale;
 897  Complex expI3Phase = In[8] / denom; // numerator = U[2][2]
 898  RegType phase = arg(expI3Phase)/static_cast<RegType>(3.0);
 899 #else // phase from determinant
 // NOTE(review): the declaration of the local matrix `a` used below is
 // missing from this extracted listing (this branch is disabled anyway).
 901 #pragma unroll
 902  for (int i=0; i<9; i++) a(i) = Complex(in[2*i]/scale, in[2*i+1]/scale);
 903  const Complex det = getDeterminant( a );
 904  RegType phase = arg(det)/3;
 905 #endif
 906  return phase;
 907  }
 908 
 909  // Rescale the U3 input matrix by exp(-I*phase) to obtain an SU3 matrix multiplied by a real scale factor,
 910  __device__ __host__ inline void Pack(RegType out[8], const RegType in[18], int idx) const {
 911  RegType phase = getPhase(in);
 912  RegType cos_sin[2];
 913  Trig<isHalf<RegType>::value,RegType>::SinCos(static_cast<RegType>(-phase), &cos_sin[1], &cos_sin[0]);
 914  Complex z(cos_sin[0], cos_sin[1]);
 915  Complex su3[9];
 916 #pragma unroll
 917  for (int i=0; i<9; i++) su3[i] = z * reinterpret_cast<const Complex*>(in)[i];
 918  reconstruct_8.Pack(out, reinterpret_cast<RegType*>(su3), idx);
 919  }
 920 
 // Unpack via Reconstruct<8> then re-apply the stored phase to all elements.
 921  template<typename I>
 922  __device__ __host__ inline void Unpack(RegType out[18], const RegType in[8], int idx, int dir,
 923  const RegType phase, const I *X, const int *R) const {
 924  reconstruct_8.Unpack(out, in, idx, dir, phase, X, R, scale);
 925  RegType cos_sin[2];
 926  Trig<isHalf<RegType>::value,RegType>::SinCos(static_cast<RegType>(phase), &cos_sin[1], &cos_sin[0]);
 927  Complex z(cos_sin[0], cos_sin[1]);
 928 #pragma unroll
 929  for (int i=0; i<9; i++) reinterpret_cast<Complex*>(out)[i] *= z;
 930  }
 931 
 932  };
933 
934  __host__ __device__ inline constexpr int ct_sqrt(int n, int i = 1){
935  return n == i ? n : (i * i < n ? ct_sqrt(n, i + 1) : i);
936  }
937 
943  __host__ __device__ inline constexpr int Ncolor(int length) { return ct_sqrt(length/2); }
944 
945  // we default to huge allocations for gauge field (for now)
946  constexpr bool default_huge_alloc = true;
947 
948  template <typename Float, int length, int N, int reconLenParam, QudaStaggeredPhase stag_phase=QUDA_STAGGERED_PHASE_NO, bool huge_alloc=default_huge_alloc>
949  struct FloatNOrder {
950  typedef typename mapper<Float>::type RegType;
954  static const int reconLen = (reconLenParam == 11) ? 10 : reconLenParam;
955  static const int hasPhase = (reconLen == 9 || reconLen == 13) ? 1 : 0;
956  Float *gauge;
958 #ifdef USE_TEXTURE_OBJECTS
959  typedef typename TexVectorType<RegType,N>::type TexVector;
960  cudaTextureObject_t tex;
961  const int tex_offset;
962 #endif
963  Float *ghost[4];
968  const int volumeCB;
969  int faceVolumeCB[4];
970  const int stride;
971  const int geometry;
973  void *backup_h;
974  size_t bytes;
975 
976  FloatNOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0, bool override=false)
977  : reconstruct(u), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
978  offset(u.Bytes()/(2*sizeof(Float))),
979 #ifdef USE_TEXTURE_OBJECTS
980  tex(0), tex_offset(offset/N),
981 #endif
982  ghostExchange(u.GhostExchange()),
983  volumeCB(u.VolumeCB()), stride(u.Stride()), geometry(u.Geometry()),
984  phaseOffset(u.PhaseOffset()), backup_h(nullptr), bytes(u.Bytes())
985  {
987  errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone");
988 
989  static_assert( !(stag_phase!=QUDA_STAGGERED_PHASE_NO && reconLenParam != 18 && reconLenParam != 12),
990  "staggered phase only presently supported for 18 and 12 reconstruct");
991  for (int i=0; i<4; i++) {
992  X[i] = u.X()[i];
993  R[i] = u.R()[i];
994  ghost[i] = ghost_ ? ghost_[i] : 0;
995  faceVolumeCB[i] = u.SurfaceCB(i)*u.Nface(); // face volume equals surface * depth
996  }
997 #ifdef USE_TEXTURE_OBJECTS
998  if (u.Location() == QUDA_CUDA_FIELD_LOCATION) tex = static_cast<const cudaGaugeField&>(u).Tex();
999  if (!huge_alloc && this->gauge != u.Gauge_p() && !override) {
1000  errorQuda("Cannot use texture read since data pointer does not equal field pointer - use with huge_alloc=true instead");
1001  }
1002 #endif
1003  }
1004 
 // Shallow copy: aliases the same device pointers; backup_h deliberately
 // restarts as nullptr so each copy manages its own host backup buffer.
 1005  FloatNOrder(const FloatNOrder &order)
 1006  : reconstruct(order.reconstruct), gauge(order.gauge), offset(order.offset),
 1007 #ifdef USE_TEXTURE_OBJECTS
 1008  tex(order.tex), tex_offset(order.tex_offset),
 1009 #endif
 // NOTE(review): listing line 1010 (presumably ghostExchange(order.ghostExchange),)
 // is elided from this extraction -- confirm against the full source.
 1011  volumeCB(order.volumeCB), stride(order.stride), geometry(order.geometry),
 1012  phaseOffset(order.phaseOffset), backup_h(nullptr), bytes(order.bytes)
 1013  {
 1014  for (int i=0; i<4; i++) {
 1015  X[i] = order.X[i];
 1016  R[i] = order.R[i];
 1017  ghost[i] = order.ghost[i];
 1018  faceVolumeCB[i] = order.faceVolumeCB[i];
 1019  }
 1020  }
 1021  virtual ~FloatNOrder() { ; }
 // Load one link into registers: fetch M = reconLen/N vectors of N reals
 // (via texture when available, else direct vectorized loads), read the
 // optional phase, then expand to the full length-real matrix via Unpack.
 1023  __device__ __host__ inline void load(RegType v[length], int x, int dir, int parity) const {
 1024  const int M = reconLen / N;
 1025  RegType tmp[reconLen];
 1026 
 1027 #pragma unroll
 1028  for (int i=0; i<M; i++){
 1029  // first do vectorized copy from memory
 1030 #if defined(USE_TEXTURE_OBJECTS) && defined(__CUDA_ARCH__)
 1031  if (!huge_alloc) { // use textures unless we have a huge alloc
 1032  TexVector vecTmp = tex1Dfetch<TexVector>(tex, parity*tex_offset + dir*stride*M + stride*i + x);
 1033 #pragma unroll
 1034  for (int j=0; j<N; j++) copy(tmp[i*N+j], reinterpret_cast<RegType*>(&vecTmp)[j]);
 1035  } else
 1036 #endif
 1037  {
 1038  Vector vecTmp = vector_load<Vector>(gauge + parity*offset, dir*stride*M + stride*i + x);
 1039  // second do copy converting into register type
 1040 #pragma unroll
 1041  for (int j=0; j<N; j++) copy(tmp[i*N+j], reinterpret_cast<Float*>(&vecTmp)[j]);
 1042  }
 1043  }
 1044 
 // Phase is stored separately (at phaseOffset) and scaled by 2*pi on unpack.
 1045  RegType phase = 0.; // TODO - add texture support for phases
 1046  if (hasPhase) copy(phase, (gauge+parity*offset)[phaseOffset/sizeof(Float) + stride*dir + x]);
 1047 
 1048  // The phases come after the ghost matrices
 1049  reconstruct.Unpack(v, tmp, x, dir, 2.*M_PI*phase, X, R);
 1050 
 1051  // FIXME - this is a hack from hell - needs to be moved into the reconstruct type
 // Apply the MILC staggered sign convention for 12-reconstruct fields.
 // NOTE(review): 'coords' is not declared anywhere in the visible scope --
 // presumably a member or a value elided by this extraction; confirm against
 // the full source before relying on this branch.
 1052  if (stag_phase == QUDA_STAGGERED_PHASE_MILC && reconLenParam == 12) {
 1053  Float sign = (dir == 0 && ((coords[3] - R[3]) & 1) != 0) ||
 1054  ( dir == 1 && ((coords[0] - R[0] + coords[3] - R[3]) & 1) != 0) ||
 1055  ( dir == 2 && ((coords[0] - R[0] + coords[1] - R[1] + coords[3] - R[3]) & 1) != 0) ? -1.0 : 1.0;
 1056 
 1057 #pragma unroll
 // Only the reconstructed third row (elements 12..17) needs the sign applied.
 1058  for (int i=12; i<18; i++) v[i] *= sign;
 1059  }
 1060  }
1062  __device__ __host__ inline void save(const RegType v[length], int x, int dir, int parity) {
1063 
1064  const int M = reconLen / N;
1065  RegType tmp[reconLen];
1066  reconstruct.Pack(tmp, v, x);
1067 
1068 #pragma unroll
1069  for (int i=0; i<M; i++){
1070  Vector vecTmp;
1071  // first do copy converting into storage type
1072 #pragma unroll
1073  for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], tmp[i*N+j]);
1074  // second do vectorized copy into memory
1075  vector_store(gauge + parity*offset, x + dir*stride*M + stride*i, vecTmp);
1076  }
1077  if(hasPhase){
1078  RegType phase = reconstruct.getPhase(v);
1079  copy((gauge+parity*offset)[phaseOffset/sizeof(Float) + dir*stride + x], static_cast<RegType>(phase/(2.*M_PI)));
1080  }
1081  }
1082 
 // Accessor pair returning a gauge_wrapper for the link at (dim, x_cb, parity),
 // enabling the syntax M = gauge(d,x,p) and gauge(d,x,p) = M.
 // NOTE(review): the return-type lines and return statements of both overloads
 // (listing lines 1093/1095 and 1108/1110-1111) are elided in this extraction;
 // as shown the definitions are incomplete -- see the full source.
 1094  operator()(int dim, int x_cb, int parity) {
 1096  }
 1097 
 // Const overload (read-only access path).
 1109  operator()(int dim, int x_cb, int parity) const {
 1112  }
 1113 
 // Load a link from the ghost (halo) region for dimension dir.  If no separate
 // ghost buffer is attached, the halo lives in the main field's padded region.
 1114  __device__ __host__ inline void loadGhost(RegType v[length], int x, int dir, int parity) const {
 1115  if (!ghost[dir]) { // load from main field not separate array
 1116  load(v, volumeCB+x, dir, parity); // an offset of size volumeCB puts us at the padded region
 1117  // This also works perfectly when phases are stored. No need to change this.
 1118  } else {
 1119  const int M = reconLen / N;
 1120  RegType tmp[reconLen];
 1121 
 // Ghost buffer layout per parity: M*N reals of matrix data per face site,
 // followed (when hasPhase) by one phase per face site.
 1122 #pragma unroll
 1123  for (int i=0; i<M; i++) {
 1124  // first do vectorized copy from memory into registers
 1125  Vector vecTmp = vector_load<Vector>(ghost[dir]+parity*faceVolumeCB[dir]*(M*N + hasPhase),
 1126  i*faceVolumeCB[dir]+x);
 1127  // second do copy converting into register type
 1128 #pragma unroll
 1129  for (int j=0; j<N; j++) copy(tmp[i*N+j], reinterpret_cast<Float*>(&vecTmp)[j]);
 1130  }
 // Phases are stored after all matrix data for this parity's face volume.
 1131  RegType phase=0.;
 1132  if(hasPhase) copy(phase, ghost[dir][parity*faceVolumeCB[dir]*(M*N + 1) + faceVolumeCB[dir]*M*N + x]);
 1133  reconstruct.Unpack(v, tmp, x, dir, 2.*M_PI*phase, X, R);
 1134  }
 1135  }
1136 
1137  __device__ __host__ inline void saveGhost(const RegType v[length], int x, int dir, int parity) {
1138  if (!ghost[dir]) { // store in main field not separate array
1139  save(v, volumeCB+x, dir, parity); // an offset of size volumeCB puts us at the padded region
1140  } else {
1141  const int M = reconLen / N;
1142  RegType tmp[reconLen];
1143  reconstruct.Pack(tmp, v, x);
1144 
1145 #pragma unroll
1146  for (int i=0; i<M; i++) {
1147  Vector vecTmp;
1148  // first do copy converting into storage type
1149 #pragma unroll
1150  for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], tmp[i*N+j]);
1151  // second do vectorized copy into memory
1152  vector_store(ghost[dir]+parity*faceVolumeCB[dir]*(M*N + hasPhase), i*faceVolumeCB[dir]+x, vecTmp);
1153  }
1154 
1155  if (hasPhase) {
1156  RegType phase = reconstruct.getPhase(v);
1157  copy(ghost[dir][parity*faceVolumeCB[dir]*(M*N + 1) + faceVolumeCB[dir]*M*N + x], static_cast<RegType>(phase/(2.*M_PI)));
1158  }
1159  }
1160  }
1161 
 // Accessor pair returning a gauge_ghost_wrapper for the halo link at
 // (dim, ghost_idx, parity), mirroring the operator() accessors above.
 // NOTE(review): the return-type lines and return statements of both overloads
 // (listing lines 1172/1174 and 1187/1189-1190) are elided in this extraction;
 // as shown the definitions are incomplete -- see the full source.
 1173  Ghost(int dim, int ghost_idx, int parity) {
 1175  }
 1176 
 // Const overload (read-only access path).
 1188  Ghost(int dim, int ghost_idx, int parity) const {
 1191  }
 1192 
 // Load a link from the extended (depth-R) ghost buffer.  buff_idx addresses
 // the buffer; extended_idx carries the lattice coordinate used by Unpack to
 // resolve boundary conditions.  dir selects forward/backward, g the geometry
 // component.
 1193  __device__ __host__ inline void loadGhostEx(RegType v[length], int buff_idx, int extended_idx, int dir,
 1194  int dim, int g, int parity, const int R[]) const {
 1195  const int M = reconLen / N;
 1196  RegType tmp[reconLen];
 1197 
 // Buffer layout: one block of R[dim]*faceVolumeCB[dim] sites per
 // (direction, parity, geometry) tuple; matrix data first, phases after.
 1198 #pragma unroll
 1199  for (int i=0; i<M; i++) {
 1200  // first do vectorized copy from memory
 1201  Vector vecTmp = vector_load<Vector>(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase),
 1202  +i*R[dim]*faceVolumeCB[dim]+buff_idx);
 1203  // second do copy converting into register type
 1204 #pragma unroll
 1205  for (int j=0; j<N; j++) copy(tmp[i*N+j], reinterpret_cast<Float*>(&vecTmp)[j]);
 1206  }
 1207  RegType phase=0.;
 1208  if(hasPhase) copy(phase, ghost[dim][((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + 1)
 1209  + R[dim]*faceVolumeCB[dim]*M*N + buff_idx]);
 1210 
 1211  // use the extended_idx to determine the boundary condition
 1212  reconstruct.Unpack(v, tmp, extended_idx, g, 2.*M_PI*phase, X, R);
 1213  }
1214 
 // Store a link into the extended (depth-R) ghost buffer; mirror of
 // loadGhostEx above, with identical buffer layout and indexing.
 1215  __device__ __host__ inline void saveGhostEx(const RegType v[length], int buff_idx, int extended_idx,
 1216  int dir, int dim, int g, int parity, const int R[]) {
 1217  const int M = reconLen / N;
 1218  RegType tmp[reconLen];
 1219  // use the extended_idx to determine the boundary condition
 1220  reconstruct.Pack(tmp, v, extended_idx);
 1221 
 1222 #pragma unroll
 1223  for (int i=0; i<M; i++) {
 1224  Vector vecTmp;
 1225  // first do copy converting into storage type
 1226 #pragma unroll
 1227  for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], tmp[i*N+j]);
 1228  // second do vectorized copy to memory
 1229  vector_store(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase),
 1230  i*R[dim]*faceVolumeCB[dim]+buff_idx, vecTmp);
 1231  }
 // Phases follow all matrix data within this (direction, parity, geometry) block.
 1232  if (hasPhase) {
 1233  RegType phase = reconstruct.getPhase(v);
 1234  copy(ghost[dim][((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + 1) + R[dim]*faceVolumeCB[dim]*M*N + buff_idx],
 1235  static_cast<RegType>(phase/(2.*M_PI)));
 1236  }
 1237  }
1238 
 // Back up the device field into a host buffer (host-side utility, not for kernels).
 1242  void save() {
 1243  if (backup_h) errorQuda("Already allocated host backup");
 // NOTE(review): listing line 1244 (presumably backup_h = safe_malloc(bytes);)
 // is elided in this extraction; as shown backup_h would be uninitialized here
 // -- confirm against the full source.
 1245  cudaMemcpy(backup_h, gauge, bytes, cudaMemcpyDeviceToHost);
 1246  checkCudaError();
 1247  }
 1248 
 // Restore the device field from the host backup created by save().
 1252  void load() {
 1253  cudaMemcpy(gauge, backup_h, bytes, cudaMemcpyHostToDevice);
 // NOTE(review): listing line 1254 (presumably host_free(backup_h);) is elided
 // in this extraction; without it the backup buffer would leak -- confirm
 // against the full source.  Also note there is no guard against calling
 // load() before save().
 1255  backup_h = nullptr;
 1256  checkCudaError();
 1257  }
 1258 
 // Storage bytes per link (reconLen reals; excludes any separately stored phase).
 1259  size_t Bytes() const { return reconLen * sizeof(Float); }
 1260  };
1261 
1262 
 // Plain aggregate holding one full link's worth of reals; used below with
 // trove::coalesced_ptr to perform warp-coalesced whole-link loads/stores.
 1269  template <typename real, int length> struct S { real v[length]; };
1270 
 // Common base for the "legacy" (non-FloatN) gauge orders: provides ghost-zone
 // load/store using a simple length-reals-per-link layout, with a trove
 // coalesced path on the device and a scalar fallback elsewhere.
 1275  template <typename Float, int length>
 1276  struct LegacyOrder {
 1277  typedef typename mapper<Float>::type RegType;
 // NOTE(review): listing lines 1278-1279 (the declarations of the ghost[] and
 // faceVolumeCB[] members used throughout this struct) are elided in this
 // extraction -- confirm against the full source.
 1280  const int volumeCB;
 1281  const int stride;
 1282  const int geometry;
 1283  const int hasPhase;
 1284 
 // Construct from a field; ghost_ overrides the field's own ghost pointers.
 1285  LegacyOrder(const GaugeField &u, Float **ghost_)
 1286  : volumeCB(u.VolumeCB()), stride(u.Stride()), geometry(u.Geometry()), hasPhase(0) {
 // NOTE(review): listing line 1287 (the guard condition preceding this
 // errorQuda) is elided in this extraction -- confirm against the full source.
 1288  errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone");
 1289 
 1290  for (int i=0; i<4; i++) {
 1291  ghost[i] = (ghost_) ? ghost_[i] : (Float*)(u.Ghost()[i]);
 1292  faceVolumeCB[i] = u.SurfaceCB(i)*u.Nface(); // face volume equals surface * depth
 1293  }
 1294  }
 1295 
 // Shallow copy constructor (aliases the same ghost pointers).
 1296  LegacyOrder(const LegacyOrder &order)
 1297  : volumeCB(order.volumeCB), stride(order.stride), geometry(order.geometry), hasPhase(0) {
 1298  for (int i=0; i<4; i++) {
 1299  ghost[i] = order.ghost[i];
 1300  faceVolumeCB[i] = order.faceVolumeCB[i];
 1301  }
 1302  }
 1303 
 1304  virtual ~LegacyOrder() { ; }
 1305 
 // Load a full link from the dimension-dir ghost buffer at face site x.
 1306  __device__ __host__ inline void loadGhost(RegType v[length], int x, int dir, int parity) const {
 1307 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 1308  typedef S<Float,length> structure;
 1309  trove::coalesced_ptr<structure> ghost_((structure*)ghost[dir]);
 1310  structure v_ = ghost_[parity*faceVolumeCB[dir] + x];
 1311  for (int i=0; i<length; i++) v[i] = (RegType)v_.v[i];
 1312 #else
 1313  for (int i=0; i<length; i++) v[i] = (RegType)ghost[dir][(parity*faceVolumeCB[dir] + x)*length + i];
 1314 #endif
 1315  }
 1316 
 // Store a full link into the dimension-dir ghost buffer at face site x.
 1317  __device__ __host__ inline void saveGhost(const RegType v[length], int x, int dir, int parity) {
 1318 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 1319  typedef S<Float,length> structure;
 1320  trove::coalesced_ptr<structure> ghost_((structure*)ghost[dir]);
 1321  structure v_;
 1322  for (int i=0; i<length; i++) v_.v[i] = (Float)v[i];
 1323  ghost_[parity*faceVolumeCB[dir] + x] = v_;
 1324 #else
 1325  for (int i=0; i<length; i++) ghost[dir][(parity*faceVolumeCB[dir] + x)*length + i] = (Float)v[i];
 1326 #endif
 1327  }
 1328 
 // Extended-halo variants: one block of R[dim]*faceVolumeCB[dim] sites per
 // (direction, parity) pair, with geometry as the innermost site index.
 // The 'dummy' parameter is unused (kept for interface parity with FloatNOrder).
 1329  __device__ __host__ inline void loadGhostEx(RegType v[length], int x, int dummy, int dir,
 1330  int dim, int g, int parity, const int R[]) const {
 1331 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 1332  typedef S<Float,length> structure;
 1333  trove::coalesced_ptr<structure> ghost_((structure*)ghost[dim]);
 1334  structure v_ = ghost_[((dir*2+parity)*R[dim]*faceVolumeCB[dim] + x)*geometry+g];
 1335  for (int i=0; i<length; i++) v[i] = (RegType)v_.v[i];
 1336 #else
 1337  for (int i=0; i<length; i++) {
 1338  v[i] = (RegType)ghost[dim][(((dir*2+parity)*R[dim]*faceVolumeCB[dim] + x)*geometry+g)*length + i];
 1339  }
 1340 #endif
 1341  }
 1342 
 // Mirror of loadGhostEx for stores; identical indexing.
 1343  __device__ __host__ inline void saveGhostEx(const RegType v[length], int x, int dummy,
 1344  int dir, int dim, int g, int parity, const int R[]) {
 1345 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 1346  typedef S<Float,length> structure;
 1347  trove::coalesced_ptr<structure> ghost_((structure*)ghost[dim]);
 1348  structure v_;
 1349  for (int i=0; i<length; i++) v_.v[i] = (Float)v[i];
 1350  ghost_[((dir*2+parity)*R[dim]*faceVolumeCB[dim] + x)*geometry+g] = v_;
 1351 #else
 1352  for (int i=0; i<length; i++) {
 1353  ghost[dim]
 1354  [(((dir*2+parity)*R[dim]*faceVolumeCB[dim] + x)*geometry+g)*length + i] = (Float)v[i];
 1355  }
 1356 #endif
 1357  }
 1358 
 1359  };
1360 
 // QDP gauge order: one separate array of length-real links per dimension,
 // indexed site-major within each parity.
 1365  template <typename Float, int length> struct QDPOrder : public LegacyOrder<Float,length> {
 1366  typedef typename mapper<Float>::type RegType;
 // NOTE(review): listing line 1367 (the declaration of the per-dimension
 // gauge[] pointer array used below) is elided in this extraction -- confirm
 // against the full source.
 1368  const int volumeCB;
 1369  QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
 1370  : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
 1371  { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; }
 1372  QDPOrder(const QDPOrder &order) : LegacyOrder<Float,length>(order), volumeCB(order.volumeCB) {
 1373  for(int i=0; i<4; i++) gauge[i] = order.gauge[i];
 1374  }
 1375  virtual ~QDPOrder() { ; }
 1376 
 // Load link (dir) at checkerboard site x / parity into registers.
 1377  __device__ __host__ inline void load(RegType v[length], int x, int dir, int parity) const {
 1378 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 1379  typedef S<Float,length> structure;
 1380  trove::coalesced_ptr<structure> gauge_((structure*)gauge[dir]);
 1381  structure v_ = gauge_[parity*volumeCB + x];
 1382  for (int i=0; i<length; i++) v[i] = (RegType)v_.v[i];
 1383 #else
 1384  for (int i=0; i<length; i++) {
 1385  v[i] = (RegType)gauge[dir][(parity*volumeCB + x)*length + i];
 1386  }
 1387 #endif
 1388  }
 1389 
 // Store link (dir) at checkerboard site x / parity from registers.
 1390  __device__ __host__ inline void save(const RegType v[length], int x, int dir, int parity) {
 1391 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 1392  typedef S<Float,length> structure;
 1393  trove::coalesced_ptr<structure> gauge_((structure*)gauge[dir]);
 1394  structure v_;
 1395  for (int i=0; i<length; i++) v_.v[i] = (Float)v[i];
 1396  gauge_[parity*volumeCB + x] = v_;
 1397 #else
 1398  for (int i=0; i<length; i++) {
 1399  gauge[dir][(parity*volumeCB + x)*length + i] = (Float)v[i];
 1400  }
 1401 #endif
 1402  }
 1403 
 // gauge_wrapper accessors: enable M = gauge(d,x,p) / gauge(d,x,p) = M syntax.
 1414  __device__ __host__ inline gauge_wrapper<Float,QDPOrder<Float,length> >
 1415  operator()(int dim, int x_cb, int parity) {
 1416  return gauge_wrapper<Float,QDPOrder<Float,length> >(*this, dim, x_cb, parity);
 1417  }
 1418 
 // Const overload.
 // NOTE(review): listing line 1431 (the return statement's opening
 // "return gauge_wrapper<...>") is elided in this extraction.
 1429  __device__ __host__ inline const gauge_wrapper<Float,QDPOrder<Float,length> >
 1430  operator()(int dim, int x_cb, int parity) const {
 1432  (const_cast<QDPOrder<Float,length>&>(*this), dim, x_cb, parity);
 1433  }
 1434 
 // Storage bytes per link.
 1435  size_t Bytes() const { return length * sizeof(Float); }
 1436  };
1437 
 // QDPJIT gauge order: per-dimension arrays with real/imaginary parts split
 // ("z" index), component-major and parity interleaved, site innermost.
 1442  template <typename Float, int length> struct QDPJITOrder : public LegacyOrder<Float,length> {
 1443  typedef typename mapper<Float>::type RegType;
 // NOTE(review): listing line 1444 (the declaration of the per-dimension
 // gauge[] pointer array used below) is elided in this extraction -- confirm
 // against the full source.
 1445  const int volumeCB;
 1446  QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
 1447  : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
 1448  { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; }
 1449  QDPJITOrder(const QDPJITOrder &order) : LegacyOrder<Float,length>(order), volumeCB(order.volumeCB) {
 1450  for(int i=0; i<4; i++) gauge[i] = order.gauge[i];
 1451  }
 1452  virtual ~QDPJITOrder() { ; }
 1453 
 // Load: z = real/imag selector, rolcol = flattened row*col matrix index.
 1454  __device__ __host__ inline void load(RegType v[length], int x, int dir, int parity) const {
 1455  for (int i=0; i<length; i++) {
 1456  int z = i%2;
 1457  int rolcol = i/2;
 1458  v[i] = (RegType)gauge[dir][((z*(length/2) + rolcol)*2 + parity)*volumeCB + x];
 1459  }
 1460  }
 1461 
 // Store: inverse of the load indexing above.
 1462  __device__ __host__ inline void save(const RegType v[length], int x, int dir, int parity) {
 1463  for (int i=0; i<length; i++) {
 1464  int z = i%2;
 1465  int rolcol = i/2;
 1466  gauge[dir][((z*(length/2) + rolcol)*2 + parity)*volumeCB + x] = (Float)v[i];
 1467  }
 1468  }
 1469 
 // gauge_wrapper accessor.
 // NOTE(review): listing line 1482 (the return statement of this overload) is
 // elided in this extraction -- see full source.
 1480  __device__ __host__ inline gauge_wrapper<Float,QDPJITOrder<Float,length> >
 1481  operator()(int dim, int x_cb, int parity) {
 1483  }
 1484 
 // Const overload.
 1495  __device__ __host__ inline const gauge_wrapper<Float,QDPJITOrder<Float,length> >
 1496  operator()(int dim, int x_cb, int parity) const {
 1498  (const_cast<QDPJITOrder<Float,length>&>(*this), dim, x_cb, parity);
 1499  }
 1500 
 // Storage bytes per link.
 1501  size_t Bytes() const { return length * sizeof(Float); }
 1502  };
1503 
 // MILC gauge order: single contiguous array, site-major with the geometry
 // (direction) index between site and matrix elements.
 1508  template <typename Float, int length> struct MILCOrder : public LegacyOrder<Float,length> {
 1509  typedef typename mapper<Float>::type RegType;
 1510  Float *gauge; // single backing array for all links
 1511  const int volumeCB;
 1512  const int geometry;
 1513  MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) :
 1514  LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
 1515  volumeCB(u.VolumeCB()), geometry(u.Geometry()) { ; }
 1516  MILCOrder(const MILCOrder &order) : LegacyOrder<Float,length>(order),
 1517  gauge(order.gauge), volumeCB(order.volumeCB), geometry(order.geometry)
 1518  { ; }
 1519  virtual ~MILCOrder() { ; }
 1520 
 // Load link (dir) at checkerboard site x / parity into registers.
 1521  __device__ __host__ inline void load(RegType v[length], int x, int dir, int parity) const {
 1522 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 1523  typedef S<Float,length> structure;
 1524  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
 1525  structure v_ = gauge_[(parity*volumeCB+x)*geometry + dir];
 1526  for (int i=0; i<length; i++) v[i] = (RegType)v_.v[i];
 1527 #else
 1528  for (int i=0; i<length; i++) {
 1529  v[i] = (RegType)gauge[((parity*volumeCB+x)*geometry + dir)*length + i];
 1530  }
 1531 #endif
 1532  }
 1533 
 // Store link (dir) at checkerboard site x / parity from registers.
 1534  __device__ __host__ inline void save(const RegType v[length], int x, int dir, int parity) {
 1535 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 1536  typedef S<Float,length> structure;
 1537  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
 1538  structure v_;
 1539  for (int i=0; i<length; i++) v_.v[i] = (Float)v[i];
 1540  gauge_[(parity*volumeCB+x)*geometry + dir] = v_;
 1541 #else
 1542  for (int i=0; i<length; i++) {
 1543  gauge[((parity*volumeCB+x)*geometry + dir)*length + i] = (Float)v[i];
 1544  }
 1545 #endif
 1546  }
 1547 
 // gauge_wrapper accessor.
 1558  __device__ __host__ inline gauge_wrapper<Float,MILCOrder<Float,length> >
 1559  operator()(int dim, int x_cb, int parity) {
 1560  return gauge_wrapper<Float,MILCOrder<Float,length> >(*this, dim, x_cb, parity);
 1561  }
 1562 
 // Const overload.
 // NOTE(review): listing line 1575 (the return statement's opening
 // "return gauge_wrapper<...>") is elided in this extraction.
 1573  __device__ __host__ inline const gauge_wrapper<Float,MILCOrder<Float,length> >
 1574  operator()(int dim, int x_cb, int parity) const {
 1576  (const_cast<MILCOrder<Float,length>&>(*this), dim, x_cb, parity);
 1577  }
 1578 
 // Storage bytes per link.
 1579  size_t Bytes() const { return length * sizeof(Float); }
 1580  };
1581 
1597  template <typename Float, int length> struct MILCSiteOrder : public LegacyOrder<Float,length> {
1598  typedef typename mapper<Float>::type RegType;
1599  Float *gauge;
1600  const int volumeCB;
1601  const int geometry;
1602  const size_t offset;
1603  const size_t size;
1604  MILCSiteOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) :
1605  LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
1606  volumeCB(u.VolumeCB()), geometry(u.Geometry()),
1607  offset(u.SiteOffset()), size(u.SiteSize()) { ; }
1608  MILCSiteOrder(const MILCSiteOrder &order) : LegacyOrder<Float,length>(order),
1609  gauge(order.gauge), volumeCB(order.volumeCB), geometry(order.geometry),
1610  offset(order.offset), size(order.size)
1611  { ; }
1612  virtual ~MILCSiteOrder() { ; }
1613 
1614  __device__ __host__ inline void load(RegType v[length], int x, int dir, int parity) const {
1615  // get base pointer
1616  const Float *gauge0 = reinterpret_cast<const Float*>(reinterpret_cast<const char*>(gauge) + (parity*volumeCB+x)*size + offset);
1617 
1618 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
1619  typedef S<Float,length> structure;
1620  trove::coalesced_ptr<structure> gauge_((structure*)gauge0);
1621  structure v_ = gauge_[dir];
1622  for (int i=0; i<length; i++) v[i] = (RegType)v_.v[i];
1623 #else
1624  for (int i=0; i<length; i++) {
1625  v[i] = (RegType)gauge0[dir*length + i];
1626  }
1627 #endif
1628  }
1629 
1630  __device__ __host__ inline void save(const RegType v[length], int x, int dir, int parity) {
1631  // get base pointer
1632  Float *gauge0 = reinterpret_cast<Float*>(reinterpret_cast<char*>(gauge) + (parity*volumeCB+x)*size + offset);
1633 
1634 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
1635  typedef S<Float,length> structure;
1636  trove::coalesced_ptr<structure> gauge_((structure*)gauge0);
1637  structure v_;
1638  for (int i=0; i<length; i++) v_.v[i] = (Float)v[i];
1639  gauge_[dir] = v_;
1640 #else
1641  for (int i=0; i<length; i++) {
1642  gauge0[dir*length + i] = (Float)v[i];
1643  }
1644 #endif
1645  }
1646 
1647  size_t Bytes() const { return length * sizeof(Float); }
1648  };
1649 
1650 
 // CPS gauge order: MILC-like site-major layout, but the 3x3 matrix is stored
 // transposed and scaled by the anisotropy (divided out on load, multiplied
 // back on save).
 1655  template <typename Float, int length> struct CPSOrder : LegacyOrder<Float,length> {
 1656  typedef typename mapper<Float>::type RegType;
 1657  Float *gauge;
 1658  const int volumeCB;
 1659  const Float anisotropy; // scale factor baked into the stored links
 1660  static constexpr int Nc = 3;
 1661  const int geometry;
 1662  CPSOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
 1663  : LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
 1664  volumeCB(u.VolumeCB()), anisotropy(u.Anisotropy()), geometry(u.Geometry())
 1665  { if (length != 18) errorQuda("Gauge length %d not supported", length); }
 1666  CPSOrder(const CPSOrder &order) : LegacyOrder<Float,length>(order), gauge(order.gauge),
 1667  volumeCB(order.volumeCB), anisotropy(order.anisotropy), geometry(order.geometry)
 1668  { ; }
 1669  virtual ~CPSOrder() { ; }
 1670 
 1671  // we need to transpose and scale for CPS ordering
 1672  __device__ __host__ inline void load(RegType v[18], int x, int dir, int parity) const {
 1673 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 1674  typedef S<Float,length> structure;
 1675  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
 1676  structure v_ = gauge_[((parity*volumeCB+x)*geometry + dir)];
 // (i,j) swapped between register and storage order == matrix transpose;
 // z indexes real/imaginary part.
 1677  for (int i=0; i<Nc; i++)
 1678  for (int j=0; j<Nc; j++)
 1679  for (int z=0; z<2; z++)
 1680  v[(i*Nc+j)*2+z] = (RegType)v_.v[(j*Nc+i)*2+z] / anisotropy;
 1681 #else
 1682  for (int i=0; i<Nc; i++) {
 1683  for (int j=0; j<Nc; j++) {
 1684  for (int z=0; z<2; z++) {
 1685  v[(i*Nc+j)*2+z] =
 1686  (RegType)(gauge[((((parity*volumeCB+x)*geometry + dir)*Nc + j)*Nc + i)*2 + z] / anisotropy);
 1687  }
 1688  }
 1689  }
 1690 #endif
 1691  }
 1692 
 // Inverse of load: transpose back and re-apply the anisotropy factor.
 1693  __device__ __host__ inline void save(const RegType v[18], int x, int dir, int parity) {
 1694 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 1695  typedef S<Float,length> structure;
 1696  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
 1697  structure v_;
 1698  for (int i=0; i<Nc; i++)
 1699  for (int j=0; j<Nc; j++)
 1700  for (int z=0; z<2; z++)
 1701  v_.v[(j*Nc+i)*2+z] = (Float)(anisotropy * v[(i*Nc+j)*2+z]);
 1702  gauge_[((parity*volumeCB+x)*geometry + dir)] = v_;
 1703 #else
 1704  for (int i=0; i<Nc; i++) {
 1705  for (int j=0; j<Nc; j++) {
 1706  for (int z=0; z<2; z++) {
 1707  gauge[((((parity*volumeCB+x)*geometry + dir)*Nc + j)*Nc + i)*2 + z] =
 1708  (Float)(anisotropy * v[(i*Nc+j)*2+z]);
 1709  }
 1710  }
 1711  }
 1712 #endif
 1713  }
 1714 
 // gauge_wrapper accessor.
 1725  __device__ __host__ inline gauge_wrapper<Float,CPSOrder<Float,length> >
 1726  operator()(int dim, int x_cb, int parity) {
 1727  return gauge_wrapper<Float,CPSOrder<Float,length> >(*this, dim, x_cb, parity);
 1728  }
 1729 
 // Const overload.
 // NOTE(review): listing line 1742 (the return statement's opening
 // "return gauge_wrapper<...>") is elided in this extraction.
 1740  __device__ __host__ inline const gauge_wrapper<Float,CPSOrder<Float,length> >
 1741  operator()(int dim, int x_cb, int parity) const {
 1743  (const_cast<CPSOrder<Float,length>&>(*this), dim, x_cb, parity);
 1744  }
 1745 
 // Storage bytes per link (full 3x3 complex matrix).
 1746  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
 1747  };
1748 
 // BQCD gauge order: direction-major over an extended (halo-padded)
 // checkerboard volume, with the 3x3 matrix stored transposed.
 1756  template <typename Float, int length> struct BQCDOrder : LegacyOrder<Float,length> {
 1757  typedef typename mapper<Float>::type RegType;
 1758  Float *gauge;
 1759  const int volumeCB;
 1760  int exVolumeCB; // extended checkerboard volume
 1761  static constexpr int Nc = 3;
 1762  BQCDOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
 1763  : LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()), volumeCB(u.VolumeCB()) {
 1764  if (length != 18) errorQuda("Gauge length %d not supported", length);
 1765  // compute volumeCB + halo region
 // Extended volume: checkerboarded X dimension plus a depth-2 pad, times
 // each remaining dimension padded by 2 on both... (each X[i]+2 total).
 1766  exVolumeCB = u.X()[0]/2 + 2;
 1767  for (int i=1; i<4; i++) exVolumeCB *= u.X()[i] + 2;
 1768  }
 1769  BQCDOrder(const BQCDOrder &order) : LegacyOrder<Float,length>(order), gauge(order.gauge),
 1770  volumeCB(order.volumeCB), exVolumeCB(order.exVolumeCB) {
 1771  if (length != 18) errorQuda("Gauge length %d not supported", length);
 1772  }
 1773 
 1774  virtual ~BQCDOrder() { ; }
 1775 
 1776  // we need to transpose for BQCD ordering
 1777  __device__ __host__ inline void load(RegType v[18], int x, int dir, int parity) const {
 1778 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 1779  typedef S<Float,length> structure;
 1780  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
 1781  structure v_ = gauge_[(dir*2+parity)*exVolumeCB + x];
 // (i,j) swap between register and storage == matrix transpose; z = re/im.
 1782  for (int i=0; i<Nc; i++)
 1783  for (int j=0; j<Nc; j++)
 1784  for (int z=0; z<2; z++)
 1785  v[(i*Nc+j)*2+z] = (RegType)v_.v[(j*Nc+i)*2+z];
 1786 #else
 1787  for (int i=0; i<Nc; i++) {
 1788  for (int j=0; j<Nc; j++) {
 1789  for (int z=0; z<2; z++) {
 1790  v[(i*Nc+j)*2+z] = (RegType)gauge[((((dir*2+parity)*exVolumeCB + x)*Nc + j)*Nc + i)*2 + z];
 1791  }
 1792  }
 1793  }
 1794 #endif
 1795  }
 1796 
 // Inverse of load: transpose back on store.
 1797  __device__ __host__ inline void save(const RegType v[18], int x, int dir, int parity) {
 1798 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 1799  typedef S<Float,length> structure;
 1800  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
 1801  structure v_;
 1802  for (int i=0; i<Nc; i++)
 1803  for (int j=0; j<Nc; j++)
 1804  for (int z=0; z<2; z++)
 1805  v_.v[(j*Nc+i)*2+z] = (Float)(v[(i*Nc+j)*2+z]);
 1806  gauge_[(dir*2+parity)*exVolumeCB + x] = v_;
 1807 #else
 1808  for (int i=0; i<Nc; i++) {
 1809  for (int j=0; j<Nc; j++) {
 1810  for (int z=0; z<2; z++) {
 1811  gauge[((((dir*2+parity)*exVolumeCB + x)*Nc + j)*Nc + i)*2 + z] = (Float)v[(i*Nc+j)*2+z];
 1812  }
 1813  }
 1814  }
 1815 #endif
 1816  }
 1817 
 // gauge_wrapper accessor.
 1828  __device__ __host__ inline gauge_wrapper<Float,BQCDOrder<Float,length> >
 1829  operator()(int dim, int x_cb, int parity) {
 1830  return gauge_wrapper<Float,BQCDOrder<Float,length> >(*this, dim, x_cb, parity);
 1831  }
 1832 
 // Const overload.
 // NOTE(review): listing line 1845 (the return statement's opening
 // "return gauge_wrapper<...>") is elided in this extraction.
 1843  __device__ __host__ inline const gauge_wrapper<Float,BQCDOrder<Float,length> >
 1844  operator()(int dim, int x_cb, int parity) const {
 1846  (const_cast<BQCDOrder<Float,length>&>(*this), dim, x_cb, parity);
 1847  }
 1848 
 // Storage bytes per link (full 3x3 complex matrix).
 1849  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
 1850  };
1851 
 // TIFR gauge order: direction-major layout with the 3x3 matrix stored
 // transposed and scaled (scale divided out on load, re-applied on save).
 1856  template <typename Float, int length> struct TIFROrder : LegacyOrder<Float,length> {
 1857  typedef typename mapper<Float>::type RegType;
 1858  Float *gauge;
 1859  const int volumeCB;
 1860  static constexpr int Nc = 3;
 1861  const Float scale; // scale factor baked into the stored links
 1862  TIFROrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
 1863  : LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
 1864  volumeCB(u.VolumeCB()), scale(u.Scale()) {
 1865  if (length != 18) errorQuda("Gauge length %d not supported", length);
 1866  }
 1867  TIFROrder(const TIFROrder &order)
 1868  : LegacyOrder<Float,length>(order), gauge(order.gauge), volumeCB(order.volumeCB), scale(order.scale) {
 1869  if (length != 18) errorQuda("Gauge length %d not supported", length);
 1870  }
 1871 
 1872  virtual ~TIFROrder() { ; }
 1873 
 1874  // we need to transpose for TIFR ordering
 1875  __device__ __host__ inline void load(RegType v[18], int x, int dir, int parity) const {
 1876 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 1877  typedef S<Float,length> structure;
 1878  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
 1879  structure v_ = gauge_[(dir*2+parity)*volumeCB + x];
 // (i,j) swap between register and storage == matrix transpose; z = re/im.
 1880  for (int i=0; i<Nc; i++)
 1881  for (int j=0; j<Nc; j++)
 1882  for (int z=0; z<2; z++)
 1883  v[(i*Nc+j)*2+z] = (RegType)v_.v[(j*Nc+i)*2+z] / scale;
 1884 #else
 1885  for (int i=0; i<Nc; i++) {
 1886  for (int j=0; j<Nc; j++) {
 1887  for (int z=0; z<2; z++) {
 1888  v[(i*Nc+j)*2+z] = (RegType)gauge[((((dir*2+parity)*volumeCB + x)*Nc + j)*Nc + i)*2 + z] / scale;
 1889  }
 1890  }
 1891  }
 1892 #endif
 1893  }
 1894 
 // Inverse of load: transpose back and re-apply the scale factor.
 1895  __device__ __host__ inline void save(const RegType v[18], int x, int dir, int parity) {
 1896 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 1897  typedef S<Float,length> structure;
 1898  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
 1899  structure v_;
 1900  for (int i=0; i<Nc; i++)
 1901  for (int j=0; j<Nc; j++)
 1902  for (int z=0; z<2; z++)
 1903  v_.v[(j*Nc+i)*2+z] = (Float)(v[(i*Nc+j)*2+z]) * scale;
 1904  gauge_[(dir*2+parity)*volumeCB + x] = v_;
 1905 #else
 1906  for (int i=0; i<Nc; i++) {
 1907  for (int j=0; j<Nc; j++) {
 1908  for (int z=0; z<2; z++) {
 1909  gauge[((((dir*2+parity)*volumeCB + x)*Nc + j)*Nc + i)*2 + z] = (Float)v[(i*Nc+j)*2+z] * scale;
 1910  }
 1911  }
 1912  }
 1913 #endif
 1914  }
 1915 
 // gauge_wrapper accessor.
 1926  __device__ __host__ inline gauge_wrapper<Float,TIFROrder<Float,length> >
 1927  operator()(int dim, int x_cb, int parity) {
 1928  return gauge_wrapper<Float,TIFROrder<Float,length> >(*this, dim, x_cb, parity);
 1929  }
 1930 
 // Const overload.
 // NOTE(review): listing line 1943 (the return statement's opening
 // "return gauge_wrapper<...>") is elided in this extraction.
 1941  __device__ __host__ inline const gauge_wrapper<Float,TIFROrder<Float,length> >
 1942  operator()(int dim, int x_cb, int parity) const {
 1944  (const_cast<TIFROrder<Float,length>&>(*this), dim, x_cb, parity);
 1945  }
 1946 
 // Storage bytes per link (full 3x3 complex matrix).
 1947  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
 1948  };
1949 
 // TIFR order over a z-padded lattice: like TIFROrder but the storage volume
 // is extended by 4 sites in the z dimension (halo of 2 on each side), so
 // each site index is first remapped via getPaddedIndex().
 1954  template <typename Float, int length> struct TIFRPaddedOrder : LegacyOrder<Float,length> {
 1955  typedef typename mapper<Float>::type RegType;
 1956  Float *gauge;
 1957  const int volumeCB;
 // NOTE(review): listing line 1958 (presumably the declaration of the
 // exVolumeCB member initialized and used below) is elided in this extraction.
 1959  static constexpr int Nc = 3;
 1960  const Float scale;
 1961  const int dim[4]; // logical lattice dimensions
 1962  const int exDim[4]; // storage dimensions (z padded by 4)
 1963  TIFRPaddedOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
 1964  : LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
 1965  volumeCB(u.VolumeCB()), exVolumeCB(1), scale(u.Scale()),
 1966  dim{ u.X()[0], u.X()[1], u.X()[2], u.X()[3] },
 1967  exDim{ u.X()[0], u.X()[1], u.X()[2] + 4, u.X()[3] } {
 1968  if (length != 18) errorQuda("Gauge length %d not supported", length);
 1969 
 1970  // exVolumeCB is the padded checkboard volume
 1971  for (int i=0; i<4; i++) exVolumeCB *= exDim[i];
 1972  exVolumeCB /= 2;
 1973  }
 1974 
 // Copy constructor.
 // NOTE(review): listing line 1975 (the copy-constructor signature line,
 // presumably TIFRPaddedOrder(const TIFRPaddedOrder &order)) is elided in
 // this extraction -- the definition below is incomplete as shown.
 1976  : LegacyOrder<Float,length>(order), gauge(order.gauge), volumeCB(order.volumeCB), exVolumeCB(order.exVolumeCB), scale(order.scale),
 1977  dim{order.dim[0], order.dim[1], order.dim[2], order.dim[3]},
 1978  exDim{order.exDim[0], order.exDim[1], order.exDim[2], order.exDim[3]} {
 1979  if (length != 18) errorQuda("Gauge length %d not supported", length);
 1980  }
 1981 
 1982  virtual ~TIFRPaddedOrder() { ; }
 1983 
 // Map a logical checkerboard index to its site index in the padded volume.
 1988  __device__ __host__ inline int getPaddedIndex(int x_cb, int parity) const {
 1989  // find coordinates
 1990  int coord[4];
 1991  getCoords(coord, x_cb, dim, parity);
 1992 
 1993  // get z-extended index
 1994  coord[2] += 2; // offset for halo
 1995  return linkIndex(coord, exDim);
 1996  }
 1997 
 1998  // we need to transpose for TIFR ordering
 1999  __device__ __host__ inline void load(RegType v[18], int x, int dir, int parity) const {
 2000 
 2001  int y = getPaddedIndex(x, parity);
 2002 
 2003 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 2004  typedef S<Float,length> structure;
 2005  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
 2006  structure v_ = gauge_[(dir*2+parity)*exVolumeCB + y];
 // (i,j) swap between register and storage == matrix transpose; z = re/im.
 2007  for (int i=0; i<Nc; i++)
 2008  for (int j=0; j<Nc; j++)
 2009  for (int z=0; z<2; z++)
 2010  v[(i*Nc+j)*2+z] = (RegType)v_.v[(j*Nc+i)*2+z] / scale;
 2011 #else
 2012  for (int i=0; i<Nc; i++) {
 2013  for (int j=0; j<Nc; j++) {
 2014  for (int z=0; z<2; z++) {
 2015  v[(i*Nc+j)*2+z] = (RegType)gauge[((((dir*2+parity)*exVolumeCB + y)*Nc + j)*Nc + i)*2 + z] / scale;
 2016  }
 2017  }
 2018  }
 2019 #endif
 2020  }
 2021 
 // Inverse of load: transpose back and re-apply the scale factor.
 2022  __device__ __host__ inline void save(const RegType v[18], int x, int dir, int parity) {
 2023 
 2024  int y = getPaddedIndex(x, parity);
 2025 
 2026 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
 2027  typedef S<Float,length> structure;
 2028  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
 2029  structure v_;
 2030  for (int i=0; i<Nc; i++)
 2031  for (int j=0; j<Nc; j++)
 2032  for (int z=0; z<2; z++)
 2033  v_.v[(j*Nc+i)*2+z] = (Float)(v[(i*Nc+j)*2+z]) * scale;
 2034  gauge_[(dir*2+parity)*exVolumeCB + y] = v_;
 2035 #else
 2036  for (int i=0; i<Nc; i++) {
 2037  for (int j=0; j<Nc; j++) {
 2038  for (int z=0; z<2; z++) {
 2039  gauge[((((dir*2+parity)*exVolumeCB + y)*Nc + j)*Nc + i)*2 + z] = (Float)v[(i*Nc+j)*2+z] * scale;
 2040  }
 2041  }
 2042  }
 2043 #endif
 2044  }
 2045 
 // gauge_wrapper accessor.
 // NOTE(review): listing line 2058 (the return statement of this overload) is
 // elided in this extraction -- see full source.
 2056  __device__ __host__ inline gauge_wrapper<Float,TIFRPaddedOrder<Float,length> >
 2057  operator()(int dim, int x_cb, int parity) {
 2059  }
 2060 
 // Const overload.
 2071  __device__ __host__ inline const gauge_wrapper<Float,TIFRPaddedOrder<Float,length> >
 2072  operator()(int dim, int x_cb, int parity) const {
 2074  (const_cast<TIFRPaddedOrder<Float,length>&>(*this), dim, x_cb, parity);
 2075  }
 2076 
 // Storage bytes per link (full 3x3 complex matrix).
 2077  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
 2078  };
2079 
2080  } // namespace gauge
2081 
  // Use traits to reduce the template explosion
  //
  // gauge_mapper maps (storage type, reconstruct type) onto the concrete
  // FloatNOrder accessor to use.  The primary template is empty so that an
  // unsupported combination fails at compile time.  The third FloatNOrder
  // argument is the float-ordering N: 2 for double precision throughout,
  // and 4 for single/half with compressed reconstructs (reconLen divisible
  // by 4); the uncompressed (RECONSTRUCT_NO, reconLen==N==18) cases use 2.
  template<typename T,QudaReconstructType,int N=18,QudaStaggeredPhase stag=QUDA_STAGGERED_PHASE_NO,bool huge_alloc=gauge::default_huge_alloc> struct gauge_mapper { };

  // double precision
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<double,QUDA_RECONSTRUCT_NO,N,stag,huge_alloc> { typedef gauge::FloatNOrder<double, N, 2, N, stag, huge_alloc> type; };
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<double,QUDA_RECONSTRUCT_13,N,stag,huge_alloc> { typedef gauge::FloatNOrder<double, N, 2, 13, stag, huge_alloc> type; };
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<double,QUDA_RECONSTRUCT_12,N,stag,huge_alloc> { typedef gauge::FloatNOrder<double, N, 2, 12, stag, huge_alloc> type; };
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<double,QUDA_RECONSTRUCT_9,N,stag,huge_alloc> { typedef gauge::FloatNOrder<double, N, 2, 9, stag, huge_alloc> type; };
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<double,QUDA_RECONSTRUCT_8,N,stag,huge_alloc> { typedef gauge::FloatNOrder<double, N, 2, 8, stag, huge_alloc> type; };

  // single precision
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<float,QUDA_RECONSTRUCT_NO,N,stag,huge_alloc> { typedef gauge::FloatNOrder<float, N, 2, N, stag, huge_alloc> type; };
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<float,QUDA_RECONSTRUCT_13,N,stag,huge_alloc> { typedef gauge::FloatNOrder<float, N, 4, 13, stag, huge_alloc> type; };
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<float,QUDA_RECONSTRUCT_12,N,stag,huge_alloc> { typedef gauge::FloatNOrder<float, N, 4, 12, stag, huge_alloc> type; };
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<float,QUDA_RECONSTRUCT_9,N,stag,huge_alloc> { typedef gauge::FloatNOrder<float, N, 4, 9, stag, huge_alloc> type; };
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<float,QUDA_RECONSTRUCT_8,N,stag,huge_alloc> { typedef gauge::FloatNOrder<float, N, 4, 8, stag, huge_alloc> type; };

  // half precision
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<short,QUDA_RECONSTRUCT_NO,N,stag,huge_alloc> { typedef gauge::FloatNOrder<short, N, 2, N, stag, huge_alloc> type; };
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<short,QUDA_RECONSTRUCT_13,N,stag,huge_alloc> { typedef gauge::FloatNOrder<short, N, 4, 13, stag, huge_alloc> type; };
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<short,QUDA_RECONSTRUCT_12,N,stag,huge_alloc> { typedef gauge::FloatNOrder<short, N, 4, 12, stag, huge_alloc> type; };
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<short,QUDA_RECONSTRUCT_9,N,stag,huge_alloc> { typedef gauge::FloatNOrder<short, N, 4, 9, stag, huge_alloc> type; };
  template<int N,QudaStaggeredPhase stag,bool huge_alloc> struct gauge_mapper<short,QUDA_RECONSTRUCT_8,N,stag,huge_alloc> { typedef gauge::FloatNOrder<short, N, 4, 8, stag, huge_alloc> type; };
2105 
  // gauge_order_mapper maps a QudaGaugeFieldOrder enum value onto the
  // corresponding accessor class for Nc colors (field length 2*Nc*Nc reals
  // per link).  The primary template is empty so unsupported orders fail
  // at compile time.
  template<typename T, QudaGaugeFieldOrder order, int Nc> struct gauge_order_mapper { };
  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_QDP_GAUGE_ORDER,Nc> { typedef gauge::QDPOrder<T, 2*Nc*Nc> type; };
  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_QDPJIT_GAUGE_ORDER,Nc> { typedef gauge::QDPJITOrder<T, 2*Nc*Nc> type; };
  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_MILC_GAUGE_ORDER,Nc> { typedef gauge::MILCOrder<T, 2*Nc*Nc> type; };
  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_BQCD_GAUGE_ORDER,Nc> { typedef gauge::BQCDOrder<T, 2*Nc*Nc> type; };
  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_TIFR_GAUGE_ORDER,Nc> { typedef gauge::TIFROrder<T, 2*Nc*Nc> type; };
  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_TIFR_PADDED_GAUGE_ORDER,Nc> { typedef gauge::TIFRPaddedOrder<T, 2*Nc*Nc> type; };
  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_FLOAT2_GAUGE_ORDER,Nc> { typedef gauge::FloatNOrder<T, 2*Nc*Nc, 2, 2*Nc*Nc> type; };
2114 
2115  // experiments in reducing template instantation boilerplate
2116  // can this be replaced with a C++11 variant that uses variadic templates?
2117 
// Dispatch func over the runtime reconstruct type of gauge field g,
// instantiating the matching native accessor (requires Float in scope).
// Fix: the native-order check previously read `data.isNative()`, silently
// relying on the call site having a variable named `data`; it now uses the
// macro parameter `g`, which call sites pass their field as.
#define INSTANTIATE_RECONSTRUCT(func, g, ...)			\
  {									\
  if (!g.isNative())							\
    errorQuda("Field order %d and precision %d is not native", g.Order(), g.Precision()); \
  if( g.Reconstruct() == QUDA_RECONSTRUCT_NO) {			\
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge; \
    func(Gauge(g), g, __VA_ARGS__);					\
  } else if( g.Reconstruct() == QUDA_RECONSTRUCT_12){			\
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge; \
    func(Gauge(g), g, __VA_ARGS__);					\
  } else if( g.Reconstruct() == QUDA_RECONSTRUCT_8){			\
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge; \
    func(Gauge(g), g, __VA_ARGS__);					\
  } else {								\
    errorQuda("Reconstruction type %d of gauge field not supported", g.Reconstruct()); \
  }									\
  }
2135 
// Dispatch func<Float> over the runtime precision of field lat
// (double and single only; other precisions raise an error).
#define INSTANTIATE_PRECISION(func, lat, ...)			\
  {								\
  if (lat.Precision() == QUDA_DOUBLE_PRECISION) {		\
    func<double>(lat, __VA_ARGS__);				\
  } else if(lat.Precision() == QUDA_SINGLE_PRECISION) {	\
    func<float>(lat, __VA_ARGS__);				\
  } else {							\
    errorQuda("Precision %d not supported", lat.Precision());	\
  }								\
  }
2146 
2147 } // namespace quda
2148 
2149 #endif // _GAUGE_ORDER_H
__device__ __host__ const gauge_wrapper< Float, BQCDOrder< Float, length > > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ gauge_ghost_wrapper< Float, FloatNOrder< Float, length, N, reconLenParam, stag_phase, huge_alloc > > Ghost(int dim, int ghost_idx, int parity)
This accessor routine returns a gauge_ghost_wrapper to this object, allowing us to overload various o...
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ gauge_wrapper< Float, MILCOrder< Float, length > > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ void Unpack(RegType out[18], const RegType in[18], int idx, int dir, const RegType phase, const I *X, const int *R) const
__device__ __host__ int NcolorCoarse() const
__host__ __device__ constexpr int Ncolor(int length)
Return the number of colors of the accessor based on the length of the field.
__device__ __host__ RegType getPhase(const RegType in[18])
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col)
gauge_wrapper is an internal class that is used to wrap instances of gauge accessors, currying in a specific location on the field. The operator() accessors in gauge-field accessors return instances to this class, allowing us to then use operator overloading upon this class to interact with the Matrix class. As a result we can include gauge-field accessors directly in Matrix expressions in kernels without having to declare temporaries with explicit calls to the load/save methods in the gauge-field accessors.
Float * ghost[QUDA_MAX_DIM]
__device__ __host__ void Unpack(RegType out[18], const RegType in[8], int idx, int dir, const RegType phase, const I *X, const int *R) const
const Reconstruct< 12, Float > reconstruct_12
__device__ __host__ complex< Float > & Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col)
QDPJITOrder(const QDPJITOrder &order)
Reconstruct(const Reconstruct< 11, Float > &recon)
__device__ __host__ const complex< Float > & Ghost(int d, int parity, int x, int row, int col) const
static __device__ __host__ int linkIndex(const int x[], const I X[4])
__device__ __host__ void loadGhost(RegType v[length], int x, int dir, int parity) const
__device__ __host__ void Pack(RegType out[8], const RegType in[18], int idx) const
__device__ __host__ void load(RegType v[18], int x, int dir, int parity) const
TIFROrder(const TIFROrder &order)
MILCSiteOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ void atomicAdd(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col, complex< Float > &val)
__device__ __host__ void Unpack(RegType out[N], const RegType in[N], int idx, int dir, const RegType phase, const I *X, const int *R) const
__device__ __host__ void Pack(RegType out[N], const RegType in[N], int idx) const
__device__ __host__ void operator=(const M &a)
Assignment operator with Matrix instance as input.
__device__ __host__ void Pack(RegType out[18], const RegType in[18], int idx) const
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:896
constexpr bool default_huge_alloc
mapper< Float >::type RegType
__device__ __host__ void operator=(const M &a)
Assignment operator with Matrix instance as input.
FloatNOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0, bool override=false)
__device__ __host__ void Pack(RegType out[12], const RegType in[18], int idx) const
Reconstruct(const Reconstruct< 8, Float > &recon)
__host__ __device__ constexpr int ct_sqrt(int n, int i=1)
#define errorQuda(...)
Definition: util_quda.h:90
const GhostAccessor< Float, nColor, order, native_ghost > ghostAccessor
gauge::FloatNOrder< double, N, 2, N, stag, huge_alloc > type
#define host_free(ptr)
Definition: malloc_quda.h:59
static constexpr int Nc
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
int comm_dim(int dim)
static constexpr int nColorCoarse
mapper< Float >::type RegType
std::complex< double > Complex
Definition: eig_variables.h:13
GhostAccessor(const GhostAccessor< Float, nColor, QUDA_MILC_GAUGE_ORDER, native_ghost > &a)
gauge::FloatNOrder< short, N, 4, 13, stag, huge_alloc > type
static std::map< void *, MemAlloc > alloc[N_ALLOC_TYPE]
Definition: malloc.cpp:51
GhostAccessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
__device__ __host__ int NspinCoarse() const
Reconstruct(const Reconstruct< 19, Float > &recon)
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
__device__ __host__ void load(RegType v[length], int x, int dir, int parity) const
int comm_coord(int dim)
Reconstruct< reconLenParam, Float > reconstruct
__device__ __host__ int Geometry() const
mapper< Float >::type RegType
__host__ __device__ void copy(T1 &a, const T2 &b)
mapper< Float >::type RegType
LegacyOrder(const LegacyOrder &order)
AllocType< huge_alloc >::type AllocInt
static int R[4]
__device__ __host__ void load(RegType v[length], int x, int dir, int parity) const
double anisotropy
Definition: test_util.cpp:1644
const int * SurfaceCB() const
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:212
__device__ __host__ void loadGhostEx(RegType v[length], int buff_idx, int extended_idx, int dir, int dim, int g, int parity, const int R[]) const
VectorType< Float, N >::type Vector
__device__ __host__ void save(const RegType v[length], int x, int dir, int parity)
__device__ __host__ const gauge_ghost_wrapper< Float, FloatNOrder< Float, length, N, reconLenParam, stag_phase, huge_alloc > > Ghost(int dim, int ghost_idx, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int row, int col) const
size_t bytes
host memory for backing up the field when tuning
__device__ __host__ gauge_wrapper< Float, TIFROrder< Float, length > > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
gauge::FloatNOrder< short, N, 4, 8, stag, huge_alloc > type
gauge::FloatNOrder< double, N, 2, 9, stag, huge_alloc > type
enum QudaTboundary_s QudaTboundary
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, complex< Float > &val) const
QDPOrder(const QDPOrder &order)
__device__ __host__ void operator=(const Matrix< U, N > &b)
Definition: quda_matrix.h:114
This is just a dummy structure we use for trove to define the required structure size.
int Nface() const
Definition: gauge_field.h:232
complex< Float > dummy
FieldOrder(const FieldOrder &o)
int_fastdiv X[QUDA_MAX_DIM]
FieldOrder(GaugeField &U, void *gauge_=0, void **ghost_=0)
GhostAccessor(const GhostAccessor< Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, native_ghost > &a)
__device__ __host__ RegType getPhase(const RegType in[N]) const
gauge::FloatNOrder< short, N, 2, N, stag, huge_alloc > type
mapper< Float >::type RegType
__device__ __host__ void save(const RegType v[length], int x, int dir, int parity)
__device__ __host__ Float timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM], QudaTboundary tBoundary, bool isFirstTimeSlice, bool isLastTimeSlice, QudaGhostExchange ghostExchange=QUDA_GHOST_EXCHANGE_NO)
char * index(const char *, int)
struct to define gauge fields packed into an opaque MILC site struct:
__device__ __host__ complex< Float > & operator()(int dim, int parity, int x_cb, int row, int col) const
__device__ __host__ const complex< Float > & operator()(int d, int parity, int x, int row, int col) const
int Ncolor() const
Definition: gauge_field.h:202
mapper< Float >::type RegType
const int * R() const
Accessor(const Accessor< Float, nColor, QUDA_QDP_GAUGE_ORDER > &a)
BQCDOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
MILCOrder(const MILCOrder &order)
__device__ __host__ const gauge_wrapper< Float, TIFROrder< Float, length > > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
Accessor(const GaugeField &, void *gauge_=0, void **ghost_=0)
__device__ __host__ void save(const RegType v[length], int x, int dir, int parity)
Accessor(const Accessor< Float, nColor, QUDA_MILC_GAUGE_ORDER > &a)
gauge::FloatNOrder< double, N, 2, 12, stag, huge_alloc > type
const Accessor< Float, nColor, order > accessor
QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__host__ double norm2(int dim) const
Returns the L2 norm squared of the field in a given dimension.
__device__ __host__ RegType getPhase(const RegType in[18]) const
__device__ __host__ void save(const RegType v[18], int x, int dir, int parity)
QudaGhostExchange ghostExchange
const int nColor
Definition: covdev_test.cpp:77
MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ void save(const RegType v[18], int x, int dir, int parity)
__device__ __host__ void load(RegType v[length], int x, int dir, int parity) const
struct to define BQCD ordered gauge fields:
__device__ __host__ gauge_wrapper< Float, QDPJITOrder< Float, length > > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
cpuColorSpinorField * in
__device__ __host__ void saveGhostEx(const RegType v[length], int x, int dummy, int dir, int dim, int g, int parity, const int R[])
enum QudaGhostExchange_s QudaGhostExchange
__device__ __host__ void vector_store(void *ptr, int idx, const VectorType &value)
gauge::FloatNOrder< double, N, 2, 8, stag, huge_alloc > type
const Reconstruct< 8, Float > reconstruct_8
MILCSiteOrder(const MILCSiteOrder &order)
gauge::FloatNOrder< float, N, 2, N, stag, huge_alloc > type
__device__ __host__ complex< Float > & operator()(int d, int parity, int x_cb, int row, int col) const
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, complex< Float > &val) const
gauge_ghost_wrapper is an internal class that is used to wrap instances of gauge ghost accessors...
gauge::FloatNOrder< float, N, 4, 12, stag, huge_alloc > type
__device__ __host__ void save(const RegType v[length], int x, int dir, int parity)
__device__ __host__ RegType getPhase(const RegType in[18]) const
__device__ __host__ void load(RegType v[18], int x, int dir, int parity) const
CPSOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
__device__ __host__ void load(RegType v[18], int x, int dir, int parity) const
mapper< Float >::type RegType
__device__ __host__ complex< Float > & Ghost(int d, int parity, int x, int row, int col)
const void ** Ghost() const
Definition: gauge_field.h:254
__device__ __host__ const gauge_wrapper< Float, MILCOrder< Float, length > > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
Float * gauge[QUDA_MAX_DIM]
Provides precision abstractions and defines the register precision given the storage precision using ...
gauge::FloatNOrder< float, N, 4, 13, stag, huge_alloc > type
TIFRPaddedOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
gauge::FloatNOrder< short, N, 4, 9, stag, huge_alloc > type
__device__ __host__ void load(RegType v[length], int x, int dir, int parity) const
__device__ __host__ void Pack(RegType out[8], const RegType in[18], int idx) const
__device__ __host__ const gauge_wrapper< Float, FloatNOrder< Float, length, N, reconLenParam, stag_phase, huge_alloc > > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ void Unpack(RegType out[18], const RegType in[8], int idx, int dir, const RegType phase, const I *X, const int *R, const RegType scale=1.0) const
__device__ __host__ void save(const RegType v[length], int x, int dir, int parity)
__device__ __host__ int VolumeCB() const
__device__ __host__ void saveGhost(const RegType v[length], int x, int dir, int parity)
const void * ptr
#define safe_malloc(size)
Definition: malloc_quda.h:54
void load()
Restore the field from the host after tuning.
__device__ __host__ void loadGhostEx(RegType v[length], int x, int dummy, int dir, int dim, int g, int parity, const int R[]) const
__device__ __host__ void Unpack(RegType out[18], const RegType in[10], int idx, int dir, const RegType phase, const I *X, const int *R) const
__device__ __host__ gauge_wrapper< Float, TIFRPaddedOrder< Float, length > > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ void saveGhostEx(const RegType v[length], int buff_idx, int extended_idx, int dir, int dim, int g, int parity, const int R[])
QudaFieldLocation Location() const
LegacyOrder(const GaugeField &u, Float **ghost_)
__device__ __host__ gauge_wrapper< Float, CPSOrder< Float, length > > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
Float * gauge[QUDA_MAX_DIM]
gauge::FloatNOrder< short, N, 4, 12, stag, huge_alloc > type
__device__ __host__ int Volume() const
__device__ __host__ gauge_wrapper< Float, QDPOrder< Float, length > > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
FloatNOrder(const FloatNOrder &order)
enum QudaFieldLocation_s QudaFieldLocation
gauge::FloatNOrder< float, N, 4, 9, stag, huge_alloc > type
int faceVolumeCB[QUDA_MAX_DIM]
static constexpr int Nc
Reconstruct(const Reconstruct< 12, Float > &recon)
cpuColorSpinorField * out
__device__ __host__ void load(RegType v[18], int x, int dir, int parity) const
__device__ __host__ int Ncolor() const
BQCDOrder(const BQCDOrder &order)
__device__ __host__ const complex< Float > & Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const
__device__ __host__ RegType getPhase(const RegType in[18])
Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
__device__ __host__ void Pack(RegType out[12], const RegType in[18], int idx) const
__device__ __host__ const gauge_wrapper< Float, QDPOrder< Float, length > > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
void save()
Backup the field to the host when tuning.
mapper< Float >::type RegType
__device__ __host__ gauge_wrapper< Float, BQCDOrder< Float, length > > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
Reconstruct(const Reconstruct< N, Float > &recon)
__device__ __host__ gauge_wrapper< Float, FloatNOrder< Float, length, N, reconLenParam, stag_phase, huge_alloc > > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ void Pack(RegType out[10], const RegType in[18], int idx) const
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ const gauge_wrapper< Float, TIFRPaddedOrder< Float, length > > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ RegType getPhase(const RegType in[18])
__device__ __host__ RegType getPhase(const RegType in[18]) const
__device__ __host__ void saveGhost(const RegType v[length], int x, int dir, int parity)
#define QUDA_MAX_GEOMETRY
Maximum geometry supported by a field. This essentially is the maximum number of dimensions supported...
GhostAccessor(const GaugeField &, void *gauge_=0, void **ghost_=0)
TIFROrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ int Ndim() const
static __inline__ dim3 dim3 void size_t cudaStream_t int enum cudaTextureReadMode readMode static __inline__ const struct texture< T, dim, readMode > & tex
__device__ __host__ void load(RegType v[length], int x, int dir, int parity) const
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, complex< Float > &val) const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
Reconstruct(const Reconstruct< 13, Float > &recon)
void size_t length
GhostAccessor(const GhostAccessor< Float, nColor, QUDA_QDP_GAUGE_ORDER, native_ghost > &a)
mapper< Float >::type RegType
__device__ __host__ Matrix()
Definition: quda_matrix.h:76
Reconstruct(const GaugeField &u)
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int row, int col) const
TIFRPaddedOrder(const TIFRPaddedOrder &order)
QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203
__device__ __host__ void save(const RegType v[18], int x, int dir, int parity)
__device__ __host__ Float milcStaggeredPhase(int dim, const int x[], const I R[])
virtual void * Gauge_p()
Definition: gauge_field.h:246
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
gauge::FloatNOrder< float, N, 4, 8, stag, huge_alloc > type
#define checkCudaError()
Definition: util_quda.h:129
CPSOrder(const CPSOrder &order)
Accessor(const Accessor< Float, nColor, QUDA_FLOAT2_GAUGE_ORDER > &a)
void comm_allreduce(double *data)
Definition: comm_mpi.cpp:281
Reconstruct(const Reconstruct< 9, Float > &recon)
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
Definition: quda_matrix.h:312
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
__device__ __host__ int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb)
gauge::FloatNOrder< T, 2 *Nc *Nc, 2, 2 *Nc *Nc > type
Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
static constexpr int Nc
__device__ __host__ void Unpack(RegType out[18], const RegType in[12], int idx, int dir, const RegType phase, const I *X, const int *R) const
__device__ __host__ void Unpack(RegType out[18], const RegType in[12], int idx, int dir, const RegType phase, const I *X, const int *R) const
static __inline__ size_t size_t d
GhostAccessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
static __inline__ enum cudaRoundMode mode enum cudaRoundMode mode enum cudaRoundMode mode enum cudaRoundMode mode int val
QudaParity parity
Definition: covdev_test.cpp:53
__device__ __host__ int getPaddedIndex(int x_cb, int parity) const
Compute the index into the padded field. Assumes that parity doesn&#39;t change from unpadded to padded...
gauge::FloatNOrder< double, N, 2, 13, stag, huge_alloc > type
#define a
__device__ __host__ const gauge_wrapper< Float, CPSOrder< Float, length > > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ const gauge_wrapper< Float, QDPJITOrder< Float, length > > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
mapper< Float >::type RegType
__device__ __host__ const complex< Float > & operator()(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const
QudaFieldLocation location
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int row, int col) const
mapper< Float >::type RegType
__device__ __host__ void save(const RegType v[18], int x, int dir, int parity)
This is just a dummy structure we use for trove to define the required structure size.
const int * X() const
__host__ __device__ ReduceType operator()(quda::complex< Float > x)
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ void loadGhost(RegType v[length], int x, int dir, int parity) const
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)