QUDA v1.1.0
A library for QCD on GPUs
gauge_field_order.h
1 
2 #ifndef _GAUGE_ORDER_H
3 #define _GAUGE_ORDER_H
4 
11 #ifndef __CUDACC_RTC__
12 #include <assert.h>
13 #endif
14 #include <type_traits>
15 
16 #include <register_traits.h>
17 #include <convert.h>
18 #include <complex_quda.h>
19 #include <quda_matrix.h>
20 #include <index_helper.cuh>
21 #include <fast_intdiv.h>
22 #include <type_traits>
23 #include <limits>
24 #include <atomic.cuh>
25 #include <gauge_field.h>
26 #include <index_helper.cuh>
27 #include <trove_helper.cuh>
28 #include <transform_reduce.h>
29 
30 namespace quda {
31 
43  template <typename Float, typename T>
44  struct gauge_wrapper {
45  const int dim;
46  const int x_cb;
47  const int parity;
48  const Float phase;
49  T &gauge;
50 
58  __device__ __host__ inline gauge_wrapper<Float, T>(T &gauge, int dim, int x_cb, int parity, Float phase = 1.0) :
59  gauge(gauge),
60  dim(dim),
61  x_cb(x_cb),
62  parity(parity),
63  phase(phase)
64  {
65  }
66 
71  template<typename M>
72  __device__ __host__ inline void operator=(const M &a) {
73  gauge.save(a.data, x_cb, dim, parity);
74  }
75  };
76 
81  template <typename T, int N>
82  template <typename S>
83  __device__ __host__ inline void Matrix<T,N>::operator=(const gauge_wrapper<typename RealType<T>::type,S> &a) {
84  a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase);
85  }
86 
91  template <typename T, int N>
92  template <typename S>
93  __device__ __host__ inline Matrix<T,N>::Matrix(const gauge_wrapper<typename RealType<T>::type,S> &a) {
94  a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase);
95  }
96 
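The three definitions above are what make assignments like "Matrix = gauge(dim, x_cb, parity)" work: the wrapper only captures the coordinates and defers the actual load/save to the accessor. A minimal standalone sketch of the same shim idiom follows (hypothetical Field, Mat and wrapper types, not QUDA's; a conversion operator stands in for the Matrix-side constructor):

  // Illustrative sketch of the wrapper idiom, not part of this header.
  #include <cstdio>

  struct Mat { double data[2]; };

  struct Field {
    double v[2][2]; // [site][element]
    void load(double out[2], int x) const { out[0] = v[x][0]; out[1] = v[x][1]; }
    void save(const double in[2], int x) { v[x][0] = in[0]; v[x][1] = in[1]; }
  };

  struct wrapper {
    Field &f; int x;                                               // captured coordinates
    void operator=(const Mat &m) { f.save(m.data, x); }            // store on assignment
    operator Mat() const { Mat m; f.load(m.data, x); return m; }   // load on read
  };

  int main() {
    Field f{{{1, 2}, {3, 4}}};
    Mat m = wrapper{f, 1};   // reads site 1
    wrapper{f, 0} = m;       // writes it back to site 0
    std::printf("%g %g\n", f.v[0][0], f.v[0][1]); // prints: 3 4
  }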
108  template <typename Float, typename T>
109  struct gauge_ghost_wrapper {
110  const int dim;
111  const int ghost_idx;
112  const int parity;
113  const Float phase;
114  T &gauge;
115 
123  __device__ __host__ inline gauge_ghost_wrapper<Float, T>(
124  T &gauge, int dim, int ghost_idx, int parity, Float phase = 1.0) :
125  gauge(gauge),
126  dim(dim),
127  ghost_idx(ghost_idx),
128  parity(parity),
129  phase(phase)
130  {
131  }
132 
137  template<typename M>
138  __device__ __host__ inline void operator=(const M &a) {
139  gauge.saveGhost(a.data, ghost_idx, dim, parity);
140  }
141  };
142 
147  template <typename T, int N>
148  template <typename S>
149  __device__ __host__ inline void Matrix<T,N>::operator=(const gauge_ghost_wrapper<typename RealType<T>::type,S> &a) {
150  a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase);
151  }
152 
157  template <typename T, int N>
158  template <typename S>
159  __device__ __host__ inline Matrix<T,N>::Matrix(const gauge_ghost_wrapper<typename RealType<T>::type,S> &a) {
160  a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase);
161  }
162 
163  namespace gauge {
164 
165  template<typename ReduceType, typename Float> struct square_ {
166  square_(ReduceType scale) { }
167  __host__ __device__ inline ReduceType operator()(const quda::complex<Float> &x)
168  { return static_cast<ReduceType>(norm(x)); }
169  };
170 
171  template <typename ReduceType> struct square_<ReduceType, int8_t> {
172  const ReduceType scale;
173  square_(const ReduceType scale) : scale(scale) { }
174  __host__ __device__ inline ReduceType operator()(const quda::complex<int8_t> &x)
175  { return norm(scale * complex<ReduceType>(x.real(), x.imag())); }
176  };
177 
178  template<typename ReduceType> struct square_<ReduceType,short> {
179  const ReduceType scale;
180  square_(const ReduceType scale) : scale(scale) { }
181  __host__ __device__ inline ReduceType operator()(const quda::complex<short> &x)
182  { return norm(scale * complex<ReduceType>(x.real(), x.imag())); }
183  };
184 
185  template<typename ReduceType> struct square_<ReduceType,int> {
186  const ReduceType scale;
187  square_(const ReduceType scale) : scale(scale) { }
188  __host__ __device__ inline ReduceType operator()(const quda::complex<int> &x)
189  { return norm(scale * complex<ReduceType>(x.real(), x.imag())); }
190  };
191 
192  template<typename Float, typename storeFloat> struct abs_ {
193  abs_(const Float scale) { }
194  __host__ __device__ Float operator()(const quda::complex<storeFloat> &x) { return abs(x); }
195  };
196 
197  template <typename Float> struct abs_<Float, int8_t> {
198  Float scale;
199  abs_(const Float scale) : scale(scale) { }
200  __host__ __device__ Float operator()(const quda::complex<int8_t> &x)
201  { return abs(scale * complex<Float>(x.real(), x.imag())); }
202  };
203 
204  template<typename Float> struct abs_<Float,short> {
205  Float scale;
206  abs_(const Float scale) : scale(scale) { }
207  __host__ __device__ Float operator()(const quda::complex<short> &x)
208  { return abs(scale * complex<Float>(x.real(), x.imag())); }
209  };
210 
211  template<typename Float> struct abs_<Float,int> {
212  Float scale;
213  abs_(const Float scale) : scale(scale) { }
214  __host__ __device__ Float operator()(const quda::complex<int> &x)
215  { return abs(scale * complex<Float>(x.real(), x.imag())); }
216  };
217 
218  template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool fixed_point() { return false; }
219  template <> __host__ __device__ inline constexpr bool fixed_point<float, int8_t>() { return true; }
220  template<> __host__ __device__ inline constexpr bool fixed_point<float,short>() { return true; }
221  template<> __host__ __device__ inline constexpr bool fixed_point<float,int>() { return true; }
222 
223  template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool match() { return false; }
224  template<> __host__ __device__ inline constexpr bool match<int,int>() { return true; }
225  template<> __host__ __device__ inline constexpr bool match<short,short>() { return true; }
226 
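These traits drive the fixed-point path used by every accessor below: integer storage types are interpreted as fixed-point numbers via a scale factor derived from the field maximum. A standalone sketch of that scale/scale_inv mapping (illustrative values, not QUDA API):

  // Sketch of the fixed-point mapping: scale = max_int / field_max maps reals
  // into the integer range on store, scale_inv undoes it on load.
  #include <cstdio>
  #include <cmath>
  #include <limits>

  int main() {
    const float field_max = 1.0f;  // e.g. the link maximum
    const float scale = std::numeric_limits<short>::max() / field_max;
    const float scale_inv = field_max / std::numeric_limits<short>::max();

    float x = 0.7071f;
    short stored = static_cast<short>(std::lround(scale * x)); // store
    float loaded = scale_inv * stored;                         // load
    std::printf("%f -> %d -> %f\n", x, stored, loaded);
  }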
234  template <typename Float, typename storeFloat>
235  struct fieldorder_wrapper {
236 
240  using type = Float;
241  using store_type = storeFloat;
242  complex<storeFloat> *v;
243  const int idx;
244  const Float scale;
245  const Float scale_inv;
246  static constexpr bool fixed = fixed_point<Float, storeFloat>();
247 
252  __device__ __host__ inline fieldorder_wrapper(complex<storeFloat> *v, int idx, Float scale, Float scale_inv) :
253  v(v),
254  idx(idx),
255  scale(scale),
256  scale_inv(scale_inv)
257  {
258  }
259 
260  __device__ __host__ inline Float real() const
261  {
262  if (!fixed) {
263  return v[idx].real();
264  } else {
265  return scale_inv * static_cast<Float>(v[idx].real());
266  }
267  }
268 
269  __device__ __host__ inline Float imag() const
270  {
271  if (!fixed) {
272  return v[idx].imag();
273  } else {
274  return scale_inv * static_cast<Float>(v[idx].imag());
275  }
276  }
277 
281  __device__ __host__ inline auto data() { return &v[idx]; }
282 
283  __device__ __host__ inline const auto data() const { return &v[idx]; }
284 
289  __device__ __host__ inline complex<Float> operator-() const
290  {
291  return fixed ? -scale_inv * static_cast<complex<Float>>(v[idx]) : -static_cast<complex<Float>>(v[idx]);
292  }
293 
298  __device__ __host__ inline void operator=(const fieldorder_wrapper<Float, storeFloat> &a)
299  {
300  v[idx] = fixed ? complex<storeFloat>(round(scale * a.real()), round(scale * a.imag())) : a.v[a.idx];
301  }
302 
307  template <typename theirFloat> __device__ __host__ inline void operator=(const complex<theirFloat> &a)
308  {
309  if (match<storeFloat, theirFloat>()) {
310  v[idx] = complex<storeFloat>(a.x, a.y);
311  } else {
312  v[idx] = fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
313  }
314  }
315 
320  template <typename theirFloat> __device__ __host__ inline void operator+=(const complex<theirFloat> &a)
321  {
322  if (match<storeFloat, theirFloat>()) {
323  v[idx] += complex<storeFloat>(a.x, a.y);
324  } else {
325  v[idx] += fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
326  }
327  }
328 
333  template <typename theirFloat> __device__ __host__ inline void operator-=(const complex<theirFloat> &a)
334  {
335  if (match<storeFloat, theirFloat>()) {
336  v[idx] -= complex<storeFloat>(a.x, a.y);
337  } else {
338  v[idx] -= fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
339  }
340  }
341  };
342 
343  template<typename Float, typename storeFloat>
344  __device__ __host__ inline complex<Float> operator*(const Float &a, const fieldorder_wrapper<Float,storeFloat> &b)
345  {
346  if (fixed_point<Float,storeFloat>()) return a*complex<Float>(b.real(), b.imag());
347  else return a*complex<Float>(b.v[b.idx].real(),b.v[b.idx].imag());
348  }
349 
350  template<typename Float, typename storeFloat>
351  __device__ __host__ inline complex<Float> operator+(const fieldorder_wrapper<Float,storeFloat> &a, const complex<Float> &b) {
352  if (fixed_point<Float,storeFloat>()) return complex<Float>(a.real(), a.imag()) + b;
353  else return complex<Float>(a.v[a.idx].real(),a.v[a.idx].imag()) + b;
354  }
355 
356  template<typename Float, typename storeFloat>
357  __device__ __host__ inline complex<Float> operator+(const complex<Float> &a, const fieldorder_wrapper<Float,storeFloat> &b) {
358  if (fixed_point<Float,storeFloat>()) return a + complex<Float>(b.real(), b.imag());
359  else return a + complex<Float>(b.v[b.idx].real(),b.v[b.idx].imag());
360  }
361 
362  template <typename Float, int nColor, QudaGaugeFieldOrder order, typename storeFloat> struct Accessor {
363  static constexpr bool is_mma_compatible = false;
364  mutable complex<Float> dummy;
365  Accessor(const GaugeField &, void *gauge_=0, void **ghost_=0) {
366  errorQuda("Not implemented for order=%d", order);
367  }
368 
369  void resetScale(Float) { }
370 
371  __device__ __host__ complex<Float>& operator()(int d, int parity, int x, int row, int col) const {
372  return dummy;
373  }
374  };
375 
376  template <typename Float, int nColor, QudaGaugeFieldOrder order, bool native_ghost, typename storeFloat>
377  struct GhostAccessor {
378  mutable complex<Float> dummy;
379  GhostAccessor(const GaugeField &, void *gauge_=0, void **ghost_=0) {
380  errorQuda("Not implemented for order=%d", order);
381  }
382 
383  void resetScale(Float) { }
384 
385  __device__ __host__ complex<Float>& operator()(int d, int parity, int x, int row, int col) const {
386  return dummy;
387  }
388  };
389 
390  template <typename Float, int nColor, typename storeFloat>
391  struct Accessor<Float, nColor, QUDA_QDP_GAUGE_ORDER, storeFloat> {
392  static constexpr bool is_mma_compatible = false;
393  complex<storeFloat> *u[QUDA_MAX_GEOMETRY];
394  const int volumeCB;
395  const int geometry;
396  const int cb_offset;
397  Float scale;
398  Float scale_inv;
399  static constexpr bool fixed = fixed_point<Float,storeFloat>();
400 
401  Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
402  : volumeCB(U.VolumeCB()), geometry(U.Geometry()), cb_offset((U.Bytes()>>1) / (sizeof(complex<storeFloat>)*U.Geometry())),
403  scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
404  {
405  for (int d=0; d<U.Geometry(); d++)
406  u[d] = gauge_ ? static_cast<complex<storeFloat>**>(gauge_)[d] :
407  static_cast<complex<storeFloat>**>(const_cast<void*>(U.Gauge_p()))[d];
408  resetScale(U.Scale());
409  }
410 
411  Accessor(const Accessor<Float, nColor, QUDA_QDP_GAUGE_ORDER, storeFloat> &a) :
412  volumeCB(a.volumeCB),
413  geometry(a.geometry),
414  cb_offset(a.cb_offset),
415  scale(a.scale),
416  scale_inv(a.scale_inv)
417  {
418  for (int d = 0; d < QUDA_MAX_GEOMETRY; d++) u[d] = a.u[d];
419  }
420 
421  void resetScale(Float max) {
422  if (fixed) {
423  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
424  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
425  }
426  }
427 
428  __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
429  {
430  complex<storeFloat> tmp = u[d][ parity*cb_offset + (x*nColor + row)*nColor + col];
431 
432  if (fixed) {
433  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
434  } else {
435  return complex<Float>(tmp.x,tmp.y);
436  }
437  }
438 
439  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x, int row, int col)
440  { return fieldorder_wrapper<Float,storeFloat>(u[d], parity*cb_offset + (x*nColor + row)*nColor + col,
441  scale, scale_inv); }
442 
443  template<typename theirFloat>
444  __device__ __host__ inline void atomic_add(int dim, int parity, int x_cb, int row, int col,
445  const complex<theirFloat> &val) const {
446 #ifdef __CUDA_ARCH__
447  typedef typename vector<storeFloat,2>::type vec2;
448  vec2 *u2 = reinterpret_cast<vec2*>(u[dim] + parity*cb_offset + (x_cb*nColor + row)*nColor + col);
449  if (fixed && !match<storeFloat,theirFloat>()) {
450  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
451  atomicAdd(u2, (vec2&)val_);
452  } else {
453  atomicAdd(u2, (vec2&)val);
454  }
455 #else
456  if (fixed && !match<storeFloat,theirFloat>()) {
457  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
458 #pragma omp atomic update
459  u[dim][ parity*cb_offset + (x_cb*nColor + row)*nColor + col].x += val_.x;
460 #pragma omp atomic update
461  u[dim][ parity*cb_offset + (x_cb*nColor + row)*nColor + col].y += val_.y;
462  } else {
463 #pragma omp atomic update
464  u[dim][ parity*cb_offset + (x_cb*nColor + row)*nColor + col].x += static_cast<storeFloat>(val.x);
465 #pragma omp atomic update
466  u[dim][ parity*cb_offset + (x_cb*nColor + row)*nColor + col].y += static_cast<storeFloat>(val.y);
467  }
468 #endif
469  }
470 
471  template <typename helper, typename reducer>
472  __host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, double init, reducer r) const
473  {
474  if (dim >= geometry) errorQuda("Requested dimension %d exceeds dimensionality of the field %d", dim, geometry);
475  int lower = (dim == -1) ? 0 : dim;
476  int ndim = (dim == -1 ? geometry : 1);
477  std::vector<double> result(ndim);
478  std::vector<complex<storeFloat> *> v(ndim);
479  for (int d = 0; d < ndim; d++) v[d] = u[d + lower];
480  ::quda::transform_reduce(location, result, v, 2 * volumeCB * nColor * nColor, h, init, r);
481  double total = init;
482  for (auto &res : result) total = r(total, res);
483  return total;
484  }
485  };
486 
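In QDP ordering each dimension has its own pointer u[d], indexed by parity, checkerboard site, row and column. A worked example of that offset arithmetic with made-up sizes (illustrative only, not QUDA API):

  // QDP-order element index: parity*cb_offset + (x_cb*nColor + row)*nColor + col
  #include <cstdio>

  int main() {
    const int nColor = 3, volumeCB = 16;
    const int cb_offset = volumeCB * nColor * nColor; // complex elements per parity
    int parity = 1, x_cb = 5, row = 2, col = 1;
    int idx = parity * cb_offset + (x_cb * nColor + row) * nColor + col;
    std::printf("QDP-order element index = %d\n", idx); // 144 + 52 = 196
  }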
487  template <typename Float, int nColor, bool native_ghost, typename storeFloat>
488  struct GhostAccessor<Float, nColor, QUDA_QDP_GAUGE_ORDER, native_ghost, storeFloat> {
489  complex<storeFloat> *ghost[8];
490  int ghostOffset[8];
491  Float scale;
492  Float scale_inv;
493  static constexpr bool fixed = fixed_point<Float,storeFloat>();
494 
495  GhostAccessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
496  : scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
497  for (int d=0; d<4; d++) {
498  ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
499  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d]));
500  ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
501 
502  ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
503  ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
504  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4]));
505  ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
506  }
507 
508  resetScale(U.Scale());
509  }
510 
511  GhostAccessor(const GhostAccessor<Float, nColor, QUDA_QDP_GAUGE_ORDER, native_ghost, storeFloat> &a) :
512  scale(a.scale),
513  scale_inv(a.scale_inv)
514  {
515  for (int d = 0; d < 8; d++) {
516  ghost[d] = a.ghost[d];
517  ghostOffset[d] = a.ghostOffset[d];
518  }
519  }
520 
521  void resetScale(Float max) {
522  if (fixed) {
523  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
524  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
525  }
526  }
527 
528  __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
529  {
530  complex<storeFloat> tmp = ghost[d][ parity*ghostOffset[d] + (x*nColor + row)*nColor + col];
531  if (fixed) {
532  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
533  } else {
534  return complex<Float>(tmp.x,tmp.y);
535  }
536  }
537 
538  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x, int row, int col)
539  { return fieldorder_wrapper<Float,storeFloat>(ghost[d], parity*ghostOffset[d] + (x*nColor + row)*nColor + col,
540  scale, scale_inv); }
541  };
542 
543  template <typename Float, int nColor, typename storeFloat>
544  struct Accessor<Float, nColor, QUDA_MILC_GAUGE_ORDER, storeFloat> {
545  static constexpr bool is_mma_compatible = true;
546  complex<storeFloat> *u;
547  const int volumeCB;
548  const int geometry;
549  Float scale;
550  Float scale_inv;
551  static constexpr bool fixed = fixed_point<Float,storeFloat>();
552 
553  Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
554  : u(gauge_ ? static_cast<complex<storeFloat>*>(gauge_) :
555  static_cast<complex<storeFloat>*>(const_cast<void *>(U.Gauge_p()))),
556  volumeCB(U.VolumeCB()), geometry(U.Geometry()),
557  scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
558  resetScale(U.Scale());
559  }
560 
561  Accessor(const Accessor<Float, nColor, QUDA_MILC_GAUGE_ORDER, storeFloat> &a) :
562  u(a.u),
563  volumeCB(a.volumeCB),
564  geometry(a.geometry),
565  scale(a.scale),
566  scale_inv(a.scale_inv)
567  { }
568 
569  void resetScale(Float max) {
570  if (fixed) {
571  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
572  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
573  }
574  }
575 
576  __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
577  {
578  complex<storeFloat> tmp = u[(((parity*volumeCB+x)*geometry + d)*nColor + row)*nColor + col];
579  if (fixed) {
580  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
581  } else {
582  return complex<Float>(tmp.x,tmp.y);
583  }
584  }
585 
592  __device__ __host__ inline const auto wrap(int d, int parity, int x, int row, int col) const
593  {
594  return fieldorder_wrapper<Float, storeFloat>(
595  u, (((parity * volumeCB + x) * geometry + d) * nColor + row) * nColor + col, scale, scale_inv);
596  }
597 
601  __device__ __host__ inline auto wrap(int d, int parity, int x, int row, int col)
602  {
603  return fieldorder_wrapper<Float, storeFloat>(
604  u, (((parity * volumeCB + x) * geometry + d) * nColor + row) * nColor + col, scale, scale_inv);
605  }
606 
607  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x, int row, int col)
608  { return fieldorder_wrapper<Float,storeFloat>
609  (u, (((parity*volumeCB+x)*geometry + d)*nColor + row)*nColor + col, scale, scale_inv); }
610 
611  template <typename theirFloat>
612  __device__ __host__ inline void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex<theirFloat> &val) const {
613 #ifdef __CUDA_ARCH__
614  typedef typename vector<storeFloat,2>::type vec2;
615  vec2 *u2 = reinterpret_cast<vec2*>(u + (((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col);
616  if (fixed && !match<storeFloat,theirFloat>()) {
617  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
618  atomicAdd(u2, (vec2&)val_);
619  } else {
620  atomicAdd(u2, (vec2&)val);
621  }
622 #else
623  if (fixed && !match<storeFloat,theirFloat>()) {
624  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
625 #pragma omp atomic update
626  u[(((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col].x += val_.x;
627 #pragma omp atomic update
628  u[(((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col].y += val_.y;
629  } else {
630 #pragma omp atomic update
631  u[(((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col].x += static_cast<storeFloat>(val.x);
632 #pragma omp atomic update
633  u[(((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col].y += static_cast<storeFloat>(val.y);
634  }
635 #endif
636  }
637 
638  template <typename helper, typename reducer>
639  __host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, double init, reducer r) const
640  {
641  if (dim >= geometry) errorQuda("Requested dimension %d exceeds dimensionality of the field %d", dim, geometry);
642  int start = dim == -1 ? 0 : dim;
643  int count = (dim == -1 ? geometry : 1) * volumeCB * nColor * nColor; // items per parity
644  std::vector<double> result = {init, init};
645  std::vector<decltype(u)> v = {u + (0 * geometry + start) * volumeCB * nColor * nColor,
646  u + (1 * geometry + start) * volumeCB * nColor * nColor};
647  ::quda::transform_reduce(location, result, v, count, h, init, r);
648  return r(result[0], result[1]);
649  }
650  };
651 
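MILC ordering is site-major: all links of a site are stored contiguously as [parity][x_cb][dim][row][col], which is why this layout is MMA compatible. A worked example of the offset with made-up sizes (illustrative only, not QUDA API):

  // MILC-order element index: (((parity*volumeCB + x)*geometry + d)*nColor + row)*nColor + col
  #include <cstdio>

  int main() {
    const int nColor = 3, volumeCB = 16, geometry = 4;
    int parity = 0, x_cb = 5, d = 2, row = 1, col = 2;
    int idx = (((parity * volumeCB + x_cb) * geometry + d) * nColor + row) * nColor + col;
    std::printf("MILC-order element index = %d\n", idx); // (((5)*4+2)*3+1)*3+2 = 203
  }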
652  template <typename Float, int nColor, bool native_ghost, typename storeFloat>
653  struct GhostAccessor<Float, nColor, QUDA_MILC_GAUGE_ORDER, native_ghost, storeFloat> {
654  complex<storeFloat> *ghost[8];
655  int ghostOffset[8];
656  Float scale;
657  Float scale_inv;
658  static constexpr bool fixed = fixed_point<Float,storeFloat>();
659 
660  GhostAccessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
661  : scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
662  for (int d=0; d<4; d++) {
663  ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
664  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d]));
665  ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
666 
667  ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
668  ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
669  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4]));
670  ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
671  }
672 
673  resetScale(U.Scale());
674  }
675 
676  GhostAccessor(const GhostAccessor<Float, nColor, QUDA_MILC_GAUGE_ORDER, native_ghost, storeFloat> &a) :
677  scale(a.scale),
678  scale_inv(a.scale_inv)
679  {
680  for (int d = 0; d < 8; d++) {
681  ghost[d] = a.ghost[d];
682  ghostOffset[d] = a.ghostOffset[d];
683  }
684  }
685 
686  void resetScale(Float max) {
687  if (fixed) {
688  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
689  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
690  }
691  }
692 
693  __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
694  {
695  complex<storeFloat> tmp = ghost[d][ parity*ghostOffset[d] + (x*nColor + row)*nColor + col];
696  if (fixed) {
697  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
698  } else {
699  return complex<Float>(tmp.x,tmp.y);
700  }
701  }
702 
707  __device__ __host__ inline const auto wrap(int d, int parity, int x, int row, int col) const
708  {
709  return fieldorder_wrapper<Float, storeFloat>(
710  ghost[d], parity * ghostOffset[d] + (x * nColor + row) * nColor + col, scale, scale_inv);
711  }
712 
716  __device__ __host__ inline auto wrap(int d, int parity, int x, int row, int col)
717  {
718  return fieldorder_wrapper<Float, storeFloat>(
719  ghost[d], parity * ghostOffset[d] + (x * nColor + row) * nColor + col, scale, scale_inv);
720  }
721 
722  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x, int row, int col)
723  { return fieldorder_wrapper<Float,storeFloat>
724  (ghost[d], parity*ghostOffset[d] + (x*nColor + row)*nColor + col, scale, scale_inv); }
725  };
726 
727  template<int nColor, int N>
728  __device__ __host__ inline int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb) {
729  constexpr int M = (2*nColor*nColor) / N;
730  int j = ((row*nColor+col)*2) / N; // factor of two for the complex components
731  int i = ((row*nColor+col)*2) % N;
732  int index = ((x_cb + dim*stride*M + j*stride)*2+i) / 2; // back to a complex offset
733  index += parity*offset_cb;
734  return index;
735  };
736 
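A worked example of indexFloatN for nColor=3 and N=2, where each complex number is one float2 and the index reduces to the FLOAT2 layout below (illustrative values, not QUDA API):

  // indexFloatN with N=2: x_cb + dim*stride*M + (row*nColor+col)*stride, plus parity offset
  #include <cstdio>

  int main() {
    const int nColor = 3, N = 2, stride = 16, offset_cb = 1000;
    const int M = (2 * nColor * nColor) / N; // 9 complex numbers per link
    int dim = 1, parity = 1, x_cb = 5, row = 2, col = 1;
    int j = ((row * nColor + col) * 2) / N;
    int i = ((row * nColor + col) * 2) % N;
    int index = ((x_cb + dim * stride * M + j * stride) * 2 + i) / 2 + parity * offset_cb;
    std::printf("index = %d\n", index); // 5 + 144 + 112 + 1000 = 1261
  }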
737  template <typename Float, int nColor, typename storeFloat>
738  struct Accessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat> {
739  static constexpr bool is_mma_compatible = false;
740  complex<storeFloat> *u;
741  const int offset_cb;
742  const int volumeCB;
743  const int stride;
744  const int geometry;
745  Float max;
746  Float scale;
747  Float scale_inv;
748  static constexpr bool fixed = fixed_point<Float,storeFloat>();
749 
750  Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0, bool override=false)
751  : u(gauge_ ? static_cast<complex<storeFloat>*>(gauge_) :
752  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Gauge_p()))),
753  offset_cb( (U.Bytes()>>1) / sizeof(complex<storeFloat>)),
754  volumeCB(U.VolumeCB()), stride(U.Stride()), geometry(U.Geometry()),
755  max(static_cast<Float>(1.0)), scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
756  {
757  resetScale(U.Scale());
758  }
759 
760  Accessor(const Accessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat> &a) :
761  u(a.u),
762  offset_cb(a.offset_cb),
763  volumeCB(a.volumeCB),
764  stride(a.stride),
765  geometry(a.geometry),
766  scale(a.scale),
767  scale_inv(a.scale_inv)
768  {
769  }
770 
771  void resetScale(Float max_) {
772  if (fixed) {
773  max = max_;
774  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
775  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
776  }
777  }
778 
779  __device__ __host__ inline const complex<Float> operator()(int dim, int parity, int x_cb, int row, int col) const
780  {
781  complex<storeFloat> tmp
782  = u[parity * offset_cb + dim * stride * nColor * nColor + (row * nColor + col) * stride + x_cb];
783  if (fixed) {
784  return scale_inv * complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
785  } else {
786  return complex<Float>(tmp.x, tmp.y);
787  }
788  }
789 
790  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int dim, int parity, int x_cb, int row, int col)
791  {
792  int index = parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb;
793  return fieldorder_wrapper<Float,storeFloat>(u, index, scale, scale_inv);
794  }
795 
796  template <typename theirFloat>
797  __device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex<theirFloat> &val) const {
798 #ifdef __CUDA_ARCH__
799  typedef typename vector<storeFloat,2>::type vec2;
800  vec2 *u2 = reinterpret_cast<vec2*>(u + parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb);
801  if (fixed && !match<storeFloat,theirFloat>()) {
802  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
803  atomicAdd(u2, (vec2&)val_);
804  } else {
805  atomicAdd(u2, (vec2&)val);
806  }
807 #else
808  if (fixed && !match<storeFloat,theirFloat>()) {
809  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
810 #pragma omp atomic update
811  u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb].x += val_.x;
812 #pragma omp atomic update
813  u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb].y += val_.y;
814  } else {
815 #pragma omp atomic update
816  u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb].x += static_cast<storeFloat>(val.x);
817 #pragma omp atomic update
818  u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb].y += static_cast<storeFloat>(val.y);
819  }
820 #endif
821  }
822 
823  template <typename helper, typename reducer>
824  __host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, double init, reducer r) const
825  {
826  if (dim >= geometry) errorQuda("Requested dimension %d exceeds dimensionality of the field %d", dim, geometry);
827  int start = (dim == -1) ? 0 : dim;
828  int count = (dim == -1 ? geometry : 1) * stride * nColor * nColor;
829  std::vector<double> result = {init, init};
830  std::vector<decltype(u)> v = {u + 0 * offset_cb + start * count, u + 1 * offset_cb + start * count};
831  ::quda::transform_reduce(location, result, v, count, h, init, r);
832  return r(result[0], result[1]);
833  }
834  };
835 
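In the FLOAT2 (native) ordering each of the nColor*nColor complex elements of a link is strided across the checkerboard volume, which is what gives coalesced float2 loads on the GPU. A worked example of the offset with made-up sizes (illustrative only, not QUDA API):

  // FLOAT2-order element index:
  //   parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb
  #include <cstdio>

  int main() {
    const int nColor = 3, stride = 16, offset_cb = 4 * 9 * 16; // dims * elements * stride
    int dim = 2, parity = 1, x_cb = 5, row = 0, col = 1;
    int idx = parity * offset_cb + dim * stride * nColor * nColor
            + (row * nColor + col) * stride + x_cb;
    std::printf("FLOAT2-order element index = %d\n", idx); // 576 + 288 + 16 + 5 = 885
  }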
836  template <typename Float, int nColor, bool native_ghost, typename storeFloat>
837  struct GhostAccessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, native_ghost, storeFloat> {
838  complex<storeFloat> *ghost[8];
839  const int volumeCB;
840  int ghostVolumeCB[8];
841  Float scale;
842  Float scale_inv;
843  static constexpr bool fixed = fixed_point<Float,storeFloat>();
844  Accessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat> accessor;
845 
846  GhostAccessor(const GaugeField &U, void *gauge_, void **ghost_ = 0) :
847  volumeCB(U.VolumeCB()),
848  scale(static_cast<Float>(1.0)),
849  scale_inv(static_cast<Float>(1.0)),
850  accessor(U, gauge_, ghost_)
851  {
852  if (!native_ghost) assert(ghost_ != nullptr);
853  for (int d=0; d<4; d++) {
854  ghost[d] = !native_ghost ? static_cast<complex<storeFloat>*>(ghost_[d]) : nullptr;
855  ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d);
856  ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast<complex<storeFloat>*>(ghost_[d+4]) : nullptr;
857  ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d);
858  }
859  resetScale(U.Scale());
860  }
861 
862  GhostAccessor(const GhostAccessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, native_ghost, storeFloat> &a) :
863  volumeCB(a.volumeCB),
864  scale(a.scale),
865  scale_inv(a.scale_inv),
866  accessor(a.accessor)
867  {
868  for (int d=0; d<8; d++) {
869  ghost[d] = a.ghost[d];
870  ghostVolumeCB[d] = a.ghostVolumeCB[d];
871  }
872  }
873 
874  void resetScale(Float max) {
875  accessor.resetScale(max);
876  if (fixed) {
877  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
878  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
879  }
880  }
881 
882  __device__ __host__ inline const complex<Float> operator()(int d, int parity, int x_cb, int row, int col) const
883  {
884  if (native_ghost) {
885  return accessor(d%4, parity, x_cb+(d/4)*ghostVolumeCB[d]+volumeCB, row, col);
886  } else {
887  complex<storeFloat> tmp = ghost[d][ ((parity*nColor + row)*nColor+col)*ghostVolumeCB[d] + x_cb ];
888  if (fixed) {
889  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
890  } else {
891  return complex<Float>(tmp.x, tmp.y);
892  }
893  }
894  }
895 
896  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x_cb, int row, int col)
897  {
898  if (native_ghost)
899  return accessor(d%4, parity, x_cb+(d/4)*ghostVolumeCB[d]+volumeCB, row, col);
900  else
901  return fieldorder_wrapper<Float,storeFloat>
902  (ghost[d], ((parity*nColor + row)*nColor+col)*ghostVolumeCB[d] + x_cb, scale, scale_inv);
903  }
904  };
905 
919  template <typename Float_, int nColor, int nSpinCoarse, QudaGaugeFieldOrder order, bool native_ghost = true,
920  typename storeFloat_ = Float_>
921  struct FieldOrder {
922 
924  using Float = Float_;
925  using storeFloat = storeFloat_;
926 
928  const int volumeCB;
929  const int nDim;
930  const int geometry;
931  const QudaFieldLocation location;
932  static constexpr int nColorCoarse = nColor / nSpinCoarse;
933 
934  using accessor_type = Accessor<Float, nColor, order, storeFloat>;
935  static constexpr bool is_mma_compatible = accessor_type::is_mma_compatible;
936  accessor_type accessor;
937  GhostAccessor<Float, nColor, order, native_ghost, storeFloat> ghostAccessor;
938 
940  static constexpr bool supports_ghost_zone = true;
941 
946  FieldOrder(GaugeField &U, void *gauge_=0, void **ghost_=0)
947  : volumeCB(U.VolumeCB()), nDim(U.Ndim()), geometry(U.Geometry()),
948  location(U.Location()),
949  accessor(U, gauge_, ghost_), ghostAccessor(U, gauge_, ghost_)
950  {
951  if (U.Reconstruct() != QUDA_RECONSTRUCT_NO)
952  errorQuda("GaugeField ordering not supported with reconstruction");
953  }
954 
955  FieldOrder(const FieldOrder &o) :
956  volumeCB(o.volumeCB), nDim(o.nDim), geometry(o.geometry), location(o.location),
957  accessor(o.accessor), ghostAccessor(o.ghostAccessor)
958  { }
959 
960  void resetScale(double max) {
961  accessor.resetScale(max);
962  ghostAccessor.resetScale(max);
963  }
964 
965  static constexpr bool fixedPoint() { return fixed_point<Float,storeFloat>(); }
966 
975  __device__ __host__ complex<Float> operator()(int d, int parity, int x, int row, int col) const
976  {
977  return accessor(d, parity, x, row, col);
978  }
979 
989  __device__ __host__ const auto wrap(int d, int parity, int x) const
990  {
991  return accessor.wrap(d, parity, x, 0, 0);
992  }
993 
997  __device__ __host__ auto wrap(int d, int parity, int x) { return accessor.wrap(d, parity, x, 0, 0); }
998 
1007  __device__ __host__ fieldorder_wrapper<Float, storeFloat> operator()(int d, int parity, int x, int row, int col)
1008  {
1009  return accessor(d, parity, x, row, col);
1010  }
1011 
1020  __device__ __host__ complex<Float> Ghost(int d, int parity, int x, int row, int col) const
1021  {
1022  return ghostAccessor(d, parity, x, row, col);
1023  }
1024 
1025  __device__ __host__ auto Ghost(int d, int parity, int x) const { return ghostAccessor(d, parity, x); }
1026 
1035  __device__ __host__ fieldorder_wrapper<Float, storeFloat> Ghost(int d, int parity, int x, int row, int col)
1036  {
1037  return ghostAccessor(d, parity, x, row, col);
1038  }
1049  __device__ __host__ const auto wrap_ghost(int d, int parity, int x) const
1050  {
1051  return ghostAccessor.wrap(d, parity, x, 0, 0);
1052  }
1053 
1057  __device__ __host__ auto wrap_ghost(int d, int parity, int x) { return ghostAccessor.wrap(d, parity, x, 0, 0); }
1058 
1069  __device__ __host__ inline const complex<Float> operator()(int d, int parity, int x, int s_row, int s_col,
1070  int c_row, int c_col) const
1071  {
1072  return (*this)(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col);
1073  }
1074 
1085  __device__ __host__ inline fieldorder_wrapper<Float, storeFloat> operator()(int d, int parity, int x, int s_row,
1086  int s_col, int c_row, int c_col)
1087  {
1088  return (*this)(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col);
1089  }
1090 
1101  __device__ __host__ inline const auto wrap(int d, int parity, int x, int s_row, int s_col) const
1102  {
1103  return accessor.wrap(d, parity, x, s_row * nColorCoarse, s_col * nColorCoarse);
1104  }
1105 
1109  __device__ __host__ inline auto wrap(int d, int parity, int x, int s_row, int s_col)
1110  {
1111  return accessor.wrap(d, parity, x, s_row * nColorCoarse, s_col * nColorCoarse);
1112  }
1113 
1124  __device__ __host__ inline complex<Float> Ghost(int d, int parity, int x, int s_row, int s_col, int c_row,
1125  int c_col) const
1126  {
1127  return Ghost(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col);
1128  }
1129 
1140  __device__ __host__ inline fieldorder_wrapper<Float, storeFloat> Ghost(int d, int parity, int x, int s_row,
1141  int s_col, int c_row, int c_col)
1142  {
1143  return Ghost(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col);
1144  }
1155  __device__ __host__ inline const auto wrap_ghost(int d, int parity, int x, int s_row, int s_col) const
1156  {
1157  return ghostAccessor.wrap(d, parity, x, s_row * nColorCoarse, s_col * nColorCoarse);
1158  }
1159 
1163  __device__ __host__ inline auto wrap_ghost(int d, int parity, int x, int s_row, int s_col)
1164  {
1165  return ghostAccessor.wrap(d, parity, x, s_row * nColorCoarse, s_col * nColorCoarse);
1166  }
1167 
1168  template <typename theirFloat>
1169  __device__ __host__ inline void atomicAdd(int d, int parity, int x, int s_row, int s_col,
1170  int c_row, int c_col, const complex<theirFloat> &val) {
1171  accessor.atomic_add(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col, val);
1172  }
1173 
1175  __device__ __host__ inline int Ncolor() const { return nColor; }
1176 
1178  __device__ __host__ inline int Volume() const { return 2*volumeCB; }
1179 
1181  __device__ __host__ inline int VolumeCB() const { return volumeCB; }
1182 
1184  __device__ __host__ inline int Ndim() const { return nDim; }
1185 
1187  __device__ __host__ inline int Geometry() const { return geometry; }
1188 
1190  __device__ __host__ inline int NspinCoarse() const { return nSpinCoarse; }
1191 
1193  __device__ __host__ inline int NcolorCoarse() const { return nColorCoarse; }
1194 
1200  __host__ double norm1(int dim=-1, bool global=true) const {
1201  double nrm1 = accessor.transform_reduce(location, dim, abs_<double, storeFloat>(accessor.scale_inv), 0.0,
1202  plus<double>());
1203  if (global) comm_allreduce(&nrm1);
1204  return nrm1;
1205  }
1206 
1212  __host__ double norm2(int dim=-1, bool global=true) const {
1213  double nrm2 = accessor.transform_reduce(location, dim, square_<double, storeFloat>(accessor.scale_inv), 0.0,
1214  plus<double>());
1215  if (global) comm_allreduce(&nrm2);
1216  return nrm2;
1217  }
1218 
1224  __host__ double abs_max(int dim=-1, bool global=true) const {
1225  double absmax = accessor.transform_reduce(location, dim, abs_<Float, storeFloat>(accessor.scale_inv), 0.0,
1226  maximum<Float>());
1227  if (global) comm_allreduce_max(&absmax);
1228  return absmax;
1229  }
1230 
1236  __host__ double abs_min(int dim=-1, bool global=true) const {
1237  double absmin = accessor.transform_reduce(location, dim, abs_<Float, storeFloat>(accessor.scale_inv),
1238  std::numeric_limits<double>::max(), minimum<Float>());
1239  if (global) comm_allreduce_min(&absmin);
1240  return absmin;
1241  }
1242 
1244  size_t Bytes() const { return static_cast<size_t>(volumeCB) * nColor * nColor * 2ll * sizeof(storeFloat); }
1245  };
1246 
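The spin-aware FieldOrder overloads above flatten a (spin, color) pair into a single "fat" color index, s*nColorCoarse + c. A standalone sketch of that flattening (hypothetical sizes, not QUDA API):

  // Coarse-link index flattening used by the (d, parity, x, s_row, s_col, c_row, c_col) overloads.
  #include <cstdio>

  int main() {
    const int nSpinCoarse = 2, nColor = 48;        // template parameters of FieldOrder
    const int nColorCoarse = nColor / nSpinCoarse; // 24
    int s_row = 1, c_row = 5, s_col = 0, c_col = 7;
    int row = s_row * nColorCoarse + c_row;        // 29
    int col = s_col * nColorCoarse + c_col;        // 7
    std::printf("flattened (row,col) = (%d,%d) of %dx%d\n", row, col, nColor, nColor);
  }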
1255  template <int N, typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase = QUDA_STAGGERED_PHASE_NO>
1256  struct Reconstruct {
1257  using real = typename mapper<Float>::type;
1258  using complex = complex<real>;
1259  const real scale;
1260  const real scale_inv;
1261  Reconstruct(const GaugeField &u) :
1262  scale(isFixed<Float>::value ? u.LinkMax() : 1.0),
1263  scale_inv(isFixed<Float>::value ? 1.0 / scale : 1.0)
1264  {
1265  }
1266 
1267  Reconstruct(const Reconstruct<N, Float, ghostExchange_> &recon) : scale(recon.scale), scale_inv(recon.scale_inv)
1268  {
1269  }
1270 
1271  __device__ __host__ inline void Pack(real out[N], const complex in[N / 2], int idx) const
1272  {
1273  if (isFixed<Float>::value) {
1274 #pragma unroll
1275  for (int i = 0; i < N / 2; i++) {
1276  out[2 * i + 0] = scale_inv * in[i].real();
1277  out[2 * i + 1] = scale_inv * in[i].imag();
1278  }
1279  } else {
1280 #pragma unroll
1281  for (int i = 0; i < N / 2; i++) {
1282  out[2 * i + 0] = in[i].real();
1283  out[2 * i + 1] = in[i].imag();
1284  }
1285  }
1286  }
1287 
1288  template <typename I>
1289  __device__ __host__ inline void Unpack(complex out[N / 2], const real in[N], int idx, int dir, real phase,
1290  const I *X, const int *R) const
1291  {
1292  if (isFixed<Float>::value) {
1293 #pragma unroll
1294  for (int i = 0; i < N / 2; i++) { out[i] = scale * complex(in[2 * i + 0], in[2 * i + 1]); }
1295  } else {
1296 #pragma unroll
1297  for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); }
1298  }
1299  }
1300  __device__ __host__ inline real getPhase(const complex in[N / 2]) const { return 0; }
1301  };
1302 
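For fixed-point storage the generic (no-reconstruction) Pack/Unpack above simply rescale by the link maximum: Pack divides by it (scale_inv), Unpack multiplies it back (scale). A standalone sketch of that round trip (illustrative values, not QUDA API):

  // Pack/Unpack scale round trip for a single complex element.
  #include <cstdio>
  #include <complex>

  int main() {
    const double scale = 1.5, scale_inv = 1.0 / scale;  // e.g. u.LinkMax()
    std::complex<double> in = {0.3, -1.2};
    double out[2] = {scale_inv * in.real(), scale_inv * in.imag()}; // Pack
    std::complex<double> back = {scale * out[0], scale * out[1]};   // Unpack
    std::printf("(%g,%g) -> (%g,%g)\n", in.real(), in.imag(), back.real(), back.imag());
  }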
1314  template <QudaGhostExchange ghostExchange_, typename T, typename I>
1315  __device__ __host__ inline T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM],
1316  T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound, bool isFirstTimeSlice,
1317  bool isLastTimeSlice, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_NO)
1318  {
1319 
1320  // MWTODO: should this return tBoundary : scale or tBoundary*scale : scale
1321 
1322  if (ghostExchange_ == QUDA_GHOST_EXCHANGE_PAD
1323  || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED)) {
1324  if (idx >= firstTimeSliceBound) { // halo region on the first time slice
1325  return isFirstTimeSlice ? tBoundary : scale;
1326  } else if (idx >= lastTimeSliceBound) { // last link on the last time slice
1327  return isLastTimeSlice ? tBoundary : scale;
1328  } else {
1329  return scale;
1330  }
1331  } else if (ghostExchange_ == QUDA_GHOST_EXCHANGE_EXTENDED
1332  || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED)) {
1333  if (idx >= (R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < R[3] * X[0] * X[1] * X[2] / 2) {
1334  // the boundary condition is on the R[3]-1 time slice
1335  return isFirstTimeSlice ? tBoundary : scale;
1336  } else if (idx >= (X[3] - R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < (X[3] - R[3]) * X[0] * X[1] * X[2] / 2) {
1337  // the boundary condition lies on the X[3]-R[3]-1 time slice
1338  return isLastTimeSlice ? tBoundary : scale;
1339  } else {
1340  return scale;
1341  }
1342  }
1343  return scale;
1344  }
1345 
1346  // not actually used - here for reference
1347  template <typename Float, typename I>
1348  __device__ __host__ inline Float milcStaggeredPhase(int dim, const int x[], const I R[]) {
1349  // could consider non-extended variant too?
1350  Float sign = static_cast<Float>(1.0);
1351  switch (dim) {
1352  case 0: if ( ((x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
1353  case 1: if ( ((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
1354  case 2: if ( ((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
1355  }
1356  return sign;
1357  }
1358 
1366  template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<12, Float, ghostExchange_> {
1367  using real = typename mapper<Float>::type;
1368  using complex = complex<real>;
1369  const real anisotropy;
1370  const real tBoundary;
1371  const int firstTimeSliceBound;
1372  const int lastTimeSliceBound;
1373  const bool isFirstTimeSlice;
1374  const bool isLastTimeSlice;
1375  const QudaGhostExchange ghostExchange;
1376 
1377  Reconstruct(const GaugeField &u) :
1378  anisotropy(u.Anisotropy()),
1379  tBoundary(static_cast<real>(u.TBoundary())),
1380  firstTimeSliceBound(u.VolumeCB()),
1381  lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2),
1382  isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
1383  isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false),
1384  ghostExchange(u.GhostExchange())
1385  {
1386  }
1387 
1388  Reconstruct(const Reconstruct<12, Float, ghostExchange_> &recon) :
1389  anisotropy(recon.anisotropy),
1390  tBoundary(recon.tBoundary),
1391  firstTimeSliceBound(recon.firstTimeSliceBound),
1392  lastTimeSliceBound(recon.lastTimeSliceBound),
1393  isFirstTimeSlice(recon.isFirstTimeSlice),
1394  isLastTimeSlice(recon.isLastTimeSlice),
1395  ghostExchange(recon.ghostExchange)
1396  {
1397  }
1398 
1399  __device__ __host__ inline void Pack(real out[12], const complex in[9], int idx) const
1400  {
1401 #pragma unroll
1402  for (int i = 0; i < 6; i++) {
1403  out[2 * i + 0] = in[i].real();
1404  out[2 * i + 1] = in[i].imag();
1405  }
1406  }
1407 
1408  template <typename I>
1409  __device__ __host__ inline void Unpack(complex out[9], const real in[12], int idx, int dir, real phase,
1410  const I *X, const int *R) const
1411  {
1412 #pragma unroll
1413  for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]);
1414 
1415  const real u0 = dir < 3 ?
1416  anisotropy :
1417  timeBoundary<ghostExchange_>(idx, X, R, tBoundary, static_cast<real>(1.0), firstTimeSliceBound,
1418  lastTimeSliceBound, isFirstTimeSlice, isLastTimeSlice, ghostExchange);
1419 
1420  // out[6] = u0*conj(out[1]*out[5] - out[2]*out[4]);
1421  out[6] = cmul(out[2], out[4]);
1422  out[6] = cmac(out[1], out[5], -out[6]);
1423  out[6] = u0 * conj(out[6]);
1424 
1425  // out[7] = u0*conj(out[2]*out[3] - out[0]*out[5]);
1426  out[7] = cmul(out[0], out[5]);
1427  out[7] = cmac(out[2], out[3], -out[7]);
1428  out[7] = u0 * conj(out[7]);
1429 
1430  // out[8] = u0*conj(out[0]*out[4] - out[1]*out[3]);
1431  out[8] = cmul(out[1], out[3]);
1432  out[8] = cmac(out[0], out[4], -out[8]);
1433  out[8] = u0 * conj(out[8]);
1434  }
1435 
1436  __device__ __host__ inline real getPhase(const complex in[9]) { return 0; }
1437  };
1438 
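Reconstruct<12> stores only the first two rows of the SU(3) link; Unpack rebuilds the third row as u0 times the complex conjugate of the cross product of the stored rows, which is what the cmul/cmac sequence above evaluates. A standalone check with std::complex on a simple SU(3) example (illustrative, not QUDA API):

  // Third-row reconstruction: row2 = u0 * conj(row0 x row1).
  #include <cstdio>
  #include <complex>
  using cplx = std::complex<double>;

  int main() {
    // first two rows, out[0..5] in the layout above: (1,0,0) and (0,i,0)
    cplx o[9] = {{1,0},{0,0},{0,0}, {0,0},{0,1},{0,0}};
    double u0 = 1.0; // anisotropy or time-boundary factor
    o[6] = u0 * std::conj(o[1]*o[5] - o[2]*o[4]);
    o[7] = u0 * std::conj(o[2]*o[3] - o[0]*o[5]);
    o[8] = u0 * std::conj(o[0]*o[4] - o[1]*o[3]);
    std::printf("row2 = (%g,%g) (%g,%g) (%g,%g)\n",
                o[6].real(), o[6].imag(), o[7].real(), o[7].imag(),
                o[8].real(), o[8].imag()); // (0,0) (0,0) (0,-1)
  }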
1449  template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<11, Float, ghostExchange_> {
1450  using real = typename mapper<Float>::type;
1451  using complex = complex<real>;
1452 
1453  Reconstruct(const GaugeField &u) { ; }
1454  Reconstruct(const Reconstruct<11, Float, ghostExchange_> &recon) { ; }
1455 
1456  __device__ __host__ inline void Pack(real out[10], const complex in[9], int idx) const
1457  {
1458 #pragma unroll
1459  for (int i = 0; i < 2; i++) {
1460  out[2 * i + 0] = in[i + 1].real();
1461  out[2 * i + 1] = in[i + 1].imag();
1462  }
1463  out[4] = in[5].real();
1464  out[5] = in[5].imag();
1465  out[6] = in[0].imag();
1466  out[7] = in[4].imag();
1467  out[8] = in[8].imag();
1468  out[9] = 0.0;
1469  }
1470 
1471  template <typename I>
1472  __device__ __host__ inline void Unpack(complex out[9], const real in[10], int idx, int dir, real phase,
1473  const I *X, const int *R) const
1474  {
1475  out[0] = complex(0.0, in[6]);
1476  out[1] = complex(in[0], in[1]);
1477  out[2] = complex(in[2], in[3]);
1478  out[3] = complex(-out[1].real(), out[1].imag());
1479  out[4] = complex(0.0, in[7]);
1480  out[5] = complex(in[4], in[5]);
1481  out[6] = complex(-out[2].real(), out[2].imag());
1482  out[7] = complex(-out[5].real(), out[5].imag());
1483  out[8] = complex(0.0, in[8]);
1484  }
1485 
1486  __device__ __host__ inline real getPhase(const complex in[9]) { return 0; }
1487  };
1488 
1497  template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase>
1498  struct Reconstruct<13, Float, ghostExchange_, stag_phase> {
1499  using real = typename mapper<Float>::type;
1500  using complex = complex<real>;
1501  const Reconstruct<12, Float, ghostExchange_> reconstruct_12;
1502  const real scale;
1503  const real scale_inv;
1504 
1505  Reconstruct(const GaugeField &u) : reconstruct_12(u), scale(u.Scale()), scale_inv(1.0 / scale) {}
1506  Reconstruct(const Reconstruct<13, Float, ghostExchange_, stag_phase> &recon) :
1507  reconstruct_12(recon.reconstruct_12),
1508  scale(recon.scale),
1509  scale_inv(recon.scale_inv)
1510  {
1511  }
1512 
1513  __device__ __host__ inline void Pack(real out[12], const complex in[9], int idx) const
1514  {
1515  reconstruct_12.Pack(out, in, idx);
1516  }
1517 
1518  template <typename I>
1519  __device__ __host__ inline void Unpack(complex out[9], const real in[12], int idx, int dir, real phase,
1520  const I *X, const int *R) const
1521  {
1522 #pragma unroll
1523  for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]);
1524 
1525  out[6] = cmul(out[2], out[4]);
1526  out[6] = cmac(out[1], out[5], -out[6]);
1527  out[6] = scale_inv * conj(out[6]);
1528 
1529  out[7] = cmul(out[0], out[5]);
1530  out[7] = cmac(out[2], out[3], -out[7]);
1531  out[7] = scale_inv * conj(out[7]);
1532 
1533  out[8] = cmul(out[1], out[3]);
1534  out[8] = cmac(out[0], out[4], -out[8]);
1535  out[8] = scale_inv * conj(out[8]);
1536 
1537  if (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phasing
1538  // Multiply the third row by exp(I*3*phase), since the cross product will end up in a scale factor of exp(-I*2*phase)
1539  real cos_sin[2];
1540  Trig<isFixed<real>::value, real>::SinCos(static_cast<real>(3. * phase), &cos_sin[1], &cos_sin[0]);
1541  complex A(cos_sin[0], cos_sin[1]);
1542  out[6] = cmul(A, out[6]);
1543  out[7] = cmul(A, out[7]);
1544  out[8] = cmul(A, out[8]);
1545  } else { // phase is +/- 1 so real multiply is sufficient
1546  out[6] *= phase;
1547  out[7] *= phase;
1548  out[8] *= phase;
1549  }
1550  }
1551 
1552  __device__ __host__ inline real getPhase(const complex in[9]) const
1553  {
1554 #if 1 // phase from cross product
1555  // denominator = (U[0][0]*U[1][1] - U[0][1]*U[1][0])*
1556  complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv;
1557  complex expI3Phase = in[8] / denom; // numerator = U[2][2]
1558 
1559  if (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phasing
1560  return arg(expI3Phase) / static_cast<real>(3.0);
1561  } else {
1562  return expI3Phase.real() > 0 ? 1 : -1;
1563  }
1564 #else // phase from determinant
1565  Matrix<complex, 3> a;
1566 #pragma unroll
1567  for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i];
1568  const complex det = getDeterminant(a);
1569  return arg(det) / 3;
1570 #endif
1571  }
1572  };
1573 
1581  template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<8, Float, ghostExchange_> {
1582  using real = typename mapper<Float>::type;
1583  using complex = complex<real>;
1584  const complex anisotropy; // imaginary value stores inverse
1585  const complex tBoundary; // imaginary value stores inverse
1586  const int firstTimeSliceBound;
1587  const int lastTimeSliceBound;
1588  const bool isFirstTimeSlice;
1589  const bool isLastTimeSlice;
1590  const QudaGhostExchange ghostExchange;
1591 
1592  // scale factor is set when using recon-9
1593  Reconstruct(const GaugeField &u, real scale = 1.0) :
1594  anisotropy(u.Anisotropy() * scale, 1.0 / (u.Anisotropy() * scale)),
1595  tBoundary(static_cast<real>(u.TBoundary()) * scale, 1.0 / (static_cast<real>(u.TBoundary()) * scale)),
1596  firstTimeSliceBound(u.VolumeCB()),
1597  lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2),
1598  isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
1599  isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false),
1600  ghostExchange(u.GhostExchange())
1601  {
1602  }
1603 
1604  Reconstruct(const Reconstruct<8, Float, ghostExchange_> &recon) :
1605  anisotropy(recon.anisotropy),
1606  tBoundary(recon.tBoundary),
1607  firstTimeSliceBound(recon.firstTimeSliceBound),
1608  lastTimeSliceBound(recon.lastTimeSliceBound),
1609  isFirstTimeSlice(recon.isFirstTimeSlice),
1610  isLastTimeSlice(recon.isLastTimeSlice),
1611  ghostExchange(recon.ghostExchange)
1612  {
1613  }
1614 
1615  // Pack and unpack are described in https://arxiv.org/pdf/0911.3191.pdf
1616  // Method was modified to avoid the singularity at unit gauge by
1617  // compressing the matrix {{b1,b2,b3},{a1,a2,a3},{-c1,-c2,-c3}}
1618  // instead of {{a1,a2,a3},{b1,b2,b3},{c1,c2,c3}}
1619 
1620  __device__ __host__ inline void Pack(real out[8], const complex in[9], int idx) const
1621  {
1622  out[0] = Trig<isFixed<Float>::value, real>::Atan2(in[3].imag(), in[3].real()); // a1 -> b1
1623  out[1] = Trig<isFixed<Float>::value, real>::Atan2(-in[6].imag(), -in[6].real()); // c1 -> -c1
1624 
1625  out[2] = in[4].real();
1626  out[3] = in[4].imag(); // a2 -> b2
1627  out[4] = in[5].real();
1628  out[5] = in[5].imag(); // a3 -> b3
1629  out[6] = in[0].real();
1630  out[7] = in[0].imag(); // b1 -> a1
1631  }
1632 
1633  template <typename I>
1634  __device__ __host__ inline void Unpack(complex out[9], const real in[8], int idx, int dir, real phase,
1635  const I *X, const int *R, const complex scale, const complex u) const
1636  {
1637  real u0 = u.real();
1638  real u0_inv = u.imag();
1639 
1640 #pragma unroll
1641  for (int i = 1; i <= 3; i++)
1642  out[i] = complex(in[2 * i + 0], in[2 * i + 1]); // these elements are copied directly
1643 
1644  real tmp[2];
1645  Trig<isFixed<Float>::value, real>::SinCos(in[0], &tmp[1], &tmp[0]);
1646  out[0] = complex(tmp[0], tmp[1]);
1647 
1648  Trig<isFixed<Float>::value, real>::SinCos(in[1], &tmp[1], &tmp[0]);
1649  out[6] = complex(tmp[0], tmp[1]);
1650 
1651  // First, reconstruct first row
1652  real row_sum = out[1].real() * out[1].real();
1653  row_sum += out[1].imag() * out[1].imag();
1654  row_sum += out[2].real() * out[2].real();
1655  row_sum += out[2].imag() * out[2].imag();
1656  real row_sum_inv = static_cast<real>(1.0) / row_sum;
1657 
1658  real diff = u0_inv * u0_inv - row_sum;
1659  real U00_mag = diff > 0.0 ? diff * rsqrt(diff) : static_cast<real>(0.0);
1660 
1661  out[0] *= U00_mag;
1662 
1663  // Second, reconstruct first column
1664  real column_sum = out[0].real() * out[0].real();
1665  column_sum += out[0].imag() * out[0].imag();
1666  column_sum += out[3].real() * out[3].real();
1667  column_sum += out[3].imag() * out[3].imag();
1668 
1669  diff = u0_inv * u0_inv - column_sum;
1670  real U20_mag = diff > 0.0 ? diff * rsqrt(diff) : static_cast<real>(0.0);
1671 
1672  out[6] *= U20_mag;
1673 
1674  // Finally, reconstruct last elements from SU(2) rotation
1675  real r_inv2 = u0_inv * row_sum_inv;
1676  {
1677  complex A = cmul(conj(out[0]), out[3]);
1678 
1679  // out[4] = -(conj(out[6])*conj(out[2]) + u0*A*out[1])*r_inv2; // U11
1680  out[4] = cmul(conj(out[6]), conj(out[2]));
1681  out[4] = cmac(u0 * A, out[1], out[4]);
1682  out[4] = -r_inv2 * out[4];
1683 
1684  // out[5] = (conj(out[6])*conj(out[1]) - u0*A*out[2])*r_inv2; // U12
1685  out[5] = cmul(conj(out[6]), conj(out[1]));
1686  out[5] = cmac(-u0 * A, out[2], out[5]);
1687  out[5] = r_inv2 * out[5];
1688  }
1689 
1690  {
1691  complex A = cmul(conj(out[0]), out[6]);
1692 
1693  // out[7] = (conj(out[3])*conj(out[2]) - u0*A*out[1])*r_inv2; // U21
1694  out[7] = cmul(conj(out[3]), conj(out[2]));
1695  out[7] = cmac(-u0 * A, out[1], out[7]);
1696  out[7] = r_inv2 * out[7];
1697 
1698  // out[8] = -(conj(out[3])*conj(out[1]) + u0*A*out[2])*r_inv2; // U22
1699  out[8] = cmul(conj(out[3]), conj(out[1]));
1700  out[8] = cmac(u0 * A, out[2], out[8]);
1701  out[8] = -r_inv2 * out[8];
1702  }
1703 
1704  // Rearrange {{b1,b2,b3},{a1,a2,a3},{-c1,-c2,-c3}} back
1705  // to {{a1,a2,a3},{b1,b2,b3},{c1,c2,c3}}
1706 #pragma unroll
1707  for (int i = 0; i < 3; i++) {
1708  const auto tmp = out[i];
1709  out[i] = out[i + 3];
1710  out[i + 3] = tmp;
1711  out[i + 6] = -out[i + 6];
1712  }
1713  }
1714 
1715  template <typename I>
1716  __device__ __host__ inline void
1717  Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R,
1718  const complex scale = complex(static_cast<real>(1.0), static_cast<real>(1.0))) const
1719  {
1720  complex u = dir < 3 ?
1721  anisotropy :
1722  timeBoundary<ghostExchange_>(idx, X, R, tBoundary, scale, firstTimeSliceBound, lastTimeSliceBound,
1723  isFirstTimeSlice, isLastTimeSlice, ghostExchange);
1724  Unpack(out, in, idx, dir, phase, X, R, scale, u);
1725  }
1726 
1727  __device__ __host__ inline real getPhase(const complex in[9]) { return 0; }
1728  };
1729 
1738  template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase>
1739  struct Reconstruct<9, Float, ghostExchange_, stag_phase> {
1740  using real = typename mapper<Float>::type;
1741  using complex = complex<real>;
1742  const Reconstruct<8, Float, ghostExchange_> reconstruct_8;
1743  const real scale;
1744  const real scale_inv;
1745 
1746  Reconstruct(const GaugeField &u) : reconstruct_8(u), scale(u.Scale()), scale_inv(1.0 / scale) {}
1747 
1748  Reconstruct(const Reconstruct<9, Float, ghostExchange_, stag_phase> &recon) :
1749  reconstruct_8(recon.reconstruct_8),
1750  scale(recon.scale),
1751  scale_inv(recon.scale_inv)
1752  {
1753  }
1754 
1755  __device__ __host__ inline real getPhase(const complex in[9]) const
1756  {
1757 #if 1 // phase from cross product
1758  // denominator = (U[0][0]*U[1][1] - U[0][1]*U[1][0])*
1759  complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv;
1760  complex expI3Phase = in[8] / denom; // numerator = U[2][2]
1761  if (stag_phase == QUDA_STAGGERED_PHASE_NO) {
1762  return arg(expI3Phase) / static_cast<real>(3.0);
1763  } else {
1764  return expI3Phase.real() > 0 ? 1 : -1;
1765  }
1766 #else // phase from determinant
1767  Matrix<complex, 3> a;
1768 #pragma unroll
1769  for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i];
1770  const complex det = getDeterminant(a);
1771  real phase = arg(det) / 3;
1772  return phase;
1773 #endif
1774  }
1775 
1776  // Rescale the U3 input matrix by exp(-I*phase) to obtain an SU3 matrix multiplied by a real scale factor,
1777  __device__ __host__ inline void Pack(real out[8], const complex in[9], int idx) const
1778  {
1779  real phase = getPhase(in);
1780  complex su3[9];
1781 
1782  if (stag_phase == QUDA_STAGGERED_PHASE_NO) {
1783  real cos_sin[2];
1784  Trig<isFixed<real>::value, real>::SinCos(static_cast<real>(-phase), &cos_sin[1], &cos_sin[0]);
1785  complex z(cos_sin[0], cos_sin[1]);
1786  z *= scale_inv;
1787 #pragma unroll
1788  for (int i = 0; i < 9; i++) su3[i] = cmul(z, in[i]);
1789  } else {
1790 #pragma unroll
1791  for (int i = 0; i < 9; i++) { su3[i] = phase * in[i]; }
1792  }
1793  reconstruct_8.Pack(out, su3, idx);
1794  }
1795 
1796  template <typename I>
1797  __device__ __host__ inline void Unpack(complex out[9], const real in[8], int idx, int dir, real phase,
1798  const I *X, const int *R) const
1799  {
1800  reconstruct_8.Unpack(out, in, idx, dir, phase, X, R, complex(static_cast<real>(1.0), static_cast<real>(1.0)),
1801  complex(static_cast<real>(1.0), static_cast<real>(1.0)));
1802 
1803  if (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phase
1804  real cos_sin[2];
1805  Trig<isFixed<real>::value, real>::SinCos(static_cast<real>(phase), &cos_sin[1], &cos_sin[0]);
1806  complex z(cos_sin[0], cos_sin[1]);
1807  z *= scale;
1808 #pragma unroll
1809  for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]);
1810  } else { // static phase
1811 #pragma unroll
1812  for (int i = 0; i < 9; i++) { out[i] *= phase; }
1813  }
1814  }
1815  };
1816 
1817  __host__ __device__ constexpr int ct_sqrt(int n, int i = 1)
1818  {
1819  return n == i ? n : (i * i < n ? ct_sqrt(n, i + 1) : i);
1820  }
1821 
1827  __host__ __device__ constexpr int Ncolor(int length) { return ct_sqrt(length / 2); }
1828 
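For SU(3) links stored as 18 reals, Ncolor(18) must recover 3. A compile-time check of the same helpers (renamed standalone copies, outside this header):

  // ct_sqrt is an integer square root by linear search; Ncolor(length) = sqrt(length/2).
  constexpr int ct_sqrt_demo(int n, int i = 1) { return n == i ? n : (i * i < n ? ct_sqrt_demo(n, i + 1) : i); }
  constexpr int Ncolor_demo(int length) { return ct_sqrt_demo(length / 2); }
  static_assert(Ncolor_demo(18) == 3, "18 reals per link -> SU(3)");
  static_assert(Ncolor_demo(32) == 4, "32 reals per link -> 4 colors");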
1829  // we default to huge allocations for gauge field (for now)
1830  constexpr bool default_huge_alloc = true;
1831 
1832  template <QudaStaggeredPhase phase> __host__ __device__ inline bool static_phase()
1833  {
1834  switch (phase) {
1835  case QUDA_STAGGERED_PHASE_MILC:
1836  case QUDA_STAGGERED_PHASE_CPS:
1837  case QUDA_STAGGERED_PHASE_TIFR: return true;
1838  default: return false;
1839  }
1840  }
1841 
1842  template <typename Float, int length, int N, int reconLenParam,
1843  QudaStaggeredPhase stag_phase = QUDA_STAGGERED_PHASE_NO, bool huge_alloc = default_huge_alloc,
1844  QudaGhostExchange ghostExchange_ = QUDA_GHOST_EXCHANGE_INVALID, bool use_inphase = false>
1845  struct FloatNOrder {
1846  using Accessor
1847  = FloatNOrder<Float, length, N, reconLenParam, stag_phase, huge_alloc, ghostExchange_, use_inphase>;
1848 
1849  using real = typename mapper<Float>::type;
1850  using complex = complex<real>;
1851  typedef typename VectorType<Float, N>::type Vector;
1852  typedef typename AllocType<huge_alloc>::type AllocInt;
1853  Reconstruct<reconLenParam, Float, ghostExchange_, stag_phase> reconstruct;
1854  static constexpr int reconLen = (reconLenParam == 11) ? 10 : reconLenParam;
1855  static constexpr int hasPhase = (reconLen == 9 || reconLen == 13) ? 1 : 0;
1856  Float *gauge;
1857  const AllocInt offset;
1858  Float *ghost[4];
1859  QudaGhostExchange ghostExchange;
1860  int X[QUDA_MAX_DIM];
1861  int R[QUDA_MAX_DIM];
1863  const int volumeCB;
1864  int faceVolumeCB[4];
1865  const int stride;
1866  const int geometry;
1867  const AllocInt phaseOffset;
1868  void *backup_h;
1869  size_t bytes;
1870 
1871  FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0, bool override = false) :
1872  reconstruct(u),
1873  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
1874  offset(u.Bytes() / (2 * sizeof(Float) * N)),
1875  ghostExchange(u.GhostExchange()),
1876  volumeCB(u.VolumeCB()),
1877  stride(u.Stride()),
1878  geometry(u.Geometry()),
1879  phaseOffset(u.PhaseOffset() / sizeof(Float)),
1880  backup_h(nullptr),
1881  bytes(u.Bytes())
1882  {
1883  if (u.Geometry() == QUDA_COARSE_GEOMETRY)
1884  errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone)");
1885 
1886  // static_assert( !(stag_phase!=QUDA_STAGGERED_PHASE_NO && reconLenParam != 18 && reconLenParam != 12),
1887  // "staggered phase only presently supported for 18 and 12 reconstruct");
1888  for (int i = 0; i < 4; i++) {
1889  X[i] = u.X()[i];
1890  R[i] = u.R()[i];
1891  ghost[i] = ghost_ ? ghost_[i] : 0;
1892  faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth
1893  }
1894  }
1895 
1896  FloatNOrder(const FloatNOrder &order) :
1897  reconstruct(order.reconstruct),
1898  gauge(order.gauge),
 1899  offset(order.offset),
 1900  ghostExchange(order.ghostExchange),
 1901  volumeCB(order.volumeCB),
1902  stride(order.stride),
1903  geometry(order.geometry),
1904  phaseOffset(order.phaseOffset),
1905  backup_h(nullptr),
1906  bytes(order.bytes)
1907  {
1908  for (int i = 0; i < 4; i++) {
1909  X[i] = order.X[i];
1910  R[i] = order.R[i];
1911  ghost[i] = order.ghost[i];
1912  faceVolumeCB[i] = order.faceVolumeCB[i];
1913  }
1914  }
1915 
1916  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
1917  {
1918  const int M = reconLen / N;
1919  real tmp[reconLen];
1920 
1921 #pragma unroll
1922  for (int i=0; i<M; i++){
1923  // first load from memory
1924  Vector vecTmp = vector_load<Vector>(gauge, parity * offset + (dir * M + i) * stride + x);
1925  // second do copy converting into register type
1926 #pragma unroll
1927  for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j]);
1928  }
1929 
1930  real phase = 0.;
1931  if (hasPhase) {
1932  if (static_phase<stag_phase>() && (reconLen == 13 || use_inphase)) {
1933  phase = inphase;
1934  } else {
1935  copy(phase, gauge[parity * offset * N + phaseOffset + stride * dir + x]);
1936  phase *= static_cast<real>(2.0) * static_cast<real>(M_PI);
1937  }
1938  }
1939 
1940  reconstruct.Unpack(v, tmp, x, dir, phase, X, R);
1941  }
1942 
1943  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
1944  {
1945  const int M = reconLen / N;
1946  real tmp[reconLen];
1947  reconstruct.Pack(tmp, v, x);
1948 
1949 #pragma unroll
1950  for (int i=0; i<M; i++){
1951  Vector vecTmp;
1952  // first do copy converting into storage type
1953 #pragma unroll
1954  for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], tmp[i*N+j]);
1955  // second do vectorized copy into memory
1956  vector_store(gauge, parity * offset + x + (dir * M + i) * stride, vecTmp);
1957  }
1958  if (hasPhase) {
1959  real phase = reconstruct.getPhase(v);
1960  copy(gauge[parity * offset * N + phaseOffset + dir * stride + x], static_cast<real>(phase / (2. * M_PI)));
1961  }
1962  }
1963 
1974  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity, real phase = 1.0)
1975  {
1976  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity, phase);
1977  }
1978 
1989  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity,
1990  real phase = 1.0) const
1991  {
1992  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity, phase);
1993  }
1994 
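  // Editorial sketch (not part of the original header): the two operator() overloads above
  // return a gauge_wrapper, so kernels can read and write a link with assignment syntax,
  // e.g. (illustrative only, names are placeholders):
  //
  //   using Gauge = typename gauge_mapper<float, QUDA_RECONSTRUCT_12>::type;
  //   Gauge gauge(U);                                            // U is a GaugeField
  //   Matrix<complex<float>, 3> link = gauge(dim, x_cb, parity); // unpacks via load()
  //   gauge(dim, x_cb, parity) = link;                           // repacks via save()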
1995  __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
1996  {
1997  if (!ghost[dir]) { // load from main field not separate array
1998  load(v, volumeCB + x, dir, parity, inphase); // an offset of size volumeCB puts us at the padded region
1999  // This also works perfectly when phases are stored. No need to change this.
2000  } else {
2001  const int M = reconLen / N;
2002  real tmp[reconLen];
2003 
2004 #pragma unroll
2005  for (int i=0; i<M; i++) {
2006  // first do vectorized copy from memory into registers
2007  Vector vecTmp = vector_load<Vector>(
2008  ghost[dir] + parity * faceVolumeCB[dir] * (M * N + hasPhase), i * faceVolumeCB[dir] + x);
2009  // second do copy converting into register type
2010 #pragma unroll
2011  for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j]);
2012  }
2013  real phase = 0.;
2014 
2015  if (hasPhase) {
2016 
2017  // if(stag_phase == QUDA_STAGGERED_PHASE_MILC ) {
2018  // phase = inphase < static_cast<Float>(0) ? static_cast<Float>(-1./(2.*M_PI)) : static_cast<Float>(1./2.*M_PI);
2019  // } else {
2020  copy(phase, ghost[dir][parity * faceVolumeCB[dir] * (M * N + 1) + faceVolumeCB[dir] * M * N + x]);
2021  phase *= static_cast<real>(2.0) * static_cast<real>(M_PI);
2022  // }
2023  }
2024  reconstruct.Unpack(v, tmp, x, dir, phase, X, R);
2025  }
2026  }
2027 
2028  __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dir, int parity)
2029  {
2030  if (!ghost[dir]) { // store in main field not separate array
2031  save(v, volumeCB + x, dir, parity); // an offset of size volumeCB puts us at the padded region
2032  } else {
2033  const int M = reconLen / N;
2034  real tmp[reconLen];
2035  reconstruct.Pack(tmp, v, x);
2036 
2037 #pragma unroll
2038  for (int i=0; i<M; i++) {
2039  Vector vecTmp;
2040  // first do copy converting into storage type
2041 #pragma unroll
2042  for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], tmp[i*N+j]);
2043  // second do vectorized copy into memory
2044  vector_store(ghost[dir]+parity*faceVolumeCB[dir]*(M*N + hasPhase), i*faceVolumeCB[dir]+x, vecTmp);
2045  }
2046 
2047  if (hasPhase) {
2048  real phase = reconstruct.getPhase(v);
2049  copy(ghost[dir][parity * faceVolumeCB[dir] * (M * N + 1) + faceVolumeCB[dir] * M * N + x],
2050  static_cast<real>(phase / (2. * M_PI)));
2051  }
2052  }
2053  }
2054 
2065  __device__ __host__ inline gauge_ghost_wrapper<real, Accessor> Ghost(int dim, int ghost_idx, int parity,
2066  real phase = 1.0)
2067  {
2068  return gauge_ghost_wrapper<real, Accessor>(*this, dim, ghost_idx, parity, phase);
2069  }
2070 
2081  __device__ __host__ inline const gauge_ghost_wrapper<real, Accessor> Ghost(int dim, int ghost_idx, int parity,
2082  real phase = 1.0) const
2083  {
2084  return gauge_ghost_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, ghost_idx, parity, phase);
2085  }
2086 
2087  __device__ __host__ inline void loadGhostEx(complex v[length / 2], int buff_idx, int extended_idx, int dir,
2088  int dim, int g, int parity, const int R[]) const
2089  {
2090  const int M = reconLen / N;
2091  real tmp[reconLen];
2092 
2093 #pragma unroll
2094  for (int i=0; i<M; i++) {
2095  // first do vectorized copy from memory
2096  Vector vecTmp = vector_load<Vector>(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase),
2097  +i*R[dim]*faceVolumeCB[dim]+buff_idx);
2098  // second do copy converting into register type
2099 #pragma unroll
2100  for (int j=0; j<N; j++) copy(tmp[i*N+j], reinterpret_cast<Float*>(&vecTmp)[j]);
2101  }
2102  real phase = 0.;
2103  if (hasPhase)
2104  copy(phase,
2105  ghost[dim][((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + 1)
2106  + R[dim] * faceVolumeCB[dim] * M * N + buff_idx]);
2107 
2108  // use the extended_idx to determine the boundary condition
2109  reconstruct.Unpack(v, tmp, extended_idx, g, 2. * M_PI * phase, X, R);
2110  }
2111 
2112  __device__ __host__ inline void saveGhostEx(const complex v[length / 2], int buff_idx, int extended_idx, int dir,
2113  int dim, int g, int parity, const int R[])
2114  {
2115  const int M = reconLen / N;
2116  real tmp[reconLen];
2117  // use the extended_idx to determine the boundary condition
2118  reconstruct.Pack(tmp, v, extended_idx);
2119 
2120 #pragma unroll
2121  for (int i=0; i<M; i++) {
2122  Vector vecTmp;
2123  // first do copy converting into storage type
2124 #pragma unroll
2125  for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], tmp[i*N+j]);
2126  // second do vectorized copy to memory
2127  vector_store(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase),
2128  i*R[dim]*faceVolumeCB[dim]+buff_idx, vecTmp);
2129  }
2130  if (hasPhase) {
2131  real phase = reconstruct.getPhase(v);
2132  copy(ghost[dim][((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + 1)
2133  + R[dim] * faceVolumeCB[dim] * M * N + buff_idx],
2134  static_cast<real>(phase / (2. * M_PI)));
2135  }
2136  }
2137 
2141  void save() {
 2142  if (backup_h) errorQuda("Already allocated host backup");
 2143  backup_h = safe_malloc(bytes);
 2144  qudaMemcpy(backup_h, gauge, bytes, cudaMemcpyDeviceToHost);
2145  }
2146 
2150  void load()
2151  {
 2152  qudaMemcpy(gauge, backup_h, bytes, cudaMemcpyHostToDevice);
 2153  host_free(backup_h);
 2154  backup_h = nullptr;
2155  }
2156 
2157  size_t Bytes() const { return reconLen * sizeof(Float); }
2158  };
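  // Editorial note (not part of the original header): for reference, how the compile-time
  // constants above work out for the supported reconstruction types (our reading of the
  // definitions, not an additional guarantee):
  //   reconLenParam = 18 -> reconLen = 18, hasPhase = 0   (full matrix stored)
  //   reconLenParam = 12 -> reconLen = 12, hasPhase = 0   (first two rows stored)
  //   reconLenParam = 13 -> reconLen = 13, hasPhase = 1   (12 reals plus a separately stored phase)
  //   reconLenParam =  9 -> reconLen =  9, hasPhase = 1   ( 8 reals plus a separately stored phase)
  //   reconLenParam = 11 -> reconLen = 10, hasPhase = 0   (10-real anti-Hermitian momentum storage)
  // Bytes() above is simply reconLen * sizeof(Float), the per-link size in storage precision.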
2159 
2166  template <typename real, int length> struct S {
2167  real v[length];
2168  __host__ __device__ const real &operator[](int i) const { return v[i]; }
2169  __host__ __device__ real &operator[](int i) { return v[i]; }
2170  };
2171 
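  // Editorial note (not part of the original header): S<real, length> is a plain aggregate
  // holding one full link as `length` reals. The legacy accessors below cast their raw
  // pointers to S and wrap them in trove::coalesced_ptr on the device, so each thread
  // fetches or stores a whole link as a single coalesced structure access, e.g. (sketch,
  // names are placeholders):
  //
  //   typedef S<Float, 18> structure;                      // one SU(3) link, 18 reals
  //   trove::coalesced_ptr<structure> ptr((structure *)gauge);
  //   structure link = ptr[site_index];                    // coalesced 18-real gather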
 2176  template <typename Float, int length> struct LegacyOrder {
 2177  using Accessor = LegacyOrder<Float, length>;
 2178  using real = typename mapper<Float>::type;
 2179  using complex = complex<real>;
 2180  Float *ghost[QUDA_MAX_DIM];
 2181  int faceVolumeCB[QUDA_MAX_DIM];
 2182  const int volumeCB;
2183  const int stride;
2184  const int geometry;
2185  const int hasPhase;
2186 
2187  LegacyOrder(const GaugeField &u, Float **ghost_) :
2188  volumeCB(u.VolumeCB()),
2189  stride(u.Stride()),
2190  geometry(u.Geometry()),
2191  hasPhase(0)
 2192  {
 2193  if (geometry == QUDA_COARSE_GEOMETRY)
 2194  errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone)");
2195 
2196  for (int i = 0; i < 4; i++) {
2197  ghost[i] = (ghost_) ? ghost_[i] : (Float *)(u.Ghost()[i]);
2198  faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth
2199  }
2200  }
2201 
2202  LegacyOrder(const LegacyOrder &order) :
2203  volumeCB(order.volumeCB),
2204  stride(order.stride),
2205  geometry(order.geometry),
2206  hasPhase(0)
2207  {
2208  for (int i = 0; i < 4; i++) {
2209  ghost[i] = order.ghost[i];
2210  faceVolumeCB[i] = order.faceVolumeCB[i];
2211  }
2212  }
2213 
2214  __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dir, int parity, real phase = 1.0) const
2215  {
2216 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2217  typedef S<Float, length> structure;
2218  trove::coalesced_ptr<structure> ghost_((structure *)ghost[dir]);
2219  structure v_ = ghost_[parity * faceVolumeCB[dir] + x];
2220 #else
2221  auto v_ = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length];
2222 #endif
2223  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2224  }
2225 
2226  __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dir, int parity)
2227  {
2228 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2229  typedef S<Float, length> structure;
2230  trove::coalesced_ptr<structure> ghost_((structure *)ghost[dir]);
2231  structure v_;
2232  for (int i = 0; i < length / 2; i++) {
2233  v_[2 * i + 0] = (Float)v[i].real();
2234  v_[2 * i + 1] = (Float)v[i].imag();
2235  }
2236  ghost_[parity * faceVolumeCB[dir] + x] = v_;
2237 #else
2238  auto v_ = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length];
2239  for (int i = 0; i < length / 2; i++) {
2240  v_[2 * i + 0] = (Float)v[i].real();
2241  v_[2 * i + 1] = (Float)v[i].imag();
2242  }
2243 #endif
2244  }
2245 
2256  __device__ __host__ inline gauge_ghost_wrapper<real, Accessor> Ghost(int dim, int ghost_idx, int parity,
2257  real phase = 1.0)
2258  {
2259  return gauge_ghost_wrapper<real, Accessor>(*this, dim, ghost_idx, parity, phase);
2260  }
2261 
2272  __device__ __host__ inline const gauge_ghost_wrapper<real, Accessor> Ghost(int dim, int ghost_idx, int parity,
2273  real phase = 1.0) const
2274  {
2275  return gauge_ghost_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, ghost_idx, parity, phase);
2276  }
2277 
2278  __device__ __host__ inline void loadGhostEx(complex v[length / 2], int x, int dummy, int dir, int dim, int g,
2279  int parity, const int R[]) const
2280  {
2281 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2282  typedef S<Float,length> structure;
2283  trove::coalesced_ptr<structure> ghost_((structure*)ghost[dim]);
2284  structure v_ = ghost_[((dir*2+parity)*R[dim]*faceVolumeCB[dim] + x)*geometry+g];
2285 #else
2286  auto v_ = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length];
2287 #endif
2288  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2289  }
2290 
2291  __device__ __host__ inline void saveGhostEx(const complex v[length / 2], int x, int dummy, int dir, int dim,
2292  int g, int parity, const int R[])
2293  {
2294 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2295  typedef S<Float, length> structure;
2296  trove::coalesced_ptr<structure> ghost_((structure *)ghost[dim]);
2297  structure v_;
2298  for (int i = 0; i < length / 2; i++) {
2299  v_[2 * i + 0] = (Float)v[i].real();
2300  v_[2 * i + 1] = (Float)v[i].imag();
2301  }
2302  ghost_[((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g] = v_;
2303 #else
2304  auto v_ = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length];
2305  for (int i = 0; i < length / 2; i++) {
2306  v_[2 * i + 0] = (Float)v[i].real();
2307  v_[2 * i + 1] = (Float)v[i].imag();
2308  }
2309 #endif
2310  }
2311  };
2312 
 2317  template <typename Float, int length> struct QDPOrder : public LegacyOrder<Float,length> {
 2318  using Accessor = QDPOrder<Float, length>;
 2319  using real = typename mapper<Float>::type;
 2320  using complex = complex<real>;
 2321  Float *gauge[QUDA_MAX_DIM];
 2322  const int volumeCB;
2323  QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
2324  : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
2325  { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; }
2326  QDPOrder(const QDPOrder &order) : LegacyOrder<Float,length>(order), volumeCB(order.volumeCB) {
2327  for(int i=0; i<4; i++) gauge[i] = order.gauge[i];
2328  }
2329 
2330  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
2331  {
2332 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2333  typedef S<Float,length> structure;
2334  trove::coalesced_ptr<structure> gauge_((structure*)gauge[dir]);
2335  structure v_ = gauge_[parity*volumeCB + x];
2336 #else
2337  auto v_ = &gauge[dir][(parity * volumeCB + x) * length];
2338 #endif
2339  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2340  }
2341 
2342  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
2343  {
2344 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2345  typedef S<Float,length> structure;
2346  trove::coalesced_ptr<structure> gauge_((structure*)gauge[dir]);
2347  structure v_;
2348  for (int i = 0; i < length / 2; i++) {
2349  v_[2 * i + 0] = (Float)v[i].real();
2350  v_[2 * i + 1] = (Float)v[i].imag();
2351  }
2352  gauge_[parity * volumeCB + x] = v_;
2353 #else
2354  auto v_ = &gauge[dir][(parity * volumeCB + x) * length];
2355  for (int i = 0; i < length / 2; i++) {
2356  v_[2 * i + 0] = (Float)v[i].real();
2357  v_[2 * i + 1] = (Float)v[i].imag();
2358  }
2359 #endif
2360  }
2361 
2372  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2373  {
2374  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2375  }
2376 
2387  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2388  {
2389  return gauge_wrapper<real, QDPOrder<Float, length>>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2390  }
2391 
2392  size_t Bytes() const { return length * sizeof(Float); }
2393  };
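  // Editorial note (not part of the original header): as the load/save above encode, QDP
  // ordering keeps one contiguous array per direction with the link matrix innermost, so
  // real element c of the link at checkerboard site x lives at
  //   gauge[dir][ (parity * volumeCB + x) * length + c ],   c = 2 * (row * Nc + col) + re/im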
2394 
 2399  template <typename Float, int length> struct QDPJITOrder : public LegacyOrder<Float,length> {
 2400  using Accessor = QDPJITOrder<Float, length>;
 2401  using real = typename mapper<Float>::type;
 2402  using complex = complex<real>;
 2403  Float *gauge[QUDA_MAX_DIM];
 2404  const int volumeCB;
2405  QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
2406  : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
2407  { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; }
 2408  QDPJITOrder(const QDPJITOrder &order) : LegacyOrder<Float,length>(order), volumeCB(order.volumeCB) {
 2409  for(int i=0; i<4; i++) gauge[i] = order.gauge[i];
2410  }
2411 
2412  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
2413  {
2414  for (int i = 0; i < length / 2; i++) {
2415  v[i].real((real)gauge[dir][((0 * (length / 2) + i) * 2 + parity) * volumeCB + x]);
2416  v[i].imag((real)gauge[dir][((1 * (length / 2) + i) * 2 + parity) * volumeCB + x]);
2417  }
2418  }
2419 
2420  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
2421  {
2422  for (int i = 0; i < length / 2; i++) {
2423  gauge[dir][((0 * (length / 2) + i) * 2 + parity) * volumeCB + x] = v[i].real();
2424  gauge[dir][((1 * (length / 2) + i) * 2 + parity) * volumeCB + x] = v[i].imag();
2425  }
2426  }
2427 
2438  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2439  {
2440  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2441  }
2442 
2453  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2454  {
2455  return gauge_wrapper<real, QDPJITOrder<Float, length>>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2456  }
2457 
2458  size_t Bytes() const { return length * sizeof(Float); }
2459  };
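  // Editorial note (not part of the original header): QDPJIT ordering instead splits real
  // and imaginary parts into two blocks with parity interleaved, exactly as the load/save
  // above index it:
  //   re(v[i]) <-> gauge[dir][ ((0 * (length / 2) + i) * 2 + parity) * volumeCB + x ]
  //   im(v[i]) <-> gauge[dir][ ((1 * (length / 2) + i) * 2 + parity) * volumeCB + x ]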
2460 
 2465  template <typename Float, int length> struct MILCOrder : public LegacyOrder<Float,length> {
 2466  using Accessor = MILCOrder<Float, length>;
 2467  using real = typename mapper<Float>::type;
 2468  using complex = complex<real>;
 2469  Float *gauge;
 2470  const int volumeCB;
2471  const int geometry;
2472  MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) :
2473  LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
2474  volumeCB(u.VolumeCB()), geometry(u.Geometry()) { ; }
2475  MILCOrder(const MILCOrder &order) : LegacyOrder<Float,length>(order),
2476  gauge(order.gauge), volumeCB(order.volumeCB), geometry(order.geometry)
2477  { ; }
2478 
2479  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
2480  {
2481 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2482  typedef S<Float,length> structure;
2483  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2484  structure v_ = gauge_[(parity*volumeCB+x)*geometry + dir];
2485 #else
2486  auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
2487 #endif
2488  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2489  }
2490 
2491  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
2492  {
2493 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2494  typedef S<Float,length> structure;
2495  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2496  structure v_;
2497  for (int i = 0; i < length / 2; i++) {
2498  v_[2 * i + 0] = v[i].real();
2499  v_[2 * i + 1] = v[i].imag();
2500  }
2501  gauge_[(parity*volumeCB+x)*geometry + dir] = v_;
2502 #else
2503  auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
2504  for (int i = 0; i < length / 2; i++) {
2505  v_[2 * i + 0] = v[i].real();
2506  v_[2 * i + 1] = v[i].imag();
2507  }
2508 #endif
2509  }
2510 
2521  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2522  {
2523  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2524  }
2525 
2536  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2537  {
2538  return gauge_wrapper<real, MILCOrder<Float, length>>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2539  }
2540 
2541  size_t Bytes() const { return length * sizeof(Float); }
2542  };
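  // Editorial note (not part of the original header): MILC ordering holds all four
  // directions of a site contiguously in a single array, so the link for (parity, x, dir)
  // starts at
  //   gauge[ ((parity * volumeCB + x) * geometry + dir) * length ]
  // i.e. direction is the innermost site-level index, in contrast to the per-direction
  // arrays of the QDP layout above.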
2543 
 2559  template <typename Float, int length> struct MILCSiteOrder : public LegacyOrder<Float,length> {
 2560  using Accessor = MILCSiteOrder<Float, length>;
 2561  using real = typename mapper<Float>::type;
 2562  using complex = complex<real>;
 2563  Float *gauge;
 2564  const int volumeCB;
2565  const int geometry;
2566  const size_t offset;
2567  const size_t size;
2568  MILCSiteOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2569  LegacyOrder<Float, length>(u, ghost_),
2570  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2571  volumeCB(u.VolumeCB()),
2572  geometry(u.Geometry()),
2573  offset(u.SiteOffset()),
2574  size(u.SiteSize())
2575  {
2576  if ((uintptr_t)((char *)gauge + offset) % 16 != 0) { errorQuda("MILC structure has misaligned offset"); }
2577  }
2578 
 2579  MILCSiteOrder(const MILCSiteOrder &order) :
 2580  LegacyOrder<Float, length>(order),
2581  gauge(order.gauge),
2582  volumeCB(order.volumeCB),
2583  geometry(order.geometry),
2584  offset(order.offset),
2585  size(order.size)
2586  {
2587  }
2588 
2589  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
2590  {
2591  // get base pointer
2592  const Float *gauge0 = reinterpret_cast<const Float*>(reinterpret_cast<const char*>(gauge) + (parity*volumeCB+x)*size + offset);
2593 
2594 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2595  typedef S<Float,length> structure;
2596  trove::coalesced_ptr<structure> gauge_((structure*)gauge0);
2597  structure v_ = gauge_[dir];
2598 #else
2599  auto v_ = &gauge0[dir * length];
2600 #endif
2601  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2602  }
2603 
2604  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
2605  {
2606  // get base pointer
2607  Float *gauge0 = reinterpret_cast<Float*>(reinterpret_cast<char*>(gauge) + (parity*volumeCB+x)*size + offset);
2608 
2609 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2610  typedef S<Float,length> structure;
2611  trove::coalesced_ptr<structure> gauge_((structure*)gauge0);
2612  structure v_;
2613  for (int i = 0; i < length / 2; i++) {
2614  v_[2 * i + 0] = v[i].real();
2615  v_[2 * i + 1] = v[i].imag();
2616  }
2617  gauge_[dir] = v_;
2618 #else
2619  for (int i = 0; i < length / 2; i++) {
2620  gauge0[dir * length + 2 * i + 0] = v[i].real();
2621  gauge0[dir * length + 2 * i + 1] = v[i].imag();
2622  }
2623 #endif
2624  }
2625 
2636  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2637  {
2638  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2639  }
2640 
2651  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2652  {
2653  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2654  }
2655 
2656  size_t Bytes() const { return length * sizeof(Float); }
2657  };
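  // Editorial note (not part of the original header): in this site-struct variant the links
  // are embedded inside MILC's per-site record, with `size` the byte size of one site and
  // `offset` the byte offset of the links within it, so a link address is computed as in
  // load()/save() above (sketch):
  //   base = (char *)gauge + (parity * volumeCB + x) * size + offset;  // this site's links
  //   link = (Float *)base + dir * length;                             // link in direction dir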
2658 
2659 
 2664  template <typename Float, int length> struct CPSOrder : LegacyOrder<Float,length> {
 2665  using Accessor = CPSOrder<Float, length>;
 2666  using real = typename mapper<Float>::type;
 2667  using complex = complex<real>;
 2668  Float *gauge;
 2669  const int volumeCB;
 2670  const real anisotropy;
 2671  const real anisotropy_inv;
 2672  static constexpr int Nc = 3;
 2673  const int geometry;
2674  CPSOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2675  LegacyOrder<Float, length>(u, ghost_),
2676  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2677  volumeCB(u.VolumeCB()),
2678  anisotropy(u.Anisotropy()),
2679  anisotropy_inv(1.0 / anisotropy),
2680  geometry(u.Geometry())
2681  {
2682  if (length != 18) errorQuda("Gauge length %d not supported", length);
2683  }
2684  CPSOrder(const CPSOrder &order) :
2685  LegacyOrder<Float, length>(order),
2686  gauge(order.gauge),
2687  volumeCB(order.volumeCB),
 2688  anisotropy(order.anisotropy),
 2689  anisotropy_inv(order.anisotropy_inv),
 2690  geometry(order.geometry)
2691  {
2692  ;
2693  }
2694 
2695  // we need to transpose and scale for CPS ordering
2696  __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float inphase = 1.0) const
2697  {
2698 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2699  typedef S<Float,length> structure;
2700  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2701  structure v_ = gauge_[((parity*volumeCB+x)*geometry + dir)];
2702 #else
2703  auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
2704 #endif
2705  for (int i=0; i<Nc; i++) {
2706  for (int j=0; j<Nc; j++) {
2707  v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]) * anisotropy_inv;
2708  }
2709  }
2710  }
2711 
2712  __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
2713  {
2714 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2715  typedef S<Float,length> structure;
2716  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2717  structure v_;
2718  for (int i=0; i<Nc; i++)
2719  for (int j = 0; j < Nc; j++) {
2720  v_[(j * Nc + i) * 2 + 0] = anisotropy * v[i * Nc + j].real();
2721  v_[(j * Nc + i) * 2 + 1] = anisotropy * v[i * Nc + j].imag();
2722  }
2723  gauge_[((parity*volumeCB+x)*geometry + dir)] = v_;
2724 #else
2725  auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
2726  for (int i=0; i<Nc; i++) {
2727  for (int j=0; j<Nc; j++) {
2728  v_[(j * Nc + i) * 2 + 0] = anisotropy * v[i * Nc + j].real();
2729  v_[(j * Nc + i) * 2 + 1] = anisotropy * v[i * Nc + j].imag();
2730  }
2731  }
2732 #endif
2733  }
2734 
2745  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2746  {
2747  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2748  }
2749 
2760  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2761  {
2762  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2763  }
2764 
2765  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
2766  };
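  // Editorial note (not part of the original header): as the comment on load() above says,
  // CPS stores links transposed relative to the internal convention and folds the temporal
  // anisotropy into them, so load() swaps row and column indices and multiplies by
  // 1/anisotropy, while save() transposes back and multiplies the anisotropy back in.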
2767 
 2775  template <typename Float, int length> struct BQCDOrder : LegacyOrder<Float,length> {
 2776  using Accessor = BQCDOrder<Float, length>;
 2777  using real = typename mapper<Float>::type;
 2778  using complex = complex<real>;
 2779  Float *gauge;
 2780  const int volumeCB;
2781  int exVolumeCB; // extended checkerboard volume
2782  static constexpr int Nc = 3;
2783  BQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2784  LegacyOrder<Float, length>(u, ghost_),
2785  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2786  volumeCB(u.VolumeCB())
2787  {
2788  if (length != 18) errorQuda("Gauge length %d not supported", length);
2789  // compute volumeCB + halo region
2790  exVolumeCB = u.X()[0]/2 + 2;
2791  for (int i=1; i<4; i++) exVolumeCB *= u.X()[i] + 2;
2792  }
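  // Editorial example (not part of the original header): for a local 8^3 x 16 lattice the
  // extended checkerboard volume computed above is
  //   exVolumeCB = (8/2 + 2) * (8 + 2) * (8 + 2) * (16 + 2) = 6 * 10 * 10 * 18 = 10800.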
2793  BQCDOrder(const BQCDOrder &order) :
2794  LegacyOrder<Float, length>(order),
2795  gauge(order.gauge),
2796  volumeCB(order.volumeCB),
2797  exVolumeCB(order.exVolumeCB)
2798  {
2799  if (length != 18) errorQuda("Gauge length %d not supported", length);
2800  }
2801 
2802  // we need to transpose for BQCD ordering
2803  __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, real inphase = 1.0) const
2804  {
2805 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2806  typedef S<Float, length> structure;
2807  trove::coalesced_ptr<structure> gauge_((structure *)gauge);
2808  structure v_ = gauge_[(dir * 2 + parity) * exVolumeCB + x];
2809 #else
2810  auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + x) * length];
2811 #endif
2812  for (int i = 0; i < Nc; i++) {
2813  for (int j = 0; j < Nc; j++) { v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]); }
2814  }
2815  }
2816 
2817  __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
2818  {
2819 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2820  typedef S<Float,length> structure;
2821  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2822  structure v_;
2823  for (int i=0; i<Nc; i++)
2824  for (int j = 0; j < Nc; j++) {
2825  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real();
2826  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag();
2827  }
2828  gauge_[(dir * 2 + parity) * exVolumeCB + x] = v_;
2829 #else
2830  auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + x) * length];
2831  for (int i = 0; i < Nc; i++) {
2832  for (int j = 0; j < Nc; j++) {
2833  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real();
2834  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag();
2835  }
2836  }
2837 #endif
2838  }
2839 
2850  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2851  {
2852  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2853  }
2854 
2865  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2866  {
2867  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2868  }
2869 
2870  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
2871  };
2872 
 2877  template <typename Float, int length> struct TIFROrder : LegacyOrder<Float,length> {
 2878  using Accessor = TIFROrder<Float, length>;
 2879  using real = typename mapper<Float>::type;
 2880  using complex = complex<real>;
 2881  Float *gauge;
 2882  const int volumeCB;
 2883  static constexpr int Nc = 3;
 2884  const real scale;
 2885  const real scale_inv;
2886  TIFROrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2887  LegacyOrder<Float, length>(u, ghost_),
2888  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2889  volumeCB(u.VolumeCB()),
2890  scale(u.Scale()),
2891  scale_inv(1.0 / scale)
2892  {
2893  if (length != 18) errorQuda("Gauge length %d not supported", length);
2894  }
2895  TIFROrder(const TIFROrder &order) :
2896  LegacyOrder<Float, length>(order),
2897  gauge(order.gauge),
2898  volumeCB(order.volumeCB),
2899  scale(order.scale),
2900  scale_inv(1.0 / scale)
2901  {
2902  if (length != 18) errorQuda("Gauge length %d not supported", length);
2903  }
2904 
2905  // we need to transpose for TIFR ordering
2906  __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, real inphase = 1.0) const
2907  {
2908 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2909  typedef S<Float, length> structure;
2910  trove::coalesced_ptr<structure> gauge_((structure *)gauge);
2911  structure v_ = gauge_[(dir * 2 + parity) * volumeCB + x];
2912 #else
2913  auto v_ = &gauge[((dir * 2 + parity) * volumeCB + x) * length];
2914 #endif
2915  for (int i = 0; i < Nc; i++) {
2916  for (int j = 0; j < Nc; j++) {
2917  v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]) * scale_inv;
2918  }
2919  }
2920  }
2921 
2922  __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
2923  {
2924 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2925  typedef S<Float,length> structure;
2926  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2927  structure v_;
2928  for (int i=0; i<Nc; i++)
2929  for (int j = 0; j < Nc; j++) {
2930  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
2931  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
2932  }
2933  gauge_[(dir * 2 + parity) * volumeCB + x] = v_;
2934 #else
2935  auto v_ = &gauge[((dir * 2 + parity) * volumeCB + x) * length];
2936  for (int i = 0; i < Nc; i++) {
2937  for (int j = 0; j < Nc; j++) {
2938  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
2939  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
2940  }
2941  }
2942 #endif
2943  }
2944 
2955  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2956  {
2957  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2958  }
2959 
2970  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2971  {
2972  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2973  }
2974 
2975  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
2976  };
2977 
 2982  template <typename Float, int length> struct TIFRPaddedOrder : LegacyOrder<Float,length> {
 2983  using Accessor = TIFRPaddedOrder<Float, length>;
 2984  using real = typename mapper<Float>::type;
 2985  using complex = complex<real>;
 2986  Float *gauge;
 2987  const int volumeCB;
 2988  int exVolumeCB;
 2989  static constexpr int Nc = 3;
 2990  const real scale;
 2991  const real scale_inv;
 2992  const int dim[4];
2993  const int exDim[4];
2994  TIFRPaddedOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2995  LegacyOrder<Float, length>(u, ghost_),
2996  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2997  volumeCB(u.VolumeCB()),
2998  exVolumeCB(1),
2999  scale(u.Scale()),
3000  scale_inv(1.0 / scale),
3001  dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]},
3002  exDim {u.X()[0], u.X()[1], u.X()[2] + 4, u.X()[3]}
3003  {
3004  if (length != 18) errorQuda("Gauge length %d not supported", length);
3005 
 3006  // exVolumeCB is the padded checkerboard volume
3007  for (int i=0; i<4; i++) exVolumeCB *= exDim[i];
3008  exVolumeCB /= 2;
3009  }
3010 
 3011  TIFRPaddedOrder(const TIFRPaddedOrder &order) :
 3012  LegacyOrder<Float, length>(order),
3013  gauge(order.gauge),
3014  volumeCB(order.volumeCB),
3015  exVolumeCB(order.exVolumeCB),
3016  scale(order.scale),
3017  scale_inv(order.scale_inv),
3018  dim {order.dim[0], order.dim[1], order.dim[2], order.dim[3]},
3019  exDim {order.exDim[0], order.exDim[1], order.exDim[2], order.exDim[3]}
3020  {
3021  if (length != 18) errorQuda("Gauge length %d not supported", length);
3022  }
3023 
3028  __device__ __host__ inline int getPaddedIndex(int x_cb, int parity) const {
3029  // find coordinates
3030  int coord[4];
3031  getCoords(coord, x_cb, dim, parity);
3032 
3033  // get z-extended index
3034  coord[2] += 2; // offset for halo
3035  return linkIndex(coord, exDim);
3036  }
3037 
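  // Editorial note (not part of the original header): only the z direction is padded here,
  // exDim = {X0, X1, X2 + 4, X3}, so getPaddedIndex() shifts the z coordinate of a local
  // site by 2 and re-linearizes it with linkIndex(coord, exDim) to land in the interior of
  // the padded volume; load() and save() below then index with this padded offset.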
3038  // we need to transpose for TIFR ordering
3039  __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, real inphase = 1.0) const
3040  {
3041  int y = getPaddedIndex(x, parity);
3042 
3043 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
3044  typedef S<Float,length> structure;
3045  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
3046  structure v_ = gauge_[(dir*2+parity)*exVolumeCB + y];
3047 #else
3048  auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + y) * length];
3049 #endif
3050  for (int i = 0; i < Nc; i++) {
3051  for (int j = 0; j < Nc; j++) {
3052  v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]) * scale_inv;
3053  }
3054  }
3055  }
3056 
3057  __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
3058  {
3059  int y = getPaddedIndex(x, parity);
3060 
3061 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
3062  typedef S<Float,length> structure;
3063  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
3064  structure v_;
3065  for (int i=0; i<Nc; i++)
3066  for (int j = 0; j < Nc; j++) {
3067  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
3068  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
3069  }
3070  gauge_[(dir * 2 + parity) * exVolumeCB + y] = v_;
3071 #else
3072  auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + y) * length];
3073  for (int i = 0; i < Nc; i++) {
3074  for (int j = 0; j < Nc; j++) {
3075  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
3076  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
3077  }
3078  }
3079 #endif
3080  }
3081 
3092  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
3093  {
3094  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
3095  }
3096 
3107  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
3108  {
3109  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
3110  }
3111 
3112  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
3113  };
3114 
3115  } // namespace gauge
3116 
3117  template <typename otherFloat, typename storeFloat>
 3118  __device__ __host__ inline void complex<double>::operator=(const gauge::fieldorder_wrapper<otherFloat, storeFloat> &a) {
 3119  x = a.real();
3120  y = a.imag();
3121  }
3122 
3123  template <typename otherFloat, typename storeFloat>
 3124  __device__ __host__ inline void complex<float>::operator=(const gauge::fieldorder_wrapper<otherFloat, storeFloat> &a) {
 3125  x = a.real();
3126  y = a.imag();
3127  }
3128 
3129  template <typename otherFloat, typename storeFloat>
 3130  __device__ __host__ inline complex<double>::complex(const gauge::fieldorder_wrapper<otherFloat, storeFloat> &a) {
 3131  x = a.real();
3132  y = a.imag();
3133  }
3134 
3135  template <typename otherFloat, typename storeFloat>
 3136  __device__ __host__ inline complex<float>::complex(const gauge::fieldorder_wrapper<otherFloat, storeFloat> &a) {
 3137  x = a.real();
3138  y = a.imag();
3139  }
3140 
3141  // Use traits to reduce the template explosion
3142  template <typename T, QudaReconstructType, int N = 18, QudaStaggeredPhase stag = QUDA_STAGGERED_PHASE_NO,
3143  bool huge_alloc = gauge::default_huge_alloc, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_INVALID,
3144  bool use_inphase = false, QudaGaugeFieldOrder order = QUDA_NATIVE_GAUGE_ORDER>
3145  struct gauge_mapper {
3146  };
3147 
3148  // double precision
3149  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3150  struct gauge_mapper<double, QUDA_RECONSTRUCT_NO, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3152  };
3153  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3154  struct gauge_mapper<double, QUDA_RECONSTRUCT_13, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3156  };
3157  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3158  struct gauge_mapper<double, QUDA_RECONSTRUCT_12, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3160  };
3161  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3162  struct gauge_mapper<double, QUDA_RECONSTRUCT_10, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3164  };
3165  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3166  struct gauge_mapper<double, QUDA_RECONSTRUCT_9, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3168  };
3169  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3170  struct gauge_mapper<double, QUDA_RECONSTRUCT_8, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3172  };
3173 
3174  // single precision
3175  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3176  struct gauge_mapper<float, QUDA_RECONSTRUCT_NO, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3178  };
3179  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3180  struct gauge_mapper<float, QUDA_RECONSTRUCT_13, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3182  };
3183  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3184  struct gauge_mapper<float, QUDA_RECONSTRUCT_12, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3186  };
3187  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3188  struct gauge_mapper<float, QUDA_RECONSTRUCT_10, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3190  };
3191  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3192  struct gauge_mapper<float, QUDA_RECONSTRUCT_9, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3194  };
3195  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3196  struct gauge_mapper<float, QUDA_RECONSTRUCT_8, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3198  };
3199 
3200 #ifdef FLOAT8
3201 #define N8 8
3202 #else
3203 #define N8 4
3204 #endif
3205 
3206  // half precision
3207  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3208  struct gauge_mapper<short, QUDA_RECONSTRUCT_NO, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3210  };
3211  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3212  struct gauge_mapper<short, QUDA_RECONSTRUCT_13, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3214  };
3215  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3216  struct gauge_mapper<short, QUDA_RECONSTRUCT_12, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3218  };
3219  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3220  struct gauge_mapper<short, QUDA_RECONSTRUCT_10, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3222  };
3223  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3224  struct gauge_mapper<short, QUDA_RECONSTRUCT_9, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3226  };
3227  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3228  struct gauge_mapper<short, QUDA_RECONSTRUCT_8, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3230  };
3231 
3232  // quarter precision
3233  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3234  struct gauge_mapper<int8_t, QUDA_RECONSTRUCT_NO, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3236  };
3237  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3238  struct gauge_mapper<int8_t, QUDA_RECONSTRUCT_13, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3240  };
3241  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3242  struct gauge_mapper<int8_t, QUDA_RECONSTRUCT_12, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3244  };
3245  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3246  struct gauge_mapper<int8_t, QUDA_RECONSTRUCT_10, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3248  };
3249  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3250  struct gauge_mapper<int8_t, QUDA_RECONSTRUCT_9, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3252  };
3253  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3254  struct gauge_mapper<int8_t, QUDA_RECONSTRUCT_8, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_NATIVE_GAUGE_ORDER> {
3256  };
3257 
3258  template <typename T, QudaReconstructType recon, int N, QudaStaggeredPhase stag, bool huge_alloc,
3259  QudaGhostExchange ghostExchange, bool use_inphase>
 3260  struct gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_MILC_GAUGE_ORDER> {
 3261  typedef gauge::MILCOrder<T, N> type;
 3262  };
3263 
3264  template <typename T, QudaReconstructType recon, int N, QudaStaggeredPhase stag, bool huge_alloc,
3265  QudaGhostExchange ghostExchange, bool use_inphase>
 3266  struct gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_QDP_GAUGE_ORDER> {
 3267  typedef gauge::QDPOrder<T, N> type;
 3268  };
3269 
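  // Editorial sketch (not part of the original header): gauge_mapper is the trait kernels
  // use to pick an accessor from storage type, reconstruction and (optionally) field order,
  // e.g.
  //   typename gauge_mapper<double, QUDA_RECONSTRUCT_NO>::type   // native double-precision accessor
  //   typename gauge_mapper<float, QUDA_RECONSTRUCT_12>::type    // native recon-12 accessor
  //   typename gauge_mapper<float, QUDA_RECONSTRUCT_NO, 18, QUDA_STAGGERED_PHASE_NO,
  //                         gauge::default_huge_alloc, QUDA_GHOST_EXCHANGE_INVALID,
  //                         false, QUDA_MILC_GAUGE_ORDER>::type  // MILC-ordered accessor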
3270  template<typename T, QudaGaugeFieldOrder order, int Nc> struct gauge_order_mapper { };
3271  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_QDP_GAUGE_ORDER,Nc> { typedef gauge::QDPOrder<T, 2*Nc*Nc> type; };
3272  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_QDPJIT_GAUGE_ORDER,Nc> { typedef gauge::QDPJITOrder<T, 2*Nc*Nc> type; };
3273  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_MILC_GAUGE_ORDER,Nc> { typedef gauge::MILCOrder<T, 2*Nc*Nc> type; };
 3274  template <typename T, int Nc> struct gauge_order_mapper<T, QUDA_CPS_WILSON_GAUGE_ORDER, Nc> {
 3275  typedef gauge::CPSOrder<T, 2 * Nc * Nc> type;
 3276  };
3277  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_BQCD_GAUGE_ORDER,Nc> { typedef gauge::BQCDOrder<T, 2*Nc*Nc> type; };
3278  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_TIFR_GAUGE_ORDER,Nc> { typedef gauge::TIFROrder<T, 2*Nc*Nc> type; };
3279  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_TIFR_PADDED_GAUGE_ORDER,Nc> { typedef gauge::TIFRPaddedOrder<T, 2*Nc*Nc> type; };
3280  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_FLOAT2_GAUGE_ORDER,Nc> { typedef gauge::FloatNOrder<T, 2*Nc*Nc, 2, 2*Nc*Nc> type; };
3281 
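  // Editorial sketch (not part of the original header): gauge_order_mapper is the analogous
  // trait keyed on the field order alone for a fixed Nc, e.g.
  //   typename gauge_order_mapper<double, QUDA_MILC_GAUGE_ORDER, 3>::type
  // resolves to gauge::MILCOrder<double, 18> (length = 2 * Nc * Nc).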
3282 } // namespace quda
3283 
3284 #endif // _GAUGE_ORDER_H
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:294
int Nface() const
Definition: gauge_field.h:322
int Ncolor() const
Definition: gauge_field.h:285
virtual void * Gauge_p()
Definition: gauge_field.h:358
const void ** Ghost() const
Definition: gauge_field.h:368
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:286
const int * SurfaceCB() const
const int * R() const
const int * X() const
double Scale() const
__device__ __host__ Matrix()
Definition: quda_matrix.h:74
__device__ __host__ void operator=(const Matrix< U, N > &b)
Definition: quda_matrix.h:117
int comm_coord(int dim)
void comm_allreduce_min(double *data)
int comm_dim(int dim)
void comm_allreduce_max(double *data)
void comm_allreduce(double *data)
std::array< int, 4 > dim
double anisotropy
QudaParity parity
Definition: covdev_test.cpp:40
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:34
const int nColor
Definition: covdev_test.cpp:44
enum QudaStaggeredPhase_s QudaStaggeredPhase
@ QUDA_STAGGERED_PHASE_NO
Definition: enum_quda.h:515
@ QUDA_STAGGERED_PHASE_TIFR
Definition: enum_quda.h:518
@ QUDA_STAGGERED_PHASE_CPS
Definition: enum_quda.h:517
@ QUDA_STAGGERED_PHASE_MILC
Definition: enum_quda.h:516
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
@ QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:70
@ QUDA_RECONSTRUCT_12
Definition: enum_quda.h:71
@ QUDA_RECONSTRUCT_13
Definition: enum_quda.h:74
@ QUDA_RECONSTRUCT_8
Definition: enum_quda.h:72
@ QUDA_RECONSTRUCT_10
Definition: enum_quda.h:75
@ QUDA_RECONSTRUCT_9
Definition: enum_quda.h:73
@ QUDA_COARSE_GEOMETRY
Definition: enum_quda.h:503
enum QudaFieldLocation_s QudaFieldLocation
@ QUDA_GHOST_EXCHANGE_EXTENDED
Definition: enum_quda.h:510
@ QUDA_GHOST_EXCHANGE_NO
Definition: enum_quda.h:508
@ QUDA_GHOST_EXCHANGE_INVALID
Definition: enum_quda.h:511
@ QUDA_GHOST_EXCHANGE_PAD
Definition: enum_quda.h:509
enum QudaGhostExchange_s QudaGhostExchange
enum QudaReconstructType_s QudaReconstructType
@ QUDA_FLOAT2_GAUGE_ORDER
Definition: enum_quda.h:40
@ QUDA_BQCD_GAUGE_ORDER
Definition: enum_quda.h:49
@ QUDA_TIFR_GAUGE_ORDER
Definition: enum_quda.h:50
@ QUDA_QDP_GAUGE_ORDER
Definition: enum_quda.h:44
@ QUDA_CPS_WILSON_GAUGE_ORDER
Definition: enum_quda.h:46
@ QUDA_NATIVE_GAUGE_ORDER
Definition: enum_quda.h:43
@ QUDA_TIFR_PADDED_GAUGE_ORDER
Definition: enum_quda.h:51
@ QUDA_MILC_GAUGE_ORDER
Definition: enum_quda.h:47
@ QUDA_QDPJIT_GAUGE_ORDER
Definition: enum_quda.h:45
int length[]
#define safe_malloc(size)
Definition: malloc_quda.h:106
#define host_free(ptr)
Definition: malloc_quda.h:115
void init()
Create the BLAS context.
void start()
Start profiling.
Definition: device.cpp:226
__host__ constexpr __device__ int ct_sqrt(int n, int i=1)
__host__ __device__ bool static_phase()
__host__ constexpr __device__ bool fixed_point()
__device__ __host__ T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM], T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound, bool isFirstTimeSlice, bool isLastTimeSlice, QudaGhostExchange ghostExchange=QUDA_GHOST_EXCHANGE_NO)
timeBoundary Compute boundary condition correction
__host__ constexpr __device__ bool match()
__host__ constexpr __device__ bool match< int, int >()
__host__ constexpr __device__ bool fixed_point< float, int8_t >()
__device__ __host__ complex< Float > operator+(const fieldorder_wrapper< Float, storeFloat > &a, const complex< Float > &b)
__host__ constexpr __device__ bool fixed_point< float, short >()
__host__ constexpr __device__ bool match< short, short >()
__device__ __host__ int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb)
__host__ constexpr __device__ int Ncolor(int length)
Return the number of colors of the accessor based on the length of the field.
__device__ __host__ complex< Float > operator*(const Float &a, const fieldorder_wrapper< Float, storeFloat > &b)
__host__ constexpr __device__ bool fixed_point< float, int >()
constexpr bool default_huge_alloc
__device__ __host__ Float milcStaggeredPhase(int dim, const int x[], const I R[])
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
void transform_reduce(Arg &arg)
__host__ __device__ complex< real > cmul(const complex< real > &x, const complex< real > &y)
__host__ __device__ complex< real > cmac(const complex< real > &x, const complex< real > &y, const complex< real > &z)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
Definition: quda_matrix.h:417
__device__ __host__ void vector_store(void *ptr, int idx, const VectorType &value)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
__host__ __device__ std::enable_if<!isFixed< T1 >::value &&!isFixed< T2 >::value, void >::type copy(T1 &a, const T2 &b)
Copy function which is trival between floating point types. When converting to an integer type,...
Definition: convert.h:64
__host__ __device__ ValueType abs(ValueType x)
Definition: complex_quda.h:125
FloatingPoint< float > Float
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_api.h:204
#define QUDA_MAX_GEOMETRY
Maximum geometry supported by a field. This essentially is the maximum number of dimensions supported...
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5.
Provides precision abstractions and defines the register precision given the storage precision using ...
__host__ __device__ int8_t imag() const volatile
Definition: complex_quda.h:736
__host__ __device__ int8_t real() const volatile
Definition: complex_quda.h:735
__host__ __device__ int imag() const volatile
Definition: complex_quda.h:829
__host__ __device__ int real() const volatile
Definition: complex_quda.h:828
__host__ __device__ short real() const volatile
Definition: complex_quda.h:782
__host__ __device__ short imag() const volatile
Definition: complex_quda.h:783
__host__ __device__ complex(const ValueType &re=ValueType(), const ValueType &im=ValueType())
Definition: complex_quda.h:375
__host__ __device__ ValueType imag() const volatile
__host__ __device__ ValueType real() const volatile
__host__ __device__ complex< ValueType > & operator=(const complex< T > z)
Definition: complex_quda.h:399
Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0, bool override=false)
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex< theirFloat > &val) const
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int dim, int parity, int x_cb, int row, int col)
__host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, double init, reducer r) const
Accessor(const Accessor< Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat > &a)
__device__ __host__ const complex< Float > operator()(int dim, int parity, int x_cb, int row, int col) const
__device__ __host__ const auto wrap(int d, int parity, int x, int row, int col) const
This and the following method creates a fieldorder_wrapper object whose pointer points to the start o...
__host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, double init, reducer r) const
Accessor(const Accessor< Float, nColor, QUDA_MILC_GAUGE_ORDER, storeFloat > &a)
Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
__device__ __host__ auto wrap(int d, int parity, int x, int row, int col)
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex< theirFloat > &val) const
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex< theirFloat > &val) const
Accessor(const Accessor< Float, nColor, QUDA_QDP_GAUGE_ORDER, storeFloat > &a)
__host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, double init, reducer r) const
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
Accessor(const GaugeField &, void *gauge_=0, void **ghost_=0)
void resetScale(Float dummy)
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int row, int col) const
complex< Float > dummy
static constexpr bool is_mma_compatible
struct to define BQCD ordered gauge fields:
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ void load(complex v[9], int x, int dir, int parity, real inphase=1.0) const
static constexpr int Nc
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
BQCDOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
BQCDOrder(const BQCDOrder &order)
typename mapper< Float >::type real
__device__ __host__ void save(const complex v[9], int x, int dir, int parity)
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ void save(const complex v[9], int x, int dir, int parity)
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
static constexpr int Nc
typename mapper< Float >::type real
CPSOrder(const CPSOrder &order)
__device__ __host__ void load(complex v[9], int x, int dir, int parity, Float inphase=1.0) const
CPSOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ int Geometry() const
const int_fastdiv geometry
static constexpr bool is_mma_compatible
__device__ __host__ int Volume() const
__device__ __host__ int Ncolor() const
__host__ double abs_min(int dim=-1, bool global=true) const
Returns the minimum absolute value of the field.
__device__ __host__ complex< Float > Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const
__device__ __host__ void atomicAdd(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col, const complex< theirFloat > &val)
GhostAccessor< Float, nColor, order, native_ghost, storeFloat > ghostAccessor
__device__ __host__ int NcolorCoarse() const
__device__ __host__ const auto wrap(int d, int parity, int x) const
This and the following method (eventually) creates a fieldorder_wrapper object whose pointer points t...
void resetScale(double max)
static constexpr int nColorCoarse
__device__ __host__ int Ndim() const
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col)
FieldOrder(const FieldOrder &o)
__device__ __host__ const auto wrap(int d, int parity, int x, int s_row, int s_col) const
This and the following method (eventually) creates a fieldorder_wrapper object whose pointer points t...
__device__ __host__ int VolumeCB() const
__device__ __host__ const complex< Float > operator()(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const
__device__ __host__ const auto wrap_ghost(int d, int parity, int x, int s_row, int s_col) const
This and the following method (eventually) creates a fieldorder_wrapper object whose pointer points t...
FieldOrder(GaugeField &U, void *gauge_=0, void **ghost_=0)
__device__ __host__ const auto wrap_ghost(int d, int parity, int x) const
This and the following method (eventually) creates a fieldorder_wrapper object whose pointer points t...
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
static constexpr bool fixedPoint()
__host__ double norm2(int dim=-1, bool global=true) const
Returns the L2 norm squared of the field in a given dimension.
__device__ __host__ fieldorder_wrapper< Float, storeFloat > Ghost(int d, int parity, int x, int row, int col)
__device__ __host__ auto Ghost(int d, int parity, int x) const
static constexpr bool supports_ghost_zone
const QudaFieldLocation location
__device__ __host__ fieldorder_wrapper< Float, storeFloat > Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col)
__device__ __host__ auto wrap(int d, int parity, int x, int s_row, int s_col)
the non-const wrap method.
__device__ __host__ auto wrap_ghost(int d, int parity, int x)
the non-const wrap_ghost method.
__device__ __host__ int NspinCoarse() const
__device__ __host__ complex< Float > Ghost(int d, int parity, int x, int row, int col) const
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
__host__ double abs_max(int dim=-1, bool global=true) const
Returns the Linfinity norm of the field in a given dimension.
__device__ __host__ auto wrap(int d, int parity, int x)
The non-const wrap method.
__device__ __host__ auto wrap_ghost(int d, int parity, int x, int s_row, int s_col)
The non-const wrap_ghost method.
__host__ double norm1(int dim=-1, bool global=true) const
Returns the L1 norm of the field in a given dimension.
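A host-side sketch of querying these reductions on any accessor that provides them; the helper name and the printed output are assumptions for illustration only.

  #include <cstdio>

  template <typename Accessor>
  void print_norms(const Accessor &order)
  {
    double l2   = order.norm2();    // global L2 norm squared over all dimensions
    double l1   = order.norm1(3);   // L1 norm restricted to dimension 3
    double amax = order.abs_max();  // global L-infinity norm
    double amin = order.abs_min();  // global minimum absolute element value
    printf("norm2 = %e  norm1[3] = %e  abs_max = %e  abs_min = %e\n", l2, l1, amax, amin);
  }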
__device__ __host__ gauge_ghost_wrapper< real, Accessor > Ghost(int dim, int ghost_idx, int parity, real phase=1.0)
This accessor routine returns a gauge_ghost_wrapper to this object, allowing us to overload various o...
__device__ __host__ void loadGhost(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
__device__ __host__ void saveGhost(const complex v[length/2], int x, int dir, int parity)
int_fastdiv X[QUDA_MAX_DIM]
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
Reconstruct< reconLenParam, Float, ghostExchange_, stag_phase > reconstruct
size_t bytes
Host memory for backing up the field when tuning.
typename mapper< Float >::type real
FloatNOrder(const FloatNOrder &order)
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
QudaGhostExchange ghostExchange
FloatNOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0, bool override=false)
void save()
Backup the field to the host when tuning.
__device__ __host__ const gauge_ghost_wrapper< real, Accessor > Ghost(int dim, int ghost_idx, int parity, real phase=1.0) const
This accessor routine returns a const gauge_ghost_wrapper to this object, allowing us to overload var...
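A sketch of reading a halo link through the ghost wrapper; the helper name and fixed Nc = 3 are assumptions, not part of this header.

  template <typename Gauge>
  __device__ __host__ void read_ghost(const Gauge &gauge, int dim, int ghost_idx, int parity)
  {
    using real = typename Gauge::real;
    Matrix<complex<real>, 3> U = gauge.Ghost(dim, ghost_idx, parity); // wrapper conversion invokes loadGhost()
    // ... use U exactly like an interior link ...
  }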
AllocType< huge_alloc >::type AllocInt
__device__ __host__ void loadGhostEx(complex v[length/2], int buff_idx, int extended_idx, int dir, int dim, int g, int parity, const int R[]) const
static constexpr int reconLen
static constexpr int hasPhase
__device__ __host__ void saveGhostEx(const complex v[length/2], int buff_idx, int extended_idx, int dir, int dim, int g, int parity, const int R[])
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity, real phase=1.0) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity, real phase=1.0)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
void load()
Restore the field from the host after tuning.
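A sketch of the backup/restore pattern these two methods enable during autotuning; the wrapper function and its arguments are hypothetical.

  template <typename Accessor, typename Trial>
  void run_trial_with_backup(Accessor &order, Trial &&trial)
  {
    order.save(); // copy the device field into the host backup buffer
    trial();      // trial launch that may clobber the field
    order.load(); // restore the original contents from the backup
  }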
VectorType< Float, N >::type Vector
GhostAccessor(const GhostAccessor< Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, native_ghost, storeFloat > &a)
__device__ __host__ const complex< Float > operator()(int d, int parity, int x_cb, int row, int col) const
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x_cb, int row, int col)
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ auto wrap(int d, int parity, int x, int row, int col)
The non-const wrap method.
__device__ __host__ const auto wrap(int d, int parity, int x, int row, int col) const
This method is similar to Accessor<Float, nColor, QUDA_MILC_GAUGE_ORDER, storeFloat>::wrap: this method a...
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
GhostAccessor(const GhostAccessor< Float, nColor, QUDA_MILC_GAUGE_ORDER, native_ghost, storeFloat > &a)
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
GhostAccessor(const GhostAccessor< Float, nColor, QUDA_QDP_GAUGE_ORDER, native_ghost, storeFloat > &a)
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
GhostAccessor(const GaugeField &, void *gauge_=0, void **ghost_=0)
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int row, int col) const
The LegacyOrder defines the ghost zone storage and ordering for all cpuGaugeFields,...
__device__ __host__ void saveGhost(const complex v[length/2], int x, int dir, int parity)
__device__ __host__ const gauge_ghost_wrapper< real, Accessor > Ghost(int dim, int ghost_idx, int parity, real phase=1.0) const
This accessor routine returns a const gauge_ghost_wrapper to this object, allowing us to overload var...
int faceVolumeCB[QUDA_MAX_DIM]
__device__ __host__ void loadGhostEx(complex v[length/2], int x, int dummy, int dir, int dim, int g, int parity, const int R[]) const
__device__ __host__ gauge_ghost_wrapper< real, Accessor > Ghost(int dim, int ghost_idx, int parity, real phase=1.0)
This accessor routine returns a gauge_ghost_wrapper to this object, allowing us to overload various o...
typename mapper< Float >::type real
LegacyOrder(const LegacyOrder &order)
__device__ __host__ void loadGhost(complex v[length/2], int x, int dir, int parity, real phase=1.0) const
Float * ghost[QUDA_MAX_DIM]
__device__ __host__ void saveGhostEx(const complex v[length/2], int x, int dummy, int dir, int dim, int g, int parity, const int R[])
LegacyOrder(const GaugeField &u, Float **ghost_)
MILCOrder(const MILCOrder &order)
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
struct to define gauge fields packed into an opaque MILC site struct:
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
MILCSiteOrder(const MILCSiteOrder &order)
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
MILCSiteOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
QDPJITOrder(const QDPJITOrder &order)
Float * gauge[QUDA_MAX_DIM]
typename mapper< Float >::type real
QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
QDPOrder(const QDPOrder &order)
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
typename mapper< Float >::type real
Float * gauge[QUDA_MAX_DIM]
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
Gauge reconstruct helper for Momentum field with 10 packed elements (really 9 from the Lie algebra,...
__device__ __host__ void Pack(real out[10], const complex in[9], int idx) const
Reconstruct(const Reconstruct< 11, Float, ghostExchange_ > &recon)
__device__ __host__ real getPhase(const complex in[9])
__device__ __host__ void Unpack(complex out[9], const real in[10], int idx, int dir, real phase, const I *X, const int *R) const
Gauge reconstruct 12 helper where we reconstruct the third row from the cross product of the first tw...
__device__ __host__ void Unpack(complex out[9], const real in[12], int idx, int dir, real phase, const I *X, const int *R) const
__device__ __host__ void Pack(real out[12], const complex in[9], int idx) const
Reconstruct(const Reconstruct< 12, Float, ghostExchange_ > &recon)
__device__ __host__ real getPhase(const complex in[9])
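A standalone sketch of the idea behind the 12-real reconstruction (not the exact Unpack above): for a special-unitary link the third row is the complex conjugate of the cross product of the two stored rows. The function name and use of std::complex are illustrative.

  #include <complex>

  using cplx = std::complex<double>;

  void reconstruct_third_row(const cplx a[3], const cplx b[3], cplx c[3])
  {
    // c = (a x b)^* for an SU(3) matrix with rows a, b, c
    c[0] = std::conj(a[1] * b[2] - a[2] * b[1]);
    c[1] = std::conj(a[2] * b[0] - a[0] * b[2]);
    c[2] = std::conj(a[0] * b[1] - a[1] * b[0]);
  }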
Gauge reconstruct 13 helper where we reconstruct the third row from the cross product of the first tw...
const Reconstruct< 12, Float, ghostExchange_ > reconstruct_12
__device__ __host__ void Pack(real out[12], const complex in[9], int idx) const
Reconstruct(const Reconstruct< 13, Float, ghostExchange_, stag_phase > &recon)
__device__ __host__ real getPhase(const complex in[9]) const
__device__ __host__ void Unpack(complex out[9], const real in[12], int idx, int dir, real phase, const I *X, const int *R) const
Gauge reconstruct 8 helper where we reconstruct the gauge matrix from 8 packed elements (maximal comp...
__device__ __host__ void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R, const complex scale, const complex u) const
Reconstruct(const Reconstruct< 8, Float, ghostExchange_ > &recon)
Reconstruct(const GaugeField &u, real scale=1.0)
__device__ __host__ real getPhase(const complex in[9])
__device__ __host__ void Pack(real out[8], const complex in[9], int idx) const
__device__ __host__ void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R, const complex scale=complex(static_cast< real >(1.0), static_cast< real >(1.0))) const
Gauge reconstruct 9 helper where we reconstruct the gauge matrix from 8 packed elements (maximal comp...
__device__ __host__ void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R) const
__device__ __host__ void Pack(real out[8], const complex in[9], int idx) const
__device__ __host__ real getPhase(const complex in[9]) const
Reconstruct(const Reconstruct< 9, Float, ghostExchange_, stag_phase > &recon)
const Reconstruct< 8, Float, ghostExchange_ > reconstruct_8
Generic reconstruction helper with no reconstruction.
Reconstruct(const Reconstruct< N, Float, ghostExchange_ > &recon)
__device__ __host__ real getPhase(const complex in[N/2]) const
__device__ __host__ void Unpack(complex out[N/2], const real in[N], int idx, int dir, real phase, const I *X, const int *R) const
Reconstruct(const GaugeField &u)
__device__ __host__ void Pack(real out[N], const complex in[N/2], int idx) const
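A sketch of what packing without reconstruction amounts to: the N/2 complex entries are simply written out as N interleaved reals. The types and function name are illustrative, not this header's implementation.

  #include <complex>

  template <typename real, int N>
  void pack_full(real out[N], const std::complex<real> in[N / 2])
  {
    for (int i = 0; i < N / 2; i++) {
      out[2 * i + 0] = in[i].real();
      out[2 * i + 1] = in[i].imag();
    }
  }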
typename mapper< Float >::type real
This is just a dummy structure we use for trove to define the required structure size.
__host__ __device__ real & operator[](int i)
__host__ __device__ const real & operator[](int i) const
struct to define TIFR ordered gauge fields: [mu][parity][volumecb][col][row]
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
TIFROrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
TIFROrder(const TIFROrder &order)
static constexpr int Nc
typename mapper< Float >::type real
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ void save(const complex v[9], int x, int dir, int parity)
__device__ __host__ void load(complex v[9], int x, int dir, int parity, real inphase=1.0) const
TIFRPaddedOrder(const TIFRPaddedOrder &order)
__device__ __host__ void save(const complex v[9], int x, int dir, int parity)
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
TIFRPaddedOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ int getPaddedIndex(int x_cb, int parity) const
Compute the index into the padded field. Assumes that parity doesn't change from unpadded to padded.
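An illustrative sketch of mapping a full 4-d coordinate to a padded checkerboard index; the padded-extents array Xp and the helper name are assumptions, not QUDA's exact implementation of getPaddedIndex.

  int padded_index(const int x[4], const int Xp[4])
  {
    // re-linearise the coordinate with the padded extents, then halve for checkerboarding
    int idx = ((x[3] * Xp[2] + x[2]) * Xp[1] + x[1]) * Xp[0] + x[0];
    return idx / 2;
  }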
typename mapper< Float >::type real
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ void load(complex v[9], int x, int dir, int parity, real inphase=1.0) const
__host__ __device__ Float operator()(const quda::complex< int8_t > &x)
__host__ __device__ Float operator()(const quda::complex< int > &x)
__host__ __device__ Float operator()(const quda::complex< short > &x)
__host__ __device__ Float operator()(const quda::complex< storeFloat > &x)
abs_(const Float scale)
fieldorder_wrapper is an internal class that is used to wrap instances of FieldOrder accessors,...
__device__ __host__ void operator=(const fieldorder_wrapper< Float, storeFloat > &a)
Assignment operator with fieldorder_wrapper instance as input.
__device__ __host__ const auto data() const
__device__ __host__ void operator=(const complex< theirFloat > &a)
Assignment operator with complex number instance as input.
__device__ __host__ void operator+=(const complex< theirFloat > &a)
Operator+= with complex number instance as input.
__device__ __host__ Float real() const
__device__ __host__ void operator-=(const complex< theirFloat > &a)
Operator-= with complex number instance as input.
__device__ __host__ Float imag() const
__device__ __host__ fieldorder_wrapper(complex< storeFloat > *v, int idx, Float scale, Float scale_inv)
fieldorder_wrapper constructor
__device__ __host__ complex< Float > operator-() const
Negation operator.
__device__ __host__ auto data()
Returns the pointer of this wrapper object.
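A sketch of element-wise access through such a wrapper-returning accessor; the helper name is hypothetical, and real()/imag() read back the element with the fixed-point scale applied.

  template <typename Accessor, typename Float>
  __device__ __host__ void set_element(Accessor &order, int d, int parity, int x,
                                        int row, int col, const complex<Float> &z)
  {
    order(d, parity, x, row, col) = z;  // fieldorder_wrapper::operator=
    order(d, parity, x, row, col) += z; // fieldorder_wrapper::operator+=
    Float re = order(d, parity, x, row, col).real(); // read back the real part
    (void)re;
  }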
__host__ __device__ ReduceType operator()(const quda::complex< int8_t > &x)
__host__ __device__ ReduceType operator()(const quda::complex< int > &x)
__host__ __device__ ReduceType operator()(const quda::complex< short > &x)
square_(ReduceType scale)
__host__ __device__ ReduceType operator()(const quda::complex< Float > &x)
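A standalone illustration of what the square_ functor computes when combined with a transform-reduce: the sum of |x|^2 over the elements (here using the C++17 standard library rather than QUDA's own transform_reduce).

  #include <complex>
  #include <functional>
  #include <numeric>
  #include <vector>

  double norm2_host(const std::vector<std::complex<double>> &v)
  {
    return std::transform_reduce(v.begin(), v.end(), 0.0, std::plus<>(),
                                 [](const std::complex<double> &x) { return std::norm(x); });
  }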
gauge_ghost_wrapper is an internal class that is used to wrap instances of gauge ghost accessors,...
__device__ __host__ void operator=(const M &a)
Assignment operator with Matrix instance as input.
gauge::FloatNOrder< T, 2 * Nc * Nc, 2, 2 * Nc * Nc > type
gauge_wrapper is an internal class that is used to wrap instances of gauge accessors,...
__device__ __host__ void operator=(const M &a)
Assignment operator with Matrix instance as input.
QUDA reimplementation of thrust::transform_reduce as well as wrappers also implementing thrust::reduc...
#define errorQuda(...)
Definition: util_quda.h:120