#ifndef __CUDACC_RTC__
#include <type_traits>
#include <index_helper.cuh>
#include <type_traits>
#include <index_helper.cuh>
#include <trove_helper.cuh>
template <typename Float, typename T>
  __device__ __host__ inline void operator=(const M &a) {

template <typename T, int N>
  a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase);

template <typename T, int N>
  a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase);

template <typename Float, typename T>

template <typename T, int N>
template <typename S>
  a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase);

template <typename T, int N>
template <typename S>
  a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase);
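// The fragments above are the Matrix-assignment plumbing: assigning a
// gauge_wrapper (or gauge_ghost_wrapper) to a Matrix forwards to the
// accessor's load()/loadGhost(). A minimal standalone sketch of that wrapper
// pattern (illustration only -- AccessorSketch, Matrix3 and load() are
// hypothetical stand-ins, not the QUDA types):
#include <array>
struct Matrix3 { std::array<double, 18> data; };
struct AccessorSketch {
  void load(double out[18], int x_cb, int dim, int parity) const {
    for (int i = 0; i < 18; i++) out[i] = 0.0; // the real code fills this from field memory
  }
};
template <typename A> struct wrapper_sketch {
  const A &gauge; int dim, x_cb, parity;
  // conversion lets "Matrix3 m = acc(dim, x_cb, parity);" trigger a load
  operator Matrix3() const { Matrix3 m; gauge.load(m.data.data(), x_cb, dim, parity); return m; }
};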
template <typename ReduceType, typename Float> struct square_ {
  { return static_cast<ReduceType>(norm(x)); }

template <typename ReduceType> struct square_<ReduceType, int8_t> {
  square_(const ReduceType scale) : scale(scale) { }

template <typename ReduceType> struct square_<ReduceType, short> {
  square_(const ReduceType scale) : scale(scale) { }

template <typename ReduceType> struct square_<ReduceType, int> {
  square_(const ReduceType scale) : scale(scale) { }

template <typename Float, typename storeFloat> struct abs_ {
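// square_ maps an element to |z|^2 for a transform-reduce, and abs_ to |z|;
// the fixed-point specializations (int8_t, short, int) first rescale the
// integer-stored value back to its floating-point range. A self-contained
// sketch of that idea (plain std::complex; nothing here is the QUDA API):
#include <complex>
#include <cstdint>
template <typename ReduceType, typename StoreType> struct square_sketch {
  ReduceType scale; // the scale_inv that maps stored integers back to physical values
  square_sketch(ReduceType scale) : scale(scale) {}
  ReduceType operator()(const std::complex<StoreType> &x) const {
    std::complex<ReduceType> y(scale * x.real(), scale * x.imag());
    return std::norm(y); // |y|^2
  }
};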
template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool fixed_point() { return false; }

template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool match() { return false; }
template <> __host__ __device__ inline constexpr bool match<int, int>() { return true; }
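// fixed_point<Float,storeFloat>() reports whether the storage type is an
// integer (fixed-point) representation of Float, and match<a,b>() whether two
// types share the same representation so values can be copied without
// rescaling. A hedged sketch of the resulting round trip, using the same
// scale / scale_inv formulas the accessors below set up:
#include <limits>
inline short to_fixed_sketch(float x, float max) {
  float scale = static_cast<float>(std::numeric_limits<short>::max()) / max;
  return static_cast<short>(x * scale); // store: physical value -> integer
}
inline float from_fixed_sketch(short s, float max) {
  float scale_inv = max / static_cast<float>(std::numeric_limits<short>::max());
  return s * scale_inv;                 // load: integer -> physical value
}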
template <typename Float, typename storeFloat>
  static constexpr bool fixed = fixed_point<Float, storeFloat>();
  __device__ __host__ inline auto data() { return &v[idx]; }
  __device__ __host__ inline const auto data() const { return &v[idx]; }
  if (match<storeFloat, theirFloat>()) {
  if (match<storeFloat, theirFloat>()) {
  if (match<storeFloat, theirFloat>()) {

template <typename Float, typename storeFloat>
template <typename Float, typename storeFloat>
template <typename Float, typename storeFloat>
template <typename Float, int nColor, QudaGaugeFieldOrder order, typename storeFloat> struct Accessor {
  errorQuda("Not implemented for order=%d", order);

template <typename Float, int nColor, QudaGaugeFieldOrder order, bool native_ghost, typename storeFloat>
  errorQuda("Not implemented for order=%d", order);
template <typename Float, int nColor, typename storeFloat>
  static constexpr bool fixed = fixed_point<Float, storeFloat>();
  : volumeCB(U.VolumeCB()), geometry(U.Geometry()), cb_offset((U.Bytes() >> 1) / (sizeof(complex<storeFloat>) * U.Geometry())),
    scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
    volumeCB(a.volumeCB),
    geometry(a.geometry),
    cb_offset(a.cb_offset),
    scale_inv(a.scale_inv)
    scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
    scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());

  template <typename theirFloat>
    if (fixed && !match<storeFloat, theirFloat>()) {
      atomicAdd(u2, (vec2 &)val_);
      atomicAdd(u2, (vec2 &)val);
    if (fixed && !match<storeFloat, theirFloat>()) {
#pragma omp atomic update
#pragma omp atomic update
#pragma omp atomic update
#pragma omp atomic update
  template <typename helper, typename reducer>
    if (dim >= geometry)
      errorQuda("Requested dimension %d exceeds dimensionality of the field %d", dim, geometry);
    int lower = (dim == -1) ? 0 : dim;
    int ndim = (dim == -1 ? geometry : 1);
    std::vector<double> result(ndim);
    std::vector<complex<storeFloat> *> v(ndim);
    for (int d = 0; d < ndim; d++) v[d] = u[d + lower];
    for (auto &res : result) total = r(total, res);
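// The transform_reduce members above gather one pointer per requested
// dimension, apply a helper functor (such as square_ or abs_) to every
// element, and fold the per-dimension results with a reducer. A minimal
// host-only sketch of the same shape (std:: types only; the names here are
// illustrative, not the QUDA API):
#include <complex>
#include <functional>
#include <vector>
template <typename helper, typename reducer>
double transform_reduce_sketch(const std::vector<std::complex<float>> &field, helper h, double init, reducer r)
{
  double total = init;
  for (const auto &z : field) total = r(total, h(z));
  return total;
}
// e.g. squared L2 norm:
//   transform_reduce_sketch(field, [](auto z) { return std::norm(z); }, 0.0, std::plus<double>());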
template <typename Float, int nColor, bool native_ghost, typename storeFloat>
  static constexpr bool fixed = fixed_point<Float, storeFloat>();
  : scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
    for (int d = 0; d < 4; d++) {
    scale_inv(a.scale_inv)
    for (int d = 0; d < 8; d++) {
      ghost[d] = a.ghost[d];
    scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
    scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
template <typename Float, int nColor, typename storeFloat>
  static constexpr bool fixed = fixed_point<Float, storeFloat>();
  : u(gauge_ ? static_cast<complex<storeFloat> *>(gauge_) :
               static_cast<complex<storeFloat> *>(const_cast<void *>(U.Gauge_p()))),
    volumeCB(U.VolumeCB()), geometry(U.Geometry()),
    scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
    volumeCB(a.volumeCB),
    geometry(a.geometry),
    scale_inv(a.scale_inv)
    scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
    scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());

  __device__ __host__ inline const auto wrap(int d, int parity, int x, int row, int col) const
    u, (((parity * volumeCB + x) * geometry + d) * nColor + row) * nColor + col, scale, scale_inv);

  __device__ __host__ inline auto wrap(int d, int parity, int x, int row, int col)
    u, (((parity * volumeCB + x) * geometry + d) * nColor + row) * nColor + col, scale, scale_inv);
    (u, (((parity * volumeCB + x) * geometry + d) * nColor + row) * nColor + col, scale, scale_inv); }

  template <typename theirFloat>
    vec2 *u2 = reinterpret_cast<vec2 *>(u + (((parity * volumeCB + x_cb) * geometry + dim) * nColor + row) * nColor + col);
    if (fixed && !match<storeFloat, theirFloat>()) {
      atomicAdd(u2, (vec2 &)val_);
      atomicAdd(u2, (vec2 &)val);
    if (fixed && !match<storeFloat, theirFloat>()) {
#pragma omp atomic update
#pragma omp atomic update
#pragma omp atomic update
      u[(((parity * volumeCB + x_cb) * geometry + dim) * nColor + row) * nColor + col].x += static_cast<storeFloat>(val.x);
#pragma omp atomic update
      u[(((parity * volumeCB + x_cb) * geometry + dim) * nColor + row) * nColor + col].y += static_cast<storeFloat>(val.y);
  template <typename helper, typename reducer>
    if (dim >= geometry)
      errorQuda("Requested dimension %d exceeds dimensionality of the field %d", dim, geometry);
    std::vector<double> result = {init, init};
    std::vector<decltype(u)> v = {u + (0 * geometry + start) * volumeCB * nColor * nColor,
    return r(result[0], result[1]);
template <typename Float, int nColor, bool native_ghost, typename storeFloat>
  static constexpr bool fixed = fixed_point<Float, storeFloat>();
  : scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
    for (int d = 0; d < 4; d++) {
    scale_inv(a.scale_inv)
    for (int d = 0; d < 8; d++) {
      ghost[d] = a.ghost[d];
    scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
    scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());

  __device__ __host__ inline const auto wrap(int d, int parity, int x, int row, int col) const
    ghost[d], parity * ghostOffset[d] + (x * nColor + row) * nColor + col, scale, scale_inv);

  __device__ __host__ inline auto wrap(int d, int parity, int x, int row, int col)
    ghost[d], parity * ghostOffset[d] + (x * nColor + row) * nColor + col, scale, scale_inv);
    (ghost[d], parity * ghostOffset[d] + (x * nColor + row) * nColor + col, scale, scale_inv); }
template <int nColor, int N>
__device__ __host__ inline int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb) {
  int j = ((row * nColor + col) * 2) / N;
  int i = ((row * nColor + col) * 2) % N;
  int index = ((x_cb + dim * stride * M + j * stride) * 2 + i) / 2;
  index += parity * offset_cb;
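// indexFloatN locates the complex (row, col) entry of a link within a FloatN
// layout, where the 2*Nc*Nc reals of a site are split into N-wide chunks that
// are strided across the volume. A standalone worked example of the same
// arithmetic for Nc = 3, N = 4; M is assumed here to be the number of chunks
// per link, since its definition is not shown in the fragment above:
inline int indexFloatN_example(int row, int col, int x_cb, int dim, int parity, int stride, int offset_cb)
{
  constexpr int nColor = 3, N = 4;
  constexpr int M = (2 * nColor * nColor + N - 1) / N; // assumed: N-wide chunks per link
  int j = ((row * nColor + col) * 2) / N;  // which chunk the real part falls in
  int i = ((row * nColor + col) * 2) % N;  // offset within that chunk
  int index = ((x_cb + dim * stride * M + j * stride) * 2 + i) / 2;
  return index + parity * offset_cb;
}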
template <typename Float, int nColor, typename storeFloat>
  static constexpr bool fixed = fixed_point<Float, storeFloat>();
  : u(gauge_ ? static_cast<complex<storeFloat> *>(gauge_) :
               static_cast<complex<storeFloat> *>(const_cast<void *>(U.Gauge_p()))),
    offset_cb((U.Bytes() >> 1) / sizeof(complex<storeFloat>)),
    volumeCB(U.VolumeCB()), stride(U.Stride()), geometry(U.Geometry()),
    max(static_cast<Float>(1.0)), scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
    offset_cb(a.offset_cb),
    volumeCB(a.volumeCB),
    geometry(a.geometry),
    scale_inv(a.scale_inv)
    scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
    scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());

  template <typename theirFloat>
    if (fixed && !match<storeFloat, theirFloat>()) {
      atomicAdd(u2, (vec2 &)val_);
      atomicAdd(u2, (vec2 &)val);
    if (fixed && !match<storeFloat, theirFloat>()) {
#pragma omp atomic update
#pragma omp atomic update
#pragma omp atomic update
#pragma omp atomic update

  template <typename helper, typename reducer>
    if (dim >= geometry)
      errorQuda("Requested dimension %d exceeds dimensionality of the field %d", dim, geometry);
    std::vector<double> result = {init, init};
    std::vector<decltype(u)> v = {u + 0 * offset_cb + start * count, u + 1 * offset_cb + start * count};
    return r(result[0], result[1]);
template <typename Float, int nColor, bool native_ghost, typename storeFloat>
  int ghostVolumeCB[8];
  static constexpr bool fixed = fixed_point<Float, storeFloat>();
    volumeCB(U.VolumeCB()),
    scale(static_cast<Float>(1.0)),
    scale_inv(static_cast<Float>(1.0)),
    accessor(U, gauge_, ghost_)
    if (!native_ghost) assert(ghost_ != nullptr);
    for (int d = 0; d < 4; d++) {
    volumeCB(a.volumeCB),
    scale_inv(a.scale_inv),
    for (int d = 0; d < 8; d++) {
      ghost[d] = a.ghost[d];
    scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
    scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
    return accessor(d % 4, parity, x_cb + (d / 4) * ghostVolumeCB[d] + volumeCB, row, col);
    return accessor(d % 4, parity, x_cb + (d / 4) * ghostVolumeCB[d] + volumeCB, row, col);
    (ghost[d], ((parity * nColor + row) * nColor + col) * ghostVolumeCB[d] + x_cb, scale, scale_inv);
  typename storeFloat_ = Float_>
    errorQuda("GaugeField ordering not supported with reconstruction");
  static constexpr bool fixedPoint() { return fixed_point<Float, storeFloat>(); }

  __device__ __host__ const auto wrap(int d, int parity, int x) const
    int c_row, int c_col) const
    int s_col, int c_row, int c_col)

  __device__ __host__ inline const auto wrap(int d, int parity, int x, int s_row, int s_col) const
  __device__ __host__ inline auto wrap(int d, int parity, int x, int s_row, int s_col)
    int s_col, int c_row, int c_col)
  __device__ __host__ inline const auto wrap_ghost(int d, int parity, int x, int s_row, int s_col) const

  template <typename theirFloat>
  __device__ __host__ inline int Ndim() const { return nDim; }
  __device__ __host__ inline int NspinCoarse() const { return nSpinCoarse; }
  __host__ double norm1(int dim = -1, bool global = true) const {
  __host__ double norm2(int dim = -1, bool global = true) const {
template <int N, typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase = QUDA_STAGGERED_PHASE_NO>
    for (int i = 0; i < N / 2; i++) {
      out[2 * i + 0] = scale_inv * in[i].real();
      out[2 * i + 1] = scale_inv * in[i].imag();
    for (int i = 0; i < N / 2; i++) {
      out[2 * i + 0] = in[i].real();
      out[2 * i + 1] = in[i].imag();

  template <typename I>
    const I *X, const int *R) const
    for (int i = 0; i < N / 2; i++) { out[i] = scale * complex(in[2 * i + 0], in[2 * i + 1]); }
    for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); }
template <QudaGhostExchange ghostExchange_, typename T, typename I>
  T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound, bool isFirstTimeSlice,
    if (idx >= firstTimeSliceBound) {
      return isFirstTimeSlice ? tBoundary : scale;
    } else if (idx >= lastTimeSliceBound) {
      return isLastTimeSlice ? tBoundary : scale;
    if (idx >= (R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < R[3] * X[0] * X[1] * X[2] / 2) {
      return isFirstTimeSlice ? tBoundary : scale;
    } else if (idx >= (X[3] - R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < (X[3] - R[3]) * X[0] * X[1] * X[2] / 2) {
      return isLastTimeSlice ? tBoundary : scale;
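// timeBoundary picks between the temporal boundary factor (tBoundary, e.g. -1
// for antiperiodic fermions) and the neutral scale, depending on whether the
// checkerboard index sits on the first/last local time slice and whether this
// rank owns the global boundary. A hedged standalone sketch mirroring the
// branch structure shown above (illustration only):
template <typename T>
T time_boundary_sketch(int idx, T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound,
                       bool isFirstTimeSlice, bool isLastTimeSlice)
{
  if (idx >= firstTimeSliceBound) return isFirstTimeSlice ? tBoundary : scale;
  else if (idx >= lastTimeSliceBound) return isLastTimeSlice ? tBoundary : scale;
  return scale; // bulk links are unmodified
}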
template <typename Float, typename I>
    case 0: if (((x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
    case 1: if (((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
    case 2: if (((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
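// The cases above implement the MILC staggered-phase convention: the sign for
// direction mu is (-1) raised to a sum of the earlier coordinates, with the
// border offset R subtracted for extended fields. A compact standalone sketch
// of the same pattern (the dim == 3 case is not shown in the fragments above,
// so it is left out here too):
template <typename Float>
Float milc_staggered_phase_sketch(int dim, const int x[4], const int R[4])
{
  Float sign = static_cast<Float>(1.0);
  switch (dim) {
  case 0: if (((x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
  case 1: if (((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
  case 2: if (((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
  default: break;
  }
  return sign;
}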
template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<12, Float, ghostExchange_> {
    tBoundary(static_cast<real>(u.TBoundary())),
    firstTimeSliceBound(u.VolumeCB()),
    lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2),
    isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
    ghostExchange(u.GhostExchange())
    tBoundary(recon.tBoundary),
    firstTimeSliceBound(recon.firstTimeSliceBound),
    lastTimeSliceBound(recon.lastTimeSliceBound),
    isFirstTimeSlice(recon.isFirstTimeSlice),
    isLastTimeSlice(recon.isLastTimeSlice),
    ghostExchange(recon.ghostExchange)

    for (int i = 0; i < 6; i++) {
      out[2 * i + 0] = in[i].real();
      out[2 * i + 1] = in[i].imag();

  template <typename I>
    const I *X, const int *R) const
    for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]);
    const real u0 = dir < 3 ?
      timeBoundary<ghostExchange_>(idx, X, R, tBoundary, static_cast<real>(1.0), firstTimeSliceBound,
                                   lastTimeSliceBound, isFirstTimeSlice, isLastTimeSlice, ghostExchange);

    out[6] = cmul(out[2], out[4]);
    out[6] = cmac(out[1], out[5], -out[6]);
    out[6] = u0 * conj(out[6]);

    out[7] = cmul(out[0], out[5]);
    out[7] = cmac(out[2], out[3], -out[7]);
    out[7] = u0 * conj(out[7]);

    out[8] = cmul(out[1], out[3]);
    out[8] = cmac(out[0], out[4], -out[8]);
    out[8] = u0 * conj(out[8]);
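// Reconstruct<12> stores only the first two rows of the SU(3) link; the third
// row is rebuilt as u0 * conj(row0 x row1), the cross product of the first two
// rows, which is exactly what the cmul/cmac sequence above computes. A
// standalone sketch with std::complex (illustration; u0 carries the temporal
// boundary / anisotropy factor):
#include <complex>
using cplx12 = std::complex<double>;
inline void reconstruct_third_row_sketch(cplx12 out[9], double u0)
{
  out[6] = u0 * std::conj(out[1] * out[5] - out[2] * out[4]);
  out[7] = u0 * std::conj(out[2] * out[3] - out[0] * out[5]);
  out[8] = u0 * std::conj(out[0] * out[4] - out[1] * out[3]);
}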
template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<11, Float, ghostExchange_> {
    for (int i = 0; i < 2; i++) {
      out[2 * i + 0] = in[i + 1].real();
      out[2 * i + 1] = in[i + 1].imag();
    out[4] = in[5].real();
    out[5] = in[5].imag();
    out[6] = in[0].imag();
    out[7] = in[4].imag();
    out[8] = in[8].imag();

  template <typename I>
    const I *X, const int *R) const
    out[1] = complex(in[0], in[1]);
    out[2] = complex(in[2], in[3]);
    out[5] = complex(in[4], in[5]);
template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase>
    reconstruct_12(recon.reconstruct_12),
    reconstruct_12.Pack(out, in, idx);

  template <typename I>
    const I *X, const int *R) const
    for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]);

    out[6] = cmul(out[2], out[4]);
    out[6] = cmac(out[1], out[5], -out[6]);
    out[7] = cmul(out[0], out[5]);
    out[7] = cmac(out[2], out[3], -out[7]);
    out[8] = cmul(out[1], out[3]);
    out[8] = cmac(out[0], out[4], -out[8]);

    complex A(cos_sin[0], cos_sin[1]);
    out[6] = cmul(A, out[6]);
    out[7] = cmul(A, out[7]);
    out[8] = cmul(A, out[8]);

    complex expI3Phase = in[8] / denom;
    return arg(expI3Phase) / static_cast<real>(3.0);
    return expI3Phase.real() > 0 ? 1 : -1;
    for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i];
    return phase = arg(det) / 3;
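// getPhase above recovers the U(1) phase that was folded into the stored link:
// with a static staggered phase it is just the sign of the last element, and
// otherwise it is arg(det U) / 3, since multiplying an SU(3) matrix by
// exp(i*phi) multiplies its determinant by exp(3*i*phi). A standalone sketch
// with the 3x3 determinant written out (illustration only):
#include <complex>
using cplx13 = std::complex<double>;
inline double get_phase_sketch(const cplx13 u[9])
{
  cplx13 det = u[0] * (u[4] * u[8] - u[5] * u[7])
             - u[1] * (u[3] * u[8] - u[5] * u[6])
             + u[2] * (u[3] * u[7] - u[4] * u[6]);
  return std::arg(det) / 3.0;
}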
template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<8, Float, ghostExchange_> {
    tBoundary(static_cast<real>(u.TBoundary()) * scale, 1.0 / (static_cast<real>(u.TBoundary()) * scale)),
    firstTimeSliceBound(u.VolumeCB()),
    lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2),
    isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
    ghostExchange(u.GhostExchange())
    tBoundary(recon.tBoundary),
    firstTimeSliceBound(recon.firstTimeSliceBound),
    lastTimeSliceBound(recon.lastTimeSliceBound),
    isFirstTimeSlice(recon.isFirstTimeSlice),
    isLastTimeSlice(recon.isLastTimeSlice),
    ghostExchange(recon.ghostExchange)

    out[2] = in[4].real();
    out[3] = in[4].imag();
    out[4] = in[5].real();
    out[5] = in[5].imag();
    out[6] = in[0].real();
    out[7] = in[0].imag();

  template <typename I>
    real u0_inv = u.imag();
    for (int i = 1; i <= 3; i++)
      out[i] = complex(in[2 * i + 0], in[2 * i + 1]);

    real row_sum = out[1].real() * out[1].real();
    row_sum += out[1].imag() * out[1].imag();
    row_sum += out[2].real() * out[2].real();
    row_sum += out[2].imag() * out[2].imag();
    real row_sum_inv = static_cast<real>(1.0) / row_sum;

    real diff = u0_inv * u0_inv - row_sum;
    real U00_mag = diff > 0.0 ? diff * rsqrt(diff) : static_cast<real>(0.0);

    real column_sum = out[0].real() * out[0].real();
    column_sum += out[0].imag() * out[0].imag();
    column_sum += out[3].real() * out[3].real();
    column_sum += out[3].imag() * out[3].imag();
    diff = u0_inv * u0_inv - column_sum;
    real U20_mag = diff > 0.0 ? diff * rsqrt(diff) : static_cast<real>(0.0);

    real r_inv2 = u0_inv * row_sum_inv;

    out[4] = cmac(u0 * A, out[1], out[4]);
    out[4] = -r_inv2 * out[4];
    out[5] = cmac(-u0 * A, out[2], out[5]);
    out[5] = r_inv2 * out[5];
    out[7] = cmac(-u0 * A, out[1], out[7]);
    out[7] = r_inv2 * out[7];
    out[8] = cmac(u0 * A, out[2], out[8]);
    out[8] = -r_inv2 * out[8];

    for (int i = 0; i < 3; i++) {
      const auto tmp = out[i];
      out[i] = out[i + 3];
      out[i + 6] = -out[i + 6];

  template <typename I>
  __device__ __host__ inline void
    timeBoundary<ghostExchange_>(idx, X, R, tBoundary, scale, firstTimeSliceBound, lastTimeSliceBound,
                                 isFirstTimeSlice, isLastTimeSlice, ghostExchange);
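// Reconstruct<8> keeps only 8 reals per link (maximal compression): two phases
// plus two full complex entries. On unpack, the missing magnitudes follow from
// unitarity of the (rescaled) rows and columns, which is what the row_sum /
// column_sum arithmetic above does. A standalone sketch of that single step,
// where u0_inv is the inverse of the row normalisation (illustration only):
#include <cmath>
inline double missing_magnitude_sketch(double u0_inv, double re1, double im1, double re2, double im2)
{
  double row_sum = re1 * re1 + im1 * im1 + re2 * re2 + im2 * im2; // |U01|^2 + |U02|^2
  double diff = u0_inv * u0_inv - row_sum;
  return diff > 0.0 ? std::sqrt(diff) : 0.0;                      // |U00|
}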
template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase>
    reconstruct_8(recon.reconstruct_8),

    complex expI3Phase = in[8] / denom;
    return arg(expI3Phase) / static_cast<real>(3.0);
    return expI3Phase.real() > 0 ? 1 : -1;
    for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i];

    complex z(cos_sin[0], cos_sin[1]);
    for (int i = 0; i < 9; i++) su3[i] = cmul(z, in[i]);
    for (int i = 0; i < 9; i++) { su3[i] = phase * in[i]; }
    reconstruct_8.Pack(out, su3, idx);

  template <typename I>
    const I *X, const int *R) const
    reconstruct_8.Unpack(out, in, idx, dir, phase, X, R, complex(static_cast<real>(1.0), static_cast<real>(1.0)),
    complex z(cos_sin[0], cos_sin[1]);
    for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]);
    for (int i = 0; i < 18; i++) { out[i] *= phase; }
__host__ __device__ constexpr int ct_sqrt(int n, int i = 1)
  return n == i ? n : (i * i < n ? ct_sqrt(n, i + 1) : i);

template <QudaStaggeredPhase phase> __host__ __device__ inline bool static_phase()
  default: return false;
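// ct_sqrt is a constexpr integer square root (by linear search), used so the
// number of colors can be recovered from a field length at compile time, e.g.
// Ncolor(length) = ct_sqrt(length / 2) for a field of 2 * Nc * Nc reals per
// link. A small usage check, assuming only the definition above:
static_assert(ct_sqrt(9) == 3, "ct_sqrt(9) should give Nc = 3");
static_assert(ct_sqrt(18 / 2) == 3, "an 18-real link corresponds to Nc = 3");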
template <typename Float, int length, int N, int reconLenParam,
  static constexpr int reconLen = (reconLenParam == 11) ? 10 : reconLenParam;
    gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
      errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone)");
    for (int i = 0; i < 4; i++) {
      ghost[i] = ghost_ ? ghost_[i] : 0;
    for (int i = 0; i < 4; i++) {

    for (int i = 0; i < M; i++) {
      for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j]);
    if (static_phase<stag_phase>() && (reconLen == 13 || use_inphase)) {
      phase *= static_cast<real>(2.0) * static_cast<real>(M_PI);
    for (int i = 0; i < M; i++) {
      for (int j = 0; j < N; j++) copy(reinterpret_cast<Float *>(&vecTmp)[j], tmp[i * N + j]);

    real phase = 1.0) const
    for (int i = 0; i < M; i++) {
      Vector vecTmp = vector_load<Vector>(
      for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j]);
      phase *= static_cast<real>(2.0) * static_cast<real>(M_PI);
    for (int i = 0; i < M; i++) {
      for (int j = 0; j < N; j++) copy(reinterpret_cast<Float *>(&vecTmp)[j], tmp[i * N + j]);
      static_cast<real>(phase / (2. * M_PI)));

    real phase = 1.0) const
    int dim, int g, int parity, const int R[]) const
    for (int i = 0; i < M; i++) {
      for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j]);
    for (int i = 0; i < M; i++) {
      for (int j = 0; j < N; j++) copy(reinterpret_cast<Float *>(&vecTmp)[j], tmp[i * N + j]);
      static_cast<real>(phase / (2. * M_PI)));
template <typename real, int length> struct S {
  __host__ __device__ const real &operator[](int i) const { return v[i]; }
      errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone)");
    for (int i = 0; i < 4; i++) {
    for (int i = 0; i < 4; i++) {

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 1] = (Float)v[i].imag();
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 1] = (Float)v[i].imag();

    real phase = 1.0) const
    int parity, const int R[]) const
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
    int g, int parity, const int R[])
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 1] = (Float)v[i].imag();
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 1] = (Float)v[i].imag();

    for (int i = 0; i < 4; i++) gauge[i] = order.gauge[i];
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 1] = (Float)v[i].imag();
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 1] = (Float)v[i].imag();

    for (int i = 0; i < 4; i++) gauge[i] = order.gauge[i];
    for (int i = 0; i < length / 2; i++) {
    for (int i = 0; i < length / 2; i++) {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 0] = v[i].real();
      v_[2 * i + 1] = v[i].imag();
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 0] = v[i].real();
      v_[2 * i + 1] = v[i].imag();
    gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
    if ((uintptr_t)((char *)gauge + offset) % 16 != 0) { errorQuda("MILC structure has misaligned offset"); }

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    structure v_ = gauge_[dir];
    auto v_ = &gauge0[dir * length];
    for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 0] = v[i].real();
      v_[2 * i + 1] = v[i].imag();
    for (int i = 0; i < length / 2; i++) {
      gauge0[dir * length + 2 * i + 0] = v[i].real();
      gauge0[dir * length + 2 * i + 1] = v[i].imag();
  static constexpr int Nc = 3;
    gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++)
      for (int j = 0; j < Nc; j++) {
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {
  static constexpr int Nc = 3;
    gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
    for (int i = 1; i < 4; i++) exVolumeCB *= u.X()[i] + 2;

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) { v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]); }

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++)
      for (int j = 0; j < Nc; j++) {
        v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real();
        v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag();
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {
        v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real();
        v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag();
  static constexpr int Nc = 3;
    gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++)
      for (int j = 0; j < Nc; j++) {
        v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {
        v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
  static constexpr int Nc = 3;
    gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
    dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]},
    exDim {u.X()[0], u.X()[1], u.X()[2] + 4, u.X()[3]}
    return linkIndex(coord, exDim);

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++)
      for (int j = 0; j < Nc; j++) {
        v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {
        v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
template <typename otherFloat, typename storeFloat>
template <typename otherFloat, typename storeFloat>
template <typename otherFloat, typename storeFloat>
template <typename otherFloat, typename storeFloat>
template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
  // (this template header repeats for each gauge_mapper / FloatNOrder specialization)