QUDA 1.0.0
gauge_field_order.h
1 
2 #ifndef _GAUGE_ORDER_H
3 #define _GAUGE_ORDER_H
4 
11 #ifndef __CUDACC_RTC__
12 #include <assert.h>
13 #endif
14 #include <type_traits>
15 
16 #include <register_traits.h>
17 #include <complex_quda.h>
18 #include <quda_matrix.h>
19 #include <index_helper.cuh>
20 #include <fast_intdiv.h>
21 #include <type_traits>
22 #include <limits>
23 #include <atomic.cuh>
24 #include <thrust_helper.cuh>
25 #include <gauge_field.h>
26 #include <index_helper.cuh>
27 #include <trove_helper.cuh>
28 #include <texture_helper.cuh>
29 
30 namespace quda {
31 
43  template <typename Float, typename T>
44  struct gauge_wrapper {
45  const int dim;
46  const int x_cb;
47  const int parity;
48  const Float phase;
49  T &gauge;
50 
58  __device__ __host__ inline gauge_wrapper<Float, T>(T &gauge, int dim, int x_cb, int parity, Float phase = 1.0) :
59  gauge(gauge),
60  dim(dim),
61  x_cb(x_cb),
62  parity(parity),
63  phase(phase)
64  {
65  }
66 
71  template<typename M>
72  __device__ __host__ inline void operator=(const M &a) {
73  gauge.save(a.data, x_cb, dim, parity);
74  }
75  };
76 
81  template <typename T, int N>
82  template <typename S>
83  __device__ __host__ inline void Matrix<T,N>::operator=(const gauge_wrapper<typename RealType<T>::type,S> &a) {
84  a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase);
85  }
86 
91  template <typename T, int N>
92  template <typename S>
93  __device__ __host__ inline Matrix<T,N>::Matrix(const gauge_wrapper<typename RealType<T>::type,S> &a) {
94  a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase);
95  }
96 
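// Editorial usage sketch (not part of the original header): how a kernel body typically
// consumes gauge_wrapper. G is assumed to be any accessor whose operator() returns a
// gauge_wrapper, e.g. the FloatNOrder accessor defined later in this file.
template <typename real, typename G>
__device__ __host__ void example_link_update(G &gauge, int dim, int x_cb, int parity)
{
  Matrix<complex<real>, 3> U = gauge(dim, x_cb, parity); // invokes gauge.load() via the wrapper
  // ... manipulate U here ...
  gauge(dim, x_cb, parity) = U;                          // invokes gauge.save() via operator=
}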
 108  template <typename Float, typename T>
 109  struct gauge_ghost_wrapper {
 110  const int dim;
111  const int ghost_idx;
112  const int parity;
113  const Float phase;
114  T &gauge;
115 
123  __device__ __host__ inline gauge_ghost_wrapper<Float, T>(
124  T &gauge, int dim, int ghost_idx, int parity, Float phase = 1.0) :
125  gauge(gauge),
126  dim(dim),
127  ghost_idx(ghost_idx),
128  parity(parity),
129  phase(phase)
130  {
131  }
132 
137  template<typename M>
138  __device__ __host__ inline void operator=(const M &a) {
139  gauge.saveGhost(a.data, ghost_idx, dim, parity);
140  }
141  };
142 
147  template <typename T, int N>
148  template <typename S>
149  __device__ __host__ inline void Matrix<T,N>::operator=(const gauge_ghost_wrapper<typename RealType<T>::type,S> &a) {
150  a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase);
151  }
152 
157  template <typename T, int N>
158  template <typename S>
159  __device__ __host__ inline Matrix<T,N>::Matrix(const gauge_ghost_wrapper<typename RealType<T>::type,S> &a) {
160  a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase);
161  }
162 
163  namespace gauge {
164 
165  template<typename ReduceType, typename Float> struct square_ {
166  square_(ReduceType scale) { }
167  __host__ __device__ inline ReduceType operator()(const quda::complex<Float> &x)
168  { return static_cast<ReduceType>(norm(x)); }
169  };
170 
171  template<typename ReduceType> struct square_<ReduceType,char> {
172  const ReduceType scale;
173  square_(const ReduceType scale) : scale(scale) { }
174  __host__ __device__ inline ReduceType operator()(const quda::complex<char> &x)
175  { return norm(scale * complex<ReduceType>(x.real(), x.imag())); }
176  };
177 
178  template<typename ReduceType> struct square_<ReduceType,short> {
179  const ReduceType scale;
180  square_(const ReduceType scale) : scale(scale) { }
181  __host__ __device__ inline ReduceType operator()(const quda::complex<short> &x)
182  { return norm(scale * complex<ReduceType>(x.real(), x.imag())); }
183  };
184 
185  template<typename ReduceType> struct square_<ReduceType,int> {
186  const ReduceType scale;
187  square_(const ReduceType scale) : scale(scale) { }
188  __host__ __device__ inline ReduceType operator()(const quda::complex<int> &x)
189  { return norm(scale * complex<ReduceType>(x.real(), x.imag())); }
190  };
191 
192  template<typename Float, typename storeFloat> struct abs_ {
193  abs_(const Float scale) { }
194  __host__ __device__ Float operator()(const quda::complex<storeFloat> &x) { return abs(x); }
195  };
196 
197  template<typename Float> struct abs_<Float,char> {
198  Float scale;
199  abs_(const Float scale) : scale(scale) { }
200  __host__ __device__ Float operator()(const quda::complex<char> &x)
201  { return abs(scale * complex<Float>(x.real(), x.imag())); }
202  };
203 
204  template<typename Float> struct abs_<Float,short> {
205  Float scale;
206  abs_(const Float scale) : scale(scale) { }
207  __host__ __device__ Float operator()(const quda::complex<short> &x)
208  { return abs(scale * complex<Float>(x.real(), x.imag())); }
209  };
210 
211  template<typename Float> struct abs_<Float,int> {
212  Float scale;
213  abs_(const Float scale) : scale(scale) { }
214  __host__ __device__ Float operator()(const quda::complex<int> &x)
215  { return abs(scale * complex<Float>(x.real(), x.imag())); }
216  };
217 
218  template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool fixed_point() { return false; }
219  template<> __host__ __device__ inline constexpr bool fixed_point<float,char>() { return true; }
220  template<> __host__ __device__ inline constexpr bool fixed_point<float,short>() { return true; }
221  template<> __host__ __device__ inline constexpr bool fixed_point<float,int>() { return true; }
222 
223  template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool match() { return false; }
224  template<> __host__ __device__ inline constexpr bool match<int,int>() { return true; }
225  template<> __host__ __device__ inline constexpr bool match<short,short>() { return true; }
226 
234  template <typename Float, typename storeFloat>
235  struct fieldorder_wrapper {
236  complex<storeFloat> *v;
237  const int idx;
238  const Float scale;
239  const Float scale_inv;
240  static constexpr bool fixed = fixed_point<Float,storeFloat>();
241 
246  __device__ __host__ inline fieldorder_wrapper(complex<storeFloat> *v, int idx, Float scale, Float scale_inv)
247  : v(v), idx(idx), scale(scale), scale_inv(scale_inv) {}
248 
249  __device__ __host__ inline Float real() const {
250  if (!fixed) {
251  return v[idx].real();
252  } else {
253  return scale_inv*static_cast<Float>(v[idx].real());
254  }
255  }
256 
257  __device__ __host__ inline Float imag() const {
258  if (!fixed) {
259  return v[idx].imag();
260  } else {
261  return scale_inv*static_cast<Float>(v[idx].imag());
262  }
263  }
264 
269  __device__ __host__ inline complex<Float> operator-() const {
270  return fixed ? -scale_inv*static_cast<complex<Float> >(v[idx]) : -static_cast<complex<Float> >(v[idx]);
271  }
272 
277  __device__ __host__ inline void operator=(const fieldorder_wrapper<Float,storeFloat> &a) {
278  v[idx] = fixed ? complex<storeFloat>(round(scale * a.real()), round(scale * a.imag())) : a.v[a.idx];
279  }
280 
285  template<typename theirFloat>
286  __device__ __host__ inline void operator=(const complex<theirFloat> &a) {
287  if (match<storeFloat,theirFloat>()) {
288  v[idx] = complex<storeFloat>(a.x, a.y);
289  } else {
290  v[idx] = fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
291  }
292  }
293 
298  template<typename theirFloat>
299  __device__ __host__ inline void operator+=(const complex<theirFloat> &a) {
300  if (match<storeFloat,theirFloat>()) {
301  v[idx] += complex<storeFloat>(a.x, a.y);
302  } else {
303  v[idx] += fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
304  }
305  }
306 
311  template<typename theirFloat>
312  __device__ __host__ inline void operator-=(const complex<theirFloat> &a) {
313  if (match<storeFloat,theirFloat>()) {
314  v[idx] -= complex<storeFloat>(a.x, a.y);
315  } else {
316  v[idx] -= fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
317  }
318  }
319 
320  };
321 
322  template<typename Float, typename storeFloat>
323  __device__ __host__ inline complex<Float> operator*(const Float &a, const fieldorder_wrapper<Float,storeFloat> &b)
324  {
325  if (fixed_point<Float,storeFloat>()) return a*complex<Float>(b.real(), b.imag());
326  else return a*complex<Float>(b.v[b.idx].real(),b.v[b.idx].imag());
327  }
328 
329  template<typename Float, typename storeFloat>
330  __device__ __host__ inline complex<Float> operator+(const fieldorder_wrapper<Float,storeFloat> &a, const complex<Float> &b) {
331  if (fixed_point<Float,storeFloat>()) return complex<Float>(a.real(), a.imag()) + b;
332  else return complex<Float>(a.v[a.idx].real(),a.v[a.idx].imag()) + b;
333  }
334 
335  template<typename Float, typename storeFloat>
336  __device__ __host__ inline complex<Float> operator+(const complex<Float> &a, const fieldorder_wrapper<Float,storeFloat> &b) {
337  if (fixed_point<Float,storeFloat>()) return a + complex<Float>(b.real(), b.imag());
338  else return a + complex<Float>(b.v[b.idx].real(),b.v[b.idx].imag());;
339  }
340 
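// Editorial illustration (assumption: Float = float, storeFloat = short): the fixed-point
// convention used by fieldorder_wrapper and the Accessors below. Writes store
// round(scale * x) and reads return scale_inv * stored, where scale is set from the
// field's site-element maximum by resetScale().
inline float example_fixed_point_round_trip(float x, float max)
{
  float scale = static_cast<float>(std::numeric_limits<short>::max()) / max;
  float scale_inv = max / static_cast<float>(std::numeric_limits<short>::max());
  short stored = static_cast<short>(round(scale * x)); // what operator= writes
  return scale_inv * static_cast<float>(stored);       // what real()/imag() return
}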
341  template<typename Float, int nColor, QudaGaugeFieldOrder order, typename storeFloat, bool use_tex>
342  struct Accessor {
343  mutable complex<Float> dummy;
344  Accessor(const GaugeField &, void *gauge_=0, void **ghost_=0) {
345  errorQuda("Not implemented for order=%d", order);
346  }
347 
348  void resetScale(Float dummy) { }
349 
350  __device__ __host__ complex<Float>& operator()(int d, int parity, int x, int row, int col) const {
351  return dummy;
352  }
353  };
354 
355  template<typename Float, int nColor, QudaGaugeFieldOrder order, bool native_ghost, typename storeFloat, bool use_tex>
356  struct GhostAccessor {
357  mutable complex<Float> dummy;
358  GhostAccessor(const GaugeField &, void *gauge_=0, void **ghost_=0) {
359  errorQuda("Not implemented for order=%d", order);
360  }
361 
362  void resetScale(Float dummy) { }
363 
364  __device__ __host__ complex<Float>& operator()(int d, int parity, int x, int row, int col) const {
365  return dummy;
366  }
367  };
368 
369  template<typename Float, int nColor, typename storeFloat, bool use_tex>
370  struct Accessor<Float,nColor,QUDA_QDP_GAUGE_ORDER,storeFloat,use_tex> {
371  complex <storeFloat> *u[QUDA_MAX_GEOMETRY];
372  const int volumeCB;
373  const int geometry;
374  const int cb_offset;
375  Float scale;
376  Float scale_inv;
377  static constexpr bool fixed = fixed_point<Float,storeFloat>();
378 
379  Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
380  : volumeCB(U.VolumeCB()), geometry(U.Geometry()), cb_offset((U.Bytes()>>1) / (sizeof(complex<storeFloat>)*U.Geometry())),
381  scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
382  {
383  for (int d=0; d<U.Geometry(); d++)
384  u[d] = gauge_ ? static_cast<complex<storeFloat>**>(gauge_)[d] :
385  static_cast<complex<storeFloat>**>(const_cast<void*>(U.Gauge_p()))[d];
386  resetScale(U.Scale());
387  }
388 
 389  Accessor(const Accessor<Float,nColor,QUDA_QDP_GAUGE_ORDER,storeFloat,use_tex> &a)
 390  : volumeCB(a.volumeCB), geometry(a.geometry), cb_offset(a.cb_offset), scale(a.scale), scale_inv(a.scale_inv) {
391  for (int d=0; d<QUDA_MAX_GEOMETRY; d++)
392  u[d] = a.u[d];
393  }
394 
395  void resetScale(Float max) {
396  if (fixed) {
397  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
398  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
399  }
400  }
401 
402  __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
403  {
404  complex<storeFloat> tmp = u[d][ parity*cb_offset + (x*nColor + row)*nColor + col];
405 
406  if (fixed) {
407  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
408  } else {
409  return complex<Float>(tmp.x,tmp.y);
410  }
411  }
412 
413  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x, int row, int col)
414  { return fieldorder_wrapper<Float,storeFloat>(u[d], parity*cb_offset + (x*nColor + row)*nColor + col,
415  scale, scale_inv); }
416 
417  template<typename theirFloat>
418  __device__ __host__ inline void atomic_add(int dim, int parity, int x_cb, int row, int col,
419  const complex<theirFloat> &val) const {
420 #ifdef __CUDA_ARCH__
421  typedef typename vector<storeFloat,2>::type vec2;
422  vec2 *u2 = reinterpret_cast<vec2*>(u[dim] + parity*cb_offset + (x_cb*nColor + row)*nColor + col);
423  if (fixed && !match<storeFloat,theirFloat>()) {
424  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
425  atomicAdd(u2, (vec2&)val_);
426  } else {
427  atomicAdd(u2, (vec2&)val);
428  }
429 #else
430  if (fixed && !match<storeFloat,theirFloat>()) {
431  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
432 #pragma omp atomic update
433  u[dim][ parity*cb_offset + (x_cb*nColor + row)*nColor + col].x += val_.x;
434 #pragma omp atomic update
435  u[dim][ parity*cb_offset + (x_cb*nColor + row)*nColor + col].y += val_.y;
436  } else {
437 #pragma omp atomic update
438  u[dim][ parity*cb_offset + (x_cb*nColor + row)*nColor + col].x += static_cast<storeFloat>(val.x);
439 #pragma omp atomic update
440  u[dim][ parity*cb_offset + (x_cb*nColor + row)*nColor + col].y += static_cast<storeFloat>(val.y);
441  }
442 #endif
443  }
444 
445  template<typename helper, typename reducer>
446  __host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const {
447  if (dim >= geometry) errorQuda("Request dimension %d exceeds dimensionality of the field %d", dim, geometry);
448  int lower = (dim == -1) ? 0 : dim;
449  int upper = (dim == -1) ? geometry : dim+1;
450  double result = init;
451  if (location == QUDA_CUDA_FIELD_LOCATION) {
 452  thrust_allocator alloc;
 453  for (int d=lower; d<upper; d++) {
454  thrust::device_ptr<complex<storeFloat> > ptr(u[d]);
455  result = thrust::transform_reduce(thrust::cuda::par(alloc), ptr, ptr+2*volumeCB*nColor*nColor, h, result, r);
456  }
457  } else {
458  for (int d=lower; d<upper; d++) {
459  result = thrust::transform_reduce(thrust::seq, u[d], u[d]+2*volumeCB*nColor*nColor, h, result, r);
460  }
461  }
462  return result;
463  }
464 
465  };
466 
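// Editorial usage sketch: element access through the QDP-ordered Accessor. The const
// overload returns a complex<Float> copy; the non-const overload returns a
// fieldorder_wrapper, so assignment and += go through the fixed-point scaling above.
template <typename Float, int nColor, typename storeFloat, bool use_tex>
__device__ __host__ void example_qdp_element(Accessor<Float, nColor, QUDA_QDP_GAUGE_ORDER, storeFloat, use_tex> &A,
                                             int d, int parity, int x_cb, int row, int col)
{
  const auto &A_c = A;                               // bind a const view to pick the read overload
  complex<Float> z = A_c(d, parity, x_cb, row, col); // read (rescaled if fixed point)
  A(d, parity, x_cb, row, col) = z;                  // write through fieldorder_wrapper
  A(d, parity, x_cb, row, col) += z;                 // accumulate in place
}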
467  template<typename Float, int nColor, bool native_ghost, typename storeFloat, bool use_tex>
468  struct GhostAccessor<Float,nColor,QUDA_QDP_GAUGE_ORDER,native_ghost,storeFloat,use_tex> {
469  complex<storeFloat> *ghost[8];
470  int ghostOffset[8];
471  Float scale;
472  Float scale_inv;
473  static constexpr bool fixed = fixed_point<Float,storeFloat>();
474 
475  GhostAccessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
476  : scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
477  for (int d=0; d<4; d++) {
478  ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
479  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d]));
480  ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
481 
482  ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
483  ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
484  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4]));
485  ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
486  }
487 
488  resetScale(U.Scale());
489  }
490 
 491  GhostAccessor(const GhostAccessor<Float,nColor,QUDA_QDP_GAUGE_ORDER,native_ghost,storeFloat,use_tex> &a)
 492  : scale(a.scale), scale_inv(a.scale_inv) {
493  for (int d=0; d<8; d++) {
494  ghost[d] = a.ghost[d];
495  ghostOffset[d] = a.ghostOffset[d];
496  }
497  }
498 
499  void resetScale(Float max) {
500  if (fixed) {
501  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
502  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
503  }
504  }
505 
506  __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
507  {
508  complex<storeFloat> tmp = ghost[d][ parity*ghostOffset[d] + (x*nColor + row)*nColor + col];
509  if (fixed) {
510  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
511  } else {
512  return complex<Float>(tmp.x,tmp.y);
513  }
514  }
515 
516  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x, int row, int col)
517  { return fieldorder_wrapper<Float,storeFloat>(ghost[d], parity*ghostOffset[d] + (x*nColor + row)*nColor + col,
518  scale, scale_inv); }
519  };
520 
521  template<typename Float, int nColor, typename storeFloat, bool use_tex>
522  struct Accessor<Float,nColor,QUDA_MILC_GAUGE_ORDER,storeFloat,use_tex> {
523  complex<storeFloat> *u;
524  const int volumeCB;
525  const int geometry;
526  Float scale;
527  Float scale_inv;
528  static constexpr bool fixed = fixed_point<Float,storeFloat>();
529 
530  Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
531  : u(gauge_ ? static_cast<complex<storeFloat>*>(gauge_) :
532  static_cast<complex<storeFloat>*>(const_cast<void *>(U.Gauge_p()))),
533  volumeCB(U.VolumeCB()), geometry(U.Geometry()),
534  scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
535  resetScale(U.Scale());
536  }
537 
 538  Accessor(const Accessor<Float,nColor,QUDA_MILC_GAUGE_ORDER,storeFloat,use_tex> &a)
 539  : u(a.u), volumeCB(a.volumeCB), geometry(a.geometry), scale(a.scale), scale_inv(a.scale_inv)
540  { }
541 
542  void resetScale(Float max) {
543  if (fixed) {
544  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
545  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
546  }
547  }
548 
549  __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
550  {
551  complex<storeFloat> tmp = u[(((parity*volumeCB+x)*geometry + d)*nColor + row)*nColor + col];
552  if (fixed) {
553  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
554  } else {
555  return complex<Float>(tmp.x,tmp.y);
556  }
557  }
558 
559  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x, int row, int col)
 560  { return fieldorder_wrapper<Float,storeFloat>
 561  (u, (((parity*volumeCB+x)*geometry + d)*nColor + row)*nColor + col, scale, scale_inv); }
562 
563  template <typename theirFloat>
564  __device__ __host__ inline void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex<theirFloat> &val) const {
565 #ifdef __CUDA_ARCH__
566  typedef typename vector<storeFloat,2>::type vec2;
567  vec2 *u2 = reinterpret_cast<vec2*>(u + (((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col);
568  if (fixed && !match<storeFloat,theirFloat>()) {
569  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
570  atomicAdd(u2, (vec2&)val_);
571  } else {
572  atomicAdd(u2, (vec2&)val);
573  }
574 #else
575  if (fixed && !match<storeFloat,theirFloat>()) {
576  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
577 #pragma omp atomic update
578  u[(((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col].x += val_.x;
579 #pragma omp atomic update
580  u[(((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col].y += val_.y;
581  } else {
582 #pragma omp atomic update
583  u[(((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col].x += static_cast<storeFloat>(val.x);
584 #pragma omp atomic update
585  u[(((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col].y += static_cast<storeFloat>(val.y);
586  }
587 #endif
588  }
589 
590  template<typename helper, typename reducer>
591  __host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const {
592  if (dim >= geometry) errorQuda("Request dimension %d exceeds dimensionality of the field %d", dim, geometry);
593  int lower = (dim == -1) ? 0 : dim;
594  int upper = (dim == -1) ? geometry : dim+1;
595  double result = init;
596  if (location == QUDA_CUDA_FIELD_LOCATION) {
 597  thrust_allocator alloc;
 598  thrust::device_ptr<complex<storeFloat> > ptr(u);
599  result = thrust::transform_reduce(thrust::cuda::par(alloc),
600  ptr+(0*geometry+lower)*volumeCB*nColor*nColor,
601  ptr+(0*geometry+upper)*volumeCB*nColor*nColor, h, result, r);
602  result = thrust::transform_reduce(thrust::cuda::par(alloc),
603  ptr+(1*geometry+lower)*volumeCB*nColor*nColor,
604  ptr+(1*geometry+upper)*volumeCB*nColor*nColor, h, result, r);
605  } else {
606  result = thrust::transform_reduce(thrust::seq,
607  u+(0*geometry+lower)*volumeCB*nColor*nColor,
608  u+(0*geometry+upper)*volumeCB*nColor*nColor, h, result, r);
609  result = thrust::transform_reduce(thrust::seq,
610  u+(1*geometry+lower)*volumeCB*nColor*nColor,
611  u+(1*geometry+upper)*volumeCB*nColor*nColor, h, result, r);
612  }
613  return result;
614  }
615 
616  };
617 
618  template<typename Float, int nColor, bool native_ghost, typename storeFloat, bool use_tex>
619  struct GhostAccessor<Float,nColor,QUDA_MILC_GAUGE_ORDER,native_ghost,storeFloat,use_tex> {
620  complex<storeFloat> *ghost[8];
621  int ghostOffset[8];
622  Float scale;
623  Float scale_inv;
624  static constexpr bool fixed = fixed_point<Float,storeFloat>();
625 
626  GhostAccessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
627  : scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
628  for (int d=0; d<4; d++) {
629  ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
630  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d]));
631  ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
632 
633  ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
634  ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
635  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4]));
636  ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
637  }
638 
639  resetScale(U.Scale());
640  }
641 
 642  GhostAccessor(const GhostAccessor<Float,nColor,QUDA_MILC_GAUGE_ORDER,native_ghost,storeFloat,use_tex> &a)
 643  : scale(a.scale), scale_inv(a.scale_inv) {
644  for (int d=0; d<8; d++) {
645  ghost[d] = a.ghost[d];
646  ghostOffset[d] = a.ghostOffset[d];
647  }
648  }
649 
650  void resetScale(Float max) {
651  if (fixed) {
652  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
653  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
654  }
655  }
656 
657  __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
658  {
659  complex<storeFloat> tmp = ghost[d][ parity*ghostOffset[d] + (x*nColor + row)*nColor + col];
660  if (fixed) {
661  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
662  } else {
663  return complex<Float>(tmp.x,tmp.y);
664  }
665  }
666 
667  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x, int row, int col)
 668  { return fieldorder_wrapper<Float,storeFloat>
 669  (ghost[d], parity*ghostOffset[d] + (x*nColor + row)*nColor + col, scale, scale_inv); }
670  };
671 
672  template<int nColor, int N>
673  __device__ __host__ inline int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb) {
674  constexpr int M = (2*nColor*nColor) / N;
 675  int j = ((row*nColor+col)*2) / N; // factor of two because the elements are complex
676  int i = ((row*nColor+col)*2) % N;
677  int index = ((x_cb + dim*stride*M + j*stride)*2+i) / 2; // back to a complex offset
678  index += parity*offset_cb;
679  return index;
680  };
681 
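// Editorial worked example (assumption: nColor = 3, N = 2, i.e. complex-pair storage):
// element (row, col) = (1, 2) maps to chunk j = ((1*3+2)*2)/2 = 5 with sub-index i = 0,
// so it sits 5*stride complex entries past the start of that link.
static_assert(((1 * 3 + 2) * 2) / 2 == 5 && ((1 * 3 + 2) * 2) % 2 == 0,
              "indexFloatN chunk arithmetic for nColor=3, N=2");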
682  template<typename Float, int nColor, typename storeFloat, bool use_tex>
683  struct Accessor<Float,nColor,QUDA_FLOAT2_GAUGE_ORDER, storeFloat, use_tex> {
684  complex<storeFloat> *u;
685  const int offset_cb;
686 #ifdef USE_TEXTURE_OBJECTS
687  typedef typename TexVectorType<Float,2>::type TexVector;
688  cudaTextureObject_t tex;
689 #endif
690  const int volumeCB;
691  const int stride;
692  const int geometry;
693  Float max;
694  Float scale;
695  Float scale_inv;
696  static constexpr bool fixed = fixed_point<Float,storeFloat>();
697 
698  Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0, bool override=false)
699  : u(gauge_ ? static_cast<complex<storeFloat>*>(gauge_) :
700  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Gauge_p()))),
701  offset_cb( (U.Bytes()>>1) / sizeof(complex<storeFloat>)),
702 #ifdef USE_TEXTURE_OBJECTS
703  tex(0),
704 #endif
705  volumeCB(U.VolumeCB()), stride(U.Stride()), geometry(U.Geometry()),
706  max(static_cast<Float>(1.0)), scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
707  {
708  resetScale(U.Scale());
709 #ifdef USE_TEXTURE_OBJECTS
710  if (U.Location() == QUDA_CUDA_FIELD_LOCATION) tex = static_cast<const cudaGaugeField&>(U).Tex();
711  if (use_tex && this->u != U.Gauge_p() && !override) {
712  errorQuda("Cannot use texture read since data pointer does not equal field pointer - use with use_tex=false instead");
713  }
714 #endif
715  }
716 
 717  Accessor(const Accessor<Float,nColor,QUDA_FLOAT2_GAUGE_ORDER,storeFloat,use_tex> &a)
 718  : u(a.u), offset_cb(a.offset_cb),
719 #ifdef USE_TEXTURE_OBJECTS
720  tex(a.tex),
721 #endif
722  volumeCB(a.volumeCB), stride(a.stride), geometry(a.geometry),
723  scale(a.scale), scale_inv(a.scale_inv) { }
724 
725  void resetScale(Float max_) {
726  if (fixed) {
727  max = max_;
728  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
729  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
730  }
731  }
732 
733  __device__ __host__ inline const complex<Float> operator()(int dim, int parity, int x_cb, int row, int col) const
734  {
735 #if defined(USE_TEXTURE_OBJECTS) && defined(__CUDA_ARCH__)
736  if (use_tex) {
737  TexVector vecTmp = tex1Dfetch_<TexVector>(tex, parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb);
738  if (fixed) {
739  return max*complex<Float>(vecTmp.x, vecTmp.y);
740  } else {
741  return complex<Float>(vecTmp.x, vecTmp.y);
742  }
743  } else
744 #endif
745  {
746  complex<storeFloat> tmp = u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb];
747  if (fixed) {
748  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
749  } else {
750  return complex<Float>(tmp.x, tmp.y);
751  }
752  }
753  }
754 
755  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int dim, int parity, int x_cb, int row, int col)
756  {
757  int index = parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb;
758  return fieldorder_wrapper<Float,storeFloat>(u, index, scale, scale_inv);
759  }
760 
761  template <typename theirFloat>
762  __device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex<theirFloat> &val) const {
763 #ifdef __CUDA_ARCH__
764  typedef typename vector<storeFloat,2>::type vec2;
765  vec2 *u2 = reinterpret_cast<vec2*>(u + parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb);
766  if (fixed && !match<storeFloat,theirFloat>()) {
767  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
768  atomicAdd(u2, (vec2&)val_);
769  } else {
770  atomicAdd(u2, (vec2&)val);
771  }
772 #else
773  if (fixed && !match<storeFloat,theirFloat>()) {
774  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
775 #pragma omp atomic update
776  u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb].x += val_.x;
777 #pragma omp atomic update
778  u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb].y += val_.y;
779  } else {
780 #pragma omp atomic update
781  u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb].x += static_cast<storeFloat>(val.x);
782 #pragma omp atomic update
783  u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb].y += static_cast<storeFloat>(val.y);
784  }
785 #endif
786  }
787 
788  template<typename helper, typename reducer>
789  __host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const {
790  if (dim >= geometry) errorQuda("Request dimension %d exceeds dimensionality of the field %d", dim, geometry);
791  int lower = (dim == -1) ? 0 : dim;
792  int upper = (dim == -1) ? geometry : dim+1;
793  double result = init;
794  if (location == QUDA_CUDA_FIELD_LOCATION) {
 795  thrust_allocator alloc;
 796  thrust::device_ptr<complex<storeFloat> > ptr(u);
797  result = thrust::transform_reduce(thrust::cuda::par(alloc),
798  ptr+0*offset_cb+lower*stride*nColor*nColor,
799  ptr+0*offset_cb+upper*stride*nColor*nColor, h, result, r);
800  result = thrust::transform_reduce(thrust::cuda::par(alloc),
801  ptr+1*offset_cb+lower*stride*nColor*nColor,
802  ptr+1*offset_cb+upper*stride*nColor*nColor, h, result, r);
803  } else {
804  result = thrust::transform_reduce(thrust::seq,
805  u+0*offset_cb+lower*stride*nColor*nColor,
806  u+0*offset_cb+upper*stride*nColor*nColor, h, result, r);
807  result = thrust::transform_reduce(thrust::seq,
808  u+1*offset_cb+lower*stride*nColor*nColor,
809  u+1*offset_cb+upper*stride*nColor*nColor, h, result, r);
810  }
811  return result;
812  }
813 
814  };
815 
816  template<typename Float, int nColor, bool native_ghost, typename storeFloat, bool use_tex>
817  struct GhostAccessor<Float,nColor,QUDA_FLOAT2_GAUGE_ORDER,native_ghost,storeFloat,use_tex> {
818  complex<storeFloat> *ghost[8];
819  const int volumeCB;
820  int ghostVolumeCB[8];
821  Float scale;
822  Float scale_inv;
823  static constexpr bool fixed = fixed_point<Float,storeFloat>();
 824  Accessor<Float,nColor,QUDA_FLOAT2_GAUGE_ORDER,storeFloat,use_tex> accessor;
 825 
826  GhostAccessor(const GaugeField &U, void *gauge_, void **ghost_=0)
827  : volumeCB(U.VolumeCB()), accessor(U, gauge_, ghost_),
828  scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
829  {
830  if (!native_ghost) assert(ghost_ != nullptr);
831  for (int d=0; d<4; d++) {
832  ghost[d] = !native_ghost ? static_cast<complex<storeFloat>*>(ghost_[d]) : nullptr;
833  ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d);
834  ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast<complex<storeFloat>*>(ghost_[d+4]) : nullptr;
835  ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d);
836  }
837  resetScale(U.Scale());
838  }
839 
 840  GhostAccessor(const GhostAccessor<Float,nColor,QUDA_FLOAT2_GAUGE_ORDER,native_ghost,storeFloat,use_tex> &a)
 841  : volumeCB(a.volumeCB), scale(a.scale), scale_inv(a.scale_inv), accessor(a.accessor)
842  {
843  for (int d=0; d<8; d++) {
844  ghost[d] = a.ghost[d];
845  ghostVolumeCB[d] = a.ghostVolumeCB[d];
846  }
847  }
848 
849  void resetScale(Float max) {
850  accessor.resetScale(max);
851  if (fixed) {
852  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
853  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
854  }
855  }
856 
857  __device__ __host__ inline const complex<Float> operator()(int d, int parity, int x_cb, int row, int col) const
858  {
859  if (native_ghost) {
860  return accessor(d%4, parity, x_cb+(d/4)*ghostVolumeCB[d]+volumeCB, row, col);
861  } else {
862  complex<storeFloat> tmp = ghost[d][ ((parity*nColor + row)*nColor+col)*ghostVolumeCB[d] + x_cb ];
863  if (fixed) {
864  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
865  } else {
866  return complex<Float>(tmp.x, tmp.y);
867  }
868  }
869  }
870 
871  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x_cb, int row, int col)
872  {
873  if (native_ghost)
874  return accessor(d%4, parity, x_cb+(d/4)*ghostVolumeCB[d]+volumeCB, row, col);
875  else
 876  return fieldorder_wrapper<Float,storeFloat>
 877  (ghost[d], ((parity*nColor + row)*nColor+col)*ghostVolumeCB[d] + x_cb, scale, scale_inv);
878  }
879  };
880 
881 
894  template <typename Float, int nColor, int nSpinCoarse, QudaGaugeFieldOrder order,
895  bool native_ghost=true, typename storeFloat=Float, bool use_tex=false>
896  struct FieldOrder {
897 
899  const int volumeCB;
900  const int nDim;
 901  const int_fastdiv geometry;
 902  const QudaFieldLocation location;
 903  static constexpr int nColorCoarse = nColor / nSpinCoarse;
904 
 905  Accessor<Float,nColor,order,storeFloat,use_tex> accessor;
 906  GhostAccessor<Float,nColor,order,native_ghost,storeFloat,use_tex> ghostAccessor;
 907 
912  FieldOrder(GaugeField &U, void *gauge_=0, void **ghost_=0)
913  : volumeCB(U.VolumeCB()), nDim(U.Ndim()), geometry(U.Geometry()),
914  location(U.Location()),
915  accessor(U, gauge_, ghost_), ghostAccessor(U, gauge_, ghost_)
916  {
917  if (U.Reconstruct() != QUDA_RECONSTRUCT_NO)
918  errorQuda("GaugeField ordering not supported with reconstruction");
919  }
920 
921  FieldOrder(const FieldOrder &o) : volumeCB(o.volumeCB),
922  nDim(o.nDim), geometry(o.geometry), location(o.location),
923  accessor(o.accessor), ghostAccessor(o.ghostAccessor)
924  { }
925 
926  void resetScale(double max) {
927  accessor.resetScale(max);
928  ghostAccessor.resetScale(max);
929  }
930 
931  static constexpr bool fixedPoint() { return fixed_point<Float,storeFloat>(); }
932 
941  __device__ __host__ complex<Float> operator()(int d, int parity, int x, int row, int col) const
942  { return accessor(d,parity,x,row,col); }
943 
952  __device__ __host__ fieldorder_wrapper<Float,storeFloat> operator() (int d, int parity, int x, int row, int col)
953  { return accessor(d,parity,x,row,col); }
954 
963  __device__ __host__ complex<Float> Ghost(int d, int parity, int x, int row, int col) const
964  { return ghostAccessor(d,parity,x,row,col); }
965 
974  __device__ __host__ fieldorder_wrapper<Float,storeFloat> Ghost(int d, int parity, int x, int row, int col)
975  { return ghostAccessor(d,parity,x,row,col); }
976 
987  __device__ __host__ inline const complex<Float> operator()(int d, int parity, int x, int s_row,
988  int s_col, int c_row, int c_col) const {
989  return (*this)(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col);
990  }
991 
1002  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()
1003  (int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) {
1004  return (*this)(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col);
1005  }
1006 
1017  __device__ __host__ inline complex<Float> Ghost(int d, int parity, int x, int s_row,
1018  int s_col, int c_row, int c_col) const {
1019  return Ghost(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col);
1020  }
1021 
1032  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat>
1033  Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) {
1034  return Ghost(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col);
1035  }
1036 
1037  template <typename theirFloat>
1038  __device__ __host__ inline void atomicAdd(int d, int parity, int x, int s_row, int s_col,
1039  int c_row, int c_col, const complex<theirFloat> &val) {
1040  accessor.atomic_add(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col, val);
1041  }
1042 
1044  __device__ __host__ inline int Ncolor() const { return nColor; }
1045 
1047  __device__ __host__ inline int Volume() const { return 2*volumeCB; }
1048 
1050  __device__ __host__ inline int VolumeCB() const { return volumeCB; }
1051 
1053  __device__ __host__ inline int Ndim() const { return nDim; }
1054 
1056  __device__ __host__ inline int Geometry() const { return geometry; }
1057 
1059  __device__ __host__ inline int NspinCoarse() const { return nSpinCoarse; }
1060 
1062  __device__ __host__ inline int NcolorCoarse() const { return nColorCoarse; }
1063 
1069  __host__ double norm1(int dim=-1, bool global=true) const {
1070  double nrm1 = accessor.transform_reduce(location, dim, abs_<double,storeFloat>(accessor.scale_inv),
1071  thrust::plus<double>(), 0.0);
1072  if (global) comm_allreduce(&nrm1);
1073  return nrm1;
1074  }
1075 
1081  __host__ double norm2(int dim=-1, bool global=true) const {
1082  double nrm2 = accessor.transform_reduce(location, dim, square_<double,storeFloat>(accessor.scale_inv),
1083  thrust::plus<double>(), 0.0);
1084  if (global) comm_allreduce(&nrm2);
1085  return nrm2;
1086  }
1087 
1093  __host__ double abs_max(int dim=-1, bool global=true) const {
1094  double absmax = accessor.transform_reduce(location, dim, abs_<Float,storeFloat>(accessor.scale_inv),
1095  thrust::maximum<Float>(), 0.0);
1096  if (global) comm_allreduce_max(&absmax);
1097  return absmax;
1098  }
1099 
1105  __host__ double abs_min(int dim=-1, bool global=true) const {
1106  double absmin = accessor.transform_reduce(location, dim, abs_<Float,storeFloat>(accessor.scale_inv),
1107  thrust::minimum<Float>(), std::numeric_limits<double>::max());
1108  if (global) comm_allreduce_min(&absmin);
1109  return absmin;
1110  }
1111 
1113  size_t Bytes() const { return static_cast<size_t>(volumeCB) * nColor * nColor * 2ll * sizeof(storeFloat); }
1114  };
1115 
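// Editorial usage sketch: constructing a FieldOrder view over a QUDA_MILC_GAUGE_ORDER
// field with 3 colors and no coarse spin structure, then taking its global L2 norm.
// The field is assumed to use QUDA_RECONSTRUCT_NO, as the constructor above requires.
template <typename Float>
__host__ double example_field_order_norm2(GaugeField &U)
{
  FieldOrder<Float, 3, 1, QUDA_MILC_GAUGE_ORDER> F(U);
  return F.norm2(); // transform_reduce over the accessor, then comm_allreduce
}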
1124  template <int N, typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase = QUDA_STAGGERED_PHASE_NO>
1125  struct Reconstruct {
1126  using real = typename mapper<Float>::type;
 1127  using complex = complex<real>;
 1128  const real scale;
 1129  const real scale_inv;
 1130  Reconstruct(const GaugeField &u) : scale(u.LinkMax()), scale_inv(1.0 / scale) {}
1131  Reconstruct(const Reconstruct<N, Float, ghostExchange_> &recon) : scale(recon.scale), scale_inv(recon.scale_inv)
1132  {
1133  }
1134 
1135  __device__ __host__ inline void Pack(real out[N], const complex in[N / 2], int idx) const
1136  {
1137  if (isFixed<Float>::value) {
1138 #pragma unroll
1139  for (int i = 0; i < N / 2; i++) {
1140  out[2 * i + 0] = scale_inv * in[i].real();
1141  out[2 * i + 1] = scale_inv * in[i].imag();
1142  }
1143  } else {
1144 #pragma unroll
1145  for (int i = 0; i < N / 2; i++) {
1146  out[2 * i + 0] = in[i].real();
1147  out[2 * i + 1] = in[i].imag();
1148  }
1149  }
1150  }
1151 
1152  template <typename I>
1153  __device__ __host__ inline void Unpack(complex out[N / 2], const real in[N], int idx, int dir, real phase,
1154  const I *X, const int *R) const
1155  {
1156  if (isFixed<Float>::value) {
1157 #pragma unroll
1158  for (int i = 0; i < N / 2; i++) { out[i] = scale * complex(in[2 * i + 0], in[2 * i + 1]); }
1159  } else {
1160 #pragma unroll
1161  for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); }
1162  }
1163  }
1164  __device__ __host__ inline real getPhase(const complex in[N / 2]) const { return 0; }
1165  };
1166 
1178  template <QudaGhostExchange ghostExchange_, typename T, typename I>
1179  __device__ __host__ inline T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM],
1180  T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound, bool isFirstTimeSlice,
1181  bool isLastTimeSlice, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_NO)
1182  {
1183 
1184  // MWTODO: should this return tBoundary : scale or tBoundary*scale : scale
1185 
1186  if (ghostExchange_ == QUDA_GHOST_EXCHANGE_PAD
1187  || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED)) {
1188  if (idx >= firstTimeSliceBound) { // halo region on the first time slice
1189  return isFirstTimeSlice ? tBoundary : scale;
1190  } else if (idx >= lastTimeSliceBound) { // last link on the last time slice
1191  return isLastTimeSlice ? tBoundary : scale;
1192  } else {
1193  return scale;
1194  }
1195  } else if (ghostExchange_ == QUDA_GHOST_EXCHANGE_EXTENDED
1196  || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED)) {
1197  if (idx >= (R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < R[3] * X[0] * X[1] * X[2] / 2) {
1198  // the boundary condition is on the R[3]-1 time slice
1199  return isFirstTimeSlice ? tBoundary : scale;
1200  } else if (idx >= (X[3] - R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < (X[3] - R[3]) * X[0] * X[1] * X[2] / 2) {
1201  // the boundary condition lies on the X[3]-R[3]-1 time slice
1202  return isLastTimeSlice ? tBoundary : scale;
1203  } else {
1204  return scale;
1205  }
1206  }
1207  return scale;
1208  }
1209 
1210  // not actually used - here for reference
1211  template <typename Float, typename I>
1212  __device__ __host__ inline Float milcStaggeredPhase(int dim, const int x[], const I R[]) {
1213  // could consider non-extended variant too?
1214  Float sign = static_cast<Float>(1.0);
1215  switch (dim) {
1216  case 0: if ( ((x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
1217  case 1: if ( ((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
1218  case 2: if ( ((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
1219  }
1220  return sign;
1221  }
1222 
1230  template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<12, Float, ghostExchange_> {
1231  using real = typename mapper<Float>::type;
 1232  using complex = complex<real>;
 1233  const real anisotropy;
 1234  const real tBoundary;
 1235  const int firstTimeSliceBound;
 1236  const int lastTimeSliceBound;
 1237  const bool isFirstTimeSlice;
1238  const bool isLastTimeSlice;
 1239  QudaGhostExchange ghostExchange;
 1240 
 1241  Reconstruct(const GaugeField &u) :
 1242  anisotropy(u.Anisotropy()),
1243  tBoundary(static_cast<real>(u.TBoundary())),
1244  firstTimeSliceBound(u.VolumeCB()),
1245  lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2),
1246  isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
1247  isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false),
1248  ghostExchange(u.GhostExchange())
1249  {
1250  }
1251 
 1252  Reconstruct(const Reconstruct<12, Float, ghostExchange_> &recon) :
 1253  anisotropy(recon.anisotropy),
1254  tBoundary(recon.tBoundary),
1255  firstTimeSliceBound(recon.firstTimeSliceBound),
1256  lastTimeSliceBound(recon.lastTimeSliceBound),
1257  isFirstTimeSlice(recon.isFirstTimeSlice),
1258  isLastTimeSlice(recon.isLastTimeSlice),
1259  ghostExchange(recon.ghostExchange)
1260  {
1261  }
1262 
1263  __device__ __host__ inline void Pack(real out[12], const complex in[9], int idx) const
1264  {
1265 #pragma unroll
1266  for (int i = 0; i < 6; i++) {
1267  out[2 * i + 0] = in[i].real();
1268  out[2 * i + 1] = in[i].imag();
1269  }
1270  }
1271 
1272  template <typename I>
1273  __device__ __host__ inline void Unpack(complex out[9], const real in[12], int idx, int dir, real phase,
1274  const I *X, const int *R) const
1275  {
1276 #pragma unroll
1277  for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]);
1278 
1279  const real u0 = dir < 3 ?
1280  anisotropy :
1281  timeBoundary<ghostExchange_>(idx, X, R, tBoundary, static_cast<real>(1.0), firstTimeSliceBound,
1282  lastTimeSliceBound, isFirstTimeSlice, isLastTimeSlice, ghostExchange);
1283 
1284  // out[6] = u0*conj(out[1]*out[5] - out[2]*out[4]);
1285  out[6] = cmul(out[2], out[4]);
1286  out[6] = cmac(out[1], out[5], -out[6]);
1287  out[6] = u0 * conj(out[6]);
1288 
1289  // out[7] = u0*conj(out[2]*out[3] - out[0]*out[5]);
1290  out[7] = cmul(out[0], out[5]);
1291  out[7] = cmac(out[2], out[3], -out[7]);
1292  out[7] = u0 * conj(out[7]);
1293 
1294  // out[8] = u0*conj(out[0]*out[4] - out[1]*out[3]);
1295  out[8] = cmul(out[1], out[3]);
1296  out[8] = cmac(out[0], out[4], -out[8]);
1297  out[8] = u0 * conj(out[8]);
1298  }
1299 
1300  __device__ __host__ inline real getPhase(const complex in[9]) { return 0; }
1301  };
1302 
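// Editorial note: the cmul/cmac blocks above implement the SU(3) identity
// row2 = u0 * conj(row0 x row1); for column c this reads
//   out[6+c] = u0 * conj(out[(c+1)%3] * out[(c+2)%3 + 3] - out[(c+2)%3] * out[(c+1)%3 + 3]),
// written with fused multiply-adds instead of the commented one-line form.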
1313  template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<11, Float, ghostExchange_> {
1314  using real = typename mapper<Float>::type;
 1315  using complex = complex<real>;
 1316 
1317  Reconstruct(const GaugeField &u) { ; }
 1318  Reconstruct(const Reconstruct<11, Float, ghostExchange_> &recon) { ; }
 1319 
1320  __device__ __host__ inline void Pack(real out[10], const complex in[9], int idx) const
1321  {
1322 #pragma unroll
1323  for (int i = 0; i < 2; i++) {
1324  out[2 * i + 0] = in[i + 1].real();
1325  out[2 * i + 1] = in[i + 1].imag();
1326  }
1327  out[4] = in[5].real();
1328  out[5] = in[5].imag();
1329  out[6] = in[0].imag();
1330  out[7] = in[4].imag();
1331  out[8] = in[8].imag();
1332  out[9] = 0.0;
1333  }
1334 
1335  template <typename I>
1336  __device__ __host__ inline void Unpack(complex out[9], const real in[10], int idx, int dir, real phase,
1337  const I *X, const int *R) const
1338  {
1339  out[0] = complex(0.0, in[6]);
1340  out[1] = complex(in[0], in[1]);
1341  out[2] = complex(in[2], in[3]);
1342  out[3] = complex(-out[1].real(), out[1].imag());
1343  out[4] = complex(0.0, in[7]);
1344  out[5] = complex(in[4], in[5]);
1345  out[6] = complex(-out[2].real(), out[2].imag());
1346  out[7] = complex(-out[5].real(), out[5].imag());
1347  out[8] = complex(0.0, in[8]);
1348  }
1349 
1350  __device__ __host__ inline real getPhase(const complex in[9]) { return 0; }
1351  };
1352 
1361  template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase>
1362  struct Reconstruct<13, Float, ghostExchange_, stag_phase> {
1363  using real = typename mapper<Float>::type;
 1364  using complex = complex<real>;
 1365  const Reconstruct<12, Float, ghostExchange_> reconstruct_12;
 1366  const real scale;
 1367  const real scale_inv;
 1368 
1369  Reconstruct(const GaugeField &u) : reconstruct_12(u), scale(u.Scale()), scale_inv(1.0 / scale) {}
 1370  Reconstruct(const Reconstruct<13, Float, ghostExchange_, stag_phase> &recon) :
 1371  reconstruct_12(recon.reconstruct_12),
1372  scale(recon.scale),
1373  scale_inv(recon.scale_inv)
1374  {
1375  }
1376 
1377  __device__ __host__ inline void Pack(real out[12], const complex in[9], int idx) const
1378  {
1379  reconstruct_12.Pack(out, in, idx);
1380  }
1381 
1382  template <typename I>
1383  __device__ __host__ inline void Unpack(complex out[9], const real in[12], int idx, int dir, real phase,
1384  const I *X, const int *R) const
1385  {
1386 #pragma unroll
1387  for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]);
1388 
1389  out[6] = cmul(out[2], out[4]);
1390  out[6] = cmac(out[1], out[5], -out[6]);
1391  out[6] = scale_inv * conj(out[6]);
1392 
1393  out[7] = cmul(out[0], out[5]);
1394  out[7] = cmac(out[2], out[3], -out[7]);
1395  out[7] = scale_inv * conj(out[7]);
1396 
1397  out[8] = cmul(out[1], out[3]);
1398  out[8] = cmac(out[0], out[4], -out[8]);
1399  out[8] = scale_inv * conj(out[8]);
1400 
1401  if (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phasing
1402  // Multiply the third row by exp(I*3*phase), since the cross product will end up in a scale factor of exp(-I*2*phase)
1403  real cos_sin[2];
1404  Trig<isFixed<real>::value, real>::SinCos(static_cast<real>(3. * phase), &cos_sin[1], &cos_sin[0]);
1405  complex A(cos_sin[0], cos_sin[1]);
1406  out[6] = cmul(A, out[6]);
1407  out[7] = cmul(A, out[7]);
1408  out[8] = cmul(A, out[8]);
1409  } else { // phase is +/- 1 so real multiply is sufficient
1410  out[6] *= phase;
1411  out[7] *= phase;
1412  out[8] *= phase;
1413  }
1414  }
1415 
1416  __device__ __host__ inline real getPhase(const complex in[9]) const
1417  {
1418 #if 1 // phase from cross product
1419  // denominator = (U[0][0]*U[1][1] - U[0][1]*U[1][0])*
1420  complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv;
1421  complex expI3Phase = in[8] / denom; // numerator = U[2][2]
1422 
1423  if (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phasing
1424  return arg(expI3Phase) / static_cast<real>(3.0);
1425  } else {
1426  return expI3Phase.real() > 0 ? 1 : -1;
1427  }
1428 #else // phase from determinant
 1429  Matrix<complex, 3> a;
 1430 #pragma unroll
1431  for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i];
1432  const complex det = getDeterminant(a);
1433  return phase = arg(det) / 3;
1434 #endif
1435  }
1436  };
1437 
1445  template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<8, Float, ghostExchange_> {
1446  using real = typename mapper<Float>::type;
 1447  using complex = complex<real>;
 1448  const complex anisotropy; // imaginary value stores inverse
1449  const complex tBoundary; // imaginary value stores inverse
 1450  const int firstTimeSliceBound;
 1451  const int lastTimeSliceBound;
 1452  const bool isFirstTimeSlice;
1453  const bool isLastTimeSlice;
 1454  QudaGhostExchange ghostExchange;
 1455 
1456  // scale factor is set when using recon-9
1457  Reconstruct(const GaugeField &u, real scale = 1.0) :
1458  anisotropy(u.Anisotropy() * scale, 1.0 / (u.Anisotropy() * scale)),
1459  tBoundary(static_cast<real>(u.TBoundary()) * scale, 1.0 / (static_cast<real>(u.TBoundary()) * scale)),
1460  firstTimeSliceBound(u.VolumeCB()),
1461  lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2),
1462  isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
1463  isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false),
1464  ghostExchange(u.GhostExchange())
1465  {
1466  }
1467 
 1468  Reconstruct(const Reconstruct<8, Float, ghostExchange_> &recon) :
 1469  anisotropy(recon.anisotropy),
1470  tBoundary(recon.tBoundary),
1471  firstTimeSliceBound(recon.firstTimeSliceBound),
1472  lastTimeSliceBound(recon.lastTimeSliceBound),
1473  isFirstTimeSlice(recon.isFirstTimeSlice),
1474  isLastTimeSlice(recon.isLastTimeSlice),
1475  ghostExchange(recon.ghostExchange)
1476  {
1477  }
1478 
1479  __device__ __host__ inline void Pack(real out[8], const complex in[9], int idx) const
1480  {
1481  out[0] = Trig<isFixed<Float>::value, real>::Atan2(in[0].imag(), in[0].real());
1482  out[1] = Trig<isFixed<Float>::value, real>::Atan2(in[6].imag(), in[6].real());
1483 #pragma unroll
1484  for (int i = 1; i < 4; i++) {
1485  out[2 * i + 0] = in[i].real();
1486  out[2 * i + 1] = in[i].imag();
1487  }
1488  }
1489 
1490  template <typename I>
1491  __device__ __host__ inline void Unpack(complex out[9], const real in[8], int idx, int dir, real phase,
1492  const I *X, const int *R, const complex scale, const complex u) const
1493  {
1494  real u0 = u.real();
1495  real u0_inv = u.imag();
1496 
1497 #pragma unroll
1498  for (int i = 1; i <= 3; i++)
1499  out[i] = complex(in[2 * i + 0], in[2 * i + 1]); // these elements are copied directly
1500 
1501  real tmp[2];
1502  Trig<isFixed<Float>::value, real>::SinCos(in[0], &tmp[1], &tmp[0]);
1503  out[0] = complex(tmp[0], tmp[1]);
1504 
1505  Trig<isFixed<Float>::value, real>::SinCos(in[1], &tmp[1], &tmp[0]);
1506  out[6] = complex(tmp[0], tmp[1]);
1507 
1508  // First, reconstruct first row
1509  real row_sum = out[1].real() * out[1].real();
1510  row_sum += out[1].imag() * out[1].imag();
1511  row_sum += out[2].real() * out[2].real();
1512  row_sum += out[2].imag() * out[2].imag();
1513  real row_sum_inv = static_cast<real>(1.0) / row_sum;
1514 
1515  real diff = u0_inv * u0_inv - row_sum;
1516  real U00_mag = diff > 0.0 ? diff * rsqrt(diff) : static_cast<real>(0.0);
1517 
1518  out[0] *= U00_mag;
1519 
1520  // Second, reconstruct first column
1521  real column_sum = out[0].real() * out[0].real();
1522  column_sum += out[0].imag() * out[0].imag();
1523  column_sum += out[3].real() * out[3].real();
1524  column_sum += out[3].imag() * out[3].imag();
1525 
1526  diff = u0_inv * u0_inv - column_sum;
1527  real U20_mag = diff > 0.0 ? diff * rsqrt(diff) : static_cast<real>(0.0);
1528 
1529  out[6] *= U20_mag;
1530 
1531  // Finally, reconstruct last elements from SU(2) rotation
1532  real r_inv2 = u0_inv * row_sum_inv;
1533  {
1534  complex A = cmul(conj(out[0]), out[3]);
1535 
1536  // out[4] = -(conj(out[6])*conj(out[2]) + u0*A*out[1])*r_inv2; // U11
1537  out[4] = cmul(conj(out[6]), conj(out[2]));
1538  out[4] = cmac(u0 * A, out[1], out[4]);
1539  out[4] = -r_inv2 * out[4];
1540 
1541  // out[5] = (conj(out[6])*conj(out[1]) - u0*A*out[2])*r_inv2; // U12
1542  out[5] = cmul(conj(out[6]), conj(out[1]));
1543  out[5] = cmac(-u0 * A, out[2], out[5]);
1544  out[5] = r_inv2 * out[5];
1545  }
1546 
1547  {
1548  complex A = cmul(conj(out[0]), out[6]);
1549 
1550  // out[7] = (conj(out[3])*conj(out[2]) - u0*A*out[1])*r_inv2; // U21
1551  out[7] = cmul(conj(out[3]), conj(out[2]));
1552  out[7] = cmac(-u0 * A, out[1], out[7]);
1553  out[7] = r_inv2 * out[7];
1554 
 1555  // out[8] = -(conj(out[3])*conj(out[1]) + u0*A*out[2])*r_inv2; // U22
1556  out[8] = cmul(conj(out[3]), conj(out[1]));
1557  out[8] = cmac(u0 * A, out[2], out[8]);
1558  out[8] = -r_inv2 * out[8];
1559  }
1560  }
1561 
1562  template <typename I>
1563  __device__ __host__ inline void
1564  Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R,
1565  const complex scale = complex(static_cast<real>(1.0), static_cast<real>(1.0))) const
1566  {
1567  complex u = dir < 3 ?
1568  anisotropy :
1569  timeBoundary<ghostExchange_>(idx, X, R, tBoundary, scale, firstTimeSliceBound, lastTimeSliceBound,
1570  isFirstTimeSlice, isLastTimeSlice, ghostExchange);
1571  Unpack(out, in, idx, dir, phase, X, R, scale, u);
1572  }
1573 
1574  __device__ __host__ inline real getPhase(const complex in[9]) { return 0; }
1575  };
1576 
1585  template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase>
1586  struct Reconstruct<9, Float, ghostExchange_, stag_phase> {
1587  using real = typename mapper<Float>::type;
 1588  using complex = complex<real>;
 1589  const Reconstruct<8, Float, ghostExchange_> reconstruct_8;
 1590  const real scale;
 1591  const real scale_inv;
 1592 
1593  Reconstruct(const GaugeField &u) : reconstruct_8(u), scale(u.Scale()), scale_inv(1.0 / scale) {}
1594 
 1595  Reconstruct(const Reconstruct<9, Float, ghostExchange_, stag_phase> &recon) :
 1596  reconstruct_8(recon.reconstruct_8),
1597  scale(recon.scale),
1598  scale_inv(recon.scale_inv)
1599  {
1600  }
1601 
1602  __device__ __host__ inline real getPhase(const complex in[9]) const
1603  {
1604 #if 1 // phase from cross product
1605  // denominator = (U[0][0]*U[1][1] - U[0][1]*U[1][0])*
1606  complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv;
1607  complex expI3Phase = in[8] / denom; // numerator = U[2][2]
1608  if (stag_phase == QUDA_STAGGERED_PHASE_NO) {
1609  return arg(expI3Phase) / static_cast<real>(3.0);
1610  } else {
1611  return expI3Phase.real() > 0 ? 1 : -1;
1612  }
1613 #else // phase from determinant
 1614  Matrix<complex, 3> a;
 1615 #pragma unroll
1616  for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i];
1617  const complex det = getDeterminant(a);
1618  real phase = arg(det) / 3;
1619  return phase;
1620 #endif
1621  }
1622 
1623  // Rescale the U3 input matrix by exp(-I*phase) to obtain an SU3 matrix multiplied by a real scale factor,
1624  __device__ __host__ inline void Pack(real out[8], const complex in[9], int idx) const
1625  {
1626  real phase = getPhase(in);
1627  complex su3[9];
1628 
1629  if (stag_phase == QUDA_STAGGERED_PHASE_NO) {
1630  real cos_sin[2];
1631  Trig<isFixed<real>::value, real>::SinCos(static_cast<real>(-phase), &cos_sin[1], &cos_sin[0]);
1632  complex z(cos_sin[0], cos_sin[1]);
1633  z *= scale_inv;
1634 #pragma unroll
1635  for (int i = 0; i < 9; i++) su3[i] = cmul(z, in[i]);
1636  } else {
1637 #pragma unroll
1638  for (int i = 0; i < 9; i++) { su3[i] = phase * in[i]; }
1639  }
1640  reconstruct_8.Pack(out, su3, idx);
1641  }
1642 
1643  template <typename I>
1644  __device__ __host__ inline void Unpack(complex out[9], const real in[8], int idx, int dir, real phase,
1645  const I *X, const int *R) const
1646  {
1647  reconstruct_8.Unpack(out, in, idx, dir, phase, X, R, complex(static_cast<real>(1.0), static_cast<real>(1.0)),
1648  complex(static_cast<real>(1.0), static_cast<real>(1.0)));
1649 
1650  if (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phase
1651  real cos_sin[2];
1652  Trig<isFixed<real>::value, real>::SinCos(static_cast<real>(phase), &cos_sin[1], &cos_sin[0]);
1653  complex z(cos_sin[0], cos_sin[1]);
1654  z *= scale;
1655 #pragma unroll
1656  for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]);
 1657  } else { // static phase: +/- 1, so a real multiply over the 9 complex elements suffices
 1658 #pragma unroll
 1659  for (int i = 0; i < 9; i++) { out[i] *= phase; }
1660  }
1661  }
1662  };
1663 
1664  __host__ __device__ constexpr int ct_sqrt(int n, int i = 1)
1665  {
1666  return n == i ? n : (i * i < n ? ct_sqrt(n, i + 1) : i);
1667  }
1668 
1674  __host__ __device__ constexpr int Ncolor(int length) { return ct_sqrt(length / 2); }
1675 
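// Editorial sanity check: a length-18 real-valued link (a 3x3 complex matrix) has Ncolor 3.
static_assert(Ncolor(18) == 3, "Ncolor(18) should be 3");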
1676  // we default to huge allocations for gauge field (for now)
1677  constexpr bool default_huge_alloc = true;
1678 
1679  template <QudaStaggeredPhase phase> __host__ __device__ inline bool static_phase()
1680  {
1681  switch (phase) {
 1682  case QUDA_STAGGERED_PHASE_MILC:
 1683  case QUDA_STAGGERED_PHASE_CPS:
 1684  case QUDA_STAGGERED_PHASE_TIFR: return true;
1685  default: return false;
1686  }
1687  }
1688 
1689  template <typename Float, int length, int N, int reconLenParam,
1690  QudaStaggeredPhase stag_phase = QUDA_STAGGERED_PHASE_NO, bool huge_alloc = default_huge_alloc,
1691  QudaGhostExchange ghostExchange_ = QUDA_GHOST_EXCHANGE_INVALID, bool use_inphase = false>
1692  struct FloatNOrder {
 1693  using Accessor
 1694  = FloatNOrder<Float, length, N, reconLenParam, stag_phase, huge_alloc, ghostExchange_, use_inphase>;
 1695 
1696  using real = typename mapper<Float>::type;
 1697  using complex = complex<real>;
 1698  typedef typename VectorType<Float, N>::type Vector;
 1699  typedef typename AllocType<huge_alloc>::type AllocInt;
 1700  Reconstruct<reconLenParam, Float, ghostExchange_, stag_phase> reconstruct;
 1701  static const int reconLen = (reconLenParam == 11) ? 10 : reconLenParam;
1702  static const int hasPhase = (reconLen == 9 || reconLen == 13) ? 1 : 0;
1703  Float *gauge;
1704  const AllocInt offset;
1705 #ifdef USE_TEXTURE_OBJECTS
1706  typedef typename TexVectorType<real, N>::type TexVector;
1707  cudaTextureObject_t tex;
1708  const int tex_offset;
1709 #endif
1710  Float *ghost[4];
 1711  QudaGhostExchange ghostExchange;
 1712  int coords[QUDA_MAX_DIM];
 1713  int X[QUDA_MAX_DIM];
 1714  int R[QUDA_MAX_DIM];
 1715  const int volumeCB;
1716  int faceVolumeCB[4];
1717  const int stride;
1718  const int geometry;
1719  const AllocInt phaseOffset;
1720  void *backup_h;
1721  size_t bytes;
1722 
1723  FloatNOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0, bool override=false)
1724  : reconstruct(u), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
1725  offset(u.Bytes()/(2*sizeof(Float))),
1726 #ifdef USE_TEXTURE_OBJECTS
1727  tex(0), tex_offset(offset/N),
1728 #endif
1729  ghostExchange(u.GhostExchange()),
1730  volumeCB(u.VolumeCB()), stride(u.Stride()), geometry(u.Geometry()),
1731  phaseOffset(u.PhaseOffset()), backup_h(nullptr), bytes(u.Bytes())
1732  {
1733  if (geometry == QUDA_COARSE_GEOMETRY)
 1734  errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone)");
1735 
1736  // static_assert( !(stag_phase!=QUDA_STAGGERED_PHASE_NO && reconLenParam != 18 && reconLenParam != 12),
1737  // "staggered phase only presently supported for 18 and 12 reconstruct");
1738  for (int i = 0; i < 4; i++) {
1739  X[i] = u.X()[i];
1740  R[i] = u.R()[i];
1741  ghost[i] = ghost_ ? ghost_[i] : 0;
1742  faceVolumeCB[i] = u.SurfaceCB(i)*u.Nface(); // face volume equals surface * depth
1743  }
1744 #ifdef USE_TEXTURE_OBJECTS
1745  if (u.Location() == QUDA_CUDA_FIELD_LOCATION) tex = static_cast<const cudaGaugeField&>(u).Tex();
1746  if (!huge_alloc && this->gauge != u.Gauge_p() && !override) {
1747  errorQuda("Cannot use texture read since data pointer does not equal field pointer - use with huge_alloc=true instead");
1748  }
1749 #endif
1750  }
1751 
1752  FloatNOrder(const FloatNOrder &order)
1753  : reconstruct(order.reconstruct), gauge(order.gauge), offset(order.offset),
1754 #ifdef USE_TEXTURE_OBJECTS
1755  tex(order.tex), tex_offset(order.tex_offset),
1756 #endif
1757  ghostExchange(order.ghostExchange),
1758  volumeCB(order.volumeCB), stride(order.stride), geometry(order.geometry),
1759  phaseOffset(order.phaseOffset), backup_h(nullptr), bytes(order.bytes)
1760  {
1761  for (int i=0; i<4; i++) {
1762  X[i] = order.X[i];
1763  R[i] = order.R[i];
1764  ghost[i] = order.ghost[i];
1765  faceVolumeCB[i] = order.faceVolumeCB[i];
1766  }
1767  }
1768 
1769  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
1770  {
1771  const int M = reconLen / N;
1772  real tmp[reconLen];
1773 
1774 #pragma unroll
1775  for (int i=0; i<M; i++){
1776  // first do texture load from memory
1777 #if defined(USE_TEXTURE_OBJECTS) && defined(__CUDA_ARCH__)
1778  if (!huge_alloc) { // use textures unless we have a huge alloc
1779  TexVector vecTmp = tex1Dfetch_<TexVector>(tex, parity * tex_offset + (dir * M + i) * stride + x);
1780  // now insert into output array
1781 #pragma unroll
1782  for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<real *>(&vecTmp)[j]);
1783  } else
1784 #endif
1785  {
1786  // first load from memory
1787  Vector vecTmp = vector_load<Vector>(gauge + parity * offset, (dir * M + i) * stride + x);
1788  // second do copy converting into register type
1789 #pragma unroll
1790  for (int j=0; j<N; j++) copy(tmp[i*N+j], reinterpret_cast<Float*>(&vecTmp)[j]);
1791  }
1792  }
1793 
1794  real phase = 0.; // TODO - add texture support for phases
1795 
1796  if (hasPhase) {
1797  if (static_phase<stag_phase>() && (reconLen == 13 || use_inphase)) {
1798  phase = inphase;
1799  } else {
1800  copy(phase, (gauge + parity * offset)[phaseOffset / sizeof(Float) + stride * dir + x]);
1801  phase *= static_cast<real>(2.0) * static_cast<real>(M_PI);
1802  }
1803  }
1804 
1805  reconstruct.Unpack(v, tmp, x, dir, phase, X, R);
1806  }
1807 
1808  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
1809  {
1810  const int M = reconLen / N;
1811  real tmp[reconLen];
1812  reconstruct.Pack(tmp, v, x);
1813 
1814 #pragma unroll
1815  for (int i=0; i<M; i++){
1816  Vector vecTmp;
1817  // first do copy converting into storage type
1818 #pragma unroll
1819  for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], tmp[i*N+j]);
1820  // second do vectorized copy into memory
1821  vector_store(gauge + parity * offset, x + (dir * M + i) * stride, vecTmp);
1822  }
1823  if (hasPhase) {
1824  real phase = reconstruct.getPhase(v);
1825  copy((gauge + parity * offset)[phaseOffset / sizeof(Float) + dir * stride + x],
1826  static_cast<real>(phase / (2. * M_PI)));
1827  }
1828  }
1829 
1840  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity, real phase = 1.0)
1841  {
1842  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity, phase);
1843  }
1844 
1855  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity,
1856  real phase = 1.0) const
1857  {
1858  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity, phase);
1859  }
1860 
1861  __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
1862  {
1863  if (!ghost[dir]) { // load from main field not separate array
1864  load(v, volumeCB + x, dir, parity, inphase); // an offset of size volumeCB puts us at the padded region
1865  // This also works perfectly when phases are stored. No need to change this.
1866  } else {
1867  const int M = reconLen / N;
1868  real tmp[reconLen];
1869 
1870 #pragma unroll
1871  for (int i=0; i<M; i++) {
1872  // first do vectorized copy from memory into registers
1873  Vector vecTmp = vector_load<Vector>(
1874  ghost[dir] + parity * faceVolumeCB[dir] * (M * N + hasPhase), i * faceVolumeCB[dir] + x);
1875  // second do copy converting into register type
1876 #pragma unroll
1877  for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j]);
1878  }
1879  real phase = 0.;
1880 
1881  if (hasPhase) {
1882 
1883  // if(stag_phase == QUDA_STAGGERED_PHASE_MILC ) {
1884  // phase = inphase < static_cast<Float>(0) ? static_cast<Float>(-1./(2.*M_PI)) : static_cast<Float>(1./2.*M_PI);
1885  // } else {
1886  copy(phase, ghost[dir][parity * faceVolumeCB[dir] * (M * N + 1) + faceVolumeCB[dir] * M * N + x]);
1887  phase *= static_cast<real>(2.0) * static_cast<real>(M_PI);
1888  // }
1889  }
1890  reconstruct.Unpack(v, tmp, x, dir, phase, X, R);
1891  }
1892  }
1893 
1894  __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dir, int parity)
1895  {
1896  if (!ghost[dir]) { // store in main field not separate array
1897  save(v, volumeCB + x, dir, parity); // an offset of size volumeCB puts us at the padded region
1898  } else {
1899  const int M = reconLen / N;
1900  real tmp[reconLen];
1901  reconstruct.Pack(tmp, v, x);
1902 
1903 #pragma unroll
1904  for (int i=0; i<M; i++) {
1905  Vector vecTmp;
1906  // first do copy converting into storage type
1907 #pragma unroll
1908  for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], tmp[i*N+j]);
1909  // second do vectorized copy into memory
1910  vector_store(ghost[dir]+parity*faceVolumeCB[dir]*(M*N + hasPhase), i*faceVolumeCB[dir]+x, vecTmp);
1911  }
1912 
1913  if (hasPhase) {
1914  real phase = reconstruct.getPhase(v);
1915  copy(ghost[dir][parity * faceVolumeCB[dir] * (M * N + 1) + faceVolumeCB[dir] * M * N + x],
1916  static_cast<real>(phase / (2. * M_PI)));
1917  }
1918  }
1919  }
1920 
1931  __device__ __host__ inline gauge_ghost_wrapper<real, Accessor> Ghost(int dim, int ghost_idx, int parity,
1932  real phase = 1.0)
1933  {
1934  return gauge_ghost_wrapper<real, Accessor>(*this, dim, ghost_idx, parity, phase);
1935  }
1936 
1947  __device__ __host__ inline const gauge_ghost_wrapper<real, Accessor> Ghost(int dim, int ghost_idx, int parity,
1948  real phase = 1.0) const
1949  {
1950  return gauge_ghost_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, ghost_idx, parity, phase);
1951  }
1952 
1953  __device__ __host__ inline void loadGhostEx(complex v[length / 2], int buff_idx, int extended_idx, int dir,
1954  int dim, int g, int parity, const int R[]) const
1955  {
1956  const int M = reconLen / N;
1957  real tmp[reconLen];
1958 
1959 #pragma unroll
1960  for (int i=0; i<M; i++) {
1961  // first do vectorized copy from memory
1962  Vector vecTmp = vector_load<Vector>(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase),
1963  +i*R[dim]*faceVolumeCB[dim]+buff_idx);
1964  // second do copy converting into register type
1965 #pragma unroll
1966  for (int j=0; j<N; j++) copy(tmp[i*N+j], reinterpret_cast<Float*>(&vecTmp)[j]);
1967  }
1968  real phase = 0.;
1969  if (hasPhase)
1970  copy(phase,
1971  ghost[dim][((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + 1)
1972  + R[dim] * faceVolumeCB[dim] * M * N + buff_idx]);
1973 
1974  // use the extended_idx to determine the boundary condition
1975  reconstruct.Unpack(v, tmp, extended_idx, g, 2. * M_PI * phase, X, R);
1976  }
1977 
1978  __device__ __host__ inline void saveGhostEx(const complex v[length / 2], int buff_idx, int extended_idx, int dir,
1979  int dim, int g, int parity, const int R[])
1980  {
1981  const int M = reconLen / N;
1982  real tmp[reconLen];
1983  // use the extended_idx to determine the boundary condition
1984  reconstruct.Pack(tmp, v, extended_idx);
1985 
1986 #pragma unroll
1987  for (int i=0; i<M; i++) {
1988  Vector vecTmp;
1989  // first do copy converting into storage type
1990 #pragma unroll
1991  for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], tmp[i*N+j]);
1992  // second do vectorized copy to memory
1993  vector_store(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase),
1994  i*R[dim]*faceVolumeCB[dim]+buff_idx, vecTmp);
1995  }
1996  if (hasPhase) {
1997  real phase = reconstruct.getPhase(v);
1998  copy(ghost[dim][((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + 1)
1999  + R[dim] * faceVolumeCB[dim] * M * N + buff_idx],
2000  static_cast<real>(phase / (2. * M_PI)));
2001  }
2002  }
2003 
2007  void save() {
2008  if (backup_h) errorQuda("Already allocated host backup");
2009  backup_h = safe_malloc(bytes);
2010  cudaMemcpy(backup_h, gauge, bytes, cudaMemcpyDeviceToHost);
2011  checkCudaError();
2012  }
2013 
2017  void load() {
2018  cudaMemcpy(gauge, backup_h, bytes, cudaMemcpyHostToDevice);
2019  host_free(backup_h);
2020  backup_h = nullptr;
2021  checkCudaError();
2022  }
2023 
2024  size_t Bytes() const { return reconLen * sizeof(Float); }
2025  };
2026 
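A hedged usage sketch for the accessor above (the field u, the kernel name and the launch configuration are hypothetical, not part of this header): the accessor is constructed on the host from a GaugeField, passed to a kernel by value via its copy constructor, and load()/save() then unpack and repack one link per (site, direction, parity).

    // Sketch only: a single-precision, no-reconstruction (18-real) field in the native float2 layout,
    // i.e. Float = float, length = 18, N = 2, reconLenParam = 18.
    using Gauge = quda::gauge::FloatNOrder<float, 18, 2, 18>;

    __global__ void scale_links(Gauge g, int volumeCB, float factor)
    {
      int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
      int parity = blockIdx.y; // launched with gridDim.y = 2
      if (x_cb >= volumeCB) return;
      for (int dim = 0; dim < 4; dim++) {
        quda::complex<float> link[9];    // length/2 complex entries per link
        g.load(link, x_cb, dim, parity); // vectorized load + unpack
        for (int i = 0; i < 9; i++) link[i] *= factor;
        g.save(link, x_cb, dim, parity); // pack + vectorized store
      }
    }

    // Host side (u is a native, single-precision GaugeField):
    //   Gauge g(u);
    //   scale_links<<<dim3((u.VolumeCB() + 127) / 128, 2), 128>>>(g, u.VolumeCB(), 0.5f);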
2033  template <typename real, int length> struct S {
2034  real v[length];
2035  __host__ __device__ const real &operator[](int i) const { return v[i]; }
2036  __host__ __device__ real &operator[](int i) { return v[i]; }
2037  };
2038 
2043  template <typename Float, int length> struct LegacyOrder {
2045  using real = typename mapper<Float>::type;
2047  Float *ghost[QUDA_MAX_DIM];
2048  int faceVolumeCB[QUDA_MAX_DIM];
2049  const int volumeCB;
2050  const int stride;
2051  const int geometry;
2052  const int hasPhase;
2053 
2054  LegacyOrder(const GaugeField &u, Float **ghost_) :
2055  volumeCB(u.VolumeCB()),
2056  stride(u.Stride()),
2057  geometry(u.Geometry()),
2058  hasPhase(0)
2059  {
2060  if (geometry == QUDA_COARSE_GEOMETRY)
2061  errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone)");
2062 
2063  for (int i = 0; i < 4; i++) {
2064  ghost[i] = (ghost_) ? ghost_[i] : (Float *)(u.Ghost()[i]);
2065  faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth
2066  }
2067  }
2068 
2069  LegacyOrder(const LegacyOrder &order) :
2070  volumeCB(order.volumeCB),
2071  stride(order.stride),
2072  geometry(order.geometry),
2073  hasPhase(0)
2074  {
2075  for (int i = 0; i < 4; i++) {
2076  ghost[i] = order.ghost[i];
2077  faceVolumeCB[i] = order.faceVolumeCB[i];
2078  }
2079  }
2080 
2081  __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dir, int parity, real phase = 1.0) const
2082  {
2083 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2084  typedef S<Float, length> structure;
2085  trove::coalesced_ptr<structure> ghost_((structure *)ghost[dir]);
2086  structure v_ = ghost_[parity * faceVolumeCB[dir] + x];
2087 #else
2088  auto v_ = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length];
2089 #endif
2090  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2091  }
2092 
2093  __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dir, int parity)
2094  {
2095 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2096  typedef S<Float, length> structure;
2097  trove::coalesced_ptr<structure> ghost_((structure *)ghost[dir]);
2098  structure v_;
2099  for (int i = 0; i < length / 2; i++) {
2100  v_[2 * i + 0] = (Float)v[i].real();
2101  v_[2 * i + 1] = (Float)v[i].imag();
2102  }
2103  ghost_[parity * faceVolumeCB[dir] + x] = v_;
2104 #else
2105  auto v_ = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length];
2106  for (int i = 0; i < length / 2; i++) {
2107  v_[2 * i + 0] = (Float)v[i].real();
2108  v_[2 * i + 1] = (Float)v[i].imag();
2109  }
2110 #endif
2111  }
2112 
2123  __device__ __host__ inline gauge_ghost_wrapper<real, Accessor> Ghost(int dim, int ghost_idx, int parity,
2124  real phase = 1.0)
2125  {
2126  return gauge_ghost_wrapper<real, Accessor>(*this, dim, ghost_idx, parity, phase);
2127  }
2128 
2139  __device__ __host__ inline const gauge_ghost_wrapper<real, Accessor> Ghost(int dim, int ghost_idx, int parity,
2140  real phase = 1.0) const
2141  {
2142  return gauge_ghost_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, ghost_idx, parity, phase);
2143  }
2144 
2145  __device__ __host__ inline void loadGhostEx(complex v[length / 2], int x, int dummy, int dir, int dim, int g,
2146  int parity, const int R[]) const
2147  {
2148 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2149  typedef S<Float,length> structure;
2150  trove::coalesced_ptr<structure> ghost_((structure*)ghost[dim]);
2151  structure v_ = ghost_[((dir*2+parity)*R[dim]*faceVolumeCB[dim] + x)*geometry+g];
2152 #else
2153  auto v_ = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length];
2154 #endif
2155  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2156  }
2157 
2158  __device__ __host__ inline void saveGhostEx(const complex v[length / 2], int x, int dummy, int dir, int dim,
2159  int g, int parity, const int R[])
2160  {
2161 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2162  typedef S<Float, length> structure;
2163  trove::coalesced_ptr<structure> ghost_((structure *)ghost[dim]);
2164  structure v_;
2165  for (int i = 0; i < length / 2; i++) {
2166  v_[2 * i + 0] = (Float)v[i].real();
2167  v_[2 * i + 1] = (Float)v[i].imag();
2168  }
2169  ghost_[((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g] = v_;
2170 #else
2171  auto v_ = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length];
2172  for (int i = 0; i < length / 2; i++) {
2173  v_[2 * i + 0] = (Float)v[i].real();
2174  v_[2 * i + 1] = (Float)v[i].imag();
2175  }
2176 #endif
2177  }
2178  };
2179 
2184  template <typename Float, int length> struct QDPOrder : public LegacyOrder<Float,length> {
2186  using real = typename mapper<Float>::type;
2189  const int volumeCB;
2190  QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
2191  : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
2192  { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; }
2193  QDPOrder(const QDPOrder &order) : LegacyOrder<Float,length>(order), volumeCB(order.volumeCB) {
2194  for(int i=0; i<4; i++) gauge[i] = order.gauge[i];
2195  }
2196 
2197  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
2198  {
2199 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2200  typedef S<Float,length> structure;
2201  trove::coalesced_ptr<structure> gauge_((structure*)gauge[dir]);
2202  structure v_ = gauge_[parity*volumeCB + x];
2203 #else
2204  auto v_ = &gauge[dir][(parity * volumeCB + x) * length];
2205 #endif
2206  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2207  }
2208 
2209  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
2210  {
2211 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2212  typedef S<Float,length> structure;
2213  trove::coalesced_ptr<structure> gauge_((structure*)gauge[dir]);
2214  structure v_;
2215  for (int i = 0; i < length / 2; i++) {
2216  v_[2 * i + 0] = (Float)v[i].real();
2217  v_[2 * i + 1] = (Float)v[i].imag();
2218  }
2219  gauge_[parity * volumeCB + x] = v_;
2220 #else
2221  auto v_ = &gauge[dir][(parity * volumeCB + x) * length];
2222  for (int i = 0; i < length / 2; i++) {
2223  v_[2 * i + 0] = (Float)v[i].real();
2224  v_[2 * i + 1] = (Float)v[i].imag();
2225  }
2226 #endif
2227  }
2228 
2239  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2240  {
2241  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2242  }
2243 
2254  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2255  {
2256  return gauge_wrapper<real, QDPOrder<Float, length>>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2257  }
2258 
2259  size_t Bytes() const { return length * sizeof(Float); }
2260  };
2261 
2266  template <typename Float, int length> struct QDPJITOrder : public LegacyOrder<Float,length> {
2268  using real = typename mapper<Float>::type;
2271  const int volumeCB;
2272  QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
2273  : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
2274  { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; }
2275  QDPJITOrder(const QDPJITOrder &order) : LegacyOrder<Float,length>(order), volumeCB(order.volumeCB) {
2276  for(int i=0; i<4; i++) gauge[i] = order.gauge[i];
2277  }
2278 
2279  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
2280  {
2281  for (int i = 0; i < length / 2; i++) {
2282  v[i].real((real)gauge[dir][((0 * (length / 2) + i) * 2 + parity) * volumeCB + x]);
2283  v[i].imag((real)gauge[dir][((1 * (length / 2) + i) * 2 + parity) * volumeCB + x]);
2284  }
2285  }
2286 
2287  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
2288  {
2289  for (int i = 0; i < length / 2; i++) {
2290  gauge[dir][((0 * (length / 2) + i) * 2 + parity) * volumeCB + x] = v[i].real();
2291  gauge[dir][((1 * (length / 2) + i) * 2 + parity) * volumeCB + x] = v[i].imag();
2292  }
2293  }
2294 
2305  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2306  {
2307  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2308  }
2309 
2320  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2321  {
2322  return gauge_wrapper<real, QDPJITOrder<Float, length>>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2323  }
2324 
2325  size_t Bytes() const { return length * sizeof(Float); }
2326  };
2327 
2332  template <typename Float, int length> struct MILCOrder : public LegacyOrder<Float,length> {
2334  using real = typename mapper<Float>::type;
2336  Float *gauge;
2337  const int volumeCB;
2338  const int geometry;
2339  MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) :
2340  LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
2341  volumeCB(u.VolumeCB()), geometry(u.Geometry()) { ; }
2342  MILCOrder(const MILCOrder &order) : LegacyOrder<Float,length>(order),
2343  gauge(order.gauge), volumeCB(order.volumeCB), geometry(order.geometry)
2344  { ; }
2345 
2346  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
2347  {
2348 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2349  typedef S<Float,length> structure;
2350  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2351  structure v_ = gauge_[(parity*volumeCB+x)*geometry + dir];
2352 #else
2353  auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
2354 #endif
2355  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2356  }
2357 
2358  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
2359  {
2360 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2361  typedef S<Float,length> structure;
2362  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2363  structure v_;
2364  for (int i = 0; i < length / 2; i++) {
2365  v_[2 * i + 0] = v[i].real();
2366  v_[2 * i + 1] = v[i].imag();
2367  }
2368  gauge_[(parity*volumeCB+x)*geometry + dir] = v_;
2369 #else
2370  auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
2371  for (int i = 0; i < length / 2; i++) {
2372  v_[2 * i + 0] = v[i].real();
2373  v_[2 * i + 1] = v[i].imag();
2374  }
2375 #endif
2376  }
2377 
2388  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2389  {
2390  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2391  }
2392 
2403  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2404  {
2405  return gauge_wrapper<real, MILCOrder<Float, length>>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2406  }
2407 
2408  size_t Bytes() const { return length * sizeof(Float); }
2409  };
2410 
2426  template <typename Float, int length> struct MILCSiteOrder : public LegacyOrder<Float,length> {
2428  using real = typename mapper<Float>::type;
2430  Float *gauge;
2431  const int volumeCB;
2432  const int geometry;
2433  const size_t offset;
2434  const size_t size;
2435  MILCSiteOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2436  LegacyOrder<Float, length>(u, ghost_),
2437  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2438  volumeCB(u.VolumeCB()),
2439  geometry(u.Geometry()),
2440  offset(u.SiteOffset()),
2441  size(u.SiteSize())
2442  {
2443  if ((uintptr_t)((char *)gauge + offset) % 16 != 0) { errorQuda("MILC structure has misaligned offset"); }
2444  }
2445 
2447  LegacyOrder<Float, length>(order),
2448  gauge(order.gauge),
2449  volumeCB(order.volumeCB),
2450  geometry(order.geometry),
2451  offset(order.offset),
2452  size(order.size)
2453  {
2454  }
2455 
2456  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
2457  {
2458  // get base pointer
2459  const Float *gauge0 = reinterpret_cast<const Float*>(reinterpret_cast<const char*>(gauge) + (parity*volumeCB+x)*size + offset);
2460 
2461 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2462  typedef S<Float,length> structure;
2463  trove::coalesced_ptr<structure> gauge_((structure*)gauge0);
2464  structure v_ = gauge_[dir];
2465 #else
2466  auto v_ = &gauge0[dir * length];
2467 #endif
2468  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2469  }
2470 
2471  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
2472  {
2473  // get base pointer
2474  Float *gauge0 = reinterpret_cast<Float*>(reinterpret_cast<char*>(gauge) + (parity*volumeCB+x)*size + offset);
2475 
2476 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2477  typedef S<Float,length> structure;
2478  trove::coalesced_ptr<structure> gauge_((structure*)gauge0);
2479  structure v_;
2480  for (int i = 0; i < length / 2; i++) {
2481  v_[2 * i + 0] = v[i].real();
2482  v_[2 * i + 1] = v[i].imag();
2483  }
2484  gauge_[dir] = v_;
2485 #else
2486  for (int i = 0; i < length / 2; i++) {
2487  gauge0[dir * length + 2 * i + 0] = v[i].real();
2488  gauge0[dir * length + 2 * i + 1] = v[i].imag();
2489  }
2490 #endif
2491  }
2492 
2503  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2504  {
2505  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2506  }
2507 
2518  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2519  {
2520  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2521  }
2522 
2523  size_t Bytes() const { return length * sizeof(Float); }
2524  };
2525 
2526 
2531  template <typename Float, int length> struct CPSOrder : LegacyOrder<Float,length> {
2533  using real = typename mapper<Float>::type;
2535  Float *gauge;
2536  const int volumeCB;
2539  static constexpr int Nc = 3;
2540  const int geometry;
2541  CPSOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2542  LegacyOrder<Float, length>(u, ghost_),
2543  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2544  volumeCB(u.VolumeCB()),
2545  anisotropy(u.Anisotropy()),
2546  anisotropy_inv(1.0 / anisotropy),
2547  geometry(u.Geometry())
2548  {
2549  if (length != 18) errorQuda("Gauge length %d not supported", length);
2550  }
2551  CPSOrder(const CPSOrder &order) :
2552  LegacyOrder<Float, length>(order),
2553  gauge(order.gauge),
2554  volumeCB(order.volumeCB),
2555  anisotropy(order.anisotropy),
2556  anisotropy_inv(order.anisotropy_inv),
2557  geometry(order.geometry)
2558  {
2559  ;
2560  }
2561 
2562  // we need to transpose and scale for CPS ordering
2563  __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float inphase = 1.0) const
2564  {
2565 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2566  typedef S<Float,length> structure;
2567  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2568  structure v_ = gauge_[((parity*volumeCB+x)*geometry + dir)];
2569 #else
2570  auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
2571 #endif
2572  for (int i=0; i<Nc; i++) {
2573  for (int j=0; j<Nc; j++) {
2574  v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]) * anisotropy_inv;
2575  }
2576  }
2577  }
2578 
2579  __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
2580  {
2581 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2582  typedef S<Float,length> structure;
2583  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2584  structure v_;
2585  for (int i=0; i<Nc; i++)
2586  for (int j = 0; j < Nc; j++) {
2587  v_[(j * Nc + i) * 2 + 0] = anisotropy * v[i * Nc + j].real();
2588  v_[(j * Nc + i) * 2 + 1] = anisotropy * v[i * Nc + j].imag();
2589  }
2590  gauge_[((parity*volumeCB+x)*geometry + dir)] = v_;
2591 #else
2592  auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
2593  for (int i=0; i<Nc; i++) {
2594  for (int j=0; j<Nc; j++) {
2595  v_[(j * Nc + i) * 2 + 0] = anisotropy * v[i * Nc + j].real();
2596  v_[(j * Nc + i) * 2 + 1] = anisotropy * v[i * Nc + j].imag();
2597  }
2598  }
2599 #endif
2600  }
2601 
2612  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2613  {
2614  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2615  }
2616 
2627  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2628  {
2629  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2630  }
2631 
2632  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
2633  };
2634 
2642  template <typename Float, int length> struct BQCDOrder : LegacyOrder<Float,length> {
2644  using real = typename mapper<Float>::type;
2646  Float *gauge;
2647  const int volumeCB;
2648  int exVolumeCB; // extended checkerboard volume
2649  static constexpr int Nc = 3;
2650  BQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2651  LegacyOrder<Float, length>(u, ghost_),
2652  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2653  volumeCB(u.VolumeCB())
2654  {
2655  if (length != 18) errorQuda("Gauge length %d not supported", length);
2656  // compute volumeCB + halo region
2657  exVolumeCB = u.X()[0]/2 + 2;
2658  for (int i=1; i<4; i++) exVolumeCB *= u.X()[i] + 2;
2659  }
2660  BQCDOrder(const BQCDOrder &order) :
2661  LegacyOrder<Float, length>(order),
2662  gauge(order.gauge),
2663  volumeCB(order.volumeCB),
2664  exVolumeCB(order.exVolumeCB)
2665  {
2666  if (length != 18) errorQuda("Gauge length %d not supported", length);
2667  }
2668 
2669  // we need to transpose for BQCD ordering
2670  __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, real inphase = 1.0) const
2671  {
2672 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2673  typedef S<Float, length> structure;
2674  trove::coalesced_ptr<structure> gauge_((structure *)gauge);
2675  structure v_ = gauge_[(dir * 2 + parity) * exVolumeCB + x];
2676 #else
2677  auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + x) * length];
2678 #endif
2679  for (int i = 0; i < Nc; i++) {
2680  for (int j = 0; j < Nc; j++) { v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]); }
2681  }
2682  }
2683 
2684  __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
2685  {
2686 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2687  typedef S<Float,length> structure;
2688  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2689  structure v_;
2690  for (int i=0; i<Nc; i++)
2691  for (int j = 0; j < Nc; j++) {
2692  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real();
2693  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag();
2694  }
2695  gauge_[(dir * 2 + parity) * exVolumeCB + x] = v_;
2696 #else
2697  auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + x) * length];
2698  for (int i = 0; i < Nc; i++) {
2699  for (int j = 0; j < Nc; j++) {
2700  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real();
2701  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag();
2702  }
2703  }
2704 #endif
2705  }
2706 
2717  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2718  {
2719  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2720  }
2721 
2732  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2733  {
2734  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2735  }
2736 
2737  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
2738  };
2739 
2744  template <typename Float, int length> struct TIFROrder : LegacyOrder<Float,length> {
2746  using real = typename mapper<Float>::type;
2748  Float *gauge;
2749  const int volumeCB;
2750  static constexpr int Nc = 3;
2751  const real scale;
2753  TIFROrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2754  LegacyOrder<Float, length>(u, ghost_),
2755  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2756  volumeCB(u.VolumeCB()),
2757  scale(u.Scale()),
2758  scale_inv(1.0 / scale)
2759  {
2760  if (length != 18) errorQuda("Gauge length %d not supported", length);
2761  }
2762  TIFROrder(const TIFROrder &order) :
2763  LegacyOrder<Float, length>(order),
2764  gauge(order.gauge),
2765  volumeCB(order.volumeCB),
2766  scale(order.scale),
2767  scale_inv(1.0 / scale)
2768  {
2769  if (length != 18) errorQuda("Gauge length %d not supported", length);
2770  }
2771 
2772  // we need to transpose for TIFR ordering
2773  __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, real inphase = 1.0) const
2774  {
2775 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2776  typedef S<Float, length> structure;
2777  trove::coalesced_ptr<structure> gauge_((structure *)gauge);
2778  structure v_ = gauge_[(dir * 2 + parity) * volumeCB + x];
2779 #else
2780  auto v_ = &gauge[((dir * 2 + parity) * volumeCB + x) * length];
2781 #endif
2782  for (int i = 0; i < Nc; i++) {
2783  for (int j = 0; j < Nc; j++) {
2784  v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]) * scale_inv;
2785  }
2786  }
2787  }
2788 
2789  __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
2790  {
2791 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2792  typedef S<Float,length> structure;
2793  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2794  structure v_;
2795  for (int i=0; i<Nc; i++)
2796  for (int j = 0; j < Nc; j++) {
2797  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
2798  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
2799  }
2800  gauge_[(dir * 2 + parity) * volumeCB + x] = v_;
2801 #else
2802  auto v_ = &gauge[((dir * 2 + parity) * volumeCB + x) * length];
2803  for (int i = 0; i < Nc; i++) {
2804  for (int j = 0; j < Nc; j++) {
2805  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
2806  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
2807  }
2808  }
2809 #endif
2810  }
2811 
2822  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2823  {
2824  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2825  }
2826 
2837  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2838  {
2839  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2840  }
2841 
2842  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
2843  };
2844 
2849  template <typename Float, int length> struct TIFRPaddedOrder : LegacyOrder<Float,length> {
2851  using real = typename mapper<Float>::type;
2853  Float *gauge;
2854  const int volumeCB;
2856  static constexpr int Nc = 3;
2857  const real scale;
2859  const int dim[4];
2860  const int exDim[4];
2861  TIFRPaddedOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2862  LegacyOrder<Float, length>(u, ghost_),
2863  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2864  volumeCB(u.VolumeCB()),
2865  exVolumeCB(1),
2866  scale(u.Scale()),
2867  scale_inv(1.0 / scale),
2868  dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]},
2869  exDim {u.X()[0], u.X()[1], u.X()[2] + 4, u.X()[3]}
2870  {
2871  if (length != 18) errorQuda("Gauge length %d not supported", length);
2872 
2873  // exVolumeCB is the padded checkerboard volume
2874  for (int i=0; i<4; i++) exVolumeCB *= exDim[i];
2875  exVolumeCB /= 2;
2876  }
2877 
2879  LegacyOrder<Float, length>(order),
2880  gauge(order.gauge),
2881  volumeCB(order.volumeCB),
2882  exVolumeCB(order.exVolumeCB),
2883  scale(order.scale),
2884  scale_inv(order.scale_inv),
2885  dim {order.dim[0], order.dim[1], order.dim[2], order.dim[3]},
2886  exDim {order.exDim[0], order.exDim[1], order.exDim[2], order.exDim[3]}
2887  {
2888  if (length != 18) errorQuda("Gauge length %d not supported", length);
2889  }
2890 
2895  __device__ __host__ inline int getPaddedIndex(int x_cb, int parity) const {
2896  // find coordinates
2897  int coord[4];
2898  getCoords(coord, x_cb, dim, parity);
2899 
2900  // get z-extended index
2901  coord[2] += 2; // offset for halo
2902  return linkIndex(coord, exDim);
2903  }
2904 
2905  // we need to transpose for TIFR ordering
2906  __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, real inphase = 1.0) const
2907  {
2908  int y = getPaddedIndex(x, parity);
2909 
2910 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2911  typedef S<Float,length> structure;
2912  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2913  structure v_ = gauge_[(dir*2+parity)*exVolumeCB + y];
2914 #else
2915  auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + y) * length];
2916 #endif
2917  for (int i = 0; i < Nc; i++) {
2918  for (int j = 0; j < Nc; j++) {
2919  v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]) * scale_inv;
2920  }
2921  }
2922  }
2923 
2924  __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
2925  {
2926  int y = getPaddedIndex(x, parity);
2927 
2928 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2929  typedef S<Float,length> structure;
2930  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2931  structure v_;
2932  for (int i=0; i<Nc; i++)
2933  for (int j = 0; j < Nc; j++) {
2934  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
2935  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
2936  }
2937  gauge_[(dir * 2 + parity) * exVolumeCB + y] = v_;
2938 #else
2939  auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + y) * length];
2940  for (int i = 0; i < Nc; i++) {
2941  for (int j = 0; j < Nc; j++) {
2942  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
2943  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
2944  }
2945  }
2946 #endif
2947  }
2948 
2959  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2960  {
2961  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2962  }
2963 
2974  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2975  {
2976  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2977  }
2978 
2979  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
2980  };
2981 
2982  } // namespace gauge
2983 
2984  template <typename otherFloat, typename storeFloat>
2986  x = a.real();
2987  y = a.imag();
2988  }
2989 
2990  template <typename otherFloat, typename storeFloat>
2992  x = a.real();
2993  y = a.imag();
2994  }
2995 
2996  template <typename otherFloat, typename storeFloat>
2998  x = a.real();
2999  y = a.imag();
3000  }
3001 
3002  template <typename otherFloat, typename storeFloat>
3004  x = a.real();
3005  y = a.imag();
3006  }
3007 
3008  // Use traits to reduce the template explosion
3009  template <typename T, QudaReconstructType, int N = 18, QudaStaggeredPhase stag = QUDA_STAGGERED_PHASE_NO,
3010  bool huge_alloc = gauge::default_huge_alloc, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_INVALID,
3011  bool use_inphase = false>
3012  struct gauge_mapper {
3013  };
3014 
3015  // double precision
3016  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3017  struct gauge_mapper<double, QUDA_RECONSTRUCT_NO, N, stag, huge_alloc, ghostExchange, use_inphase> {
3019  };
3020  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3021  struct gauge_mapper<double, QUDA_RECONSTRUCT_13, N, stag, huge_alloc, ghostExchange, use_inphase> {
3023  };
3024  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3025  struct gauge_mapper<double, QUDA_RECONSTRUCT_12, N, stag, huge_alloc, ghostExchange, use_inphase> {
3027  };
3028  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3029  struct gauge_mapper<double, QUDA_RECONSTRUCT_10, N, stag, huge_alloc, ghostExchange, use_inphase> {
3031  };
3032  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3033  struct gauge_mapper<double, QUDA_RECONSTRUCT_9, N, stag, huge_alloc, ghostExchange, use_inphase> {
3035  };
3036  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3037  struct gauge_mapper<double, QUDA_RECONSTRUCT_8, N, stag, huge_alloc, ghostExchange, use_inphase> {
3039  };
3040 
3041  // single precision
3042  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3043  struct gauge_mapper<float, QUDA_RECONSTRUCT_NO, N, stag, huge_alloc, ghostExchange, use_inphase> {
3045  };
3046  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3047  struct gauge_mapper<float, QUDA_RECONSTRUCT_13, N, stag, huge_alloc, ghostExchange, use_inphase> {
3049  };
3050  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3051  struct gauge_mapper<float, QUDA_RECONSTRUCT_12, N, stag, huge_alloc, ghostExchange, use_inphase> {
3053  };
3054  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3055  struct gauge_mapper<float, QUDA_RECONSTRUCT_10, N, stag, huge_alloc, ghostExchange, use_inphase> {
3057  };
3058  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3059  struct gauge_mapper<float, QUDA_RECONSTRUCT_9, N, stag, huge_alloc, ghostExchange, use_inphase> {
3061  };
3062  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3063  struct gauge_mapper<float, QUDA_RECONSTRUCT_8, N, stag, huge_alloc, ghostExchange, use_inphase> {
3065  };
3066 
3067  // half precision
3068  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3069  struct gauge_mapper<short, QUDA_RECONSTRUCT_NO, N, stag, huge_alloc, ghostExchange, use_inphase> {
3071  };
3072  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3073  struct gauge_mapper<short, QUDA_RECONSTRUCT_13, N, stag, huge_alloc, ghostExchange, use_inphase> {
3075  };
3076  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3077  struct gauge_mapper<short, QUDA_RECONSTRUCT_12, N, stag, huge_alloc, ghostExchange, use_inphase> {
3079  };
3080  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3081  struct gauge_mapper<short, QUDA_RECONSTRUCT_10, N, stag, huge_alloc, ghostExchange, use_inphase> {
3083  };
3084  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3085  struct gauge_mapper<short, QUDA_RECONSTRUCT_9, N, stag, huge_alloc, ghostExchange, use_inphase> {
3087  };
3088  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3089  struct gauge_mapper<short, QUDA_RECONSTRUCT_8, N, stag, huge_alloc, ghostExchange, use_inphase> {
3091  };
3092 
3093  // quarter precision
3094  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3095  struct gauge_mapper<char, QUDA_RECONSTRUCT_NO, N, stag, huge_alloc, ghostExchange, use_inphase> {
3097  };
3098  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3099  struct gauge_mapper<char, QUDA_RECONSTRUCT_13, N, stag, huge_alloc, ghostExchange, use_inphase> {
3101  };
3102  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3103  struct gauge_mapper<char, QUDA_RECONSTRUCT_12, N, stag, huge_alloc, ghostExchange, use_inphase> {
3105  };
3106  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3107  struct gauge_mapper<char, QUDA_RECONSTRUCT_10, N, stag, huge_alloc, ghostExchange, use_inphase> {
3109  };
3110  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3111  struct gauge_mapper<char, QUDA_RECONSTRUCT_9, N, stag, huge_alloc, ghostExchange, use_inphase> {
3113  };
3114  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3115  struct gauge_mapper<char, QUDA_RECONSTRUCT_8, N, stag, huge_alloc, ghostExchange, use_inphase> {
3117  };
3118 
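A brief sketch of how these traits are typically consumed (the wrapper function and its arguments are hypothetical): the storage type and reconstruction are compile-time template parameters, and gauge_mapper resolves them to the matching FloatNOrder instantiation.

    // Sketch only, assuming we are inside (or fully qualify) the quda namespace.
    template <typename Float, QudaReconstructType recon> void apply(const GaugeField &u)
    {
      using G = typename quda::gauge_mapper<Float, recon>::type;
      G g(u); // wraps u's data pointer with the selected layout and reconstruction
      // ... pass g to a kernel, as in the FloatNOrder sketch above ...
    }
    // e.g. apply<double, QUDA_RECONSTRUCT_12>(u); or apply<short, QUDA_RECONSTRUCT_8>(u);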
3119  template<typename T, QudaGaugeFieldOrder order, int Nc> struct gauge_order_mapper { };
3120  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_QDP_GAUGE_ORDER,Nc> { typedef gauge::QDPOrder<T, 2*Nc*Nc> type; };
3121  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_QDPJIT_GAUGE_ORDER,Nc> { typedef gauge::QDPJITOrder<T, 2*Nc*Nc> type; };
3122  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_MILC_GAUGE_ORDER,Nc> { typedef gauge::MILCOrder<T, 2*Nc*Nc> type; };
3123  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_BQCD_GAUGE_ORDER,Nc> { typedef gauge::BQCDOrder<T, 2*Nc*Nc> type; };
3124  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_TIFR_GAUGE_ORDER,Nc> { typedef gauge::TIFROrder<T, 2*Nc*Nc> type; };
3125  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_TIFR_PADDED_GAUGE_ORDER,Nc> { typedef gauge::TIFRPaddedOrder<T, 2*Nc*Nc> type; };
3126  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_FLOAT2_GAUGE_ORDER,Nc> { typedef gauge::FloatNOrder<T, 2*Nc*Nc, 2, 2*Nc*Nc> type; };
3127 
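Similarly, gauge_order_mapper maps a field-order enumerator (fixed at compile time) to the corresponding accessor; a minimal illustration, assuming <type_traits> is available:

    // MILC-ordered double-precision SU(3) links resolve to gauge::MILCOrder<double, 2*Nc*Nc>.
    using MilcAccessor = typename quda::gauge_order_mapper<double, QUDA_MILC_GAUGE_ORDER, 3>::type;
    static_assert(std::is_same<MilcAccessor, quda::gauge::MILCOrder<double, 18>>::value, "2*Nc*Nc = 18");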
3128  // experiments in reducing template instantiation boilerplate
3129  // can this be replaced with a C++11 variant that uses variadic templates?
3130 
3131 #define INSTANTIATE_RECONSTRUCT(func, g, ...) \
3132  { \
3133  if (!data.isNative()) \
3134  errorQuda("Field order %d and precision %d is not native", g.Order(), g.Precision()); \
3135  if( g.Reconstruct() == QUDA_RECONSTRUCT_NO) { \
3136  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge; \
3137  func(Gauge(g), g, __VA_ARGS__); \
3138  } else if( g.Reconstruct() == QUDA_RECONSTRUCT_13){ \
3139  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_13>::type Gauge; \
3140  func(Gauge(g), g, __VA_ARGS__); \
3141  } else if( g.Reconstruct() == QUDA_RECONSTRUCT_12){ \
3142  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge; \
3143  func(Gauge(g), g, __VA_ARGS__); \
3144  } else if( g.Reconstruct() == QUDA_RECONSTRUCT_9){ \
3145  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_9>::type Gauge; \
3146  func(Gauge(g), g, __VA_ARGS__); \
3147  } else if( g.Reconstruct() == QUDA_RECONSTRUCT_8){ \
3148  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge; \
3149  func(Gauge(g), g, __VA_ARGS__); \
3150  } else { \
3151  errorQuda("Reconstruction type %d of gauge field not supported", g.Reconstruct()); \
3152  } \
3153  }
3154 
3155 #define INSTANTIATE_PRECISION(func, lat, ...) \
3156  { \
3157  if (lat.Precision() == QUDA_DOUBLE_PRECISION) { \
3158  func<double>(lat, __VA_ARGS__); \
3159  } else if(lat.Precision() == QUDA_SINGLE_PRECISION) { \
3160  func<float>(lat, __VA_ARGS__); \
3161  } else { \
3162  errorQuda("Precision %d not supported", lat.Precision()); \
3163  } \
3164  }
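A hedged sketch of how the two dispatch macros compose (the computePlaq functions are illustrative names, not QUDA API): INSTANTIATE_PRECISION switches on the field's precision and calls a function templated on Float, whose body uses INSTANTIATE_RECONSTRUCT to switch on the reconstruction and hand the concrete accessor to the algorithm. Note that INSTANTIATE_RECONSTRUCT references a variable literally named data (data.isNative()), so the enclosing function's GaugeField parameter must carry that name.

    // Sketch only, assuming the quda namespace and a hypothetical plaquette driver.
    template <typename Gauge> void computePlaq(Gauge gauge, const GaugeField &g, double *plaq)
    {
      /* launch the kernel that consumes the accessor */
    }

    template <typename Float> void computePlaq(const GaugeField &data, double *plaq)
    {
      INSTANTIATE_RECONSTRUCT(computePlaq, data, plaq); // if/else chain over data.Reconstruct()
    }

    void computePlaq(const GaugeField &data, double *plaq)
    {
      INSTANTIATE_PRECISION(computePlaq, data, plaq); // calls computePlaq<double> or computePlaq<float>
    }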
3165 
3166 } // namespace quda
3167 
3168 #endif // _GAUGE_ORDER_H
__device__ __host__ void Unpack(complex out[9], const real in[10], int idx, int dir, real phase, const I *X, const int *R) const
gauge::FloatNOrder< double, N, 2, 9, stag, huge_alloc, ghostExchange, use_inphase > type
struct to define TIFR ordered gauge fields: [mu][parity][volumecb][col][row]
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__host__ __device__ constexpr int Ncolor(int length)
Return the number of colors of the accessor based on the length of the field.
__device__ __host__ complex< Float > operator-() const
negation operator
__host__ __device__ constexpr bool fixed_point< float, short >()
__device__ __host__ gauge_ghost_wrapper< real, Accessor > Ghost(int dim, int ghost_idx, int parity, real phase=1.0)
This accessor routine returns a gauge_ghost_wrapper to this object, allowing us to overload various o...
gauge_wrapper is an internal class that is used to wrap instances of gauge accessors, currying in a specific location on the field. The operator() accessors in gauge-field accessors return instances to this class, allowing us to then use operator overloading upon this class to interact with the Matrix class. As a result we can include gauge-field accessors directly in Matrix expressions in kernels without having to declare temporaries with explicit calls to the load/save methods in the gauge-field accessors.
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
Accessor(const Accessor< Float, nColor, QUDA_MILC_GAUGE_ORDER, storeFloat, use_tex > &a)
complex< Float > dummy
Float * ghost[QUDA_MAX_DIM]
gauge::FloatNOrder< short, N, 4, 9, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void Unpack(complex out[9], const real in[12], int idx, int dir, real phase, const I *X, const int *R) const
QDPJITOrder(const QDPJITOrder &order)
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
AllocType< huge_alloc >::type AllocInt
static __device__ __host__ int linkIndex(const int x[], const I X[4])
Reconstruct(const Reconstruct< N, Float, ghostExchange_ > &recon)
Reconstruct(const GaugeField &u, real scale=1.0)
TIFROrder(const TIFROrder &order)
MILCSiteOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__host__ __device__ ReduceType operator()(const quda::complex< short > &x)
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
gauge::FloatNOrder< char, N, 2, 11, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void operator=(const M &a)
Assignment operator with Matrix instance as input.
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
constexpr bool default_huge_alloc
Reconstruct(const Reconstruct< 9, Float, ghostExchange_, stag_phase > &recon)
__device__ __host__ fieldorder_wrapper(complex< storeFloat > *v, int idx, Float scale, Float scale_inv)
fieldorder_wrapper constructor
__device__ __host__ void Pack(real out[12], const complex in[9], int idx) const
fieldorder_wrapper is an internal class that is used to wrap instances of FieldOrder accessors...
Definition: complex_quda.h:32
__device__ __host__ void operator=(const M &a)
Assignment operator with Matrix instance as input.
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
gauge::FloatNOrder< float, N, 2, 11, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void save(const complex v[9], int x, int dir, int parity)
__host__ __device__ constexpr int ct_sqrt(int n, int i=1)
#define errorQuda(...)
Definition: util_quda.h:121
gauge::FloatNOrder< double, N, 2, 12, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void save(const complex v[9], int x, int dir, int parity)
GhostAccessor(const GhostAccessor< Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, native_ghost, storeFloat, use_tex > &a)
__device__ __host__ void load(complex v[9], int x, int dir, int parity, real inphase=1.0) const
void load()
Restore the field from the host after tuning.
Gauge reconstruct 12 helper where we reconstruct the third row from the cross product of the first tw...
#define host_free(ptr)
Definition: malloc_quda.h:71
typename mapper< Float >::type real
__host__ double abs_max(int dim=-1, bool global=true) const
Returns the Linfinity norm of the field in a given dimension.
__host__ __device__ complex< real > cmac(const complex< real > &x, const complex< real > &y, const complex< real > &z)
int comm_dim(int dim)
gauge::FloatNOrder< short, N, 2, 11, stag, huge_alloc, ghostExchange, use_inphase > type
gauge::FloatNOrder< double, N, 2, 11, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void load(complex v[9], int x, int dir, int parity, Float inphase=1.0) const
static std::map< void *, MemAlloc > alloc[N_ALLOC_TYPE]
Definition: malloc.cpp:53
__device__ __host__ T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM], T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound, bool isFirstTimeSlice, bool isLastTimeSlice, QudaGhostExchange ghostExchange=QUDA_GHOST_EXCHANGE_NO)
timeBoundary Compute boundary condition correction
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
square_(ReduceType scale)
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
double Scale() const
__host__ __device__ char real() const volatile
Definition: complex_quda.h:739
gauge::FloatNOrder< float, N, 4, 12, stag, huge_alloc, ghostExchange, use_inphase > type
Gauge reconstruct helper for Momentum field with 10 packed elements (really 9 from the Lie algebra...
gauge::FloatNOrder< float, N, 4, 13, stag, huge_alloc, ghostExchange, use_inphase > type
int comm_coord(int dim)
__host__ __device__ complex()
Definition: complex_quda.h:585
Reconstruct(const GaugeField &u)
typename mapper< Float >::type real
int_fastdiv X[QUDA_MAX_DIM]
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
__host__ __device__ void copy(T1 &a, const T2 &b)
LegacyOrder(const LegacyOrder &order)
Reconstruct< reconLenParam, Float, ghostExchange_, stag_phase > reconstruct
static int R[4]
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
double anisotropy
Definition: test_util.cpp:1650
const int * SurfaceCB() const
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex< theirFloat > &val) const
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:258
__device__ __host__ void operator-=(const complex< theirFloat > &a)
Operator-= with complex number instance as input.
Accessor< Float, nColor, order, storeFloat, use_tex > accessor
GhostAccessor(const GhostAccessor< Float, nColor, QUDA_MILC_GAUGE_ORDER, native_ghost, storeFloat, use_tex > &a)
int length[]
Accessor< Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat, use_tex > accessor
__device__ __host__ void operator=(const fieldorder_wrapper< Float, storeFloat > &a)
Assignment operator with fieldorder_wrapper instance as input.
gauge::FloatNOrder< double, N, 2, 13, stag, huge_alloc, ghostExchange, use_inphase > type
__host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const
__device__ __host__ void loadGhost(complex v[length/2], int x, int dir, int parity, real phase=1.0) const
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R, const complex scale=complex(static_cast< real >(1.0), static_cast< real >(1.0))) const
QDPOrder(const QDPOrder &order)
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
Accessor(const GaugeField &, void *gauge_=0, void **ghost_=0)
gauge::FloatNOrder< float, N, 2, N, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
GhostAccessor(const GhostAccessor< Float, nColor, QUDA_QDP_GAUGE_ORDER, native_ghost, storeFloat, use_tex > &a)
__device__ __host__ void operator=(const Matrix< U, N > &b)
Definition: quda_matrix.h:121
This is just a dummy structure we use for trove to define the required structure size.
__host__ __device__ const real & operator[](int i) const
const Reconstruct< 8, Float, ghostExchange_ > reconstruct_8
int Nface() const
Definition: gauge_field.h:281
Reconstruct(const Reconstruct< 12, Float, ghostExchange_ > &recon)
__device__ __host__ void operator+=(const complex< theirFloat > &a)
Operator+= with complex number instance as input.
__host__ __device__ constexpr bool match< short, short >()
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x_cb, int row, int col)
struct to define gauge fields packed into an opaque MILC site struct:
QudaGhostExchange ghostExchange
static __device__ double2 atomicAdd(double2 *addr, double2 val)
Implementation of double2 atomic addition using two double-precision additions.
Definition: atomic.cuh:51
FieldOrder(GaugeField &U, void *gauge_=0, void **ghost_=0)
int Ncolor() const
Definition: gauge_field.h:249
const int * R() const
BQCDOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
MILCOrder(const MILCOrder &order)
__device__ __host__ void saveGhost(const complex v[length/2], int x, int dir, int parity)
gauge::FloatNOrder< double, N, 2, 8, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ void Pack(real out[8], const complex in[9], int idx) const
gauge::FloatNOrder< char, N, 4, 8, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ int Ndim() const
void comm_allreduce_min(double *data)
Definition: comm_mpi.cpp:265
__host__ double abs_min(int dim=-1, bool global=true) const
Returns the minimum absolute value of the field.
QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ int NcolorCoarse() const
__device__ __host__ Float real() const
enum QudaStaggeredPhase_s QudaStaggeredPhase
gauge::FloatNOrder< short, N, 4, 13, stag, huge_alloc, ghostExchange, use_inphase > type
const int nColor
Definition: covdev_test.cpp:75
void resetScale(Float dummy)
MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
gauge::FloatNOrder< char, N, 4, 9, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void Pack(real out[10], const complex in[9], int idx) const
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
__device__ __host__ void Pack(real out[12], const complex in[9], int idx) const
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex< theirFloat > &val) const
__device__ __host__ void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R) const
__host__ __device__ int imag() const volatile
Definition: complex_quda.h:833
struct to define BQCD ordered gauge fields:
__host__ __device__ Float operator()(const quda::complex< char > &x)
__host__ __device__ int real() const volatile
Definition: complex_quda.h:832
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
cpuColorSpinorField * in
enum QudaGhostExchange_s QudaGhostExchange
__device__ __host__ void vector_store(void *ptr, int idx, const VectorType &value)
Generic reconstruction helper with no reconstruction.
__host__ __device__ bool static_phase()
MILCSiteOrder(const MILCSiteOrder &order)
__host__ __device__ constexpr bool fixed_point< float, char >()
__device__ __host__ void loadGhostEx(complex v[length/2], int buff_idx, int extended_idx, int dir, int dim, int g, int parity, const int R[]) const
gauge::FloatNOrder< double, N, 2, N, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ real getPhase(const complex in[9])
__device__ __host__ void save(const complex v[9], int x, int dir, int parity)
__device__ __host__ void saveGhostEx(const complex v[length/2], int buff_idx, int extended_idx, int dir, int dim, int g, int parity, const int R[])
void save()
Back up the field to the host when tuning.
__host__ __device__ short imag() const volatile
Definition: complex_quda.h:787
gauge_ghost_wrapper is an internal class that is used to wrap instances of gauge ghost accessors...
gauge::FloatNOrder< short, N, 2, N, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ real getPhase(const complex in[9])
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
typename mapper< Float >::type real
__host__ __device__ constexpr bool match< int, int >()
__host__ __device__ real & operator[](int i)
gauge::FloatNOrder< char, N, 4, 12, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ fieldorder_wrapper< Float, storeFloat > Ghost(int d, int parity, int x, int row, int col)
__device__ __host__ Float imag() const
CPSOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
Gauge reconstruct 13 helper where we reconstruct the third row from the cross product of the first tw...
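The truncated brief refers to the standard SU(3) trick of storing only the first two rows; a sketch of how the third row is rebuilt (illustrative, row-major 3x3 storage assumed; the actual code is the Unpack method of the Reconstruct<12/13> structs in this file):
  // out[0..5] hold the first two rows; rebuild row 2 as the conjugated cross product.
  for (int i = 0; i < 3; i++) {
    const int j = (i + 1) % 3, k = (i + 2) % 3;
    out[6 + i] = conj(out[j] * out[3 + k] - out[k] * out[3 + j]);
    // for reconstruct-13 a per-site U(1) phase is additionally applied to the result
  }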
__device__ __host__ real getPhase(const complex in[N/2]) const
Accessor(const Accessor< Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat, use_tex > &a)
const void ** Ghost() const
Definition: gauge_field.h:323
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
__device__ __host__ real getPhase(const complex in[9]) const
Float * gauge[QUDA_MAX_DIM]
Provides precision abstractions and defines the register precision given the storage precision using ...
__device__ __host__ real getPhase(const complex in[9])
int X[4]
Definition: covdev_test.cpp:70
GhostAccessor< Float, nColor, order, native_ghost, storeFloat, use_tex > ghostAccessor
__device__ __host__ void loadGhostEx(complex v[length/2], int x, int dummy, int dir, int dim, int g, int parity, const int R[]) const
__host__ __device__ constexpr bool fixed_point< float, int >()
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
TIFRPaddedOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex< theirFloat > &val) const
__device__ __host__ real getPhase(const complex in[9]) const
void init()
Create the cuBLAS context.
Definition: blas_cublas.cu:31
__device__ __host__ const gauge_ghost_wrapper< real, Accessor > Ghost(int dim, int ghost_idx, int parity, real phase=1.0) const
This accessor routine returns a const gauge_ghost_wrapper to this object, allowing us to overload var...
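As with the interior accessor, a usage sketch (illustrative names; arg.U is assumed to be an accessor providing Ghost()) shows the intent:
  // Load a halo link from the ghost buffer; the Matrix constructor invokes loadGhost() through the wrapper.
  Matrix<complex<real>, nColor> L = arg.U.Ghost(dim, ghost_idx, 1 - parity);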
__host__ __device__ short real() const volatile
Definition: complex_quda.h:786
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int dim, int parity, int x_cb, int row, int col)
gauge::FloatNOrder< short, N, 4, 8, stag, huge_alloc, ghostExchange, use_inphase > type
#define safe_malloc(size)
Definition: malloc_quda.h:66
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int row, int col) const
__host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const
__host__ __device__ Float operator()(const quda::complex< storeFloat > &x)
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity, real phase=1.0) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__host__ __device__ constexpr bool match()
__device__ __host__ void save(const complex v[9], int x, int dir, int parity)
__host__ double norm1(int dim=-1, bool global=true) const
Returns the L1 norm of the field in a given dimension.
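A hedged usage sketch for these host-side reductions (norm2 and abs_min also appear in this index), assuming F is a FieldOrder accessor constructed from a GaugeField:
  double l1   = F.norm1();     // L1 norm over all dimensions, reduced across ranks
  double l2   = F.norm2(3);    // L2 norm squared of the dim=3 links only
  double amin = F.abs_min();   // minimum absolute value of any element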
gauge::FloatNOrder< float, N, 4, 9, stag, huge_alloc, ghostExchange, use_inphase > type
__host__ __device__ Float operator()(const quda::complex< int > &x)
__host__ __device__ complex< real > cmul(const complex< real > &x, const complex< real > &y)
QudaFieldLocation Location() const
LegacyOrder(const GaugeField &u, Float **ghost_)
The LegacyOrder defines the ghost-zone storage and ordering used by all cpuGaugeFields, which share the same ghost-zone layout.
gauge::FloatNOrder< short, N, 4, 12, stag, huge_alloc, ghostExchange, use_inphase > type
FloatNOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0, bool override=false)
__device__ __host__ complex< Float > Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const
__host__ __device__ ReduceType operator()(const quda::complex< int > &x)
static int index(int ndim, const int *dims, const int *x)
Definition: comm_common.cpp:32
__device__ __host__ void Pack(real out[N], const complex in[N/2], int idx) const
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ const complex< Float > operator()(int dim, int parity, int x_cb, int row, int col) const
Float * gauge[QUDA_MAX_DIM]
__device__ __host__ void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R, const complex scale, const complex u) const
size_t bytes
host memory for backing up the field when tuning
Reconstruct(const Reconstruct< 8, Float, ghostExchange_ > &recon)
enum QudaFieldLocation_s QudaFieldLocation
__host__ __device__ volatile complex< float > & operator=(const complex< T > z) volatile
Definition: complex_quda.h:491
int faceVolumeCB[QUDA_MAX_DIM]
cpuColorSpinorField * out
BQCDOrder(const BQCDOrder &order)
const Reconstruct< 12, Float, ghostExchange_ > reconstruct_12
VectorType< Float, N >::type Vector
__host__ __device__ constexpr bool fixed_point()
__host__ __device__ char imag() const volatile
Definition: complex_quda.h:740
typename mapper< Float >::type real
enum QudaReconstructType_s QudaReconstructType
Reconstruct(const Reconstruct< 11, Float, ghostExchange_ > &recon)
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator+(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor addition operator.
__device__ __host__ void Unpack(complex out[N/2], const real in[N], int idx, int dir, real phase, const I *X, const int *R) const
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity, real phase=1.0)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
__device__ __host__ gauge_ghost_wrapper< real, Accessor > Ghost(int dim, int ghost_idx, int parity, real phase=1.0)
This accessor routine returns a gauge_ghost_wrapper to this object, allowing us to overload various o...
__device__ __host__ fieldorder_wrapper< Float, storeFloat > Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col)
Gauge reconstruct 9 helper where we reconstruct the gauge matrix from 8 packed elements (maximal comp...
const int_fastdiv geometry
#define QUDA_MAX_GEOMETRY
Maximum geometry supported by a field. This is essentially the maximum number of dimensions supported...
__device__ __host__ const complex< Float > operator()(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const
TIFROrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
typename mapper< Float >::type real
Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0, bool override=false)
__device__ __host__ int Volume() const
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__device__ __host__ complex< Float > Ghost(int d, int parity, int x, int row, int col) const
__device__ __host__ Matrix()
Definition: quda_matrix.h:76
__device__ __host__ int Geometry() const
void resetScale(double max)
__device__ __host__ void Unpack(complex out[9], const real in[12], int idx, int dir, real phase, const I *X, const int *R) const
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
__host__ double norm2(int dim=-1, bool global=true) const
Returns the L2 norm squared of the field in a given dimension.
TIFRPaddedOrder(const TIFRPaddedOrder &order)
QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ void load(complex v[9], int x, int dir, int parity, real inphase=1.0) const
__device__ __host__ int VolumeCB() const
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:250
Accessor(const Accessor< Float, nColor, QUDA_QDP_GAUGE_ORDER, storeFloat, use_tex > &a)
__host__ __device__ ValueType abs(ValueType x)
Definition: complex_quda.h:125
__host__ __device__ ReduceType operator()(const quda::complex< char > &x)
__device__ __host__ void saveGhost(const complex v[length/2], int x, int dir, int parity)
__device__ __host__ void load(complex v[9], int x, int dir, int parity, real inphase=1.0) const
GhostAccessor(const GaugeField &, void *gauge_=0, void **ghost_=0)
__device__ __host__ void operator=(const complex< theirFloat > &a)
Assignment operator with complex number instance as input.
__device__ __host__ Float milcStaggeredPhase(int dim, const int x[], const I R[])
virtual void * Gauge_p()
Definition: gauge_field.h:315
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator*(const S &a, const ColorSpinor< Float, Nc, Ns > &x)
Compute the scalar-vector product y = a * x.
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
static int volumeCB
Definition: face_gauge.cpp:43
__device__ __host__ int Ncolor() const
#define checkCudaError()
Definition: util_quda.h:161
CPSOrder(const CPSOrder &order)
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
void comm_allreduce(double *data)
Definition: comm_mpi.cpp:242
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
Definition: quda_matrix.h:422
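For reference, the 3x3 determinant this helper returns is the usual cofactor expansion (sketch only; the implementation lives in quda_matrix.h, and a(i,j) denotes element access):
  T det = a(0,0) * (a(1,1) * a(2,2) - a(1,2) * a(2,1))
        - a(0,1) * (a(1,0) * a(2,2) - a(1,2) * a(2,0))
        + a(0,2) * (a(1,0) * a(2,1) - a(1,1) * a(2,0));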
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
gauge::FloatNOrder< char, N, 2, N, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb)
void comm_allreduce_max(double *data)
Definition: comm_mpi.cpp:258
gauge::FloatNOrder< T, 2 *Nc *Nc, 2, 2 *Nc *Nc > type
static constexpr bool fixedPoint()
__device__ __host__ const complex< Float > operator()(int d, int parity, int x_cb, int row, int col) const
__host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const
__host__ __device__ Float operator()(const quda::complex< short > &x)
__device__ __host__ int getPaddedIndex(int x_cb, int parity) const
Compute the index into the padded field. Assumes that parity doesn't change from unpadded to padded...
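A generic sketch of such a remap (illustrative only; dims and exDim are assumed arrays of unpadded and padded dimensions, and the real TIFRPaddedOrder::getPaddedIndex encodes its own padding convention):
  int coord[4];
  getCoords(coord, x_cb, dims, parity);                  // unpack x_cb + parity into 4-d coordinates (helper assumed)
  int idx = ((coord[3] * exDim[2] + coord[2]) * exDim[1]
              + coord[1]) * exDim[0] + coord[0];         // re-linearize into the padded volume, x running fastest
  return idx / 2;                                        // back to a checkerboard index; parity is unchanged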
abs_(const Float scale)
__device__ __host__ void saveGhostEx(const complex v[length/2], int x, int dummy, int dir, int dim, int g, int parity, const int R[])
__device__ __host__ void Pack(real out[8], const complex in[9], int idx) const
FieldOrder(const FieldOrder &o)
__host__ __device__ volatile complex< double > & operator=(const complex< T > z) volatile
Definition: complex_quda.h:613
Reconstruct(const Reconstruct< 13, Float, ghostExchange_, stag_phase > &recon)
__host__ __device__ ReduceType operator()(const quda::complex< Float > &x)
__device__ __host__ int NspinCoarse() const
gauge::FloatNOrder< char, N, 4, 13, stag, huge_alloc, ghostExchange, use_inphase > type
FloatNOrder(const FloatNOrder &order)
__host__ __device__ complex()
Definition: complex_quda.h:463
const QudaFieldLocation location
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates for the current index.
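Typical use in a kernel body, mirroring the signature above (sketch; Arg is whatever argument struct the kernel carries):
  int coord[4];
  int dim = -1;                                             // passed by reference, per the declaration above
  const int x_cb = getCoords(coord, arg, idx, parity, dim); // fills coord[] from the 1-d thread index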
const int * X() const
__device__ __host__ void atomicAdd(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col, const complex< theirFloat > &val)
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
Gauge reconstruct 8 helper where we reconstruct the gauge matrix from 8 packed elements (maximal comp...
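As a reminder of why an 18-to-8 compression is possible at all (a counting argument, not code from this file): a general 3x3 complex matrix has 18 real numbers, unitarity (U†U = 1) imposes 9 real constraints, and det U = 1 removes one more, leaving 18 - 9 - 1 = 8 real parameters for SU(3). Roughly speaking, Pack keeps two matrix elements only as phases (via atan2) and the rest as real/imaginary pairs, while Unpack rebuilds the remaining entries from unitarity; the exact element layout is defined by Reconstruct<8>::Pack and Unpack in this file.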
gauge::FloatNOrder< float, N, 4, 8, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void loadGhost(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const