QUDA 1.0.0
gauge_field_order.h
1 
2 #ifndef _GAUGE_ORDER_H
3 #define _GAUGE_ORDER_H
4 
11 #ifndef __CUDACC_RTC__
12 #include <assert.h>
13 #endif
14 #include <type_traits>
15 
16 #include <register_traits.h>
17 #include <complex_quda.h>
18 #include <quda_matrix.h>
19 #include <index_helper.cuh>
20 #include <fast_intdiv.h>
21 #include <type_traits>
22 #include <limits>
23 #include <atomic.cuh>
24 #include <thrust_helper.cuh>
25 #include <gauge_field.h>
26 #include <index_helper.cuh>
27 #include <trove_helper.cuh>
28 #include <texture_helper.cuh>
29 
30 namespace quda {
31 
43  template <typename Float, typename T>
44  struct gauge_wrapper {
45  const int dim;
46  const int x_cb;
47  const int parity;
48  const Float phase;
49  T &gauge;
50 
58  __device__ __host__ inline gauge_wrapper<Float, T>(T &gauge, int dim, int x_cb, int parity, Float phase = 1.0) :
59  gauge(gauge),
60  dim(dim),
61  x_cb(x_cb),
62  parity(parity),
63  phase(phase)
64  {
65  }
66 
71  template<typename M>
72  __device__ __host__ inline void operator=(const M &a) {
73  gauge.save(a.data, x_cb, dim, parity);
74  }
75  };
76 
81  template <typename T, int N>
82  template <typename S>
83  __device__ __host__ inline void Matrix<T,N>::operator=(const gauge_wrapper<typename RealType<T>::type,S> &a) {
84  a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase);
85  }
86 
91  template <typename T, int N>
92  template <typename S>
93  __device__ __host__ inline Matrix<T,N>::Matrix(const gauge_wrapper<typename RealType<T>::type,S> &a) {
94  a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase);
95  }
96 
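// Editorial usage sketch (not part of the original header): how a kernel body typically
// consumes gauge_wrapper. G is assumed to be any accessor whose operator() returns a
// gauge_wrapper, e.g. the FloatNOrder accessor defined later in this file.
template <typename real, typename G>
__device__ __host__ void example_link_update(G &gauge, int dim, int x_cb, int parity)
{
  Matrix<complex<real>, 3> U = gauge(dim, x_cb, parity); // invokes gauge.load() via the wrapper
  // ... manipulate U here ...
  gauge(dim, x_cb, parity) = U;                          // invokes gauge.save() via operator=
}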
 108  template <typename Float, typename T>
 109  struct gauge_ghost_wrapper {
 110  const int dim;
111  const int ghost_idx;
112  const int parity;
113  const Float phase;
114  T &gauge;
115 
123  __device__ __host__ inline gauge_ghost_wrapper<Float, T>(
124  T &gauge, int dim, int ghost_idx, int parity, Float phase = 1.0) :
125  gauge(gauge),
126  dim(dim),
127  ghost_idx(ghost_idx),
128  parity(parity),
129  phase(phase)
130  {
131  }
132 
137  template<typename M>
138  __device__ __host__ inline void operator=(const M &a) {
139  gauge.saveGhost(a.data, ghost_idx, dim, parity);
140  }
141  };
142 
147  template <typename T, int N>
148  template <typename S>
149  __device__ __host__ inline void Matrix<T,N>::operator=(const gauge_ghost_wrapper<typename RealType<T>::type,S> &a) {
150  a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase);
151  }
152 
157  template <typename T, int N>
158  template <typename S>
159  __device__ __host__ inline Matrix<T,N>::Matrix(const gauge_ghost_wrapper<typename RealType<T>::type,S> &a) {
160  a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase);
161  }
162 
163  namespace gauge {
164 
165  template<typename ReduceType, typename Float> struct square_ {
166  square_(ReduceType scale) { }
167  __host__ __device__ inline ReduceType operator()(const quda::complex<Float> &x)
168  { return static_cast<ReduceType>(norm(x)); }
169  };
170 
171  template<typename ReduceType> struct square_<ReduceType,char> {
172  const ReduceType scale;
173  square_(const ReduceType scale) : scale(scale) { }
174  __host__ __device__ inline ReduceType operator()(const quda::complex<char> &x)
175  { return norm(scale * complex<ReduceType>(x.real(), x.imag())); }
176  };
177 
178  template<typename ReduceType> struct square_<ReduceType,short> {
179  const ReduceType scale;
180  square_(const ReduceType scale) : scale(scale) { }
181  __host__ __device__ inline ReduceType operator()(const quda::complex<short> &x)
182  { return norm(scale * complex<ReduceType>(x.real(), x.imag())); }
183  };
184 
185  template<typename ReduceType> struct square_<ReduceType,int> {
186  const ReduceType scale;
187  square_(const ReduceType scale) : scale(scale) { }
188  __host__ __device__ inline ReduceType operator()(const quda::complex<int> &x)
189  { return norm(scale * complex<ReduceType>(x.real(), x.imag())); }
190  };
191 
192  template<typename Float, typename storeFloat> struct abs_ {
193  abs_(const Float scale) { }
194  __host__ __device__ Float operator()(const quda::complex<storeFloat> &x) { return abs(x); }
195  };
196 
197  template<typename Float> struct abs_<Float,char> {
198  Float scale;
199  abs_(const Float scale) : scale(scale) { }
200  __host__ __device__ Float operator()(const quda::complex<char> &x)
201  { return abs(scale * complex<Float>(x.real(), x.imag())); }
202  };
203 
204  template<typename Float> struct abs_<Float,short> {
205  Float scale;
206  abs_(const Float scale) : scale(scale) { }
207  __host__ __device__ Float operator()(const quda::complex<short> &x)
208  { return abs(scale * complex<Float>(x.real(), x.imag())); }
209  };
210 
211  template<typename Float> struct abs_<Float,int> {
212  Float scale;
213  abs_(const Float scale) : scale(scale) { }
214  __host__ __device__ Float operator()(const quda::complex<int> &x)
215  { return abs(scale * complex<Float>(x.real(), x.imag())); }
216  };
217 
218  template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool fixed_point() { return false; }
219  template<> __host__ __device__ inline constexpr bool fixed_point<float,char>() { return true; }
220  template<> __host__ __device__ inline constexpr bool fixed_point<float,short>() { return true; }
221  template<> __host__ __device__ inline constexpr bool fixed_point<float,int>() { return true; }
222 
223  template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool match() { return false; }
224  template<> __host__ __device__ inline constexpr bool match<int,int>() { return true; }
225  template<> __host__ __device__ inline constexpr bool match<short,short>() { return true; }
226 
234  template <typename Float, typename storeFloat>
235  struct fieldorder_wrapper {
236  complex<storeFloat> *v;
237  const int idx;
238  const Float scale;
239  const Float scale_inv;
240  static constexpr bool fixed = fixed_point<Float,storeFloat>();
241 
246  __device__ __host__ inline fieldorder_wrapper(complex<storeFloat> *v, int idx, Float scale, Float scale_inv)
247  : v(v), idx(idx), scale(scale), scale_inv(scale_inv) {}
248 
249  __device__ __host__ inline Float real() const {
250  if (!fixed) {
251  return v[idx].real();
252  } else {
253  return scale_inv*static_cast<Float>(v[idx].real());
254  }
255  }
256 
257  __device__ __host__ inline Float imag() const {
258  if (!fixed) {
259  return v[idx].imag();
260  } else {
261  return scale_inv*static_cast<Float>(v[idx].imag());
262  }
263  }
264 
269  __device__ __host__ inline complex<Float> operator-() const {
270  return fixed ? -scale_inv*static_cast<complex<Float> >(v[idx]) : -static_cast<complex<Float> >(v[idx]);
271  }
272 
277  __device__ __host__ inline void operator=(const fieldorder_wrapper<Float,storeFloat> &a) {
278  v[idx] = fixed ? complex<storeFloat>(round(scale * a.real()), round(scale * a.imag())) : a.v[a.idx];
279  }
280 
285  template<typename theirFloat>
286  __device__ __host__ inline void operator=(const complex<theirFloat> &a) {
287  if (match<storeFloat,theirFloat>()) {
288  v[idx] = complex<storeFloat>(a.x, a.y);
289  } else {
290  v[idx] = fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
291  }
292  }
293 
298  template<typename theirFloat>
299  __device__ __host__ inline void operator+=(const complex<theirFloat> &a) {
300  if (match<storeFloat,theirFloat>()) {
301  v[idx] += complex<storeFloat>(a.x, a.y);
302  } else {
303  v[idx] += fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
304  }
305  }
306 
311  template<typename theirFloat>
312  __device__ __host__ inline void operator-=(const complex<theirFloat> &a) {
313  if (match<storeFloat,theirFloat>()) {
314  v[idx] -= complex<storeFloat>(a.x, a.y);
315  } else {
316  v[idx] -= fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
317  }
318  }
319 
320  };
321 
322  template<typename Float, typename storeFloat>
323  __device__ __host__ inline complex<Float> operator*(const Float &a, const fieldorder_wrapper<Float,storeFloat> &b)
324  {
325  if (fixed_point<Float,storeFloat>()) return a*complex<Float>(b.real(), b.imag());
326  else return a*complex<Float>(b.v[b.idx].real(),b.v[b.idx].imag());
327  }
328 
329  template<typename Float, typename storeFloat>
330  __device__ __host__ inline complex<Float> operator+(const fieldorder_wrapper<Float,storeFloat> &a, const complex<Float> &b) {
331  if (fixed_point<Float,storeFloat>()) return complex<Float>(a.real(), a.imag()) + b;
332  else return complex<Float>(a.v[a.idx].real(),a.v[a.idx].imag()) + b;
333  }
334 
335  template<typename Float, typename storeFloat>
336  __device__ __host__ inline complex<Float> operator+(const complex<Float> &a, const fieldorder_wrapper<Float,storeFloat> &b) {
337  if (fixed_point<Float,storeFloat>()) return a + complex<Float>(b.real(), b.imag());
338  else return a + complex<Float>(b.v[b.idx].real(),b.v[b.idx].imag());;
339  }
340 
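// Editorial illustration (assumption: Float = float, storeFloat = short): the fixed-point
// convention used by fieldorder_wrapper and the Accessors below. Writes store
// round(scale * x) and reads return scale_inv * stored, where scale is set from the
// field's site-element maximum by resetScale().
inline float example_fixed_point_round_trip(float x, float max)
{
  float scale = static_cast<float>(std::numeric_limits<short>::max()) / max;
  float scale_inv = max / static_cast<float>(std::numeric_limits<short>::max());
  short stored = static_cast<short>(round(scale * x)); // what operator= writes
  return scale_inv * static_cast<float>(stored);       // what real()/imag() return
}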
341  template<typename Float, int nColor, QudaGaugeFieldOrder order, typename storeFloat, bool use_tex>
342  struct Accessor {
343  mutable complex<Float> dummy;
344  Accessor(const GaugeField &, void *gauge_=0, void **ghost_=0) {
345  errorQuda("Not implemented for order=%d", order);
346  }
347 
348  void resetScale(Float dummy) { }
349 
350  __device__ __host__ complex<Float>& operator()(int d, int parity, int x, int row, int col) const {
351  return dummy;
352  }
353  };
354 
355  template<typename Float, int nColor, QudaGaugeFieldOrder order, bool native_ghost, typename storeFloat, bool use_tex>
356  struct GhostAccessor {
357  mutable complex<Float> dummy;
358  GhostAccessor(const GaugeField &, void *gauge_=0, void **ghost_=0) {
359  errorQuda("Not implemented for order=%d", order);
360  }
361 
362  void resetScale(Float dummy) { }
363 
364  __device__ __host__ complex<Float>& operator()(int d, int parity, int x, int row, int col) const {
365  return dummy;
366  }
367  };
368 
369  template<typename Float, int nColor, typename storeFloat, bool use_tex>
370  struct Accessor<Float,nColor,QUDA_QDP_GAUGE_ORDER,storeFloat,use_tex> {
371  complex <storeFloat> *u[QUDA_MAX_GEOMETRY];
372  const int volumeCB;
373  const int geometry;
374  const int cb_offset;
375  Float scale;
376  Float scale_inv;
377  static constexpr bool fixed = fixed_point<Float,storeFloat>();
378 
379  Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
380  : volumeCB(U.VolumeCB()), geometry(U.Geometry()), cb_offset((U.Bytes()>>1) / (sizeof(complex<storeFloat>)*U.Geometry())),
381  scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
382  {
383  for (int d=0; d<U.Geometry(); d++)
384  u[d] = gauge_ ? static_cast<complex<storeFloat>**>(gauge_)[d] :
385  static_cast<complex<storeFloat>**>(const_cast<void*>(U.Gauge_p()))[d];
386  resetScale(U.Scale());
387  }
388 
 389  Accessor(const Accessor<Float,nColor,QUDA_QDP_GAUGE_ORDER,storeFloat,use_tex> &a)
 390  : volumeCB(a.volumeCB), geometry(a.geometry), cb_offset(a.cb_offset), scale(a.scale), scale_inv(a.scale_inv) {
391  for (int d=0; d<QUDA_MAX_GEOMETRY; d++)
392  u[d] = a.u[d];
393  }
394 
395  void resetScale(Float max) {
396  if (fixed) {
397  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
398  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
399  }
400  }
401 
402  __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
403  {
404  complex<storeFloat> tmp = u[d][ parity*cb_offset + (x*nColor + row)*nColor + col];
405 
406  if (fixed) {
407  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
408  } else {
409  return complex<Float>(tmp.x,tmp.y);
410  }
411  }
412 
413  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x, int row, int col)
414  { return fieldorder_wrapper<Float,storeFloat>(u[d], parity*cb_offset + (x*nColor + row)*nColor + col,
415  scale, scale_inv); }
416 
417  template<typename theirFloat>
418  __device__ __host__ inline void atomic_add(int dim, int parity, int x_cb, int row, int col,
419  const complex<theirFloat> &val) const {
420 #ifdef __CUDA_ARCH__
421  typedef typename vector<storeFloat,2>::type vec2;
422  vec2 *u2 = reinterpret_cast<vec2*>(u[dim] + parity*cb_offset + (x_cb*nColor + row)*nColor + col);
423  if (fixed && !match<storeFloat,theirFloat>()) {
424  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
425  atomicAdd(u2, (vec2&)val_);
426  } else {
427  atomicAdd(u2, (vec2&)val);
428  }
429 #else
430  if (fixed && !match<storeFloat,theirFloat>()) {
431  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
432 #pragma omp atomic update
433  u[dim][ parity*cb_offset + (x_cb*nColor + row)*nColor + col].x += val_.x;
434 #pragma omp atomic update
435  u[dim][ parity*cb_offset + (x_cb*nColor + row)*nColor + col].y += val_.y;
436  } else {
437 #pragma omp atomic update
438  u[dim][ parity*cb_offset + (x_cb*nColor + row)*nColor + col].x += static_cast<storeFloat>(val.x);
439 #pragma omp atomic update
440  u[dim][ parity*cb_offset + (x_cb*nColor + row)*nColor + col].y += static_cast<storeFloat>(val.y);
441  }
442 #endif
443  }
444 
445  template<typename helper, typename reducer>
446  __host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const {
447  if (dim >= geometry) errorQuda("Request dimension %d exceeds dimensionality of the field %d", dim, geometry);
448  int lower = (dim == -1) ? 0 : dim;
449  int upper = (dim == -1) ? geometry : dim+1;
450  double result = init;
451  if (location == QUDA_CUDA_FIELD_LOCATION) {
 452  thrust_allocator alloc;
 453  for (int d=lower; d<upper; d++) {
454  thrust::device_ptr<complex<storeFloat> > ptr(u[d]);
455  result = thrust::transform_reduce(thrust::cuda::par(alloc), ptr, ptr+2*volumeCB*nColor*nColor, h, result, r);
456  }
457  } else {
458  for (int d=lower; d<upper; d++) {
459  result = thrust::transform_reduce(thrust::seq, u[d], u[d]+2*volumeCB*nColor*nColor, h, result, r);
460  }
461  }
462  return result;
463  }
464 
465  };
466 
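// Editorial usage sketch: element access through the QDP-ordered Accessor. The const
// overload returns a complex<Float> copy; the non-const overload returns a
// fieldorder_wrapper, so assignment and += go through the fixed-point scaling above.
template <typename Float, int nColor, typename storeFloat, bool use_tex>
__device__ __host__ void example_qdp_element(Accessor<Float, nColor, QUDA_QDP_GAUGE_ORDER, storeFloat, use_tex> &A,
                                             int d, int parity, int x_cb, int row, int col)
{
  const auto &A_c = A;                               // bind a const view to pick the read overload
  complex<Float> z = A_c(d, parity, x_cb, row, col); // read (rescaled if fixed point)
  A(d, parity, x_cb, row, col) = z;                  // write through fieldorder_wrapper
  A(d, parity, x_cb, row, col) += z;                 // accumulate in place
}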
467  template<typename Float, int nColor, bool native_ghost, typename storeFloat, bool use_tex>
468  struct GhostAccessor<Float,nColor,QUDA_QDP_GAUGE_ORDER,native_ghost,storeFloat,use_tex> {
469  complex<storeFloat> *ghost[8];
470  int ghostOffset[8];
471  Float scale;
472  Float scale_inv;
473  static constexpr bool fixed = fixed_point<Float,storeFloat>();
474 
475  GhostAccessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
476  : scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
477  for (int d=0; d<4; d++) {
478  ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
479  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d]));
480  ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
481 
482  ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
483  ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
484  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4]));
485  ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
486  }
487 
488  resetScale(U.Scale());
489  }
490 
 491  GhostAccessor(const GhostAccessor<Float,nColor,QUDA_QDP_GAUGE_ORDER,native_ghost,storeFloat,use_tex> &a)
 492  : scale(a.scale), scale_inv(a.scale_inv) {
493  for (int d=0; d<8; d++) {
494  ghost[d] = a.ghost[d];
495  ghostOffset[d] = a.ghostOffset[d];
496  }
497  }
498 
499  void resetScale(Float max) {
500  if (fixed) {
501  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
502  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
503  }
504  }
505 
506  __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
507  {
508  complex<storeFloat> tmp = ghost[d][ parity*ghostOffset[d] + (x*nColor + row)*nColor + col];
509  if (fixed) {
510  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
511  } else {
512  return complex<Float>(tmp.x,tmp.y);
513  }
514  }
515 
516  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x, int row, int col)
517  { return fieldorder_wrapper<Float,storeFloat>(ghost[d], parity*ghostOffset[d] + (x*nColor + row)*nColor + col,
518  scale, scale_inv); }
519  };
520 
521  template<typename Float, int nColor, typename storeFloat, bool use_tex>
522  struct Accessor<Float,nColor,QUDA_MILC_GAUGE_ORDER,storeFloat,use_tex> {
523  complex<storeFloat> *u;
524  const int volumeCB;
525  const int geometry;
526  Float scale;
527  Float scale_inv;
528  static constexpr bool fixed = fixed_point<Float,storeFloat>();
529 
530  Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
531  : u(gauge_ ? static_cast<complex<storeFloat>*>(gauge_) :
532  static_cast<complex<storeFloat>*>(const_cast<void *>(U.Gauge_p()))),
533  volumeCB(U.VolumeCB()), geometry(U.Geometry()),
534  scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
535  resetScale(U.Scale());
536  }
537 
 538  Accessor(const Accessor<Float,nColor,QUDA_MILC_GAUGE_ORDER,storeFloat,use_tex> &a)
 539  : u(a.u), volumeCB(a.volumeCB), geometry(a.geometry), scale(a.scale), scale_inv(a.scale_inv)
540  { }
541 
542  void resetScale(Float max) {
543  if (fixed) {
544  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
545  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
546  }
547  }
548 
549  __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
550  {
551  complex<storeFloat> tmp = u[(((parity*volumeCB+x)*geometry + d)*nColor + row)*nColor + col];
552  if (fixed) {
553  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
554  } else {
555  return complex<Float>(tmp.x,tmp.y);
556  }
557  }
558 
559  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x, int row, int col)
 560  { return fieldorder_wrapper<Float,storeFloat>
 561  (u, (((parity*volumeCB+x)*geometry + d)*nColor + row)*nColor + col, scale, scale_inv); }
562 
563  template <typename theirFloat>
564  __device__ __host__ inline void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex<theirFloat> &val) const {
565 #ifdef __CUDA_ARCH__
566  typedef typename vector<storeFloat,2>::type vec2;
567  vec2 *u2 = reinterpret_cast<vec2*>(u + (((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col);
568  if (fixed && !match<storeFloat,theirFloat>()) {
569  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
570  atomicAdd(u2, (vec2&)val_);
571  } else {
572  atomicAdd(u2, (vec2&)val);
573  }
574 #else
575  if (fixed && !match<storeFloat,theirFloat>()) {
576  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
577 #pragma omp atomic update
578  u[(((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col].x += val_.x;
579 #pragma omp atomic update
580  u[(((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col].y += val_.y;
581  } else {
582 #pragma omp atomic update
583  u[(((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col].x += static_cast<storeFloat>(val.x);
584 #pragma omp atomic update
585  u[(((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col].y += static_cast<storeFloat>(val.y);
586  }
587 #endif
588  }
589 
590  template<typename helper, typename reducer>
591  __host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const {
592  if (dim >= geometry) errorQuda("Request dimension %d exceeds dimensionality of the field %d", dim, geometry);
593  int lower = (dim == -1) ? 0 : dim;
594  int upper = (dim == -1) ? geometry : dim+1;
595  double result = init;
596  if (location == QUDA_CUDA_FIELD_LOCATION) {
 597  thrust_allocator alloc;
 598  thrust::device_ptr<complex<storeFloat> > ptr(u);
599  result = thrust::transform_reduce(thrust::cuda::par(alloc),
600  ptr+(0*geometry+lower)*volumeCB*nColor*nColor,
601  ptr+(0*geometry+upper)*volumeCB*nColor*nColor, h, result, r);
602  result = thrust::transform_reduce(thrust::cuda::par(alloc),
603  ptr+(1*geometry+lower)*volumeCB*nColor*nColor,
604  ptr+(1*geometry+upper)*volumeCB*nColor*nColor, h, result, r);
605  } else {
606  result = thrust::transform_reduce(thrust::seq,
607  u+(0*geometry+lower)*volumeCB*nColor*nColor,
608  u+(0*geometry+upper)*volumeCB*nColor*nColor, h, result, r);
609  result = thrust::transform_reduce(thrust::seq,
610  u+(1*geometry+lower)*volumeCB*nColor*nColor,
611  u+(1*geometry+upper)*volumeCB*nColor*nColor, h, result, r);
612  }
613  return result;
614  }
615 
616  };
617 
618  template<typename Float, int nColor, bool native_ghost, typename storeFloat, bool use_tex>
619  struct GhostAccessor<Float,nColor,QUDA_MILC_GAUGE_ORDER,native_ghost,storeFloat,use_tex> {
620  complex<storeFloat> *ghost[8];
621  int ghostOffset[8];
622  Float scale;
623  Float scale_inv;
624  static constexpr bool fixed = fixed_point<Float,storeFloat>();
625 
626  GhostAccessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
627  : scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
628  for (int d=0; d<4; d++) {
629  ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
630  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d]));
631  ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
632 
633  ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
634  ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
635  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4]));
636  ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
637  }
638 
639  resetScale(U.Scale());
640  }
641 
 642  GhostAccessor(const GhostAccessor<Float,nColor,QUDA_MILC_GAUGE_ORDER,native_ghost,storeFloat,use_tex> &a)
 643  : scale(a.scale), scale_inv(a.scale_inv) {
644  for (int d=0; d<8; d++) {
645  ghost[d] = a.ghost[d];
646  ghostOffset[d] = a.ghostOffset[d];
647  }
648  }
649 
650  void resetScale(Float max) {
651  if (fixed) {
652  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
653  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
654  }
655  }
656 
657  __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
658  {
659  complex<storeFloat> tmp = ghost[d][ parity*ghostOffset[d] + (x*nColor + row)*nColor + col];
660  if (fixed) {
661  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
662  } else {
663  return complex<Float>(tmp.x,tmp.y);
664  }
665  }
666 
667  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x, int row, int col)
 668  { return fieldorder_wrapper<Float,storeFloat>
 669  (ghost[d], parity*ghostOffset[d] + (x*nColor + row)*nColor + col, scale, scale_inv); }
670  };
671 
672  template<int nColor, int N>
673  __device__ __host__ inline int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb) {
674  constexpr int M = (2*nColor*nColor) / N;
 675  int j = ((row*nColor+col)*2) / N; // factor of two because the elements are complex
676  int i = ((row*nColor+col)*2) % N;
677  int index = ((x_cb + dim*stride*M + j*stride)*2+i) / 2; // back to a complex offset
678  index += parity*offset_cb;
679  return index;
680  };
681 
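// Editorial worked example (assumption: nColor = 3, N = 2, i.e. complex-pair storage):
// element (row, col) = (1, 2) maps to chunk j = ((1*3+2)*2)/2 = 5 with sub-index i = 0,
// so it sits 5*stride complex entries past the start of that link.
static_assert(((1 * 3 + 2) * 2) / 2 == 5 && ((1 * 3 + 2) * 2) % 2 == 0,
              "indexFloatN chunk arithmetic for nColor=3, N=2");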
682  template<typename Float, int nColor, typename storeFloat, bool use_tex>
683  struct Accessor<Float,nColor,QUDA_FLOAT2_GAUGE_ORDER, storeFloat, use_tex> {
684  complex<storeFloat> *u;
685  const int offset_cb;
686 #ifdef USE_TEXTURE_OBJECTS
687  typedef typename TexVectorType<Float,2>::type TexVector;
688  cudaTextureObject_t tex;
689 #endif
690  const int volumeCB;
691  const int stride;
692  const int geometry;
693  Float max;
694  Float scale;
695  Float scale_inv;
696  static constexpr bool fixed = fixed_point<Float,storeFloat>();
697 
698  Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0, bool override=false)
699  : u(gauge_ ? static_cast<complex<storeFloat>*>(gauge_) :
700  static_cast<complex<storeFloat>*>(const_cast<void*>(U.Gauge_p()))),
701  offset_cb( (U.Bytes()>>1) / sizeof(complex<storeFloat>)),
702 #ifdef USE_TEXTURE_OBJECTS
703  tex(0),
704 #endif
705  volumeCB(U.VolumeCB()), stride(U.Stride()), geometry(U.Geometry()),
706  max(static_cast<Float>(1.0)), scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
707  {
708  resetScale(U.Scale());
709 #ifdef USE_TEXTURE_OBJECTS
710  if (U.Location() == QUDA_CUDA_FIELD_LOCATION) tex = static_cast<const cudaGaugeField&>(U).Tex();
711  if (use_tex && this->u != U.Gauge_p() && !override) {
712  errorQuda("Cannot use texture read since data pointer does not equal field pointer - use with use_tex=false instead");
713  }
714 #endif
715  }
716 
 717  Accessor(const Accessor<Float,nColor,QUDA_FLOAT2_GAUGE_ORDER,storeFloat,use_tex> &a)
 718  : u(a.u), offset_cb(a.offset_cb),
719 #ifdef USE_TEXTURE_OBJECTS
720  tex(a.tex),
721 #endif
722  volumeCB(a.volumeCB), stride(a.stride), geometry(a.geometry),
723  scale(a.scale), scale_inv(a.scale_inv) { }
724 
725  void resetScale(Float max_) {
726  if (fixed) {
727  max = max_;
728  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
729  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
730  }
731  }
732 
733  __device__ __host__ inline const complex<Float> operator()(int dim, int parity, int x_cb, int row, int col) const
734  {
735 #if defined(USE_TEXTURE_OBJECTS) && defined(__CUDA_ARCH__)
736  if (use_tex) {
737  TexVector vecTmp = tex1Dfetch_<TexVector>(tex, parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb);
738  if (fixed) {
739  return max*complex<Float>(vecTmp.x, vecTmp.y);
740  } else {
741  return complex<Float>(vecTmp.x, vecTmp.y);
742  }
743  } else
744 #endif
745  {
746  complex<storeFloat> tmp = u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb];
747  if (fixed) {
748  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
749  } else {
750  return complex<Float>(tmp.x, tmp.y);
751  }
752  }
753  }
754 
755  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int dim, int parity, int x_cb, int row, int col)
756  {
757  int index = parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb;
758  return fieldorder_wrapper<Float,storeFloat>(u, index, scale, scale_inv);
759  }
760 
761  template <typename theirFloat>
762  __device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex<theirFloat> &val) const {
763 #ifdef __CUDA_ARCH__
764  typedef typename vector<storeFloat,2>::type vec2;
765  vec2 *u2 = reinterpret_cast<vec2*>(u + parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb);
766  if (fixed && !match<storeFloat,theirFloat>()) {
767  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
768  atomicAdd(u2, (vec2&)val_);
769  } else {
770  atomicAdd(u2, (vec2&)val);
771  }
772 #else
773  if (fixed && !match<storeFloat,theirFloat>()) {
774  complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
775 #pragma omp atomic update
776  u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb].x += val_.x;
777 #pragma omp atomic update
778  u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb].y += val_.y;
779  } else {
780 #pragma omp atomic update
781  u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb].x += static_cast<storeFloat>(val.x);
782 #pragma omp atomic update
783  u[parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb].y += static_cast<storeFloat>(val.y);
784  }
785 #endif
786  }
787 
788  template<typename helper, typename reducer>
789  __host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const {
790  if (dim >= geometry) errorQuda("Request dimension %d exceeds dimensionality of the field %d", dim, geometry);
791  int lower = (dim == -1) ? 0 : dim;
792  int upper = (dim == -1) ? geometry : dim+1;
793  double result = init;
794  if (location == QUDA_CUDA_FIELD_LOCATION) {
 795  thrust_allocator alloc;
 796  thrust::device_ptr<complex<storeFloat> > ptr(u);
797  result = thrust::transform_reduce(thrust::cuda::par(alloc),
798  ptr+0*offset_cb+lower*stride*nColor*nColor,
799  ptr+0*offset_cb+upper*stride*nColor*nColor, h, result, r);
800  result = thrust::transform_reduce(thrust::cuda::par(alloc),
801  ptr+1*offset_cb+lower*stride*nColor*nColor,
802  ptr+1*offset_cb+upper*stride*nColor*nColor, h, result, r);
803  } else {
804  result = thrust::transform_reduce(thrust::seq,
805  u+0*offset_cb+lower*stride*nColor*nColor,
806  u+0*offset_cb+upper*stride*nColor*nColor, h, result, r);
807  result = thrust::transform_reduce(thrust::seq,
808  u+1*offset_cb+lower*stride*nColor*nColor,
809  u+1*offset_cb+upper*stride*nColor*nColor, h, result, r);
810  }
811  return result;
812  }
813 
814  };
815 
816  template<typename Float, int nColor, bool native_ghost, typename storeFloat, bool use_tex>
817  struct GhostAccessor<Float,nColor,QUDA_FLOAT2_GAUGE_ORDER,native_ghost,storeFloat,use_tex> {
818  complex<storeFloat> *ghost[8];
819  const int volumeCB;
820  int ghostVolumeCB[8];
821  Float scale;
822  Float scale_inv;
823  static constexpr bool fixed = fixed_point<Float,storeFloat>();
 824  Accessor<Float,nColor,QUDA_FLOAT2_GAUGE_ORDER,storeFloat,use_tex> accessor;
 825 
826  GhostAccessor(const GaugeField &U, void *gauge_, void **ghost_=0)
827  : volumeCB(U.VolumeCB()), accessor(U, gauge_, ghost_),
828  scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
829  {
830  if (!native_ghost) assert(ghost_ != nullptr);
831  for (int d=0; d<4; d++) {
832  ghost[d] = !native_ghost ? static_cast<complex<storeFloat>*>(ghost_[d]) : nullptr;
833  ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d);
834  ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast<complex<storeFloat>*>(ghost_[d+4]) : nullptr;
835  ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d);
836  }
837  resetScale(U.Scale());
838  }
839 
 840  GhostAccessor(const GhostAccessor<Float,nColor,QUDA_FLOAT2_GAUGE_ORDER,native_ghost,storeFloat,use_tex> &a)
 841  : volumeCB(a.volumeCB), scale(a.scale), scale_inv(a.scale_inv), accessor(a.accessor)
842  {
843  for (int d=0; d<8; d++) {
844  ghost[d] = a.ghost[d];
845  ghostVolumeCB[d] = a.ghostVolumeCB[d];
846  }
847  }
848 
849  void resetScale(Float max) {
850  accessor.resetScale(max);
851  if (fixed) {
852  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
853  scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
854  }
855  }
856 
857  __device__ __host__ inline const complex<Float> operator()(int d, int parity, int x_cb, int row, int col) const
858  {
859  if (native_ghost) {
860  return accessor(d%4, parity, x_cb+(d/4)*ghostVolumeCB[d]+volumeCB, row, col);
861  } else {
862  complex<storeFloat> tmp = ghost[d][ ((parity*nColor + row)*nColor+col)*ghostVolumeCB[d] + x_cb ];
863  if (fixed) {
864  return scale_inv*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
865  } else {
866  return complex<Float>(tmp.x, tmp.y);
867  }
868  }
869  }
870 
871  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int d, int parity, int x_cb, int row, int col)
872  {
873  if (native_ghost)
874  return accessor(d%4, parity, x_cb+(d/4)*ghostVolumeCB[d]+volumeCB, row, col);
875  else
 876  return fieldorder_wrapper<Float,storeFloat>
 877  (ghost[d], ((parity*nColor + row)*nColor+col)*ghostVolumeCB[d] + x_cb, scale, scale_inv);
878  }
879  };
880 
881 
894  template <typename Float, int nColor, int nSpinCoarse, QudaGaugeFieldOrder order,
895  bool native_ghost=true, typename storeFloat=Float, bool use_tex=false>
896  struct FieldOrder {
897 
899  const int volumeCB;
900  const int nDim;
 901  const int_fastdiv geometry;
 902  const QudaFieldLocation location;
 903  static constexpr int nColorCoarse = nColor / nSpinCoarse;
904 
 905  Accessor<Float,nColor,order,storeFloat,use_tex> accessor;
 906  GhostAccessor<Float,nColor,order,native_ghost,storeFloat,use_tex> ghostAccessor;
 907 
912  FieldOrder(GaugeField &U, void *gauge_=0, void **ghost_=0)
913  : volumeCB(U.VolumeCB()), nDim(U.Ndim()), geometry(U.Geometry()),
914  location(U.Location()),
915  accessor(U, gauge_, ghost_), ghostAccessor(U, gauge_, ghost_)
916  {
917  if (U.Reconstruct() != QUDA_RECONSTRUCT_NO)
918  errorQuda("GaugeField ordering not supported with reconstruction");
919  }
920 
921  FieldOrder(const FieldOrder &o) : volumeCB(o.volumeCB),
922  nDim(o.nDim), geometry(o.geometry), location(o.location),
923  accessor(o.accessor), ghostAccessor(o.ghostAccessor)
924  { }
925 
926  void resetScale(double max) {
927  accessor.resetScale(max);
928  ghostAccessor.resetScale(max);
929  }
930 
931  static constexpr bool fixedPoint() { return fixed_point<Float,storeFloat>(); }
932 
941  __device__ __host__ complex<Float> operator()(int d, int parity, int x, int row, int col) const
942  { return accessor(d,parity,x,row,col); }
943 
952  __device__ __host__ fieldorder_wrapper<Float,storeFloat> operator() (int d, int parity, int x, int row, int col)
953  { return accessor(d,parity,x,row,col); }
954 
963  __device__ __host__ complex<Float> Ghost(int d, int parity, int x, int row, int col) const
964  { return ghostAccessor(d,parity,x,row,col); }
965 
974  __device__ __host__ fieldorder_wrapper<Float,storeFloat> Ghost(int d, int parity, int x, int row, int col)
975  { return ghostAccessor(d,parity,x,row,col); }
976 
987  __device__ __host__ inline const complex<Float> operator()(int d, int parity, int x, int s_row,
988  int s_col, int c_row, int c_col) const {
989  return (*this)(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col);
990  }
991 
1002  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()
1003  (int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) {
1004  return (*this)(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col);
1005  }
1006 
1017  __device__ __host__ inline complex<Float> Ghost(int d, int parity, int x, int s_row,
1018  int s_col, int c_row, int c_col) const {
1019  return Ghost(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col);
1020  }
1021 
1032  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat>
1033  Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) {
1034  return Ghost(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col);
1035  }
1036 
1037  template <typename theirFloat>
1038  __device__ __host__ inline void atomicAdd(int d, int parity, int x, int s_row, int s_col,
1039  int c_row, int c_col, const complex<theirFloat> &val) {
1040  accessor.atomic_add(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col, val);
1041  }
1042 
1044  __device__ __host__ inline int Ncolor() const { return nColor; }
1045 
1047  __device__ __host__ inline int Volume() const { return 2*volumeCB; }
1048 
1050  __device__ __host__ inline int VolumeCB() const { return volumeCB; }
1051 
1053  __device__ __host__ inline int Ndim() const { return nDim; }
1054 
1056  __device__ __host__ inline int Geometry() const { return geometry; }
1057 
1059  __device__ __host__ inline int NspinCoarse() const { return nSpinCoarse; }
1060 
1062  __device__ __host__ inline int NcolorCoarse() const { return nColorCoarse; }
1063 
1069  __host__ double norm1(int dim=-1, bool global=true) const {
1070  double nrm1 = accessor.transform_reduce(location, dim, abs_<double,storeFloat>(accessor.scale_inv),
1071  thrust::plus<double>(), 0.0);
1072  if (global) comm_allreduce(&nrm1);
1073  return nrm1;
1074  }
1075 
1081  __host__ double norm2(int dim=-1, bool global=true) const {
1082  double nrm2 = accessor.transform_reduce(location, dim, square_<double,storeFloat>(accessor.scale_inv),
1083  thrust::plus<double>(), 0.0);
1084  if (global) comm_allreduce(&nrm2);
1085  return nrm2;
1086  }
1087 
1093  __host__ double abs_max(int dim=-1, bool global=true) const {
1094  double absmax = accessor.transform_reduce(location, dim, abs_<Float,storeFloat>(accessor.scale_inv),
1095  thrust::maximum<Float>(), 0.0);
1096  if (global) comm_allreduce_max(&absmax);
1097  return absmax;
1098  }
1099 
1105  __host__ double abs_min(int dim=-1, bool global=true) const {
1106  double absmin = accessor.transform_reduce(location, dim, abs_<Float,storeFloat>(accessor.scale_inv),
1107  thrust::minimum<Float>(), std::numeric_limits<double>::max());
1108  if (global) comm_allreduce_min(&absmin);
1109  return absmin;
1110  }
1111 
1113  size_t Bytes() const { return static_cast<size_t>(volumeCB) * nColor * nColor * 2ll * sizeof(storeFloat); }
1114  };
1115 
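// Editorial usage sketch: constructing a FieldOrder view over a QUDA_MILC_GAUGE_ORDER
// field with 3 colors and no coarse spin structure, then taking its global L2 norm.
// The field is assumed to use QUDA_RECONSTRUCT_NO, as the constructor above requires.
template <typename Float>
__host__ double example_field_order_norm2(GaugeField &U)
{
  FieldOrder<Float, 3, 1, QUDA_MILC_GAUGE_ORDER> F(U);
  return F.norm2(); // transform_reduce over the accessor, then comm_allreduce
}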
1124  template <int N, typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase = QUDA_STAGGERED_PHASE_NO>
1125  struct Reconstruct {
1126  using real = typename mapper<Float>::type;
 1127  using complex = complex<real>;
 1128  const real scale;
 1129  const real scale_inv;
 1130  Reconstruct(const GaugeField &u) : scale(u.LinkMax()), scale_inv(1.0 / scale) {}
1131  Reconstruct(const Reconstruct<N, Float, ghostExchange_> &recon) : scale(recon.scale), scale_inv(recon.scale_inv)
1132  {
1133  }
1134 
1135  __device__ __host__ inline void Pack(real out[N], const complex in[N / 2], int idx) const
1136  {
1137  if (isFixed<Float>::value) {
1138 #pragma unroll
1139  for (int i = 0; i < N / 2; i++) {
1140  out[2 * i + 0] = scale_inv * in[i].real();
1141  out[2 * i + 1] = scale_inv * in[i].imag();
1142  }
1143  } else {
1144 #pragma unroll
1145  for (int i = 0; i < N / 2; i++) {
1146  out[2 * i + 0] = in[i].real();
1147  out[2 * i + 1] = in[i].imag();
1148  }
1149  }
1150  }
1151 
1152  template <typename I>
1153  __device__ __host__ inline void Unpack(complex out[N / 2], const real in[N], int idx, int dir, real phase,
1154  const I *X, const int *R) const
1155  {
1156  if (isFixed<Float>::value) {
1157 #pragma unroll
1158  for (int i = 0; i < N / 2; i++) { out[i] = scale * complex(in[2 * i + 0], in[2 * i + 1]); }
1159  } else {
1160 #pragma unroll
1161  for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); }
1162  }
1163  }
1164  __device__ __host__ inline real getPhase(const complex in[N / 2]) const { return 0; }
1165  };
1166 
1178  template <QudaGhostExchange ghostExchange_, typename T, typename I>
1179  __device__ __host__ inline T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM],
1180  T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound, bool isFirstTimeSlice,
1181  bool isLastTimeSlice, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_NO)
1182  {
1183 
1184  // MWTODO: should this return tBoundary : scale or tBoundary*scale : scale
1185 
1186  if (ghostExchange_ == QUDA_GHOST_EXCHANGE_PAD
1187  || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED)) {
1188  if (idx >= firstTimeSliceBound) { // halo region on the first time slice
1189  return isFirstTimeSlice ? tBoundary : scale;
1190  } else if (idx >= lastTimeSliceBound) { // last link on the last time slice
1191  return isLastTimeSlice ? tBoundary : scale;
1192  } else {
1193  return scale;
1194  }
1195  } else if (ghostExchange_ == QUDA_GHOST_EXCHANGE_EXTENDED
1196  || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED)) {
1197  if (idx >= (R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < R[3] * X[0] * X[1] * X[2] / 2) {
1198  // the boundary condition is on the R[3]-1 time slice
1199  return isFirstTimeSlice ? tBoundary : scale;
1200  } else if (idx >= (X[3] - R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < (X[3] - R[3]) * X[0] * X[1] * X[2] / 2) {
1201  // the boundary condition lies on the X[3]-R[3]-1 time slice
1202  return isLastTimeSlice ? tBoundary : scale;
1203  } else {
1204  return scale;
1205  }
1206  }
1207  return scale;
1208  }
1209 
1210  // not actually used - here for reference
1211  template <typename Float, typename I>
1212  __device__ __host__ inline Float milcStaggeredPhase(int dim, const int x[], const I R[]) {
1213  // could consider non-extended variant too?
1214  Float sign = static_cast<Float>(1.0);
1215  switch (dim) {
1216  case 0: if ( ((x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
1217  case 1: if ( ((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
1218  case 2: if ( ((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
1219  }
1220  return sign;
1221  }
1222 
1230  template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<12, Float, ghostExchange_> {
1231  using real = typename mapper<Float>::type;
 1232  using complex = complex<real>;
 1233  const real anisotropy;
 1234  const real tBoundary;
 1235  const int firstTimeSliceBound;
 1236  const int lastTimeSliceBound;
 1237  const bool isFirstTimeSlice;
1238  const bool isLastTimeSlice;
 1239  QudaGhostExchange ghostExchange;
 1240 
 1241  Reconstruct(const GaugeField &u) :
 1242  anisotropy(u.Anisotropy()),
1243  tBoundary(static_cast<real>(u.TBoundary())),
1244  firstTimeSliceBound(u.VolumeCB()),
1245  lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2),
1246  isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
1247  isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false),
1248  ghostExchange(u.GhostExchange())
1249  {
1250  }
1251 
 1252  Reconstruct(const Reconstruct<12, Float, ghostExchange_> &recon) :
 1253  anisotropy(recon.anisotropy),
1254  tBoundary(recon.tBoundary),
1255  firstTimeSliceBound(recon.firstTimeSliceBound),
1256  lastTimeSliceBound(recon.lastTimeSliceBound),
1257  isFirstTimeSlice(recon.isFirstTimeSlice),
1258  isLastTimeSlice(recon.isLastTimeSlice),
1259  ghostExchange(recon.ghostExchange)
1260  {
1261  }
1262 
1263  __device__ __host__ inline void Pack(real out[12], const complex in[9], int idx) const
1264  {
1265 #pragma unroll
1266  for (int i = 0; i < 6; i++) {
1267  out[2 * i + 0] = in[i].real();
1268  out[2 * i + 1] = in[i].imag();
1269  }
1270  }
1271 
1272  template <typename I>
1273  __device__ __host__ inline void Unpack(complex out[9], const real in[12], int idx, int dir, real phase,
1274  const I *X, const int *R) const
1275  {
1276 #pragma unroll
1277  for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]);
1278 
1279  const real u0 = dir < 3 ?
1280  anisotropy :
1281  timeBoundary<ghostExchange_>(idx, X, R, tBoundary, static_cast<real>(1.0), firstTimeSliceBound,
1282  lastTimeSliceBound, isFirstTimeSlice, isLastTimeSlice, ghostExchange);
1283 
1284  // out[6] = u0*conj(out[1]*out[5] - out[2]*out[4]);
1285  out[6] = cmul(out[2], out[4]);
1286  out[6] = cmac(out[1], out[5], -out[6]);
1287  out[6] = u0 * conj(out[6]);
1288 
1289  // out[7] = u0*conj(out[2]*out[3] - out[0]*out[5]);
1290  out[7] = cmul(out[0], out[5]);
1291  out[7] = cmac(out[2], out[3], -out[7]);
1292  out[7] = u0 * conj(out[7]);
1293 
1294  // out[8] = u0*conj(out[0]*out[4] - out[1]*out[3]);
1295  out[8] = cmul(out[1], out[3]);
1296  out[8] = cmac(out[0], out[4], -out[8]);
1297  out[8] = u0 * conj(out[8]);
1298  }
1299 
1300  __device__ __host__ inline real getPhase(const complex in[9]) { return 0; }
1301  };
1302 
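// Editorial note: the cmul/cmac blocks above implement the SU(3) identity
// row2 = u0 * conj(row0 x row1); for column c this reads
//   out[6+c] = u0 * conj(out[(c+1)%3] * out[(c+2)%3 + 3] - out[(c+2)%3] * out[(c+1)%3 + 3]),
// written with fused multiply-adds instead of the commented one-line form.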
1313  template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<11, Float, ghostExchange_> {
1314  using real = typename mapper<Float>::type;
 1315  using complex = complex<real>;
 1316 
1317  Reconstruct(const GaugeField &u) { ; }
 1318  Reconstruct(const Reconstruct<11, Float, ghostExchange_> &recon) { ; }
 1319 
1320  __device__ __host__ inline void Pack(real out[10], const complex in[9], int idx) const
1321  {
1322 #pragma unroll
1323  for (int i = 0; i < 2; i++) {
1324  out[2 * i + 0] = in[i + 1].real();
1325  out[2 * i + 1] = in[i + 1].imag();
1326  }
1327  out[4] = in[5].real();
1328  out[5] = in[5].imag();
1329  out[6] = in[0].imag();
1330  out[7] = in[4].imag();
1331  out[8] = in[8].imag();
1332  out[9] = 0.0;
1333  }
1334 
1335  template <typename I>
1336  __device__ __host__ inline void Unpack(complex out[9], const real in[10], int idx, int dir, real phase,
1337  const I *X, const int *R) const
1338  {
1339  out[0] = complex(0.0, in[6]);
1340  out[1] = complex(in[0], in[1]);
1341  out[2] = complex(in[2], in[3]);
1342  out[3] = complex(-out[1].real(), out[1].imag());
1343  out[4] = complex(0.0, in[7]);
1344  out[5] = complex(in[4], in[5]);
1345  out[6] = complex(-out[2].real(), out[2].imag());
1346  out[7] = complex(-out[5].real(), out[5].imag());
1347  out[8] = complex(0.0, in[8]);
1348  }
1349 
1350  __device__ __host__ inline real getPhase(const complex in[9]) { return 0; }
1351  };
1352 
1361  template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase>
1362  struct Reconstruct<13, Float, ghostExchange_, stag_phase> {
1363  using real = typename mapper<Float>::type;
 1364  using complex = complex<real>;
 1365  const Reconstruct<12, Float, ghostExchange_> reconstruct_12;
 1366  const real scale;
 1367  const real scale_inv;
 1368 
1369  Reconstruct(const GaugeField &u) : reconstruct_12(u), scale(u.Scale()), scale_inv(1.0 / scale) {}
 1370  Reconstruct(const Reconstruct<13, Float, ghostExchange_, stag_phase> &recon) :
 1371  reconstruct_12(recon.reconstruct_12),
1372  scale(recon.scale),
1373  scale_inv(recon.scale_inv)
1374  {
1375  }
1376 
1377  __device__ __host__ inline void Pack(real out[12], const complex in[9], int idx) const
1378  {
1379  reconstruct_12.Pack(out, in, idx);
1380  }
1381 
1382  template <typename I>
1383  __device__ __host__ inline void Unpack(complex out[9], const real in[12], int idx, int dir, real phase,
1384  const I *X, const int *R) const
1385  {
1386 #pragma unroll
1387  for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]);
1388 
1389  out[6] = cmul(out[2], out[4]);
1390  out[6] = cmac(out[1], out[5], -out[6]);
1391  out[6] = scale_inv * conj(out[6]);
1392 
1393  out[7] = cmul(out[0], out[5]);
1394  out[7] = cmac(out[2], out[3], -out[7]);
1395  out[7] = scale_inv * conj(out[7]);
1396 
1397  out[8] = cmul(out[1], out[3]);
1398  out[8] = cmac(out[0], out[4], -out[8]);
1399  out[8] = scale_inv * conj(out[8]);
1400 
1401  if (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phasing
1402  // Multiply the third row by exp(I*3*phase), since the cross product will end up in a scale factor of exp(-I*2*phase)
1403  real cos_sin[2];
1404  Trig<isFixed<real>::value, real>::SinCos(static_cast<real>(3. * phase), &cos_sin[1], &cos_sin[0]);
1405  complex A(cos_sin[0], cos_sin[1]);
1406  out[6] = cmul(A, out[6]);
1407  out[7] = cmul(A, out[7]);
1408  out[8] = cmul(A, out[8]);
1409  } else { // phase is +/- 1 so real multiply is sufficient
1410  out[6] *= phase;
1411  out[7] *= phase;
1412  out[8] *= phase;
1413  }
1414  }
1415 
1416  __device__ __host__ inline real getPhase(const complex in[9]) const
1417  {
1418 #if 1 // phase from cross product
1419  // denominator = (U[0][0]*U[1][1] - U[0][1]*U[1][0])*
1420  complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv;
1421  complex expI3Phase = in[8] / denom; // numerator = U[2][2]
1422 
1423  if (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phasing
1424  return arg(expI3Phase) / static_cast<real>(3.0);
1425  } else {
1426  return expI3Phase.real() > 0 ? 1 : -1;
1427  }
1428 #else // phase from determinant
 1429  Matrix<complex, 3> a;
 1430 #pragma unroll
1431  for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i];
1432  const complex det = getDeterminant(a);
1433  return phase = arg(det) / 3;
1434 #endif
1435  }
1436  };
1437 
1445  template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<8, Float, ghostExchange_> {
1446  using real = typename mapper<Float>::type;
 1447  using complex = complex<real>;
 1448  const complex anisotropy; // imaginary value stores inverse
1449  const complex tBoundary; // imaginary value stores inverse
 1450  const int firstTimeSliceBound;
 1451  const int lastTimeSliceBound;
 1452  const bool isFirstTimeSlice;
1453  const bool isLastTimeSlice;
 1454  QudaGhostExchange ghostExchange;
 1455 
1456  // scale factor is set when using recon-9
1457  Reconstruct(const GaugeField &u, real scale = 1.0) :
1458  anisotropy(u.Anisotropy() * scale, 1.0 / (u.Anisotropy() * scale)),
1459  tBoundary(static_cast<real>(u.TBoundary()) * scale, 1.0 / (static_cast<real>(u.TBoundary()) * scale)),
1460  firstTimeSliceBound(u.VolumeCB()),
1461  lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2),
1462  isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
1463  isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false),
1464  ghostExchange(u.GhostExchange())
1465  {
1466  }
1467 
 1468  Reconstruct(const Reconstruct<8, Float, ghostExchange_> &recon) :
 1469  anisotropy(recon.anisotropy),
1470  tBoundary(recon.tBoundary),
1471  firstTimeSliceBound(recon.firstTimeSliceBound),
1472  lastTimeSliceBound(recon.lastTimeSliceBound),
1473  isFirstTimeSlice(recon.isFirstTimeSlice),
1474  isLastTimeSlice(recon.isLastTimeSlice),
1475  ghostExchange(recon.ghostExchange)
1476  {
1477  }
1478 
1479  __device__ __host__ inline void Pack(real out[8], const complex in[9], int idx) const
1480  {
1481  out[0] = Trig<isFixed<Float>::value, real>::Atan2(in[0].imag(), in[0].real());
1482  out[1] = Trig<isFixed<Float>::value, real>::Atan2(in[6].imag(), in[6].real());
1483 #pragma unroll
1484  for (int i = 1; i < 4; i++) {
1485  out[2 * i + 0] = in[i].real();
1486  out[2 * i + 1] = in[i].imag();
1487  }
1488  }
1489 
1490  template <typename I>
1491  __device__ __host__ inline void Unpack(complex out[9], const real in[8], int idx, int dir, real phase,
1492  const I *X, const int *R, const complex scale, const complex u) const
1493  {
1494  real u0 = u.real();
1495  real u0_inv = u.imag();
1496 
1497 #pragma unroll
1498  for (int i = 1; i <= 3; i++)
1499  out[i] = complex(in[2 * i + 0], in[2 * i + 1]); // these elements are copied directly
1500 
1501  real tmp[2];
1502  Trig<isFixed<Float>::value, real>::SinCos(in[0], &tmp[1], &tmp[0]);
1503  out[0] = complex(tmp[0], tmp[1]);
1504 
1505  Trig<isFixed<Float>::value, real>::SinCos(in[1], &tmp[1], &tmp[0]);
1506  out[6] = complex(tmp[0], tmp[1]);
1507 
1508  // First, reconstruct first row
1509  real row_sum = out[1].real() * out[1].real();
1510  row_sum += out[1].imag() * out[1].imag();
1511  row_sum += out[2].real() * out[2].real();
1512  row_sum += out[2].imag() * out[2].imag();
1513  real row_sum_inv = static_cast<real>(1.0) / row_sum;
1514 
1515  real diff = u0_inv * u0_inv - row_sum;
1516  real U00_mag = diff > 0.0 ? diff * rsqrt(diff) : static_cast<real>(0.0);
1517 
1518  out[0] *= U00_mag;
1519 
1520  // Second, reconstruct first column
1521  real column_sum = out[0].real() * out[0].real();
1522  column_sum += out[0].imag() * out[0].imag();
1523  column_sum += out[3].real() * out[3].real();
1524  column_sum += out[3].imag() * out[3].imag();
1525 
1526  diff = u0_inv * u0_inv - column_sum;
1527  real U20_mag = diff > 0.0 ? diff * rsqrt(diff) : static_cast<real>(0.0);
1528 
1529  out[6] *= U20_mag;
1530 
1531  // Finally, reconstruct last elements from SU(2) rotation
1532  real r_inv2 = u0_inv * row_sum_inv;
1533  {
1534  complex A = cmul(conj(out[0]), out[3]);
1535 
1536  // out[4] = -(conj(out[6])*conj(out[2]) + u0*A*out[1])*r_inv2; // U11
1537  out[4] = cmul(conj(out[6]), conj(out[2]));
1538  out[4] = cmac(u0 * A, out[1], out[4]);
1539  out[4] = -r_inv2 * out[4];
1540 
1541  // out[5] = (conj(out[6])*conj(out[1]) - u0*A*out[2])*r_inv2; // U12
1542  out[5] = cmul(conj(out[6]), conj(out[1]));
1543  out[5] = cmac(-u0 * A, out[2], out[5]);
1544  out[5] = r_inv2 * out[5];
1545  }
1546 
1547  {
1548  complex A = cmul(conj(out[0]), out[6]);
1549 
1550  // out[7] = (conj(out[3])*conj(out[2]) - u0*A*out[1])*r_inv2; // U21
1551  out[7] = cmul(conj(out[3]), conj(out[2]));
1552  out[7] = cmac(-u0 * A, out[1], out[7]);
1553  out[7] = r_inv2 * out[7];
1554 
 1555  // out[8] = -(conj(out[3])*conj(out[1]) + u0*A*out[2])*r_inv2; // U22
1556  out[8] = cmul(conj(out[3]), conj(out[1]));
1557  out[8] = cmac(u0 * A, out[2], out[8]);
1558  out[8] = -r_inv2 * out[8];
1559  }
1560  }
1561 
1562  template <typename I>
1563  __device__ __host__ inline void
1564  Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R,
1565  const complex scale = complex(static_cast<real>(1.0), static_cast<real>(1.0))) const
1566  {
1567  complex u = dir < 3 ?
1568  anisotropy :
1569  timeBoundary<ghostExchange_>(idx, X, R, tBoundary, scale, firstTimeSliceBound, lastTimeSliceBound,
1570  isFirstTimeSlice, isLastTimeSlice, ghostExchange);
1571  Unpack(out, in, idx, dir, phase, X, R, scale, u);
1572  }
1573 
1574  __device__ __host__ inline real getPhase(const complex in[9]) { return 0; }
1575  };
1576 
1585  template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase>
1586  struct Reconstruct<9, Float, ghostExchange_, stag_phase> {
1587  using real = typename mapper<Float>::type;
 1588  using complex = complex<real>;
 1589  const Reconstruct<8, Float, ghostExchange_> reconstruct_8;
 1590  const real scale;
 1591  const real scale_inv;
 1592 
1593  Reconstruct(const GaugeField &u) : reconstruct_8(u), scale(u.Scale()), scale_inv(1.0 / scale) {}
1594 
 1595  Reconstruct(const Reconstruct<9, Float, ghostExchange_, stag_phase> &recon) :
 1596  reconstruct_8(recon.reconstruct_8),
1597  scale(recon.scale),
1598  scale_inv(recon.scale_inv)
1599  {
1600  }
1601 
1602  __device__ __host__ inline real getPhase(const complex in[9]) const
1603  {
1604 #if 1 // phase from cross product
1605  // denominator = (U[0][0]*U[1][1] - U[0][1]*U[1][0])*
1606  complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv;
1607  complex expI3Phase = in[8] / denom; // numerator = U[2][2]
1608  if (stag_phase == QUDA_STAGGERED_PHASE_NO) {
1609  return arg(expI3Phase) / static_cast<real>(3.0);
1610  } else {
1611  return expI3Phase.real() > 0 ? 1 : -1;
1612  }
1613 #else // phase from determinant
 1614  Matrix<complex, 3> a;
 1615 #pragma unroll
1616  for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i];
1617  const complex det = getDeterminant(a);
1618  real phase = arg(det) / 3;
1619  return phase;
1620 #endif
1621  }
1622 
1623  // Rescale the U3 input matrix by exp(-I*phase) to obtain an SU3 matrix multiplied by a real scale factor,
1624  __device__ __host__ inline void Pack(real out[8], const complex in[9], int idx) const
1625  {
1626  real phase = getPhase(in);
1627  complex su3[9];
1628 
1629  if (stag_phase == QUDA_STAGGERED_PHASE_NO) {
1630  real cos_sin[2];
1631  Trig<isFixed<real>::value, real>::SinCos(static_cast<real>(-phase), &cos_sin[1], &cos_sin[0]);
1632  complex z(cos_sin[0], cos_sin[1]);
1633  z *= scale_inv;
1634 #pragma unroll
1635  for (int i = 0; i < 9; i++) su3[i] = cmul(z, in[i]);
1636  } else {
1637 #pragma unroll
1638  for (int i = 0; i < 9; i++) { su3[i] = phase * in[i]; }
1639  }
1640  reconstruct_8.Pack(out, su3, idx);
1641  }
1642 
1643  template <typename I>
1644  __device__ __host__ inline void Unpack(complex out[9], const real in[8], int idx, int dir, real phase,
1645  const I *X, const int *R) const
1646  {
1647  reconstruct_8.Unpack(out, in, idx, dir, phase, X, R, complex(static_cast<real>(1.0), static_cast<real>(1.0)),
1648  complex(static_cast<real>(1.0), static_cast<real>(1.0)));
1649 
1650  if (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phase
1651  real cos_sin[2];
1652  Trig<isFixed<real>::value, real>::SinCos(static_cast<real>(phase), &cos_sin[1], &cos_sin[0]);
1653  complex z(cos_sin[0], cos_sin[1]);
1654  z *= scale;
1655 #pragma unroll
1656  for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]);
 1657  } else { // static phase: +/- 1, so a real multiply over the 9 complex elements suffices
 1658 #pragma unroll
 1659  for (int i = 0; i < 9; i++) { out[i] *= phase; }
1660  }
1661  }
1662  };
1663 
1664  __host__ __device__ constexpr int ct_sqrt(int n, int i = 1)
1665  {
1666  return n == i ? n : (i * i < n ? ct_sqrt(n, i + 1) : i);
1667  }
1668 
1674  __host__ __device__ constexpr int Ncolor(int length) { return ct_sqrt(length / 2); }
1675 
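// Editorial sanity check: a length-18 real-valued link (a 3x3 complex matrix) has Ncolor 3.
static_assert(Ncolor(18) == 3, "Ncolor(18) should be 3");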
1676  // we default to huge allocations for gauge field (for now)
1677  constexpr bool default_huge_alloc = true;
1678 
1679  template <QudaStaggeredPhase phase> __host__ __device__ inline bool static_phase()
1680  {
1681  switch (phase) {
 1682  case QUDA_STAGGERED_PHASE_MILC:
 1683  case QUDA_STAGGERED_PHASE_CPS:
 1684  case QUDA_STAGGERED_PHASE_TIFR: return true;
1685  default: return false;
1686  }
1687  }
1688 
1689  template <typename Float, int length, int N, int reconLenParam,
1690  QudaStaggeredPhase stag_phase = QUDA_STAGGERED_PHASE_NO, bool huge_alloc = default_huge_alloc,
1691  QudaGhostExchange ghostExchange_ = QUDA_GHOST_EXCHANGE_INVALID, bool use_inphase = false>
1692  struct FloatNOrder {
 1693  using Accessor
 1694  = FloatNOrder<Float, length, N, reconLenParam, stag_phase, huge_alloc, ghostExchange_, use_inphase>;
 1695 
1696  using real = typename mapper<Float>::type;
 1697  using complex = complex<real>;
 1698  typedef typename VectorType<Float, N>::type Vector;
 1699  typedef typename AllocType<huge_alloc>::type AllocInt;
 1700  Reconstruct<reconLenParam, Float, ghostExchange_, stag_phase> reconstruct;
 1701  static const int reconLen = (reconLenParam == 11) ? 10 : reconLenParam;
1702  static const int hasPhase = (reconLen == 9 || reconLen == 13) ? 1 : 0;
1703  Float *gauge;
1704  const AllocInt offset;
1705 #ifdef USE_TEXTURE_OBJECTS
1706  typedef typename TexVectorType<real, N>::type TexVector;
1707  cudaTextureObject_t tex;
1708  const int tex_offset;
1709 #endif
1710  Float *ghost[4];
 1711  QudaGhostExchange ghostExchange;
 1712  int coords[QUDA_MAX_DIM];
 1713  int X[QUDA_MAX_DIM];
 1714  int R[QUDA_MAX_DIM];
 1715  const int volumeCB;
1716  int faceVolumeCB[4];
1717  const int stride;
1718  const int geometry;
1719  const AllocInt phaseOffset;
1720  void *backup_h;
1721  size_t bytes;
1722 
1723  FloatNOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0, bool override=false)
1724  : reconstruct(u), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
1725  offset(u.Bytes()/(2*sizeof(Float))),
1726 #ifdef USE_TEXTURE_OBJECTS
1727  tex(0), tex_offset(offset/N),
1728 #endif
1729  ghostExchange(u.GhostExchange()),
1730  volumeCB(u.VolumeCB()), stride(u.Stride()), geometry(u.Geometry()),
1731  phaseOffset(u.PhaseOffset()), backup_h(nullptr), bytes(u.Bytes())
1732  {
1733  if (geometry == QUDA_COARSE_GEOMETRY)
 1734  errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone)");
1735 
1736  // static_assert( !(stag_phase!=QUDA_STAGGERED_PHASE_NO && reconLenParam != 18 && reconLenParam != 12),
1737  // "staggered phase only presently supported for 18 and 12 reconstruct");
1738  for (int i = 0; i < 4; i++) {
1739  X[i] = u.X()[i];
1740  R[i] = u.R()[i];
1741  ghost[i] = ghost_ ? ghost_[i] : 0;
1742  faceVolumeCB[i] = u.SurfaceCB(i)*u.Nface(); // face volume equals surface * depth
1743  }
1744 #ifdef USE_TEXTURE_OBJECTS
1745  if (u.Location() == QUDA_CUDA_FIELD_LOCATION) tex = static_cast<const cudaGaugeField&>(u).Tex();
1746  if (!huge_alloc && this->gauge != u.Gauge_p() && !override) {
1747  errorQuda("Cannot use texture read since data pointer does not equal field pointer - use with huge_alloc=true instead");
1748  }
1749 #endif
1750  }
1751 
1752  FloatNOrder(const FloatNOrder &order)
1753  : reconstruct(order.reconstruct), gauge(order.gauge), offset(order.offset),
1754 #ifdef USE_TEXTURE_OBJECTS
1755  tex(order.tex), tex_offset(order.tex_offset),
1756 #endif
1757  ghostExchange(order.ghostExchange),
1758  volumeCB(order.volumeCB), stride(order.stride), geometry(order.geometry),
1759  phaseOffset(order.phaseOffset), backup_h(nullptr), bytes(order.bytes)
1760  {
1761  for (int i=0; i<4; i++) {
1762  X[i] = order.X[i];
1763  R[i] = order.R[i];
1764  ghost[i] = order.ghost[i];
1765  faceVolumeCB[i] = order.faceVolumeCB[i];
1766  }
1767  }
1768 
1769  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
1770  {
1771  const int M = reconLen / N;
1772  real tmp[reconLen];
1773 
1774 #pragma unroll
1775  for (int i=0; i<M; i++){
1776  // first do texture load from memory
1777 #if defined(USE_TEXTURE_OBJECTS) && defined(__CUDA_ARCH__)
1778  if (!huge_alloc) { // use textures unless we have a huge alloc
1779  TexVector vecTmp = tex1Dfetch_<TexVector>(tex, parity * tex_offset + (dir * M + i) * stride + x);
1780  // now insert into output array
1781 #pragma unroll
1782  for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<real *>(&vecTmp)[j]);
1783  } else
1784 #endif
1785  {
1786  // first load from memory
1787  Vector vecTmp = vector_load<Vector>(gauge + parity * offset, (dir * M + i) * stride + x);
1788  // second do copy converting into register type
1789 #pragma unroll
1790  for (int j=0; j<N; j++) copy(tmp[i*N+j], reinterpret_cast<Float*>(&vecTmp)[j]);
1791  }
1792  }
1793 
1794  real phase = 0.; // TODO - add texture support for phases
1795 
1796  if (hasPhase) {
1797  if (static_phase<stag_phase>() && (reconLen == 13 || use_inphase)) {
1798  phase = inphase;
1799  } else {
1800  copy(phase, (gauge + parity * offset)[phaseOffset / sizeof(Float) + stride * dir + x]);
1801  phase *= static_cast<real>(2.0) * static_cast<real>(M_PI);
1802  }
1803  }
1804 
1805  reconstruct.Unpack(v, tmp, x, dir, phase, X, R);
1806  }
1807 
1808  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
1809  {
1810  const int M = reconLen / N;
1811  real tmp[reconLen];
1812  reconstruct.Pack(tmp, v, x);
1813 
1814 #pragma unroll
1815  for (int i=0; i<M; i++){
1816  Vector vecTmp;
1817  // first do copy converting into storage type
1818 #pragma unroll
1819  for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], tmp[i*N+j]);
1820  // second do vectorized copy into memory
1821  vector_store(gauge + parity * offset, x + (dir * M + i) * stride, vecTmp);
1822  }
1823  if (hasPhase) {
1824  real phase = reconstruct.getPhase(v);
1825  copy((gauge + parity * offset)[phaseOffset / sizeof(Float) + dir * stride + x],
1826  static_cast<real>(phase / (2. * M_PI)));
1827  }
1828  }
1829 
1840  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity, real phase = 1.0)
1841  {
1842  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity, phase);
1843  }
1844 
1855  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity,
1856  real phase = 1.0) const
1857  {
1858  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity, phase);
1859  }
1860 
1861  __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
1862  {
1863  if (!ghost[dir]) { // load from main field not separate array
1864  load(v, volumeCB + x, dir, parity, inphase); // an offset of size volumeCB puts us at the padded region
1865  // This also works perfectly when phases are stored. No need to change this.
1866  } else {
1867  const int M = reconLen / N;
1868  real tmp[reconLen];
1869 
1870 #pragma unroll
1871  for (int i=0; i<M; i++) {
1872  // first do vectorized copy from memory into registers
1873  Vector vecTmp = vector_load<Vector>(
1874  ghost[dir] + parity * faceVolumeCB[dir] * (M * N + hasPhase), i * faceVolumeCB[dir] + x);
1875  // second do copy converting into register type
1876 #pragma unroll
1877  for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j]);
1878  }
1879  real phase = 0.;
1880 
1881  if (hasPhase) {
1882 
1883  // if(stag_phase == QUDA_STAGGERED_PHASE_MILC ) {
1884  // phase = inphase < static_cast<Float>(0) ? static_cast<Float>(-1./(2.*M_PI)) : static_cast<Float>(1./2.*M_PI);
1885  // } else {
1886  copy(phase, ghost[dir][parity * faceVolumeCB[dir] * (M * N + 1) + faceVolumeCB[dir] * M * N + x]);
1887  phase *= static_cast<real>(2.0) * static_cast<real>(M_PI);
1888  // }
1889  }
1890  reconstruct.Unpack(v, tmp, x, dir, phase, X, R);
1891  }
1892  }
1893 
1894  __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dir, int parity)
1895  {
1896  if (!ghost[dir]) { // store in main field not separate array
1897  save(v, volumeCB + x, dir, parity); // an offset of size volumeCB puts us at the padded region
1898  } else {
1899  const int M = reconLen / N;
1900  real tmp[reconLen];
1901  reconstruct.Pack(tmp, v, x);
1902 
1903 #pragma unroll
1904  for (int i=0; i<M; i++) {
1905  Vector vecTmp;
1906  // first do copy converting into storage type
1907 #pragma unroll
1908  for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], tmp[i*N+j]);
1909  // second do vectorized copy into memory
1910  vector_store(ghost[dir]+parity*faceVolumeCB[dir]*(M*N + hasPhase), i*faceVolumeCB[dir]+x, vecTmp);
1911  }
1912 
1913  if (hasPhase) {
1914  real phase = reconstruct.getPhase(v);
1915  copy(ghost[dir][parity * faceVolumeCB[dir] * (M * N + 1) + faceVolumeCB[dir] * M * N + x],
1916  static_cast<real>(phase / (2. * M_PI)));
1917  }
1918  }
1919  }
1920 
1931  __device__ __host__ inline gauge_ghost_wrapper<real, Accessor> Ghost(int dim, int ghost_idx, int parity,
1932  real phase = 1.0)
1933  {
1934  return gauge_ghost_wrapper<real, Accessor>(*this, dim, ghost_idx, parity, phase);
1935  }
1936 
1947  __device__ __host__ inline const gauge_ghost_wrapper<real, Accessor> Ghost(int dim, int ghost_idx, int parity,
1948  real phase = 1.0) const
1949  {
1950  return gauge_ghost_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, ghost_idx, parity, phase);
1951  }
1952 
1953  __device__ __host__ inline void loadGhostEx(complex v[length / 2], int buff_idx, int extended_idx, int dir,
1954  int dim, int g, int parity, const int R[]) const
1955  {
1956  const int M = reconLen / N;
1957  real tmp[reconLen];
1958 
1959 #pragma unroll
1960  for (int i=0; i<M; i++) {
1961  // first do vectorized copy from memory
1962  Vector vecTmp = vector_load<Vector>(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase),
1963  +i*R[dim]*faceVolumeCB[dim]+buff_idx);
1964  // second do copy converting into register type
1965 #pragma unroll
1966  for (int j=0; j<N; j++) copy(tmp[i*N+j], reinterpret_cast<Float*>(&vecTmp)[j]);
1967  }
1968  real phase = 0.;
1969  if (hasPhase)
1970  copy(phase,
1971  ghost[dim][((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + 1)
1972  + R[dim] * faceVolumeCB[dim] * M * N + buff_idx]);
1973 
1974  // use the extended_idx to determine the boundary condition
1975  reconstruct.Unpack(v, tmp, extended_idx, g, 2. * M_PI * phase, X, R);
1976  }
1977 
1978  __device__ __host__ inline void saveGhostEx(const complex v[length / 2], int buff_idx, int extended_idx, int dir,
1979  int dim, int g, int parity, const int R[])
1980  {
1981  const int M = reconLen / N;
1982  real tmp[reconLen];
1983  // use the extended_idx to determine the boundary condition
1984  reconstruct.Pack(tmp, v, extended_idx);
1985 
1986 #pragma unroll
1987  for (int i=0; i<M; i++) {
1988  Vector vecTmp;
1989  // first do copy converting into storage type
1990 #pragma unroll
1991  for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], tmp[i*N+j]);
1992  // second do vectorized copy to memory
1993  vector_store(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase),
1994  i*R[dim]*faceVolumeCB[dim]+buff_idx, vecTmp);
1995  }
1996  if (hasPhase) {
1997  real phase = reconstruct.getPhase(v);
1998  copy(ghost[dim][((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + 1)
1999  + R[dim] * faceVolumeCB[dim] * M * N + buff_idx],
2000  static_cast<real>(phase / (2. * M_PI)));
2001  }
2002  }
2003 
2007  void save() {
2008  if (backup_h) errorQuda("Already allocated host backup");
2009  backup_h = safe_malloc(bytes);
2010  cudaMemcpy(backup_h, gauge, bytes, cudaMemcpyDeviceToHost);
2011  checkCudaError();
2012  }
2013 
2017  void load() {
2018  cudaMemcpy(gauge, backup_h, bytes, cudaMemcpyHostToDevice);
2019  host_free(backup_h);
2020  backup_h = nullptr;
2021  checkCudaError();
2022  }
2023 
2024  size_t Bytes() const { return reconLen * sizeof(Float); }
2025  };
2026 
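A hedged usage sketch for the accessor above (the field u, the kernel name and the launch configuration are hypothetical, not part of this header): the accessor is constructed on the host from a GaugeField, passed to a kernel by value via its copy constructor, and load()/save() then unpack and repack one link per (site, direction, parity).

    // Sketch only: a single-precision, no-reconstruction (18-real) field in the native float2 layout,
    // i.e. Float = float, length = 18, N = 2, reconLenParam = 18.
    using Gauge = quda::gauge::FloatNOrder<float, 18, 2, 18>;

    __global__ void scale_links(Gauge g, int volumeCB, float factor)
    {
      int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
      int parity = blockIdx.y; // launched with gridDim.y = 2
      if (x_cb >= volumeCB) return;
      for (int dim = 0; dim < 4; dim++) {
        quda::complex<float> link[9];    // length/2 complex entries per link
        g.load(link, x_cb, dim, parity); // vectorized load + unpack
        for (int i = 0; i < 9; i++) link[i] *= factor;
        g.save(link, x_cb, dim, parity); // pack + vectorized store
      }
    }

    // Host side (u is a native, single-precision GaugeField):
    //   Gauge g(u);
    //   scale_links<<<dim3((u.VolumeCB() + 127) / 128, 2), 128>>>(g, u.VolumeCB(), 0.5f);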
2033  template <typename real, int length> struct S {
2034  real v[length];
2035  __host__ __device__ const real &operator[](int i) const { return v[i]; }
2036  __host__ __device__ real &operator[](int i) { return v[i]; }
2037  };
2038 
2043  template <typename Float, int length> struct LegacyOrder {
2045  using real = typename mapper<Float>::type;
2047  Float *ghost[QUDA_MAX_DIM];
2048  int faceVolumeCB[QUDA_MAX_DIM];
2049  const int volumeCB;
2050  const int stride;
2051  const int geometry;
2052  const int hasPhase;
2053 
2054  LegacyOrder(const GaugeField &u, Float **ghost_) :
2055  volumeCB(u.VolumeCB()),
2056  stride(u.Stride()),
2057  geometry(u.Geometry()),
2058  hasPhase(0)
2059  {
2060  if (geometry == QUDA_COARSE_GEOMETRY)
2061  errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone)");
2062 
2063  for (int i = 0; i < 4; i++) {
2064  ghost[i] = (ghost_) ? ghost_[i] : (Float *)(u.Ghost()[i]);
2065  faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth
2066  }
2067  }
2068 
2069  LegacyOrder(const LegacyOrder &order) :
2070  volumeCB(order.volumeCB),
2071  stride(order.stride),
2072  geometry(order.geometry),
2073  hasPhase(0)
2074  {
2075  for (int i = 0; i < 4; i++) {
2076  ghost[i] = order.ghost[i];
2077  faceVolumeCB[i] = order.faceVolumeCB[i];
2078  }
2079  }
2080 
2081  __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dir, int parity, real phase = 1.0) const
2082  {
2083 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2084  typedef S<Float, length> structure;
2085  trove::coalesced_ptr<structure> ghost_((structure *)ghost[dir]);
2086  structure v_ = ghost_[parity * faceVolumeCB[dir] + x];
2087 #else
2088  auto v_ = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length];
2089 #endif
2090  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2091  }
2092 
2093  __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dir, int parity)
2094  {
2095 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2096  typedef S<Float, length> structure;
2097  trove::coalesced_ptr<structure> ghost_((structure *)ghost[dir]);
2098  structure v_;
2099  for (int i = 0; i < length / 2; i++) {
2100  v_[2 * i + 0] = (Float)v[i].real();
2101  v_[2 * i + 1] = (Float)v[i].imag();
2102  }
2103  ghost_[parity * faceVolumeCB[dir] + x] = v_;
2104 #else
2105  auto v_ = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length];
2106  for (int i = 0; i < length / 2; i++) {
2107  v_[2 * i + 0] = (Float)v[i].real();
2108  v_[2 * i + 1] = (Float)v[i].imag();
2109  }
2110 #endif
2111  }
2112 
2123  __device__ __host__ inline gauge_ghost_wrapper<real, Accessor> Ghost(int dim, int ghost_idx, int parity,
2124  real phase = 1.0)
2125  {
2126  return gauge_ghost_wrapper<real, Accessor>(*this, dim, ghost_idx, parity, phase);
2127  }
2128 
2139  __device__ __host__ inline const gauge_ghost_wrapper<real, Accessor> Ghost(int dim, int ghost_idx, int parity,
2140  real phase = 1.0) const
2141  {
2142  return gauge_ghost_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, ghost_idx, parity, phase);
2143  }
2144 
2145  __device__ __host__ inline void loadGhostEx(complex v[length / 2], int x, int dummy, int dir, int dim, int g,
2146  int parity, const int R[]) const
2147  {
2148 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2149  typedef S<Float,length> structure;
2150  trove::coalesced_ptr<structure> ghost_((structure*)ghost[dim]);
2151  structure v_ = ghost_[((dir*2+parity)*R[dim]*faceVolumeCB[dim] + x)*geometry+g];
2152 #else
2153  auto v_ = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length];
2154 #endif
2155  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2156  }
2157 
2158  __device__ __host__ inline void saveGhostEx(const complex v[length / 2], int x, int dummy, int dir, int dim,
2159  int g, int parity, const int R[])
2160  {
2161 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2162  typedef S<Float, length> structure;
2163  trove::coalesced_ptr<structure> ghost_((structure *)ghost[dim]);
2164  structure v_;
2165  for (int i = 0; i < length / 2; i++) {
2166  v_[2 * i + 0] = (Float)v[i].real();
2167  v_[2 * i + 1] = (Float)v[i].imag();
2168  }
2169  ghost_[((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g] = v_;
2170 #else
2171  auto v_ = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length];
2172  for (int i = 0; i < length / 2; i++) {
2173  v_[2 * i + 0] = (Float)v[i].real();
2174  v_[2 * i + 1] = (Float)v[i].imag();
2175  }
2176 #endif
2177  }
2178  };
2179 
2184  template <typename Float, int length> struct QDPOrder : public LegacyOrder<Float,length> {
2186  using real = typename mapper<Float>::type;
2189  const int volumeCB;
2190  QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
2191  : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
2192  { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; }
2193  QDPOrder(const QDPOrder &order) : LegacyOrder<Float,length>(order), volumeCB(order.volumeCB) {
2194  for(int i=0; i<4; i++) gauge[i] = order.gauge[i];
2195  }
2196 
2197  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
2198  {
2199 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2200  typedef S<Float,length> structure;
2201  trove::coalesced_ptr<structure> gauge_((structure*)gauge[dir]);
2202  structure v_ = gauge_[parity*volumeCB + x];
2203 #else
2204  auto v_ = &gauge[dir][(parity * volumeCB + x) * length];
2205 #endif
2206  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2207  }
2208 
2209  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
2210  {
2211 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2212  typedef S<Float,length> structure;
2213  trove::coalesced_ptr<structure> gauge_((structure*)gauge[dir]);
2214  structure v_;
2215  for (int i = 0; i < length / 2; i++) {
2216  v_[2 * i + 0] = (Float)v[i].real();
2217  v_[2 * i + 1] = (Float)v[i].imag();
2218  }
2219  gauge_[parity * volumeCB + x] = v_;
2220 #else
2221  auto v_ = &gauge[dir][(parity * volumeCB + x) * length];
2222  for (int i = 0; i < length / 2; i++) {
2223  v_[2 * i + 0] = (Float)v[i].real();
2224  v_[2 * i + 1] = (Float)v[i].imag();
2225  }
2226 #endif
2227  }
2228 
2239  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2240  {
2241  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2242  }
2243 
2254  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2255  {
2256  return gauge_wrapper<real, QDPOrder<Float, length>>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2257  }
2258 
2259  size_t Bytes() const { return length * sizeof(Float); }
2260  };
2261 
2266  template <typename Float, int length> struct QDPJITOrder : public LegacyOrder<Float,length> {
2268  using real = typename mapper<Float>::type;
2271  const int volumeCB;
2272  QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
2273  : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
2274  { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; }
2275  QDPJITOrder(const QDPJITOrder &order) : LegacyOrder<Float,length>(order), volumeCB(order.volumeCB) {
2276  for(int i=0; i<4; i++) gauge[i] = order.gauge[i];
2277  }
2278 
2279  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
2280  {
2281  for (int i = 0; i < length / 2; i++) {
2282  v[i].real((real)gauge[dir][((0 * (length / 2) + i) * 2 + parity) * volumeCB + x]);
2283  v[i].imag((real)gauge[dir][((1 * (length / 2) + i) * 2 + parity) * volumeCB + x]);
2284  }
2285  }
2286 
2287  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
2288  {
2289  for (int i = 0; i < length / 2; i++) {
2290  gauge[dir][((0 * (length / 2) + i) * 2 + parity) * volumeCB + x] = v[i].real();
2291  gauge[dir][((1 * (length / 2) + i) * 2 + parity) * volumeCB + x] = v[i].imag();
2292  }
2293  }
2294 
2305  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2306  {
2307  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2308  }
2309 
2320  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2321  {
2322  return gauge_wrapper<real, QDPJITOrder<Float, length>>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2323  }
2324 
2325  size_t Bytes() const { return length * sizeof(Float); }
2326  };
2327 
2332  template <typename Float, int length> struct MILCOrder : public LegacyOrder<Float,length> {
2334  using real = typename mapper<Float>::type;
2336  Float *gauge;
2337  const int volumeCB;
2338  const int geometry;
2339  MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) :
2340  LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
2341  volumeCB(u.VolumeCB()), geometry(u.Geometry()) { ; }
2342  MILCOrder(const MILCOrder &order) : LegacyOrder<Float,length>(order),
2343  gauge(order.gauge), volumeCB(order.volumeCB), geometry(order.geometry)
2344  { ; }
2345 
2346  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
2347  {
2348 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2349  typedef S<Float,length> structure;
2350  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2351  structure v_ = gauge_[(parity*volumeCB+x)*geometry + dir];
2352 #else
2353  auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
2354 #endif
2355  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2356  }
2357 
2358  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
2359  {
2360 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2361  typedef S<Float,length> structure;
2362  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2363  structure v_;
2364  for (int i = 0; i < length / 2; i++) {
2365  v_[2 * i + 0] = v[i].real();
2366  v_[2 * i + 1] = v[i].imag();
2367  }
2368  gauge_[(parity*volumeCB+x)*geometry + dir] = v_;
2369 #else
2370  auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
2371  for (int i = 0; i < length / 2; i++) {
2372  v_[2 * i + 0] = v[i].real();
2373  v_[2 * i + 1] = v[i].imag();
2374  }
2375 #endif
2376  }
2377 
2388  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2389  {
2390  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2391  }
2392 
2403  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2404  {
2405  return gauge_wrapper<real, MILCOrder<Float, length>>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2406  }
2407 
2408  size_t Bytes() const { return length * sizeof(Float); }
2409  };
2410 
2426  template <typename Float, int length> struct MILCSiteOrder : public LegacyOrder<Float,length> {
2428  using real = typename mapper<Float>::type;
2430  Float *gauge;
2431  const int volumeCB;
2432  const int geometry;
2433  const size_t offset;
2434  const size_t size;
2435  MILCSiteOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2436  LegacyOrder<Float, length>(u, ghost_),
2437  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2438  volumeCB(u.VolumeCB()),
2439  geometry(u.Geometry()),
2440  offset(u.SiteOffset()),
2441  size(u.SiteSize())
2442  {
2443  if ((uintptr_t)((char *)gauge + offset) % 16 != 0) { errorQuda("MILC structure has misaligned offset"); }
2444  }
2445 
2447  LegacyOrder<Float, length>(order),
2448  gauge(order.gauge),
2449  volumeCB(order.volumeCB),
2450  geometry(order.geometry),
2451  offset(order.offset),
2452  size(order.size)
2453  {
2454  }
2455 
2456  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
2457  {
2458  // get base pointer
2459  const Float *gauge0 = reinterpret_cast<const Float*>(reinterpret_cast<const char*>(gauge) + (parity*volumeCB+x)*size + offset);
2460 
2461 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2462  typedef S<Float,length> structure;
2463  trove::coalesced_ptr<structure> gauge_((structure*)gauge0);
2464  structure v_ = gauge_[dir];
2465 #else
2466  auto v_ = &gauge0[dir * length];
2467 #endif
2468  for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
2469  }
2470 
2471  __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
2472  {
2473  // get base pointer
2474  Float *gauge0 = reinterpret_cast<Float*>(reinterpret_cast<char*>(gauge) + (parity*volumeCB+x)*size + offset);
2475 
2476 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2477  typedef S<Float,length> structure;
2478  trove::coalesced_ptr<structure> gauge_((structure*)gauge0);
2479  structure v_;
2480  for (int i = 0; i < length / 2; i++) {
2481  v_[2 * i + 0] = v[i].real();
2482  v_[2 * i + 1] = v[i].imag();
2483  }
2484  gauge_[dir] = v_;
2485 #else
2486  for (int i = 0; i < length / 2; i++) {
2487  gauge0[dir * length + 2 * i + 0] = v[i].real();
2488  gauge0[dir * length + 2 * i + 1] = v[i].imag();
2489  }
2490 #endif
2491  }
2492 
2503  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2504  {
2505  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2506  }
2507 
2518  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2519  {
2520  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2521  }
2522 
2523  size_t Bytes() const { return length * sizeof(Float); }
2524  };
2525 
2526 
2531  template <typename Float, int length> struct CPSOrder : LegacyOrder<Float,length> {
2533  using real = typename mapper<Float>::type;
2535  Float *gauge;
2536  const int volumeCB;
2539  static constexpr int Nc = 3;
2540  const int geometry;
2541  CPSOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2542  LegacyOrder<Float, length>(u, ghost_),
2543  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2544  volumeCB(u.VolumeCB()),
2545  anisotropy(u.Anisotropy()),
2546  anisotropy_inv(1.0 / anisotropy),
2547  geometry(u.Geometry())
2548  {
2549  if (length != 18) errorQuda("Gauge length %d not supported", length);
2550  }
2551  CPSOrder(const CPSOrder &order) :
2552  LegacyOrder<Float, length>(order),
2553  gauge(order.gauge),
2554  volumeCB(order.volumeCB),
2555  anisotropy(order.anisotropy),
2556  anisotropy_inv(order.anisotropy_inv),
2557  geometry(order.geometry)
2558  {
2559  ;
2560  }
2561 
2562  // we need to transpose and scale for CPS ordering
2563  __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float inphase = 1.0) const
2564  {
2565 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2566  typedef S<Float,length> structure;
2567  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2568  structure v_ = gauge_[((parity*volumeCB+x)*geometry + dir)];
2569 #else
2570  auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
2571 #endif
2572  for (int i=0; i<Nc; i++) {
2573  for (int j=0; j<Nc; j++) {
2574  v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]) * anisotropy_inv;
2575  }
2576  }
2577  }
2578 
2579  __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
2580  {
2581 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2582  typedef S<Float,length> structure;
2583  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2584  structure v_;
2585  for (int i=0; i<Nc; i++)
2586  for (int j = 0; j < Nc; j++) {
2587  v_[(j * Nc + i) * 2 + 0] = anisotropy * v[i * Nc + j].real();
2588  v_[(j * Nc + i) * 2 + 1] = anisotropy * v[i * Nc + j].imag();
2589  }
2590  gauge_[((parity*volumeCB+x)*geometry + dir)] = v_;
2591 #else
2592  auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
2593  for (int i=0; i<Nc; i++) {
2594  for (int j=0; j<Nc; j++) {
2595  v_[(j * Nc + i) * 2 + 0] = anisotropy * v[i * Nc + j].real();
2596  v_[(j * Nc + i) * 2 + 1] = anisotropy * v[i * Nc + j].imag();
2597  }
2598  }
2599 #endif
2600  }
2601 
2612  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2613  {
2614  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2615  }
2616 
2627  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2628  {
2629  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2630  }
2631 
2632  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
2633  };
2634 
2642  template <typename Float, int length> struct BQCDOrder : LegacyOrder<Float,length> {
2644  using real = typename mapper<Float>::type;
2646  Float *gauge;
2647  const int volumeCB;
2648  int exVolumeCB; // extended checkerboard volume
2649  static constexpr int Nc = 3;
2650  BQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2651  LegacyOrder<Float, length>(u, ghost_),
2652  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2653  volumeCB(u.VolumeCB())
2654  {
2655  if (length != 18) errorQuda("Gauge length %d not supported", length);
2656  // compute volumeCB + halo region
2657  exVolumeCB = u.X()[0]/2 + 2;
2658  for (int i=1; i<4; i++) exVolumeCB *= u.X()[i] + 2;
2659  }
2660  BQCDOrder(const BQCDOrder &order) :
2661  LegacyOrder<Float, length>(order),
2662  gauge(order.gauge),
2663  volumeCB(order.volumeCB),
2664  exVolumeCB(order.exVolumeCB)
2665  {
2666  if (length != 18) errorQuda("Gauge length %d not supported", length);
2667  }
2668 
2669  // we need to transpose for BQCD ordering
2670  __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, real inphase = 1.0) const
2671  {
2672 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2673  typedef S<Float, length> structure;
2674  trove::coalesced_ptr<structure> gauge_((structure *)gauge);
2675  structure v_ = gauge_[(dir * 2 + parity) * exVolumeCB + x];
2676 #else
2677  auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + x) * length];
2678 #endif
2679  for (int i = 0; i < Nc; i++) {
2680  for (int j = 0; j < Nc; j++) { v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]); }
2681  }
2682  }
2683 
2684  __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
2685  {
2686 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2687  typedef S<Float,length> structure;
2688  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2689  structure v_;
2690  for (int i=0; i<Nc; i++)
2691  for (int j = 0; j < Nc; j++) {
2692  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real();
2693  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag();
2694  }
2695  gauge_[(dir * 2 + parity) * exVolumeCB + x] = v_;
2696 #else
2697  auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + x) * length];
2698  for (int i = 0; i < Nc; i++) {
2699  for (int j = 0; j < Nc; j++) {
2700  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real();
2701  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag();
2702  }
2703  }
2704 #endif
2705  }
2706 
2717  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2718  {
2719  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2720  }
2721 
2732  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2733  {
2734  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2735  }
2736 
2737  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
2738  };
2739 
2744  template <typename Float, int length> struct TIFROrder : LegacyOrder<Float,length> {
2746  using real = typename mapper<Float>::type;
2748  Float *gauge;
2749  const int volumeCB;
2750  static constexpr int Nc = 3;
2751  const real scale;
2753  TIFROrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2754  LegacyOrder<Float, length>(u, ghost_),
2755  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2756  volumeCB(u.VolumeCB()),
2757  scale(u.Scale()),
2758  scale_inv(1.0 / scale)
2759  {
2760  if (length != 18) errorQuda("Gauge length %d not supported", length);
2761  }
2762  TIFROrder(const TIFROrder &order) :
2763  LegacyOrder<Float, length>(order),
2764  gauge(order.gauge),
2765  volumeCB(order.volumeCB),
2766  scale(order.scale),
2767  scale_inv(1.0 / scale)
2768  {
2769  if (length != 18) errorQuda("Gauge length %d not supported", length);
2770  }
2771 
2772  // we need to transpose for TIFR ordering
2773  __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, real inphase = 1.0) const
2774  {
2775 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2776  typedef S<Float, length> structure;
2777  trove::coalesced_ptr<structure> gauge_((structure *)gauge);
2778  structure v_ = gauge_[(dir * 2 + parity) * volumeCB + x];
2779 #else
2780  auto v_ = &gauge[((dir * 2 + parity) * volumeCB + x) * length];
2781 #endif
2782  for (int i = 0; i < Nc; i++) {
2783  for (int j = 0; j < Nc; j++) {
2784  v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]) * scale_inv;
2785  }
2786  }
2787  }
2788 
2789  __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
2790  {
2791 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2792  typedef S<Float,length> structure;
2793  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2794  structure v_;
2795  for (int i=0; i<Nc; i++)
2796  for (int j = 0; j < Nc; j++) {
2797  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
2798  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
2799  }
2800  gauge_[(dir * 2 + parity) * volumeCB + x] = v_;
2801 #else
2802  auto v_ = &gauge[((dir * 2 + parity) * volumeCB + x) * length];
2803  for (int i = 0; i < Nc; i++) {
2804  for (int j = 0; j < Nc; j++) {
2805  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
2806  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
2807  }
2808  }
2809 #endif
2810  }
2811 
2822  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2823  {
2824  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2825  }
2826 
2837  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2838  {
2839  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2840  }
2841 
2842  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
2843  };
2844 
2849  template <typename Float, int length> struct TIFRPaddedOrder : LegacyOrder<Float,length> {
2851  using real = typename mapper<Float>::type;
2853  Float *gauge;
2854  const int volumeCB;
2856  static constexpr int Nc = 3;
2857  const real scale;
2859  const int dim[4];
2860  const int exDim[4];
2861  TIFRPaddedOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
2862  LegacyOrder<Float, length>(u, ghost_),
2863  gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
2864  volumeCB(u.VolumeCB()),
2865  exVolumeCB(1),
2866  scale(u.Scale()),
2867  scale_inv(1.0 / scale),
2868  dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]},
2869  exDim {u.X()[0], u.X()[1], u.X()[2] + 4, u.X()[3]}
2870  {
2871  if (length != 18) errorQuda("Gauge length %d not supported", length);
2872 
2873  // exVolumeCB is the padded checkerboard volume
2874  for (int i=0; i<4; i++) exVolumeCB *= exDim[i];
2875  exVolumeCB /= 2;
2876  }
2877 
2879  LegacyOrder<Float, length>(order),
2880  gauge(order.gauge),
2881  volumeCB(order.volumeCB),
2882  exVolumeCB(order.exVolumeCB),
2883  scale(order.scale),
2884  scale_inv(order.scale_inv),
2885  dim {order.dim[0], order.dim[1], order.dim[2], order.dim[3]},
2886  exDim {order.exDim[0], order.exDim[1], order.exDim[2], order.exDim[3]}
2887  {
2888  if (length != 18) errorQuda("Gauge length %d not supported", length);
2889  }
2890 
2895  __device__ __host__ inline int getPaddedIndex(int x_cb, int parity) const {
2896  // find coordinates
2897  int coord[4];
2898  getCoords(coord, x_cb, dim, parity);
2899 
2900  // get z-extended index
2901  coord[2] += 2; // offset for halo
2902  return linkIndex(coord, exDim);
2903  }
2904 
2905  // we need to transpose for TIFR ordering
2906  __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, real inphase = 1.0) const
2907  {
2908  int y = getPaddedIndex(x, parity);
2909 
2910 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2911  typedef S<Float,length> structure;
2912  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2913  structure v_ = gauge_[(dir*2+parity)*exVolumeCB + y];
2914 #else
2915  auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + y) * length];
2916 #endif
2917  for (int i = 0; i < Nc; i++) {
2918  for (int j = 0; j < Nc; j++) {
2919  v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]) * scale_inv;
2920  }
2921  }
2922  }
2923 
2924  __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
2925  {
2926  int y = getPaddedIndex(x, parity);
2927 
2928 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
2929  typedef S<Float,length> structure;
2930  trove::coalesced_ptr<structure> gauge_((structure*)gauge);
2931  structure v_;
2932  for (int i=0; i<Nc; i++)
2933  for (int j = 0; j < Nc; j++) {
2934  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
2935  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
2936  }
2937  gauge_[(dir * 2 + parity) * exVolumeCB + y] = v_;
2938 #else
2939  auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + y) * length];
2940  for (int i = 0; i < Nc; i++) {
2941  for (int j = 0; j < Nc; j++) {
2942  v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
2943  v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
2944  }
2945  }
2946 #endif
2947  }
2948 
2959  __device__ __host__ inline gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity)
2960  {
2961  return gauge_wrapper<real, Accessor>(*this, dim, x_cb, parity);
2962  }
2963 
2974  __device__ __host__ inline const gauge_wrapper<real, Accessor> operator()(int dim, int x_cb, int parity) const
2975  {
2976  return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
2977  }
2978 
2979  size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
2980  };
2981 
2982  } // namespace gauge
2983 
2984  template <typename otherFloat, typename storeFloat>
2986  x = a.real();
2987  y = a.imag();
2988  }
2989 
2990  template <typename otherFloat, typename storeFloat>
2992  x = a.real();
2993  y = a.imag();
2994  }
2995 
2996  template <typename otherFloat, typename storeFloat>
2998  x = a.real();
2999  y = a.imag();
3000  }
3001 
3002  template <typename otherFloat, typename storeFloat>
3004  x = a.real();
3005  y = a.imag();
3006  }
3007 
3008  // Use traits to reduce the template explosion
3009  template <typename T, QudaReconstructType, int N = 18, QudaStaggeredPhase stag = QUDA_STAGGERED_PHASE_NO,
3010  bool huge_alloc = gauge::default_huge_alloc, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_INVALID,
3011  bool use_inphase = false>
3012  struct gauge_mapper {
3013  };
3014 
3015  // double precision
3016  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3017  struct gauge_mapper<double, QUDA_RECONSTRUCT_NO, N, stag, huge_alloc, ghostExchange, use_inphase> {
3019  };
3020  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3021  struct gauge_mapper<double, QUDA_RECONSTRUCT_13, N, stag, huge_alloc, ghostExchange, use_inphase> {
3023  };
3024  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3025  struct gauge_mapper<double, QUDA_RECONSTRUCT_12, N, stag, huge_alloc, ghostExchange, use_inphase> {
3027  };
3028  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3029  struct gauge_mapper<double, QUDA_RECONSTRUCT_10, N, stag, huge_alloc, ghostExchange, use_inphase> {
3031  };
3032  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3033  struct gauge_mapper<double, QUDA_RECONSTRUCT_9, N, stag, huge_alloc, ghostExchange, use_inphase> {
3035  };
3036  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3037  struct gauge_mapper<double, QUDA_RECONSTRUCT_8, N, stag, huge_alloc, ghostExchange, use_inphase> {
3039  };
3040 
3041  // single precision
3042  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3043  struct gauge_mapper<float, QUDA_RECONSTRUCT_NO, N, stag, huge_alloc, ghostExchange, use_inphase> {
3045  };
3046  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3047  struct gauge_mapper<float, QUDA_RECONSTRUCT_13, N, stag, huge_alloc, ghostExchange, use_inphase> {
3049  };
3050  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3051  struct gauge_mapper<float, QUDA_RECONSTRUCT_12, N, stag, huge_alloc, ghostExchange, use_inphase> {
3053  };
3054  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3055  struct gauge_mapper<float, QUDA_RECONSTRUCT_10, N, stag, huge_alloc, ghostExchange, use_inphase> {
3057  };
3058  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3059  struct gauge_mapper<float, QUDA_RECONSTRUCT_9, N, stag, huge_alloc, ghostExchange, use_inphase> {
3061  };
3062  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3063  struct gauge_mapper<float, QUDA_RECONSTRUCT_8, N, stag, huge_alloc, ghostExchange, use_inphase> {
3065  };
3066 
3067  // half precision
3068  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3069  struct gauge_mapper<short, QUDA_RECONSTRUCT_NO, N, stag, huge_alloc, ghostExchange, use_inphase> {
3071  };
3072  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3073  struct gauge_mapper<short, QUDA_RECONSTRUCT_13, N, stag, huge_alloc, ghostExchange, use_inphase> {
3075  };
3076  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3077  struct gauge_mapper<short, QUDA_RECONSTRUCT_12, N, stag, huge_alloc, ghostExchange, use_inphase> {
3079  };
3080  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3081  struct gauge_mapper<short, QUDA_RECONSTRUCT_10, N, stag, huge_alloc, ghostExchange, use_inphase> {
3083  };
3084  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3085  struct gauge_mapper<short, QUDA_RECONSTRUCT_9, N, stag, huge_alloc, ghostExchange, use_inphase> {
3087  };
3088  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3089  struct gauge_mapper<short, QUDA_RECONSTRUCT_8, N, stag, huge_alloc, ghostExchange, use_inphase> {
3091  };
3092 
3093  // quarter precision
3094  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3095  struct gauge_mapper<char, QUDA_RECONSTRUCT_NO, N, stag, huge_alloc, ghostExchange, use_inphase> {
3097  };
3098  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3099  struct gauge_mapper<char, QUDA_RECONSTRUCT_13, N, stag, huge_alloc, ghostExchange, use_inphase> {
3101  };
3102  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3103  struct gauge_mapper<char, QUDA_RECONSTRUCT_12, N, stag, huge_alloc, ghostExchange, use_inphase> {
3105  };
3106  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3107  struct gauge_mapper<char, QUDA_RECONSTRUCT_10, N, stag, huge_alloc, ghostExchange, use_inphase> {
3109  };
3110  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3111  struct gauge_mapper<char, QUDA_RECONSTRUCT_9, N, stag, huge_alloc, ghostExchange, use_inphase> {
3113  };
3114  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
3115  struct gauge_mapper<char, QUDA_RECONSTRUCT_8, N, stag, huge_alloc, ghostExchange, use_inphase> {
3117  };
3118 
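A brief sketch of how these traits are typically consumed (the wrapper function and its arguments are hypothetical): the storage type and reconstruction are compile-time template parameters, and gauge_mapper resolves them to the matching FloatNOrder instantiation.

    // Sketch only, assuming we are inside (or fully qualify) the quda namespace.
    template <typename Float, QudaReconstructType recon> void apply(const GaugeField &u)
    {
      using G = typename quda::gauge_mapper<Float, recon>::type;
      G g(u); // wraps u's data pointer with the selected layout and reconstruction
      // ... pass g to a kernel, as in the FloatNOrder sketch above ...
    }
    // e.g. apply<double, QUDA_RECONSTRUCT_12>(u); or apply<short, QUDA_RECONSTRUCT_8>(u);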
3119  template<typename T, QudaGaugeFieldOrder order, int Nc> struct gauge_order_mapper { };
3120  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_QDP_GAUGE_ORDER,Nc> { typedef gauge::QDPOrder<T, 2*Nc*Nc> type; };
3121  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_QDPJIT_GAUGE_ORDER,Nc> { typedef gauge::QDPJITOrder<T, 2*Nc*Nc> type; };
3122  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_MILC_GAUGE_ORDER,Nc> { typedef gauge::MILCOrder<T, 2*Nc*Nc> type; };
3123  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_BQCD_GAUGE_ORDER,Nc> { typedef gauge::BQCDOrder<T, 2*Nc*Nc> type; };
3124  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_TIFR_GAUGE_ORDER,Nc> { typedef gauge::TIFROrder<T, 2*Nc*Nc> type; };
3125  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_TIFR_PADDED_GAUGE_ORDER,Nc> { typedef gauge::TIFRPaddedOrder<T, 2*Nc*Nc> type; };
3126  template<typename T, int Nc> struct gauge_order_mapper<T,QUDA_FLOAT2_GAUGE_ORDER,Nc> { typedef gauge::FloatNOrder<T, 2*Nc*Nc, 2, 2*Nc*Nc> type; };
3127 
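Similarly, gauge_order_mapper maps a field-order enumerator (fixed at compile time) to the corresponding accessor; a minimal illustration, assuming <type_traits> is available:

    // MILC-ordered double-precision SU(3) links resolve to gauge::MILCOrder<double, 2*Nc*Nc>.
    using MilcAccessor = typename quda::gauge_order_mapper<double, QUDA_MILC_GAUGE_ORDER, 3>::type;
    static_assert(std::is_same<MilcAccessor, quda::gauge::MILCOrder<double, 18>>::value, "2*Nc*Nc = 18");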
3128  // experiments in reducing template instantiation boilerplate
3129  // can this be replaced with a C++11 variant that uses variadic templates?
3130 
3131 #define INSTANTIATE_RECONSTRUCT(func, g, ...) \
3132  { \
3133  if (!data.isNative()) \
3134  errorQuda("Field order %d and precision %d is not native", g.Order(), g.Precision()); \
3135  if( g.Reconstruct() == QUDA_RECONSTRUCT_NO) { \
3136  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge; \
3137  func(Gauge(g), g, __VA_ARGS__); \
3138  } else if( g.Reconstruct() == QUDA_RECONSTRUCT_13){ \
3139  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_13>::type Gauge; \
3140  func(Gauge(g), g, __VA_ARGS__); \
3141  } else if( g.Reconstruct() == QUDA_RECONSTRUCT_12){ \
3142  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge; \
3143  func(Gauge(g), g, __VA_ARGS__); \
3144  } else if( g.Reconstruct() == QUDA_RECONSTRUCT_9){ \
3145  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_9>::type Gauge; \
3146  func(Gauge(g), g, __VA_ARGS__); \
3147  } else if( g.Reconstruct() == QUDA_RECONSTRUCT_8){ \
3148  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge; \
3149  func(Gauge(g), g, __VA_ARGS__); \
3150  } else { \
3151  errorQuda("Reconstruction type %d of gauge field not supported", g.Reconstruct()); \
3152  } \
3153  }
3154 
3155 #define INSTANTIATE_PRECISION(func, lat, ...) \
3156  { \
3157  if (lat.Precision() == QUDA_DOUBLE_PRECISION) { \
3158  func<double>(lat, __VA_ARGS__); \
3159  } else if(lat.Precision() == QUDA_SINGLE_PRECISION) { \
3160  func<float>(lat, __VA_ARGS__); \
3161  } else { \
3162  errorQuda("Precision %d not supported", lat.Precision()); \
3163  } \
3164  }
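A hedged sketch of how the two dispatch macros compose (the computePlaq functions are illustrative names, not QUDA API): INSTANTIATE_PRECISION switches on the field's precision and calls a function templated on Float, whose body uses INSTANTIATE_RECONSTRUCT to switch on the reconstruction and hand the concrete accessor to the algorithm. Note that INSTANTIATE_RECONSTRUCT references a variable literally named data (data.isNative()), so the enclosing function's GaugeField parameter must carry that name.

    // Sketch only, assuming the quda namespace and a hypothetical plaquette driver.
    template <typename Gauge> void computePlaq(Gauge gauge, const GaugeField &g, double *plaq)
    {
      /* launch the kernel that consumes the accessor */
    }

    template <typename Float> void computePlaq(const GaugeField &data, double *plaq)
    {
      INSTANTIATE_RECONSTRUCT(computePlaq, data, plaq); // if/else chain over data.Reconstruct()
    }

    void computePlaq(const GaugeField &data, double *plaq)
    {
      INSTANTIATE_PRECISION(computePlaq, data, plaq); // calls computePlaq<double> or computePlaq<float>
    }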
3165 
3166 } // namespace quda
3167 
3168 #endif // _GAUGE_ORDER_H
__device__ __host__ void Unpack(complex out[9], const real in[10], int idx, int dir, real phase, const I *X, const int *R) const
gauge::FloatNOrder< double, N, 2, 9, stag, huge_alloc, ghostExchange, use_inphase > type
struct to define TIFR ordered gauge fields: [mu][parity][volumecb][col][row]
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__host__ __device__ constexpr int Ncolor(int length)
Return the number of colors of the accessor based on the length of the field.
__device__ __host__ complex< Float > operator-() const
negation operator
__host__ __device__ constexpr bool fixed_point< float, short >()
__device__ __host__ gauge_ghost_wrapper< real, Accessor > Ghost(int dim, int ghost_idx, int parity, real phase=1.0)
This accessor routine returns a gauge_ghost_wrapper to this object, allowing us to overload various o...
gauge_wrapper is an internal class that is used to wrap instances of gauge accessors, currying in a specific location on the field. The operator() accessors in gauge-field accessors return instances to this class, allowing us to then use operator overloading upon this class to interact with the Matrix class. As a result we can include gauge-field accessors directly in Matrix expressions in kernels without having to declare temporaries with explicit calls to the load/save methods in the gauge-field accessors.
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
Accessor(const Accessor< Float, nColor, QUDA_MILC_GAUGE_ORDER, storeFloat, use_tex > &a)
complex< Float > dummy
Float * ghost[QUDA_MAX_DIM]
gauge::FloatNOrder< short, N, 4, 9, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void Unpack(complex out[9], const real in[12], int idx, int dir, real phase, const I *X, const int *R) const
QDPJITOrder(const QDPJITOrder &order)
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
AllocType< huge_alloc >::type AllocInt
static __device__ __host__ int linkIndex(const int x[], const I X[4])
Reconstruct(const Reconstruct< N, Float, ghostExchange_ > &recon)
Reconstruct(const GaugeField &u, real scale=1.0)
TIFROrder(const TIFROrder &order)
MILCSiteOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__host__ __device__ ReduceType operator()(const quda::complex< short > &x)
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
gauge::FloatNOrder< char, N, 2, 11, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void operator=(const M &a)
Assignment operator with Matrix instance as input.
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
constexpr bool default_huge_alloc
Reconstruct(const Reconstruct< 9, Float, ghostExchange_, stag_phase > &recon)
__device__ __host__ fieldorder_wrapper(complex< storeFloat > *v, int idx, Float scale, Float scale_inv)
fieldorder_wrapper constructor
__device__ __host__ void Pack(real out[12], const complex in[9], int idx) const
fieldorder_wrapper is an internal class that is used to wrap instances of FieldOrder accessors...
Definition: complex_quda.h:32
__device__ __host__ void operator=(const M &a)
Assignment operator with Matrix instance as input.
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
gauge::FloatNOrder< float, N, 2, 11, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void save(const complex v[9], int x, int dir, int parity)
__host__ __device__ constexpr int ct_sqrt(int n, int i=1)
#define errorQuda(...)
Definition: util_quda.h:121
gauge::FloatNOrder< double, N, 2, 12, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void save(const complex v[9], int x, int dir, int parity)
GhostAccessor(const GhostAccessor< Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, native_ghost, storeFloat, use_tex > &a)
__device__ __host__ void load(complex v[9], int x, int dir, int parity, real inphase=1.0) const
void load()
Restore the field from the host after tuning.
Gauge reconstruct 12 helper where we reconstruct the third row from the cross product of the first tw...
#define host_free(ptr)
Definition: malloc_quda.h:71
typename mapper< Float >::type real
__host__ double abs_max(int dim=-1, bool global=true) const
Returns the Linfinity norm of the field in a given dimension.
__host__ __device__ complex< real > cmac(const complex< real > &x, const complex< real > &y, const complex< real > &z)
int comm_dim(int dim)
gauge::FloatNOrder< short, N, 2, 11, stag, huge_alloc, ghostExchange, use_inphase > type
gauge::FloatNOrder< double, N, 2, 11, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void load(complex v[9], int x, int dir, int parity, Float inphase=1.0) const
static std::map< void *, MemAlloc > alloc[N_ALLOC_TYPE]
Definition: malloc.cpp:53
__device__ __host__ T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM], T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound, bool isFirstTimeSlice, bool isLastTimeSlice, QudaGhostExchange ghostExchange=QUDA_GHOST_EXCHANGE_NO)
timeBoundary Compute boundary condition correction
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
square_(ReduceType scale)
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
double Scale() const
__host__ __device__ char real() const volatile
Definition: complex_quda.h:739
gauge::FloatNOrder< float, N, 4, 12, stag, huge_alloc, ghostExchange, use_inphase > type
Gauge reconstruct helper for Momentum field with 10 packed elements (really 9 from the Lie algebra...
gauge::FloatNOrder< float, N, 4, 13, stag, huge_alloc, ghostExchange, use_inphase > type
int comm_coord(int dim)
__host__ __device__ complex()
Definition: complex_quda.h:585
Reconstruct(const GaugeField &u)
typename mapper< Float >::type real
int_fastdiv X[QUDA_MAX_DIM]
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
__host__ __device__ void copy(T1 &a, const T2 &b)
LegacyOrder(const LegacyOrder &order)
Reconstruct< reconLenParam, Float, ghostExchange_, stag_phase > reconstruct
static int R[4]
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
double anisotropy
Definition: test_util.cpp:1650
const int * SurfaceCB() const
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex< theirFloat > &val) const
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:258
__device__ __host__ void operator-=(const complex< theirFloat > &a)
Operator-= with complex number instance as input.
Accessor< Float, nColor, order, storeFloat, use_tex > accessor
GhostAccessor(const GhostAccessor< Float, nColor, QUDA_MILC_GAUGE_ORDER, native_ghost, storeFloat, use_tex > &a)
int length[]
Accessor< Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat, use_tex > accessor
__device__ __host__ void operator=(const fieldorder_wrapper< Float, storeFloat > &a)
Assignment operator with fieldorder_wrapper instance as input.
gauge::FloatNOrder< double, N, 2, 13, stag, huge_alloc, ghostExchange, use_inphase > type
__host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const
__device__ __host__ void loadGhost(complex v[length/2], int x, int dir, int parity, real phase=1.0) const
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R, const complex scale=complex(static_cast< real >(1.0), static_cast< real >(1.0))) const
QDPOrder(const QDPOrder &order)
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
Accessor(const GaugeField &, void *gauge_=0, void **ghost_=0)
gauge::FloatNOrder< float, N, 2, N, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
GhostAccessor(const GhostAccessor< Float, nColor, QUDA_QDP_GAUGE_ORDER, native_ghost, storeFloat, use_tex > &a)
__device__ __host__ void operator=(const Matrix< U, N > &b)
Definition: quda_matrix.h:121
This is just a dummy structure we use for trove to define the required structure size.
__host__ __device__ const real & operator[](int i) const
const Reconstruct< 8, Float, ghostExchange_ > reconstruct_8
int Nface() const
Definition: gauge_field.h:281
Reconstruct(const Reconstruct< 12, Float, ghostExchange_ > &recon)
__device__ __host__ void operator+=(const complex< theirFloat > &a)
Operator+= with complex number instance as input.
__host__ __device__ constexpr bool match< short, short >()
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x_cb, int row, int col)
struct to define gauge fields packed into an opaque MILC site struct:
QudaGhostExchange ghostExchange
static __device__ double2 atomicAdd(double2 *addr, double2 val)
Implementation of double2 atomic addition using two double-precision additions.
Definition: atomic.cuh:51
FieldOrder(GaugeField &U, void *gauge_=0, void **ghost_=0)
int Ncolor() const
Definition: gauge_field.h:249
const int * R() const
BQCDOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
MILCOrder(const MILCOrder &order)
__device__ __host__ void saveGhost(const complex v[length/2], int x, int dir, int parity)
gauge::FloatNOrder< double, N, 2, 8, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ void Pack(real out[8], const complex in[9], int idx) const
gauge::FloatNOrder< char, N, 4, 8, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ int Ndim() const
void comm_allreduce_min(double *data)
Definition: comm_mpi.cpp:265
__host__ double abs_min(int dim=-1, bool global=true) const
Returns the minimum absolute value of the field.
QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ int NcolorCoarse() const
__device__ __host__ Float real() const
enum QudaStaggeredPhase_s QudaStaggeredPhase
gauge::FloatNOrder< short, N, 4, 13, stag, huge_alloc, ghostExchange, use_inphase > type
const int nColor
Definition: covdev_test.cpp:75
void resetScale(Float dummy)
MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
gauge::FloatNOrder< char, N, 4, 9, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void Pack(real out[10], const complex in[9], int idx) const
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
__device__ __host__ void Pack(real out[12], const complex in[9], int idx) const
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex< theirFloat > &val) const
__device__ __host__ void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R) const
__host__ __device__ int imag() const volatile
Definition: complex_quda.h:833
struct to define BQCD ordered gauge fields:
__host__ __device__ Float operator()(const quda::complex< char > &x)
__host__ __device__ int real() const volatile
Definition: complex_quda.h:832
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
cpuColorSpinorField * in
enum QudaGhostExchange_s QudaGhostExchange
__device__ __host__ void vector_store(void *ptr, int idx, const VectorType &value)
Generic reconstruction helper with no reconstruction.
__host__ __device__ bool static_phase()
MILCSiteOrder(const MILCSiteOrder &order)
__host__ __device__ constexpr bool fixed_point< float, char >()
__device__ __host__ void loadGhostEx(complex v[length/2], int buff_idx, int extended_idx, int dir, int dim, int g, int parity, const int R[]) const
gauge::FloatNOrder< double, N, 2, N, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ real getPhase(const complex in[9])
__device__ __host__ void save(const complex v[9], int x, int dir, int parity)
__device__ __host__ void saveGhostEx(const complex v[length/2], int buff_idx, int extended_idx, int dir, int dim, int g, int parity, const int R[])
void save()
Back up the field to the host when tuning.
__host__ __device__ short imag() const volatile
Definition: complex_quda.h:787
gauge_ghost_wrapper is an internal class that is used to wrap instances of gauge ghost accessors...
gauge::FloatNOrder< short, N, 2, N, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ real getPhase(const complex in[9])
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
typename mapper< Float >::type real
__host__ __device__ constexpr bool match< int, int >()
__host__ __device__ real & operator[](int i)
gauge::FloatNOrder< char, N, 4, 12, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ fieldorder_wrapper< Float, storeFloat > Ghost(int d, int parity, int x, int row, int col)
__device__ __host__ Float imag() const
CPSOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
Gauge reconstruct 13 helper where we reconstruct the third row from the cross product of the first tw...
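The truncated brief refers to the standard SU(3) trick of storing only the first two rows; a sketch of how the third row is rebuilt (illustrative, row-major 3x3 storage assumed; the actual code is the Unpack method of the Reconstruct<12/13> structs in this file):
  // out[0..5] hold the first two rows; rebuild row 2 as the conjugated cross product.
  for (int i = 0; i < 3; i++) {
    const int j = (i + 1) % 3, k = (i + 2) % 3;
    out[6 + i] = conj(out[j] * out[3 + k] - out[k] * out[3 + j]);
    // for reconstruct-13 a per-site U(1) phase is additionally applied to the result
  }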
__device__ __host__ real getPhase(const complex in[N/2]) const
Accessor(const Accessor< Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat, use_tex > &a)
const void ** Ghost() const
Definition: gauge_field.h:323
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
__device__ __host__ real getPhase(const complex in[9]) const
Float * gauge[QUDA_MAX_DIM]
Provides precision abstractions and defines the register precision given the storage precision using ...
__device__ __host__ real getPhase(const complex in[9])
int X[4]
Definition: covdev_test.cpp:70
GhostAccessor< Float, nColor, order, native_ghost, storeFloat, use_tex > ghostAccessor
__device__ __host__ void loadGhostEx(complex v[length/2], int x, int dummy, int dir, int dim, int g, int parity, const int R[]) const
__host__ __device__ constexpr bool fixed_point< float, int >()
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
TIFRPaddedOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex< theirFloat > &val) const
__device__ __host__ real getPhase(const complex in[9]) const
void init()
Create the cuBLAS context.
Definition: blas_cublas.cu:31
__device__ __host__ const gauge_ghost_wrapper< real, Accessor > Ghost(int dim, int ghost_idx, int parity, real phase=1.0) const
This accessor routine returns a const gauge_ghost_wrapper to this object, allowing us to overload var...
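As with the interior accessor, a usage sketch (illustrative names; arg.U is assumed to be an accessor providing Ghost()) shows the intent:
  // Load a halo link from the ghost buffer; the Matrix constructor invokes loadGhost() through the wrapper.
  Matrix<complex<real>, nColor> L = arg.U.Ghost(dim, ghost_idx, 1 - parity);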
__host__ __device__ short real() const volatile
Definition: complex_quda.h:786
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int dim, int parity, int x_cb, int row, int col)
gauge::FloatNOrder< short, N, 4, 8, stag, huge_alloc, ghostExchange, use_inphase > type
#define safe_malloc(size)
Definition: malloc_quda.h:66
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int row, int col) const
__host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const
__host__ __device__ Float operator()(const quda::complex< storeFloat > &x)
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity, real phase=1.0) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__host__ __device__ constexpr bool match()
__device__ __host__ void save(const complex v[9], int x, int dir, int parity)
__host__ double norm1(int dim=-1, bool global=true) const
Returns the L1 norm of the field in a given dimension.
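A hedged usage sketch for these host-side reductions (norm2 and abs_min also appear in this index), assuming F is a FieldOrder accessor constructed from a GaugeField:
  double l1   = F.norm1();     // L1 norm over all dimensions, reduced across ranks
  double l2   = F.norm2(3);    // L2 norm squared of the dim=3 links only
  double amin = F.abs_min();   // minimum absolute value of any element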
gauge::FloatNOrder< float, N, 4, 9, stag, huge_alloc, ghostExchange, use_inphase > type
__host__ __device__ Float operator()(const quda::complex< int > &x)
__host__ __device__ complex< real > cmul(const complex< real > &x, const complex< real > &y)
QudaFieldLocation Location() const
LegacyOrder(const GaugeField &u, Float **ghost_)
The LegacyOrder defines the ghost-zone storage and ordering used by all cpuGaugeFields, which share the same ghost-zone layout.
gauge::FloatNOrder< short, N, 4, 12, stag, huge_alloc, ghostExchange, use_inphase > type
FloatNOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0, bool override=false)
__device__ __host__ complex< Float > Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const
__host__ __device__ ReduceType operator()(const quda::complex< int > &x)
static int index(int ndim, const int *dims, const int *x)
Definition: comm_common.cpp:32
__device__ __host__ void Pack(real out[N], const complex in[N/2], int idx) const
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ const complex< Float > operator()(int dim, int parity, int x_cb, int row, int col) const
Float * gauge[QUDA_MAX_DIM]
__device__ __host__ void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R, const complex scale, const complex u) const
size_t bytes
host memory for backing up the field when tuning
Reconstruct(const Reconstruct< 8, Float, ghostExchange_ > &recon)
enum QudaFieldLocation_s QudaFieldLocation
__host__ __device__ volatile complex< float > & operator=(const complex< T > z) volatile
Definition: complex_quda.h:491
int faceVolumeCB[QUDA_MAX_DIM]
cpuColorSpinorField * out
BQCDOrder(const BQCDOrder &order)
const Reconstruct< 12, Float, ghostExchange_ > reconstruct_12
VectorType< Float, N >::type Vector
__host__ __device__ constexpr bool fixed_point()
__host__ __device__ char imag() const volatile
Definition: complex_quda.h:740
typename mapper< Float >::type real
enum QudaReconstructType_s QudaReconstructType
Reconstruct(const Reconstruct< 11, Float, ghostExchange_ > &recon)
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator+(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor addition operator.
__device__ __host__ void Unpack(complex out[N/2], const real in[N], int idx, int dir, real phase, const I *X, const int *R) const
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity, real phase=1.0)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
__device__ __host__ gauge_ghost_wrapper< real, Accessor > Ghost(int dim, int ghost_idx, int parity, real phase=1.0)
This accessor routine returns a gauge_ghost_wrapper to this object, allowing us to overload various o...
__device__ __host__ fieldorder_wrapper< Float, storeFloat > Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col)
Gauge reconstruct 9 helper where we reconstruct the gauge matrix from 8 packed elements (maximal comp...
const int_fastdiv geometry
#define QUDA_MAX_GEOMETRY
Maximum geometry supported by a field. This is essentially the maximum number of dimensions supported...
__device__ __host__ const complex< Float > operator()(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const
TIFROrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
typename mapper< Float >::type real
Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0, bool override=false)
__device__ __host__ int Volume() const
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__device__ __host__ complex< Float > Ghost(int d, int parity, int x, int row, int col) const
__device__ __host__ Matrix()
Definition: quda_matrix.h:76
__device__ __host__ int Geometry() const
void resetScale(double max)
__device__ __host__ void Unpack(complex out[9], const real in[12], int idx, int dir, real phase, const I *X, const int *R) const
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
__host__ double norm2(int dim=-1, bool global=true) const
Returns the L2 norm squared of the field in a given dimension.
TIFRPaddedOrder(const TIFRPaddedOrder &order)
QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ void load(complex v[9], int x, int dir, int parity, real inphase=1.0) const
__device__ __host__ int VolumeCB() const
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:250
Accessor(const Accessor< Float, nColor, QUDA_QDP_GAUGE_ORDER, storeFloat, use_tex > &a)
__host__ __device__ ValueType abs(ValueType x)
Definition: complex_quda.h:125
__host__ __device__ ReduceType operator()(const quda::complex< char > &x)
__device__ __host__ void saveGhost(const complex v[length/2], int x, int dir, int parity)
__device__ __host__ void load(complex v[9], int x, int dir, int parity, real inphase=1.0) const
GhostAccessor(const GaugeField &, void *gauge_=0, void **ghost_=0)
__device__ __host__ void operator=(const complex< theirFloat > &a)
Assignment operator with complex number instance as input.
__device__ __host__ Float milcStaggeredPhase(int dim, const int x[], const I R[])
virtual void * Gauge_p()
Definition: gauge_field.h:315
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator*(const S &a, const ColorSpinor< Float, Nc, Ns > &x)
Compute the scalar-vector product y = a * x.
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
static int volumeCB
Definition: face_gauge.cpp:43
__device__ __host__ int Ncolor() const
#define checkCudaError()
Definition: util_quda.h:161
CPSOrder(const CPSOrder &order)
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
void comm_allreduce(double *data)
Definition: comm_mpi.cpp:242
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
Definition: quda_matrix.h:422
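For reference, the 3x3 determinant this helper returns is the usual cofactor expansion (sketch only; the implementation lives in quda_matrix.h, and a(i,j) denotes element access):
  T det = a(0,0) * (a(1,1) * a(2,2) - a(1,2) * a(2,1))
        - a(0,1) * (a(1,0) * a(2,2) - a(1,2) * a(2,0))
        + a(0,2) * (a(1,0) * a(2,1) - a(1,1) * a(2,0));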
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
gauge::FloatNOrder< char, N, 2, N, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb)
void comm_allreduce_max(double *data)
Definition: comm_mpi.cpp:258
gauge::FloatNOrder< T, 2 *Nc *Nc, 2, 2 *Nc *Nc > type
static constexpr bool fixedPoint()
__device__ __host__ const complex< Float > operator()(int d, int parity, int x_cb, int row, int col) const
__host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const
__host__ __device__ Float operator()(const quda::complex< short > &x)
__device__ __host__ int getPaddedIndex(int x_cb, int parity) const
Compute the index into the padded field. Assumes that parity doesn't change from unpadded to padded...
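A generic sketch of such a remap (illustrative only; dims and exDim are assumed arrays of unpadded and padded dimensions, and the real TIFRPaddedOrder::getPaddedIndex encodes its own padding convention):
  int coord[4];
  getCoords(coord, x_cb, dims, parity);                  // unpack x_cb + parity into 4-d coordinates (helper assumed)
  int idx = ((coord[3] * exDim[2] + coord[2]) * exDim[1]
              + coord[1]) * exDim[0] + coord[0];         // re-linearize into the padded volume, x running fastest
  return idx / 2;                                        // back to a checkerboard index; parity is unchanged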
abs_(const Float scale)
__device__ __host__ void saveGhostEx(const complex v[length/2], int x, int dummy, int dir, int dim, int g, int parity, const int R[])
__device__ __host__ void Pack(real out[8], const complex in[9], int idx) const
FieldOrder(const FieldOrder &o)
__host__ __device__ volatile complex< double > & operator=(const complex< T > z) volatile
Definition: complex_quda.h:613
Reconstruct(const Reconstruct< 13, Float, ghostExchange_, stag_phase > &recon)
__host__ __device__ ReduceType operator()(const quda::complex< Float > &x)
__device__ __host__ int NspinCoarse() const
gauge::FloatNOrder< char, N, 4, 13, stag, huge_alloc, ghostExchange, use_inphase > type
FloatNOrder(const FloatNOrder &order)
__host__ __device__ complex()
Definition: complex_quda.h:463
const QudaFieldLocation location
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates for the current index.
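Typical use in a kernel body, mirroring the signature above (sketch; Arg is whatever argument struct the kernel carries):
  int coord[4];
  int dim = -1;                                             // passed by reference, per the declaration above
  const int x_cb = getCoords(coord, arg, idx, parity, dim); // fills coord[] from the 1-d thread index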
const int * X() const
__device__ __host__ void atomicAdd(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col, const complex< theirFloat > &val)
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
Gauge reconstruct 8 helper where we reconstruct the gauge matrix from 8 packed elements (maximal comp...
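As a reminder of why an 18-to-8 compression is possible at all (a counting argument, not code from this file): a general 3x3 complex matrix has 18 real numbers, unitarity (U†U = 1) imposes 9 real constraints, and det U = 1 removes one more, leaving 18 - 9 - 1 = 8 real parameters for SU(3). Roughly speaking, Pack keeps two matrix elements only as phases (via atan2) and the rest as real/imaginary pairs, while Unpack rebuilds the remaining entries from unitarity; the exact element layout is defined by Reconstruct<8>::Pack and Unpack in this file.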
gauge::FloatNOrder< float, N, 4, 8, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void loadGhost(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const