QUDA v1.1.0
A library for QCD on GPUs
color_spinor_field_order.h
1 #ifndef _COLOR_SPINOR_ORDER_H
2 #define _COLOR_SPINOR_ORDER_H
3 
15 #include <register_traits.h>
16 #include <convert.h>
17 #include <typeinfo>
18 #include <complex_quda.h>
19 #include <index_helper.cuh>
20 #include <color_spinor.h>
21 #include <color_spinor_field.h>
22 #include <trove_helper.cuh>
23 #include <transform_reduce.h>
24 
25 namespace quda {
26 
 39  template <typename Float, typename T>
 40  struct colorspinor_wrapper {
 41  T &field;
42  const int x_cb;
43  const int parity;
44 
51  __device__ __host__ inline colorspinor_wrapper<Float, T>(T &field, int x_cb, int parity) :
52  field(field),
53  x_cb(x_cb),
54  parity(parity)
55  {
56  }
57 
62  template <typename C> __device__ __host__ inline void operator=(const C &a) { field.save(a.data, x_cb, parity); }
63  };
64 
65  template <typename T, int Nc, int Ns>
66  template <typename S>
67  __device__ __host__ inline void ColorSpinor<T,Nc,Ns>::operator=(const colorspinor_wrapper<T,S> &a) {
68  a.field.load(data, a.x_cb, a.parity);
69  }
70 
71  template <typename T, int Nc, int Ns>
72  template <typename S>
73  __device__ __host__ inline ColorSpinor<T,Nc,Ns>::ColorSpinor(const colorspinor_wrapper<T,S> &a) {
74  a.field.load(data, a.x_cb, a.parity);
75  }
76 
77  template <typename T, int Nc>
78  template <typename S>
79  __device__ __host__ inline void ColorSpinor<T,Nc,2>::operator=(const colorspinor_wrapper<T,S> &a) {
80  a.field.load(data, a.x_cb, a.parity);
81  }
82 
83  template <typename T, int Nc>
84  template <typename S>
85  __device__ __host__ inline ColorSpinor<T,Nc,2>::ColorSpinor(const colorspinor_wrapper<T,S> &a) {
86  a.field.load(data, a.x_cb, a.parity);
87  }
88 
89  template <typename T, int Nc>
90  template <typename S>
91  __device__ __host__ inline void ColorSpinor<T,Nc,4>::operator=(const colorspinor_wrapper<T,S> &a) {
92  a.field.load(data, a.x_cb, a.parity);
93  }
94 
95  template <typename T, int Nc>
96  template <typename S>
97  __device__ __host__ inline ColorSpinor<T,Nc,4>::ColorSpinor(const colorspinor_wrapper<T,S> &a) {
98  a.field.load(data, a.x_cb, a.parity);
99  }
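 // [Editor's sketch, not part of the original header] The specializations above are what make the
 // wrapper pattern work: an accessor's operator() returns a colorspinor_wrapper, and assigning it
 // to/from a ColorSpinor triggers the load()/save() calls. Names F, a, x_cb, parity are hypothetical:
 //
 //   colorspinor::FloatNOrder<float, 4, 3, 4> F(a);   // accessor over ColorSpinorField a (defined below)
 //   ColorSpinor<float, 3, 4> chi = F(x_cb, parity);  // wrapper -> ColorSpinor: calls F.load(chi.data, ...)
 //   F(x_cb, parity) = chi;                           // ColorSpinor -> wrapper: calls F.save(chi.data, ...)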
100 
 113  template <typename Float, typename T>
 114  struct colorspinor_ghost_wrapper {
 115  const int dim;
116  const int dir;
117  const int ghost_idx;
118  const int parity;
119  T &field;
120 
129  __device__ __host__ inline colorspinor_ghost_wrapper<Float, T>(
130  T &field, int dim, int dir, int ghost_idx, int parity) :
131  field(field),
132  dim(dim),
 133  dir(dir),
 134  ghost_idx(ghost_idx),
 135  parity(parity)
136  {
137  }
138 
143  template<typename C>
144  __device__ __host__ inline void operator=(const C &a) {
145  field.saveGhost(a.data, ghost_idx, dim, dir, parity);
146  }
147  };
148 
149  template <typename T, int Nc, int Ns>
150  template <typename S>
151  __device__ __host__ inline void ColorSpinor<T,Nc,Ns>::operator=(const colorspinor_ghost_wrapper<T,S> &a) {
152  a.field.loadGhost(data, a.ghost_idx, a.dim, a.dir, a.parity);
153  }
154 
155  template <typename T, int Nc, int Ns>
 156  template <typename S>
 157  __device__ __host__ inline ColorSpinor<T,Nc,Ns>::ColorSpinor(const colorspinor_ghost_wrapper<T,S> &a) {
 158  a.field.loadGhost(data, a.ghost_idx, a.dim, a.dir, a.parity);
159  }
160 
161  template <typename T, int Nc>
162  template <typename S>
163  __device__ __host__ inline void ColorSpinor<T, Nc, 2>::operator=(const colorspinor_ghost_wrapper<T, S> &a)
164  {
165  a.field.loadGhost(data, a.ghost_idx, a.dim, a.dir, a.parity);
166  }
167 
168  template <typename T, int Nc>
169  template <typename S>
170  __device__ __host__ inline ColorSpinor<T, Nc, 2>::ColorSpinor(const colorspinor_ghost_wrapper<T, S> &a)
171  {
172  a.field.loadGhost(data, a.ghost_idx, a.dim, a.dir, a.parity);
173  }
174 
175  template <typename T, int Nc>
176  template <typename S>
177  __device__ __host__ inline void ColorSpinor<T,Nc,4>::operator=(const colorspinor_ghost_wrapper<T,S> &a) {
178  a.field.loadGhost(data, a.ghost_idx, a.dim, a.dir, a.parity);
179  }
180 
181  template <typename T, int Nc>
182  template <typename S>
183  __device__ __host__ inline ColorSpinor<T,Nc,4>::ColorSpinor(const colorspinor_ghost_wrapper<T,S> &a) {
184  a.field.loadGhost(data, a.ghost_idx, a.dim, a.dir, a.parity);
185  }
186 
187  namespace colorspinor {
188 
189  template<typename ReduceType, typename Float> struct square_ {
190  square_(ReduceType scale) { }
191  __host__ __device__ inline ReduceType operator()(const quda::complex<Float> &x)
192  { return static_cast<ReduceType>(norm(x)); }
193  };
194 
195  template<typename ReduceType> struct square_<ReduceType,short> {
196  const ReduceType scale;
197  square_(ReduceType scale) : scale(scale) { }
198  __host__ __device__ inline ReduceType operator()(const quda::complex<short> &x)
199  { return norm(scale * complex<ReduceType>(x.real(), x.imag())); }
200  };
201 
202  template <typename ReduceType> struct square_<ReduceType, int8_t> {
203  const ReduceType scale;
204  square_(ReduceType scale) : scale(scale) { }
205  __host__ __device__ inline ReduceType operator()(const quda::complex<int8_t> &x)
206  { return norm(scale * complex<ReduceType>(x.real(), x.imag())); }
207  };
208 
209  template<typename Float, typename storeFloat> struct abs_ {
210  abs_(const Float scale) { }
211  __host__ __device__ Float operator()(const quda::complex<storeFloat> &x) { return abs(x); }
212  };
213 
 214  template<typename Float> struct abs_<Float,short> {
 215  Float scale;
 216  abs_(const Float scale) : scale(scale) { }
217  __host__ __device__ Float operator()(const quda::complex<short> &x)
218  { return abs(scale * complex<Float>(x.real(), x.imag())); }
219  };
220 
 221  template <typename Float> struct abs_<Float, int8_t> {
 222  Float scale;
 223  abs_(const Float scale) : scale(scale) { }
224  __host__ __device__ Float operator()(const quda::complex<int8_t> &x)
225  { return abs(scale * complex<Float>(x.real(), x.imag())); }
226  };
227 
228  template <typename Float, int nSpin, int nColor, int nVec, QudaFieldOrder order> struct AccessorCB {
229  AccessorCB(const ColorSpinorField &) { errorQuda("Not implemented"); }
230  AccessorCB() { errorQuda("Not implemented"); }
231  __device__ __host__ inline int index(int parity, int x_cb, int s, int c, int v) const { return 0; }
232  };
233 
234  template<typename Float, int nSpin, int nColor, int nVec, QudaFieldOrder order> struct GhostAccessorCB {
235  GhostAccessorCB(const ColorSpinorField &) { errorQuda("Not implemented"); }
236  GhostAccessorCB() { errorQuda("Not implemented"); }
237  __device__ __host__ inline int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
238  { return 0; }
239  };
240 
 241  template <typename Float, int nSpin, int nColor, int nVec>
 242  struct AccessorCB<Float, nSpin, nColor, nVec, QUDA_SPACE_SPIN_COLOR_FIELD_ORDER> {
 243  const int offset_cb;
244  AccessorCB(const ColorSpinorField &field) : offset_cb((field.Bytes()>>1) / sizeof(complex<Float>)) { }
245  AccessorCB() : offset_cb(0) { }
246  __device__ __host__ inline int index(int parity, int x_cb, int s, int c, int v) const
247  {
248  return parity * offset_cb + ((x_cb * nSpin + s) * nColor + c) * nVec + v;
249  }
250 
259  __device__ __host__ inline int wrap_index(int parity, int x_cb, int s) const
260  {
261  return parity * offset_cb + (x_cb * nSpin + s) * nColor * nVec;
262  }
263  };
264 
 265  template<typename Float, int nSpin, int nColor, int nVec>
 266  struct GhostAccessorCB<Float, nSpin, nColor, nVec, QUDA_SPACE_SPIN_COLOR_FIELD_ORDER> {
 267  int faceVolumeCB[4];
268  int ghostOffset[4];
269  GhostAccessorCB(const ColorSpinorField &a, int nFace = 1) {
270  for (int d=0; d<4; d++) {
271  faceVolumeCB[d] = nFace*a.SurfaceCB(d);
272  ghostOffset[d] = faceVolumeCB[d]*nColor*nSpin*nVec;
273  }
274  }
275  GhostAccessorCB() : ghostOffset{ } { }
276  __device__ __host__ inline int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
277  { return parity*ghostOffset[dim] + ((x_cb*nSpin+s)*nColor+c)*nVec+v; }
278 
282  __device__ __host__ inline int wrap_index(int dim, int dir, int parity, int x_cb, int s) const
283  {
284  return parity * ghostOffset[dim] + (x_cb * nSpin + s) * nColor * nVec;
285  }
286  };
287 
288  template <int nSpin, int nColor, int nVec, int N> // note this will not work for N=1
289  __device__ __host__ inline int indexFloatN(int x_cb, int s, int c, int v, int stride)
290  {
291  int k = (s * nColor + c) * nVec + v;
292  int j = k / (N / 2);
293  int i = k % (N / 2);
294  return (j * stride + x_cb) * (N / 2) + i;
295  };
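 // [Editor's note, not part of the original header] Worked example of indexFloatN for the float4
 // layout (N=4, i.e. N/2=2 complex numbers per vector): with nSpin=4, nColor=3, nVec=1 and
 // s=1, c=2, v=0 we get k = (1*3+2)*1+0 = 5, j = 5/2 = 2, i = 5%2 = 1, so the complex element
 // sits at (2*stride + x_cb)*2 + 1, i.e. in the third float4 chunk of the site's data.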
296 
 297  template <typename Float, int nSpin, int nColor, int nVec>
 298  struct AccessorCB<Float, nSpin, nColor, nVec, QUDA_FLOAT2_FIELD_ORDER> {
 299  const int stride;
300  const int offset_cb;
301  AccessorCB(const ColorSpinorField &field) :
302  stride(field.Stride()),
303  offset_cb((field.Bytes() >> 1) / sizeof(complex<Float>))
304  {
305  }
306  AccessorCB() : stride(0), offset_cb(0) {}
307  __device__ __host__ inline int index(int parity, int x_cb, int s, int c, int v) const
308  {
309  return parity * offset_cb + ((s * nColor + c) * nVec + v) * stride + x_cb;
310  }
311 
312  template <int nSpinBlock>
313  __device__ __host__ inline void load(complex<Float> out[nSpinBlock * nColor * nVec], complex<Float> *in,
314  int parity, int x_cb, int chi) const
315  {
316  using vec_t = typename VectorType<Float, 2>::type;
317  constexpr int M = nSpinBlock * nColor * nVec;
318 #pragma unroll
319  for (int i = 0; i < M; i++) {
320  vec_t tmp = vector_load<vec_t>(reinterpret_cast<const vec_t *>(in + parity * offset_cb),
321  (chi * M + i) * stride + x_cb);
322  memcpy(&out[i], &tmp, sizeof(vec_t));
323  }
324  }
325  };
326 
 327  template<typename Float, int nSpin, int nColor, int nVec>
 328  struct GhostAccessorCB<Float, nSpin, nColor, nVec, QUDA_FLOAT2_FIELD_ORDER> {
 329  int faceVolumeCB[4];
330  int ghostOffset[4];
331  GhostAccessorCB(const ColorSpinorField &a, int nFace = 1) {
332  for (int d=0; d<4; d++) {
333  faceVolumeCB[d] = nFace*a.SurfaceCB(d);
334  ghostOffset[d] = faceVolumeCB[d]*nColor*nSpin*nVec;
335  }
336  }
337  GhostAccessorCB() : faceVolumeCB{ }, ghostOffset{ } { }
338  __device__ __host__ inline int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
339  { return parity*ghostOffset[dim] + ((s*nColor+c)*nVec+v)*faceVolumeCB[dim] + x_cb; }
340  };
341 
 342  template <typename Float, int nSpin, int nColor, int nVec>
 343  struct AccessorCB<Float, nSpin, nColor, nVec, QUDA_FLOAT4_FIELD_ORDER> {
 344  const int stride;
345  const int offset_cb;
346  AccessorCB(const ColorSpinorField &field) :
347  stride(field.Stride()),
348  offset_cb((field.Bytes() >> 1) / sizeof(complex<Float>))
349  {
350  }
351  AccessorCB() : stride(0), offset_cb(0) {}
352  __device__ __host__ inline int index(int parity, int x_cb, int s, int c, int v) const
353  {
354  return parity * offset_cb + indexFloatN<nSpin, nColor, nVec, 4>(x_cb, s, c, v, stride);
355  }
356 
357  template <int nSpinBlock>
358  __device__ __host__ inline void load(complex<Float> out[nSpinBlock * nColor * nVec], complex<Float> *in,
359  int parity, int x_cb, int chi) const
360  {
361  using vec_t = typename VectorType<Float, 4>::type;
362  constexpr int M = (nSpinBlock * nColor * nVec * 2) / 4;
363 #pragma unroll
364  for (int i = 0; i < M; i++) {
365  vec_t tmp = vector_load<vec_t>(reinterpret_cast<const vec_t *>(in + parity * offset_cb),
366  (chi * M + i) * stride + x_cb);
367  memcpy(&out[i * 2], &tmp, sizeof(vec_t));
368  }
369  }
370  };
371 
 372  template<typename Float, int nSpin, int nColor, int nVec>
 373  struct GhostAccessorCB<Float, nSpin, nColor, nVec, QUDA_FLOAT4_FIELD_ORDER> {
 374  int faceVolumeCB[4];
375  int ghostOffset[4];
376  GhostAccessorCB(const ColorSpinorField &a, int nFace = 1) {
377  for (int d = 0; d < 4; d++) {
378  faceVolumeCB[d] = nFace * a.SurfaceCB(d);
379  ghostOffset[d] = faceVolumeCB[d] * nColor * nSpin * nVec;
380  }
381  }
382  GhostAccessorCB() : faceVolumeCB {}, ghostOffset {} {}
383  __device__ __host__ inline int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
384  { return parity*ghostOffset[dim] + indexFloatN<nSpin,nColor,nVec,4>(x_cb, s, c, v, faceVolumeCB[dim]); }
385  };
386 
 387  template <typename Float, int nSpin, int nColor, int nVec>
 388  struct AccessorCB<Float, nSpin, nColor, nVec, QUDA_FLOAT8_FIELD_ORDER> {
 389  const int stride;
390  const int offset_cb;
391  AccessorCB(const ColorSpinorField &field) :
392  stride(field.Stride()),
393  offset_cb((field.Bytes() >> 1) / sizeof(complex<Float>))
394  {
395  }
396  AccessorCB() : stride(0), offset_cb(0) {}
397  __device__ __host__ inline int index(int parity, int x_cb, int s, int c, int v) const
398  {
399  return parity * offset_cb + indexFloatN<nSpin, nColor, nVec, 8>(x_cb, s, c, v, stride);
400  }
401 
402  template <int nSpinBlock>
403  __device__ __host__ inline void load(complex<Float> out[nSpinBlock * nColor * nVec], complex<Float> *in,
404  int parity, int x_cb, int chi) const
405  {
406  using vec_t = typename VectorType<Float, 8>::type;
407 
408  // in case the vector length isn't divisible by 8, load in the entire vector and then pick the chirality
409  // (the compiler will remove any unused loads)
410  constexpr int N = nSpin * nColor * nVec * 2; // real numbers in the loaded vector
411  constexpr int M = N / 8;
412  Float tmp[N];
413 #pragma unroll
414  for (int i = 0; i < M; i++) {
415  vec_t ld_tmp = vector_load<vec_t>(reinterpret_cast<const vec_t *>(in + parity * offset_cb), i * stride + x_cb);
416  memcpy(&tmp[i * 8], &ld_tmp, sizeof(vec_t));
417  }
418  constexpr int N_chi = N / (nSpin / nSpinBlock);
419 #pragma unroll
420  for (int i = 0; i < N_chi; i++)
421  out[i] = complex<Float>(tmp[chi * N_chi + 2 * i + 0], tmp[chi * N_chi + 2 * i + 1]);
422  }
423  };
424 
 425  template <typename Float, int nSpin, int nColor, int nVec>
 426  struct GhostAccessorCB<Float, nSpin, nColor, nVec, QUDA_FLOAT8_FIELD_ORDER> {
 427  int faceVolumeCB[4];
428  int ghostOffset[4];
429  GhostAccessorCB(const ColorSpinorField &a, int nFace = 1)
430  {
431  for (int d = 0; d < 4; d++) {
432  faceVolumeCB[d] = nFace * a.SurfaceCB(d);
433  ghostOffset[d] = faceVolumeCB[d] * nColor * nSpin * nVec;
434  }
435  }
436  GhostAccessorCB() : faceVolumeCB {}, ghostOffset {} {}
437  __device__ __host__ inline int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
438  {
439  return parity * ghostOffset[dim] + indexFloatN<nSpin, nColor, nVec, 8>(x_cb, s, c, v, faceVolumeCB[dim]);
440  }
441  };
442 
443  template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool fixed_point() { return false; }
444  template <> __host__ __device__ inline constexpr bool fixed_point<float, int8_t>() { return true; }
445  template<> __host__ __device__ inline constexpr bool fixed_point<float,short>() { return true; }
446  template<> __host__ __device__ inline constexpr bool fixed_point<float,int>() { return true; }
447 
448  template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool match() { return false; }
449  template <> __host__ __device__ inline constexpr bool match<int8_t, int8_t>() { return true; }
450  template<> __host__ __device__ inline constexpr bool match<int,int>() { return true; }
451  template<> __host__ __device__ inline constexpr bool match<short,short>() { return true; }
452 
 460  template <typename Float, typename storeFloat>
 461  struct fieldorder_wrapper {
 465  using type = Float;
 466  using store_type = storeFloat;
 467  complex<storeFloat> *v;
 468  const int idx;
 469  const Float scale;
 470  const Float scale_inv;
 471  static constexpr bool fixed = fixed_point<Float, storeFloat>();
472 
477  __device__ __host__ inline fieldorder_wrapper(complex<storeFloat> *v, int idx, Float scale, Float scale_inv) :
478  v(v),
479  idx(idx),
 480  scale(scale),
 481  scale_inv(scale_inv)
 482  {
483  }
484 
485  __device__ __host__ inline Float real() const {
486  if (!fixed) {
487  return v[idx].real();
488  } else {
489  return scale_inv*static_cast<Float>(v[idx].real());
490  }
491  }
492 
493  __device__ __host__ inline Float imag() const {
494  if (!fixed) {
495  return v[idx].imag();
496  } else {
497  return scale_inv*static_cast<Float>(v[idx].imag());
498  }
499  }
500 
501  __device__ __host__ inline void real(const Float &a) {
502  if (!fixed) {
503  v[idx].real(storeFloat(a));
504  } else { // we need to scale and then round
505  v[idx].real(storeFloat(round(scale * a)));
506  }
507  }
508  __device__ __host__ inline void imag(const Float &a) {
509  if (!fixed) {
510  v[idx].imag(storeFloat(a));
511  } else { // we need to scale and then round
512  v[idx].imag(storeFloat(round(scale * a)));
513  }
514  }
515 
519  __device__ __host__ inline auto data() { return &v[idx]; }
520 
521  __device__ __host__ inline const auto data() const { return &v[idx]; }
522 
527  __device__ __host__ inline complex<Float> operator-() const {
528  return fixed ? -scale_inv*static_cast<complex<Float> >(v[idx]) : -static_cast<complex<Float> >(v[idx]);
529  }
530 
535  __device__ __host__ inline void operator=(const fieldorder_wrapper<Float,storeFloat> &a) {
536  v[idx] = fixed ? complex<storeFloat>(round(scale * a.real()), round(scale * a.imag())) : a.v[a.idx];
537  }
538 
543  template<typename theirFloat>
544  __device__ __host__ inline void operator=(const complex<theirFloat> &a) {
545  if (match<storeFloat,theirFloat>()) {
546  v[idx] = complex<storeFloat>(a.x, a.y);
547  } else {
548  v[idx] = fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
549  }
550  }
551 
556  template<typename theirFloat>
557  __device__ __host__ inline void operator=(const theirFloat &a) { *this = complex<theirFloat>(a,static_cast<theirFloat>(0.0)); }
558 
563  template<typename theirFloat>
564  __device__ __host__ inline void operator+=(const complex<theirFloat> &a) {
565  if (match<storeFloat,theirFloat>()) {
566  v[idx] += complex<storeFloat>(a.x, a.y);
567  } else {
568  v[idx] += fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
569  }
570  }
571 
576  template<typename theirFloat>
577  __device__ __host__ inline void operator-=(const complex<theirFloat> &a) {
578  if (match<storeFloat,theirFloat>()) {
579  v[idx] -= complex<storeFloat>(a.x, a.y);
580  } else {
581  v[idx] -= fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
582  }
583  }
584 
585  };
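 // [Editor's note, not part of the original header] Round-trip example for a fixed-point wrapper,
 // assuming storeFloat=short and that the enclosing accessor set scale = 32767/max and
 // scale_inv = max/32767 (see resetScale below): writing a value a stores round(scale*a) as a short,
 // and reading returns scale_inv*stored, recovering a to within the fixed-point precision.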
586 
587  template <typename Float, int nSpin, int nColor, int nVec, QudaFieldOrder order, typename storeFloat = Float,
 588  typename ghostFloat = storeFloat, bool disable_ghost = false, bool block_float = false>
 589  class FieldOrderCB
 590  {
591  typedef float norm_type;
592 
593  public:
595  static constexpr bool supports_ghost_zone = true;
596 
 597  protected:
 598  complex<storeFloat> *v;
 599  const AccessorCB<storeFloat, nSpin, nColor, nVec, order> accessor;
 600  // since these variables are mutually exclusive, we use a union to minimize the accessor footprint
 601  union {
 602  norm_type *norm;
 603  Float scale;
 604  };
 605  union {
 606  Float scale_inv;
 607  int norm_offset;
 608  };
 609 #ifndef DISABLE_GHOST
 610  mutable complex<ghostFloat> *ghost[8];
 611  mutable norm_type *ghost_norm[8];
 612  mutable int x[QUDA_MAX_DIM];
 613  const int volumeCB;
 614  const int nDim;
 615  const QudaGammaBasis gammaBasis;
 616  const int siteSubset;
 617  const int nParity;
 618  const QudaFieldLocation location;
 619  const GhostAccessorCB<ghostFloat, nSpin, nColor, nVec, order> ghostAccessor;
 620  Float ghost_scale;
 621  Float ghost_scale_inv;
 622 #endif
623  static constexpr bool fixed = fixed_point<Float,storeFloat>();
624  static constexpr bool ghost_fixed = fixed_point<Float,ghostFloat>();
625  static constexpr bool block_float_ghost = !fixed && ghost_fixed;
626 
627  public:
632  FieldOrderCB(const ColorSpinorField &field, int nFace=1, void *v_=0, void **ghost_=0)
633  : v(v_? static_cast<complex<storeFloat>*>(const_cast<void*>(v_))
634  : static_cast<complex<storeFloat>*>(const_cast<void*>(field.V()))),
635  accessor(field), scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
636 #ifndef DISABLE_GHOST
637  , volumeCB(field.VolumeCB()), nDim(field.Ndim()), gammaBasis(field.GammaBasis()),
638  siteSubset(field.SiteSubset()), nParity(field.SiteSubset()),
639  location(field.Location()), ghostAccessor(field,nFace),
640  ghost_scale(static_cast<Float>(1.0)), ghost_scale_inv(static_cast<Float>(1.0))
641 #endif
642  {
643 #ifndef DISABLE_GHOST
644  for (int d=0; d<QUDA_MAX_DIM; d++) x[d]=field.X(d);
645  resetGhost(field, ghost_ ? ghost_ : field.Ghost());
646 #endif
647  resetScale(field.Scale());
648 
649 #ifdef DISABLE_GHOST
650  if (!disable_ghost) errorQuda("DISABLE_GHOST macro set but corresponding disable_ghost template not set");
651 #endif
652 
653  if (block_float) {
654  // only if we have block_float format do we set these (only block_orthogonalize.cu at present)
655  norm = static_cast<norm_type *>(const_cast<void *>(field.Norm()));
656  norm_offset = field.NormBytes() / (2 * sizeof(norm_type));
657  }
658  }
659 
660 #ifndef DISABLE_GHOST
661  void resetGhost(const ColorSpinorField &a, void * const *ghost_) const
662  {
663  for (int dim=0; dim<4; dim++) {
664  for (int dir=0; dir<2; dir++) {
665  ghost[2 * dim + dir] = static_cast<complex<ghostFloat> *>(ghost_[2 * dim + dir]);
666  ghost_norm[2 * dim + dir] = !block_float_ghost ?
667  nullptr :
668  reinterpret_cast<norm_type *>(static_cast<char *>(ghost_[2 * dim + dir])
669  + nParity * nColor * nSpin * nVec * 2 * ghostAccessor.faceVolumeCB[dim]
670  * sizeof(ghostFloat));
671  }
672  }
673  }
674 #endif
675 
676  void resetScale(Float max) {
677  if (fixed) {
678  scale = static_cast<Float>(std::numeric_limits<storeFloat>::max() / max);
679  scale_inv = static_cast<Float>(max / std::numeric_limits<storeFloat>::max());
680  }
681 #ifndef DISABLE_GHOST
682  if (ghost_fixed) {
683  if (block_float_ghost && max != static_cast<Float>(1.0))
684  errorQuda("Block-float accessor requires max=1.0 not max=%e\n", max);
685  ghost_scale = static_cast<Float>(std::numeric_limits<ghostFloat>::max() / max);
686  ghost_scale_inv = static_cast<Float>(max / std::numeric_limits<ghostFloat>::max());
687  }
688 #endif
689  }
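 // [Editor's note, not part of the original header] Example: for storeFloat=short and a field whose
 // elements are bounded by max=m, resetScale(m) gives scale = 32767/m and scale_inv = m/32767, so the
 // stored values span the full short range; the block-float ghost path instead requires max = 1.0 and
 // keeps the per-site magnitudes in ghost_norm.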
690 
700  template <int nSpinBlock>
701  __device__ __host__ inline void load(complex<Float> out[nSpinBlock * nColor * nVec], int parity, int x_cb,
702  int chi) const
703  {
704  if (!fixed) {
705  accessor.template load<nSpinBlock>((complex<storeFloat> *)out, v, parity, x_cb, chi);
706  } else {
707  complex<storeFloat> tmp[nSpinBlock * nColor * nVec];
708  accessor.template load<nSpinBlock>(tmp, v, parity, x_cb, chi);
709  Float norm_ = block_float ? norm[parity * norm_offset + x_cb] : scale_inv;
710  for (int s = 0; s < nSpinBlock; s++) {
711  for (int c = 0; c < nColor; c++) {
712  for (int v = 0; v < nVec; v++) {
713  int k = (s * nColor + c) * nVec + v;
714  out[k] = norm_ * complex<Float>(static_cast<Float>(tmp[k].real()), static_cast<Float>(tmp[k].imag()));
715  }
716  }
717  }
718  }
719  }
720 
730  __device__ __host__ inline const complex<Float> operator()(int parity, int x_cb, int s, int c, int n=0) const
731  {
732 #if (__CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 520)
733  if (!fixed) {
734  auto v_ = __ldg(v + accessor.index(parity, x_cb, s, c, n));
735  return complex<Float>(v_.x, v_.y);
736  } else {
737  auto v_ = __ldg(v + accessor.index(parity, x_cb, s, c, n));
738  complex<storeFloat> tmp(v_.x, v_.y);
739  Float norm_ = block_float ? __ldg(norm + parity * norm_offset + x_cb) : scale_inv;
740  return norm_ * complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
741  }
742 #else
743  if (!fixed) {
744  return complex<Float>( v[accessor.index(parity,x_cb,s,c,n)] );
745  } else {
746  complex<storeFloat> tmp = v[accessor.index(parity,x_cb,s,c,n)];
747  Float norm_ = block_float ? norm[parity*norm_offset+x_cb] : scale_inv;
748  return norm_*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
749  }
750 #endif
751  }
752 
762  __device__ __host__ inline fieldorder_wrapper<Float,storeFloat> operator()(int parity, int x_cb, int s, int c, int n=0)
763  { return fieldorder_wrapper<Float,storeFloat>(v, accessor.index(parity,x_cb,s,c,n), scale, scale_inv); }
764 
773  __device__ __host__ inline const auto wrap(int parity, int x_cb, int s) const
774  {
775  return fieldorder_wrapper<Float, storeFloat>(v, accessor.wrap_index(parity, x_cb, s), scale, scale_inv);
776  }
777 
781  __device__ __host__ inline auto wrap(int parity, int x_cb, int s)
782  {
783  return fieldorder_wrapper<Float, storeFloat>(v, accessor.wrap_index(parity, x_cb, s), scale, scale_inv);
784  }
785 
786 #ifndef DISABLE_GHOST
796  __device__ __host__ inline const complex<Float> Ghost(int dim, int dir, int parity, int x_cb, int s, int c, int n=0) const
797  {
798 #if (__CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 520)
799  if (!ghost_fixed) {
800  auto v_ = __ldg(ghost[2 * dim + dir] + ghostAccessor.index(dim, dir, parity, x_cb, s, c, n));
801  return complex<Float>(v_.x, v_.y);
 802  } else {
 803  Float scale = ghost_scale_inv;
 804  if (block_float_ghost)
805  scale *= __ldg(ghost_norm[2 * dim + dir] + parity * ghostAccessor.faceVolumeCB[dim] + x_cb);
806  auto v_ = __ldg(ghost[2 * dim + dir] + ghostAccessor.index(dim, dir, parity, x_cb, s, c, n));
807  complex<ghostFloat> tmp(v_.x, v_.y);
808  return scale*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
809  }
810 #else
811  if (!ghost_fixed) {
812  return complex<Float>( ghost[2*dim+dir][ghostAccessor.index(dim,dir,parity,x_cb,s,c,n)] );
 813  } else {
 814  Float scale = ghost_scale_inv;
 815  if (block_float_ghost) scale *= ghost_norm[2*dim+dir][parity*ghostAccessor.faceVolumeCB[dim] + x_cb];
816  complex<ghostFloat> tmp = ghost[2*dim+dir][ghostAccessor.index(dim,dir,parity,x_cb,s,c,n)];
817  return scale*complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
818  }
819 #endif
820  }
821 
832  __device__ __host__ inline fieldorder_wrapper<Float,ghostFloat> Ghost(int dim, int dir, int parity, int x_cb, int s, int c, int n=0, Float max=0)
833  {
834  if (block_float_ghost && s==0 && c==0 && n==0) ghost_norm[2*dim+dir][parity*ghostAccessor.faceVolumeCB[dim] + x_cb] = max;
 835  const int idx = ghostAccessor.index(dim,dir,parity,x_cb,s,c,n);
 836  return fieldorder_wrapper<Float, ghostFloat>(ghost[2 * dim + dir], idx,
 837  block_float_ghost ? ghost_scale / max : ghost_scale,
 838  block_float_ghost ? ghost_scale_inv * max : ghost_scale_inv);
 839
 840  }
841 
852  __device__ __host__ inline const auto wrap_ghost(int dim, int dir, int parity, int x_cb, int s) const
853  {
 854  const int idx = ghostAccessor.wrap_index(dim, dir, parity, x_cb, s);
 855  return fieldorder_wrapper<Float, ghostFloat>(ghost[2 * dim + dir], idx, ghost_scale, ghost_scale_inv);
 856  }
857 
861  __device__ __host__ inline auto wrap_ghost(int dim, int dir, int parity, int x_cb, int s)
862  {
 863  const int idx = ghostAccessor.wrap_index(dim, dir, parity, x_cb, s);
 864  return fieldorder_wrapper<Float, ghostFloat>(ghost[2 * dim + dir], idx, ghost_scale, ghost_scale_inv);
 865  }
866 
873  __device__ __host__ inline void LatticeIndex(int y[QUDA_MAX_DIM], int i) const {
874  if (siteSubset == QUDA_FULL_SITE_SUBSET) x[0] /= 2;
875 
876  for (int d=0; d<nDim; d++) {
877  y[d] = i % x[d];
878  i /= x[d];
879  }
880  int parity = i; // parity is the slowest running dimension
881 
 882  // convert into the full-field lattice coordinate
 883  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
 884  for (int d=1; d<nDim; d++) parity += y[d];
885  parity = parity & 1;
886  x[0] *= 2; // restore x[0]
887  }
888  y[0] = 2*y[0] + parity; // compute the full x coordinate
889  }
890 
896  __device__ __host__ inline void OffsetIndex(int &i, int y[QUDA_MAX_DIM]) const {
897  int parity = 0;
898  int savey0 = y[0];
 899
 900  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
 901  for (int d=0; d<nDim; d++) parity += y[d];
902  parity = parity & 1;
903  y[0] /= 2;
904  x[0] /= 2;
905  }
906 
907  i = parity;
908  for (int d=nDim-1; d>=0; d--) i = x[d]*i + y[d];
 909
 910  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
 911  //y[0] = 2*y[0] + parity;
912  y[0] = savey0;
913  x[0] *= 2; // restore x[0]
914  }
915  }
916 
918  __device__ __host__ inline int X(int d) const { return x[d]; }
919 
921  __device__ __host__ inline const int* X() const { return x; }
922 #endif
923 
925  __device__ __host__ inline int Ncolor() const { return nColor; }
926 
928  __device__ __host__ inline int Nspin() const { return nSpin; }
929 
931  __device__ __host__ inline int Nvec() const { return nVec; }
932 
933 #ifndef DISABLE_GHOST
935  __device__ __host__ inline int Nparity() const { return nParity; }
936 
938  __device__ __host__ inline int VolumeCB() const { return volumeCB; }
939 
941  __device__ __host__ inline int Ndim() const { return nDim; }
942 
944  __device__ __host__ inline QudaGammaBasis GammaBasis() const { return gammaBasis; }
945 
951  __host__ double norm2(bool global = true) const
952  {
 953  double nrm2 = ::quda::transform_reduce(location, v, nParity * volumeCB * nSpin * nColor * nVec,
 954  square_<double, storeFloat>(scale_inv), 0.0, plus<double>());
 955  if (global) comm_allreduce(&nrm2);
956  return nrm2;
957  }
958 
964  __host__ double abs_max(bool global = true) const
965  {
 966  double absmax = ::quda::transform_reduce(location, v, nParity * volumeCB * nSpin * nColor * nVec,
 967  abs_<Float, storeFloat>(scale_inv), Float(0.0), maximum<Float>());
 968  if (global) comm_allreduce_max(&absmax);
969  return absmax;
970  }
971 
972  size_t Bytes() const { return nParity * static_cast<size_t>(volumeCB) * nColor * nSpin * nVec * 2ll * sizeof(storeFloat); }
973 #endif
974  };
975 
987  template <typename Float, int Ns, int Nc, int N_, bool spin_project = false, bool huge_alloc = false>
988  struct FloatNOrder {
989  static_assert((2 * Ns * Nc) % N_ == 0, "Internal degrees of freedom not divisible by short-vector length");
990  static constexpr int length = 2 * Ns * Nc;
991  static constexpr int length_ghost = spin_project ? length / 2 : length;
992  static constexpr int N = N_;
993  static constexpr int M = length / N;
994  // if spin projecting, check that short vector length is compatible, if not halve the vector length
995  static constexpr int N_ghost = !spin_project ? N : (Ns * Nc) % N == 0 ? N : N / 2;
 996  static constexpr int M_ghost = length_ghost / N_ghost;
 997  using Accessor = FloatNOrder<Float, Ns, Nc, N, spin_project, huge_alloc>;
 998  using real = typename mapper<Float>::type;
 999  using complex = complex<real>;
 1000  using Vector = typename VectorType<Float, N>::type;
 1001  using GhostVector = typename VectorType<Float, N_ghost>::type;
 1002  using AllocInt = typename AllocType<huge_alloc>::type;
 1003  using norm_type = float;
 1004  Float *field;
 1005  norm_type *norm;
 1006  const AllocInt offset; // offset can be 32-bit or 64-bit
 1007  const AllocInt norm_offset;
 1008  int volumeCB;
 1009  int faceVolumeCB[4];
 1010  int stride;
1011  mutable Float *ghost[8];
1012  mutable norm_type *ghost_norm[8];
1013  int nParity;
1014  void *backup_h;
1015  size_t bytes;
1016 
1017  FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, norm_type *norm_ = 0,
1018  Float **ghost_ = 0, bool override = false) :
1019  field(field_ ? field_ : (Float *)a.V()),
1020  offset(a.Bytes() / (2 * sizeof(Float) * N)),
1021  norm(norm_ ? norm_ : (norm_type *)a.Norm()),
1022  norm_offset(a.NormBytes() / (2 * sizeof(norm_type))),
1023  volumeCB(a.VolumeCB()),
1024  stride(a.Stride()),
1025  nParity(a.SiteSubset()),
1026  backup_h(nullptr),
1027  bytes(a.Bytes())
1028  {
1029  for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; }
1030  resetGhost(a, ghost_ ? (void **)ghost_ : a.Ghost());
1031  }
1032 
1033  void resetGhost(const ColorSpinorField &a, void *const *ghost_) const
1034  {
1035  for (int dim = 0; dim < 4; dim++) {
1036  for (int dir = 0; dir < 2; dir++) {
1037  ghost[2 * dim + dir] = comm_dim_partitioned(dim) ? static_cast<Float *>(ghost_[2 * dim + dir]) : nullptr;
1038  ghost_norm[2 * dim + dir] = !comm_dim_partitioned(dim) ?
1039  nullptr :
1040  reinterpret_cast<norm_type *>(static_cast<char *>(ghost_[2 * dim + dir])
1041  + nParity * length_ghost * faceVolumeCB[dim] * sizeof(Float));
1042  }
1043  }
1044  }
1045 
1046  __device__ __host__ inline void load(complex out[length / 2], int x, int parity = 0) const
1047  {
1048  real v[length];
1049  norm_type nrm;
1050  if (isFixed<Float>::value) { nrm = vector_load<float>(norm, x + parity * norm_offset); }
1051 
1052 #pragma unroll
1053  for (int i=0; i<M; i++) {
1054  // first load from memory
1055  Vector vecTmp = vector_load<Vector>(field, parity * offset + x + stride * i);
1056  // now copy into output and scale
1057 #pragma unroll
1058  for (int j = 0; j < N; j++) copy_and_scale(v[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j], nrm);
1059  }
1060 
1061 #pragma unroll
1062  for (int i = 0; i < length / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]);
1063  }
1064 
1065  __device__ __host__ inline void save(const complex in[length / 2], int x, int parity = 0)
1066  {
1067  real v[length];
1068 
1069 #pragma unroll
1070  for (int i = 0; i < length / 2; i++) {
1071  v[2 * i + 0] = in[i].real();
1072  v[2 * i + 1] = in[i].imag();
1073  }
1074 
1075  if (isFixed<Float>::value) {
1076  norm_type max_[length / 2];
1077  // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued)
1078 #pragma unroll
1079  for (int i = 0; i < length / 2; i++) max_[i] = fmaxf(fabsf((norm_type)v[i]), fabsf((norm_type)v[i + length / 2]));
1080  norm_type scale = 0.0;
1081 #pragma unroll
1082  for (int i = 0; i < length / 2; i++) scale = fmaxf(max_[i], scale);
1083  norm[x+parity*norm_offset] = scale;
1084 
1085 #ifdef __CUDA_ARCH__
1086  real scale_inv = __fdividef(fixedMaxValue<Float>::value, scale);
1087 #else
1088  real scale_inv = fixedMaxValue<Float>::value / scale;
1089 #endif
1090 #pragma unroll
1091  for (int i = 0; i < length; i++) v[i] = v[i] * scale_inv;
1092  }
1093 
1094 #pragma unroll
1095  for (int i=0; i<M; i++) {
1096  Vector vecTmp;
1097  // first do scalar copy converting into storage type
1098 #pragma unroll
1099  for (int j = 0; j < N; j++) copy_scaled(reinterpret_cast<Float *>(&vecTmp)[j], v[i * N + j]);
1100  // second do vectorized copy into memory
1101  vector_store(field, parity * offset + x + stride * i, vecTmp);
1102  }
1103  }
1104 
1114  __device__ __host__ inline colorspinor_wrapper<real, Accessor> operator()(int x_cb, int parity)
1115  {
1116  return colorspinor_wrapper<real, Accessor>(*this, x_cb, parity);
1117  }
1118 
1128  __device__ __host__ inline const colorspinor_wrapper<real, Accessor> operator()(int x_cb, int parity) const
1129  {
1130  return colorspinor_wrapper<real, Accessor>(const_cast<Accessor &>(*this), x_cb, parity);
1131  }
1132 
1133  __device__ __host__ inline void loadGhost(complex out[length_ghost / 2], int x, int dim, int dir, int parity = 0) const
1134  {
1135  real v[length_ghost];
1136  norm_type nrm;
1137  if (isFixed<Float>::value) { nrm = vector_load<float>(ghost_norm[2 * dim + dir], parity * faceVolumeCB[dim] + x); }
1138 
1139 #pragma unroll
1140  for (int i = 0; i < M_ghost; i++) {
1141  GhostVector vecTmp = vector_load<GhostVector>(ghost[2 * dim + dir],
1142  parity * faceVolumeCB[dim] * M_ghost + i * faceVolumeCB[dim] + x);
1143 #pragma unroll
1144  for (int j = 0; j < N_ghost; j++) copy_and_scale(v[i * N_ghost + j], reinterpret_cast<Float *>(&vecTmp)[j], nrm);
1145  }
1146 
1147 #pragma unroll
1148  for (int i = 0; i < length_ghost / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]);
1149  }
1150 
1151  __device__ __host__ inline void saveGhost(const complex in[length_ghost / 2], int x, int dim, int dir,
1152  int parity = 0) const
1153  {
1154  real v[length_ghost];
1155 #pragma unroll
1156  for (int i = 0; i < length_ghost / 2; i++) {
1157  v[2 * i + 0] = in[i].real();
1158  v[2 * i + 1] = in[i].imag();
1159  }
1160 
1161  if (isFixed<Float>::value) {
1162  norm_type max_[length_ghost / 2];
1163  // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued)
1164 #pragma unroll
1165  for (int i = 0; i < length_ghost / 2; i++)
1166  max_[i] = fmaxf( (norm_type)fabsf( (norm_type)v[i] ),
1167  (norm_type)fabsf( (norm_type)v[i + length_ghost / 2] ) );
1168  norm_type scale = 0.0;
1169 #pragma unroll
1170  for (int i = 0; i < length_ghost / 2; i++) scale = fmaxf(max_[i], scale);
1171  ghost_norm[2 * dim + dir][parity * faceVolumeCB[dim] + x] = scale;
1172 
1173 #ifdef __CUDA_ARCH__
1174  real scale_inv = __fdividef(fixedMaxValue<Float>::value, scale);
1175 #else
1176  real scale_inv = fixedMaxValue<Float>::value / scale;
1177 #endif
1178 #pragma unroll
1179  for (int i = 0; i < length_ghost; i++) v[i] = v[i] * scale_inv;
1180  }
1181 
1182 #pragma unroll
1183  for (int i = 0; i < M_ghost; i++) {
1184  GhostVector vecTmp;
1185  // first do scalar copy converting into storage type
1186 #pragma unroll
1187  for (int j = 0; j < N_ghost; j++) copy_scaled(reinterpret_cast<Float *>(&vecTmp)[j], v[i * N_ghost + j]);
1188  // second do vectorized copy into memory
1189  vector_store(ghost[2 * dim + dir], parity * faceVolumeCB[dim] * M_ghost + i * faceVolumeCB[dim] + x, vecTmp);
1190  }
1191  }
1192 
1203  __device__ __host__ inline colorspinor_ghost_wrapper<real, Accessor> Ghost(int dim, int dir, int ghost_idx, int parity)
1204  {
1205  return colorspinor_ghost_wrapper<real, Accessor>(*this, dim, dir, ghost_idx, parity);
1206  }
1207 
1219  __device__ __host__ inline const colorspinor_ghost_wrapper<real, Accessor> Ghost(int dim, int dir, int ghost_idx,
1220  int parity) const
1221  {
1222  return colorspinor_ghost_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, dir, ghost_idx, parity);
1223  }
1224 
 1228  void save() {
 1229  if (backup_h) errorQuda("Already allocated host backup");
 1230  backup_h = safe_malloc(bytes);
 1231  qudaMemcpy(backup_h, field, bytes, cudaMemcpyDeviceToHost);
 1232  }
 1233
 1237  void load() {
 1238  qudaMemcpy(field, backup_h, bytes, cudaMemcpyHostToDevice);
 1239  host_free(backup_h);
 1240  backup_h = nullptr;
 1241  }
1242 
1243  size_t Bytes() const
1244  {
1245  return nParity * volumeCB * (Nc * Ns * 2 * sizeof(Float) + (isFixed<Float>::value ? sizeof(norm_type) : 0));
1246  }
1247  };
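 // [Editor's sketch, not part of the original header] Typical FloatNOrder use inside a kernel functor;
 // arg, out, in, x_cb and parity are hypothetical names:
 //
 //   using F = FloatNOrder<short, 4, 3, 8>;               // half precision, 4 spins, 3 colors, float8 layout
 //   F out(arg.out), in(arg.in);
 //   ColorSpinor<F::real, 3, 4> chi = in(x_cb, parity);   // load() rescales by the per-site norm
 //   out(x_cb, parity) = chi;                             // save() recomputes and stores the norm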
1248 
1255  template <typename real, int length> struct S { real v[length]; };
1256 
 1257  template <typename Float, int Ns, int Nc>
 1258  struct SpaceColorSpinorOrder {
 1259  using Accessor = SpaceColorSpinorOrder<Float, Ns, Nc>;
 1260  using real = typename mapper<Float>::type;
 1261  using complex = complex<real>;
 1262  static const int length = 2 * Ns * Nc;
 1263  Float *field;
 1264  size_t offset;
 1265  Float *ghost[8];
 1266  int volumeCB;
 1267  int faceVolumeCB[4];
 1268  int stride;
 1269  int nParity;
1270  SpaceColorSpinorOrder(const ColorSpinorField &a, int nFace=1, Float *field_=0, float *dummy=0, Float **ghost_=0)
1271  : field(field_ ? field_ : (Float*)a.V()), offset(a.Bytes()/(2*sizeof(Float))),
1272  volumeCB(a.VolumeCB()), stride(a.Stride()), nParity(a.SiteSubset())
1273  {
1274  if (volumeCB != stride) errorQuda("Stride must equal volume for this field order");
1275  for (int i=0; i<4; i++) {
1276  ghost[2*i] = ghost_ ? ghost_[2*i] : 0;
1277  ghost[2*i+1] = ghost_ ? ghost_[2*i+1] : 0;
1278  faceVolumeCB[i] = a.SurfaceCB(i)*nFace;
1279  }
1280  }
1281 
1282  __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const
1283  {
1284 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
1285  typedef S<Float,length> structure;
1286  trove::coalesced_ptr<structure> field_((structure*)field);
1287  structure v_ = field_[parity*volumeCB + x];
1288  for (int s=0; s<Ns; s++) {
1289  for (int c = 0; c < Nc; c++) { v[s * Nc + c] = complex(v_.v[(c * Ns + s) * 2 + 0], v_.v[(c * Ns + s) * 2 + 1]); }
1290  }
1291 #else
1292  for (int s=0; s<Ns; s++) {
1293  for (int c=0; c<Nc; c++) {
1294  v[s * Nc + c] = complex(field[parity * offset + ((x * Nc + c) * Ns + s) * 2 + 0],
1295  field[parity * offset + ((x * Nc + c) * Ns + s) * 2 + 1]);
1296  }
1297  }
1298 #endif
1299  }
1300 
1301  __device__ __host__ inline void save(const complex v[length / 2], int x, int parity = 0)
1302  {
1303 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
1304  typedef S<Float,length> structure;
1305  trove::coalesced_ptr<structure> field_((structure*)field);
1306  structure v_;
1307  for (int s=0; s<Ns; s++) {
1308  for (int c=0; c<Nc; c++) {
1309  v_.v[(c*Ns + s)*2 + 0] = (Float)v[s*Nc+c].real();
1310  v_.v[(c*Ns + s)*2 + 1] = (Float)v[s*Nc+c].imag();
1311  }
1312  }
1313  field_[parity*volumeCB + x] = v_;
1314 #else
1315  for (int s=0; s<Ns; s++) {
1316  for (int c=0; c<Nc; c++) {
1317  field[parity*offset + ((x*Nc + c)*Ns + s)*2 + 0] = v[s*Nc+c].real();
1318  field[parity*offset + ((x*Nc + c)*Ns + s)*2 + 1] = v[s*Nc+c].imag();
1319  }
1320  }
1321 #endif
1322  }
1323 
1333  __device__ __host__ inline colorspinor_wrapper<real, Accessor> operator()(int x_cb, int parity)
1334  {
1335  return colorspinor_wrapper<real, Accessor>(*this, x_cb, parity);
1336  }
1337 
1347  __device__ __host__ inline const colorspinor_wrapper<real, Accessor> operator()(int x_cb, int parity) const
1348  {
1349  return colorspinor_wrapper<real, Accessor>(const_cast<Accessor &>(*this), x_cb, parity);
1350  }
1351 
1352  __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dim, int dir, int parity = 0) const
1353  {
1354  for (int s=0; s<Ns; s++) {
1355  for (int c=0; c<Nc; c++) {
1356  v[s * Nc + c] = complex(ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Nc + c) * Ns + s) * 2 + 0],
1357  ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Nc + c) * Ns + s) * 2 + 1]);
1358  }
1359  }
1360  }
1361 
1362  __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dim, int dir, int parity = 0)
1363  {
1364  for (int s=0; s<Ns; s++) {
1365  for (int c=0; c<Nc; c++) {
1366  ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Nc + c) * Ns + s) * 2 + 0] = v[s * Nc + c].real();
1367  ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Nc + c) * Ns + s) * 2 + 1] = v[s * Nc + c].imag();
1368  }
1369  }
1370  }
1371 
1372  size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); }
1373  };
1374 
 1375  template <typename Float, int Ns, int Nc>
 1376  struct SpaceSpinorColorOrder {
 1377  using Accessor = SpaceSpinorColorOrder<Float, Ns, Nc>;
 1378  using real = typename mapper<Float>::type;
 1379  using complex = complex<real>;
 1380  static const int length = 2 * Ns * Nc;
 1381  Float *field;
 1382  size_t offset;
 1383  Float *ghost[8];
 1384  int volumeCB;
 1385  int faceVolumeCB[4];
 1386  int stride;
 1387  int nParity;
1388  SpaceSpinorColorOrder(const ColorSpinorField &a, int nFace=1, Float *field_=0, float *dummy=0, Float **ghost_=0)
1389  : field(field_ ? field_ : (Float*)a.V()), offset(a.Bytes()/(2*sizeof(Float))),
1390  volumeCB(a.VolumeCB()), stride(a.Stride()), nParity(a.SiteSubset())
1391  {
1392  if (volumeCB != stride) errorQuda("Stride must equal volume for this field order");
1393  for (int i=0; i<4; i++) {
1394  ghost[2*i] = ghost_ ? ghost_[2*i] : 0;
1395  ghost[2*i+1] = ghost_ ? ghost_[2*i+1] : 0;
1396  faceVolumeCB[i] = a.SurfaceCB(i)*nFace;
1397  }
1398  }
1399 
1400  __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const
1401  {
1402 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
1403  typedef S<Float,length> structure;
1404  trove::coalesced_ptr<structure> field_((structure*)field);
1405  structure v_ = field_[parity*volumeCB + x];
1406  for (int s=0; s<Ns; s++) {
1407  for (int c = 0; c < Nc; c++) { v[s * Nc + c] = complex(v_.v[(s * Nc + c) * 2 + 0], v_.v[(s * Nc + c) * 2 + 1]); }
1408  }
1409 #else
1410  for (int s=0; s<Ns; s++) {
1411  for (int c=0; c<Nc; c++) {
1412  v[s * Nc + c] = complex(field[parity * offset + ((x * Ns + s) * Nc + c) * 2 + 0],
1413  field[parity * offset + ((x * Ns + s) * Nc + c) * 2 + 1]);
1414  }
1415  }
1416 #endif
1417  }
1418 
1419  __device__ __host__ inline void save(const complex v[length / 2], int x, int parity = 0)
1420  {
1421 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
1422  typedef S<Float,length> structure;
1423  trove::coalesced_ptr<structure> field_((structure*)field);
1424  structure v_;
1425  for (int s=0; s<Ns; s++) {
1426  for (int c=0; c<Nc; c++) {
1427  v_.v[(s * Nc + c) * 2 + 0] = v[s * Nc + c].real();
1428  v_.v[(s * Nc + c) * 2 + 1] = v[s * Nc + c].imag();
1429  }
1430  }
1431  field_[parity*volumeCB + x] = v_;
1432 #else
1433  for (int s=0; s<Ns; s++) {
1434  for (int c=0; c<Nc; c++) {
1435  field[parity * offset + ((x * Ns + s) * Nc + c) * 2 + 0] = v[s * Nc + c].real();
1436  field[parity * offset + ((x * Ns + s) * Nc + c) * 2 + 1] = v[s * Nc + c].imag();
1437  }
1438  }
1439 #endif
1440  }
1441 
1451  __device__ __host__ inline colorspinor_wrapper<real, Accessor> operator()(int x_cb, int parity)
1452  {
1453  return colorspinor_wrapper<real, Accessor>(*this, x_cb, parity);
1454  }
1455 
1465  __device__ __host__ inline const colorspinor_wrapper<real, Accessor> operator()(int x_cb, int parity) const
1466  {
1467  return colorspinor_wrapper<real, Accessor>(const_cast<Accessor &>(*this), x_cb, parity);
1468  }
1469 
1470  __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dim, int dir, int parity = 0) const
1471  {
1472  for (int s=0; s<Ns; s++) {
1473  for (int c=0; c<Nc; c++) {
1474  v[s * Nc + c] = complex(ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 0],
1475  ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 1]);
1476  }
1477  }
1478  }
1479 
1480  __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dim, int dir, int parity = 0)
1481  {
1482  for (int s=0; s<Ns; s++) {
1483  for (int c=0; c<Nc; c++) {
1484  ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 0] = v[s * Nc + c].real();
1485  ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 1] = v[s * Nc + c].imag();
1486  }
1487  }
1488  }
1489 
1490  size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); }
1491  };
1492 
1493  // custom accessor for TIFR z-halo padded arrays
 1494  template <typename Float, int Ns, int Nc>
 1495  struct PaddedSpaceSpinorColorOrder {
 1496  using Accessor = PaddedSpaceSpinorColorOrder<Float, Ns, Nc>;
 1497  using real = typename mapper<Float>::type;
 1498  using complex = complex<real>;
 1499  static const int length = 2 * Ns * Nc;
 1500  Float *field;
 1501  size_t offset;
 1502  Float *ghost[8];
 1503  int volumeCB;
 1504  int exVolumeCB;
 1505  int faceVolumeCB[4];
 1506  int stride;
 1507  int nParity;
1508  int dim[4]; // full field dimensions
1509  int exDim[4]; // full field dimensions
1510  PaddedSpaceSpinorColorOrder(const ColorSpinorField &a, int nFace=1, Float *field_=0, float *dummy=0, Float **ghost_=0)
1511  : field(field_ ? field_ : (Float*)a.V()),
1512  volumeCB(a.VolumeCB()), exVolumeCB(1), stride(a.Stride()), nParity(a.SiteSubset()),
1513  dim{ a.X(0), a.X(1), a.X(2), a.X(3)}, exDim{ a.X(0), a.X(1), a.X(2) + 4, a.X(3)}
1514  {
1515  if (volumeCB != stride) errorQuda("Stride must equal volume for this field order");
1516  for (int i=0; i<4; i++) {
1517  ghost[2*i] = ghost_ ? ghost_[2*i] : 0;
1518  ghost[2*i+1] = ghost_ ? ghost_[2*i+1] : 0;
1519  faceVolumeCB[i] = a.SurfaceCB(i)*nFace;
1520  exVolumeCB *= exDim[i];
1521  }
1522  exVolumeCB /= nParity;
 1523  dim[0] *= (nParity == 1) ? 2 : 1; // need the full dimensions here
 1524  exDim[0] *= (nParity == 1) ? 2 : 1; // need the full dimensions here
1525 
1526  offset = exVolumeCB*Ns*Nc*2; // compute manually since Bytes is likely wrong due to z-padding
1527  }
1528 
1533  __device__ __host__ int getPaddedIndex(int x_cb, int parity) const {
1534  // find coordinates
1535  int coord[4];
1536  getCoords(coord, x_cb, dim, parity);
1537 
1538  // get z-extended index
1539  coord[2] += 2; // offset for halo
1540  return linkIndex(coord, exDim);
1541  }
1542 
1543  __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const
1544  {
1545  int y = getPaddedIndex(x, parity);
1546 
1547 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
1548  typedef S<Float,length> structure;
1549  trove::coalesced_ptr<structure> field_((structure*)field);
1550  structure v_ = field_[parity*exVolumeCB + y];
1551  for (int s=0; s<Ns; s++) {
1552  for (int c = 0; c < Nc; c++) { v[s * Nc + c] = complex(v_.v[(s * Nc + c) * 2 + 0], v_.v[(s * Nc + c) * 2 + 1]); }
1553  }
1554 #else
1555  for (int s=0; s<Ns; s++) {
1556  for (int c=0; c<Nc; c++) {
1557  v[s * Nc + c] = complex(field[parity * offset + ((y * Ns + s) * Nc + c) * 2 + 0],
1558  field[parity * offset + ((y * Ns + s) * Nc + c) * 2 + 1]);
1559  }
1560  }
1561 #endif
1562  }
1563 
1564  __device__ __host__ inline void save(const complex v[length / 2], int x, int parity = 0)
1565  {
1566  int y = getPaddedIndex(x, parity);
1567 
1568 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
1569  typedef S<Float,length> structure;
1570  trove::coalesced_ptr<structure> field_((structure*)field);
1571  structure v_;
1572  for (int s=0; s<Ns; s++) {
1573  for (int c=0; c<Nc; c++) {
1574  v_.v[(s * Nc + c) * 2 + 0] = v[s * Nc + c].real();
1575  v_.v[(s * Nc + c) * 2 + 1] = v[s * Nc + c].imag();
1576  }
1577  }
1578  field_[parity*exVolumeCB + y] = v_;
1579 #else
1580  for (int s=0; s<Ns; s++) {
1581  for (int c=0; c<Nc; c++) {
1582  field[parity * offset + ((y * Ns + s) * Nc + c) * 2 + 0] = v[s * Nc + c].real();
1583  field[parity * offset + ((y * Ns + s) * Nc + c) * 2 + 1] = v[s * Nc + c].imag();
1584  }
1585  }
1586 #endif
1587  }
1588 
1598  __device__ __host__ inline colorspinor_wrapper<real, Accessor> operator()(int x_cb, int parity)
1599  {
1600  return colorspinor_wrapper<real, Accessor>(*this, x_cb, parity);
1601  }
1602 
1612  __device__ __host__ inline const colorspinor_wrapper<real, Accessor> operator()(int x_cb, int parity) const
1613  {
1614  return colorspinor_wrapper<real, Accessor>(const_cast<Accessor &>(*this), x_cb, parity);
1615  }
1616 
1617  __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dim, int dir, int parity = 0) const
1618  {
1619  for (int s=0; s<Ns; s++) {
1620  for (int c=0; c<Nc; c++) {
1621  v[s * Nc + c] = complex(ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 0],
1622  ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 1]);
1623  }
1624  }
1625  }
1626 
1627  __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dim, int dir, int parity = 0)
1628  {
1629  for (int s=0; s<Ns; s++) {
1630  for (int c=0; c<Nc; c++) {
1631  ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 0] = v[s * Nc + c].real();
1632  ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 1] = v[s * Nc + c].imag();
1633  }
1634  }
1635  }
1636 
1637  size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); }
1638  };
1639 
1640 
 1641  template <typename Float, int Ns, int Nc>
 1642  struct QDPJITDiracOrder {
 1643  using Accessor = QDPJITDiracOrder<Float, Ns, Nc>;
 1644  using real = typename mapper<Float>::type;
 1645  using complex = complex<real>;
 1646  Float *field;
 1647  int volumeCB;
 1648  int stride;
 1649  int nParity;
1650  QDPJITDiracOrder(const ColorSpinorField &a, int nFace=1, Float *field_=0)
1651  : field(field_ ? field_ : (Float*)a.V()), volumeCB(a.VolumeCB()), stride(a.Stride()), nParity(a.SiteSubset())
1652  {
1653  if (volumeCB != stride) errorQuda("Stride must equal volume for this field order");
1654  }
1655 
1656  __device__ __host__ inline void load(complex v[Ns * Nc], int x, int parity = 0) const
1657  {
1658  for (int s=0; s<Ns; s++) {
1659  for (int c=0; c<Nc; c++) {
1660  v[s * Nc + c] = complex(field[(((0 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x],
1661  field[(((1 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x]);
1662  }
1663  }
1664  }
1665 
1666  __device__ __host__ inline void save(const complex v[Ns * Nc], int x, int parity = 0)
1667  {
1668  for (int s=0; s<Ns; s++) {
1669  for (int c=0; c<Nc; c++) {
1670  field[(((0 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x] = v[s * Nc + c].real();
1671  field[(((1 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x] = v[s * Nc + c].imag();
1672  }
1673  }
1674  }
1675 
1685  __device__ __host__ inline colorspinor_wrapper<real, Accessor> operator()(int x_cb, int parity)
1686  {
1687  return colorspinor_wrapper<real, Accessor>(*this, x_cb, parity);
1688  }
1689 
1699  __device__ __host__ inline const colorspinor_wrapper<real, Accessor> operator()(int x_cb, int parity) const
1700  {
1701  return colorspinor_wrapper<real, Accessor>(const_cast<Accessor &>(*this), x_cb, parity);
1702  }
1703 
1704  size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); }
1705  };
1706 
1707  } // namespace colorspinor
1708 
 1709  template <typename otherFloat, typename storeFloat>
 1710  __device__ __host__ inline void complex<double>::operator=(const colorspinor::fieldorder_wrapper<otherFloat,storeFloat> &a) {
 1711  x = a.real();
1712  y = a.imag();
1713  }
1714 
 1715  template <typename otherFloat, typename storeFloat>
 1716  __device__ __host__ inline void complex<float>::operator=(const colorspinor::fieldorder_wrapper<otherFloat,storeFloat> &a) {
 1717  x = a.real();
1718  y = a.imag();
1719  }
1720 
 1721  template <typename otherFloat, typename storeFloat>
 1722  __device__ __host__ inline complex<double>::complex(const colorspinor::fieldorder_wrapper<otherFloat,storeFloat> &a) {
 1723  x = a.real();
1724  y = a.imag();
1725  }
1726 
 1727  template <typename otherFloat, typename storeFloat>
 1728  __device__ __host__ inline complex<float>::complex(const colorspinor::fieldorder_wrapper<otherFloat,storeFloat> &a) {
 1729  x = a.real();
1730  y = a.imag();
1731  }
1732 
1733  // Use traits to reduce the template explosion
1734  template <typename T, int Ns, int Nc, bool project = false, bool huge_alloc = false> struct colorspinor_mapper {
1735  };
1736 
1737  // double precision
 1738  template <int Nc, bool huge_alloc> struct colorspinor_mapper<double, 4, Nc, false, huge_alloc> {
 1739  typedef colorspinor::FloatNOrder<double, 4, Nc, 2, false, huge_alloc> type;
 1740  };
 1741  template <int Nc, bool huge_alloc> struct colorspinor_mapper<double, 4, Nc, true, huge_alloc> {
 1742  typedef colorspinor::FloatNOrder<double, 4, Nc, 2, true, huge_alloc> type;
 1743  };
 1744  template <int Nc, bool huge_alloc> struct colorspinor_mapper<double, 2, Nc, false, huge_alloc> {
 1745  typedef colorspinor::FloatNOrder<double, 2, Nc, 2, false, huge_alloc> type;
 1746  };
 1747  template <int Nc, bool huge_alloc> struct colorspinor_mapper<double, 1, Nc, false, huge_alloc> {
 1748  typedef colorspinor::FloatNOrder<double, 1, Nc, 2, false, huge_alloc> type;
 1749  };
1750 
1751  // single precision
 1752  template <int Nc, bool huge_alloc> struct colorspinor_mapper<float, 4, Nc, false, huge_alloc> {
 1753  typedef colorspinor::FloatNOrder<float, 4, Nc, 4, false, huge_alloc> type;
 1754  };
 1755  template <int Nc, bool huge_alloc> struct colorspinor_mapper<float, 4, Nc, true, huge_alloc> {
 1756  typedef colorspinor::FloatNOrder<float, 4, Nc, 4, true, huge_alloc> type;
 1757  };
 1758  template <int Nc, bool huge_alloc> struct colorspinor_mapper<float, 2, Nc, false, huge_alloc> {
 1759  typedef colorspinor::FloatNOrder<float, 2, Nc, 2, false, huge_alloc> type;
 1760  };
 1761  template <int Nc, bool huge_alloc> struct colorspinor_mapper<float, 1, Nc, false, huge_alloc> {
 1762  typedef colorspinor::FloatNOrder<float, 1, Nc, 2, false, huge_alloc> type;
 1763  };
1764 
1765 #ifdef FLOAT8
1766 #define N8 8
1767 #else
1768 #define N8 4
1769 #endif
1770 
1771  // half precision
 1772  template <int Nc, bool huge_alloc> struct colorspinor_mapper<short, 4, Nc, false, huge_alloc> {
 1773  typedef colorspinor::FloatNOrder<short, 4, Nc, N8, false, huge_alloc> type;
 1774  };
 1775  template <int Nc, bool huge_alloc> struct colorspinor_mapper<short, 4, Nc, true, huge_alloc> {
 1776  typedef colorspinor::FloatNOrder<short, 4, Nc, N8, true, huge_alloc> type;
 1777  };
 1778  template <int Nc, bool huge_alloc> struct colorspinor_mapper<short, 2, Nc, false, huge_alloc> {
 1779  typedef colorspinor::FloatNOrder<short, 2, Nc, 2, false, huge_alloc> type;
 1780  };
 1781  template <int Nc, bool huge_alloc> struct colorspinor_mapper<short, 1, Nc, false, huge_alloc> {
 1782  typedef colorspinor::FloatNOrder<short, 1, Nc, 2, false, huge_alloc> type;
 1783  };
1784 
1785  // quarter precision
 1786  template <int Nc, bool huge_alloc> struct colorspinor_mapper<int8_t, 4, Nc, false, huge_alloc> {
 1787  typedef colorspinor::FloatNOrder<int8_t, 4, Nc, N8, false, huge_alloc> type;
 1788  };
 1789  template <int Nc, bool huge_alloc> struct colorspinor_mapper<int8_t, 4, Nc, true, huge_alloc> {
 1790  typedef colorspinor::FloatNOrder<int8_t, 4, Nc, N8, true, huge_alloc> type;
 1791  };
 1792  template <int Nc, bool huge_alloc> struct colorspinor_mapper<int8_t, 2, Nc, false, huge_alloc> {
 1793  typedef colorspinor::FloatNOrder<int8_t, 2, Nc, 2, false, huge_alloc> type;
 1794  };
 1795  template <int Nc, bool huge_alloc> struct colorspinor_mapper<int8_t, 1, Nc, false, huge_alloc> {
 1796  typedef colorspinor::FloatNOrder<int8_t, 1, Nc, 2, false, huge_alloc> type;
 1797  };
1798 
1799 #undef N8
1800 
 1801  template<typename T, QudaFieldOrder order, int Ns, int Nc> struct colorspinor_order_mapper { };
 1802  template<typename T, int Ns, int Nc> struct colorspinor_order_mapper<T,QUDA_SPACE_COLOR_SPIN_FIELD_ORDER,Ns,Nc> { typedef colorspinor::SpaceColorSpinorOrder<T, Ns, Nc> type; };
 1803  template<typename T, int Ns, int Nc> struct colorspinor_order_mapper<T,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER,Ns,Nc> { typedef colorspinor::SpaceSpinorColorOrder<T, Ns, Nc> type; };
 1804  template<typename T, int Ns, int Nc> struct colorspinor_order_mapper<T,QUDA_FLOAT2_FIELD_ORDER,Ns,Nc> { typedef colorspinor::FloatNOrder<T, Ns, Nc, 2> type; };
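 // [Editor's sketch, not part of the original header] The mapper traits above are the usual way a
 // kernel selects its accessor type from the precision/spin/color template parameters; "field" below
 // is a hypothetical ColorSpinorField:
 //
 //   using F = typename colorspinor_mapper<short, 4, 3>::type;   // FloatNOrder<short, 4, 3, 8> with FLOAT8, else <..., 4>
 //   F accessor(field);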
1805 
1806 } // namespace quda
1807 
1808 #endif // _COLOR_SPINOR_ORDER_H
void * Ghost(const int i)
const int * X() const
const int * SurfaceCB() const
double Scale() const
__device__ __host__ int Nparity() const
FieldOrderCB(const ColorSpinorField &field, int nFace=1, void *v_=0, void **ghost_=0)
__device__ __host__ const auto wrap_ghost(int dim, int dir, int parity, int x_cb, int s) const
This and the following method (eventually) creates a fieldorder_wrapper object whose pointer points t...
__host__ double norm2(bool global=true) const
__device__ __host__ const complex< Float > Ghost(int dim, int dir, int parity, int x_cb, int s, int c, int n=0) const
__device__ __host__ fieldorder_wrapper< Float, ghostFloat > Ghost(int dim, int dir, int parity, int x_cb, int s, int c, int n=0, Float max=0)
const GhostAccessorCB< ghostFloat, nSpin, nColor, nVec, order > ghostAccessor
__device__ __host__ QudaGammaBasis GammaBasis() const
__host__ double abs_max(bool global=true) const
__device__ __host__ int Ncolor() const
__device__ __host__ auto wrap_ghost(int dim, int dir, int parity, int x_cb, int s)
the non-const wrap_ghost method
__device__ __host__ const complex< Float > operator()(int parity, int x_cb, int s, int c, int n=0) const
__device__ __host__ int VolumeCB() const
__device__ __host__ int X(int d) const
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int parity, int x_cb, int s, int c, int n=0)
__device__ __host__ void load(complex< Float > out[nSpinBlock *nColor *nVec], int parity, int x_cb, int chi) const
__device__ __host__ void LatticeIndex(int y[QUDA_MAX_DIM], int i) const
__device__ __host__ auto wrap(int parity, int x_cb, int s)
__device__ __host__ const int * X() const
__device__ __host__ int Nvec() const
const AccessorCB< storeFloat, nSpin, nColor, nVec, order > accessor
__device__ __host__ const auto wrap(int parity, int x_cb, int s) const
This and the following method (eventually) creates a fieldorder_wrapper object whose pointer points t...
__device__ __host__ int Ndim() const
__device__ __host__ int Nspin() const
__device__ __host__ void OffsetIndex(int &i, int y[QUDA_MAX_DIM]) const
void resetGhost(const ColorSpinorField &a, void *const *ghost_) const
int comm_dim_partitioned(int dim)
void comm_allreduce_max(double *data)
void comm_allreduce(double *data)
std::array< int, 4 > dim
int V
Definition: host_utils.cpp:37
QudaParity parity
Definition: covdev_test.cpp:40
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:34
const int nColor
Definition: covdev_test.cpp:44
@ QUDA_FULL_SITE_SUBSET
Definition: enum_quda.h:333
enum QudaFieldOrder_s QudaFieldOrder
enum QudaFieldLocation_s QudaFieldLocation
@ QUDA_FLOAT2_FIELD_ORDER
Definition: enum_quda.h:348
@ QUDA_SPACE_COLOR_SPIN_FIELD_ORDER
Definition: enum_quda.h:352
@ QUDA_FLOAT4_FIELD_ORDER
Definition: enum_quda.h:349
@ QUDA_FLOAT8_FIELD_ORDER
Definition: enum_quda.h:350
@ QUDA_SPACE_SPIN_COLOR_FIELD_ORDER
Definition: enum_quda.h:351
enum QudaGammaBasis_s QudaGammaBasis
int length[]
__device__ __forceinline__ T __ldg(const T *ptr)
Definition: ldg.h:44
#define safe_malloc(size)
Definition: malloc_quda.h:106
#define host_free(ptr)
Definition: malloc_quda.h:115
__host__ constexpr __device__ bool match< int, int >()
__host__ constexpr __device__ bool match()
__host__ constexpr __device__ bool fixed_point< float, short >()
__host__ constexpr __device__ bool fixed_point< float, int8_t >()
__device__ __host__ int indexFloatN(int x_cb, int s, int c, int v, int stride)
__host__ constexpr __device__ bool fixed_point< float, int >()
__host__ constexpr __device__ bool match< int8_t, int8_t >()
__host__ constexpr __device__ bool match< short, short >()
__host__ constexpr __device__ bool fixed_point()
void transform_reduce(Arg &arg)
__device__ __host__ void vector_store(void *ptr, int idx, const VectorType &value)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
__host__ __device__ ValueType abs(ValueType x)
Definition: complex_quda.h:125
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_api.h:204
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5.
Provides precision abstractions and defines the register precision given the storage precision using ...
__device__ __host__ ColorSpinor< Float, Nc, Ns > & operator=(const ColorSpinor< Float, Nc, Ns > &a)
Definition: color_spinor.h:40
__device__ __host__ ColorSpinor()
Definition: color_spinor.h:29
__device__ __host__ int index(int parity, int x_cb, int s, int c, int v) const
__device__ __host__ void load(complex< Float > out[nSpinBlock *nColor *nVec], complex< Float > *in, int parity, int x_cb, int chi) const
__device__ __host__ void load(complex< Float > out[nSpinBlock *nColor *nVec], complex< Float > *in, int parity, int x_cb, int chi) const
__device__ __host__ int index(int parity, int x_cb, int s, int c, int v) const
__device__ __host__ int index(int parity, int x_cb, int s, int c, int v) const
__device__ __host__ int wrap_index(int parity, int x_cb, int s) const
This and the following wrap_index method return the index for the pointer that points to the start o...
__device__ __host__ int index(int parity, int x_cb, int s, int c, int v) const
__device__ __host__ void load(complex< Float > out[nSpinBlock *nColor *nVec], complex< Float > *in, int parity, int x_cb, int chi) const
AccessorCB(const ColorSpinorField &)
__device__ __host__ int index(int parity, int x_cb, int s, int c, int v) const
Accessor routine for ColorSpinorFields in native field order.
void save()
Backup the field to the host when tuning.
void load()
Restore the field from the host after tuning.
__device__ __host__ const colorspinor_ghost_wrapper< real, Accessor > Ghost(int dim, int dir, int ghost_idx, int parity) const
This accessor routine returns a const colorspinor_ghost_wrapper to this object, allowing us to overlo...
__device__ __host__ void loadGhost(complex out[length_ghost/2], int x, int dim, int dir, int parity=0) const
typename AllocType< huge_alloc >::type AllocInt
typename VectorType< Float, N >::type Vector
FloatNOrder(const ColorSpinorField &a, int nFace=1, Float *field_=0, norm_type *norm_=0, Float **ghost_=0, bool override=false)
__device__ __host__ colorspinor_ghost_wrapper< real, Accessor > Ghost(int dim, int dir, int ghost_idx, int parity)
This accessor routine returns a colorspinor_ghost_wrapper to this object, allowing us to overload var...
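A device-side sketch of the Ghost accessors above, in the style the colorspinor_ghost_wrapper is designed for; arg.in and arg.out are assumed to be FloatNOrder accessors held in a hypothetical kernel argument struct:
  ColorSpinor<real, Nc, Ns> chi = arg.in.Ghost(dim, dir, ghost_idx, parity); // wrapper -> loadGhost()
  // ... apply the boundary contribution of the operator to chi ...
  arg.out.Ghost(dim, dir, ghost_idx, parity) = chi;                          // wrapper -> saveGhost()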
__device__ __host__ void save(const complex in[length/2], int x, int parity=0)
void resetGhost(const ColorSpinorField &a, void *const *ghost_) const
__device__ __host__ const colorspinor_wrapper< real, Accessor > operator()(int x_cb, int parity) const
This accessor routine returns a const colorspinor_wrapper to this object, allowing us to overload var...
__device__ __host__ colorspinor_wrapper< real, Accessor > operator()(int x_cb, int parity)
This accessor routine returns a colorspinor_wrapper to this object, allowing us to overload various o...
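Correspondingly, the interior operator() accessors above enable the usual load/compute/store idiom inside kernels; a minimal sketch with a hypothetical argument struct Arg that is assumed to expose real, nSpin, nColor and volume_cb:
  template <typename Arg> __global__ void copy_kernel(Arg arg)
  {
    int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
    int parity = blockIdx.y;
    if (x_cb >= arg.volume_cb) return;
    ColorSpinor<typename Arg::real, Arg::nColor, Arg::nSpin> in = arg.in(x_cb, parity); // wrapper -> load()
    arg.out(x_cb, parity) = in;                                                         // wrapper -> save()
  }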
typename VectorType< Float, N_ghost >::type GhostVector
size_t bytes
host memory for backing up the field when tuning
__device__ __host__ void load(complex out[length/2], int x, int parity=0) const
typename mapper< Float >::type real
__device__ __host__ void saveGhost(const complex in[length_ghost/2], int x, int dim, int dir, int parity=0) const
__device__ __host__ int wrap_index(int dim, int dir, int parity, int x_cb, int s) const
The wrap_index method for the ghost field.
__device__ __host__ int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
__device__ __host__ int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
__device__ __host__ int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
__device__ __host__ int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
__device__ __host__ int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
__device__ __host__ void saveGhost(const complex v[length/2], int x, int dim, int dir, int parity=0)
__device__ __host__ int getPaddedIndex(int x_cb, int parity) const
Compute the index into the padded field. Assumes that parity doesn't change from unpadded to padded.
__device__ __host__ colorspinor_wrapper< real, Accessor > operator()(int x_cb, int parity)
This accessor routine returns a colorspinor_wrapper to this object, allowing us to overload various o...
__device__ __host__ void save(const complex v[length/2], int x, int parity=0)
PaddedSpaceSpinorColorOrder(const ColorSpinorField &a, int nFace=1, Float *field_=0, float *dummy=0, Float **ghost_=0)
__device__ __host__ const colorspinor_wrapper< real, Accessor > operator()(int x_cb, int parity) const
This accessor routine returns a const colorspinor_wrapper to this object, allowing us to overload var...
__device__ __host__ void loadGhost(complex v[length/2], int x, int dim, int dir, int parity=0) const
__device__ __host__ void load(complex v[length/2], int x, int parity=0) const
QDPJITDiracOrder(const ColorSpinorField &a, int nFace=1, Float *field_=0)
__device__ __host__ void load(complex v[Ns *Nc], int x, int parity=0) const
__device__ __host__ colorspinor_wrapper< real, Accessor > operator()(int x_cb, int parity)
This accessor routine returns a colorspinor_wrapper to this object, allowing us to overload various o...
__device__ __host__ const colorspinor_wrapper< real, Accessor > operator()(int x_cb, int parity) const
This accessor routine returns a const colorspinor_wrapper to this object, allowing us to overload var...
__device__ __host__ void save(const complex v[Ns *Nc], int x, int parity=0)
This is just a dummy structure we use for trove to define the required structure size.
SpaceColorSpinorOrder(const ColorSpinorField &a, int nFace=1, Float *field_=0, float *dummy=0, Float **ghost_=0)
__device__ __host__ void load(complex v[length/2], int x, int parity=0) const
__device__ __host__ colorspinor_wrapper< real, Accessor > operator()(int x_cb, int parity)
This accessor routine returns a colorspinor_wrapper to this object, allowing us to overload various o...
__device__ __host__ const colorspinor_wrapper< real, Accessor > operator()(int x_cb, int parity) const
This accessor routine returns a const colorspinor_wrapper to this object, allowing us to overload var...
__device__ __host__ void loadGhost(complex v[length/2], int x, int dim, int dir, int parity=0) const
__device__ __host__ void saveGhost(const complex v[length/2], int x, int dim, int dir, int parity=0)
__device__ __host__ void save(const complex v[length/2], int x, int parity=0)
__device__ __host__ void load(complex v[length/2], int x, int parity=0) const
__device__ __host__ void save(const complex v[length/2], int x, int parity=0)
__device__ __host__ const colorspinor_wrapper< real, Accessor > operator()(int x_cb, int parity) const
This accessor routine returns a const colorspinor_wrapper to this object, allowing us to overload var...
SpaceSpinorColorOrder(const ColorSpinorField &a, int nFace=1, Float *field_=0, float *dummy=0, Float **ghost_=0)
__device__ __host__ colorspinor_wrapper< real, Accessor > operator()(int x_cb, int parity)
This accessor routine returns a colorspinor_wrapper to this object, allowing us to overload various o...
__device__ __host__ void loadGhost(complex v[length/2], int x, int dim, int dir, int parity=0) const
__device__ __host__ void saveGhost(const complex v[length/2], int x, int dim, int dir, int parity=0)
__host__ __device__ Float operator()(const quda::complex< int8_t > &x)
__host__ __device__ Float operator()(const quda::complex< short > &x)
__host__ __device__ Float operator()(const quda::complex< storeFloat > &x)
fieldorder_wrapper is an internal class that is used to wrap instances of FieldOrder accessors,...
__device__ __host__ void operator-=(const complex< theirFloat > &a)
Operator-= with complex number instance as input.
__device__ __host__ Float imag() const
__device__ __host__ Float real() const
__device__ __host__ void operator+=(const complex< theirFloat > &a)
Operator+= with complex number instance as input.
__device__ __host__ complex< Float > operator-() const
Negation operator.
__device__ __host__ void real(const Float &a)
__device__ __host__ auto data()
Returns the pointer held by this wrapper object.
__device__ __host__ void operator=(const fieldorder_wrapper< Float, storeFloat > &a)
Assignment operator with fieldorder_wrapper instance as input.
__device__ __host__ void imag(const Float &a)
__device__ __host__ void operator=(const theirFloat &a)
Assignment operator with real number instance as input.
__device__ __host__ void operator=(const complex< theirFloat > &a)
Assignment operator with complex number instance as input.
__device__ __host__ fieldorder_wrapper(complex< storeFloat > *v, int idx, Float scale, Float scale_inv)
fieldorder_wrapper constructor
__device__ __host__ const auto data() const
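Putting the fieldorder_wrapper members above together, an in-place update on a single element might look like this (F and the indices are placeholders):
  auto w = F(parity, x_cb, s, c);      // wrapper holding the element pointer plus scale / scale_inv
  w += complex<Float>(0.5, 0.0);       // accumulate through the wrapper (rescaled if storage is fixed point)
  w.imag(static_cast<Float>(0));       // overwrite just the imaginary part
  Float re = w.real();                 // read back the register-precision real part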
__host__ __device__ ReduceType operator()(const quda::complex< int8_t > &x)
__host__ __device__ ReduceType operator()(const quda::complex< short > &x)
__host__ __device__ ReduceType operator()(const quda::complex< Float > &x)
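The ReduceType functors above are the per-element transforms behind reductions such as norm2 and abs_max; an illustrative host-side composition (not the actual QUDA kernel path; v and n_elements are placeholders):
  double sum = 0.0;
  for (size_t i = 0; i < n_elements; i++) sum += static_cast<double>(norm(v[i])); // norm(z) = |z|^2, per the overload above
  comm_allreduce(&sum);                                                           // the global = true behaviour of norm2()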
colorspinor_ghost_wrapper is an internal class that is used to wrap instances of colorspinor accessor...
__device__ __host__ void operator=(const C &a)
Assignment operator with ColorSpinor instance as input.
colorspinor::FloatNOrder< double, 1, Nc, 2, false, huge_alloc > type
colorspinor::FloatNOrder< double, 2, Nc, 2, false, huge_alloc > type
colorspinor::FloatNOrder< double, 4, Nc, 2, false, huge_alloc > type
colorspinor::FloatNOrder< double, 4, Nc, 2, true, huge_alloc > type
colorspinor::FloatNOrder< float, 1, Nc, 2, false, huge_alloc > type
colorspinor::FloatNOrder< float, 2, Nc, 2, false, huge_alloc > type
colorspinor::FloatNOrder< float, 4, Nc, 4, false, huge_alloc > type
colorspinor::FloatNOrder< float, 4, Nc, 4, true, huge_alloc > type
colorspinor::FloatNOrder< int8_t, 1, Nc, 2, false, huge_alloc > type
colorspinor::FloatNOrder< int8_t, 2, Nc, 2, false, huge_alloc > type
colorspinor::FloatNOrder< int8_t, 4, Nc, N8, false, huge_alloc > type
colorspinor::FloatNOrder< int8_t, 4, Nc, N8, true, huge_alloc > type
colorspinor::FloatNOrder< short, 1, Nc, 2, false, huge_alloc > type
colorspinor::FloatNOrder< short, 2, Nc, 2, false, huge_alloc > type
colorspinor::FloatNOrder< short, 4, Nc, N8, false, huge_alloc > type
colorspinor::FloatNOrder< short, 4, Nc, N8, true, huge_alloc > type
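These type aliases map precision, spin, color and spin projection onto a concrete native-order accessor (they appear to be specializations of QUDA's colorspinor_mapper, which is an assumption here); a construction sketch using the single-precision, Ns = 4, Nc = 3 entry:
  using Accessor = colorspinor::FloatNOrder<float, 4, 3, 4, false, false>; // Float, Ns, Nc, N (float4 ordering), spin_project, huge_alloc
  Accessor out(field);                                                     // field is an existing quda::ColorSpinorField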
colorspinor_wrapper is an internal class that is used to wrap instances of colorspinor accessors,...
__device__ __host__ void operator=(const C &a)
Assignment operator with ColorSpinor instance as input.
__host__ __device__ int8_t imag() const volatile
Definition: complex_quda.h:736
__host__ __device__ int8_t real() const volatile
Definition: complex_quda.h:735
__host__ __device__ short real() const volatile
Definition: complex_quda.h:782
__host__ __device__ short imag() const volatile
Definition: complex_quda.h:783
__host__ __device__ complex(const ValueType &re=ValueType(), const ValueType &im=ValueType())
Definition: complex_quda.h:375
__host__ __device__ ValueType imag() const volatile
__host__ __device__ ValueType real() const volatile
__host__ __device__ complex< ValueType > & operator=(const complex< T > z)
Definition: complex_quda.h:399
QUDA reimplementation of thrust::transform_reduce as well as wrappers also implementing thrust::reduc...
#define errorQuda(...)
Definition: util_quda.h:120