QUDA  v1.1.0
A library for QCD on GPUs
clover_field_order.h
Go to the documentation of this file.
1 #ifndef _CLOVER_ORDER_H
2 #define _CLOVER_ORDER_H
3 
10 #include <register_traits.h>
11 #include <convert.h>
12 #include <clover_field.h>
13 #include <complex_quda.h>
14 #include <quda_matrix.h>
15 #include <color_spinor.h>
16 #include <trove_helper.cuh>
17 #include <transform_reduce.h>
18 
19 namespace quda {
20 
33  template <typename Float, typename T>
34  struct clover_wrapper {
35  T &field;
36  const int x_cb;
37  const int parity;
38  const int chirality;
39 
47  __device__ __host__ inline clover_wrapper<Float,T>(T &field, int x_cb, int parity, int chirality)
49 
54  template<typename C>
55  __device__ __host__ inline void operator=(const C &a) {
56  field.save(a.data, x_cb, parity, chirality);
57  }
58  };
59 
60  template <typename T, int N>
61  template <typename S>
62  __device__ __host__ inline void HMatrix<T,N>::operator=(const clover_wrapper<T,S> &a) {
63  a.field.load(data, a.x_cb, a.parity, a.chirality);
64  }
65 
66  template <typename T, int N>
67  template <typename S>
68  __device__ __host__ inline HMatrix<T,N>::HMatrix(const clover_wrapper<T,S> &a) {
69  a.field.load(data, a.x_cb, a.parity, a.chirality);
70  }
71 
72  namespace clover {
73 
74  template<typename ReduceType, typename Float> struct square_ {
75  __host__ __device__ inline ReduceType operator()(const quda::complex<Float> &x)
76  { return static_cast<ReduceType>(norm(x)); }
77  };
78 
79  template<typename ReduceType, typename Float> struct abs_ {
80  __host__ __device__ inline ReduceType operator()(const quda::complex<Float> &x)
81  { return static_cast<ReduceType>(abs(x)); }
82  };
83 
168  template<typename Float, int nColor, int nSpin, QudaCloverFieldOrder order> struct Accessor {
170  Accessor(const CloverField &A, bool inverse=false) {
171  errorQuda("Not implemented for order %d", order);
172  }
173 
174  __device__ __host__ inline complex<Float>& operator()(int parity, int x, int s_row, int s_col,
175  int c_row, int c_col) const {
176  return dummy;
177  }
178 
179  template <typename helper, typename reducer>
180  __host__ double transform_reduce(QudaFieldLocation location, helper h, double i, reducer r) const
181  {
182  return 0.0;
183  }
184  };
185 
186  template<typename Float, int nColor, int nSpin>
189  int stride;
190  size_t offset_cb;
191  static constexpr int N = nSpin * nColor / 2;
192  Accessor(const CloverField &A, bool inverse=false)
193  : a(static_cast<Float*>(const_cast<void*>(A.V(inverse)))), stride(A.Stride()),
194  offset_cb(A.Bytes()/(2*sizeof(Float))) { }
195 
196  __device__ __host__ inline complex<Float> operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const {
197  // if not in the diagonal chiral block then return 0.0
198  if (s_col / 2 != s_row / 2) { return complex<Float>(0.0); }
199 
200  const int chirality = s_col / 2;
201 
202  int row = s_row%2 * nColor + c_row;
203  int col = s_col%2 * nColor + c_col;
204  Float *a_ = a+parity*offset_cb+stride*chirality*N*N;
205 
206  if (row == col) {
207  return 2*a_[ row*stride+x ];
208  } else if (col < row) {
209  // switch coordinates to count from bottom right instead of top left of matrix
210  int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
211  complex<Float> *off = reinterpret_cast<complex<Float>*>(a_ + N);
212 
213  return 2*off[k*stride + x];
214  } else {
215  // requesting upper triangular so return conjugate transpose
216  // switch coordinates to count from bottom right instead of top left of matrix
217  int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;
218  complex<Float> *off = reinterpret_cast<complex<Float>*>(a_ + N);
219  return 2*conj(off[k*stride + x]);
220  }
221 
222  }
223 
224  template <typename helper, typename reducer>
225  __host__ double transform_reduce(QudaFieldLocation location, helper h, double init, reducer r) const
226  {
227  // just use offset_cb, since factor of two from parity is equivalent to complexity
228  double result = ::quda::transform_reduce(location, reinterpret_cast<complex<Float> *>(a), offset_cb, h, init, r);
229  return 2.0 * result; // factor of two is normalization
230  }
231  };
232 
233  template<int N>
234  __device__ __host__ inline int indexFloatN(int k, int stride, int x) {
235  int j = k / N;
236  int i = k % N;
237  return (j*stride+x)*N + i;
238  };
239 
240  template<typename Float, int nColor, int nSpin>
243  int stride;
244  size_t offset_cb;
245  static constexpr int N = nSpin * nColor / 2;
246  Accessor(const CloverField &A, bool inverse=false)
247  : a(static_cast<Float*>(const_cast<void*>(A.V(inverse)))), stride(A.Stride()),
248  offset_cb(A.Bytes()/(2*sizeof(Float))) { }
249 
250  __device__ __host__ inline complex<Float> operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const {
251  // if not in the diagonal chiral block then return 0.0
252  if (s_col / 2 != s_row / 2) { return complex<Float>(0.0); }
253 
254  const int chirality = s_col / 2;
255 
256  int row = s_row%2 * nColor + c_row;
257  int col = s_col%2 * nColor + c_col;
258  Float *a_ = a+parity*offset_cb+stride*chirality*N*N;
259 
260  if (row == col) {
261  return 2*a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(row, stride, x) ];
262  } else if (col < row) {
263  // switch coordinates to count from bottom right instead of top left of matrix
264  int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
265  int idx = N + 2*k;
266 
267  return 2*complex<Float>(a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+0,stride,x) ],
268  a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+1,stride,x) ]);
269  } else {
270  // requesting upper triangular so return conjugate transpose
271  // switch coordinates to count from bottom right instead of top left of matrix
272  int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;
273  int idx = N + 2*k;
274 
275  return 2*complex<Float>( a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+0,stride,x) ],
276  -a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+1,stride,x) ]);
277  }
278 
279  }
280 
281  template <typename helper, typename reducer>
282  __host__ double transform_reduce(QudaFieldLocation location, helper h, double init, reducer r) const
283  {
284  // just use offset_cb, since factor of two from parity is equivalent to complexity
285  double result = ::quda::transform_reduce(location, reinterpret_cast<complex<Float> *>(a), offset_cb, h, init, r);
286  return 2.0 * result; // factor of two is normalization
287  }
288  };
289 
290  template<typename Float, int nColor, int nSpin>
292  Float *a[2];
293  const int N = nSpin * nColor / 2;
295  Accessor(const CloverField &A, bool inverse=false) {
296  // even
297  a[0] = static_cast<Float*>(const_cast<void*>(A.V(inverse)));
298  // odd
299  a[1] = static_cast<Float*>(const_cast<void*>(A.V(inverse))) + A.Bytes()/(2*sizeof(Float));
300  zero = complex<Float>(0.0,0.0);
301  }
302 
303  __device__ __host__ inline complex<Float> operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const {
304  // if not in the diagonal chiral block then return 0.0
305  if (s_col / 2 != s_row / 2) { return zero; }
306 
307  const int chirality = s_col / 2;
308 
309  unsigned int row = s_row%2 * nColor + c_row;
310  unsigned int col = s_col%2 * nColor + c_col;
311 
312  if (row == col) {
313  complex<Float> tmp = a[parity][(x*2 + chirality)*N*N + row];
314  return tmp;
315  } else if (col < row) {
316  // switch coordinates to count from bottom right instead of top left of matrix
317  int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
318  int idx = (x*2 + chirality)*N*N + N + 2*k;
319  return complex<Float>(a[parity][idx], a[parity][idx+1]);
320  } else {
321  // switch coordinates to count from bottom right instead of top left of matrix
322  int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;
323  int idx = (x*2 + chirality)*N*N + N + 2*k;
324  return complex<Float>(a[parity][idx], -a[parity][idx+1]);
325  }
326  }
327 
328  template <typename helper, typename reducer>
329  __host__ double transform_reduce(QudaFieldLocation location, helper h, double init, reducer r) const
330  {
331  errorQuda("Not implemented");
332  return 0.0;
333  }
334  };
335 
336  /*
337  FIXME the below is the old optimization used for reading the
338  clover field, making use of the symmetry to reduce the number of
339  reads.
340 
341 #define READ_CLOVER2_DOUBLE_STR(clover_, chi) \
342  double2 C0, C1, C2, C3, C4, C5, C6, C7, C8, C9; \
343  double2 C10, C11, C12, C13, C14, C15, C16, C17; \
344  double2* clover = (double2*)clover_; \
345  load_streaming_double2(C0, &clover[sid + (18*chi+0)*param.cl_stride]); \
346  load_streaming_double2(C1, &clover[sid + (18*chi+1)*param.cl_stride]); \
347  double diag = 0.5*(C0.x + C1.y); \
348  double diag_inv = 1.0/diag; \
349  C2 = make_double2(diag*(2-C0.y*diag_inv), diag*(2-C1.x*diag_inv)); \
350  load_streaming_double2(C3, &clover[sid + (18*chi+3)*param.cl_stride]); \
351  load_streaming_double2(C4, &clover[sid + (18*chi+4)*param.cl_stride]); \
352  load_streaming_double2(C5, &clover[sid + (18*chi+5)*param.cl_stride]); \
353  load_streaming_double2(C6, &clover[sid + (18*chi+6)*param.cl_stride]); \
354  load_streaming_double2(C7, &clover[sid + (18*chi+7)*param.cl_stride]); \
355  load_streaming_double2(C8, &clover[sid + (18*chi+8)*param.cl_stride]); \
356  load_streaming_double2(C9, &clover[sid + (18*chi+9)*param.cl_stride]); \
357  load_streaming_double2(C10, &clover[sid + (18*chi+10)*param.cl_stride]); \
358  load_streaming_double2(C11, &clover[sid + (18*chi+11)*param.cl_stride]); \
359  load_streaming_double2(C12, &clover[sid + (18*chi+12)*param.cl_stride]); \
360  load_streaming_double2(C13, &clover[sid + (18*chi+13)*param.cl_stride]); \
361  load_streaming_double2(C14, &clover[sid + (18*chi+14)*param.cl_stride]); \
362  C15 = make_double2(-C3.x,-C3.y); \
363  C16 = make_double2(-C4.x,-C4.y); \
364  C17 = make_double2(-C8.x,-C8.y); \
365  */
366 
372  template <typename Float, int nColor, int nSpin, QudaCloverFieldOrder order>
373  struct FieldOrder {
374 
376  static constexpr bool supports_ghost_zone = false;
377 
378  protected:
381  const int volumeCB;
383  bool inverse;
385 
386  public:
392  : A(A), volumeCB(A.VolumeCB()), accessor(A,inverse), inverse(inverse), location(A.Location())
393  { }
394 
395  CloverField& Field() { return A; }
396 
407  __device__ __host__ inline const complex<Float> operator()(int parity, int x, int s_row,
408  int s_col, int c_row, int c_col) const {
409  return accessor(parity, x, s_row, s_col, c_row, c_col);
410  }
411 
426  __device__ __host__ inline complex<Float> operator()(int dummy, int parity, int x, int s_row,
427  int s_col, int c_row, int c_col) const {
428  return accessor(parity,x,s_row,s_col,c_row,c_col);
429  }
430 
441  /*
442  __device__ __host__ inline complex<Float>& operator()(int parity, int x, int s_row,
443  int s_col, int c_row, int c_col) {
444  //errorQuda("Clover accessor not implemented as a lvalue");
445  return accessor(parity, x, s_row, s_col, c_row, c_col);
446  }
447  */
448 
450  __device__ __host__ inline int Ncolor() const { return nColor; }
451 
453  __device__ __host__ inline int Volume() const { return 2*volumeCB; }
454 
456  __device__ __host__ inline int VolumeCB() const { return volumeCB; }
457 
459  size_t Bytes() const {
460  constexpr int n = (nSpin * nColor) / 2;
461  constexpr int chiral_block = n * n / 2;
462  return static_cast<size_t>(volumeCB) * chiral_block * 2ll * 2ll * sizeof(Float); // 2 from complex, 2 from chirality
463  }
464 
470  __host__ double norm1(int dim=-1, bool global=true) const {
471  double nrm1 = accessor.transform_reduce(location, abs_<double, Float>(), 0.0, plus<double>());
472  if (global) comm_allreduce(&nrm1);
473  return nrm1;
474  }
475 
481  __host__ double norm2(int dim=-1, bool global=true) const {
482  double nrm2 = accessor.transform_reduce(location, square_<double, Float>(), 0.0, plus<double>());
483  if (global) comm_allreduce(&nrm2);
484  return nrm2;
485  }
486 
492  __host__ double abs_max(int dim=-1, bool global=true) const {
493  double absmax = accessor.transform_reduce(location, abs_<Float, Float>(), 0.0, maximum<Float>());
494  if (global) comm_allreduce_max(&absmax);
495  return absmax;
496  }
497 
503  __host__ double abs_min(int dim=-1, bool global=true) const {
504  double absmax = accessor.transform_reduce(location, abs_<Float, Float>(), std::numeric_limits<double>::max(),
505  minimum<Float>());
506  if (global) comm_allreduce_min(&absmax);
507  return absmax;
508  }
509  };
510 
523  template <typename Float, int length, int N, bool add_rho=false, bool huge_alloc=false>
524  struct FloatNOrder {
526  using real = typename mapper<Float>::type;
529  typedef float norm_type;
530  static const int M = length / (N * 2); // number of short vectors per chiral block
531  static const int block = length / 2; // chiral block size
534  const AllocInt offset; // offset can be 32-bit or 64-bit
536  const int volumeCB;
537  const int stride;
538 
539  const bool twisted;
540  const real mu2;
541  const real rho;
542 
543  size_t bytes;
544  size_t norm_bytes;
545  void *backup_h;
547 
548  FloatNOrder(const CloverField &clover, bool is_inverse, Float *clover_ = 0, norm_type *norm_ = 0,
549  bool override = false) :
550  offset(clover.Bytes() / (2 * sizeof(Float) * N)),
551  norm_offset(clover.NormBytes() / (2 * sizeof(norm_type))),
552  volumeCB(clover.VolumeCB()),
553  stride(clover.Stride()),
555  mu2(clover.Mu2()),
556  rho(clover.Rho()),
557  bytes(clover.Bytes()),
558  norm_bytes(clover.NormBytes()),
559  backup_h(nullptr),
560  backup_norm_h(nullptr)
561  {
562  if (clover.Order() != N) {
563  errorQuda("Invalid clover order %d for FloatN (N=%d) accessor", clover.Order(), N);
564  }
565  this->clover = clover_ ? clover_ : (Float *)(clover.V(is_inverse));
566  this->norm = norm_ ? norm_ : (norm_type *)(clover.Norm(is_inverse));
567  }
568 
569  bool Twisted() const { return twisted; }
570  real Mu2() const { return mu2; }
571 
582  __device__ __host__ inline clover_wrapper<real, Accessor> operator()(int x_cb, int parity, int chirality)
583  {
584  return clover_wrapper<real, Accessor>(*this, x_cb, parity, chirality);
585  }
586 
597  __device__ __host__ inline const clover_wrapper<real, Accessor> operator()(
598  int x_cb, int parity, int chirality) const
599  {
600  return clover_wrapper<real, Accessor>(const_cast<Accessor &>(*this), x_cb, parity, chirality);
601  }
602 
610  __device__ __host__ inline void load(real v[block], int x, int parity, int chirality) const
611  {
612  norm_type nrm;
613  if (isFixed<Float>::value) { nrm = vector_load<float>(norm, parity * norm_offset + chirality * stride + x); }
614 
615 #pragma unroll
616  for (int i=0; i<M; i++) {
617  // first load from memory
618  Vector vecTmp = vector_load<Vector>(clover, parity * offset + x + stride * (chirality * M + i));
619  // second do scalar copy converting into register type
620 #pragma unroll
621  for (int j = 0; j < N; j++) { copy_and_scale(v[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j], nrm); }
622  }
623 
624  if (add_rho) for (int i=0; i<6; i++) v[i] += rho;
625  }
626 
634  __device__ __host__ inline void save(const real v[block], int x, int parity, int chirality)
635  {
636  real tmp[block];
637 
638  // find the norm of each chiral block
639  if (isFixed<Float>::value) {
640  norm_type scale = 0.0;
641 #pragma unroll
642  for (int i = 0; i < block; i++) scale = fabsf((norm_type)v[i]) > scale ? fabsf((norm_type)v[i]) : scale;
643  norm[parity*norm_offset + chirality*stride + x] = scale;
644 
645 #ifdef __CUDA_ARCH__
646  real scale_inv = __fdividef(fixedMaxValue<Float>::value, scale);
647 #else
648  real scale_inv = fixedMaxValue<Float>::value / scale;
649 #endif
650 #pragma unroll
651  for (int i = 0; i < block; i++) tmp[i] = v[i] * scale_inv;
652  } else {
653 #pragma unroll
654  for (int i = 0; i < block; i++) tmp[i] = v[i];
655  }
656 
657 #pragma unroll
658  for (int i = 0; i < M; i++) {
659  Vector vecTmp;
660  // first do scalar copy converting into storage type
661  for (int j = 0; j < N; j++) copy_scaled(reinterpret_cast<Float *>(&vecTmp)[j], tmp[i * N + j]);
662  // second do vectorized copy into memory
663  vector_store(clover, parity * offset + x + stride * (chirality * M + i), vecTmp);
664  }
665  }
666 
674  __device__ __host__ inline void load(real v[length], int x, int parity) const {
675 #pragma unroll
676  for (int chirality = 0; chirality < 2; chirality++) load(&v[chirality * block], x, parity, chirality);
677  }
678 
686  __device__ __host__ inline void save(const real v[length], int x, int parity) {
687 #pragma unroll
688  for (int chirality = 0; chirality < 2; chirality++) save(&v[chirality * block], x, parity, chirality);
689  }
690 
694  void save() {
695  if (backup_h) errorQuda("Already allocated host backup");
697  qudaMemcpy(backup_h, clover, bytes, cudaMemcpyDeviceToHost);
698  if (norm_bytes) {
700  qudaMemcpy(backup_norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost);
701  }
702  }
703 
707  void load()
708  {
709  qudaMemcpy(clover, backup_h, bytes, cudaMemcpyHostToDevice);
711  backup_h = nullptr;
712  if (norm_bytes) {
713  qudaMemcpy(norm, backup_norm_h, norm_bytes, cudaMemcpyHostToDevice);
715  backup_norm_h = nullptr;
716  }
717  }
718 
719  size_t Bytes() const
720  {
721  size_t bytes = length * sizeof(Float);
722  if (isFixed<Float>::value) bytes += 2 * sizeof(norm_type);
723  return bytes;
724  }
725  };
726 
733  template <typename real, int length> struct S { real v[length]; };
734 
738  template <typename Float, int length>
739  struct QDPOrder {
740  typedef typename mapper<Float>::type RegType;
742  const int volumeCB;
743  const int stride;
744  const int offset;
745 
746  const bool twisted;
747  const Float mu2;
748 
749  QDPOrder(const CloverField &clover, bool inverse, Float *clover_=0)
750  : volumeCB(clover.VolumeCB()), stride(volumeCB), offset(clover.Bytes()/(2*sizeof(Float))),
752  if (clover.Order() != QUDA_PACKED_CLOVER_ORDER) {
753  errorQuda("Invalid clover order %d for this accessor", clover.Order());
754  }
755  this->clover = clover_ ? clover_ : (Float *)(clover.V(inverse));
756  }
757 
758  bool Twisted() const {return twisted;}
759  Float Mu2() const {return mu2;}
760 
761  __device__ __host__ inline void load(RegType v[length], int x, int parity) const {
762  // factor of 0.5 comes from basis change
763 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
764  typedef S<Float,length> structure;
765  trove::coalesced_ptr<structure> clover_((structure*)clover);
766  structure v_ = clover_[parity*volumeCB + x];
767  for (int i=0; i<length; i++) v[i] = 0.5*(RegType)v_.v[i];
768 #else
769  for (int i=0; i<length; i++) v[i] = 0.5*clover[parity*offset + x*length+i];
770 #endif
771  }
772 
773  __device__ __host__ inline void save(const RegType v[length], int x, int parity) {
774 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
775  typedef S<Float,length> structure;
776  trove::coalesced_ptr<structure> clover_((structure*)clover);
777  structure v_;
778  for (int i=0; i<length; i++) v_.v[i] = 2.0*(Float)v[i];
779  clover_[parity*volumeCB + x] = v_;
780 #else
781  for (int i=0; i<length; i++) clover[parity*offset + x*length+i] = 2.0*v[i];
782 #endif
783  }
784 
785  size_t Bytes() const { return length*sizeof(Float); }
786  };
787 
791  template <typename Float, int length>
792  struct QDPJITOrder {
793  typedef typename mapper<Float>::type RegType;
796  const int volumeCB;
797  const int stride;
798 
799  const bool twisted;
800  const Float mu2;
801 
802  QDPJITOrder(const CloverField &clover, bool inverse, Float *clover_=0)
803  : volumeCB(clover.VolumeCB()), stride(volumeCB), twisted(clover.Twisted()), mu2(clover.Mu2()) {
804  if (clover.Order() != QUDA_QDPJIT_CLOVER_ORDER) {
805  errorQuda("Invalid clover order %d for this accessor", clover.Order());
806  }
807  offdiag = clover_ ? ((Float **)clover_)[0] : ((Float **)clover.V(inverse))[0];
808  diag = clover_ ? ((Float **)clover_)[1] : ((Float **)clover.V(inverse))[1];
809  }
810 
811  bool Twisted() const {return twisted;}
812  Float Mu2() const {return mu2;}
813 
814  __device__ __host__ inline void load(RegType v[length], int x, int parity) const {
815  // the factor of 0.5 comes from a basis change
816  for (int chirality=0; chirality<2; chirality++) {
817  // set diagonal elements
818  for (int i=0; i<6; i++) {
819  v[chirality*36 + i] = 0.5*diag[((i*2 + chirality)*2 + parity)*volumeCB + x];
820  }
821 
822  // the off diagonal elements
823  for (int i=0; i<30; i++) {
824  int z = i%2;
825  int off = i/2;
826  const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
827  v[chirality*36 + 6 + i] = 0.5*offdiag[(((z*15 + idtab[off])*2 + chirality)*2 + parity)*volumeCB + x];
828  }
829 
830  }
831  }
832 
833  __device__ __host__ inline void save(const RegType v[length], int x, int parity) {
834  // the factor of 2.0 comes from undoing the basis change
835  for (int chirality=0; chirality<2; chirality++) {
836  // set diagonal elements
837  for (int i=0; i<6; i++) {
838  diag[((i*2 + chirality)*2 + parity)*volumeCB + x] = 2.0*v[chirality*36 + i];
839  }
840 
841  // the off diagonal elements
842  for (int i=0; i<30; i++) {
843  int z = i%2;
844  int off = i/2;
845  const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
846  offdiag[(((z*15 + idtab[off])*2 + chirality)*2 + parity)*volumeCB + x] = 2.0*v[chirality*36 + 6 + i];
847  }
848  }
849  }
850 
851  size_t Bytes() const { return length*sizeof(Float); }
852  };
853 
854 
861  template <typename Float, int length>
862  struct BQCDOrder {
863  typedef typename mapper<Float>::type RegType;
865  const int volumeCB;
866  const int stride;
867 
868  const bool twisted;
869  const Float mu2;
870 
871  BQCDOrder(const CloverField &clover, bool inverse, Float *clover_=0)
872  : volumeCB(clover.Stride()), stride(volumeCB), twisted(clover.Twisted()), mu2(clover.Mu2()) {
873  if (clover.Order() != QUDA_BQCD_CLOVER_ORDER) {
874  errorQuda("Invalid clover order %d for this accessor", clover.Order());
875  }
876  this->clover[0] = clover_ ? clover_ : (Float *)(clover.V(inverse));
877  this->clover[1] = (Float *)((char *)this->clover[0] + clover.Bytes() / 2);
878  }
879 
880 
881  bool Twisted() const {return twisted;}
882  Float Mu2() const {return mu2;}
883 
889  __device__ __host__ inline void load(RegType v[length], int x, int parity) const {
890  int bq[36] = { 21, 32, 33, 0, 1, 20, // diagonal
891  28, 29, 30, 31, 6, 7, 14, 15, 22, 23, // column 1 6
892  34, 35, 8, 9, 16, 17, 24, 25, // column 2 16
893  10, 11, 18, 19, 26, 27, // column 3 24
894  2, 3, 4, 5, // column 4 30
895  12, 13};
896 
897  // flip the sign of the imaginary components
898  int sign[36];
899  for (int i=0; i<6; i++) sign[i] = 1;
900  for (int i=6; i<36; i+=2) {
901  if ( (i >= 10 && i<= 15) || (i >= 18 && i <= 29) ) { sign[i] = -1; sign[i+1] = -1; }
902  else { sign[i] = 1; sign[i+1] = -1; }
903  }
904 
905  const int M=length/2;
906  for (int chirality=0; chirality<2; chirality++)
907  for (int i=0; i<M; i++)
908  v[chirality*M+i] = sign[i] * clover[parity][x*length+chirality*M+bq[i]];
909 
910  }
911 
912  // FIXME implement the save routine for BQCD ordered fields
913  __device__ __host__ inline void save(RegType v[length], int x, int parity) {
914 
915  };
916 
917  size_t Bytes() const { return length*sizeof(Float); }
918  };
919 
920  } // namespace clover
921 
922  // Use traits to reduce the template explosion
923  template<typename Float,int N=72, bool add_rho=false> struct clover_mapper { };
924 
925  // double precision uses Float2
926  template<int N, bool add_rho> struct clover_mapper<double,N,add_rho> { typedef clover::FloatNOrder<double, N, 2, add_rho> type; };
927 
928  // single precision uses Float4
929  template<int N, bool add_rho> struct clover_mapper<float,N,add_rho> { typedef clover::FloatNOrder<float, N, 4, add_rho> type; };
930 
931  // half precision uses Float4
932  template<int N, bool add_rho> struct clover_mapper<short,N,add_rho> { typedef clover::FloatNOrder<short, N, 4, add_rho> type; };
933 
934  // quarter precision uses Float4
935  template <int N, bool add_rho> struct clover_mapper<int8_t, N, add_rho> {
937  };
938 
939 } // namespace quda
940 
941 #endif //_CLOVER_ORDER_H
942 
943 
QudaCloverFieldOrder Order() const
Definition: clover_field.h:162
void * V(bool inverse=false)
Definition: clover_field.h:138
size_t Bytes() const
Definition: clover_field.h:167
__device__ __host__ void operator=(const HMatrix< U, N > &b)
Definition: quda_matrix.h:333
__device__ __host__ HMatrix()
Definition: quda_matrix.h:302
void comm_allreduce_min(double *data)
void comm_allreduce_max(double *data)
void comm_allreduce(double *data)
std::array< int, 4 > dim
int V
Definition: host_utils.cpp:37
QudaParity parity
Definition: covdev_test.cpp:40
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:34
const int nColor
Definition: covdev_test.cpp:44
@ QUDA_BQCD_CLOVER_ORDER
Definition: enum_quda.h:260
@ QUDA_FLOAT4_CLOVER_ORDER
Definition: enum_quda.h:257
@ QUDA_FLOAT2_CLOVER_ORDER
Definition: enum_quda.h:256
@ QUDA_QDPJIT_CLOVER_ORDER
Definition: enum_quda.h:259
@ QUDA_PACKED_CLOVER_ORDER
Definition: enum_quda.h:258
enum QudaFieldLocation_s QudaFieldLocation
int length[]
#define safe_malloc(size)
Definition: malloc_quda.h:106
#define host_free(ptr)
Definition: malloc_quda.h:115
void init()
Create the BLAS context.
__device__ __host__ int indexFloatN(int k, int stride, int x)
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
void transform_reduce(Arg &arg)
__device__ __host__ void zero(double &a)
Definition: float_vector.h:318
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
Definition: quda_matrix.h:605
__device__ __host__ void vector_store(void *ptr, int idx, const VectorType &value)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
__host__ __device__ std::enable_if<!isFixed< T2 >::value, void >::type copy_and_scale(T1 &a, const T2 &b, const T3 &c)
Specialized variants of the copy function that include an additional scale factor....
Definition: convert.h:105
__host__ __device__ ValueType abs(ValueType x)
Definition: complex_quda.h:125
FloatingPoint< float > Float
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_api.h:204
Provides precision abstractions and defines the register precision given the storage precision using ...
__device__ __host__ complex< Float > operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const
__host__ double transform_reduce(QudaFieldLocation location, helper h, double init, reducer r) const
__device__ __host__ complex< Float > operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const
__host__ double transform_reduce(QudaFieldLocation location, helper h, double init, reducer r) const
__host__ double transform_reduce(QudaFieldLocation location, helper h, double init, reducer r) const
__device__ __host__ complex< Float > operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const
__device__ __host__ complex< Float > & operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const
Accessor(const CloverField &A, bool inverse=false)
__host__ double transform_reduce(QudaFieldLocation location, helper h, double i, reducer r) const
__device__ __host__ void load(RegType v[length], int x, int parity) const
mapper< Float >::type RegType
__device__ __host__ void save(RegType v[length], int x, int parity)
BQCDOrder(const CloverField &clover, bool inverse, Float *clover_=0)
__device__ __host__ complex< Float > operator()(int dummy, int parity, int x, int s_row, int s_col, int c_row, int c_col) const
Read-only complex-member accessor function. This is a special variant that is compatible with the equ...
__device__ __host__ int Ncolor() const
Complex-member accessor function.
FieldOrder(CloverField &A, bool inverse=false)
const QudaFieldLocation location
__device__ __host__ int VolumeCB() const
__host__ double norm1(int dim=-1, bool global=true) const
Returns the L1 norm of the field.
__host__ double norm2(int dim=-1, bool global=true) const
Returns the L2 norm suared of the field.
__device__ __host__ int Volume() const
__device__ __host__ const complex< Float > operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const
Read-only complex-member accessor function.
__host__ double abs_max(int dim=-1, bool global=true) const
Returns the Linfinity norm of the field.
__host__ double abs_min(int dim=-1, bool global=true) const
Returns the minimum absolute value of the field.
static constexpr bool supports_ghost_zone
const Accessor< Float, nColor, nSpin, order > accessor
Accessor routine for CloverFields in native field order.
__device__ __host__ void load(real v[block], int x, int parity, int chirality) const
Load accessor for a single chiral block.
__device__ __host__ clover_wrapper< real, Accessor > operator()(int x_cb, int parity, int chirality)
This accessor routine returns a clover_wrapper to this object, allowing us to overload various operat...
__device__ __host__ void save(const real v[length], int x, int parity)
Store accessor for the clover matrix.
void save()
Backup the field to the host when tuning.
__device__ __host__ void save(const real v[block], int x, int parity, int chirality)
Store accessor for a single chiral block.
__device__ __host__ const clover_wrapper< real, Accessor > operator()(int x_cb, int parity, int chirality) const
This accessor routine returns a const colorspinor_wrapper to this object, allowing us to overload var...
typename mapper< Float >::type real
AllocType< huge_alloc >::type AllocInt
FloatNOrder(const CloverField &clover, bool is_inverse, Float *clover_=0, norm_type *norm_=0, bool override=false)
host memory for backing up norm when tuning
void load()
Restore the field from the host after tuning.
VectorType< Float, N >::type Vector
__device__ __host__ void load(real v[length], int x, int parity) const
Load accessor for the clover matrix.
void * backup_norm_h
host memory for backing up the field when tuning
QDPJITOrder(const CloverField &clover, bool inverse, Float *clover_=0)
__device__ __host__ void load(RegType v[length], int x, int parity) const
__device__ __host__ void save(const RegType v[length], int x, int parity)
mapper< Float >::type RegType
QDPOrder(const CloverField &clover, bool inverse, Float *clover_=0)
mapper< Float >::type RegType
__device__ __host__ void load(RegType v[length], int x, int parity) const
__device__ __host__ void save(const RegType v[length], int x, int parity)
This is just a dummy structure we use for trove to define the required structure size.
__host__ __device__ ReduceType operator()(const quda::complex< Float > &x)
__host__ __device__ ReduceType operator()(const quda::complex< Float > &x)
clover::FloatNOrder< double, N, 2, add_rho > type
clover::FloatNOrder< float, N, 4, add_rho > type
clover::FloatNOrder< int8_t, N, 4, add_rho > type
clover::FloatNOrder< short, N, 4, add_rho > type
clover_wrapper is an internal class that is used to wrap instances of colorspinor accessors,...
__device__ __host__ void operator=(const C &a)
Assignment operator with H matrix instance as input.
QUDA reimplementation of thrust::transform_reduce as well as wrappers also implementing thrust::reduc...
#define errorQuda(...)
Definition: util_quda.h:120