QUDA 1.0.0
clover_field_order.h
1 #ifndef _CLOVER_ORDER_H
2 #define _CLOVER_ORDER_H
3 
10 #include <register_traits.h>
11 #include <clover_field.h>
12 #include <complex_quda.h>
13 #include <thrust_helper.cuh>
14 #include <quda_matrix.h>
15 #include <color_spinor.h>
16 #include <trove_helper.cuh>
17 #include <texture_helper.cuh>
18 
19 namespace quda {
20 
33  template <typename Float, typename T>
34  struct clover_wrapper {
35  T &field;
36  const int x_cb;
37  const int parity;
38  const int chirality;
39 
47  __device__ __host__ inline clover_wrapper<Float,T>(T &field, int x_cb, int parity, int chirality)
48  : field(field), x_cb(x_cb), parity(parity), chirality(chirality) { }
49 
54  template<typename C>
55  __device__ __host__ inline void operator=(const C &a) {
56  field.save(a.data, x_cb, parity, chirality);
57  }
58  };
59 
60  template <typename T, int N>
61  template <typename S>
62  __device__ __host__ inline void HMatrix<T,N>::operator=(const clover_wrapper<T,S> &a) {
63  a.field.load(data, a.x_cb, a.parity, a.chirality);
64  }
65 
66  template <typename T, int N>
67  template <typename S>
68  __device__ __host__ inline HMatrix<T,N>::HMatrix(const clover_wrapper<T,S> &a) {
69  a.field.load(data, a.x_cb, a.parity, a.chirality);
70  }
71 
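 // Usage sketch: together with the HMatrix assignment operator and converting
 // constructor defined above, the wrapper lets kernel code move a chiral block
 // of the clover term in and out of registers without calling load()/save()
 // explicitly.  Assuming an accessor instance named "clov" of one of the
 // FloatNOrder-style types defined later in this file, where real is the
 // accessor's register type and 6 = (nSpin*nColor)/2:
 //
 //   HMatrix<real, 6> A = clov(x_cb, parity, chirality);  // invokes clov.load()
 //   // ... operate on A ...
 //   clov(x_cb, parity, chirality) = A;                   // invokes clov.save()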
72  namespace clover {
73 
74  template<typename ReduceType, typename Float> struct square_ {
75  __host__ __device__ inline ReduceType operator()(const quda::complex<Float> &x)
76  { return static_cast<ReduceType>(norm(x)); }
77  };
78 
79  template<typename ReduceType, typename Float> struct abs_ {
80  __host__ __device__ inline ReduceType operator()(const quda::complex<Float> &x)
81  { return static_cast<ReduceType>(abs(x)); }
82  };
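 // These functors are the unary operations handed to thrust::transform_reduce
 // by the transform_reduce() methods below: e.g. square_ combined with
 // thrust::plus<double> accumulates sum_i |c_i|^2 (the squared L2 norm), while
 // abs_ combined with thrust::maximum<Float> yields max_i |c_i|.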
83 
168  template<typename Float, int nColor, int nSpin, QudaCloverFieldOrder order> struct Accessor {
169  mutable complex<Float> dummy;
170  Accessor(const CloverField &A, bool inverse=false) {
171  errorQuda("Not implemented for order %d", order);
172  }
173 
174  __device__ __host__ inline complex<Float>& operator()(int parity, int x, int s_row, int s_col,
175  int c_row, int c_col) const {
176  return dummy;
177  }
178 
179  template<typename helper, typename reducer>
180  __host__ double transform_reduce(QudaFieldLocation location, helper h, reducer r, double i) const
181  {
182  return 0.0;
183  }
184  };
185 
186  template<typename Float, int nColor, int nSpin>
187  struct Accessor<Float,nColor,nSpin,QUDA_FLOAT2_CLOVER_ORDER> {
188  Float *a;
189  int stride;
190  size_t offset_cb;
191  static constexpr int N = nSpin * nColor / 2;
192  Accessor(const CloverField &A, bool inverse=false)
193  : a(static_cast<Float*>(const_cast<void*>(A.V(inverse)))), stride(A.Stride()),
194  offset_cb(A.Bytes()/(2*sizeof(Float))) { }
195 
196  __device__ __host__ inline complex<Float> operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const {
197  // if not in the diagonal chiral block then return 0.0
198  if (s_col / 2 != s_row / 2) { return complex<Float>(0.0); }
199 
200  const int chirality = s_col / 2;
201 
202  int row = s_row%2 * nColor + c_row;
203  int col = s_col%2 * nColor + c_col;
204  Float *a_ = a+parity*offset_cb+stride*chirality*N*N;
205 
206  if (row == col) {
207  return 2*a_[ row*stride+x ];
208  } else if (col < row) {
209  // switch coordinates to count from bottom right instead of top left of matrix
210  int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
211  complex<Float> *off = reinterpret_cast<complex<Float>*>(a_ + N);
212 
213  return 2*off[k*stride + x];
214  } else {
215  // requesting upper triangular so return conjugate transpose
216  // switch coordinates to count from bottom right instead of top left of matrix
217  int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;
218  complex<Float> *off = reinterpret_cast<complex<Float>*>(a_ + N);
219  return 2*conj(off[k*stride + x]);
220  }
221 
222  }
223 
224  template<typename helper, typename reducer>
225  __host__ double transform_reduce(QudaFieldLocation location, helper h, reducer r, double init) const {
226  double result = init;
227  if (location == QUDA_CUDA_FIELD_LOCATION) {
228  thrust_allocator alloc;
229  thrust::device_ptr<complex<Float> > ptr(reinterpret_cast<complex<Float>*>(a));
230  result = thrust::transform_reduce(thrust::cuda::par(alloc), ptr, ptr+offset_cb, h, result, r);
231  } else {
232  // just use offset_cb: the factor of two from parity cancels the factor of two from treating the data as complex, so offset_cb complex elements span both parities
233  complex<Float> *ptr = reinterpret_cast<complex<Float>*>(a);
234  result = thrust::transform_reduce(thrust::seq, ptr, ptr+offset_cb, h, result, r);
235  }
236  return 2.0 * result; // factor of two is normalization
237  }
238 
239  };
240 
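 // Worked example of the lower-triangle packing used by the accessors in this
 // file: within a chiral block the N real diagonal entries are stored first,
 // followed by the N*(N-1)/2 strictly lower-triangular complex entries packed
 // column by column.  For N = 6 the linear index of element (row, col) with
 // col < row is
 //
 //   k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1
 //
 // so (1,0) -> 0, (2,0) -> 1, ..., (5,0) -> 4, (2,1) -> 5, ..., (5,4) -> 14,
 // giving the 15 complex off-diagonal entries per chiral block.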
241  template<int N>
242  __device__ __host__ inline int indexFloatN(int k, int stride, int x) {
243  int j = k / N;
244  int i = k % N;
245  return (j*stride+x)*N + i;
246  };
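 // For example, with N = 4 (the float4 layouts) scalar element k of a chiral
 // block at checkerboard site x maps to
 //
 //   indexFloatN<4>(k, stride, x) = ((k/4)*stride + x)*4 + k%4,
 //
 // i.e. element k lives in component k%4 of the (k/4)-th float4 slice, and
 // within a slice consecutive sites are 4 floats apart.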
247 
248  template<typename Float, int nColor, int nSpin>
249  struct Accessor<Float,nColor,nSpin,QUDA_FLOAT4_CLOVER_ORDER> {
250  Float *a;
251  int stride;
252  size_t offset_cb;
253  static constexpr int N = nSpin * nColor / 2;
254  Accessor(const CloverField &A, bool inverse=false)
255  : a(static_cast<Float*>(const_cast<void*>(A.V(inverse)))), stride(A.Stride()),
256  offset_cb(A.Bytes()/(2*sizeof(Float))) { }
257 
258  __device__ __host__ inline complex<Float> operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const {
259  // if not in the diagonal chiral block then return 0.0
260  if (s_col / 2 != s_row / 2) { return complex<Float>(0.0); }
261 
262  const int chirality = s_col / 2;
263 
264  int row = s_row%2 * nColor + c_row;
265  int col = s_col%2 * nColor + c_col;
266  Float *a_ = a+parity*offset_cb+stride*chirality*N*N;
267 
268  if (row == col) {
269  return 2*a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(row, stride, x) ];
270  } else if (col < row) {
271  // switch coordinates to count from bottom right instead of top left of matrix
272  int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
273  int idx = N + 2*k;
274 
275  return 2*complex<Float>(a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+0,stride,x) ],
276  a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+1,stride,x) ]);
277  } else {
278  // requesting upper triangular so return conjugate transpose
279  // switch coordinates to count from bottom right instead of top left of matrix
280  int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;
281  int idx = N + 2*k;
282 
283  return 2*complex<Float>( a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+0,stride,x) ],
284  -a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+1,stride,x) ]);
285  }
286 
287  }
288 
289  template<typename helper, typename reducer>
290  __host__ double transform_reduce(QudaFieldLocation location, helper h, reducer r, double init) const {
291  double result = init;
292  if (location == QUDA_CUDA_FIELD_LOCATION) {
293  thrust_allocator alloc;
294  thrust::device_ptr<complex<Float> > ptr(reinterpret_cast<complex<Float>*>(a));
295  result = thrust::transform_reduce(thrust::cuda::par(alloc), ptr, ptr+offset_cb, h, result, r);
296  } else {
297  // just use offset_cb: the factor of two from parity cancels the factor of two from treating the data as complex, so offset_cb complex elements span both parities
298  complex<Float> *ptr = reinterpret_cast<complex<Float>*>(a);
299  result = thrust::transform_reduce(thrust::seq, ptr, ptr+offset_cb, h, result, r);
300  }
301  return 2.0 * result; // factor of two is normalization
302  }
303 
304  };
305 
306  template<typename Float, int nColor, int nSpin>
307  struct Accessor<Float,nColor,nSpin,QUDA_PACKED_CLOVER_ORDER> {
308  Float *a[2];
309  const int N = nSpin * nColor / 2;
310  complex<Float> zero;
311  Accessor(const CloverField &A, bool inverse=false) {
312  // even
313  a[0] = static_cast<Float*>(const_cast<void*>(A.V(inverse)));
314  // odd
315  a[1] = static_cast<Float*>(const_cast<void*>(A.V(inverse))) + A.Bytes()/(2*sizeof(Float));
316  zero = complex<Float>(0.0,0.0);
317  }
318 
319  __device__ __host__ inline complex<Float> operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const {
320  // if not in the diagonal chiral block then return 0.0
321  if (s_col / 2 != s_row / 2) { return zero; }
322 
323  const int chirality = s_col / 2;
324 
325  unsigned int row = s_row%2 * nColor + c_row;
326  unsigned int col = s_col%2 * nColor + c_col;
327 
328  if (row == col) {
329  complex<Float> tmp = a[parity][(x*2 + chirality)*N*N + row];
330  return tmp;
331  } else if (col < row) {
332  // switch coordinates to count from bottom right instead of top left of matrix
333  int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
334  int idx = (x*2 + chirality)*N*N + N + 2*k;
335  return complex<Float>(a[parity][idx], a[parity][idx+1]);
336  } else {
337  // switch coordinates to count from bottom right instead of top left of matrix
338  int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;
339  int idx = (x*2 + chirality)*N*N + N + 2*k;
340  return complex<Float>(a[parity][idx], -a[parity][idx+1]);
341  }
342  }
343 
344  template <typename helper, typename reducer>
345  __host__ double transform_reduce(QudaFieldLocation location, helper h, reducer r, double init) const
346  {
347  errorQuda("Not implemented");
348  return 0.0;
349  }
350  };
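 // Per-site layout implied by the index arithmetic above: for each parity the
 // packed order stores two chiral blocks of N*N reals back to back (36 reals
 // per block for nColor = 3, nSpin = 4),
 //
 //   a[parity][(x*2 + chirality)*36 + (0..5) ]  six real diagonal entries
 //   a[parity][(x*2 + chirality)*36 + (6..35)]  fifteen complex lower-triangular
 //                                              entries stored as (re, im) pairs
 //
 // so a full site occupies 72 reals, matching the length = 72 default of the
 // clover_mapper trait at the end of this file.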
351 
352  /*
353  FIXME the below is the old optimization used for reading the
354  clover field, making use of the symmetry to reduce the number of
355  reads.
356 
357 #define READ_CLOVER2_DOUBLE_STR(clover_, chi) \
358  double2 C0, C1, C2, C3, C4, C5, C6, C7, C8, C9; \
359  double2 C10, C11, C12, C13, C14, C15, C16, C17; \
360  double2* clover = (double2*)clover_; \
361  load_streaming_double2(C0, &clover[sid + (18*chi+0)*param.cl_stride]); \
362  load_streaming_double2(C1, &clover[sid + (18*chi+1)*param.cl_stride]); \
363  double diag = 0.5*(C0.x + C1.y); \
364  double diag_inv = 1.0/diag; \
365  C2 = make_double2(diag*(2-C0.y*diag_inv), diag*(2-C1.x*diag_inv)); \
366  load_streaming_double2(C3, &clover[sid + (18*chi+3)*param.cl_stride]); \
367  load_streaming_double2(C4, &clover[sid + (18*chi+4)*param.cl_stride]); \
368  load_streaming_double2(C5, &clover[sid + (18*chi+5)*param.cl_stride]); \
369  load_streaming_double2(C6, &clover[sid + (18*chi+6)*param.cl_stride]); \
370  load_streaming_double2(C7, &clover[sid + (18*chi+7)*param.cl_stride]); \
371  load_streaming_double2(C8, &clover[sid + (18*chi+8)*param.cl_stride]); \
372  load_streaming_double2(C9, &clover[sid + (18*chi+9)*param.cl_stride]); \
373  load_streaming_double2(C10, &clover[sid + (18*chi+10)*param.cl_stride]); \
374  load_streaming_double2(C11, &clover[sid + (18*chi+11)*param.cl_stride]); \
375  load_streaming_double2(C12, &clover[sid + (18*chi+12)*param.cl_stride]); \
376  load_streaming_double2(C13, &clover[sid + (18*chi+13)*param.cl_stride]); \
377  load_streaming_double2(C14, &clover[sid + (18*chi+14)*param.cl_stride]); \
378  C15 = make_double2(-C3.x,-C3.y); \
379  C16 = make_double2(-C4.x,-C4.y); \
380  C17 = make_double2(-C8.x,-C8.y); \
381  */
382 
388  template <typename Float, int nColor, int nSpin, QudaCloverFieldOrder order>
389  struct FieldOrder {
390 
391  protected:
393  CloverField &A;
394  const int volumeCB;
395  const Accessor<Float,nColor,nSpin,order> accessor;
396  bool inverse;
397  const QudaFieldLocation location;
398 
399  public:
404  FieldOrder(CloverField &A, bool inverse=false)
405  : A(A), volumeCB(A.VolumeCB()), accessor(A,inverse), inverse(inverse), location(A.Location())
406  { }
407 
408  CloverField& Field() { return A; }
409 
420  __device__ __host__ inline const complex<Float> operator()(int parity, int x, int s_row,
421  int s_col, int c_row, int c_col) const {
422  return accessor(parity, x, s_row, s_col, c_row, c_col);
423  }
424 
439  __device__ __host__ inline complex<Float> operator()(int dummy, int parity, int x, int s_row,
440  int s_col, int c_row, int c_col) const {
441  return accessor(parity,x,s_row,s_col,c_row,c_col);
442  }
443 
454  /*
455  __device__ __host__ inline complex<Float>& operator()(int parity, int x, int s_row,
456  int s_col, int c_row, int c_col) {
457  //errorQuda("Clover accessor not implemented as a lvalue");
458  return accessor(parity, x, s_row, s_col, c_row, c_col);
459  }
460  */
461 
463  __device__ __host__ inline int Ncolor() const { return nColor; }
464 
466  __device__ __host__ inline int Volume() const { return 2*volumeCB; }
467 
469  __device__ __host__ inline int VolumeCB() const { return volumeCB; }
470 
472  size_t Bytes() const {
473  constexpr int n = (nSpin * nColor) / 2;
474  constexpr int chiral_block = n * n / 2;
475  return static_cast<size_t>(volumeCB) * chiral_block * 2ll * 2ll * sizeof(Float); // 2 from complex, 2 from chirality
476  }
477 
483  __host__ double norm1(int dim=-1, bool global=true) const {
484  double nrm1 = accessor.transform_reduce(location, abs_<double,Float>(),
485  thrust::plus<double>(), 0.0);
486  if (global) comm_allreduce(&nrm1);
487  return nrm1;
488  }
489 
495  __host__ double norm2(int dim=-1, bool global=true) const {
496  double nrm2 = accessor.transform_reduce(location, square_<double,Float>(),
497  thrust::plus<double>(), 0.0);
498  if (global) comm_allreduce(&nrm2);
499  return nrm2;
500  }
501 
507  __host__ double abs_max(int dim=-1, bool global=true) const {
508  double absmax = accessor.transform_reduce(location, abs_<Float,Float>(),
509  thrust::maximum<Float>(), 0.0);
510  if (global) comm_allreduce_max(&absmax);
511  return absmax;
512  }
513 
519  __host__ double abs_min(int dim=-1, bool global=true) const {
520  double absmax = accessor.transform_reduce(location, abs_<Float,Float>(),
521  thrust::minimum<Float>(), std::numeric_limits<double>::max());
522  if (global) comm_allreduce_min(&absmax);
523  return absmax;
524  }
525 
526  };
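 // Usage sketch, assuming a device-resident CloverField instance named "clov"
 // stored in FLOAT2 order: the FieldOrder accessor exposes per-element reads
 // and the global reductions defined above.
 //
 //   FieldOrder<double, 3, 4, QUDA_FLOAT2_CLOVER_ORDER> C(clov);
 //   complex<double> c = C(parity, x_cb, s_row, s_col, c_row, c_col);
 //   double l1 = C.norm1();    // sum over elements of |c|
 //   double l2 = C.norm2();    // sum over elements of |c|^2
 //   double mx = C.abs_max();  // largest |c| over the local field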
527 
540  template <typename Float, int length, int N, bool add_rho=false, bool huge_alloc=false>
541  struct FloatNOrder {
542  using Accessor = FloatNOrder<Float, length, N, add_rho, huge_alloc>;
543  using real = typename mapper<Float>::type;
544  typedef typename VectorType<Float, N>::type Vector;
545  typedef typename AllocType<huge_alloc>::type AllocInt;
546  typedef float norm_type;
547  static const int M = length / (N * 2); // number of length-N vectors per chiral block
548  static const int block = length / 2; // chiral block size
549  Float *clover;
550  norm_type *norm;
551  const AllocInt offset; // offset can be 32-bit or 64-bit
552  const AllocInt norm_offset;
553 #ifdef USE_TEXTURE_OBJECTS
554  typedef typename TexVectorType<real, N>::type TexVector;
555  cudaTextureObject_t tex;
556  cudaTextureObject_t normTex;
557  const int tex_offset;
558 #endif
559  const int volumeCB;
560  const int stride;
561 
562  const bool twisted;
563  const real mu2;
564  const real rho;
565 
566  size_t bytes;
567  size_t norm_bytes;
568  void *backup_h;
569  void *backup_norm_h;
570 
571  FloatNOrder(const CloverField &clover, bool is_inverse, Float *clover_ = 0, norm_type *norm_ = 0,
572  bool override = false) :
573  offset(clover.Bytes() / (2 * sizeof(Float))),
574  norm_offset(clover.NormBytes() / (2 * sizeof(norm_type))),
575 #ifdef USE_TEXTURE_OBJECTS
576  tex(0),
577  normTex(0),
578  tex_offset(offset / N),
579 #endif
580  volumeCB(clover.VolumeCB()),
581  stride(clover.Stride()),
582  twisted(clover.Twisted()),
583  mu2(clover.Mu2()),
584  rho(clover.Rho()),
585  bytes(clover.Bytes()),
586  norm_bytes(clover.NormBytes()),
587  backup_h(nullptr),
588  backup_norm_h(nullptr)
589  {
590  this->clover = clover_ ? clover_ : (Float*)(clover.V(is_inverse));
591  this->norm = norm_ ? norm_ : (norm_type *)(clover.Norm(is_inverse));
592 #ifdef USE_TEXTURE_OBJECTS
593  if (clover.Location() == QUDA_CUDA_FIELD_LOCATION) {
594  if (is_inverse) {
595  tex = static_cast<const cudaCloverField&>(clover).InvTex();
596  normTex = static_cast<const cudaCloverField&>(clover).InvNormTex();
597  } else {
598  tex = static_cast<const cudaCloverField&>(clover).Tex();
599  normTex = static_cast<const cudaCloverField&>(clover).NormTex();
600  }
601  if (!huge_alloc && (this->clover != clover.V(is_inverse) ||
602  ((clover.Precision() == QUDA_HALF_PRECISION || clover.Precision() == QUDA_QUARTER_PRECISION) && this->norm != clover.Norm(is_inverse)) ) && !override) {
603  errorQuda("Cannot use texture read since data pointer does not equal field pointer - use with huge_alloc=true instead");
604  }
605  }
606 #endif
607  }
608 
609  bool Twisted() const { return twisted; }
610  real Mu2() const { return mu2; }
611 
622  __device__ __host__ inline clover_wrapper<real, Accessor> operator()(int x_cb, int parity, int chirality)
623  {
624  return clover_wrapper<real, Accessor>(*this, x_cb, parity, chirality);
625  }
626 
637  __device__ __host__ inline const clover_wrapper<real, Accessor> operator()(
638  int x_cb, int parity, int chirality) const
639  {
640  return clover_wrapper<real, Accessor>(const_cast<Accessor &>(*this), x_cb, parity, chirality);
641  }
642 
650  __device__ __host__ inline void load(real v[block], int x, int parity, int chirality) const
651  {
652  norm_type nrm;
653  if (isFixed<Float>::value) {
654 #if defined(USE_TEXTURE_OBJECTS) && defined(__CUDA_ARCH__)
655  nrm = !huge_alloc ? tex1Dfetch_<float>(normTex, parity * norm_offset + chirality * stride + x) :
656  norm[parity * norm_offset + chirality * stride + x];
657 #else
658  nrm = norm[parity * norm_offset + chirality * stride + x];
659 #endif
660  }
661 
662 #pragma unroll
663  for (int i=0; i<M; i++) {
664 #if defined(USE_TEXTURE_OBJECTS) && defined(__CUDA_ARCH__)
665  if (!huge_alloc) { // use textures unless we have a huge alloc
666  // first do texture load from memory
667  TexVector vecTmp = tex1Dfetch_<TexVector>(tex, parity*tex_offset + stride*(chirality*M+i) + x);
668  // now insert into output array
669 #pragma unroll
670  for (int j = 0; j < N; j++) {
671  copy(v[i * N + j], reinterpret_cast<real *>(&vecTmp)[j]);
672  if (isFixed<Float>::value) v[i * N + j] *= nrm;
673  }
674  } else
675 #endif
676  {
677  // first load from memory
678  Vector vecTmp = vector_load<Vector>(clover + parity*offset, x + stride*(chirality*M+i));
679  // second do scalar copy converting into register type
680 #pragma unroll
681  for (int j = 0; j < N; j++) { copy_and_scale(v[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j], nrm); }
682  }
683  }
684 
685  if (add_rho) for (int i=0; i<6; i++) v[i] += rho;
686  }
687 
695  __device__ __host__ inline void save(const real v[block], int x, int parity, int chirality)
696  {
697  real tmp[block];
698 
699  // find the norm of each chiral block
700  if (isFixed<Float>::value) {
701  norm_type scale = 0.0;
702 #pragma unroll
703  for (int i = 0; i < block; i++) scale = fabsf((norm_type)v[i]) > scale ? fabsf((norm_type)v[i]) : scale;
704  norm[parity*norm_offset + chirality*stride + x] = scale;
705 
706 #ifdef __CUDA_ARCH__
707  real scale_inv = __fdividef(fixedMaxValue<Float>::value, scale);
708 #else
709  real scale_inv = fixedMaxValue<Float>::value / scale;
710 #endif
711 #pragma unroll
712  for (int i = 0; i < block; i++) tmp[i] = v[i] * scale_inv;
713  } else {
714 #pragma unroll
715  for (int i = 0; i < block; i++) tmp[i] = v[i];
716  }
717 
718 #pragma unroll
719  for (int i = 0; i < M; i++) {
720  Vector vecTmp;
721  // first do scalar copy converting into storage type
722  for (int j = 0; j < N; j++) copy_scaled(reinterpret_cast<Float *>(&vecTmp)[j], tmp[i * N + j]);
723  // second do vectorized copy into memory
724  vector_store(clover + parity*offset, x + stride*(chirality*M+i), vecTmp);
725  }
726  }
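 // In scalar form, the fixed-point scheme implemented by load()/save() above
 // is, per chiral block (a sketch, assuming short storage):
 //
 //   scale = max_i |v[i]|                                   // written to norm[]
 //   s[i]  = short(v[i] * fixedMaxValue<short>::value / scale)
 //   v[i] ~= s[i] * scale / fixedMaxValue<short>::value     // recovered on load
 //
 // so each chiral block is quantized relative to its own maximum, and
 // copy()/copy_and_scale() in load() undo the quantization.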
727 
735  __device__ __host__ inline void load(real v[length], int x, int parity) const {
736 #pragma unroll
737  for (int chirality = 0; chirality < 2; chirality++) load(&v[chirality * block], x, parity, chirality);
738  }
739 
747  __device__ __host__ inline void save(const real v[length], int x, int parity) {
748 #pragma unroll
749  for (int chirality = 0; chirality < 2; chirality++) save(&v[chirality * block], x, parity, chirality);
750  }
751 
755  void save() {
756  if (backup_h) errorQuda("Already allocated host backup");
757  backup_h = safe_malloc(bytes);
758  cudaMemcpy(backup_h, clover, bytes, cudaMemcpyDeviceToHost);
759  if (norm_bytes) {
760  backup_norm_h = safe_malloc(norm_bytes);
761  cudaMemcpy(backup_norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost);
762  }
763  checkCudaError();
764  }
765 
769  void load() {
770  cudaMemcpy(clover, backup_h, bytes, cudaMemcpyHostToDevice);
771  host_free(backup_h);
772  backup_h = nullptr;
773  if (norm_bytes) {
774  cudaMemcpy(norm, backup_norm_h, norm_bytes, cudaMemcpyHostToDevice);
775  host_free(backup_norm_h);
776  backup_norm_h = nullptr;
777  }
778  checkCudaError();
779  }
780 
781  size_t Bytes() const {
782  size_t bytes = length*sizeof(Float);
783  if (isFixed<Float>::value) bytes += 2 * sizeof(norm_type);
784  return bytes;
785  }
786  };
787 
794  template <typename real, int length> struct S { real v[length]; };
795 
799  template <typename Float, int length>
800  struct QDPOrder {
801  typedef typename mapper<Float>::type RegType;
802  Float *clover;
803  const int volumeCB;
804  const int stride;
805  const int offset;
806 
807  const bool twisted;
808  const Float mu2;
809 
810  QDPOrder(const CloverField &clover, bool inverse, Float *clover_=0)
811  : volumeCB(clover.VolumeCB()), stride(volumeCB), offset(clover.Bytes()/(2*sizeof(Float))),
812  twisted(clover.Twisted()), mu2(clover.Mu2()) {
813  this->clover = clover_ ? clover_ : (Float*)(clover.V(inverse));
814  }
815 
816  bool Twisted() const {return twisted;}
817  Float Mu2() const {return mu2;}
818 
819  __device__ __host__ inline void load(RegType v[length], int x, int parity) const {
820  // factor of 0.5 comes from basis change
821 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
822  typedef S<Float,length> structure;
823  trove::coalesced_ptr<structure> clover_((structure*)clover);
824  structure v_ = clover_[parity*volumeCB + x];
825  for (int i=0; i<length; i++) v[i] = 0.5*(RegType)v_.v[i];
826 #else
827  for (int i=0; i<length; i++) v[i] = 0.5*clover[parity*offset + x*length+i];
828 #endif
829  }
830 
831  __device__ __host__ inline void save(const RegType v[length], int x, int parity) {
832 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
833  typedef S<Float,length> structure;
834  trove::coalesced_ptr<structure> clover_((structure*)clover);
835  structure v_;
836  for (int i=0; i<length; i++) v_.v[i] = 2.0*(Float)v[i];
837  clover_[parity*volumeCB + x] = v_;
838 #else
839  for (int i=0; i<length; i++) clover[parity*offset + x*length+i] = 2.0*v[i];
840 #endif
841  }
842 
843  size_t Bytes() const { return length*sizeof(Float); }
844  };
845 
849  template <typename Float, int length>
850  struct QDPJITOrder {
851  typedef typename mapper<Float>::type RegType;
852  Float *diag;
853  Float *offdiag;
854  const int volumeCB;
855  const int stride;
856 
857  const bool twisted;
858  const Float mu2;
859 
860  QDPJITOrder(const CloverField &clover, bool inverse, Float *clover_=0)
861  : volumeCB(clover.VolumeCB()), stride(volumeCB), twisted(clover.Twisted()), mu2(clover.Mu2()) {
862  offdiag = clover_ ? ((Float**)clover_)[0] : ((Float**)clover.V(inverse))[0];
863  diag = clover_ ? ((Float**)clover_)[1] : ((Float**)clover.V(inverse))[1];
864  }
865 
866  bool Twisted() const {return twisted;}
867  Float Mu2() const {return mu2;}
868 
869  __device__ __host__ inline void load(RegType v[length], int x, int parity) const {
870  // the factor of 0.5 comes from a basis change
871  for (int chirality=0; chirality<2; chirality++) {
872  // set diagonal elements
873  for (int i=0; i<6; i++) {
874  v[chirality*36 + i] = 0.5*diag[((i*2 + chirality)*2 + parity)*volumeCB + x];
875  }
876 
877  // the off diagonal elements
878  for (int i=0; i<30; i++) {
879  int z = i%2;
880  int off = i/2;
881  const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
882  v[chirality*36 + 6 + i] = 0.5*offdiag[(((z*15 + idtab[off])*2 + chirality)*2 + parity)*volumeCB + x];
883  }
884 
885  }
886  }
887 
888  __device__ __host__ inline void save(const RegType v[length], int x, int parity) {
889  // the factor of 2.0 comes from undoing the basis change
890  for (int chirality=0; chirality<2; chirality++) {
891  // set diagonal elements
892  for (int i=0; i<6; i++) {
893  diag[((i*2 + chirality)*2 + parity)*volumeCB + x] = 2.0*v[chirality*36 + i];
894  }
895 
896  // the off diagonal elements
897  for (int i=0; i<30; i++) {
898  int z = i%2;
899  int off = i/2;
900  const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
901  offdiag[(((z*15 + idtab[off])*2 + chirality)*2 + parity)*volumeCB + x] = 2.0*v[chirality*36 + 6 + i];
902  }
903  }
904  }
905 
906  size_t Bytes() const { return length*sizeof(Float); }
907  };
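 // Layout implied by the index arithmetic above: QDPJIT keeps the diagonal and
 // off-diagonal parts in separate arrays, ordered slowest to fastest as
 //
 //   diag   [i][chirality][parity][x]           i    = 0..5
 //   offdiag[reim][k][chirality][parity][x]     reim = 0,1;  k = 0..14
 //
 // with idtab[] translating the packed lower-triangle position used elsewhere
 // in this file into the off-diagonal index k expected by QDPJIT.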
908 
909 
916  template <typename Float, int length>
917  struct BQCDOrder {
918  typedef typename mapper<Float>::type RegType;
919  Float *clover[2];
920  const int volumeCB;
921  const int stride;
922 
923  const bool twisted;
924  const Float mu2;
925 
926  BQCDOrder(const CloverField &clover, bool inverse, Float *clover_=0)
927  : volumeCB(clover.Stride()), stride(volumeCB), twisted(clover.Twisted()), mu2(clover.Mu2()) {
928  this->clover[0] = clover_ ? clover_ : (Float*)(clover.V(inverse));
929  this->clover[1] = (Float*)((char*)this->clover[0] + clover.Bytes()/2);
930  }
931 
932 
933  bool Twisted() const {return twisted;}
934  Float Mu2() const {return mu2;}
935 
941  __device__ __host__ inline void load(RegType v[length], int x, int parity) const {
942  int bq[36] = { 21, 32, 33, 0, 1, 20, // diagonal
943  28, 29, 30, 31, 6, 7, 14, 15, 22, 23, // column 1 6
944  34, 35, 8, 9, 16, 17, 24, 25, // column 2 16
945  10, 11, 18, 19, 26, 27, // column 3 24
946  2, 3, 4, 5, // column 4 30
947  12, 13};
948 
949  // flip the sign of the imaginary components
950  int sign[36];
951  for (int i=0; i<6; i++) sign[i] = 1;
952  for (int i=6; i<36; i+=2) {
953  if ( (i >= 10 && i<= 15) || (i >= 18 && i <= 29) ) { sign[i] = -1; sign[i+1] = -1; }
954  else { sign[i] = 1; sign[i+1] = -1; }
955  }
956 
957  const int M=length/2;
958  for (int chirality=0; chirality<2; chirality++)
959  for (int i=0; i<M; i++)
960  v[chirality*M+i] = sign[i] * clover[parity][x*length+chirality*M+bq[i]];
961 
962  }
963 
964  // FIXME implement the save routine for BQCD ordered fields
965  __device__ __host__ inline void save(RegType v[length], int x, int parity) {
966 
967  };
968 
969  size_t Bytes() const { return length*sizeof(Float); }
970  };
971 
972  } // namespace clover
973 
974  // Use traits to reduce the template explosion
975  template<typename Float,int N=72, bool add_rho=false> struct clover_mapper { };
976 
977  // double precision uses Float2
978  template<int N, bool add_rho> struct clover_mapper<double,N,add_rho> { typedef clover::FloatNOrder<double, N, 2, add_rho> type; };
979 
980  // single precision uses Float4
981  template<int N, bool add_rho> struct clover_mapper<float,N,add_rho> { typedef clover::FloatNOrder<float, N, 4, add_rho> type; };
982 
983  // half precision uses Float4
984  template<int N, bool add_rho> struct clover_mapper<short,N,add_rho> { typedef clover::FloatNOrder<short, N, 4, add_rho> type; };
985 
986  // quarter precision uses Float4
987  template<int N, bool add_rho> struct clover_mapper<char,N,add_rho> { typedef clover::FloatNOrder<char, N, 4, add_rho> type; };
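 // Usage sketch: kernels normally select the native ordering through this
 // trait rather than naming FloatNOrder directly, e.g. (assuming a CloverField
 // instance named "clov"):
 //
 //   using Clover = typename clover_mapper<short>::type;  // FloatNOrder<short, 72, 4>
 //   Clover clover_accessor(clov, /* is_inverse = */ false);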
988 
989 } // namespace quda
990 
991 #endif //_CLOVER_ORDER_H
992 
993 