QUDA  v1.1.0
A library for QCD on GPUs
quda_matrix.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include <cstdio>
4 #include <cstdlib>
5 #include <iostream>
6 #include <iomanip>
7 
8 #include <register_traits.h>
9 #include <float_vector.h>
10 #include <complex_quda.h>
11 
12 namespace quda {
13 
14 
// Helper returning the additive identity ("zero") for type T;
// specialized below for float2 and double2.
template<class T>
struct Zero
{
  //static const T val;
  __device__ __host__ inline
  static T val();
};
22 
// Zero specialization for float2: (0, 0).
// NOTE(review): the member-function signature line was lost in extraction
// and has been reconstructed from the surrounding specializations.
template<>
__device__ __host__ inline
float2 Zero<float2>::val()
{
  return make_float2(0.,0.);
}
29 
// Zero specialization for double2: (0, 0).
// NOTE(review): reconstructed signature line (lost in extraction).
template<>
__device__ __host__ inline
double2 Zero<double2>::val()
{
  return make_double2(0.,0.);
}
36 
// Helper returning the multiplicative identity ("one") for type T;
// specialized below for float2 and double2.
template<class T>
struct Identity
{
  __device__ __host__ inline
  static T val();
};
43 
// Identity specialization for float2: (1, 0), i.e. complex one.
// NOTE(review): reconstructed signature line (lost in extraction).
template<>
__device__ __host__ inline
float2 Identity<float2>::val(){
  return make_float2(1.,0.);
}
49 
// Identity specialization for double2: (1, 0), i.e. complex one.
// NOTE(review): reconstructed signature line (lost in extraction).
template<>
__device__ __host__ inline
double2 Identity<double2>::val(){
  return make_double2(1.,0.);
}
55 
56  template<typename Float, typename T> struct gauge_wrapper;
57  template<typename Float, typename T> struct gauge_ghost_wrapper;
58  template<typename Float, typename T> struct clover_wrapper;
59  template <typename T, int N> class HMatrix;
60 
/**
 * Generic dense NxN matrix with element type T (real or complex
 * vector type), stored row-major in a flat register array.
 */
template<class T, int N>
class Matrix
{
  typedef typename RealType<T>::type real;

  private:
    // row-major flattening of element (i,j)
    __device__ __host__ inline int index(int i, int j) const { return i*N + j; }

  public:
    T data[N*N];

    __device__ __host__ constexpr int size() const { return N; }

    // default construction zero-fills via setZero()
    __device__ __host__ inline Matrix() { setZero(this); }

    __device__ __host__ inline Matrix(const Matrix<T, N> &a)
    {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = a.data[i];
    }

    // converting copy from a matrix with a different element type
    template <class U> __device__ __host__ inline Matrix(const Matrix<U, N> &a)
    {
#pragma unroll
      for (int i = 0; i < N * N; i++) data[i] = a.data[i];
    }

    __device__ __host__ inline Matrix(const T data_[])
    {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = data_[i];
    }

    // expand a triangular-packed Hermitian matrix into dense form
    __device__ __host__ inline Matrix(const HMatrix<real, N> &a);

    __device__ __host__ inline T const & operator()(int i, int j) const {
      return data[index(i,j)];
    }

    __device__ __host__ inline T & operator()(int i, int j) {
      return data[index(i,j)];
    }

    // linear indexing: i%N is the row and i/N the column, so this
    // walks the matrix in column-major order
    __device__ __host__ inline T const & operator()(int i) const {
      int j = i % N;
      int k = i / N;
      return data[index(j,k)];
    }

    __device__ __host__ inline T& operator()(int i) {
      int j = i % N;
      int k = i / N;
      return data[index(j,k)];
    }

    template<class U>
    __device__ __host__ inline void operator=(const Matrix<U,N> & b) {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = b.data[i];
    }

    // construction/assignment from field accessor wrappers (defined elsewhere)
    template<typename S>
    __device__ __host__ inline Matrix(const gauge_wrapper<real, S> &s);

    template<typename S>
    __device__ __host__ inline void operator=(const gauge_wrapper<real, S> &s);

    template<typename S>
    __device__ __host__ inline Matrix(const gauge_ghost_wrapper<real, S> &s);

    template<typename S>
    __device__ __host__ inline void operator=(const gauge_ghost_wrapper<real, S> &s);

    /** L1 norm: maximum of the absolute column sums. */
    __device__ __host__ inline real L1() {
      real l1 = 0;
#pragma unroll
      for (int j=0; j<N; j++) {
        real col_sum = 0;
#pragma unroll
        for (int i=0; i<N; i++) {
          col_sum += abs(data[i*N + j]);
        }
        l1 = col_sum > l1 ? col_sum : l1;
      }
      return l1;
    }

    /** Frobenius norm (an upper bound on the spectral L2 norm). */
    __device__ __host__ inline real L2() {
      real l2 = 0;
#pragma unroll
      for (int j=0; j<N; j++) {
#pragma unroll
        for (int i=0; i<N; i++) {
          l2 += norm(data[i*N + j]);
        }
      }
      return sqrt(l2);
    }

    /** Linfinity norm: maximum of the absolute row sums. */
    __device__ __host__ inline real Linf() {
      real linf = 0;
#pragma unroll
      for (int i=0; i<N; i++) {
        real row_sum = 0;
#pragma unroll
        for (int j=0; j<N; j++) {
          row_sum += abs(data[i*N + j]);
        }
        linf = row_sum > linf ? row_sum : linf;
      }
      return linf;
    }

    /** XOR-fold the raw matrix bytes into a 64-bit checksum. */
    __device__ __host__ inline uint64_t checksum() const {
      // ensure length is rounded up to 64-bit multiple
      constexpr int length = (N*N*sizeof(T) + sizeof(uint64_t) - 1)/ sizeof(uint64_t);
      uint64_t base[length] = { };
      memcpy(base, data, N * N * sizeof(T));
      uint64_t checksum_ = base[0];
      for (int i=1; i<length; i++) checksum_ ^= base[i];
      return checksum_;
    }

    /**
     * Check unitarity: every element of U^dag U must match the identity
     * to within max_error, and no element may be NaN.  Assumes T is a
     * complex type (uses .real()/.imag()).
     */
    __device__ __host__ inline bool isUnitary(double max_error) const
    {
      const auto identity = conj(*this) * *this;

#pragma unroll
      for (int i=0; i<N; ++i){
        if( fabs(identity(i,i).real() - 1.0) > max_error ||
            fabs(identity(i,i).imag()) > max_error) return false;

#pragma unroll
        for (int j=i+1; j<N; ++j){
          if( fabs(identity(i,j).real()) > max_error ||
              fabs(identity(i,j).imag()) > max_error ||
              fabs(identity(j,i).real()) > max_error ||
              fabs(identity(j,i).imag()) > max_error ){
            return false;
          }
        }
      }

      // reject matrices containing NaNs outright
#pragma unroll
      for (int i=0; i<N; i++) {
#pragma unroll
        for (int j=0; j<N; j++) {
          if (std::isnan((*this)(i,j).real()) ||
              std::isnan((*this)(i,j).imag())) return false;
        }
      }

      return true;
    }

  };
237 
/**
 * Proxy providing writable access to element (i,j) of a
 * triangular-packed Hermitian matrix.  Diagonal writes store only the
 * real part; upper-triangle writes store the conjugated value so the
 * packed storage stays Hermitian.
 * NOTE(review): the struct declaration line was lost in extraction and
 * has been reconstructed from the usage in HMatrix::operator().
 */
template <typename T, typename Hmat>
struct HMatrix_wrapper {
  Hmat &mat;
  const int i;
  const int j;
  const int idx; // precomputed packed index for (i,j)

  __device__ __host__ inline HMatrix_wrapper(Hmat &mat, int i, int j, int idx) : mat(mat), i(i), j(j), idx(idx) { }

  __device__ __host__ inline void operator=(const complex<T> &a) {
    if (i==j) {
      mat.data[idx] = a.real();
    } else if (j<i) {
      mat.data[idx+0] = a.real();
      mat.data[idx+1] = a.imag();
    } else {
      // upper triangle aliases the lower triangle, conjugated
      mat.data[idx+0] = a.real();
      mat.data[idx+1] = -a.imag();
    }
  }

  __device__ __host__ inline void operator+=(const complex<T> &a) {
    if (i==j) {
      mat.data[idx] += a.real();
    } else if (j<i) {
      mat.data[idx+0] += a.real();
      mat.data[idx+1] += a.imag();
    } else {
      mat.data[idx+0] += a.real();
      mat.data[idx+1] += -a.imag();
    }
  }
};
276 
/**
 * Specialized container for Hermitian matrices (e.g., used for
 * wrapping clover matrices).  Storage is a real-valued array: the N
 * real diagonal entries first, then (real, imag) pairs for the
 * off-diagonal elements.
 */
template<class T, int N>
class HMatrix
{
  private:
    // compute index into triangular-packed Hermitian matrix
    __device__ __host__ inline int index(int i, int j) const {
      if (i==j) {
        return i;
      } else if (j<i) {
        int k = N*(N-1)/2 - (N-j)*(N-j-1)/2 + i - j - 1;
        return N + 2*k;
      } else { // i>j
        // switch coordinates to count from bottom right instead of top left of matrix
        int k = N*(N-1)/2 - (N-i)*(N-i-1)/2 + j - i - 1;
        return N + 2*k;
      }
    }

  public:
    T data[N*N]; // store in real-valued array

    __device__ __host__ inline HMatrix() {
#pragma unroll
      for (int i = 0; i < N * N; i++) data[i] = (T)0.0;
    }

    __device__ __host__ inline HMatrix(const HMatrix<T,N> &a) {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = a.data[i];
    }

    __device__ __host__ inline HMatrix(const T data_[]) {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = data_[i];
    }

    /** Read-only access: reconstructs the complex value from packed storage. */
    __device__ __host__ inline complex<T> const operator()(int i, int j) const {
      const int idx = index(i,j);
      if (i==j) {
        return complex<T>(data[idx],0.0);
      } else if (j<i) {
        return complex<T>(data[idx], data[idx+1]);
      } else {
        // upper triangle is the conjugate of the stored lower triangle
        return complex<T>(data[idx],-data[idx+1]);
      }
    }

    /** Writable access via a proxy that preserves Hermiticity. */
    __device__ __host__ inline HMatrix_wrapper<T,HMatrix<T,N> > operator() (int i, int j) {
      return HMatrix_wrapper<T,HMatrix<T,N> >(*this, i, j, index(i,j));
    }

    template<class U>
    __device__ __host__ inline void operator=(const HMatrix<U,N> & b) {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = b.data[i];
    }

    // construction/assignment from a clover-field accessor (defined elsewhere)
    template<typename S>
    __device__ __host__ inline HMatrix(const clover_wrapper<T, S> &s);

    template<typename S>
    __device__ __host__ inline void operator=(const clover_wrapper<T, S> &s);

    /**
     * Hermitian matrix square: returns (*this)^2, expanded into real
     * arithmetic on the components.  Only the upper triangle (i<=k) is
     * computed; the proxy assignment fills the packed storage.
     */
    __device__ __host__ inline HMatrix<T,N> square() const {
      HMatrix<T,N> result;
      complex<T> tmp;
#pragma unroll
      for (int i=0; i<N; i++) {
#pragma unroll
        for (int k=0; k<N; k++) if (i<=k) { // else compiler can't handle triangular unroll
          tmp.x = (*this)(i,0).real() * (*this)(0,k).real();
          tmp.x -= (*this)(i,0).imag() * (*this)(0,k).imag();
          tmp.y = (*this)(i,0).real() * (*this)(0,k).imag();
          tmp.y += (*this)(i,0).imag() * (*this)(0,k).real();
#pragma unroll
          for (int j=1; j<N; j++) {
            tmp.x += (*this)(i,j).real() * (*this)(j,k).real();
            tmp.x -= (*this)(i,j).imag() * (*this)(j,k).imag();
            tmp.y += (*this)(i,j).real() * (*this)(j,k).imag();
            tmp.y += (*this)(i,j).imag() * (*this)(j,k).real();
          }
          result(i,k) = tmp;
        }
      }
      return result;
    }

    /** Compute the absolute max element of the Hermitian matrix. */
    __device__ __host__ inline T max() const
    {
      // fix: removed an unused local HMatrix<T,N> temporary
      T max = static_cast<T>(0.0);
#pragma unroll
      for (int i = 0; i < N * N; i++) max = (abs(data[i]) > max ? abs(data[i]) : max);
      return max;
    }

    __device__ __host__ void print() const {
      for (int i=0; i<N; i++) {
        printf("i=%d ", i);
        for (int j=0; j<N; j++) {
          printf(" (%e, %e)", (*this)(i,j).real(), (*this)(i,j).imag());
        }
        printf("\n");
      }
      printf("\n");
    }

  };
397 
/** Unpack a triangular-packed Hermitian matrix into dense Matrix form. */
template<class T,int N>
__device__ __host__ Matrix<T,N>::Matrix(const HMatrix<typename RealType<T>::type,N> &a) {
#pragma unroll
  for (int row=0; row<N; row++) {
#pragma unroll
    for (int col=0; col<N; col++) { (*this)(row,col) = a(row,col); }
  }
}
408 
/** Trace of a 3x3 matrix: the sum of its diagonal elements. */
template<class T>
__device__ __host__ inline T getTrace(const Matrix<T,3>& a)
{
  const T tr = a(0,0) + a(1,1) + a(2,2);
  return tr;
}
414 
415 
/** Determinant of a 3x3 matrix via cofactor expansion along the first row. */
template< template<typename,int> class Mat, class T>
__device__ __host__ inline T getDeterminant(const Mat<T,3> & a){
  const T minor00 = a(1,1)*a(2,2) - a(2,1)*a(1,2);
  const T minor01 = a(1,0)*a(2,2) - a(1,2)*a(2,0);
  const T minor02 = a(1,0)*a(2,1) - a(1,1)*a(2,0);
  return a(0,0)*minor00 - a(0,1)*minor01 + a(0,2)*minor02;
}
426 
/** Element-wise sum of two matrices. */
template< template<typename,int> class Mat, class T, int N>
__device__ __host__ inline Mat<T,N> operator+(const Mat<T,N> & a, const Mat<T,N> & b)
{
  Mat<T,N> sum;
#pragma unroll
  for (int idx=0; idx<N*N; idx++) sum.data[idx] = a.data[idx] + b.data[idx];
  return sum;
}
435 
436 
/** In-place element-wise addition; returns a copy of the updated matrix. */
template< template<typename,int> class Mat, class T, int N>
__device__ __host__ inline Mat<T,N> operator+=(Mat<T,N> & a, const Mat<T,N> & b)
{
#pragma unroll
  for (int idx=0; idx<N*N; idx++) a.data[idx] += b.data[idx];
  return a;
}
444 
/** Add a scalar to every diagonal element (a += b*I); returns a copy. */
template< template<typename,int> class Mat, class T, int N>
__device__ __host__ inline Mat<T,N> operator+=(Mat<T,N> & a, const T & b)
{
#pragma unroll
  for (int d=0; d<N; d++) a(d,d) += b;
  return a;
}
452 
/** In-place element-wise subtraction; returns a copy of the updated matrix. */
template< template<typename,int> class Mat, class T, int N>
__device__ __host__ inline Mat<T,N> operator-=(Mat<T,N> & a, const Mat<T,N> & b)
{
#pragma unroll
  for (int idx=0; idx<N*N; idx++) a.data[idx] -= b.data[idx];
  return a;
}
460 
/** Element-wise difference of two matrices. */
template< template<typename,int> class Mat, class T, int N>
__device__ __host__ inline Mat<T,N> operator-(const Mat<T,N> & a, const Mat<T,N> & b)
{
  Mat<T,N> diff;
#pragma unroll
  for (int idx=0; idx<N*N; idx++) diff.data[idx] = a.data[idx] - b.data[idx];
  return diff;
}
469 
/** Left scalar multiplication: every element scaled by scalar. */
template< template<typename,int> class Mat, class T, int N, class S>
__device__ __host__ inline Mat<T,N> operator*(const S & scalar, const Mat<T,N> & a){
  Mat<T,N> scaled;
#pragma unroll
  for (int idx=0; idx<N*N; ++idx) scaled.data[idx] = scalar*a.data[idx];
  return scaled;
}
477 
/** Right scalar multiplication; delegates to the left-scalar overload. */
template< template<typename,int> class Mat, class T, int N, class S>
__device__ __host__ inline Mat<T,N> operator*(const Mat<T,N> & a, const S & scalar){
  return scalar * a;
}
482 
/** In-place scalar multiplication; returns a copy of the updated matrix. */
template< template<typename,int> class Mat, class T, int N, class S>
__device__ __host__ inline Mat<T,N> operator *=(Mat<T,N> & a, const S & scalar){
  Mat<T,N> scaled = scalar*a;
  a = scaled;
  return a;
}
488 
/** Unary negation: every element negated. */
template< template<typename,int> class Mat, class T, int N>
__device__ __host__ inline Mat<T,N> operator-(const Mat<T,N> & a){
  Mat<T,N> negated;
#pragma unroll
  for (int idx=0; idx<(N*N); ++idx) negated.data[idx] = -a.data[idx];
  return negated;
}
496 
497 
/** Generic matrix multiplication: result = a * b. */
template< template<typename,int> class Mat, class T, int N>
__device__ __host__ inline Mat<T,N> operator*(const Mat<T,N> &a, const Mat<T,N> &b)
{
  Mat<T,N> product;
#pragma unroll
  for (int row=0; row<N; row++) {
#pragma unroll
    for (int col=0; col<N; col++) {
      // seed with the j=0 term, then accumulate the rest of the dot product
      T acc = a(row,0) * b(0,col);
#pragma unroll
      for (int j=1; j<N; j++) {
        acc += a(row,j) * b(j,col);
      }
      product(row,col) = acc;
    }
  }
  return product;
}
518 
/**
 * Matrix multiplication specialized for complex element types,
 * expanded into real arithmetic on the .x/.y components (avoids
 * temporaries from a generic complex operator*).
 */
template< template<typename> class complex, typename T, int N>
__device__ __host__ inline Matrix<complex<T>,N> operator*(const Matrix<complex<T>,N> &a, const Matrix<complex<T>,N> &b)
{
  Matrix<complex<T>,N> result;
#pragma unroll
  for (int i=0; i<N; i++) {
#pragma unroll
    for (int k=0; k<N; k++) {
      // initialize with the j=0 term of the dot product
      result(i,k).x = a(i,0).real() * b(0,k).real();
      result(i,k).x -= a(i,0).imag() * b(0,k).imag();
      result(i,k).y = a(i,0).real() * b(0,k).imag();
      result(i,k).y += a(i,0).imag() * b(0,k).real();
#pragma unroll
      for (int j=1; j<N; j++) {
        result(i,k).x += a(i,j).real() * b(j,k).real();
        result(i,k).x -= a(i,j).imag() * b(j,k).imag();
        result(i,k).y += a(i,j).real() * b(j,k).imag();
        result(i,k).y += a(i,j).imag() * b(j,k).real();
      }
    }
  }
  return result;
}
545 
/** In-place matrix multiplication: a = a * b; returns a copy of the result. */
template<class T, int N>
__device__ __host__ inline Matrix<T,N> operator *=(Matrix<T,N> & a, const Matrix<T,N>& b){
  // compute into a temporary first so a's old values are not clobbered mid-product
  Matrix<T,N> product = a * b;
  a = product;
  return a;
}
553 
554 
  // This is so that I can multiply real and complex matrices
/**
 * Mixed-type matrix multiplication (e.g. real times complex); the
 * result element type is the promotion of T and U.
 * NOTE(review): the declaration of `result` was lost in extraction and
 * has been reconstructed from the function's return type.
 */
template <class T, class U, int N>
__device__ __host__ inline Matrix<typename PromoteTypeId<T, U>::type, N> operator*(const Matrix<T, N> &a,
                                                                                   const Matrix<U, N> &b)
{
  Matrix<typename PromoteTypeId<T, U>::type, N> result;
#pragma unroll
  for (int i=0; i<N; i++) {
#pragma unroll
    for (int k=0; k<N; k++) {
      result(i,k) = a(i,0) * b(0,k);
#pragma unroll
      for (int j=1; j<N; j++) {
        result(i,k) += a(i,j) * b(j,k);
      }
    }
  }
  return result;
}
574 
/**
 * Fully unrolled 2x2 matrix multiplication.
 * NOTE(review): the signature line was lost in extraction and has been
 * reconstructed from the body (Matrix<T,2> operands).
 */
template<class T>
__device__ __host__ inline
Matrix<T,2> operator*(const Matrix<T,2> &a, const Matrix<T,2> &b)
{
  Matrix<T,2> result;
  result(0,0) = a(0,0)*b(0,0) + a(0,1)*b(1,0);
  result(0,1) = a(0,0)*b(0,1) + a(0,1)*b(1,1);
  result(1,0) = a(1,0)*b(0,0) + a(1,1)*b(1,0);
  result(1,1) = a(1,0)*b(0,1) + a(1,1)*b(1,1);
  return result;
}
586 
587 
/** Conjugate transpose (Hermitian adjoint) of a matrix. */
template<class T, int N>
__device__ __host__ inline
Matrix<T,N> conj(const Matrix<T,N> & other){
  Matrix<T,N> adjoint;
#pragma unroll
  for (int row=0; row<N; ++row){
#pragma unroll
    for (int col=0; col<N; ++col){
      // transpose and conjugate in one step
      adjoint(row,col) = conj( other(col,row) );
    }
  }
  return adjoint;
}
601 
602 
/**
 * Invert a 3x3 matrix via the adjugate (cofactor) formula.
 * @param u matrix to invert — assumed non-singular, no det==0 check
 * @return the inverse of u
 * NOTE(review): the signature line was lost in extraction; it is
 * reconstructed to match the documented prototype
 * Matrix<T,3> inverse(const Matrix<T,3> &u).
 */
template<class T>
__device__ __host__ inline
Matrix<T,3> inverse(const Matrix<T,3> &u)
{
  const T det = getDeterminant(u);
  const T det_inv = static_cast<typename T::value_type>(1.0)/det;
  Matrix<T,3> uinv;

  T temp;

  temp = u(1,1)*u(2,2) - u(1,2)*u(2,1);
  uinv(0,0) = (det_inv*temp);

  temp = u(0,2)*u(2,1) - u(0,1)*u(2,2);
  uinv(0,1) = (temp*det_inv);

  temp = u(0,1)*u(1,2) - u(0,2)*u(1,1);
  uinv(0,2) = (temp*det_inv);

  temp = u(1,2)*u(2,0) - u(1,0)*u(2,2);
  uinv(1,0) = (det_inv*temp);

  temp = u(0,0)*u(2,2) - u(0,2)*u(2,0);
  uinv(1,1) = (temp*det_inv);

  temp = u(0,2)*u(1,0) - u(0,0)*u(1,2);
  uinv(1,2) = (temp*det_inv);

  temp = u(1,0)*u(2,1) - u(1,1)*u(2,0);
  uinv(2,0) = (det_inv*temp);

  temp = u(0,1)*u(2,0) - u(0,0)*u(2,1);
  uinv(2,1) = (temp*det_inv);

  temp = u(0,0)*u(1,1) - u(0,1)*u(1,0);
  uinv(2,2) = (temp*det_inv);

  return uinv;
}
642 
643 
644 
/**
 * Set *m to the identity matrix (generic element type).
 * NOTE(review): the signature line was lost in extraction; reconstructed
 * from the usage setIdentity(&UnitM) in exponentiate_iQ.
 */
template<class T, int N>
__device__ __host__ inline
void setIdentity(Matrix<T,N>* m){

#pragma unroll
  for (int i=0; i<N; ++i){
    (*m)(i,i) = 1;
#pragma unroll
    for (int j=i+1; j<N; ++j){
      (*m)(i,j) = (*m)(j,i) = 0;
    }
  }
}
658 
659 
/**
 * setIdentity specialization for float2 elements (complex one on the
 * diagonal).  NOTE(review): signature line reconstructed (lost in
 * extraction).
 */
template<int N>
__device__ __host__ inline
void setIdentity(Matrix<float2,N>* m){

#pragma unroll
  for (int i=0; i<N; ++i){
    (*m)(i,i) = make_float2(1,0);
#pragma unroll
    for (int j=i+1; j<N; ++j){
      (*m)(i,j) = (*m)(j,i) = make_float2(0.,0.);
    }
  }
}
673 
674 
/**
 * setIdentity specialization for double2 elements.
 * NOTE(review): signature line reconstructed (lost in extraction).
 */
template<int N>
__device__ __host__ inline
void setIdentity(Matrix<double2,N>* m){

#pragma unroll
  for (int i=0; i<N; ++i){
    (*m)(i,i) = make_double2(1,0);
#pragma unroll
    for (int j=i+1; j<N; ++j){
      (*m)(i,j) = (*m)(j,i) = make_double2(0.,0.);
    }
  }
}
688 
689 
690  // Need to write more generic code for this!
/**
 * Set all elements of *m to zero (generic element type).
 * NOTE(review): signature line reconstructed (lost in extraction) from
 * the usage setZero(this) in the Matrix default constructor.
 */
template<class T, int N>
__device__ __host__ inline
void setZero(Matrix<T,N>* m){

#pragma unroll
  for (int i=0; i<N; ++i){
#pragma unroll
    for (int j=0; j<N; ++j){
      (*m)(i,j) = 0;
    }
  }
}
703 
704 
/**
 * setZero specialization for float2 elements.
 * NOTE(review): signature line reconstructed (lost in extraction).
 */
template<int N>
__device__ __host__ inline
void setZero(Matrix<float2,N>* m){

#pragma unroll
  for (int i=0; i<N; ++i){
#pragma unroll
    for (int j=0; j<N; ++j){
      (*m)(i,j) = make_float2(0.,0.);
    }
  }
}
717 
718 
/**
 * setZero specialization for double2 elements.
 * NOTE(review): signature line reconstructed (lost in extraction).
 */
template<int N>
__device__ __host__ inline
void setZero(Matrix<double2,N>* m){

#pragma unroll
  for (int i=0; i<N; ++i){
#pragma unroll
    for (int j=0; j<N; ++j){
      (*m)(i,j) = make_double2(0.,0.);
    }
  }
}
731 
732 
/**
 * Overwrite m with its traceless anti-Hermitian projection:
 * m -> 0.5*(m - m^dag) with the imaginary trace removed from the diagonal.
 */
template<typename Complex,int N>
__device__ __host__ inline void makeAntiHerm(Matrix<Complex,N> &m) {
  typedef typename Complex::value_type real;
  // first make the matrix anti-hermitian
  Matrix<Complex,N> am = m - conj(m);

  // second make it traceless
  real imag_trace = 0.0;
#pragma unroll
  for (int i=0; i<N; i++) imag_trace += am(i,i).y;
#pragma unroll
  for (int i=0; i<N; i++) {
    am(i,i).y -= imag_trace/N;
  }
  // halve since am = m - m^dag double-counts the anti-Hermitian part
  m = 0.5*am;
}
749 
/**
 * Overwrite m with its traceless Hermitian projection: build the
 * traceless anti-Hermitian part of -m, then rotate by i/2 to obtain a
 * Hermitian result.
 */
template <typename Complex, int N> __device__ __host__ inline void makeHerm(Matrix<Complex, N> &m)
{
  typedef typename Complex::value_type real;
  // first make the matrix anti-hermitian
  Matrix<Complex, N> am = conj(m) - m;

  // second make it traceless
  real imag_trace = 0.0;
#pragma unroll
  for (int i = 0; i < N; i++) imag_trace += am(i, i).y;
#pragma unroll
  for (int i = 0; i < N; i++) { am(i, i).y -= imag_trace / N; }
  // third scale out anti hermitian part
  Complex i_2(0.0, 0.5);
  m = i_2 * am;
}
766 
767  // Matrix and array are very similar
768  // Maybe I should factor out the similar
769  // code. However, I want to make sure that
770  // the compiler knows to store the
771  // data elements in registers, so I won't do
772  // it right now.
  // Fixed-size array of N elements of type T, kept as a separate class
  // from Matrix (see the note above this class in the original source
  // about keeping elements in registers).
  template<class T, int N>
  class Array
  {
    private:
      T data[N];

    public:
      // access function: read-only element access
      __device__ __host__ inline
      T const & operator[](int i) const{
        return data[i];
      }

      // assignment function: mutable element access
      __device__ __host__ inline
      T & operator[](int i){
        return data[i];
      }
  };
792 
793 
/** Copy column c of matrix m into the array *a. */
template<class T, int N>
__device__ __host__ inline
void copyColumn(const Matrix<T,N>& m, int c, Array<T,N>* a)
{
#pragma unroll
  for (int row=0; row<N; ++row){
    (*a)[row] = m(row,c); // c is the column index
  }
}
803 
804 
805  // Need some print utilities
/** Stream a matrix row by row, space-separated, rows on separate lines. */
template<class T, int N>
std::ostream & operator << (std::ostream & os, const Matrix<T,N> & m){
#pragma unroll
  for (int row=0; row<N; ++row){
#pragma unroll
    for (int col=0; col<N; ++col){
      os << m(row,col) << " ";
    }
    // no trailing newline after the final row
    if (row < N-1) os << std::endl;
  }
  return os;
}
818 
819 
/** Stream an array's elements, space-separated. */
template<class T, int N>
std::ostream & operator << (std::ostream & os, const Array<T,N> & a){
  for (int idx=0; idx<N; ++idx) os << a[idx] << " ";
  return os;
}
827 
/**
 * Compute the inverse of a 3x3 link matrix into *uinv via the adjugate
 * (cofactor) formula; no singularity check is performed.
 * NOTE(review): the signature line was lost in extraction — the name
 * computeLinkInverse and parameter order (uinv, u) are reconstructed;
 * verify against callers.
 */
template<class Cmplx>
__device__ __host__ inline
void computeLinkInverse(Matrix<Cmplx,3>* uinv, const Matrix<Cmplx,3>& u)
{
  const Cmplx & det = getDeterminant(u);
  const Cmplx & det_inv = static_cast<typename Cmplx::value_type>(1.0)/det;

  Cmplx temp;

  temp = u(1,1)*u(2,2) - u(1,2)*u(2,1);
  (*uinv)(0,0) = (det_inv*temp);

  temp = u(0,2)*u(2,1) - u(0,1)*u(2,2);
  (*uinv)(0,1) = (temp*det_inv);

  temp = u(0,1)*u(1,2) - u(0,2)*u(1,1);
  (*uinv)(0,2) = (temp*det_inv);

  temp = u(1,2)*u(2,0) - u(1,0)*u(2,2);
  (*uinv)(1,0) = (det_inv*temp);

  temp = u(0,0)*u(2,2) - u(0,2)*u(2,0);
  (*uinv)(1,1) = (temp*det_inv);

  temp = u(0,2)*u(1,0) - u(0,0)*u(1,2);
  (*uinv)(1,2) = (temp*det_inv);

  temp = u(1,0)*u(2,1) - u(1,1)*u(2,0);
  (*uinv)(2,0) = (det_inv*temp);

  temp = u(0,1)*u(2,0) - u(0,0)*u(2,1);
  (*uinv)(2,1) = (temp*det_inv);

  temp = u(0,0)*u(1,1) - u(0,1)*u(1,0);
  (*uinv)(2,2) = (temp*det_inv);
}
864  // template this!
/** Load a 3x3 float2 link from an interleaved (re,im) float array. */
inline void copyArrayToLink(Matrix<float2,3>* link, float* array){
#pragma unroll
  for (int row=0; row<3; ++row){
#pragma unroll
    for (int col=0; col<3; ++col){
      const int base = (row*3 + col)*2; // interleaved re/im pairs
      (*link)(row,col).x = array[base];
      (*link)(row,col).y = array[base + 1];
    }
  }
}
875 
/** Generic variant: load a 3x3 complex link from an interleaved real array. */
template<class Cmplx, class Real>
inline void copyArrayToLink(Matrix<Cmplx,3>* link, Real* array){
#pragma unroll
  for (int row=0; row<3; ++row){
#pragma unroll
    for (int col=0; col<3; ++col){
      const int base = (row*3 + col)*2; // interleaved re/im pairs
      (*link)(row,col).x = array[base];
      (*link)(row,col).y = array[base + 1];
    }
  }
}
887 
888 
889  // and this!
/** Store a 3x3 float2 link into an interleaved (re,im) float array. */
inline void copyLinkToArray(float* array, const Matrix<float2,3>& link){
#pragma unroll
  for (int row=0; row<3; ++row){
#pragma unroll
    for (int col=0; col<3; ++col){
      const int base = (row*3 + col)*2; // interleaved re/im pairs
      array[base] = link(row,col).x;
      array[base + 1] = link(row,col).y;
    }
  }
}
900 
901  // and this!
/** Generic variant: store a 3x3 complex link into an interleaved real array. */
template<class Cmplx, class Real>
inline void copyLinkToArray(Real* array, const Matrix<Cmplx,3>& link){
#pragma unroll
  for (int row=0; row<3; ++row){
#pragma unroll
    for (int col=0; col<3; ++col){
      const int base = (row*3 + col)*2; // interleaved re/im pairs
      array[base] = link(row,col).x;
      array[base + 1] = link(row,col).y;
    }
  }
}
913 
/** Return a with one third of its trace removed from the diagonal. */
template<class T>
__device__ __host__ inline Matrix<T,3> getSubTraceUnit(const Matrix<T,3>& a){
  T tr = (a(0,0) + a(1,1) + a(2,2)) / 3.0;
  Matrix<T,3> res;
#pragma unroll
  for (int i=0; i<3; ++i){
#pragma unroll
    for (int j=0; j<3; ++j){
      res(i,j) = (i==j) ? a(i,j) - tr : a(i,j);
    }
  }
  return res;
}
923 
/** In-place: remove one third of the trace from a's diagonal. */
template<class T>
__device__ __host__ inline void SubTraceUnit(Matrix<T,3>& a){
  const T tr = (a(0,0) + a(1,1) + a(2,2)) / static_cast<T>(3.0);
#pragma unroll
  for (int d=0; d<3; ++d) a(d,d) -= tr;
}
929 
/** Re Tr(U V^dag) = sum over elements of Re(a_ij) Re(b_ij) + Im(a_ij) Im(b_ij). */
template<class T>
__device__ __host__ inline double getRealTraceUVdagger(const Matrix<T,3>& a, const Matrix<T,3>& b){
  double sum = 0.0;
#pragma unroll
  for (int i=0; i<3; ++i) {
#pragma unroll
    for (int j=0; j<3; ++j) {
      // accumulate in double, element order matching the row-major layout
      sum += (double)(a(i,j).x * b(i,j).x + a(i,j).y * b(i,j).y);
    }
  }
  return sum;
}
943 
944  // and this!
/** Print a 3x3 link matrix, one row per line, tab-separated (re, im) pairs. */
template<class Cmplx>
__host__ __device__ inline
void printLink(const Matrix<Cmplx,3>& link){
#pragma unroll
  for (int row=0; row<3; ++row){
    printf("(%lf, %lf)\t", link(row,0).x, link(row,0).y);
    printf("(%lf, %lf)\t", link(row,1).x, link(row,1).y);
    printf("(%lf, %lf)\n", link(row,2).x, link(row,2).y);
  }
  printf("\n");
}
959 
/**
 * Measure a matrix's deviation from unitarity: returns the squared
 * Frobenius norm ||U^dag U - I||^2, which should be (very close to)
 * zero for an element of SU(3).
 */
template<class Cmplx>
__device__ __host__
double ErrorSU3(const Matrix<Cmplx,3>& matrix)
{
  const Matrix<Cmplx,3> identity_comp = conj(matrix)*matrix;
  double error = 0.0;
  Cmplx temp(0,0);
  int i=0;
  int j=0;

  //error = ||U^dagger U - I||_L2
#pragma unroll
  for (i=0; i<3; ++i)
#pragma unroll
    for (j=0; j<3; ++j)
      if(i==j) {
        // diagonal: compare against 1
        temp = identity_comp(i,j);
        temp -= 1.0;
        error += norm(temp);
      }
      else {
        // off-diagonal: compare against 0
        error += norm(identity_comp(i,j));
      }
  //error is L2 norm, should be (very close) to zero.
  return error;
}
986 
/**
 * Compute exp(iQ) for a 3x3 matrix Q via the Cayley-Hamilton theorem
 * for SU(3), following http://arxiv.org/pdf/hep-lat/0311018v1.pdf
 * (bracketed [eq_no] comments reference equations in that paper).
 * NOTE(review): no guard against c0_max == 0 (i.e. Q == 0), which would
 * make acos(c0/c0_max) ill-defined — presumably callers never pass a
 * vanishing Q; confirm.
 */
template <class T> __device__ __host__ inline auto exponentiate_iQ(const Matrix<T, 3> &Q)
{
  // Use Cayley-Hamilton Theorem for SU(3) exp{iQ}.
  // This algorithm is outlined in
  // http://arxiv.org/pdf/hep-lat/0311018v1.pdf
  // Equation numbers in the paper are referenced by [eq_no].

  //Declarations
  typedef decltype(Q(0, 0).x) matType; // underlying real scalar type

  matType inv3 = 1.0 / 3.0;
  matType c0, c1, c0_max, Tr_re;
  matType f0_re, f0_im, f1_re, f1_im, f2_re, f2_im;
  matType theta;
  matType u_p, w_p; // u, w parameters.
  Matrix<T,3> temp1;
  Matrix<T,3> temp2;
  //[14] c0 = det(Q) = 1/3Tr(Q^3)
  const T & det_Q = getDeterminant(Q);
  c0 = det_Q.x;
  //[15] c1 = 1/2Tr(Q^2)
  // Q = Q^dag => Tr(Q^2) = Tr(QQ^dag) = sum_ab [Q_ab * Q_ab^*]
  temp1 = Q;
  temp1 = temp1 * Q;
  Tr_re = getTrace(temp1).x;
  c1 = 0.5*Tr_re;

  //We now have the coeffiecients c0 and c1.
  //We now find: exp(iQ) = f0*I + f1*Q + f2*Q^2
  //      where fj = fj(c0,c1), j=0,1,2.

  //[17]
  auto sqrt_c1_inv3 = sqrt(c1 * inv3);
  c0_max = 2 * (c1 * inv3 * sqrt_c1_inv3); // reuse the sqrt factor for a fast 1.5 power

  //[25]
  theta = acos(c0/c0_max);

  sincos(theta * inv3, &w_p, &u_p);
  //[23]
  u_p *= sqrt_c1_inv3;

  //[24]
  w_p *= sqrt(c1);

  //[29] Construct objects for fj = hj/(9u^2 - w^2).
  matType u_sq = u_p * u_p;
  matType w_sq = w_p * w_p;
  matType denom_inv = 1.0 / (9 * u_sq - w_sq);
  matType exp_iu_re, exp_iu_im;
  sincos(u_p, &exp_iu_im, &exp_iu_re);
  // exp(2iu) from exp(iu) via double-angle identities
  matType exp_2iu_re = exp_iu_re * exp_iu_re - exp_iu_im * exp_iu_im;
  matType exp_2iu_im = 2 * exp_iu_re * exp_iu_im;
  matType cos_w = cos(w_p);
  matType sinc_w;
  matType hj_re = 0.0;
  matType hj_im = 0.0;

  //[33] Added one more term to the series given in the paper.
  // Taylor series of sin(w)/w near w = 0 to avoid 0/0
  if (w_p < 0.05 && w_p > -0.05) {
    //1 - 1/6 x^2 (1 - 1/20 x^2 (1 - 1/42 x^2(1 - 1/72*x^2)))
    sinc_w = 1.0 - (w_sq/6.0)*(1 - (w_sq*0.05)*(1 - (w_sq/42.0)*(1 - (w_sq/72.0))));
  }
  else sinc_w = sin(w_p)/w_p;

  //[34] Test for c0 < 0.
  int parity = 0;
  if(c0 < 0) {
    c0 *= -1.0;
    parity = 1;
    //calculate fj with c0 > 0 and then convert all fj.
  }

  //Get all the numerators for fj,
  //[30] f0
  hj_re = (u_sq - w_sq)*exp_2iu_re + 8*u_sq*cos_w*exp_iu_re + 2*u_p*(3*u_sq + w_sq)*sinc_w*exp_iu_im;
  hj_im = (u_sq - w_sq)*exp_2iu_im - 8*u_sq*cos_w*exp_iu_im + 2*u_p*(3*u_sq + w_sq)*sinc_w*exp_iu_re;
  f0_re = hj_re*denom_inv;
  f0_im = hj_im*denom_inv;

  //[31] f1
  hj_re = 2*u_p*exp_2iu_re - 2*u_p*cos_w*exp_iu_re + (3*u_sq - w_sq)*sinc_w*exp_iu_im;
  hj_im = 2*u_p*exp_2iu_im + 2*u_p*cos_w*exp_iu_im + (3*u_sq - w_sq)*sinc_w*exp_iu_re;
  f1_re = hj_re*denom_inv;
  f1_im = hj_im*denom_inv;

  //[32] f2
  hj_re = exp_2iu_re - cos_w*exp_iu_re - 3*u_p*sinc_w*exp_iu_im;
  hj_im = exp_2iu_im + cos_w*exp_iu_im - 3*u_p*sinc_w*exp_iu_re;
  f2_re = hj_re*denom_inv;
  f2_im = hj_im*denom_inv;

  //[34] If c0 < 0, apply tranformation fj(-c0,c1) = (-1)^j f^*j(c0,c1)
  if (parity == 1) {
    f0_im *= -1.0;
    f1_re *= -1.0;
    f2_im *= -1.0;
  }

  // pack the real fj pairs into complex coefficients
  T f0_c;
  T f1_c;
  T f2_c;

  f0_c.x = f0_re;
  f0_c.y = f0_im;

  f1_c.x = f1_re;
  f1_c.y = f1_im;

  f2_c.x = f2_re;
  f2_c.y = f2_im;

  //[19] Construct exp{iQ}
  Matrix<T, 3> exp_iQ;
  setZero(&exp_iQ);
  Matrix<T,3> UnitM;
  setIdentity(&UnitM);
  // +f0*I
  temp1 = f0_c * UnitM;
  exp_iQ = temp1;

  // +f1*Q
  temp1 = f1_c * Q;
  exp_iQ += temp1;

  // +f2*Q^2
  temp1 = Q * Q;
  temp2 = f2_c * temp1;
  exp_iQ += temp2;

  //exp(iQ) is now defined.
  return exp_iQ;
}
1120 
/**
 * Direct exponentiation of a 3x3 complex matrix: overwrite q with
 * exp(q), computed from the eigenvalues of q (Cardano's formula on the
 * characteristic polynomial) and explicitly solved eigenvector
 * components.
 * NOTE(review): elements are accessed through the linear operator(),
 * i.e. q(k) uses Matrix's linear (column-major) element ordering.
 */
template <typename Float> __device__ __host__ void expsu3(Matrix<complex<Float>, 3> &q)
{
  typedef complex<Float> Complex;

  // coefficients of the characteristic polynomial of q
  Complex a2 = (q(3) * q(1) + q(7) * q(5) + q(6) * q(2) - (q(0) * q(4) + (q(0) + q(4)) * q(8))) / (Float)3.0;
  Complex a3 = q(0) * q(4) * q(8) + q(1) * q(5) * q(6) + q(2) * q(3) * q(7) - q(6) * q(4) * q(2)
      - q(3) * q(1) * q(8) - q(0) * q(7) * q(5);

  // Cardano's formula: cube root of the resolvent and its companion
  Complex sg2h3 = sqrt(a3 * a3 - (Float)4. * a2 * a2 * a2);
  Complex cp = exp(log((Float)0.5 * (a3 + sg2h3)) / (Float)3.0);
  Complex cm = a2 / cp;

  // primitive cube roots of unity
  Complex r1 = exp(Complex(0.0, 1.0) * (Float)(2.0 * M_PI / 3.0));
  Complex r2 = exp(-Complex(0.0, 1.0) * (Float)(2.0 * M_PI / 3.0));

  Complex w1[3]; // the three eigenvalues of q

  w1[0] = cm + cp;
  w1[1] = r1 * cp + r2 * cm;
  w1[2] = r2 * cp + r1 * cm;
  // right eigenvector components (wr2k, wr3k) for each eigenvalue
  Complex z1 = q(1) * q(6) - q(0) * q(7);
  Complex z2 = q(3) * q(7) - q(4) * q(6);

  Complex al = w1[0];
  Complex wr21 = (z1 + al * q(7)) / (z2 + al * q(6));
  Complex wr31 = (al - q(0) - wr21 * q(3)) / q(6);

  al = w1[1];
  Complex wr22 = (z1 + al * q(7)) / (z2 + al * q(6));
  Complex wr32 = (al - q(0) - wr22 * q(3)) / q(6);

  al = w1[2];
  Complex wr23 = (z1 + al * q(7)) / (z2 + al * q(6));
  Complex wr33 = (al - q(0) - wr23 * q(3)) / q(6);

  // left eigenvector components (wl2k, wl3k) for each eigenvalue
  z1 = q(3) * q(2) - q(0) * q(5);
  z2 = q(1) * q(5) - q(4) * q(2);

  al = w1[0];
  Complex wl21 = conj((z1 + al * q(5)) / (z2 + al * q(2)));
  Complex wl31 = conj((al - q(0) - conj(wl21) * q(1)) / q(2));

  al = w1[1];
  Complex wl22 = conj((z1 + al * q(5)) / (z2 + al * q(2)));
  Complex wl32 = conj((al - q(0) - conj(wl22) * q(1)) / q(2));

  al = w1[2];
  Complex wl23 = conj((z1 + al * q(5)) / (z2 + al * q(2)));
  Complex wl33 = conj((al - q(0) - conj(wl23) * q(1)) / q(2));

  // normalization factors <left_k | right_k>
  Complex xn1 = (Float)1. + wr21 * conj(wl21) + wr31 * conj(wl31);
  Complex xn2 = (Float)1. + wr22 * conj(wl22) + wr32 * conj(wl32);
  Complex xn3 = (Float)1. + wr23 * conj(wl23) + wr33 * conj(wl33);

  // exponentiated eigenvalues, divided by the normalization
  Complex d1 = exp(w1[0]);
  Complex d2 = exp(w1[1]);
  Complex d3 = exp(w1[2]);
  Complex y11 = d1 / xn1;
  Complex y12 = d2 / xn2;
  Complex y13 = d3 / xn3;
  Complex y21 = wr21 * d1 / xn1;
  Complex y22 = wr22 * d2 / xn2;
  Complex y23 = wr23 * d3 / xn3;
  Complex y31 = wr31 * d1 / xn1;
  Complex y32 = wr32 * d2 / xn2;
  Complex y33 = wr33 * d3 / xn3;
  // reassemble exp(q) = sum_k exp(w_k) |right_k><left_k| / <left_k|right_k>
  q(0) = y11 + y12 + y13;
  q(1) = y21 + y22 + y23;
  q(2) = y31 + y32 + y33;
  q(3) = y11 * conj(wl21) + y12 * conj(wl22) + y13 * conj(wl23);
  q(4) = y21 * conj(wl21) + y22 * conj(wl22) + y23 * conj(wl23);
  q(5) = y31 * conj(wl21) + y32 * conj(wl22) + y33 * conj(wl23);
  q(6) = y11 * conj(wl31) + y12 * conj(wl32) + y13 * conj(wl33);
  q(7) = y21 * conj(wl31) + y22 * conj(wl32) + y23 * conj(wl33);
  q(8) = y31 * conj(wl31) + y32 * conj(wl32) + y33 * conj(wl33);
}
1200 
1201 } // end namespace quda
__device__ __host__ T & operator[](int i)
Definition: quda_matrix.h:788
__device__ __host__ T const & operator[](int i) const
Definition: quda_matrix.h:782
Specialized container for Hermitian matrices (e.g., used for wrapping clover matrices)
Definition: quda_matrix.h:282
T data[N *N]
Definition: quda_matrix.h:300
__device__ __host__ void print() const
Definition: quda_matrix.h:385
__device__ __host__ T max() const
Compute the absolute max element of the Hermitian matrix.
Definition: quda_matrix.h:376
__device__ __host__ HMatrix(const T data_[])
Definition: quda_matrix.h:312
__device__ __host__ void operator=(const HMatrix< U, N > &b)
Definition: quda_matrix.h:333
__device__ __host__ HMatrix(const HMatrix< T, N > &a)
Definition: quda_matrix.h:307
__device__ __host__ HMatrix()
Definition: quda_matrix.h:302
__device__ __host__ complex< T > const operator()(int i, int j) const
Definition: quda_matrix.h:317
__device__ __host__ HMatrix< T, N > square() const
Hermitian matrix square.
Definition: quda_matrix.h:348
T data[N *N]
Definition: quda_matrix.h:70
__device__ __host__ real Linf()
Compute the matrix Linfinity norm - this is the maximum of the absolute row sums.
Definition: quda_matrix.h:175
__device__ __host__ Matrix(const Matrix< T, N > &a)
Definition: quda_matrix.h:76
__device__ __host__ real L2()
Compute the matrix L2 norm. We actually compute the Frobenius norm which is an upper bound on the L2 ...
Definition: quda_matrix.h:158
__device__ __host__ void operator=(const gauge_ghost_wrapper< real, S > &s)
__device__ constexpr __host__ int size() const
Definition: quda_matrix.h:72
__device__ __host__ Matrix()
Definition: quda_matrix.h:74
__device__ __host__ bool isUnitary(double max_error) const
Definition: quda_matrix.h:204
__device__ __host__ T const & operator()(int i) const
Definition: quda_matrix.h:104
__device__ __host__ Matrix(const gauge_ghost_wrapper< real, S > &s)
__device__ __host__ void operator=(const Matrix< U, N > &b)
Definition: quda_matrix.h:117
__device__ __host__ Matrix(const HMatrix< real, N > &a)
__device__ __host__ T & operator()(int i, int j)
Definition: quda_matrix.h:100
__device__ __host__ Matrix(const Matrix< U, N > &a)
Definition: quda_matrix.h:82
__device__ __host__ Matrix(const gauge_wrapper< real, S > &s)
__device__ __host__ void operator=(const gauge_wrapper< real, S > &s)
__device__ __host__ uint64_t checksum() const
Definition: quda_matrix.h:194
__device__ __host__ real L1()
Compute the matrix L1 norm - this is the maximum of the absolute column sums.
Definition: quda_matrix.h:139
__device__ __host__ T const & operator()(int i, int j) const
Definition: quda_matrix.h:96
__device__ __host__ Matrix(const T data_[])
Definition: quda_matrix.h:88
__device__ __host__ T & operator()(int i)
Definition: quda_matrix.h:110
void Mat(sFloat *out, gFloat **link, sFloat *in, int daggerBit, int mu)
QudaParity parity
Definition: covdev_test.cpp:40
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:34
int length[]
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
Definition: quda_matrix.h:947
void copyLinkToArray(float *array, const Matrix< float2, 3 > &link)
Definition: quda_matrix.h:890
__host__ __device__ float4 operator+=(float4 &x, const float4 &y)
Definition: float_vector.h:122
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator*(const S &a, const ColorSpinor< Float, Nc, Ns > &x)
Compute the scalar-vector product y = a * x.
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:46
__device__ __host__ double ErrorSU3(const Matrix< Cmplx, 3 > &matrix)
Definition: quda_matrix.h:962
__host__ __device__ ValueType acos(ValueType x)
Definition: complex_quda.h:61
__host__ __device__ float2 operator*=(float2 &x, const float &a)
Definition: float_vector.h:199
__device__ __host__ auto exponentiate_iQ(const Matrix< T, 3 > &Q)
Definition: quda_matrix.h:987
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
Definition: quda_matrix.h:605
__host__ __device__ ValueType sin(ValueType x)
Definition: complex_quda.h:51
__host__ __device__ ValueType log(ValueType x)
Definition: complex_quda.h:101
std::complex< double > Complex
Definition: quda_internal.h:86
void copyArrayToLink(Matrix< float2, 3 > *link, float *array)
Definition: quda_matrix.h:865
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:410
__device__ __host__ Matrix< T, 3 > getSubTraceUnit(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:915
__host__ __device__ float4 operator-=(float4 &x, const float4 &y)
Definition: float_vector.h:169
__device__ __host__ void makeAntiHerm(Matrix< Complex, N > &m)
Definition: quda_matrix.h:734
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator+(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor addition operator.
__device__ __host__ void setIdentity(Matrix< T, N > *m)
Definition: quda_matrix.h:647
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
Definition: quda_matrix.h:417
__device__ __host__ double getRealTraceUVdagger(const Matrix< T, 3 > &a, const Matrix< T, 3 > &b)
Definition: quda_matrix.h:931
__host__ __device__ ValueType exp(ValueType x)
Definition: complex_quda.h:96
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
__device__ __host__ void copyColumn(const Matrix< T, N > &m, int c, Array< T, N > *a)
Definition: quda_matrix.h:796
__device__ __host__ void SubTraceUnit(Matrix< T, 3 > &a)
Definition: quda_matrix.h:925
__device__ __host__ void computeLinkInverse(Matrix< Cmplx, 3 > *uinv, const Matrix< Cmplx, 3 > &u)
Definition: quda_matrix.h:830
__device__ __host__ void setZero(Matrix< T, N > *m)
Definition: quda_matrix.h:693
__device__ __host__ void makeHerm(Matrix< Complex, N > &m)
Definition: quda_matrix.h:750
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator-(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor subtraction operator.
__host__ __device__ ValueType abs(ValueType x)
Definition: complex_quda.h:125
std::ostream & operator<<(std::ostream &output, const CloverFieldParam &param)
__device__ __host__ void expsu3(Matrix< complex< Float >, 3 > &q)
Definition: quda_matrix.h:1124
FloatingPoint< float > Float
__host__ __device__ T sum(const array< T, s > &a)
Definition: utility.h:76
Provides precision abstractions and defines the register precision given the storage precision using ...
wrapper class that enables us to write to Hmatrices in packed format
Definition: quda_matrix.h:244
__device__ __host__ void operator+=(const complex< T > &a)
Definition: quda_matrix.h:264
__device__ __host__ HMatrix_wrapper(Hmat &mat, int i, int j, int idx)
Definition: quda_matrix.h:250
__device__ __host__ void operator=(const complex< T > &a)
Definition: quda_matrix.h:252
__device__ static __host__ T val()
__device__ static __host__ T val()
clover_wrapper is an internal class that is used to wrap instances of colorspinor accessors,...
__host__ __device__ ValueType imag() const volatile
__host__ __device__ ValueType real() const volatile
gauge_ghost_wrapper is an internal class that is used to wrap instances of gauge ghost accessors,...
gauge_wrapper is an internal class that is used to wrap instances of gauge accessors,...