QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
quda_matrix.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include <cstdio>
4 #include <cstdlib>
5 #include <iostream>
6 #include <iomanip>
7 
8 #include <register_traits.h>
9 #include <float_vector.h>
10 #include <complex_quda.h>
11 
12 namespace quda {
13 
14 
  /**
     @brief Trait providing the additive identity for type T; specialized
     below for the CUDA vector types float2 and double2.
   */
  template<class T>
  struct Zero
  {
    //static const T val;
    __device__ __host__ inline
    static T val();
  };
22 
23  template<>
24  __device__ __host__ inline
26  {
27  return make_float2(0.,0.);
28  }
29 
30  template<>
31  __device__ __host__ inline
33  {
34  return make_double2(0.,0.);
35  }
36 
37 
38 
  /**
     @brief Trait providing the multiplicative identity for type T;
     specialized below for float2 and double2 (interpreted as the
     complex value 1 + 0i).
   */
  template<class T>
  struct Identity
  {
    __device__ __host__ inline
    static T val();
  };
45 
46  template<>
47  __device__ __host__ inline
49  return make_float2(1.,0.);
50  }
51 
52  template<>
53  __device__ __host__ inline
55  return make_double2(1.,0.);
56  }
57 
  // Forward declarations of accessor/wrapper types defined in other headers;
  // Matrix and HMatrix below provide constructors/assignment from these.
  template<typename Float, typename T> struct gauge_wrapper;
  template<typename Float, typename T> struct gauge_ghost_wrapper;
  template<typename Float, typename T> struct clover_wrapper;
  template<typename T, int N> struct HMatrix;
62 
  /**
     @brief Generic dense N x N matrix with row-major storage, usable on
     both host and device.  T is typically a complex scalar type (e.g.
     complex<float>), with RealType<T>::type giving the underlying real
     scalar.
   */
  template<class T, int N>
  class Matrix
  {
    typedef typename RealType<T>::type real;

  private:
    // row-major linear index of element (i,j)
    __device__ __host__ inline int index(int i, int j) const { return i*N + j; }

  public:
    T data[N*N]; // matrix elements in row-major order

    __device__ __host__ constexpr int size() const { return N; }

    // default constructor: zero-initializes every element
    __device__ __host__ inline Matrix() {
#pragma unroll
      for (int i=0; i<N*N; i++) zero(data[i]);
    }

    __device__ __host__ inline Matrix(const Matrix<T,N> &a) {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = a.data[i];
    }

    // converting copy constructor: element-wise conversion from U to T
    template <class U> __device__ __host__ inline Matrix(const Matrix<U, N> &a)
    {
#pragma unroll
      for (int i = 0; i < N * N; i++) data[i] = a.data[i];
    }

    // construct from a flat array of N*N elements (assumed row-major)
    __device__ __host__ inline Matrix(const T data_[])
    {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = data_[i];
    }

    // construct from a packed Hermitian matrix (defined below HMatrix)
    __device__ __host__ inline Matrix(const HMatrix<real, N> &a);

    __device__ __host__ inline T const & operator()(int i, int j) const {
      return data[index(i,j)];
    }

    __device__ __host__ inline T & operator()(int i, int j) {
      return data[index(i,j)];
    }

    // single-index access: note the mapping i -> (i%N, i/N), i.e. this
    // walks the matrix in column-major order of the linear index
    __device__ __host__ inline T const & operator()(int i) const {
      int j = i % N;
      int k = i / N;
      return data[index(j,k)];
    }

    __device__ __host__ inline T& operator()(int i) {
      int j = i % N;
      int k = i / N;
      return data[index(j,k)];
    }

    // converting assignment: element-wise conversion from U to T
    template<class U>
    __device__ __host__ inline void operator=(const Matrix<U,N> & b) {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = b.data[i];
    }

    template<typename S>
    __device__ __host__ inline Matrix(const gauge_wrapper<real, S> &s);

    template<typename S>
    __device__ __host__ inline void operator=(const gauge_wrapper<real, S> &s);

    template<typename S>
    __device__ __host__ inline Matrix(const gauge_ghost_wrapper<real, S> &s);

    template<typename S>
    __device__ __host__ inline void operator=(const gauge_ghost_wrapper<real, S> &s);

    /**
       @brief Compute the matrix L1 norm: the maximum over columns of the
       sum of absolute values of the column's entries.
       @return Matrix L1 norm
     */
    __device__ __host__ inline real L1() {
      real l1 = 0;
#pragma unroll
      for (int j=0; j<N; j++) {
        real col_sum = 0;
#pragma unroll
        for (int i=0; i<N; i++) {
          col_sum += abs(data[i*N + j]);
        }
        l1 = col_sum > l1 ? col_sum : l1;
      }
      return l1;
    }

    /**
       @brief Compute the matrix L2 (Frobenius) norm: sqrt of the sum of
       |a_ij|^2 over all entries.
       @return Matrix L2 norm
     */
    __device__ __host__ inline real L2() {
      real l2 = 0;
#pragma unroll
      for (int j=0; j<N; j++) {
#pragma unroll
        for (int i=0; i<N; i++) {
          l2 += norm(data[i*N + j]);
        }
      }
      return sqrt(l2);
    }

    /**
       @brief Compute the matrix Linf norm: the maximum over rows of the
       sum of absolute values of the row's entries.
       @return Matrix Linf norm
     */
    __device__ __host__ inline real Linf() {
      real linf = 0;
#pragma unroll
      for (int i=0; i<N; i++) {
        real row_sum = 0;
#pragma unroll
        for (int j=0; j<N; j++) {
          row_sum += abs(data[i*N + j]);
        }
        linf = row_sum > linf ? row_sum : linf;
      }
      return linf;
    }

    /**
       @brief Compute a 64-bit checksum of the matrix by copying the raw
       element bytes into a zero-padded uint64_t buffer and XOR-folding
       the words together.
       @return 64-bit checksum of this matrix
     */
    __device__ __host__ inline uint64_t checksum() const {
      // ensure length is rounded up to 64-bit multiple
      constexpr int length = (N*N*sizeof(T) + sizeof(uint64_t) - 1)/ sizeof(uint64_t);
      uint64_t base_[length] = { };
      T *data_ = reinterpret_cast<T*>( static_cast<void*>(base_) );
      for (int i=0; i<N*N; i++) data_[i] = data[i];
      uint64_t checksum_ = base_[0];
      for (int i=1; i<length; i++) checksum_ ^= base_[i];
      return checksum_;
    }

    /**
       @brief Test whether this matrix is unitary to within the given
       tolerance, by checking conj(U)*U against the identity entry by
       entry, and additionally rejecting any matrix containing NaNs.
       @param[in] max_error Maximum permitted deviation from identity
       @return true if unitary (and NaN-free), false otherwise
     */
    __device__ __host__ inline bool isUnitary(double max_error) const
    {
      const auto identity = conj(*this) * *this;

#pragma unroll
      for (int i=0; i<N; ++i){
        if( fabs(identity(i,i).real() - 1.0) > max_error ||
            fabs(identity(i,i).imag()) > max_error) return false;

#pragma unroll
        for (int j=i+1; j<N; ++j){
          if( fabs(identity(i,j).real()) > max_error ||
              fabs(identity(i,j).imag()) > max_error ||
              fabs(identity(j,i).real()) > max_error ||
              fabs(identity(j,i).imag()) > max_error ){
            return false;
          }
        }
      }

      // also reject matrices containing NaNs
#pragma unroll
      for (int i=0; i<N; i++) {
#pragma unroll
        for (int j=0; j<N; j++) {
          if (std::isnan((*this)(i,j).real()) ||
              std::isnan((*this)(i,j).imag())) return false;
        }
      }

      return true;
    }

  };
242 
248  template <typename T, typename Hmat>
250  Hmat &mat;
251  const int i;
252  const int j;
253  const int idx;
254 
255  __device__ __host__ inline HMatrix_wrapper(Hmat &mat, int i, int j, int idx) : mat(mat), i(i), j(j), idx(idx) { }
256 
257  __device__ __host__ inline void operator=(const complex<T> &a) {
258  if (i==j) {
259  mat.data[idx] = a.real();
260  } else if (j<i) {
261  mat.data[idx+0] = a.real();
262  mat.data[idx+1] = a.imag();
263  } else {
264  mat.data[idx+0] = a.real();
265  mat.data[idx+1] = -a.imag();
266  }
267  }
268 
269  __device__ __host__ inline void operator+=(const complex<T> &a) {
270  if (i==j) {
271  mat.data[idx] += a.real();
272  } else if (j<i) {
273  mat.data[idx+0] += a.real();
274  mat.data[idx+1] += a.imag();
275  } else {
276  mat.data[idx+0] += a.real();
277  mat.data[idx+1] += -a.imag();
278  }
279  }
280  };
281 
  /**
     @brief Hermitian N x N matrix stored in a packed real-valued array of
     length N*N: the N real diagonal entries come first, followed by the
     off-diagonal elements stored as (real, imag) pairs; the opposite
     triangle is implied by conjugation.  T is the real scalar type.
   */
  template<class T, int N>
  class HMatrix
  {
  private:
    // compute index into triangular-packed Hermitian matrix
    __device__ __host__ inline int index(int i, int j) const {
      if (i==j) {
        return i;
      } else if (j<i) {
        int k = N*(N-1)/2 - (N-j)*(N-j-1)/2 + i - j - 1;
        return N + 2*k;
      } else { // i>j
        // switch coordinates to count from bottom right instead of top left of matrix
        int k = N*(N-1)/2 - (N-i)*(N-i-1)/2 + j - i - 1;
        return N + 2*k;
      }
    }

  public:
    T data[N*N]; // store in real-valued array

    // default constructor: zero-initializes the packed storage
    __device__ __host__ inline HMatrix() {
#pragma unroll
      for (int i=0; i<N*N; i++) zero(data[i]);
    }

    __device__ __host__ inline HMatrix(const HMatrix<T,N> &a) {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = a.data[i];
    }

    // construct from a flat array already in packed Hermitian layout
    __device__ __host__ inline HMatrix(const T data_[]) {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = data_[i];
    }

    // read access: reconstructs the complex element from packed storage;
    // the (i<j) case returns the conjugate of the stored lower-triangle pair
    __device__ __host__ inline complex<T> const operator()(int i, int j) const {
      const int idx = index(i,j);
      if (i==j) {
        return complex<T>(data[idx],0.0);
      } else if (j<i) {
        return complex<T>(data[idx], data[idx+1]);
      } else {
        return complex<T>(data[idx],-data[idx+1]);
      }
    }

    // write access: returns a proxy that knows how to store into the
    // packed layout (see HMatrix_wrapper)
    __device__ __host__ inline HMatrix_wrapper<T,HMatrix<T,N> > operator() (int i, int j) {
      return HMatrix_wrapper<T,HMatrix<T,N> >(*this, i, j, index(i,j));
    }

    // converting assignment (element-wise conversion U -> T)
    template<class U>
    __device__ __host__ inline void operator=(const HMatrix<U,N> & b) {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = b.data[i];
    }

    template<typename S>
    __device__ __host__ inline HMatrix(const clover_wrapper<T, S> &s);

    template<typename S>
    __device__ __host__ inline void operator=(const clover_wrapper<T, S> &s);

    /**
       @brief Compute the square of this Hermitian matrix.  Only the
       upper triangle (i <= k) is computed explicitly; the rest follows
       from Hermiticity of the result.
       @return The squared matrix
     */
    __device__ __host__ inline HMatrix<T,N> square() const {
      HMatrix<T,N> result;
      complex<T> tmp;
#pragma unroll
      for (int i=0; i<N; i++) {
#pragma unroll
        for (int k=0; k<N; k++) if (i<=k) { // else compiler can't handle triangular unroll
          tmp.x = (*this)(i,0).real() * (*this)(0,k).real();
          tmp.x -= (*this)(i,0).imag() * (*this)(0,k).imag();
          tmp.y = (*this)(i,0).real() * (*this)(0,k).imag();
          tmp.y += (*this)(i,0).imag() * (*this)(0,k).real();
#pragma unroll
          for (int j=1; j<N; j++) {
            tmp.x += (*this)(i,j).real() * (*this)(j,k).real();
            tmp.x -= (*this)(i,j).imag() * (*this)(j,k).imag();
            tmp.y += (*this)(i,j).real() * (*this)(j,k).imag();
            tmp.y += (*this)(i,j).imag() * (*this)(j,k).real();
          }
          result(i,k) = tmp;
        }
      }
      return result;
    }

    /**
       @brief Return the maximum absolute value over the packed storage.
       @return Largest |element| of the packed array
     */
    __device__ __host__ inline T max() const
    {
      HMatrix<T, N> result; // NOTE(review): unused local — candidate for removal
      T max = static_cast<T>(0.0);
#pragma unroll
      for (int i = 0; i < N * N; i++) max = (abs(data[i]) > max ? abs(data[i]) : max);
      return max;
    }

    // debugging aid: print the reconstructed complex matrix row by row
    __device__ __host__ void print() const {
      for (int i=0; i<N; i++) {
        printf("i=%d ", i);
        for (int j=0; j<N; j++) {
          printf(" (%e, %e)", (*this)(i,j).real(), (*this)(i,j).imag());
        }
        printf("\n");
      }
      printf("\n");
    }

  };
402 
403  template<class T,int N>
404  __device__ __host__ Matrix<T,N>::Matrix(const HMatrix<typename RealType<T>::type,N> &a) {
405 #pragma unroll
406  for (int i=0; i<N; i++) {
407 #pragma unroll
408  for (int j=0; j<N; j++) {
409  (*this)(i,j) = a(i,j);
410  }
411  }
412  }
413 
414  template<class T>
415  __device__ __host__ inline T getTrace(const Matrix<T,3>& a)
416  {
417  return a(0,0) + a(1,1) + a(2,2);
418  }
419 
420 
421  template< template<typename,int> class Mat, class T>
422  __device__ __host__ inline T getDeterminant(const Mat<T,3> & a){
423 
424  T result;
425  result = a(0,0)*(a(1,1)*a(2,2) - a(2,1)*a(1,2))
426  - a(0,1)*(a(1,0)*a(2,2) - a(1,2)*a(2,0))
427  + a(0,2)*(a(1,0)*a(2,1) - a(1,1)*a(2,0));
428 
429  return result;
430  }
431 
432  template< template<typename,int> class Mat, class T, int N>
433  __device__ __host__ inline Mat<T,N> operator+(const Mat<T,N> & a, const Mat<T,N> & b)
434  {
435  Mat<T,N> result;
436 #pragma unroll
437  for (int i=0; i<N*N; i++) result.data[i] = a.data[i] + b.data[i];
438  return result;
439  }
440 
441 
442  template< template<typename,int> class Mat, class T, int N>
443  __device__ __host__ inline Mat<T,N> operator+=(Mat<T,N> & a, const Mat<T,N> & b)
444  {
445 #pragma unroll
446  for (int i=0; i<N*N; i++) a.data[i] += b.data[i];
447  return a;
448  }
449 
450  template< template<typename,int> class Mat, class T, int N>
451  __device__ __host__ inline Mat<T,N> operator+=(Mat<T,N> & a, const T & b)
452  {
453 #pragma unroll
454  for (int i=0; i<N; i++) a(i,i) += b;
455  return a;
456  }
457 
458  template< template<typename,int> class Mat, class T, int N>
459  __device__ __host__ inline Mat<T,N> operator-=(Mat<T,N> & a, const Mat<T,N> & b)
460  {
461 #pragma unroll
462  for (int i=0; i<N*N; i++) a.data[i] -= b.data[i];
463  return a;
464  }
465 
466  template< template<typename,int> class Mat, class T, int N>
467  __device__ __host__ inline Mat<T,N> operator-(const Mat<T,N> & a, const Mat<T,N> & b)
468  {
469  Mat<T,N> result;
470 #pragma unroll
471  for (int i=0; i<N*N; i++) result.data[i] = a.data[i] - b.data[i];
472  return result;
473  }
474 
475  template< template<typename,int> class Mat, class T, int N, class S>
476  __device__ __host__ inline Mat<T,N> operator*(const S & scalar, const Mat<T,N> & a){
477  Mat<T,N> result;
478 #pragma unroll
479  for (int i=0; i<N*N; ++i) result.data[i] = scalar*a.data[i];
480  return result;
481  }
482 
  // Scalar post-multiplication: delegates to the pre-multiply overload above.
  template< template<typename,int> class Mat, class T, int N, class S>
  __device__ __host__ inline Mat<T,N> operator*(const Mat<T,N> & a, const S & scalar){
    return scalar*a;
  }
487 
488  template< template<typename,int> class Mat, class T, int N, class S>
489  __device__ __host__ inline Mat<T,N> operator *=(Mat<T,N> & a, const S & scalar){
490  a = scalar*a;
491  return a;
492  }
493 
494  template< template<typename,int> class Mat, class T, int N>
495  __device__ __host__ inline Mat<T,N> operator-(const Mat<T,N> & a){
496  Mat<T,N> result;
497 #pragma unroll
498  for (int i=0; i<(N*N); ++i) result.data[i] = -a.data[i];
499  return result;
500  }
501 
502 
506  template< template<typename,int> class Mat, class T, int N>
507  __device__ __host__ inline Mat<T,N> operator*(const Mat<T,N> &a, const Mat<T,N> &b)
508  {
509  Mat<T,N> result;
510 #pragma unroll
511  for (int i=0; i<N; i++) {
512 #pragma unroll
513  for (int k=0; k<N; k++) {
514  result(i,k) = a(i,0) * b(0,k);
515 #pragma unroll
516  for (int j=1; j<N; j++) {
517  result(i,k) += a(i,j) * b(j,k);
518  }
519  }
520  }
521  return result;
522  }
523 
  /**
     @brief Specialization of complex matrix multiplication that avoids
     intermediate complex temporaries by accumulating the real (.x) and
     imaginary (.y) parts of each output element directly.
   */
  template< template<typename> class complex, typename T, int N>
  __device__ __host__ inline Matrix<complex<T>,N> operator*(const Matrix<complex<T>,N> &a, const Matrix<complex<T>,N> &b)
  {
    Matrix<complex<T>,N> result;
#pragma unroll
    for (int i=0; i<N; i++) {
#pragma unroll
      for (int k=0; k<N; k++) {
        // seed with the j==0 term: (a*b).re = re*re - im*im, (a*b).im = re*im + im*re
        result(i,k).x = a(i,0).real() * b(0,k).real();
        result(i,k).x -= a(i,0).imag() * b(0,k).imag();
        result(i,k).y = a(i,0).real() * b(0,k).imag();
        result(i,k).y += a(i,0).imag() * b(0,k).real();
#pragma unroll
        for (int j=1; j<N; j++) {
          result(i,k).x += a(i,j).real() * b(j,k).real();
          result(i,k).x -= a(i,j).imag() * b(j,k).imag();
          result(i,k).y += a(i,j).real() * b(j,k).imag();
          result(i,k).y += a(i,j).imag() * b(j,k).real();
        }
      }
    }
    return result;
  }
550 
551  template<class T, int N>
552  __device__ __host__ inline Matrix<T,N> operator *=(Matrix<T,N> & a, const Matrix<T,N>& b){
553 
554  Matrix<T,N> c = a;
555  a = c*b;
556  return a;
557  }
558 
559 
560  // This is so that I can multiply real and complex matrice
561  template<class T, class U, int N>
562  __device__ __host__ inline
564  {
566 #pragma unroll
567  for (int i=0; i<N; i++) {
568 #pragma unroll
569  for (int k=0; k<N; k++) {
570  result(i,k) = a(i,0) * b(0,k);
571 #pragma unroll
572  for (int j=1; j<N; j++) {
573  result(i,k) += a(i,j) * b(j,k);
574  }
575  }
576  }
577  return result;
578  }
579 
580 
581  template<class T>
582  __device__ __host__ inline
584  {
585  Matrix<T,2> result;
586  result(0,0) = a(0,0)*b(0,0) + a(0,1)*b(1,0);
587  result(0,1) = a(0,0)*b(0,1) + a(0,1)*b(1,1);
588  result(1,0) = a(1,0)*b(0,0) + a(1,1)*b(1,0);
589  result(1,1) = a(1,0)*b(0,1) + a(1,1)*b(1,1);
590  return result;
591  }
592 
593 
594  template<class T, int N>
595  __device__ __host__ inline
596  Matrix<T,N> conj(const Matrix<T,N> & other){
597  Matrix<T,N> result;
598 #pragma unroll
599  for (int i=0; i<N; ++i){
600 #pragma unroll
601  for (int j=0; j<N; ++j){
602  result(i,j) = conj( other(j,i) );
603  }
604  }
605  return result;
606  }
607 
608 
609  template<class T>
610  __device__ __host__ inline
612  {
613  const T det = getDeterminant(u);
614  const T det_inv = static_cast<typename T::value_type>(1.0)/det;
615  Matrix<T,3> uinv;
616 
617  T temp;
618 
619  temp = u(1,1)*u(2,2) - u(1,2)*u(2,1);
620  uinv(0,0) = (det_inv*temp);
621 
622  temp = u(0,2)*u(2,1) - u(0,1)*u(2,2);
623  uinv(0,1) = (temp*det_inv);
624 
625  temp = u(0,1)*u(1,2) - u(0,2)*u(1,1);
626  uinv(0,2) = (temp*det_inv);
627 
628  temp = u(1,2)*u(2,0) - u(1,0)*u(2,2);
629  uinv(1,0) = (det_inv*temp);
630 
631  temp = u(0,0)*u(2,2) - u(0,2)*u(2,0);
632  uinv(1,1) = (temp*det_inv);
633 
634  temp = u(0,2)*u(1,0) - u(0,0)*u(1,2);
635  uinv(1,2) = (temp*det_inv);
636 
637  temp = u(1,0)*u(2,1) - u(1,1)*u(2,0);
638  uinv(2,0) = (det_inv*temp);
639 
640  temp = u(0,1)*u(2,0) - u(0,0)*u(2,1);
641  uinv(2,1) = (temp*det_inv);
642 
643  temp = u(0,0)*u(1,1) - u(0,1)*u(1,0);
644  uinv(2,2) = (temp*det_inv);
645 
646  return uinv;
647  }
648 
649 
650 
651  template<class T, int N>
652  __device__ __host__ inline
654 
655 #pragma unroll
656  for (int i=0; i<N; ++i){
657  (*m)(i,i) = 1;
658 #pragma unroll
659  for (int j=i+1; j<N; ++j){
660  (*m)(i,j) = (*m)(j,i) = 0;
661  }
662  }
663  return;
664  }
665 
666 
667  template<int N>
668  __device__ __host__ inline
670 
671 #pragma unroll
672  for (int i=0; i<N; ++i){
673  (*m)(i,i) = make_float2(1,0);
674 #pragma unroll
675  for (int j=i+1; j<N; ++j){
676  (*m)(i,j) = (*m)(j,i) = make_float2(0.,0.);
677  }
678  }
679  return;
680  }
681 
682 
683  template<int N>
684  __device__ __host__ inline
686 
687 #pragma unroll
688  for (int i=0; i<N; ++i){
689  (*m)(i,i) = make_double2(1,0);
690 #pragma unroll
691  for (int j=i+1; j<N; ++j){
692  (*m)(i,j) = (*m)(j,i) = make_double2(0.,0.);
693  }
694  }
695  return;
696  }
697 
698 
699  // Need to write more generic code for this!
700  template<class T, int N>
701  __device__ __host__ inline
703 
704 #pragma unroll
705  for (int i=0; i<N; ++i){
706 #pragma unroll
707  for (int j=0; j<N; ++j){
708  (*m)(i,j) = 0;
709  }
710  }
711  return;
712  }
713 
714 
715  template<int N>
716  __device__ __host__ inline
718 
719 #pragma unroll
720  for (int i=0; i<N; ++i){
721 #pragma unroll
722  for (int j=0; j<N; ++j){
723  (*m)(i,j) = make_float2(0.,0.);
724  }
725  }
726  return;
727  }
728 
729 
730  template<int N>
731  __device__ __host__ inline
733 
734 #pragma unroll
735  for (int i=0; i<N; ++i){
736 #pragma unroll
737  for (int j=0; j<N; ++j){
738  (*m)(i,j) = make_double2(0.,0.);
739  }
740  }
741  return;
742  }
743 
744 
  /**
     @brief Project m onto the traceless anti-Hermitian part:
     m <- 0.5*( (m - m^dag) - (i/N) Tr[Im(m - m^dag)] I ).
     @param[in,out] m Matrix to project in place
   */
  template<typename Complex,int N>
  __device__ __host__ inline void makeAntiHerm(Matrix<Complex,N> &m) {
    typedef typename Complex::value_type real;
    // first make the matrix anti-hermitian
    Matrix<Complex,N> am = m - conj(m);

    // second make it traceless: only the imaginary (.y) diagonal parts
    // are nonzero for an anti-Hermitian matrix
    real imag_trace = 0.0;
#pragma unroll
    for (int i=0; i<N; i++) imag_trace += am(i,i).y;
#pragma unroll
    for (int i=0; i<N; i++) {
      am(i,i).y -= imag_trace/N;
    }
    // the 0.5 compensates for the doubling in (m - m^dag)
    m = 0.5*am;
  }
761 
762 
763 
764  // Matrix and array are very similar
765  // Maybe I should factor out the similar
766  // code. However, I want to make sure that
767  // the compiler knows to store the
768  // data elements in registers, so I won't do
769  // it right now.
  /**
     @brief Fixed-size array of N elements with simple indexed access,
     usable on host and device.
   */
  template<class T, int N>
  class Array
  {
  private:
    T data[N];

  public:
    // access function (read-only)
    __device__ __host__ inline
    T const & operator[](int i) const{
      return data[i];
    }

    // assignment function (mutable access)
    __device__ __host__ inline
    T & operator[](int i){
      return data[i];
    }
  };
789 
790 
791  template<class T, int N>
792  __device__ __host__ inline
793  void copyColumn(const Matrix<T,N>& m, int c, Array<T,N>* a)
794  {
795 #pragma unroll
796  for (int i=0; i<N; ++i){
797  (*a)[i] = m(i,c); // c is the column index
798  }
799  return;
800  }
801 
802 
803  template<class T, int N>
804  __device__ __host__ inline
805  void outerProd(const Array<T,N>& a, const Array<T,N> & b, Matrix<T,N>* m){
806 #pragma unroll
807  for (int i=0; i<N; ++i){
808  const T conjb_i = conj(b[i]);
809  for (int j=0; j<N; ++j){
810  (*m)(j,i) = a[j]*conjb_i; // we reverse the ordering of indices because it cuts down on the number of function calls
811  }
812  }
813  return;
814  }
815 
816  template<class T, int N>
817  __device__ __host__ inline
818  void outerProd(const T (&a)[N], const T (&b)[N], Matrix<T,N>* m){
819 #pragma unroll
820  for (int i=0; i<N; ++i){
821  const T conjb_i = conj(b[i]);
822 #pragma unroll
823  for (int j=0; j<N; ++j){
824  (*m)(j,i) = a[j]*conjb_i; // we reverse the ordering of indices because it cuts down on the number of function calls
825  }
826  }
827  return;
828  }
829 
830 
831  // Need some print utilities
832  template<class T, int N>
833  std::ostream & operator << (std::ostream & os, const Matrix<T,N> & m){
834 #pragma unroll
835  for (int i=0; i<N; ++i){
836 #pragma unroll
837  for (int j=0; j<N; ++j){
838  os << m(i,j) << " ";
839  }
840  if(i<N-1) os << std::endl;
841  }
842  return os;
843  }
844 
845 
846  template<class T, int N>
847  std::ostream & operator << (std::ostream & os, const Array<T,N> & a){
848  for (int i=0; i<N; ++i){
849  os << a[i] << " ";
850  }
851  return os;
852  }
853 
854 
  /**
     @brief Load a 3x3 link matrix from a strided gauge-field array.
     Element i of direction dir lives at array[idx + (dir*9 + i)*stride].
   */
  template<class T, class U>
  __device__ inline
  void loadLinkVariableFromArray(const T* const array, const int dir, const int idx, const int stride, Matrix<U,3> *link)
  {
#pragma unroll
    for (int i=0; i<9; ++i){
      link->data[i] = array[idx + (dir*9 + i)*stride];
    }
    return;
  }
865 
866 
867  template<class T, class U, int N>
868  __device__ inline
869  void loadMatrixFromArray(const T* const array, const int idx, const int stride, Matrix<U,N> *mat)
870  {
871 #pragma unroll
872  for (int i=0; i<(N*N); ++i){
873  mat->data[i] = array[idx + i*stride];
874  }
875  }
876 
877 
  /**
     @brief Mixed-precision overload: load a float2-stored link matrix
     into a complex<double> matrix, widening each component via an
     explicit temporary.
   */
  __device__ inline
  void loadLinkVariableFromArray(const float2* const array, const int dir, const int idx, const int stride, Matrix<complex<double>,3> *link)
  {
    float2 single_temp;
#pragma unroll
    for (int i=0; i<9; ++i){
      single_temp = array[idx + (dir*9 + i)*stride];
      link->data[i].x = single_temp.x;
      link->data[i].y = single_temp.y;
    }
    return;
  }
890 
891 
892 
893  template<class T, int N, class U>
894  __device__ inline
895  void writeMatrixToArray(const Matrix<T,N>& mat, const int idx, const int stride, U* const array)
896  {
897 #pragma unroll
898  for (int i=0; i<(N*N); ++i){
899  array[idx + i*stride] = mat.data[i];
900  }
901  }
902 
  /**
     @brief Accumulate (+=) a complex<double> 3x3 matrix into a strided
     double2 array, component by component.
   */
  __device__ inline
  void appendMatrixToArray(const Matrix<complex<double>,3>& mat, const int idx, const int stride, double2* const array)
  {
#pragma unroll
    for (int i=0; i<9; ++i){
      array[idx + i*stride].x += mat.data[i].x;
      array[idx + i*stride].y += mat.data[i].y;
    }
  }
912 
  /**
     @brief Accumulate (+=) a complex<float> 3x3 matrix into a strided
     float2 array, component by component.
   */
  __device__ inline
  void appendMatrixToArray(const Matrix<complex<float>,3>& mat, const int idx, const int stride, float2* const array)
  {
#pragma unroll
    for (int i=0; i<9; ++i){
      array[idx + i*stride].x += mat.data[i].x;
      array[idx + i*stride].y += mat.data[i].y;
    }
  }
922 
923 
924  template<class T, class U>
925  __device__ inline
926  void writeLinkVariableToArray(const Matrix<T,3> & link, const int dir, const int idx, const int stride, U* const array)
927  {
928 #pragma unroll
929  for (int i=0; i<9; ++i){
930  array[idx + (dir*9 + i)*stride] = link.data[i];
931  }
932  return;
933  }
934 
935 
936 
937 
  /**
     @brief Mixed-precision overload: store a complex<double> link matrix
     into a float2 array, narrowing each component via an explicit
     temporary.
   */
  __device__ inline
  void writeLinkVariableToArray(const Matrix<complex<double>,3> & link, const int dir, const int idx, const int stride, float2* const array)
  {
    float2 single_temp;

#pragma unroll
    for (int i=0; i<9; ++i){
      single_temp.x = link.data[i].x;
      single_temp.y = link.data[i].y;
      array[idx + (dir*9 + i)*stride] = single_temp;
    }
    return;
  }
951 
952 
  /**
     @brief Reconstruct a full anti-Hermitian 3x3 momentum matrix from its
     compressed 5-element-per-direction representation: three off-diagonal
     complex pairs plus the (purely imaginary) diagonal packed into the
     remaining slots.  Lower-triangle elements are rebuilt as minus the
     conjugate of the corresponding upper-triangle elements.
   */
  template<class T>
  __device__ inline
  void loadMomentumFromArray(const T* const array, const int dir, const int idx, const int stride, Matrix<T,3> *mom)
  {
    // the five packed elements for this direction
    T temp2[5];
    temp2[0] = array[idx + dir*stride*5];
    temp2[1] = array[idx + dir*stride*5 + stride];
    temp2[2] = array[idx + dir*stride*5 + 2*stride];
    temp2[3] = array[idx + dir*stride*5 + 3*stride];
    temp2[4] = array[idx + dir*stride*5 + 4*stride];

    // diagonal entries are purely imaginary (anti-Hermitian)
    mom->data[0].x = 0.;
    mom->data[0].y = temp2[3].x;
    mom->data[1] = temp2[0];
    mom->data[2] = temp2[1];

    // data[3] = (1,0) element = -conj(data[1])
    mom->data[3].x = -mom->data[1].x;
    mom->data[3].y = mom->data[1].y;
    mom->data[4].x = 0.;
    mom->data[4].y = temp2[3].y;
    mom->data[5] = temp2[2];

    // data[6] = (2,0) element = -conj(data[2])
    mom->data[6].x = -mom->data[2].x;
    mom->data[6].y = mom->data[2].y;

    // data[7] = (2,1) element = -conj(data[5])
    mom->data[7].x = -mom->data[5].x;
    mom->data[7].y = mom->data[5].y;

    mom->data[8].x = 0.;
    mom->data[8].y = temp2[4].x;

    return;
  }
986 
987 
988 
  /**
     @brief Compress a 3x3 momentum matrix (assumed anti-Hermitian up to
     rounding) into five strided elements per direction, scaling by
     coeff.  Off-diagonal pairs are averaged with their mirror elements
     (0.5 factor), and the diagonal is stored traceless by subtracting
     the mean imaginary trace.
   */
  template<class T, class U>
  __device__ inline
  void writeMomentumToArray(const Matrix<T,3> & mom, const int dir, const int idx, const U coeff, const int stride, T* const array)
  {
    typedef typename T::value_type real;
    T temp2;
    // averaged (0,1) element
    temp2.x = (mom.data[1].x - mom.data[3].x)*0.5*coeff;
    temp2.y = (mom.data[1].y + mom.data[3].y)*0.5*coeff;
    array[idx + dir*stride*5] = temp2;

    // averaged (0,2) element
    temp2.x = (mom.data[2].x - mom.data[6].x)*0.5*coeff;
    temp2.y = (mom.data[2].y + mom.data[6].y)*0.5*coeff;
    array[idx + dir*stride*5 + stride] = temp2;

    // averaged (1,2) element
    temp2.x = (mom.data[5].x - mom.data[7].x)*0.5*coeff;
    temp2.y = (mom.data[5].y + mom.data[7].y)*0.5*coeff;
    array[idx + dir*stride*5 + stride*2] = temp2;

    // mean imaginary trace, subtracted to store a traceless diagonal
    const real temp = (mom.data[0].y + mom.data[4].y + mom.data[8].y)*0.3333333333333333333333333;
    temp2.x = (mom.data[0].y-temp)*coeff;
    temp2.y = (mom.data[4].y-temp)*coeff;
    array[idx + dir*stride*5 + stride*3] = temp2;

    temp2.x = (mom.data[8].y - temp)*coeff;
    temp2.y = 0.0;
    array[idx + dir*stride*5 + stride*4] = temp2;

    return;
  }
1018 
1019 
1020 
1021  template<class Cmplx>
1022  __device__ __host__ inline
1024  {
1025 
1026  const Cmplx & det = getDeterminant(u);
1027  const Cmplx & det_inv = static_cast<typename Cmplx::value_type>(1.0)/det;
1028 
1029  Cmplx temp;
1030 
1031  temp = u(1,1)*u(2,2) - u(1,2)*u(2,1);
1032  (*uinv)(0,0) = (det_inv*temp);
1033 
1034  temp = u(0,2)*u(2,1) - u(0,1)*u(2,2);
1035  (*uinv)(0,1) = (temp*det_inv);
1036 
1037  temp = u(0,1)*u(1,2) - u(0,2)*u(1,1);
1038  (*uinv)(0,2) = (temp*det_inv);
1039 
1040  temp = u(1,2)*u(2,0) - u(1,0)*u(2,2);
1041  (*uinv)(1,0) = (det_inv*temp);
1042 
1043  temp = u(0,0)*u(2,2) - u(0,2)*u(2,0);
1044  (*uinv)(1,1) = (temp*det_inv);
1045 
1046  temp = u(0,2)*u(1,0) - u(0,0)*u(1,2);
1047  (*uinv)(1,2) = (temp*det_inv);
1048 
1049  temp = u(1,0)*u(2,1) - u(1,1)*u(2,0);
1050  (*uinv)(2,0) = (det_inv*temp);
1051 
1052  temp = u(0,1)*u(2,0) - u(0,0)*u(2,1);
1053  (*uinv)(2,1) = (temp*det_inv);
1054 
1055  temp = u(0,0)*u(1,1) - u(0,1)*u(1,0);
1056  (*uinv)(2,2) = (temp*det_inv);
1057 
1058  return;
1059  }
1060  // template this!
1061  inline void copyArrayToLink(Matrix<float2,3>* link, float* array){
1062 #pragma unroll
1063  for (int i=0; i<3; ++i){
1064 #pragma unroll
1065  for (int j=0; j<3; ++j){
1066  (*link)(i,j).x = array[(i*3+j)*2];
1067  (*link)(i,j).y = array[(i*3+j)*2 + 1];
1068  }
1069  }
1070  return;
1071  }
1072 
1073  template<class Cmplx, class Real>
1074  inline void copyArrayToLink(Matrix<Cmplx,3>* link, Real* array){
1075 #pragma unroll
1076  for (int i=0; i<3; ++i){
1077 #pragma unroll
1078  for (int j=0; j<3; ++j){
1079  (*link)(i,j).x = array[(i*3+j)*2];
1080  (*link)(i,j).y = array[(i*3+j)*2 + 1];
1081  }
1082  }
1083  return;
1084  }
1085 
1086 
1087  // and this!
1088  inline void copyLinkToArray(float* array, const Matrix<float2,3>& link){
1089 #pragma unroll
1090  for (int i=0; i<3; ++i){
1091 #pragma unroll
1092  for (int j=0; j<3; ++j){
1093  array[(i*3+j)*2] = link(i,j).x;
1094  array[(i*3+j)*2 + 1] = link(i,j).y;
1095  }
1096  }
1097  return;
1098  }
1099 
1100  // and this!
1101  template<class Cmplx, class Real>
1102  inline void copyLinkToArray(Real* array, const Matrix<Cmplx,3>& link){
1103 #pragma unroll
1104  for (int i=0; i<3; ++i){
1105 #pragma unroll
1106  for (int j=0; j<3; ++j){
1107  array[(i*3+j)*2] = link(i,j).x;
1108  array[(i*3+j)*2 + 1] = link(i,j).y;
1109  }
1110  }
1111  return;
1112  }
1113 
  /**
     @brief Return a copy of a with one third of its trace subtracted
     from each diagonal element (traceless projection).
   */
  template<class T>
  __device__ __host__ inline Matrix<T,3> getSubTraceUnit(const Matrix<T,3>& a){
    T tr = (a(0,0) + a(1,1) + a(2,2)) / 3.0;
    Matrix<T,3> res;
    res(0,0) = a(0,0) - tr; res(0,1) = a(0,1); res(0,2) = a(0,2);
    res(1,0) = a(1,0); res(1,1) = a(1,1) - tr; res(1,2) = a(1,2);
    res(2,0) = a(2,0); res(2,1) = a(2,1); res(2,2) = a(2,2) - tr;
    return res;
  }
1123 
  /**
     @brief In-place traceless projection: subtract one third of the
     trace from each diagonal element of a.
   */
  template<class T>
  __device__ __host__ inline void SubTraceUnit(Matrix<T,3>& a){
    T tr = (a(0,0) + a(1,1) + a(2,2)) / static_cast<T>(3.0);
    a(0,0) -= tr; a(1,1) -= tr; a(2,2) -= tr;
  }
1129 
1130  template<class T>
1131  __device__ __host__ inline double getRealTraceUVdagger(const Matrix<T,3>& a, const Matrix<T,3>& b){
1132  double sum = (double)(a(0,0).x * b(0,0).x + a(0,0).y * b(0,0).y);
1133  sum += (double)(a(0,1).x * b(0,1).x + a(0,1).y * b(0,1).y);
1134  sum += (double)(a(0,2).x * b(0,2).x + a(0,2).y * b(0,2).y);
1135  sum += (double)(a(1,0).x * b(1,0).x + a(1,0).y * b(1,0).y);
1136  sum += (double)(a(1,1).x * b(1,1).x + a(1,1).y * b(1,1).y);
1137  sum += (double)(a(1,2).x * b(1,2).x + a(1,2).y * b(1,2).y);
1138  sum += (double)(a(2,0).x * b(2,0).x + a(2,0).y * b(2,0).y);
1139  sum += (double)(a(2,1).x * b(2,1).x + a(2,1).y * b(2,1).y);
1140  sum += (double)(a(2,2).x * b(2,2).x + a(2,2).y * b(2,2).y);
1141  return sum;
1142  }
1143 
1144 
1145 
1146  // and this!
1147  template<class Cmplx>
1148  __host__ __device__ inline
1149  void printLink(const Matrix<Cmplx,3>& link){
1150  printf("(%lf, %lf)\t", link(0,0).x, link(0,0).y);
1151  printf("(%lf, %lf)\t", link(0,1).x, link(0,1).y);
1152  printf("(%lf, %lf)\n", link(0,2).x, link(0,2).y);
1153  printf("(%lf, %lf)\t", link(1,0).x, link(1,0).y);
1154  printf("(%lf, %lf)\t", link(1,1).x, link(1,1).y);
1155  printf("(%lf, %lf)\n", link(1,2).x, link(1,2).y);
1156  printf("(%lf, %lf)\t", link(2,0).x, link(2,0).y);
1157  printf("(%lf, %lf)\t", link(2,1).x, link(2,1).y);
1158  printf("(%lf, %lf)\n", link(2,2).x, link(2,2).y);
1159  printf("\n");
1160  }
1161 
  /**
     @brief Measure the deviation of a matrix from unitarity as the
     squared L2 norm of (U^dag U - I); (very close to) zero for a
     unitary matrix.
     @param[in] matrix Matrix to test
     @return ||U^dag U - I||^2_L2
   */
  template<class Cmplx>
  __device__ __host__
  double ErrorSU3(const Matrix<Cmplx,3>& matrix)
  {
    const Matrix<Cmplx,3> identity_comp = conj(matrix)*matrix;
    double error = 0.0;
    Cmplx temp(0,0);
    int i=0;
    int j=0;

    //error = ||U^dagger U - I||_L2
#pragma unroll
    for (i=0; i<3; ++i)
#pragma unroll
      for (j=0; j<3; ++j)
        if(i==j) {
          // diagonal: compare against 1
          temp = identity_comp(i,j);
          temp -= 1.0;
          error += norm(temp);
        }
        else {
          // off-diagonal: compare against 0
          error += norm(identity_comp(i,j));
        }
    //error is L2 norm, should be (very close) to zero.
    return error;
  }
1188 
  /**
     @brief Compute exp{iQ} of a 3x3 matrix via the Cayley-Hamilton
     theorem for SU(3), following the algorithm of
     http://arxiv.org/pdf/hep-lat/0311018v1.pdf.  Equation numbers in
     the paper are referenced by [eq_no] in the comments below.  The
     algorithm assumes Q is traceless Hermitian (so that exp{iQ} is
     special unitary) -- TODO confirm callers guarantee this.
     @param Q input matrix
     @param exp_iQ output: overwritten with f0*I + f1*Q + f2*Q^2 [19]
   */
  template<class T>
  __device__ __host__ inline
  void exponentiate_iQ(const Matrix<T,3>& Q, Matrix<T,3>* exp_iQ)
  {
    // Use Cayley-Hamilton Theorem for SU(3) exp{iQ}.
    // This algorithm is outlined in
    // http://arxiv.org/pdf/hep-lat/0311018v1.pdf
    // Equation numbers in the paper are referenced by [eq_no].

    //Declarations
    // undMatType is the underlying real scalar type of the complex elements
    typedef decltype(Q(0,0).x) undMatType;

    undMatType inv3 = 1.0/3.0;
    undMatType c0, c1, c0_max, Tr_re;
    undMatType f0_re, f0_im, f1_re, f1_im, f2_re, f2_im;
    undMatType theta;
    undMatType u_p, w_p; //u, w parameters.
    Matrix<T,3> temp1;
    Matrix<T,3> temp2;
    //[14] c0 = det(Q) = 1/3Tr(Q^3)
    const T & det_Q = getDeterminant(Q);
    c0 = det_Q.x;
    //[15] c1 = 1/2Tr(Q^2)
    // Q = Q^dag => Tr(Q^2) = Tr(QQ^dag) = sum_ab [Q_ab * Q_ab^*]
    temp1 = Q;
    temp1 = temp1 * Q;
    Tr_re = getTrace(temp1).x;
    c1 = 0.5*Tr_re;

    //We now have the coeffiecients c0 and c1.
    //We now find: exp(iQ) = f0*I + f1*Q + f2*Q^2
    // where fj = fj(c0,c1), j=0,1,2.

    //[17]
    c0_max = 2*pow(c1*inv3,1.5);
    // NOTE(review): no guard against c1 == 0 (then c0_max == 0 and the
    // acos argument below is ill-defined) -- presumably Q == 0 never
    // reaches this routine; verify against callers.

    //[25]
    theta = acos(c0/c0_max);
    //[23]
    u_p = sqrt(c1*inv3)*cos(theta*inv3);

    //[24]
    w_p = sqrt(c1)*sin(theta*inv3);

    //[29] Construct objects for fj = hj/(9u^2 - w^2).
    undMatType u_sq = u_p*u_p;
    undMatType w_sq = w_p*w_p;
    undMatType denom_inv = 1.0/(9*u_sq - w_sq);
    undMatType exp_iu_re = cos(u_p);
    undMatType exp_iu_im = sin(u_p);
    // exp(2iu) computed from exp(iu) via double-angle identities
    undMatType exp_2iu_re = exp_iu_re*exp_iu_re - exp_iu_im*exp_iu_im;
    undMatType exp_2iu_im = 2*exp_iu_re*exp_iu_im;
    undMatType cos_w = cos(w_p);
    undMatType sinc_w;
    undMatType hj_re = 0.0;
    undMatType hj_im = 0.0;

    //[33] Added one more term to the series given in the paper.
    // Taylor expansion of sin(w)/w near w = 0 avoids the 0/0 form.
    if (w_p < 0.05 && w_p > -0.05) {
      //1 - 1/6 x^2 (1 - 1/20 x^2 (1 - 1/42 x^2(1 - 1/72*x^2)))
      sinc_w = 1.0 - (w_sq/6.0)*(1 - (w_sq*0.05)*(1 - (w_sq/42.0)*(1 - (w_sq/72.0))));
    }
    else sinc_w = sin(w_p)/w_p;


    //[34] Test for c0 < 0.
    // Work with |c0| and record the sign; the fj are transformed back below.
    int parity = 0;
    if(c0 < 0) {
      c0 *= -1.0;
      parity = 1;
      //calculate fj with c0 > 0 and then convert all fj.
    }

    //Get all the numerators for fj,
    //[30] f0
    hj_re = (u_sq - w_sq)*exp_2iu_re + 8*u_sq*cos_w*exp_iu_re + 2*u_p*(3*u_sq + w_sq)*sinc_w*exp_iu_im;
    hj_im = (u_sq - w_sq)*exp_2iu_im - 8*u_sq*cos_w*exp_iu_im + 2*u_p*(3*u_sq + w_sq)*sinc_w*exp_iu_re;
    f0_re = hj_re*denom_inv;
    f0_im = hj_im*denom_inv;

    //[31] f1
    hj_re = 2*u_p*exp_2iu_re - 2*u_p*cos_w*exp_iu_re + (3*u_sq - w_sq)*sinc_w*exp_iu_im;
    hj_im = 2*u_p*exp_2iu_im + 2*u_p*cos_w*exp_iu_im + (3*u_sq - w_sq)*sinc_w*exp_iu_re;
    f1_re = hj_re*denom_inv;
    f1_im = hj_im*denom_inv;

    //[32] f2
    hj_re = exp_2iu_re - cos_w*exp_iu_re - 3*u_p*sinc_w*exp_iu_im;
    hj_im = exp_2iu_im + cos_w*exp_iu_im - 3*u_p*sinc_w*exp_iu_re;
    f2_re = hj_re*denom_inv;
    f2_im = hj_im*denom_inv;

    //[34] If c0 < 0, apply tranformation fj(-c0,c1) = (-1)^j f^*j(c0,c1)
    // j=0: conjugate (flip imag); j=1: conjugate and negate (flip real);
    // j=2: conjugate (flip imag).
    if (parity == 1) {
      f0_im *= -1.0;
      f1_re *= -1.0;
      f2_im *= -1.0;
    }

    // Pack the scalar fj coefficients into complex values
    T f0_c;
    T f1_c;
    T f2_c;

    f0_c.x = f0_re;
    f0_c.y = f0_im;

    f1_c.x = f1_re;
    f1_c.y = f1_im;

    f2_c.x = f2_re;
    f2_c.y = f2_im;

    //[19] Construct exp{iQ}
    setZero(exp_iQ);
    Matrix<T,3> UnitM;
    setIdentity(&UnitM);
    // +f0*I
    temp1 = f0_c * UnitM;
    *exp_iQ = temp1;

    // +f1*Q
    temp1 = f1_c * Q;
    *exp_iQ += temp1;

    // +f2*Q^2
    temp1 = Q * Q;
    temp2 = f2_c * temp1;
    *exp_iQ += temp2;

    //exp(iQ) is now defined.
    return;
  }
1321 
  /**
     @brief In-place matrix exponential of a 3x3 complex matrix:
     q <- exp(q).  The matrix is indexed linearly as q(0)..q(8).  The
     three eigenvalues w1[k] appear to be obtained in closed form from
     the characteristic cubic (a Cardano-style solution using the cube
     roots of unity r1, r2 -- TODO confirm); right (wr*) and left
     (wl*) eigenvector components are then built and exp(q) is
     reassembled from exp(w1[k]) weighted by the normalized
     eigen-projectors.
     NOTE(review): several divisions below (by cp, q(2), q(6), and
     z2 + al*q(6)) are unguarded -- presumably callers supply generic
     (non-degenerate) matrices; verify.
     @param q matrix to exponentiate in place
   */
  template <typename Float> __device__ __host__ void expsu3(Matrix<complex<Float>, 3> &q)
  {
    typedef complex<Float> Complex;

    // Invariants of q entering the characteristic cubic
    Complex a2 = (q(3) * q(1) + q(7) * q(5) + q(6) * q(2) - (q(0) * q(4) + (q(0) + q(4)) * q(8))) / (Float)3.0;
    Complex a3 = q(0) * q(4) * q(8) + q(1) * q(5) * q(6) + q(2) * q(3) * q(7) - q(6) * q(4) * q(2)
        - q(3) * q(1) * q(8) - q(0) * q(7) * q(5);

    // Discriminant-like quantity and the complex cube root cp (cm = a2/cp)
    Complex sg2h3 = sqrt(a3 * a3 - (Float)4. * a2 * a2 * a2);
    Complex cp = exp(log((Float)0.5 * (a3 + sg2h3)) / (Float)3.0);
    Complex cm = a2 / cp;

    // Primitive cube roots of unity, exp(+-2*pi*i/3)
    Complex r1 = exp(Complex(0.0, 1.0) * (Float)(2.0 * M_PI / 3.0));
    Complex r2 = exp(-Complex(0.0, 1.0) * (Float)(2.0 * M_PI / 3.0));

    // w1[k]: the three roots (eigenvalues of q)
    Complex w1[3];

    w1[0] = cm + cp;
    w1[1] = r1 * cp + r2 * cm;
    w1[2] = r2 * cp + r1 * cm;
    Complex z1 = q(1) * q(6) - q(0) * q(7);
    Complex z2 = q(3) * q(7) - q(4) * q(6);

    // Right-eigenvector components wr2k, wr3k for each eigenvalue
    // (first component normalized to 1)
    Complex al = w1[0];
    Complex wr21 = (z1 + al * q(7)) / (z2 + al * q(6));
    Complex wr31 = (al - q(0) - wr21 * q(3)) / q(6);

    al = w1[1];
    Complex wr22 = (z1 + al * q(7)) / (z2 + al * q(6));
    Complex wr32 = (al - q(0) - wr22 * q(3)) / q(6);

    al = w1[2];
    Complex wr23 = (z1 + al * q(7)) / (z2 + al * q(6));
    Complex wr33 = (al - q(0) - wr23 * q(3)) / q(6);

    z1 = q(3) * q(2) - q(0) * q(5);
    z2 = q(1) * q(5) - q(4) * q(2);

    // Left-eigenvector components wl2k, wl3k for each eigenvalue
    al = w1[0];
    Complex wl21 = conj((z1 + al * q(5)) / (z2 + al * q(2)));
    Complex wl31 = conj((al - q(0) - conj(wl21) * q(1)) / q(2));

    al = w1[1];
    Complex wl22 = conj((z1 + al * q(5)) / (z2 + al * q(2)));
    Complex wl32 = conj((al - q(0) - conj(wl22) * q(1)) / q(2));

    al = w1[2];
    Complex wl23 = conj((z1 + al * q(5)) / (z2 + al * q(2)));
    Complex wl33 = conj((al - q(0) - conj(wl23) * q(1)) / q(2));

    // Normalization factors xnk = <left_k | right_k>
    Complex xn1 = (Float)1. + wr21 * conj(wl21) + wr31 * conj(wl31);
    Complex xn2 = (Float)1. + wr22 * conj(wl22) + wr32 * conj(wl32);
    Complex xn3 = (Float)1. + wr23 * conj(wl23) + wr33 * conj(wl33);

    // dk = exp(eigenvalue k); yij = column contributions of
    // exp(w_k) * right_k / xn_k
    Complex d1 = exp(w1[0]);
    Complex d2 = exp(w1[1]);
    Complex d3 = exp(w1[2]);
    Complex y11 = d1 / xn1;
    Complex y12 = d2 / xn2;
    Complex y13 = d3 / xn3;
    Complex y21 = wr21 * d1 / xn1;
    Complex y22 = wr22 * d2 / xn2;
    Complex y23 = wr23 * d3 / xn3;
    Complex y31 = wr31 * d1 / xn1;
    Complex y32 = wr32 * d2 / xn2;
    Complex y33 = wr33 * d3 / xn3;
    // Reassemble exp(q) = sum_k exp(w_k) |right_k><left_k| / xn_k,
    // overwriting q in place
    q(0) = y11 + y12 + y13;
    q(1) = y21 + y22 + y23;
    q(2) = y31 + y32 + y33;
    q(3) = y11 * conj(wl21) + y12 * conj(wl22) + y13 * conj(wl23);
    q(4) = y21 * conj(wl21) + y22 * conj(wl22) + y23 * conj(wl23);
    q(5) = y31 * conj(wl21) + y32 * conj(wl22) + y33 * conj(wl23);
    q(6) = y11 * conj(wl31) + y12 * conj(wl32) + y13 * conj(wl33);
    q(7) = y21 * conj(wl31) + y22 * conj(wl32) + y23 * conj(wl33);
    q(8) = y31 * conj(wl31) + y32 * conj(wl32) + y33 * conj(wl33);
  }
1401 
1402 } // end namespace quda
__host__ __device__ float4 operator-=(float4 &x, const float4 y)
Definition: float_vector.h:131
gauge_wrapper is an internal class that is used to wrap instances of gauge accessors, currying in a specific location on the field. The operator() accessors in gauge-field accessors return instances to this class, allowing us to then use operator overloading upon this class to interact with the Matrix class. As a result we can include gauge-field accessors directly in Matrix expressions in kernels without having to declare temporaries with explicit calls to the load/save methods in the gauge-field accessors.
__device__ __host__ bool isUnitary(double max_error) const
Definition: quda_matrix.h:209
__device__ __host__ double ErrorSU3(const Matrix< Cmplx, 3 > &matrix)
Definition: quda_matrix.h:1164
__device__ __host__ void setZero(Matrix< T, N > *m)
Definition: quda_matrix.h:702
__device__ __host__ constexpr int size() const
Definition: quda_matrix.h:74
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
__device__ static __host__ T val()
__device__ __host__ T max() const
Compute the absolute max element of the Hermitian matrix.
Definition: quda_matrix.h:381
__host__ __device__ ValueType exp(ValueType x)
Definition: complex_quda.h:96
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
T data[N *N]
Definition: quda_matrix.h:305
__device__ __host__ double getRealTraceUVdagger(const Matrix< T, 3 > &a, const Matrix< T, 3 > &b)
Definition: quda_matrix.h:1131
__device__ __host__ Matrix(const T data_[])
Definition: quda_matrix.h:92
double l2(float *a, float *b, int N)
Definition: new_half.cu:120
__device__ __host__ void operator=(const complex< T > &a)
Definition: quda_matrix.h:257
void Mat(sFloat *out, gFloat **link, sFloat *in, int daggerBit, int mu)
__device__ void writeMomentumToArray(const Matrix< T, 3 > &mom, const int dir, const int idx, const U coeff, const int stride, T *const array)
Definition: quda_matrix.h:991
int length[]
__host__ __device__ float2 operator*=(float2 &x, const float a)
Definition: float_vector.h:151
__host__ __device__ void sum(double &a, double &b)
Definition: blas_helper.cuh:62
__device__ __host__ uint64_t checksum() const
Definition: quda_matrix.h:198
__device__ __host__ void operator=(const Matrix< U, N > &b)
Definition: quda_matrix.h:121
__device__ void loadLinkVariableFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< U, 3 > *link)
Definition: quda_matrix.h:857
This is just a dummy structure we use for trove to define the required structure size.
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
Definition: quda_matrix.h:1149
void copyLinkToArray(float *array, const Matrix< float2, 3 > &link)
Definition: quda_matrix.h:1088
__device__ __host__ T const & operator()(int i, int j) const
Definition: quda_matrix.h:100
__device__ __host__ void outerProd(const Array< T, N > &a, const Array< T, N > &b, Matrix< T, N > *m)
Definition: quda_matrix.h:805
__device__ __host__ T const & operator[](int i) const
Definition: quda_matrix.h:779
wrapper class that enables us to write to Hmatrices in packed format
Definition: quda_matrix.h:249
__device__ __host__ complex< T > const operator()(int i, int j) const
Definition: quda_matrix.h:322
__host__ __device__ ValueType sin(ValueType x)
Definition: complex_quda.h:51
__device__ __host__ HMatrix()
Definition: quda_matrix.h:307
T data[N *N]
Definition: quda_matrix.h:72
gauge_ghost_wrapper is an internal class that is used to wrap instances of gauge ghost accessors...
__device__ static __host__ T val()
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator-(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor subtraction operator.
clover_wrapper is an internal class that is used to wrap instances of colorspinor accessors...
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:111
Specialized container for Hermitian matrices (e.g., used for wrapping clover matrices) ...
Definition: quda_matrix.h:61
__device__ __host__ real L2()
Compute the matrix L2 norm. We actually compute the Frobenius norm which is an upper bound on the L2 ...
Definition: quda_matrix.h:162
__device__ __host__ void copyColumn(const Matrix< T, N > &m, int c, Array< T, N > *a)
Definition: quda_matrix.h:793
Provides precision abstractions and defines the register precision given the storage precision using ...
std::complex< double > Complex
Definition: quda_internal.h:46
__device__ __host__ T & operator()(int i)
Definition: quda_matrix.h:114
__device__ void writeLinkVariableToArray(const Matrix< T, 3 > &link, const int dir, const int idx, const int stride, U *const array)
Definition: quda_matrix.h:926
__device__ __host__ void SubTraceUnit(Matrix< T, 3 > &a)
Definition: quda_matrix.h:1125
__device__ void loadMatrixFromArray(const T *const array, const int idx, const int stride, Matrix< U, N > *mat)
Definition: quda_matrix.h:869
__device__ __host__ real Linf()
Compute the matrix Linfinity norm - this is the maximum of the absolute row sums. ...
Definition: quda_matrix.h:179
void copyArrayToLink(Matrix< float2, 3 > *link, float *array)
Definition: quda_matrix.h:1061
__device__ __host__ void setIdentity(Matrix< T, N > *m)
Definition: quda_matrix.h:653
__device__ __host__ int index(int i, int j) const
Definition: quda_matrix.h:291
__host__ __device__ ValueType log(ValueType x)
Definition: complex_quda.h:101
static int index(int ndim, const int *dims, const int *x)
Definition: comm_common.cpp:32
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:415
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
Definition: quda_matrix.h:611
__device__ __host__ Matrix(const Matrix< U, N > &a)
Definition: quda_matrix.h:86
__shared__ float s[]
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator+(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor addition operator.
__device__ __host__ int index(int i, int j) const
Definition: quda_matrix.h:69
__device__ void appendMatrixToArray(const Matrix< complex< double >, 3 > &mat, const int idx, const int stride, double2 *const array)
Definition: quda_matrix.h:904
__device__ void loadMomentumFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *mom)
Definition: quda_matrix.h:955
__device__ __host__ Matrix< T, 3 > getSubTraceUnit(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:1115
__device__ __host__ T & operator()(int i, int j)
Definition: quda_matrix.h:104
__device__ __host__ void operator=(const HMatrix< U, N > &b)
Definition: quda_matrix.h:338
__device__ __host__ void computeLinkInverse(Matrix< Cmplx, 3 > *uinv, const Matrix< Cmplx, 3 > &u)
Definition: quda_matrix.h:1023
__device__ __host__ void print() const
Definition: quda_matrix.h:390
__device__ __host__ real L1()
Compute the matrix L1 norm - this is the maximum of the absolute column sums.
Definition: quda_matrix.h:143
__device__ __host__ HMatrix(const T data_[])
Definition: quda_matrix.h:317
__device__ __host__ HMatrix< T, N > square() const
Hermitian matrix square.
Definition: quda_matrix.h:353
__device__ __host__ Matrix()
Definition: quda_matrix.h:76
__device__ __host__ void makeAntiHerm(Matrix< Complex, N > &m)
Definition: quda_matrix.h:746
__device__ __host__ Matrix(const Matrix< T, N > &a)
Definition: quda_matrix.h:81
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:46
RealType< T >::type real
Definition: quda_matrix.h:66
__host__ __device__ ValueType abs(ValueType x)
Definition: complex_quda.h:125
__device__ __host__ HMatrix_wrapper(Hmat &mat, int i, int j, int idx)
Definition: quda_matrix.h:255
__device__ __host__ void operator+=(const complex< T > &a)
Definition: quda_matrix.h:269
__host__ __device__ float4 operator+=(float4 &x, const float4 y)
Definition: float_vector.h:96
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator*(const S &a, const ColorSpinor< Float, Nc, Ns > &x)
Compute the scalar-vector product y = a * x.
__host__ __device__ ValueType acos(ValueType x)
Definition: complex_quda.h:61
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
Definition: quda_matrix.h:422
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
__device__ __host__ void expsu3(Matrix< complex< Float >, 3 > &q)
Definition: quda_matrix.h:1325
QudaParity parity
Definition: covdev_test.cpp:54
__device__ __host__ void zero(vector_type< scalar, n > &v)
Definition: cub_helper.cuh:54
__device__ void writeMatrixToArray(const Matrix< T, N > &mat, const int idx, const int stride, U *const array)
Definition: quda_matrix.h:895
__device__ __host__ HMatrix(const HMatrix< T, N > &a)
Definition: quda_matrix.h:312
__device__ __host__ void exponentiate_iQ(const Matrix< T, 3 > &Q, Matrix< T, 3 > *exp_iQ)
Definition: quda_matrix.h:1191
__device__ __host__ T & operator[](int i)
Definition: quda_matrix.h:785
__device__ __host__ T const & operator()(int i) const
Definition: quda_matrix.h:108