QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
quda_matrix.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include <cstdio>
4 #include <cstdlib>
5 #include <iostream>
6 #include <iomanip>
7 
8 #include <register_traits.h>
9 #include <float_vector.h>
10 #include <complex_quda.h>
11 
12 namespace quda {
13 
14 
  /**
     @brief Trait providing the additive identity for type T; specialized
     below for the CUDA vector types float2 and double2.
   */
  template<class T>
  struct Zero
  {
    //static const T val;
    __device__ __host__ inline
    static T val();
  };
22 
23  template<>
24  __device__ __host__ inline
26  {
27  return make_float2(0.,0.);
28  }
29 
30  template<>
31  __device__ __host__ inline
33  {
34  return make_double2(0.,0.);
35  }
36 
37 
38 
  /**
     @brief Trait providing the multiplicative identity for type T;
     specialized below for float2 and double2 (interpreted as the
     complex value 1 + 0i).
   */
  template<class T>
  struct Identity
  {
    __device__ __host__ inline
    static T val();
  };
45 
46  template<>
47  __device__ __host__ inline
49  return make_float2(1.,0.);
50  }
51 
52  template<>
53  __device__ __host__ inline
55  return make_double2(1.,0.);
56  }
57 
  // Forward declarations of accessor/wrapper types defined in other headers;
  // Matrix and HMatrix below provide constructors/assignment from these.
  template<typename Float, typename T> struct gauge_wrapper;
  template<typename Float, typename T> struct gauge_ghost_wrapper;
  template<typename Float, typename T> struct clover_wrapper;
  template<typename T, int N> struct HMatrix;
62 
  /**
     @brief Generic dense N x N matrix with row-major storage, usable on
     both host and device.  T is typically a complex scalar type (e.g.
     complex<float>), with RealType<T>::type giving the underlying real
     scalar.
   */
  template<class T, int N>
  class Matrix
  {
    typedef typename RealType<T>::type real;

  private:
    // row-major linear index of element (i,j)
    __device__ __host__ inline int index(int i, int j) const { return i*N + j; }

  public:
    T data[N*N]; // matrix elements in row-major order

    __device__ __host__ constexpr int size() const { return N; }

    // default constructor: zero-initializes every element
    __device__ __host__ inline Matrix() {
#pragma unroll
      for (int i=0; i<N*N; i++) zero(data[i]);
    }

    __device__ __host__ inline Matrix(const Matrix<T,N> &a) {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = a.data[i];
    }

    // converting copy constructor: element-wise conversion from U to T
    template <class U> __device__ __host__ inline Matrix(const Matrix<U, N> &a)
    {
#pragma unroll
      for (int i = 0; i < N * N; i++) data[i] = a.data[i];
    }

    // construct from a flat array of N*N elements (assumed row-major)
    __device__ __host__ inline Matrix(const T data_[])
    {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = data_[i];
    }

    // construct from a packed Hermitian matrix (defined below HMatrix)
    __device__ __host__ inline Matrix(const HMatrix<real, N> &a);

    __device__ __host__ inline T const & operator()(int i, int j) const {
      return data[index(i,j)];
    }

    __device__ __host__ inline T & operator()(int i, int j) {
      return data[index(i,j)];
    }

    // single-index access: note the mapping i -> (i%N, i/N), i.e. this
    // walks the matrix in column-major order of the linear index
    __device__ __host__ inline T const & operator()(int i) const {
      int j = i % N;
      int k = i / N;
      return data[index(j,k)];
    }

    __device__ __host__ inline T& operator()(int i) {
      int j = i % N;
      int k = i / N;
      return data[index(j,k)];
    }

    // converting assignment: element-wise conversion from U to T
    template<class U>
    __device__ __host__ inline void operator=(const Matrix<U,N> & b) {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = b.data[i];
    }

    template<typename S>
    __device__ __host__ inline Matrix(const gauge_wrapper<real, S> &s);

    template<typename S>
    __device__ __host__ inline void operator=(const gauge_wrapper<real, S> &s);

    template<typename S>
    __device__ __host__ inline Matrix(const gauge_ghost_wrapper<real, S> &s);

    template<typename S>
    __device__ __host__ inline void operator=(const gauge_ghost_wrapper<real, S> &s);

    /**
       @brief Compute the matrix L1 norm: the maximum over columns of the
       sum of absolute values of the column's entries.
       @return Matrix L1 norm
     */
    __device__ __host__ inline real L1() {
      real l1 = 0;
#pragma unroll
      for (int j=0; j<N; j++) {
        real col_sum = 0;
#pragma unroll
        for (int i=0; i<N; i++) {
          col_sum += abs(data[i*N + j]);
        }
        l1 = col_sum > l1 ? col_sum : l1;
      }
      return l1;
    }

    /**
       @brief Compute the matrix L2 (Frobenius) norm: sqrt of the sum of
       |a_ij|^2 over all entries.
       @return Matrix L2 norm
     */
    __device__ __host__ inline real L2() {
      real l2 = 0;
#pragma unroll
      for (int j=0; j<N; j++) {
#pragma unroll
        for (int i=0; i<N; i++) {
          l2 += norm(data[i*N + j]);
        }
      }
      return sqrt(l2);
    }

    /**
       @brief Compute the matrix Linf norm: the maximum over rows of the
       sum of absolute values of the row's entries.
       @return Matrix Linf norm
     */
    __device__ __host__ inline real Linf() {
      real linf = 0;
#pragma unroll
      for (int i=0; i<N; i++) {
        real row_sum = 0;
#pragma unroll
        for (int j=0; j<N; j++) {
          row_sum += abs(data[i*N + j]);
        }
        linf = row_sum > linf ? row_sum : linf;
      }
      return linf;
    }

    /**
       @brief Compute a 64-bit checksum of the matrix by copying the raw
       element bytes into a zero-padded uint64_t buffer and XOR-folding
       the words together.
       @return 64-bit checksum of this matrix
     */
    __device__ __host__ inline uint64_t checksum() const {
      // ensure length is rounded up to 64-bit multiple
      constexpr int length = (N*N*sizeof(T) + sizeof(uint64_t) - 1)/ sizeof(uint64_t);
      uint64_t base_[length] = { };
      T *data_ = reinterpret_cast<T*>( static_cast<void*>(base_) );
      for (int i=0; i<N*N; i++) data_[i] = data[i];
      uint64_t checksum_ = base_[0];
      for (int i=1; i<length; i++) checksum_ ^= base_[i];
      return checksum_;
    }

    /**
       @brief Test whether this matrix is unitary to within the given
       tolerance, by checking conj(U)*U against the identity entry by
       entry, and additionally rejecting any matrix containing NaNs.
       @param[in] max_error Maximum permitted deviation from identity
       @return true if unitary (and NaN-free), false otherwise
     */
    __device__ __host__ inline bool isUnitary(double max_error) const
    {
      const auto identity = conj(*this) * *this;

#pragma unroll
      for (int i=0; i<N; ++i){
        if( fabs(identity(i,i).real() - 1.0) > max_error ||
            fabs(identity(i,i).imag()) > max_error) return false;

#pragma unroll
        for (int j=i+1; j<N; ++j){
          if( fabs(identity(i,j).real()) > max_error ||
              fabs(identity(i,j).imag()) > max_error ||
              fabs(identity(j,i).real()) > max_error ||
              fabs(identity(j,i).imag()) > max_error ){
            return false;
          }
        }
      }

      // also reject matrices containing NaNs
#pragma unroll
      for (int i=0; i<N; i++) {
#pragma unroll
        for (int j=0; j<N; j++) {
          if (std::isnan((*this)(i,j).real()) ||
              std::isnan((*this)(i,j).imag())) return false;
        }
      }

      return true;
    }

  };
242 
248  template <typename T, typename Hmat>
250  Hmat &mat;
251  const int i;
252  const int j;
253  const int idx;
254 
255  __device__ __host__ inline HMatrix_wrapper(Hmat &mat, int i, int j, int idx) : mat(mat), i(i), j(j), idx(idx) { }
256 
257  __device__ __host__ inline void operator=(const complex<T> &a) {
258  if (i==j) {
259  mat.data[idx] = a.real();
260  } else if (j<i) {
261  mat.data[idx+0] = a.real();
262  mat.data[idx+1] = a.imag();
263  } else {
264  mat.data[idx+0] = a.real();
265  mat.data[idx+1] = -a.imag();
266  }
267  }
268 
269  __device__ __host__ inline void operator+=(const complex<T> &a) {
270  if (i==j) {
271  mat.data[idx] += a.real();
272  } else if (j<i) {
273  mat.data[idx+0] += a.real();
274  mat.data[idx+1] += a.imag();
275  } else {
276  mat.data[idx+0] += a.real();
277  mat.data[idx+1] += -a.imag();
278  }
279  }
280  };
281 
  /**
     @brief Hermitian N x N matrix stored in a packed real-valued array of
     length N*N: the N real diagonal entries come first, followed by the
     off-diagonal elements stored as (real, imag) pairs; the opposite
     triangle is implied by conjugation.  T is the real scalar type.
   */
  template<class T, int N>
  class HMatrix
  {
  private:
    // compute index into triangular-packed Hermitian matrix
    __device__ __host__ inline int index(int i, int j) const {
      if (i==j) {
        return i;
      } else if (j<i) {
        int k = N*(N-1)/2 - (N-j)*(N-j-1)/2 + i - j - 1;
        return N + 2*k;
      } else { // i>j
        // switch coordinates to count from bottom right instead of top left of matrix
        int k = N*(N-1)/2 - (N-i)*(N-i-1)/2 + j - i - 1;
        return N + 2*k;
      }
    }

  public:
    T data[N*N]; // store in real-valued array

    // default constructor: zero-initializes the packed storage
    __device__ __host__ inline HMatrix() {
#pragma unroll
      for (int i=0; i<N*N; i++) zero(data[i]);
    }

    __device__ __host__ inline HMatrix(const HMatrix<T,N> &a) {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = a.data[i];
    }

    // construct from a flat array already in packed Hermitian layout
    __device__ __host__ inline HMatrix(const T data_[]) {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = data_[i];
    }

    // read access: reconstructs the complex element from packed storage;
    // the (i<j) case returns the conjugate of the stored lower-triangle pair
    __device__ __host__ inline complex<T> const operator()(int i, int j) const {
      const int idx = index(i,j);
      if (i==j) {
        return complex<T>(data[idx],0.0);
      } else if (j<i) {
        return complex<T>(data[idx], data[idx+1]);
      } else {
        return complex<T>(data[idx],-data[idx+1]);
      }
    }

    // write access: returns a proxy that knows how to store into the
    // packed layout (see HMatrix_wrapper)
    __device__ __host__ inline HMatrix_wrapper<T,HMatrix<T,N> > operator() (int i, int j) {
      return HMatrix_wrapper<T,HMatrix<T,N> >(*this, i, j, index(i,j));
    }

    // converting assignment (element-wise conversion U -> T)
    template<class U>
    __device__ __host__ inline void operator=(const HMatrix<U,N> & b) {
#pragma unroll
      for (int i=0; i<N*N; i++) data[i] = b.data[i];
    }

    template<typename S>
    __device__ __host__ inline HMatrix(const clover_wrapper<T, S> &s);

    template<typename S>
    __device__ __host__ inline void operator=(const clover_wrapper<T, S> &s);

    /**
       @brief Compute the square of this Hermitian matrix.  Only the
       upper triangle (i <= k) is computed explicitly; the rest follows
       from Hermiticity of the result.
       @return The squared matrix
     */
    __device__ __host__ inline HMatrix<T,N> square() const {
      HMatrix<T,N> result;
      complex<T> tmp;
#pragma unroll
      for (int i=0; i<N; i++) {
#pragma unroll
        for (int k=0; k<N; k++) if (i<=k) { // else compiler can't handle triangular unroll
          tmp.x = (*this)(i,0).real() * (*this)(0,k).real();
          tmp.x -= (*this)(i,0).imag() * (*this)(0,k).imag();
          tmp.y = (*this)(i,0).real() * (*this)(0,k).imag();
          tmp.y += (*this)(i,0).imag() * (*this)(0,k).real();
#pragma unroll
          for (int j=1; j<N; j++) {
            tmp.x += (*this)(i,j).real() * (*this)(j,k).real();
            tmp.x -= (*this)(i,j).imag() * (*this)(j,k).imag();
            tmp.y += (*this)(i,j).real() * (*this)(j,k).imag();
            tmp.y += (*this)(i,j).imag() * (*this)(j,k).real();
          }
          result(i,k) = tmp;
        }
      }
      return result;
    }

    /**
       @brief Return the maximum absolute value over the packed storage.
       @return Largest |element| of the packed array
     */
    __device__ __host__ inline T max() const
    {
      HMatrix<T, N> result; // NOTE(review): unused local — candidate for removal
      T max = static_cast<T>(0.0);
#pragma unroll
      for (int i = 0; i < N * N; i++) max = (abs(data[i]) > max ? abs(data[i]) : max);
      return max;
    }

    // debugging aid: print the reconstructed complex matrix row by row
    __device__ __host__ void print() const {
      for (int i=0; i<N; i++) {
        printf("i=%d ", i);
        for (int j=0; j<N; j++) {
          printf(" (%e, %e)", (*this)(i,j).real(), (*this)(i,j).imag());
        }
        printf("\n");
      }
      printf("\n");
    }

  };
402 
403  template<class T,int N>
404  __device__ __host__ Matrix<T,N>::Matrix(const HMatrix<typename RealType<T>::type,N> &a) {
405 #pragma unroll
406  for (int i=0; i<N; i++) {
407 #pragma unroll
408  for (int j=0; j<N; j++) {
409  (*this)(i,j) = a(i,j);
410  }
411  }
412  }
413 
414  template<class T>
415  __device__ __host__ inline T getTrace(const Matrix<T,3>& a)
416  {
417  return a(0,0) + a(1,1) + a(2,2);
418  }
419 
420 
421  template< template<typename,int> class Mat, class T>
422  __device__ __host__ inline T getDeterminant(const Mat<T,3> & a){
423 
424  T result;
425  result = a(0,0)*(a(1,1)*a(2,2) - a(2,1)*a(1,2))
426  - a(0,1)*(a(1,0)*a(2,2) - a(1,2)*a(2,0))
427  + a(0,2)*(a(1,0)*a(2,1) - a(1,1)*a(2,0));
428 
429  return result;
430  }
431 
432  template< template<typename,int> class Mat, class T, int N>
433  __device__ __host__ inline Mat<T,N> operator+(const Mat<T,N> & a, const Mat<T,N> & b)
434  {
435  Mat<T,N> result;
436 #pragma unroll
437  for (int i=0; i<N*N; i++) result.data[i] = a.data[i] + b.data[i];
438  return result;
439  }
440 
441 
442  template< template<typename,int> class Mat, class T, int N>
443  __device__ __host__ inline Mat<T,N> operator+=(Mat<T,N> & a, const Mat<T,N> & b)
444  {
445 #pragma unroll
446  for (int i=0; i<N*N; i++) a.data[i] += b.data[i];
447  return a;
448  }
449 
450  template< template<typename,int> class Mat, class T, int N>
451  __device__ __host__ inline Mat<T,N> operator+=(Mat<T,N> & a, const T & b)
452  {
453 #pragma unroll
454  for (int i=0; i<N; i++) a(i,i) += b;
455  return a;
456  }
457 
458  template< template<typename,int> class Mat, class T, int N>
459  __device__ __host__ inline Mat<T,N> operator-=(Mat<T,N> & a, const Mat<T,N> & b)
460  {
461 #pragma unroll
462  for (int i=0; i<N*N; i++) a.data[i] -= b.data[i];
463  return a;
464  }
465 
466  template< template<typename,int> class Mat, class T, int N>
467  __device__ __host__ inline Mat<T,N> operator-(const Mat<T,N> & a, const Mat<T,N> & b)
468  {
469  Mat<T,N> result;
470 #pragma unroll
471  for (int i=0; i<N*N; i++) result.data[i] = a.data[i] - b.data[i];
472  return result;
473  }
474 
475  template< template<typename,int> class Mat, class T, int N, class S>
476  __device__ __host__ inline Mat<T,N> operator*(const S & scalar, const Mat<T,N> & a){
477  Mat<T,N> result;
478 #pragma unroll
479  for (int i=0; i<N*N; ++i) result.data[i] = scalar*a.data[i];
480  return result;
481  }
482 
  // Scalar post-multiplication: delegates to the pre-multiply overload above.
  template< template<typename,int> class Mat, class T, int N, class S>
  __device__ __host__ inline Mat<T,N> operator*(const Mat<T,N> & a, const S & scalar){
    return scalar*a;
  }
487 
488  template< template<typename,int> class Mat, class T, int N, class S>
489  __device__ __host__ inline Mat<T,N> operator *=(Mat<T,N> & a, const S & scalar){
490  a = scalar*a;
491  return a;
492  }
493 
494  template< template<typename,int> class Mat, class T, int N>
495  __device__ __host__ inline Mat<T,N> operator-(const Mat<T,N> & a){
496  Mat<T,N> result;
497 #pragma unroll
498  for (int i=0; i<(N*N); ++i) result.data[i] = -a.data[i];
499  return result;
500  }
501 
502 
506  template< template<typename,int> class Mat, class T, int N>
507  __device__ __host__ inline Mat<T,N> operator*(const Mat<T,N> &a, const Mat<T,N> &b)
508  {
509  Mat<T,N> result;
510 #pragma unroll
511  for (int i=0; i<N; i++) {
512 #pragma unroll
513  for (int k=0; k<N; k++) {
514  result(i,k) = a(i,0) * b(0,k);
515 #pragma unroll
516  for (int j=1; j<N; j++) {
517  result(i,k) += a(i,j) * b(j,k);
518  }
519  }
520  }
521  return result;
522  }
523 
  /**
     @brief Specialization of complex matrix multiplication that avoids
     intermediate complex temporaries by accumulating the real (.x) and
     imaginary (.y) parts of each output element directly.
   */
  template< template<typename> class complex, typename T, int N>
  __device__ __host__ inline Matrix<complex<T>,N> operator*(const Matrix<complex<T>,N> &a, const Matrix<complex<T>,N> &b)
  {
    Matrix<complex<T>,N> result;
#pragma unroll
    for (int i=0; i<N; i++) {
#pragma unroll
      for (int k=0; k<N; k++) {
        // seed with the j==0 term: (a*b).re = re*re - im*im, (a*b).im = re*im + im*re
        result(i,k).x = a(i,0).real() * b(0,k).real();
        result(i,k).x -= a(i,0).imag() * b(0,k).imag();
        result(i,k).y = a(i,0).real() * b(0,k).imag();
        result(i,k).y += a(i,0).imag() * b(0,k).real();
#pragma unroll
        for (int j=1; j<N; j++) {
          result(i,k).x += a(i,j).real() * b(j,k).real();
          result(i,k).x -= a(i,j).imag() * b(j,k).imag();
          result(i,k).y += a(i,j).real() * b(j,k).imag();
          result(i,k).y += a(i,j).imag() * b(j,k).real();
        }
      }
    }
    return result;
  }
550 
551  template<class T, int N>
552  __device__ __host__ inline Matrix<T,N> operator *=(Matrix<T,N> & a, const Matrix<T,N>& b){
553 
554  Matrix<T,N> c = a;
555  a = c*b;
556  return a;
557  }
558 
559 
560  // This is so that I can multiply real and complex matrice
561  template<class T, class U, int N>
562  __device__ __host__ inline
564  {
566 #pragma unroll
567  for (int i=0; i<N; i++) {
568 #pragma unroll
569  for (int k=0; k<N; k++) {
570  result(i,k) = a(i,0) * b(0,k);
571 #pragma unroll
572  for (int j=1; j<N; j++) {
573  result(i,k) += a(i,j) * b(j,k);
574  }
575  }
576  }
577  return result;
578  }
579 
580 
581  template<class T>
582  __device__ __host__ inline
584  {
585  Matrix<T,2> result;
586  result(0,0) = a(0,0)*b(0,0) + a(0,1)*b(1,0);
587  result(0,1) = a(0,0)*b(0,1) + a(0,1)*b(1,1);
588  result(1,0) = a(1,0)*b(0,0) + a(1,1)*b(1,0);
589  result(1,1) = a(1,0)*b(0,1) + a(1,1)*b(1,1);
590  return result;
591  }
592 
593 
594  template<class T, int N>
595  __device__ __host__ inline
596  Matrix<T,N> conj(const Matrix<T,N> & other){
597  Matrix<T,N> result;
598 #pragma unroll
599  for (int i=0; i<N; ++i){
600 #pragma unroll
601  for (int j=0; j<N; ++j){
602  result(i,j) = conj( other(j,i) );
603  }
604  }
605  return result;
606  }
607 
608 
609  template<class T>
610  __device__ __host__ inline
612  {
613  const T det = getDeterminant(u);
614  const T det_inv = static_cast<typename T::value_type>(1.0)/det;
615  Matrix<T,3> uinv;
616 
617  T temp;
618 
619  temp = u(1,1)*u(2,2) - u(1,2)*u(2,1);
620  uinv(0,0) = (det_inv*temp);
621 
622  temp = u(0,2)*u(2,1) - u(0,1)*u(2,2);
623  uinv(0,1) = (temp*det_inv);
624 
625  temp = u(0,1)*u(1,2) - u(0,2)*u(1,1);
626  uinv(0,2) = (temp*det_inv);
627 
628  temp = u(1,2)*u(2,0) - u(1,0)*u(2,2);
629  uinv(1,0) = (det_inv*temp);
630 
631  temp = u(0,0)*u(2,2) - u(0,2)*u(2,0);
632  uinv(1,1) = (temp*det_inv);
633 
634  temp = u(0,2)*u(1,0) - u(0,0)*u(1,2);
635  uinv(1,2) = (temp*det_inv);
636 
637  temp = u(1,0)*u(2,1) - u(1,1)*u(2,0);
638  uinv(2,0) = (det_inv*temp);
639 
640  temp = u(0,1)*u(2,0) - u(0,0)*u(2,1);
641  uinv(2,1) = (temp*det_inv);
642 
643  temp = u(0,0)*u(1,1) - u(0,1)*u(1,0);
644  uinv(2,2) = (temp*det_inv);
645 
646  return uinv;
647  }
648 
649 
650 
651  template<class T, int N>
652  __device__ __host__ inline
654 
655 #pragma unroll
656  for (int i=0; i<N; ++i){
657  (*m)(i,i) = 1;
658 #pragma unroll
659  for (int j=i+1; j<N; ++j){
660  (*m)(i,j) = (*m)(j,i) = 0;
661  }
662  }
663  return;
664  }
665 
666 
667  template<int N>
668  __device__ __host__ inline
670 
671 #pragma unroll
672  for (int i=0; i<N; ++i){
673  (*m)(i,i) = make_float2(1,0);
674 #pragma unroll
675  for (int j=i+1; j<N; ++j){
676  (*m)(i,j) = (*m)(j,i) = make_float2(0.,0.);
677  }
678  }
679  return;
680  }
681 
682 
683  template<int N>
684  __device__ __host__ inline
686 
687 #pragma unroll
688  for (int i=0; i<N; ++i){
689  (*m)(i,i) = make_double2(1,0);
690 #pragma unroll
691  for (int j=i+1; j<N; ++j){
692  (*m)(i,j) = (*m)(j,i) = make_double2(0.,0.);
693  }
694  }
695  return;
696  }
697 
698 
699  // Need to write more generic code for this!
700  template<class T, int N>
701  __device__ __host__ inline
703 
704 #pragma unroll
705  for (int i=0; i<N; ++i){
706 #pragma unroll
707  for (int j=0; j<N; ++j){
708  (*m)(i,j) = 0;
709  }
710  }
711  return;
712  }
713 
714 
715  template<int N>
716  __device__ __host__ inline
718 
719 #pragma unroll
720  for (int i=0; i<N; ++i){
721 #pragma unroll
722  for (int j=0; j<N; ++j){
723  (*m)(i,j) = make_float2(0.,0.);
724  }
725  }
726  return;
727  }
728 
729 
730  template<int N>
731  __device__ __host__ inline
733 
734 #pragma unroll
735  for (int i=0; i<N; ++i){
736 #pragma unroll
737  for (int j=0; j<N; ++j){
738  (*m)(i,j) = make_double2(0.,0.);
739  }
740  }
741  return;
742  }
743 
744 
  /**
     @brief Project m onto the traceless anti-Hermitian part:
     m <- 0.5*( (m - m^dag) - (i/N) Tr[Im(m - m^dag)] I ).
     @param[in,out] m Matrix to project in place
   */
  template<typename Complex,int N>
  __device__ __host__ inline void makeAntiHerm(Matrix<Complex,N> &m) {
    typedef typename Complex::value_type real;
    // first make the matrix anti-hermitian
    Matrix<Complex,N> am = m - conj(m);

    // second make it traceless: only the imaginary (.y) diagonal parts
    // are nonzero for an anti-Hermitian matrix
    real imag_trace = 0.0;
#pragma unroll
    for (int i=0; i<N; i++) imag_trace += am(i,i).y;
#pragma unroll
    for (int i=0; i<N; i++) {
      am(i,i).y -= imag_trace/N;
    }
    // the 0.5 compensates for the doubling in (m - m^dag)
    m = 0.5*am;
  }
761 
762 
763 
764  // Matrix and array are very similar
765  // Maybe I should factor out the similar
766  // code. However, I want to make sure that
767  // the compiler knows to store the
768  // data elements in registers, so I won't do
769  // it right now.
  /**
     @brief Fixed-size array of N elements with simple indexed access,
     usable on host and device.
   */
  template<class T, int N>
  class Array
  {
  private:
    T data[N];

  public:
    // access function (read-only)
    __device__ __host__ inline
    T const & operator[](int i) const{
      return data[i];
    }

    // assignment function (mutable access)
    __device__ __host__ inline
    T & operator[](int i){
      return data[i];
    }
  };
789 
790 
791  template<class T, int N>
792  __device__ __host__ inline
793  void copyColumn(const Matrix<T,N>& m, int c, Array<T,N>* a)
794  {
795 #pragma unroll
796  for (int i=0; i<N; ++i){
797  (*a)[i] = m(i,c); // c is the column index
798  }
799  return;
800  }
801 
802 
803  template<class T, int N>
804  __device__ __host__ inline
805  void outerProd(const Array<T,N>& a, const Array<T,N> & b, Matrix<T,N>* m){
806 #pragma unroll
807  for (int i=0; i<N; ++i){
808  const T conjb_i = conj(b[i]);
809  for (int j=0; j<N; ++j){
810  (*m)(j,i) = a[j]*conjb_i; // we reverse the ordering of indices because it cuts down on the number of function calls
811  }
812  }
813  return;
814  }
815 
816  template<class T, int N>
817  __device__ __host__ inline
818  void outerProd(const T (&a)[N], const T (&b)[N], Matrix<T,N>* m){
819 #pragma unroll
820  for (int i=0; i<N; ++i){
821  const T conjb_i = conj(b[i]);
822 #pragma unroll
823  for (int j=0; j<N; ++j){
824  (*m)(j,i) = a[j]*conjb_i; // we reverse the ordering of indices because it cuts down on the number of function calls
825  }
826  }
827  return;
828  }
829 
830 
831  // Need some print utilities
832  template<class T, int N>
833  std::ostream & operator << (std::ostream & os, const Matrix<T,N> & m){
834 #pragma unroll
835  for (int i=0; i<N; ++i){
836 #pragma unroll
837  for (int j=0; j<N; ++j){
838  os << m(i,j) << " ";
839  }
840  if(i<N-1) os << std::endl;
841  }
842  return os;
843  }
844 
845 
846  template<class T, int N>
847  std::ostream & operator << (std::ostream & os, const Array<T,N> & a){
848  for (int i=0; i<N; ++i){
849  os << a[i] << " ";
850  }
851  return os;
852  }
853 
854 
  /**
     @brief Load a 3x3 link matrix from a strided gauge-field array.
     Element i of direction dir lives at array[idx + (dir*9 + i)*stride].
   */
  template<class T, class U>
  __device__ inline
  void loadLinkVariableFromArray(const T* const array, const int dir, const int idx, const int stride, Matrix<U,3> *link)
  {
#pragma unroll
    for (int i=0; i<9; ++i){
      link->data[i] = array[idx + (dir*9 + i)*stride];
    }
    return;
  }
865 
866 
867  template<class T, class U, int N>
868  __device__ inline
869  void loadMatrixFromArray(const T* const array, const int idx, const int stride, Matrix<U,N> *mat)
870  {
871 #pragma unroll
872  for (int i=0; i<(N*N); ++i){
873  mat->data[i] = array[idx + i*stride];
874  }
875  }
876 
877 
  /**
     @brief Mixed-precision overload: load a float2-stored link matrix
     into a complex<double> matrix, widening each component via an
     explicit temporary.
   */
  __device__ inline
  void loadLinkVariableFromArray(const float2* const array, const int dir, const int idx, const int stride, Matrix<complex<double>,3> *link)
  {
    float2 single_temp;
#pragma unroll
    for (int i=0; i<9; ++i){
      single_temp = array[idx + (dir*9 + i)*stride];
      link->data[i].x = single_temp.x;
      link->data[i].y = single_temp.y;
    }
    return;
  }
890 
891 
892 
893  template<class T, int N, class U>
894  __device__ inline
895  void writeMatrixToArray(const Matrix<T,N>& mat, const int idx, const int stride, U* const array)
896  {
897 #pragma unroll
898  for (int i=0; i<(N*N); ++i){
899  array[idx + i*stride] = mat.data[i];
900  }
901  }
902 
  /**
     @brief Accumulate (+=) a complex<double> 3x3 matrix into a strided
     double2 array, component by component.
   */
  __device__ inline
  void appendMatrixToArray(const Matrix<complex<double>,3>& mat, const int idx, const int stride, double2* const array)
  {
#pragma unroll
    for (int i=0; i<9; ++i){
      array[idx + i*stride].x += mat.data[i].x;
      array[idx + i*stride].y += mat.data[i].y;
    }
  }
912 
  /**
     @brief Accumulate (+=) a complex<float> 3x3 matrix into a strided
     float2 array, component by component.
   */
  __device__ inline
  void appendMatrixToArray(const Matrix<complex<float>,3>& mat, const int idx, const int stride, float2* const array)
  {
#pragma unroll
    for (int i=0; i<9; ++i){
      array[idx + i*stride].x += mat.data[i].x;
      array[idx + i*stride].y += mat.data[i].y;
    }
  }
922 
923 
924  template<class T, class U>
925  __device__ inline
926  void writeLinkVariableToArray(const Matrix<T,3> & link, const int dir, const int idx, const int stride, U* const array)
927  {
928 #pragma unroll
929  for (int i=0; i<9; ++i){
930  array[idx + (dir*9 + i)*stride] = link.data[i];
931  }
932  return;
933  }
934 
935 
936 
937 
  /**
     @brief Mixed-precision overload: store a complex<double> link matrix
     into a float2 array, narrowing each component via an explicit
     temporary.
   */
  __device__ inline
  void writeLinkVariableToArray(const Matrix<complex<double>,3> & link, const int dir, const int idx, const int stride, float2* const array)
  {
    float2 single_temp;

#pragma unroll
    for (int i=0; i<9; ++i){
      single_temp.x = link.data[i].x;
      single_temp.y = link.data[i].y;
      array[idx + (dir*9 + i)*stride] = single_temp;
    }
    return;
  }
951 
952 
  /**
     @brief Reconstruct a full anti-Hermitian 3x3 momentum matrix from its
     compressed 5-element-per-direction representation: three off-diagonal
     complex pairs plus the (purely imaginary) diagonal packed into the
     remaining slots.  Lower-triangle elements are rebuilt as minus the
     conjugate of the corresponding upper-triangle elements.
   */
  template<class T>
  __device__ inline
  void loadMomentumFromArray(const T* const array, const int dir, const int idx, const int stride, Matrix<T,3> *mom)
  {
    // the five packed elements for this direction
    T temp2[5];
    temp2[0] = array[idx + dir*stride*5];
    temp2[1] = array[idx + dir*stride*5 + stride];
    temp2[2] = array[idx + dir*stride*5 + 2*stride];
    temp2[3] = array[idx + dir*stride*5 + 3*stride];
    temp2[4] = array[idx + dir*stride*5 + 4*stride];

    // diagonal entries are purely imaginary (anti-Hermitian)
    mom->data[0].x = 0.;
    mom->data[0].y = temp2[3].x;
    mom->data[1] = temp2[0];
    mom->data[2] = temp2[1];

    // data[3] = (1,0) element = -conj(data[1])
    mom->data[3].x = -mom->data[1].x;
    mom->data[3].y = mom->data[1].y;
    mom->data[4].x = 0.;
    mom->data[4].y = temp2[3].y;
    mom->data[5] = temp2[2];

    // data[6] = (2,0) element = -conj(data[2])
    mom->data[6].x = -mom->data[2].x;
    mom->data[6].y = mom->data[2].y;

    // data[7] = (2,1) element = -conj(data[5])
    mom->data[7].x = -mom->data[5].x;
    mom->data[7].y = mom->data[5].y;

    mom->data[8].x = 0.;
    mom->data[8].y = temp2[4].x;

    return;
  }
986 
987 
988 
  /**
     @brief Compress a 3x3 momentum matrix (assumed anti-Hermitian up to
     rounding) into five strided elements per direction, scaling by
     coeff.  Off-diagonal pairs are averaged with their mirror elements
     (0.5 factor), and the diagonal is stored traceless by subtracting
     the mean imaginary trace.
   */
  template<class T, class U>
  __device__ inline
  void writeMomentumToArray(const Matrix<T,3> & mom, const int dir, const int idx, const U coeff, const int stride, T* const array)
  {
    typedef typename T::value_type real;
    T temp2;
    // averaged (0,1) element
    temp2.x = (mom.data[1].x - mom.data[3].x)*0.5*coeff;
    temp2.y = (mom.data[1].y + mom.data[3].y)*0.5*coeff;
    array[idx + dir*stride*5] = temp2;

    // averaged (0,2) element
    temp2.x = (mom.data[2].x - mom.data[6].x)*0.5*coeff;
    temp2.y = (mom.data[2].y + mom.data[6].y)*0.5*coeff;
    array[idx + dir*stride*5 + stride] = temp2;

    // averaged (1,2) element
    temp2.x = (mom.data[5].x - mom.data[7].x)*0.5*coeff;
    temp2.y = (mom.data[5].y + mom.data[7].y)*0.5*coeff;
    array[idx + dir*stride*5 + stride*2] = temp2;

    // mean imaginary trace, subtracted to store a traceless diagonal
    const real temp = (mom.data[0].y + mom.data[4].y + mom.data[8].y)*0.3333333333333333333333333;
    temp2.x = (mom.data[0].y-temp)*coeff;
    temp2.y = (mom.data[4].y-temp)*coeff;
    array[idx + dir*stride*5 + stride*3] = temp2;

    temp2.x = (mom.data[8].y - temp)*coeff;
    temp2.y = 0.0;
    array[idx + dir*stride*5 + stride*4] = temp2;

    return;
  }
1018 
1019 
1020 
1021  template<class Cmplx>
1022  __device__ __host__ inline
1024  {
1025 
1026  const Cmplx & det = getDeterminant(u);
1027  const Cmplx & det_inv = static_cast<typename Cmplx::value_type>(1.0)/det;
1028 
1029  Cmplx temp;
1030 
1031  temp = u(1,1)*u(2,2) - u(1,2)*u(2,1);
1032  (*uinv)(0,0) = (det_inv*temp);
1033 
1034  temp = u(0,2)*u(2,1) - u(0,1)*u(2,2);
1035  (*uinv)(0,1) = (temp*det_inv);
1036 
1037  temp = u(0,1)*u(1,2) - u(0,2)*u(1,1);
1038  (*uinv)(0,2) = (temp*det_inv);
1039 
1040  temp = u(1,2)*u(2,0) - u(1,0)*u(2,2);
1041  (*uinv)(1,0) = (det_inv*temp);
1042 
1043  temp = u(0,0)*u(2,2) - u(0,2)*u(2,0);
1044  (*uinv)(1,1) = (temp*det_inv);
1045 
1046  temp = u(0,2)*u(1,0) - u(0,0)*u(1,2);
1047  (*uinv)(1,2) = (temp*det_inv);
1048 
1049  temp = u(1,0)*u(2,1) - u(1,1)*u(2,0);
1050  (*uinv)(2,0) = (det_inv*temp);
1051 
1052  temp = u(0,1)*u(2,0) - u(0,0)*u(2,1);
1053  (*uinv)(2,1) = (temp*det_inv);
1054 
1055  temp = u(0,0)*u(1,1) - u(0,1)*u(1,0);
1056  (*uinv)(2,2) = (temp*det_inv);
1057 
1058  return;
1059  }
1060  // template this!
1061  inline void copyArrayToLink(Matrix<float2,3>* link, float* array){
1062 #pragma unroll
1063  for (int i=0; i<3; ++i){
1064 #pragma unroll
1065  for (int j=0; j<3; ++j){
1066  (*link)(i,j).x = array[(i*3+j)*2];
1067  (*link)(i,j).y = array[(i*3+j)*2 + 1];
1068  }
1069  }
1070  return;
1071  }
1072 
1073  template<class Cmplx, class Real>
1074  inline void copyArrayToLink(Matrix<Cmplx,3>* link, Real* array){
1075 #pragma unroll
1076  for (int i=0; i<3; ++i){
1077 #pragma unroll
1078  for (int j=0; j<3; ++j){
1079  (*link)(i,j).x = array[(i*3+j)*2];
1080  (*link)(i,j).y = array[(i*3+j)*2 + 1];
1081  }
1082  }
1083  return;
1084  }
1085 
1086 
1087  // and this!
1088  inline void copyLinkToArray(float* array, const Matrix<float2,3>& link){
1089 #pragma unroll
1090  for (int i=0; i<3; ++i){
1091 #pragma unroll
1092  for (int j=0; j<3; ++j){
1093  array[(i*3+j)*2] = link(i,j).x;
1094  array[(i*3+j)*2 + 1] = link(i,j).y;
1095  }
1096  }
1097  return;
1098  }
1099 
1100  // and this!
1101  template<class Cmplx, class Real>
1102  inline void copyLinkToArray(Real* array, const Matrix<Cmplx,3>& link){
1103 #pragma unroll
1104  for (int i=0; i<3; ++i){
1105 #pragma unroll
1106  for (int j=0; j<3; ++j){
1107  array[(i*3+j)*2] = link(i,j).x;
1108  array[(i*3+j)*2 + 1] = link(i,j).y;
1109  }
1110  }
1111  return;
1112  }
1113 
  /**
     @brief Return a copy of a with one third of its trace subtracted
     from each diagonal element (traceless projection).
   */
  template<class T>
  __device__ __host__ inline Matrix<T,3> getSubTraceUnit(const Matrix<T,3>& a){
    T tr = (a(0,0) + a(1,1) + a(2,2)) / 3.0;
    Matrix<T,3> res;
    res(0,0) = a(0,0) - tr; res(0,1) = a(0,1); res(0,2) = a(0,2);
    res(1,0) = a(1,0); res(1,1) = a(1,1) - tr; res(1,2) = a(1,2);
    res(2,0) = a(2,0); res(2,1) = a(2,1); res(2,2) = a(2,2) - tr;
    return res;
  }
1123 
  /**
     @brief In-place traceless projection: subtract one third of the
     trace from each diagonal element of a.
   */
  template<class T>
  __device__ __host__ inline void SubTraceUnit(Matrix<T,3>& a){
    T tr = (a(0,0) + a(1,1) + a(2,2)) / static_cast<T>(3.0);
    a(0,0) -= tr; a(1,1) -= tr; a(2,2) -= tr;
  }
1129 
1130  template<class T>
1131  __device__ __host__ inline double getRealTraceUVdagger(const Matrix<T,3>& a, const Matrix<T,3>& b){
1132  double sum = (double)(a(0,0).x * b(0,0).x + a(0,0).y * b(0,0).y);
1133  sum += (double)(a(0,1).x * b(0,1).x + a(0,1).y * b(0,1).y);
1134  sum += (double)(a(0,2).x * b(0,2).x + a(0,2).y * b(0,2).y);
1135  sum += (double)(a(1,0).x * b(1,0).x + a(1,0).y * b(1,0).y);
1136  sum += (double)(a(1,1).x * b(1,1).x + a(1,1).y * b(1,1).y);
1137  sum += (double)(a(1,2).x * b(1,2).x + a(1,2).y * b(1,2).y);
1138  sum += (double)(a(2,0).x * b(2,0).x + a(2,0).y * b(2,0).y);
1139  sum += (double)(a(2,1).x * b(2,1).x + a(2,1).y * b(2,1).y);
1140  sum += (double)(a(2,2).x * b(2,2).x + a(2,2).y * b(2,2).y);
1141  return sum;
1142  }
1143 
1144 
1145 
1146  // and this!
1147  template<class Cmplx>
1148  __host__ __device__ inline
1149  void printLink(const Matrix<Cmplx,3>& link){
1150  printf("(%lf, %lf)\t", link(0,0).x, link(0,0).y);
1151  printf("(%lf, %lf)\t", link(0,1).x, link(0,1).y);
1152  printf("(%lf, %lf)\n", link(0,2).x, link(0,2).y);
1153  printf("(%lf, %lf)\t", link(1,0).x, link(1,0).y);
1154  printf("(%lf, %lf)\t", link(1,1).x, link(1,1).y);
1155  printf("(%lf, %lf)\n", link(1,2).x, link(1,2).y);
1156  printf("(%lf, %lf)\t", link(2,0).x, link(2,0).y);
1157  printf("(%lf, %lf)\t", link(2,1).x, link(2,1).y);
1158  printf("(%lf, %lf)\n", link(2,2).x, link(2,2).y);
1159  printf("\n");
1160  }
1161 
  /**
     @brief Measure the deviation of a matrix from unitarity as the
     squared L2 norm of (U^dag U - I); (very close to) zero for a
     unitary matrix.
     @param[in] matrix Matrix to test
     @return ||U^dag U - I||^2_L2
   */
  template<class Cmplx>
  __device__ __host__
  double ErrorSU3(const Matrix<Cmplx,3>& matrix)
  {
    const Matrix<Cmplx,3> identity_comp = conj(matrix)*matrix;
    double error = 0.0;
    Cmplx temp(0,0);
    int i=0;
    int j=0;

    //error = ||U^dagger U - I||_L2
#pragma unroll
    for (i=0; i<3; ++i)
#pragma unroll
      for (j=0; j<3; ++j)
        if(i==j) {
          // diagonal: compare against 1
          temp = identity_comp(i,j);
          temp -= 1.0;
          error += norm(temp);
        }
        else {
          // off-diagonal: compare against 0
          error += norm(identity_comp(i,j));
        }
    //error is L2 norm, should be (very close) to zero.
    return error;
  }
1188 
  /**
     @brief Compute exp{iQ} of a 3x3 matrix via the Cayley-Hamilton
     theorem for SU(3), following the algorithm of
     http://arxiv.org/pdf/hep-lat/0311018v1.pdf.  Equation numbers in
     the paper are referenced by [eq_no] in the comments below.  The
     algorithm assumes Q is traceless Hermitian (so that exp{iQ} is
     special unitary) -- TODO confirm callers guarantee this.
     @param Q input matrix
     @param exp_iQ output: overwritten with f0*I + f1*Q + f2*Q^2 [19]
   */
  template<class T>
  __device__ __host__ inline
  void exponentiate_iQ(const Matrix<T,3>& Q, Matrix<T,3>* exp_iQ)
  {
    // Use Cayley-Hamilton Theorem for SU(3) exp{iQ}.
    // This algorithm is outlined in
    // http://arxiv.org/pdf/hep-lat/0311018v1.pdf
    // Equation numbers in the paper are referenced by [eq_no].

    //Declarations
    // undMatType is the underlying real scalar type of the complex elements
    typedef decltype(Q(0,0).x) undMatType;

    undMatType inv3 = 1.0/3.0;
    undMatType c0, c1, c0_max, Tr_re;
    undMatType f0_re, f0_im, f1_re, f1_im, f2_re, f2_im;
    undMatType theta;
    undMatType u_p, w_p; //u, w parameters.
    Matrix<T,3> temp1;
    Matrix<T,3> temp2;
    //[14] c0 = det(Q) = 1/3Tr(Q^3)
    const T & det_Q = getDeterminant(Q);
    c0 = det_Q.x;
    //[15] c1 = 1/2Tr(Q^2)
    // Q = Q^dag => Tr(Q^2) = Tr(QQ^dag) = sum_ab [Q_ab * Q_ab^*]
    temp1 = Q;
    temp1 = temp1 * Q;
    Tr_re = getTrace(temp1).x;
    c1 = 0.5*Tr_re;

    //We now have the coeffiecients c0 and c1.
    //We now find: exp(iQ) = f0*I + f1*Q + f2*Q^2
    // where fj = fj(c0,c1), j=0,1,2.

    //[17]
    c0_max = 2*pow(c1*inv3,1.5);
    // NOTE(review): no guard against c1 == 0 (then c0_max == 0 and the
    // acos argument below is ill-defined) -- presumably Q == 0 never
    // reaches this routine; verify against callers.

    //[25]
    theta = acos(c0/c0_max);
    //[23]
    u_p = sqrt(c1*inv3)*cos(theta*inv3);

    //[24]
    w_p = sqrt(c1)*sin(theta*inv3);

    //[29] Construct objects for fj = hj/(9u^2 - w^2).
    undMatType u_sq = u_p*u_p;
    undMatType w_sq = w_p*w_p;
    undMatType denom_inv = 1.0/(9*u_sq - w_sq);
    undMatType exp_iu_re = cos(u_p);
    undMatType exp_iu_im = sin(u_p);
    // exp(2iu) computed from exp(iu) via double-angle identities
    undMatType exp_2iu_re = exp_iu_re*exp_iu_re - exp_iu_im*exp_iu_im;
    undMatType exp_2iu_im = 2*exp_iu_re*exp_iu_im;
    undMatType cos_w = cos(w_p);
    undMatType sinc_w;
    undMatType hj_re = 0.0;
    undMatType hj_im = 0.0;

    //[33] Added one more term to the series given in the paper.
    // Taylor expansion of sin(w)/w near w = 0 avoids the 0/0 form.
    if (w_p < 0.05 && w_p > -0.05) {
      //1 - 1/6 x^2 (1 - 1/20 x^2 (1 - 1/42 x^2(1 - 1/72*x^2)))
      sinc_w = 1.0 - (w_sq/6.0)*(1 - (w_sq*0.05)*(1 - (w_sq/42.0)*(1 - (w_sq/72.0))));
    }
    else sinc_w = sin(w_p)/w_p;


    //[34] Test for c0 < 0.
    // Work with |c0| and record the sign; the fj are transformed back below.
    int parity = 0;
    if(c0 < 0) {
      c0 *= -1.0;
      parity = 1;
      //calculate fj with c0 > 0 and then convert all fj.
    }

    //Get all the numerators for fj,
    //[30] f0
    hj_re = (u_sq - w_sq)*exp_2iu_re + 8*u_sq*cos_w*exp_iu_re + 2*u_p*(3*u_sq + w_sq)*sinc_w*exp_iu_im;
    hj_im = (u_sq - w_sq)*exp_2iu_im - 8*u_sq*cos_w*exp_iu_im + 2*u_p*(3*u_sq + w_sq)*sinc_w*exp_iu_re;
    f0_re = hj_re*denom_inv;
    f0_im = hj_im*denom_inv;

    //[31] f1
    hj_re = 2*u_p*exp_2iu_re - 2*u_p*cos_w*exp_iu_re + (3*u_sq - w_sq)*sinc_w*exp_iu_im;
    hj_im = 2*u_p*exp_2iu_im + 2*u_p*cos_w*exp_iu_im + (3*u_sq - w_sq)*sinc_w*exp_iu_re;
    f1_re = hj_re*denom_inv;
    f1_im = hj_im*denom_inv;

    //[32] f2
    hj_re = exp_2iu_re - cos_w*exp_iu_re - 3*u_p*sinc_w*exp_iu_im;
    hj_im = exp_2iu_im + cos_w*exp_iu_im - 3*u_p*sinc_w*exp_iu_re;
    f2_re = hj_re*denom_inv;
    f2_im = hj_im*denom_inv;

    //[34] If c0 < 0, apply tranformation fj(-c0,c1) = (-1)^j f^*j(c0,c1)
    // j=0: conjugate (flip imag); j=1: conjugate and negate (flip real);
    // j=2: conjugate (flip imag).
    if (parity == 1) {
      f0_im *= -1.0;
      f1_re *= -1.0;
      f2_im *= -1.0;
    }

    // Pack the scalar fj coefficients into complex values
    T f0_c;
    T f1_c;
    T f2_c;

    f0_c.x = f0_re;
    f0_c.y = f0_im;

    f1_c.x = f1_re;
    f1_c.y = f1_im;

    f2_c.x = f2_re;
    f2_c.y = f2_im;

    //[19] Construct exp{iQ}
    setZero(exp_iQ);
    Matrix<T,3> UnitM;
    setIdentity(&UnitM);
    // +f0*I
    temp1 = f0_c * UnitM;
    *exp_iQ = temp1;

    // +f1*Q
    temp1 = f1_c * Q;
    *exp_iQ += temp1;

    // +f2*Q^2
    temp1 = Q * Q;
    temp2 = f2_c * temp1;
    *exp_iQ += temp2;

    //exp(iQ) is now defined.
    return;
  }
1321 
  /**
     @brief In-place matrix exponential of a 3x3 complex matrix:
     q <- exp(q).  The matrix is indexed linearly as q(0)..q(8).  The
     three eigenvalues w1[k] appear to be obtained in closed form from
     the characteristic cubic (a Cardano-style solution using the cube
     roots of unity r1, r2 -- TODO confirm); right (wr*) and left
     (wl*) eigenvector components are then built and exp(q) is
     reassembled from exp(w1[k]) weighted by the normalized
     eigen-projectors.
     NOTE(review): several divisions below (by cp, q(2), q(6), and
     z2 + al*q(6)) are unguarded -- presumably callers supply generic
     (non-degenerate) matrices; verify.
     @param q matrix to exponentiate in place
   */
  template <typename Float> __device__ __host__ void expsu3(Matrix<complex<Float>, 3> &q)
  {
    typedef complex<Float> Complex;

    // Invariants of q entering the characteristic cubic
    Complex a2 = (q(3) * q(1) + q(7) * q(5) + q(6) * q(2) - (q(0) * q(4) + (q(0) + q(4)) * q(8))) / (Float)3.0;
    Complex a3 = q(0) * q(4) * q(8) + q(1) * q(5) * q(6) + q(2) * q(3) * q(7) - q(6) * q(4) * q(2)
        - q(3) * q(1) * q(8) - q(0) * q(7) * q(5);

    // Discriminant-like quantity and the complex cube root cp (cm = a2/cp)
    Complex sg2h3 = sqrt(a3 * a3 - (Float)4. * a2 * a2 * a2);
    Complex cp = exp(log((Float)0.5 * (a3 + sg2h3)) / (Float)3.0);
    Complex cm = a2 / cp;

    // Primitive cube roots of unity, exp(+-2*pi*i/3)
    Complex r1 = exp(Complex(0.0, 1.0) * (Float)(2.0 * M_PI / 3.0));
    Complex r2 = exp(-Complex(0.0, 1.0) * (Float)(2.0 * M_PI / 3.0));

    // w1[k]: the three roots (eigenvalues of q)
    Complex w1[3];

    w1[0] = cm + cp;
    w1[1] = r1 * cp + r2 * cm;
    w1[2] = r2 * cp + r1 * cm;
    Complex z1 = q(1) * q(6) - q(0) * q(7);
    Complex z2 = q(3) * q(7) - q(4) * q(6);

    // Right-eigenvector components wr2k, wr3k for each eigenvalue
    // (first component normalized to 1)
    Complex al = w1[0];
    Complex wr21 = (z1 + al * q(7)) / (z2 + al * q(6));
    Complex wr31 = (al - q(0) - wr21 * q(3)) / q(6);

    al = w1[1];
    Complex wr22 = (z1 + al * q(7)) / (z2 + al * q(6));
    Complex wr32 = (al - q(0) - wr22 * q(3)) / q(6);

    al = w1[2];
    Complex wr23 = (z1 + al * q(7)) / (z2 + al * q(6));
    Complex wr33 = (al - q(0) - wr23 * q(3)) / q(6);

    z1 = q(3) * q(2) - q(0) * q(5);
    z2 = q(1) * q(5) - q(4) * q(2);

    // Left-eigenvector components wl2k, wl3k for each eigenvalue
    al = w1[0];
    Complex wl21 = conj((z1 + al * q(5)) / (z2 + al * q(2)));
    Complex wl31 = conj((al - q(0) - conj(wl21) * q(1)) / q(2));

    al = w1[1];
    Complex wl22 = conj((z1 + al * q(5)) / (z2 + al * q(2)));
    Complex wl32 = conj((al - q(0) - conj(wl22) * q(1)) / q(2));

    al = w1[2];
    Complex wl23 = conj((z1 + al * q(5)) / (z2 + al * q(2)));
    Complex wl33 = conj((al - q(0) - conj(wl23) * q(1)) / q(2));

    // Normalization factors xnk = <left_k | right_k>
    Complex xn1 = (Float)1. + wr21 * conj(wl21) + wr31 * conj(wl31);
    Complex xn2 = (Float)1. + wr22 * conj(wl22) + wr32 * conj(wl32);
    Complex xn3 = (Float)1. + wr23 * conj(wl23) + wr33 * conj(wl33);

    // dk = exp(eigenvalue k); yij = column contributions of
    // exp(w_k) * right_k / xn_k
    Complex d1 = exp(w1[0]);
    Complex d2 = exp(w1[1]);
    Complex d3 = exp(w1[2]);
    Complex y11 = d1 / xn1;
    Complex y12 = d2 / xn2;
    Complex y13 = d3 / xn3;
    Complex y21 = wr21 * d1 / xn1;
    Complex y22 = wr22 * d2 / xn2;
    Complex y23 = wr23 * d3 / xn3;
    Complex y31 = wr31 * d1 / xn1;
    Complex y32 = wr32 * d2 / xn2;
    Complex y33 = wr33 * d3 / xn3;
    // Reassemble exp(q) = sum_k exp(w_k) |right_k><left_k| / xn_k,
    // overwriting q in place
    q(0) = y11 + y12 + y13;
    q(1) = y21 + y22 + y23;
    q(2) = y31 + y32 + y33;
    q(3) = y11 * conj(wl21) + y12 * conj(wl22) + y13 * conj(wl23);
    q(4) = y21 * conj(wl21) + y22 * conj(wl22) + y23 * conj(wl23);
    q(5) = y31 * conj(wl21) + y32 * conj(wl22) + y33 * conj(wl23);
    q(6) = y11 * conj(wl31) + y12 * conj(wl32) + y13 * conj(wl33);
    q(7) = y21 * conj(wl31) + y22 * conj(wl32) + y23 * conj(wl33);
    q(8) = y31 * conj(wl31) + y32 * conj(wl32) + y33 * conj(wl33);
  }
1401 
1402 } // end namespace quda
__host__ __device__ float4 operator-=(float4 &x, const float4 y)
Definition: float_vector.h:131
gauge_wrapper is an internal class that is used to wrap instances of gauge accessors, currying in a specific location on the field. The operator() accessors in gauge-field accessors return instances to this class, allowing us to then use operator overloading upon this class to interact with the Matrix class. As a result we can include gauge-field accessors directly in Matrix expressions in kernels without having to declare temporaries with explicit calls to the load/save methods in the gauge-field accessors.
__device__ __host__ bool isUnitary(double max_error) const
Definition: quda_matrix.h:209
__device__ __host__ double ErrorSU3(const Matrix< Cmplx, 3 > &matrix)
Definition: quda_matrix.h:1164
__device__ __host__ void setZero(Matrix< T, N > *m)
Definition: quda_matrix.h:702
__device__ __host__ constexpr int size() const
Definition: quda_matrix.h:74
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
__device__ static __host__ T val()
__device__ __host__ T max() const
Compute the absolute max element of the Hermitian matrix.
Definition: quda_matrix.h:381
__host__ __device__ ValueType exp(ValueType x)
Definition: complex_quda.h:96
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
T data[N *N]
Definition: quda_matrix.h:305
__device__ __host__ double getRealTraceUVdagger(const Matrix< T, 3 > &a, const Matrix< T, 3 > &b)
Definition: quda_matrix.h:1131
__device__ __host__ Matrix(const T data_[])
Definition: quda_matrix.h:92
double l2(float *a, float *b, int N)
Definition: new_half.cu:120
__device__ __host__ void operator=(const complex< T > &a)
Definition: quda_matrix.h:257
void Mat(sFloat *out, gFloat **link, sFloat *in, int daggerBit, int mu)
__device__ void writeMomentumToArray(const Matrix< T, 3 > &mom, const int dir, const int idx, const U coeff, const int stride, T *const array)
Definition: quda_matrix.h:991
int length[]
__host__ __device__ float2 operator*=(float2 &x, const float a)
Definition: float_vector.h:151
__host__ __device__ void sum(double &a, double &b)
Definition: blas_helper.cuh:62
__device__ __host__ uint64_t checksum() const
Definition: quda_matrix.h:198
__device__ __host__ void operator=(const Matrix< U, N > &b)
Definition: quda_matrix.h:121
__device__ void loadLinkVariableFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< U, 3 > *link)
Definition: quda_matrix.h:857
This is just a dummy structure we use for trove to define the required structure size.
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
Definition: quda_matrix.h:1149
void copyLinkToArray(float *array, const Matrix< float2, 3 > &link)
Definition: quda_matrix.h:1088
__device__ __host__ T const & operator()(int i, int j) const
Definition: quda_matrix.h:100
__device__ __host__ void outerProd(const Array< T, N > &a, const Array< T, N > &b, Matrix< T, N > *m)
Definition: quda_matrix.h:805
__device__ __host__ T const & operator[](int i) const
Definition: quda_matrix.h:779
wrapper class that enables us to write to Hmatrices in packed format
Definition: quda_matrix.h:249
__device__ __host__ complex< T > const operator()(int i, int j) const
Definition: quda_matrix.h:322
__host__ __device__ ValueType sin(ValueType x)
Definition: complex_quda.h:51
__device__ __host__ HMatrix()
Definition: quda_matrix.h:307
T data[N *N]
Definition: quda_matrix.h:72
gauge_ghost_wrapper is an internal class that is used to wrap instances of gauge ghost accessors...
__device__ static __host__ T val()
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator-(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor subtraction operator.
clover_wrapper is an internal class that is used to wrap instances of colorspinor accessors...
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:111
Specialized container for Hermitian matrices (e.g., used for wrapping clover matrices) ...
Definition: quda_matrix.h:61
__device__ __host__ real L2()
Compute the matrix L2 norm. We actually compute the Frobenius norm which is an upper bound on the L2 ...
Definition: quda_matrix.h:162
__device__ __host__ void copyColumn(const Matrix< T, N > &m, int c, Array< T, N > *a)
Definition: quda_matrix.h:793
Provides precision abstractions and defines the register precision given the storage precision using ...
std::complex< double > Complex
Definition: quda_internal.h:46
__device__ __host__ T & operator()(int i)
Definition: quda_matrix.h:114
__device__ void writeLinkVariableToArray(const Matrix< T, 3 > &link, const int dir, const int idx, const int stride, U *const array)
Definition: quda_matrix.h:926
__device__ __host__ void SubTraceUnit(Matrix< T, 3 > &a)
Definition: quda_matrix.h:1125
__device__ void loadMatrixFromArray(const T *const array, const int idx, const int stride, Matrix< U, N > *mat)
Definition: quda_matrix.h:869
__device__ __host__ real Linf()
Compute the matrix Linfinity norm - this is the maximum of the absolute row sums. ...
Definition: quda_matrix.h:179
void copyArrayToLink(Matrix< float2, 3 > *link, float *array)
Definition: quda_matrix.h:1061
__device__ __host__ void setIdentity(Matrix< T, N > *m)
Definition: quda_matrix.h:653
__device__ __host__ int index(int i, int j) const
Definition: quda_matrix.h:291
__host__ __device__ ValueType log(ValueType x)
Definition: complex_quda.h:101
static int index(int ndim, const int *dims, const int *x)
Definition: comm_common.cpp:32
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:415
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
Definition: quda_matrix.h:611
__device__ __host__ Matrix(const Matrix< U, N > &a)
Definition: quda_matrix.h:86
__shared__ float s[]
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator+(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor addition operator.
__device__ __host__ int index(int i, int j) const
Definition: quda_matrix.h:69
__device__ void appendMatrixToArray(const Matrix< complex< double >, 3 > &mat, const int idx, const int stride, double2 *const array)
Definition: quda_matrix.h:904
__device__ void loadMomentumFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *mom)
Definition: quda_matrix.h:955
__device__ __host__ Matrix< T, 3 > getSubTraceUnit(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:1115
__device__ __host__ T & operator()(int i, int j)
Definition: quda_matrix.h:104
__device__ __host__ void operator=(const HMatrix< U, N > &b)
Definition: quda_matrix.h:338
__device__ __host__ void computeLinkInverse(Matrix< Cmplx, 3 > *uinv, const Matrix< Cmplx, 3 > &u)
Definition: quda_matrix.h:1023
__device__ __host__ void print() const
Definition: quda_matrix.h:390
__device__ __host__ real L1()
Compute the matrix L1 norm - this is the maximum of the absolute column sums.
Definition: quda_matrix.h:143
__device__ __host__ HMatrix(const T data_[])
Definition: quda_matrix.h:317
__device__ __host__ HMatrix< T, N > square() const
Hermitian matrix square.
Definition: quda_matrix.h:353
__device__ __host__ Matrix()
Definition: quda_matrix.h:76
__device__ __host__ void makeAntiHerm(Matrix< Complex, N > &m)
Definition: quda_matrix.h:746
__device__ __host__ Matrix(const Matrix< T, N > &a)
Definition: quda_matrix.h:81
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:46
RealType< T >::type real
Definition: quda_matrix.h:66
__host__ __device__ ValueType abs(ValueType x)
Definition: complex_quda.h:125
__device__ __host__ HMatrix_wrapper(Hmat &mat, int i, int j, int idx)
Definition: quda_matrix.h:255
__device__ __host__ void operator+=(const complex< T > &a)
Definition: quda_matrix.h:269
__host__ __device__ float4 operator+=(float4 &x, const float4 y)
Definition: float_vector.h:96
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator*(const S &a, const ColorSpinor< Float, Nc, Ns > &x)
Compute the scalar-vector product y = a * x.
__host__ __device__ ValueType acos(ValueType x)
Definition: complex_quda.h:61
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
Definition: quda_matrix.h:422
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
__device__ __host__ void expsu3(Matrix< complex< Float >, 3 > &q)
Definition: quda_matrix.h:1325
QudaParity parity
Definition: covdev_test.cpp:54
__device__ __host__ void zero(vector_type< scalar, n > &v)
Definition: cub_helper.cuh:54
__device__ void writeMatrixToArray(const Matrix< T, N > &mat, const int idx, const int stride, U *const array)
Definition: quda_matrix.h:895
__device__ __host__ HMatrix(const HMatrix< T, N > &a)
Definition: quda_matrix.h:312
__device__ __host__ void exponentiate_iQ(const Matrix< T, 3 > &Q, Matrix< T, 3 > *exp_iQ)
Definition: quda_matrix.h:1191
__device__ __host__ T & operator[](int i)
Definition: quda_matrix.h:785
__device__ __host__ T const & operator()(int i) const
Definition: quda_matrix.h:108