1 #ifndef _QUDA_MATRIX_H_
2 #define _QUDA_MATRIX_H_
62 template<
class T,
class U>
123 template<
class Cmplx>
124 __device__ __host__
inline
133 __device__ __host__
inline
135 return make_double2(a,b);
138 __device__ __host__
inline
140 return make_float2(a,b);
143 template<
class Cmplx>
144 __device__ __host__
inline Cmplx
operator-(
const Cmplx &a){
148 template<
class Cmplx>
149 __device__ __host__
inline Cmplx &
operator+=(Cmplx & a,
const Cmplx & b){
155 template<
class Cmplx>
156 __device__ __host__
inline Cmplx &
operator-=(Cmplx & a,
const Cmplx & b){
163 template<
class Cmplx>
164 __device__ __host__
inline Cmplx
operator+(
const Cmplx & a,
const Cmplx & b){
168 template<
class Cmplx>
169 __device__ __host__
inline Cmplx
operator-(
const Cmplx & a,
const Cmplx & b)
175 template<
class Cmplx>
176 __device__ __host__
inline Cmplx
operator*(
const Cmplx & a,
const typename RealTypeId<Cmplx>::Type & scalar)
181 template<
class Cmplx>
182 __device__ __host__
inline Cmplx
operator+(
const Cmplx & a,
const typename RealTypeId<Cmplx>::Type & scalar)
187 template<
class Cmplx>
188 __device__ __host__
inline Cmplx
operator*(
const typename RealTypeId<Cmplx>::Type & scalar,
const Cmplx & b)
193 __device__ __host__
inline double2
operator*(
const double2 & a,
const double & scalar)
198 __device__ __host__
inline float2
operator*(
const float2 & a,
const float & scalar)
203 template<
class Cmplx,
class Float>
204 __device__ __host__
inline Cmplx
operator+(
const Cmplx & a,
const Float & scalar)
220 template<
class Cmplx>
226 template<
class Cmplx>
232 template<
class Cmplx>
238 template<
class Cmplx>
244 template<
class Cmplx>
245 __device__ __host__
inline Cmplx
operator*(
const Cmplx & a,
const Cmplx & b)
247 return makeComplex(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
250 template<
class Cmplx>
251 __device__ __host__
inline Cmplx
conj(
const Cmplx & a)
256 __device__ __host__
inline double conj(
const double & a)
261 __device__ __host__
inline float conj(
const float & a)
266 template<
typename Cmplx>
267 __device__ __host__
inline Cmplx
Conj(
const Cmplx & a)
274 template<
class Cmplx>
275 __device__ __host__
inline
278 if( fabs(z.x) > fabs(z.y) ){ max = z.x; ratio = z.y/max; }
else{ max=z.y; ratio = z.x/max; }
279 denom = max*max*(1 + ratio*ratio);
285 inline std::ostream &
operator << (std::ostream & os,
const float2 & z){
286 os <<
"(" << z.x <<
"," << z.y <<
")";
290 inline std::ostream &
operator << (std::ostream & os,
const double2 & z){
291 os <<
"(" << z.x <<
"," << z.y <<
")";
301 __device__ __host__
inline
306 __device__ __host__
inline
309 return make_float2(0.,0.);
313 __device__ __host__
inline
316 return make_double2(0.,0.);
324 __device__ __host__
inline
329 __device__ __host__
inline
331 return make_float2(1.,0.);
335 __device__ __host__
inline
337 return make_double2(1.,0.);
341 __device__ __host__
inline
347 template<
class T,
int N>
353 __device__ __host__
inline T
const &
operator()(
int i,
int j)
const{
354 return data[index<N>(i,j)];
358 return data[index<N>(i,j)];
365 (
data[index<N>(j,k)]);
372 (
data[index<N>(j,k)]);
380 return a(0,0) + a(1,1) + a(2,2);
388 result = a(0,0)*(a(1,1)*a(2,2) - a(2,1)*a(1,2))
389 - a(0,1)*(a(1,0)*a(2,2) - a(1,2)*a(2,0))
390 + a(0,2)*(a(1,0)*a(2,1) - a(1,1)*a(2,0));
395 template<
class T,
int N>
399 for(
int i=0; i<N*N; i++){
406 template<
class T,
int N>
409 for(
int i=0; i<N*N; i++){
416 template<
class T,
int N>
419 for(
int i=0; i<N*N; i++){
426 template<
class T,
int N>
430 for(
int i=0; i<N*N; i++){
438 template<
class T,
int N,
class S>
441 for(
int i=0; i<N*N; ++i){
448 template<
class T,
int N,
class S>
453 template<
class T,
int N,
class S>
459 template<
class T,
int N>
462 for(
int i=0; i<(N*N); ++i){
471 __device__ __host__
inline
478 result(0,0) = a(0,0)*b(0,0) + a(0,1)*b(1,0) + a(0,2)*b(2,0);
479 result(0,1) = a(0,0)*b(0,1) + a(0,1)*b(1,1) + a(0,2)*b(2,1);
480 result(0,2) = a(0,0)*b(0,2) + a(0,1)*b(1,2) + a(0,2)*b(2,2);
481 result(1,0) = a(1,0)*b(0,0) + a(1,1)*b(1,0) + a(1,2)*b(2,0);
482 result(1,1) = a(1,0)*b(0,1) + a(1,1)*b(1,1) + a(1,2)*b(2,1);
483 result(1,2) = a(1,0)*b(0,2) + a(1,1)*b(1,2) + a(1,2)*b(2,2);
484 result(2,0) = a(2,0)*b(0,0) + a(2,1)*b(1,0) + a(2,2)*b(2,0);
485 result(2,1) = a(2,0)*b(0,1) + a(2,1)*b(1,1) + a(2,2)*b(2,1);
486 result(2,2) = a(2,0)*b(0,2) + a(2,1)*b(1,2) + a(2,2)*b(2,2);
490 template<
class T,
int N>
506 template<
class T,
class U>
507 __device__ __host__
inline
511 result(0,0) = a(0,0)*b(0,0) + a(0,1)*b(1,0) + a(0,2)*b(2,0);
512 result(0,1) = a(0,0)*b(0,1) + a(0,1)*b(1,1) + a(0,2)*b(2,1);
513 result(0,2) = a(0,0)*b(0,2) + a(0,1)*b(1,2) + a(0,2)*b(2,2);
514 result(1,0) = a(1,0)*b(0,0) + a(1,1)*b(1,0) + a(1,2)*b(2,0);
515 result(1,1) = a(1,0)*b(0,1) + a(1,1)*b(1,1) + a(1,2)*b(2,1);
516 result(1,2) = a(1,0)*b(0,2) + a(1,1)*b(1,2) + a(1,2)*b(2,2);
517 result(2,0) = a(2,0)*b(0,0) + a(2,1)*b(1,0) + a(2,2)*b(2,0);
518 result(2,1) = a(2,0)*b(0,1) + a(2,1)*b(1,1) + a(2,2)*b(2,1);
519 result(2,2) = a(2,0)*b(0,2) + a(2,1)*b(1,2) + a(2,2)*b(2,2);
526 __device__ __host__
inline
530 result(0,0) = a(0,0)*b(0,0) + a(0,1)*b(1,0);
531 result(0,1) = a(0,0)*b(0,1) + a(0,1)*b(1,1);
532 result(1,0) = a(1,0)*b(0,0) + a(1,1)*b(1,0);
533 result(1,1) = a(1,0)*b(0,1) + a(1,1)*b(1,1);
538 template<
class T,
int N>
539 __device__ __host__
inline
542 for(
int i=0; i<N; ++i){
543 for(
int j=0; j<N; ++j){
554 __device__ __host__
inline
563 temp = u(1,1)*u(2,2) - u(1,2)*u(2,1);
564 (*uinv)(0,0) = (det_inv*temp);
566 temp = u(0,2)*u(2,1) - u(0,1)*u(2,2);
567 (*uinv)(0,1) = (temp*det_inv);
569 temp = u(0,1)*u(1,2) - u(0,2)*u(1,1);
570 (*uinv)(0,2) = (temp*det_inv);
572 temp = u(1,2)*u(2,0) - u(1,0)*u(2,2);
573 (*uinv)(1,0) = (det_inv*temp);
575 temp = u(0,0)*u(2,2) - u(0,2)*u(2,0);
576 (*uinv)(1,1) = (temp*det_inv);
578 temp = u(0,2)*u(1,0) - u(0,0)*u(1,2);
579 (*uinv)(1,2) = (temp*det_inv);
581 temp = u(1,0)*u(2,1) - u(1,1)*u(2,0);
582 (*uinv)(2,0) = (det_inv*temp);
584 temp = u(0,1)*u(2,0) - u(0,0)*u(2,1);
585 (*uinv)(2,1) = (temp*det_inv);
587 temp = u(0,0)*u(1,1) - u(0,1)*u(1,0);
588 (*uinv)(2,2) = (temp*det_inv);
595 template<
class T,
int N>
596 __device__ __host__
inline
599 for(
int i=0; i<N; ++i){
601 for(
int j=i+1; j<N; ++j){
602 (*m)(i,j) = (*m)(j,i) = 0;
610 __device__ __host__
inline
613 for(
int i=0; i<N; ++i){
614 (*m)(i,i) = make_float2(1,0);
615 for(
int j=i+1; j<N; ++j){
616 (*m)(i,j) = (*m)(j,i) = make_float2(0.,0.);
624 __device__ __host__
inline
627 for(
int i=0; i<N; ++i){
628 (*m)(i,i) = make_double2(1,0);
629 for(
int j=i+1; j<N; ++j){
630 (*m)(i,j) = (*m)(j,i) = make_double2(0.,0.);
638 template<
class T,
int N>
639 __device__ __host__
inline
642 for(
int i=0; i<N; ++i){
643 for(
int j=0; j<N; ++j){
652 __device__ __host__
inline
655 for(
int i=0; i<N; ++i){
656 for(
int j=0; j<N; ++j){
657 (*m)(i,j) = make_float2(0.,0.);
665 __device__ __host__
inline
668 for(
int i=0; i<N; ++i){
669 for(
int j=0; j<N; ++j){
670 (*m)(i,j) = make_double2(0.,0.);
686 template<
class T,
int N>
694 __device__ __host__
inline
700 __device__ __host__
inline
707 template<
class T,
int N>
708 __device__ __host__
inline
711 for(
int i=0; i<N; ++i){
718 template<
class T,
int N>
719 __device__ __host__
inline
721 for(
int i=0; i<N; ++i){
722 const T conjb_i =
Conj(b[i]);
723 for(
int j=0; j<N; ++j){
724 (*m)(j,i) = a[j]*conjb_i;
730 template<
class T,
int N>
731 __device__ __host__
inline
733 for(
int i=0; i<N; ++i){
735 for(
int j=0; j<N; ++j){
736 (*m)(j,i) = a[j]*conjb_i;
744 template<
class T,
int N>
745 std::ostream & operator << (std::ostream & os, const Matrix<T,N> & m){
746 for(
int i=0; i<N; ++i){
747 for(
int j=0; j<N; ++j){
750 if(i<N-1) os << std::endl;
756 template<
class T,
int N>
757 std::ostream & operator << (std::ostream & os, const Array<T,N> & a){
758 for(
int i=0; i<N; ++i){
769 for(
int i=0; i<9; ++i){
770 link->
data[i] = array[idx + (dir*9 + i)*stride];
776 template<
class T,
int N>
780 for(
int i=0; i<(N*N); ++i){
781 mat->
data[i] = array[idx + i*stride];
790 for(
int i=0; i<9; ++i){
791 single_temp = array[idx + (dir*9 + i)*stride];
792 link->
data[i].x = single_temp.x;
793 link->
data[i].y = single_temp.y;
800 template<
class T,
int N>
804 for(
int i=0; i<(N*N); ++i){
805 array[idx + i*stride] = mat.
data[i];
812 for(
int i=0; i<9; ++i){
813 array[idx + i*stride].x += mat.
data[i].x;
814 array[idx + i*stride].y += mat.
data[i].y;
821 for(
int i=0; i<9; ++i){
822 array[idx + i*stride].x += mat.
data[i].x;
823 array[idx + i*stride].y += mat.
data[i].y;
832 for(
int i=0; i<9; ++i){
833 array[idx + (dir*9 + i)*stride] = link.
data[i];
846 for(
int i=0; i<9; ++i){
847 single_temp.x = link.
data[i].x;
848 single_temp.y = link.
data[i].y;
849 array[idx + (dir*9 + i)*stride] = single_temp;
860 temp2[0] = array[idx + dir*stride*5];
861 temp2[1] = array[idx + dir*stride*5 + stride];
862 temp2[2] = array[idx + dir*stride*5 + 2*stride];
863 temp2[3] = array[idx + dir*stride*5 + 3*stride];
864 temp2[4] = array[idx + dir*stride*5 + 4*stride];
867 mom->
data[0].y = temp2[3].x;
868 mom->
data[1] = temp2[0];
869 mom->
data[2] = temp2[1];
874 mom->
data[4].y = temp2[3].y;
875 mom->
data[5] = temp2[2];
884 mom->
data[8].y = temp2[4].x;
891 template<
class T,
class U>
896 temp2.x = (mom.
data[1].x - mom.
data[3].x)*0.5*coeff;
897 temp2.y = (mom.
data[1].y + mom.
data[3].y)*0.5*coeff;
898 array[idx + dir*stride*5] = temp2;
900 temp2.x = (mom.
data[2].x - mom.
data[6].x)*0.5*coeff;
901 temp2.y = (mom.
data[2].y + mom.
data[6].y)*0.5*coeff;
902 array[idx + dir*stride*5 + stride] = temp2;
904 temp2.x = (mom.
data[5].x - mom.
data[7].x)*0.5*coeff;
905 temp2.y = (mom.
data[5].y + mom.
data[7].y)*0.5*coeff;
906 array[idx + dir*stride*5 + stride*2] = temp2;
909 temp2.x = (mom.
data[0].y-temp)*coeff;
910 temp2.y = (mom.
data[4].y-temp)*coeff;
911 array[idx + dir*stride*5 + stride*3] = temp2;
913 temp2.x = (mom.
data[8].y - temp)*coeff;
915 array[idx + dir*stride*5 + stride*4] = temp2;
922 template<
class Cmplx>
923 __device__ __host__
inline
932 temp = u(1,1)*u(2,2) - u(1,2)*u(2,1);
933 (*uinv)(0,0) = (det_inv*temp);
935 temp = u(0,2)*u(2,1) - u(0,1)*u(2,2);
936 (*uinv)(0,1) = (temp*det_inv);
938 temp = u(0,1)*u(1,2) - u(0,2)*u(1,1);
939 (*uinv)(0,2) = (temp*det_inv);
941 temp = u(1,2)*u(2,0) - u(1,0)*u(2,2);
942 (*uinv)(1,0) = (det_inv*temp);
944 temp = u(0,0)*u(2,2) - u(0,2)*u(2,0);
945 (*uinv)(1,1) = (temp*det_inv);
947 temp = u(0,2)*u(1,0) - u(0,0)*u(1,2);
948 (*uinv)(1,2) = (temp*det_inv);
950 temp = u(1,0)*u(2,1) - u(1,1)*u(2,0);
951 (*uinv)(2,0) = (det_inv*temp);
953 temp = u(0,1)*u(2,0) - u(0,0)*u(2,1);
954 (*uinv)(2,1) = (temp*det_inv);
956 temp = u(0,0)*u(1,1) - u(0,1)*u(1,0);
957 (*uinv)(2,2) = (temp*det_inv);
963 for(
int i=0; i<3; ++i){
964 for(
int j=0; j<3; ++j){
965 (*link)(i,j).
x = array[(i*3+j)*2];
966 (*link)(i,j).
y = array[(i*3+j)*2 + 1];
972 template<
class Cmplx,
class Real>
974 for(
int i=0; i<3; ++i){
975 for(
int j=0; j<3; ++j){
976 (*link)(i,j).
x = array[(i*3+j)*2];
977 (*link)(i,j).
y = array[(i*3+j)*2 + 1];
986 for(
int i=0; i<3; ++i){
987 for(
int j=0; j<3; ++j){
988 array[(i*3+j)*2] = link(i,j).x;
989 array[(i*3+j)*2 + 1] = link(i,j).y;
996 template<
class Cmplx,
class Real>
998 for(
int i=0; i<3; ++i){
999 for(
int j=0; j<3; ++j){
1000 array[(i*3+j)*2] = link(i,j).x;
1001 array[(i*3+j)*2 + 1] = link(i,j).y;
1010 template<
class Cmplx>
1011 __host__ __device__
inline
1013 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
1014 printf(
"(%lf, %lf)\t", link(0,0).
x, link(0,0).
y);
1015 printf(
"(%lf, %lf)\t", link(0,1).
x, link(0,1).
y);
1016 printf(
"(%lf, %lf)\n", link(0,2).
x, link(0,2).
y);
1017 printf(
"(%lf, %lf)\t", link(1,0).
x, link(1,0).
y);
1018 printf(
"(%lf, %lf)\t", link(1,1).
x, link(1,1).
y);
1019 printf(
"(%lf, %lf)\n", link(1,2).
x, link(1,2).
y);
1020 printf(
"(%lf, %lf)\t", link(2,0).
x, link(2,0).
y);
1021 printf(
"(%lf, %lf)\t", link(2,1).
x, link(2,1).
y);
1022 printf(
"(%lf, %lf)\n", link(2,2).
x, link(2,2).
y);
1028 #endif // _QUDA_MATRIX_H_
__host__ __device__ float4 operator-=(float4 &x, const float4 y)
__device__ __host__ complex< typename RealTypeId< T >::Type > const & operator()(int i) const
__device__ __host__ void setZero(Matrix< T, N > *m)
__device__ __host__ T const & operator()(int i, int j) const
__device__ static __host__ T val()
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
__device__ __host__ T getDeterminant(const Matrix< T, 3 > &a)
__device__ void writeMomentumToArray(const Matrix< T, 3 > &mom, const int dir, const int idx, const U coeff, const int stride, T *const array)
__host__ __device__ float2 operator*=(float2 &x, const float a)
std::ostream & operator<<(std::ostream &output, const CloverFieldParam ¶m)
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
void copyLinkToArray(float *array, const Matrix< float2, 3 > &link)
__device__ __host__ void outerProd(const Array< T, N > &a, const Array< T, N > &b, Matrix< T, N > *m)
__device__ __host__ Cmplx Conj(const Cmplx &a)
__device__ __host__ T const & operator[](int i) const
__device__ __host__ int index(int i, int j)
FloatingPoint< float > Float
__device__ __host__ Cmplx getPreciseInverse(const Cmplx &z)
__host__ __device__ complex< ValueType > operator-(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
__device__ static __host__ T val()
__constant__ double coeff
__device__ __host__ void copyColumn(const Matrix< T, N > &m, int c, Array< T, N > *a)
void copyArrayToLink(Matrix< float2, 3 > *link, float *array)
__device__ __host__ void setIdentity(Matrix< T, N > *m)
__device__ void loadMatrixFromArray(const T *const array, const int idx, const int stride, Matrix< T, N > *mat)
__host__ __device__ complex< ValueType > operator/(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
__host__ __device__ complex< ValueType > operator*(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
__device__ void writeMatrixToArray(const Matrix< T, N > &mat, const int idx, const int stride, T *const array)
__device__ void loadMomentumFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *mom)
__device__ __host__ T & operator()(int i, int j)
__device__ void loadLinkVariableFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *link)
__device__ __host__ void computeLinkInverse(Matrix< Cmplx, 3 > *uinv, const Matrix< Cmplx, 3 > &u)
__device__ __host__ void computeMatrixInverse(const Matrix< T, 3 > &u, Matrix< T, 3 > *uinv)
__host__ __device__ float4 operator+=(float4 &x, const float4 y)
__device__ void appendMatrixToArray(const Matrix< double2, 3 > &mat, const int idx, const int stride, double2 *const array)
__device__ void writeLinkVariableToArray(const Matrix< T, 3 > &link, const int dir, const int idx, const int stride, T *const array)
__host__ __device__ ValueType conj(ValueType x)
__host__ __device__ complex< ValueType > operator+(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
__device__ __host__ complex< typename RealTypeId< T >::Type > & operator()(int i)
__device__ __host__ T & operator[](int i)
__device__ __host__ Cmplx makeComplex(const typename RealTypeId< Cmplx >::Type &a, const typename RealTypeId< Cmplx >::Type &b)