19 __device__ __host__
inline 24 __device__ __host__
inline 27 return make_float2(0.,0.);
31 __device__ __host__
inline 34 return make_double2(0.,0.);
42 __device__ __host__
inline 47 __device__ __host__
inline 49 return make_float2(1.,0.);
53 __device__ __host__
inline 55 return make_double2(1.,0.);
58 template<
typename Float,
typename T>
struct gauge_wrapper;
59 template<
typename Float,
typename T>
struct gauge_ghost_wrapper;
60 template<
typename Float,
typename T>
struct clover_wrapper;
61 template<
typename T,
int N>
struct HMatrix;
63 template<
class T,
int N>
69 __device__ __host__
inline int index(
int i,
int j)
const {
return i*N + j; }
74 __device__ __host__ constexpr
int size()
const {
return N; }
76 __device__ __host__
inline Matrix() {
78 for (
int i=0; i<N*N; i++)
zero(data[i]);
83 for (
int i=0; i<N*N; i++) data[i] = a.
data[i];
89 for (
int i = 0; i < N * N; i++) data[i] = a.
data[i];
92 __device__ __host__
inline Matrix(
const T data_[])
95 for (
int i=0; i<N*N; i++) data[i] = data_[i];
100 __device__ __host__
inline T
const &
operator()(
int i,
int j)
const {
101 return data[
index(i,j)];
105 return data[
index(i,j)];
108 __device__ __host__
inline T
const &
operator()(
int i)
const {
111 return data[
index(j,k)];
117 return data[
index(j,k)];
123 for (
int i=0; i<N*N; i++) data[i] = b.
data[i];
143 __device__ __host__
inline real
L1() {
146 for (
int j=0; j<N; j++) {
149 for (
int i=0; i<N; i++) {
150 col_sum +=
abs(data[i*N + j]);
152 l1 = col_sum > l1 ? col_sum : l1;
162 __device__ __host__
inline real
L2() {
165 for (
int j=0; j<N; j++) {
167 for (
int i=0; i<N; i++) {
168 l2 +=
norm(data[i*N + j]);
179 __device__ __host__
inline real
Linf() {
182 for (
int i=0; i<N; i++) {
185 for (
int j=0; j<N; j++) {
186 row_sum +=
abs(data[i*N + j]);
188 linf = row_sum > linf ? row_sum : linf;
198 __device__ __host__
inline uint64_t
checksum()
const {
200 constexpr
int length = (N*N*
sizeof(T) +
sizeof(uint64_t) - 1)/
sizeof(uint64_t);
201 uint64_t base_[
length] = { };
202 T *data_ =
reinterpret_cast<T*
>(
static_cast<void*
>(base_) );
203 for (
int i=0; i<N*N; i++) data_[i] = data[i];
204 uint64_t checksum_ = base_[0];
205 for (
int i=1; i<
length; i++) checksum_ ^= base_[i];
209 __device__ __host__
inline bool isUnitary(
double max_error)
const 211 const auto identity =
conj(*
this) * *
this;
214 for (
int i=0; i<N; ++i){
215 if( fabs(identity(i,i).real() - 1.0) > max_error ||
216 fabs(identity(i,i).imag()) > max_error)
return false;
219 for (
int j=i+1; j<N; ++j){
220 if( fabs(identity(i,j).real()) > max_error ||
221 fabs(identity(i,j).imag()) > max_error ||
222 fabs(identity(j,i).real()) > max_error ||
223 fabs(identity(j,i).imag()) > max_error ){
230 for (
int i=0; i<N; i++) {
232 for (
int j=0; j<N; j++) {
233 if (std::isnan((*
this)(i,j).real()) ||
234 std::isnan((*
this)(i,j).imag()))
return false;
248 template <
typename T,
typename Hmat>
255 __device__ __host__
inline HMatrix_wrapper(Hmat &mat,
int i,
int j,
int idx) : mat(mat), i(i), j(j), idx(idx) { }
257 __device__ __host__
inline void operator=(
const complex<T> &a) {
259 mat.data[idx] = a.real();
261 mat.data[idx+0] = a.real();
262 mat.data[idx+1] = a.imag();
264 mat.data[idx+0] = a.real();
265 mat.data[idx+1] = -a.imag();
269 __device__ __host__
inline void operator+=(
const complex<T> &a) {
271 mat.data[idx] += a.real();
273 mat.data[idx+0] += a.real();
274 mat.data[idx+1] += a.imag();
276 mat.data[idx+0] += a.real();
277 mat.data[idx+1] += -a.imag();
285 template<
class T,
int N>
291 __device__ __host__
inline int index(
int i,
int j)
const {
295 int k = N*(N-1)/2 - (N-j)*(N-j-1)/2 + i - j - 1;
299 int k = N*(N-1)/2 - (N-i)*(N-i-1)/2 + j - i - 1;
309 for (
int i=0; i<N*N; i++)
zero(data[i]);
314 for (
int i=0; i<N*N; i++) data[i] = a.
data[i];
317 __device__ __host__
inline HMatrix(
const T data_[]) {
319 for (
int i=0; i<N*N; i++) data[i] = data_[i];
322 __device__ __host__
inline complex<T>
const operator()(
int i,
int j)
const {
323 const int idx =
index(i,j);
325 return complex<T>(data[idx],0.0);
327 return complex<T>(data[idx], data[idx+1]);
329 return complex<T>(data[idx],-data[idx+1]);
340 for (
int i=0; i<N*N; i++) data[i] = b.
data[i];
357 for (
int i=0; i<N; i++) {
359 for (
int k=0; k<N; k++)
if (i<=k) {
360 tmp.x = (*this)(i,0).real() * (*this)(0,k).real();
361 tmp.x -= (*this)(i,0).imag() * (*this)(0,k).imag();
362 tmp.y = (*this)(i,0).real() * (*this)(0,k).imag();
363 tmp.y += (*this)(i,0).imag() * (*this)(0,k).real();
365 for (
int j=1; j<N; j++) {
366 tmp.x += (*this)(i,j).real() * (*this)(j,k).real();
367 tmp.x -= (*this)(i,j).imag() * (*this)(j,k).imag();
368 tmp.y += (*this)(i,j).real() * (*this)(j,k).imag();
369 tmp.y += (*this)(i,j).imag() * (*this)(j,k).real();
381 __device__ __host__
inline T
max()
const 384 T max =
static_cast<T
>(0.0);
386 for (
int i = 0; i < N * N; i++) max = (abs(data[i]) > max ?
abs(data[i]) : max);
390 __device__ __host__
void print()
const {
391 for (
int i=0; i<N; i++) {
393 for (
int j=0; j<N; j++) {
394 printf(
" (%e, %e)", (*
this)(i,j).real(), (*
this)(i,j).imag());
403 template<
class T,
int N>
406 for (
int i=0; i<N; i++) {
408 for (
int j=0; j<N; j++) {
409 (*this)(i,j) = a(i,j);
417 return a(0,0) + a(1,1) + a(2,2);
421 template<
template<
typename,
int>
class Mat,
class T>
425 result = a(0,0)*(a(1,1)*a(2,2) - a(2,1)*a(1,2))
426 - a(0,1)*(a(1,0)*a(2,2) - a(1,2)*a(2,0))
427 + a(0,2)*(a(1,0)*a(2,1) - a(1,1)*a(2,0));
432 template<
template<
typename,
int>
class Mat,
class T,
int N>
433 __device__ __host__
inline Mat<T,N>
operator+(
const Mat<T,N> & a,
const Mat<T,N> & b)
437 for (
int i=0; i<N*N; i++) result.data[i] = a.data[i] + b.data[i];
442 template<
template<
typename,
int>
class Mat,
class T,
int N>
443 __device__ __host__
inline Mat<T,N>
operator+=(Mat<T,N> & a,
const Mat<T,N> & b)
446 for (
int i=0; i<N*N; i++) a.data[i] += b.data[i];
450 template<
template<
typename,
int>
class Mat,
class T,
int N>
451 __device__ __host__
inline Mat<T,N>
operator+=(Mat<T,N> & a,
const T & b)
454 for (
int i=0; i<N; i++) a(i,i) += b;
458 template<
template<
typename,
int>
class Mat,
class T,
int N>
459 __device__ __host__
inline Mat<T,N>
operator-=(Mat<T,N> & a,
const Mat<T,N> & b)
462 for (
int i=0; i<N*N; i++) a.data[i] -= b.data[i];
466 template<
template<
typename,
int>
class Mat,
class T,
int N>
467 __device__ __host__
inline Mat<T,N>
operator-(
const Mat<T,N> & a,
const Mat<T,N> & b)
471 for (
int i=0; i<N*N; i++) result.data[i] = a.data[i] - b.data[i];
475 template<
template<
typename,
int>
class Mat,
class T,
int N,
class S>
479 for (
int i=0; i<N*N; ++i) result.data[i] = scalar*a.data[i];
483 template<
template<
typename,
int>
class Mat,
class T,
int N,
class S>
488 template<
template<
typename,
int>
class Mat,
class T,
int N,
class S>
494 template<
template<
typename,
int>
class Mat,
class T,
int N>
495 __device__ __host__
inline Mat<T,N>
operator-(
const Mat<T,N> & a){
498 for (
int i=0; i<(N*N); ++i) result.data[i] = -a.data[i];
506 template<
template<
typename,
int>
class Mat,
class T,
int N>
507 __device__ __host__
inline Mat<T,N>
operator*(
const Mat<T,N> &a,
const Mat<T,N> &b)
511 for (
int i=0; i<N; i++) {
513 for (
int k=0; k<N; k++) {
514 result(i,k) = a(i,0) * b(0,k);
516 for (
int j=1; j<N; j++) {
517 result(i,k) += a(i,j) * b(j,k);
527 template<
template<
typename>
class complex,
typename T,
int N>
530 Matrix<complex<T>,N> result;
532 for (
int i=0; i<N; i++) {
534 for (
int k=0; k<N; k++) {
535 result(i,k).x = a(i,0).
real() * b(0,k).real();
536 result(i,k).x -= a(i,0).imag() * b(0,k).imag();
537 result(i,k).y = a(i,0).
real() * b(0,k).imag();
538 result(i,k).y += a(i,0).imag() * b(0,k).
real();
540 for (
int j=1; j<N; j++) {
541 result(i,k).x += a(i,j).
real() * b(j,k).real();
542 result(i,k).x -= a(i,j).imag() * b(j,k).imag();
543 result(i,k).y += a(i,j).
real() * b(j,k).imag();
544 result(i,k).y += a(i,j).imag() * b(j,k).
real();
551 template<
class T,
int N>
561 template<
class T,
class U,
int N>
562 __device__ __host__
inline 567 for (
int i=0; i<N; i++) {
569 for (
int k=0; k<N; k++) {
570 result(i,k) = a(i,0) * b(0,k);
572 for (
int j=1; j<N; j++) {
573 result(i,k) += a(i,j) * b(j,k);
582 __device__ __host__
inline 586 result(0,0) = a(0,0)*b(0,0) + a(0,1)*b(1,0);
587 result(0,1) = a(0,0)*b(0,1) + a(0,1)*b(1,1);
588 result(1,0) = a(1,0)*b(0,0) + a(1,1)*b(1,0);
589 result(1,1) = a(1,0)*b(0,1) + a(1,1)*b(1,1);
594 template<
class T,
int N>
595 __device__ __host__
inline 599 for (
int i=0; i<N; ++i){
601 for (
int j=0; j<N; ++j){
602 result(i,j) =
conj( other(j,i) );
610 __device__ __host__
inline 614 const T det_inv =
static_cast<typename T::value_type
>(1.0)/det;
619 temp = u(1,1)*u(2,2) - u(1,2)*u(2,1);
620 uinv(0,0) = (det_inv*temp);
622 temp = u(0,2)*u(2,1) - u(0,1)*u(2,2);
623 uinv(0,1) = (temp*det_inv);
625 temp = u(0,1)*u(1,2) - u(0,2)*u(1,1);
626 uinv(0,2) = (temp*det_inv);
628 temp = u(1,2)*u(2,0) - u(1,0)*u(2,2);
629 uinv(1,0) = (det_inv*temp);
631 temp = u(0,0)*u(2,2) - u(0,2)*u(2,0);
632 uinv(1,1) = (temp*det_inv);
634 temp = u(0,2)*u(1,0) - u(0,0)*u(1,2);
635 uinv(1,2) = (temp*det_inv);
637 temp = u(1,0)*u(2,1) - u(1,1)*u(2,0);
638 uinv(2,0) = (det_inv*temp);
640 temp = u(0,1)*u(2,0) - u(0,0)*u(2,1);
641 uinv(2,1) = (temp*det_inv);
643 temp = u(0,0)*u(1,1) - u(0,1)*u(1,0);
644 uinv(2,2) = (temp*det_inv);
651 template<
class T,
int N>
652 __device__ __host__
inline 656 for (
int i=0; i<N; ++i){
659 for (
int j=i+1; j<N; ++j){
660 (*m)(i,j) = (*m)(j,i) = 0;
668 __device__ __host__
inline 672 for (
int i=0; i<N; ++i){
673 (*m)(i,i) = make_float2(1,0);
675 for (
int j=i+1; j<N; ++j){
676 (*m)(i,j) = (*m)(j,i) = make_float2(0.,0.);
684 __device__ __host__
inline 688 for (
int i=0; i<N; ++i){
689 (*m)(i,i) = make_double2(1,0);
691 for (
int j=i+1; j<N; ++j){
692 (*m)(i,j) = (*m)(j,i) = make_double2(0.,0.);
700 template<
class T,
int N>
701 __device__ __host__
inline 705 for (
int i=0; i<N; ++i){
707 for (
int j=0; j<N; ++j){
716 __device__ __host__
inline 720 for (
int i=0; i<N; ++i){
722 for (
int j=0; j<N; ++j){
723 (*m)(i,j) = make_float2(0.,0.);
731 __device__ __host__
inline 735 for (
int i=0; i<N; ++i){
737 for (
int j=0; j<N; ++j){
738 (*m)(i,j) = make_double2(0.,0.);
745 template<
typename Complex,
int N>
747 typedef typename Complex::value_type real;
752 real imag_trace = 0.0;
754 for (
int i=0; i<N; i++) imag_trace += am(i,i).y;
756 for (
int i=0; i<N; i++) {
757 am(i,i).y -= imag_trace/N;
770 template<
class T,
int N>
778 __device__ __host__
inline 784 __device__ __host__
inline 791 template<
class T,
int N>
792 __device__ __host__
inline 796 for (
int i=0; i<N; ++i){
803 template<
class T,
int N>
804 __device__ __host__
inline 807 for (
int i=0; i<N; ++i){
808 const T conjb_i =
conj(b[i]);
809 for (
int j=0; j<N; ++j){
810 (*m)(j,i) = a[j]*conjb_i;
816 template<
class T,
int N>
817 __device__ __host__
inline 820 for (
int i=0; i<N; ++i){
821 const T conjb_i =
conj(b[i]);
823 for (
int j=0; j<N; ++j){
824 (*m)(j,i) = a[j]*conjb_i;
832 template<
class T,
int N>
833 std::ostream & operator << (std::ostream & os, const Matrix<T,N> & m){
835 for (
int i=0; i<N; ++i){
837 for (
int j=0; j<N; ++j){
840 if(i<N-1) os << std::endl;
846 template<
class T,
int N>
847 std::ostream & operator << (std::ostream & os, const Array<T,N> & a){
848 for (
int i=0; i<N; ++i){
855 template<
class T,
class U>
860 for (
int i=0; i<9; ++i){
861 link->
data[i] = array[idx + (dir*9 + i)*stride];
867 template<
class T,
class U,
int N>
872 for (
int i=0; i<(N*N); ++i){
873 mat->
data[i] = array[idx + i*stride];
883 for (
int i=0; i<9; ++i){
884 single_temp = array[idx + (dir*9 + i)*stride];
885 link->data[i].x = single_temp.x;
886 link->data[i].y = single_temp.y;
893 template<
class T,
int N,
class U>
898 for (
int i=0; i<(N*N); ++i){
899 array[idx + i*stride] = mat.
data[i];
907 for (
int i=0; i<9; ++i){
908 array[idx + i*stride].x +=
mat.data[i].x;
909 array[idx + i*stride].y +=
mat.data[i].y;
917 for (
int i=0; i<9; ++i){
918 array[idx + i*stride].x +=
mat.data[i].x;
919 array[idx + i*stride].y +=
mat.data[i].y;
924 template<
class T,
class U>
929 for (
int i=0; i<9; ++i){
930 array[idx + (dir*9 + i)*stride] = link.
data[i];
944 for (
int i=0; i<9; ++i){
945 single_temp.x = link.data[i].x;
946 single_temp.y = link.data[i].y;
947 array[idx + (dir*9 + i)*stride] = single_temp;
958 temp2[0] = array[idx + dir*stride*5];
959 temp2[1] = array[idx + dir*stride*5 + stride];
960 temp2[2] = array[idx + dir*stride*5 + 2*stride];
961 temp2[3] = array[idx + dir*stride*5 + 3*stride];
962 temp2[4] = array[idx + dir*stride*5 + 4*stride];
965 mom->
data[0].y = temp2[3].x;
966 mom->
data[1] = temp2[0];
967 mom->
data[2] = temp2[1];
972 mom->
data[4].y = temp2[3].y;
973 mom->
data[5] = temp2[2];
982 mom->
data[8].y = temp2[4].x;
989 template<
class T,
class U>
993 typedef typename T::value_type real;
995 temp2.x = (mom.
data[1].x - mom.
data[3].x)*0.5*coeff;
996 temp2.y = (mom.
data[1].y + mom.
data[3].y)*0.5*coeff;
997 array[idx + dir*stride*5] = temp2;
999 temp2.x = (mom.
data[2].x - mom.
data[6].x)*0.5*coeff;
1000 temp2.y = (mom.
data[2].y + mom.
data[6].y)*0.5*coeff;
1001 array[idx + dir*stride*5 + stride] = temp2;
1003 temp2.x = (mom.
data[5].x - mom.
data[7].x)*0.5*coeff;
1004 temp2.y = (mom.
data[5].y + mom.
data[7].y)*0.5*coeff;
1005 array[idx + dir*stride*5 + stride*2] = temp2;
1007 const real temp = (mom.
data[0].y + mom.
data[4].y + mom.
data[8].y)*0.3333333333333333333333333;
1008 temp2.x = (mom.
data[0].y-temp)*coeff;
1009 temp2.y = (mom.
data[4].y-temp)*coeff;
1010 array[idx + dir*stride*5 + stride*3] = temp2;
1012 temp2.x = (mom.
data[8].y - temp)*coeff;
1014 array[idx + dir*stride*5 + stride*4] = temp2;
1021 template<
class Cmplx>
1022 __device__ __host__
inline 1027 const Cmplx & det_inv =
static_cast<typename Cmplx::value_type
>(1.0)/det;
1031 temp = u(1,1)*u(2,2) - u(1,2)*u(2,1);
1032 (*uinv)(0,0) = (det_inv*temp);
1034 temp = u(0,2)*u(2,1) - u(0,1)*u(2,2);
1035 (*uinv)(0,1) = (temp*det_inv);
1037 temp = u(0,1)*u(1,2) - u(0,2)*u(1,1);
1038 (*uinv)(0,2) = (temp*det_inv);
1040 temp = u(1,2)*u(2,0) - u(1,0)*u(2,2);
1041 (*uinv)(1,0) = (det_inv*temp);
1043 temp = u(0,0)*u(2,2) - u(0,2)*u(2,0);
1044 (*uinv)(1,1) = (temp*det_inv);
1046 temp = u(0,2)*u(1,0) - u(0,0)*u(1,2);
1047 (*uinv)(1,2) = (temp*det_inv);
1049 temp = u(1,0)*u(2,1) - u(1,1)*u(2,0);
1050 (*uinv)(2,0) = (det_inv*temp);
1052 temp = u(0,1)*u(2,0) - u(0,0)*u(2,1);
1053 (*uinv)(2,1) = (temp*det_inv);
1055 temp = u(0,0)*u(1,1) - u(0,1)*u(1,0);
1056 (*uinv)(2,2) = (temp*det_inv);
1063 for (
int i=0; i<3; ++i){
1065 for (
int j=0; j<3; ++j){
1066 (*link)(i,j).x = array[(i*3+j)*2];
1067 (*link)(i,j).y = array[(i*3+j)*2 + 1];
1073 template<
class Cmplx,
class Real>
1076 for (
int i=0; i<3; ++i){
1078 for (
int j=0; j<3; ++j){
1079 (*link)(i,j).x = array[(i*3+j)*2];
1080 (*link)(i,j).y = array[(i*3+j)*2 + 1];
1090 for (
int i=0; i<3; ++i){
1092 for (
int j=0; j<3; ++j){
1093 array[(i*3+j)*2] = link(i,j).x;
1094 array[(i*3+j)*2 + 1] = link(i,j).y;
1101 template<
class Cmplx,
class Real>
1104 for (
int i=0; i<3; ++i){
1106 for (
int j=0; j<3; ++j){
1107 array[(i*3+j)*2] = link(i,j).x;
1108 array[(i*3+j)*2 + 1] = link(i,j).y;
1116 T tr = (a(0,0) + a(1,1) + a(2,2)) / 3.0;
1118 res(0,0) = a(0,0) - tr; res(0,1) = a(0,1); res(0,2) = a(0,2);
1119 res(1,0) = a(1,0); res(1,1) = a(1,1) - tr; res(1,2) = a(1,2);
1120 res(2,0) = a(2,0); res(2,1) = a(2,1); res(2,2) = a(2,2) - tr;
1126 T tr = (a(0,0) + a(1,1) + a(2,2)) / static_cast<T>(3.0);
1127 a(0,0) -= tr; a(1,1) -= tr; a(2,2) -= tr;
1132 double sum = (double)(a(0,0).x * b(0,0).x + a(0,0).y * b(0,0).y);
1133 sum += (double)(a(0,1).x * b(0,1).x + a(0,1).y * b(0,1).y);
1134 sum += (double)(a(0,2).x * b(0,2).x + a(0,2).y * b(0,2).y);
1135 sum += (double)(a(1,0).x * b(1,0).x + a(1,0).y * b(1,0).y);
1136 sum += (double)(a(1,1).x * b(1,1).x + a(1,1).y * b(1,1).y);
1137 sum += (double)(a(1,2).x * b(1,2).x + a(1,2).y * b(1,2).y);
1138 sum += (double)(a(2,0).x * b(2,0).x + a(2,0).y * b(2,0).y);
1139 sum += (double)(a(2,1).x * b(2,1).x + a(2,1).y * b(2,1).y);
1140 sum += (double)(a(2,2).x * b(2,2).x + a(2,2).y * b(2,2).y);
1147 template<
class Cmplx>
1148 __host__ __device__
inline 1150 printf(
"(%lf, %lf)\t", link(0,0).x, link(0,0).y);
1151 printf(
"(%lf, %lf)\t", link(0,1).x, link(0,1).y);
1152 printf(
"(%lf, %lf)\n", link(0,2).x, link(0,2).y);
1153 printf(
"(%lf, %lf)\t", link(1,0).x, link(1,0).y);
1154 printf(
"(%lf, %lf)\t", link(1,1).x, link(1,1).y);
1155 printf(
"(%lf, %lf)\n", link(1,2).x, link(1,2).y);
1156 printf(
"(%lf, %lf)\t", link(2,0).x, link(2,0).y);
1157 printf(
"(%lf, %lf)\t", link(2,1).x, link(2,1).y);
1158 printf(
"(%lf, %lf)\n", link(2,2).x, link(2,2).y);
1162 template<
class Cmplx>
1178 temp = identity_comp(i,j);
1180 error +=
norm(temp);
1183 error +=
norm(identity_comp(i,j));
1190 __device__ __host__
inline 1199 typedef decltype(Q(0,0).x) undMatType;
1201 undMatType inv3 = 1.0/3.0;
1202 undMatType c0, c1, c0_max, Tr_re;
1203 undMatType f0_re, f0_im, f1_re, f1_im, f2_re, f2_im;
1205 undMatType u_p, w_p;
1223 c0_max = 2*
pow(c1*inv3,1.5);
1226 theta =
acos(c0/c0_max);
1228 u_p =
sqrt(c1*inv3)*
cos(theta*inv3);
1231 w_p =
sqrt(c1)*
sin(theta*inv3);
1234 undMatType u_sq = u_p*u_p;
1235 undMatType w_sq = w_p*w_p;
1236 undMatType denom_inv = 1.0/(9*u_sq - w_sq);
1237 undMatType exp_iu_re =
cos(u_p);
1238 undMatType exp_iu_im =
sin(u_p);
1239 undMatType exp_2iu_re = exp_iu_re*exp_iu_re - exp_iu_im*exp_iu_im;
1240 undMatType exp_2iu_im = 2*exp_iu_re*exp_iu_im;
1241 undMatType cos_w =
cos(w_p);
1243 undMatType hj_re = 0.0;
1244 undMatType hj_im = 0.0;
1247 if (w_p < 0.05 && w_p > -0.05) {
1249 sinc_w = 1.0 - (w_sq/6.0)*(1 - (w_sq*0.05)*(1 - (w_sq/42.0)*(1 - (w_sq/72.0))));
1251 else sinc_w =
sin(w_p)/w_p;
1264 hj_re = (u_sq - w_sq)*exp_2iu_re + 8*u_sq*cos_w*exp_iu_re + 2*u_p*(3*u_sq + w_sq)*sinc_w*exp_iu_im;
1265 hj_im = (u_sq - w_sq)*exp_2iu_im - 8*u_sq*cos_w*exp_iu_im + 2*u_p*(3*u_sq + w_sq)*sinc_w*exp_iu_re;
1266 f0_re = hj_re*denom_inv;
1267 f0_im = hj_im*denom_inv;
1270 hj_re = 2*u_p*exp_2iu_re - 2*u_p*cos_w*exp_iu_re + (3*u_sq - w_sq)*sinc_w*exp_iu_im;
1271 hj_im = 2*u_p*exp_2iu_im + 2*u_p*cos_w*exp_iu_im + (3*u_sq - w_sq)*sinc_w*exp_iu_re;
1272 f1_re = hj_re*denom_inv;
1273 f1_im = hj_im*denom_inv;
1276 hj_re = exp_2iu_re - cos_w*exp_iu_re - 3*u_p*sinc_w*exp_iu_im;
1277 hj_im = exp_2iu_im + cos_w*exp_iu_im - 3*u_p*sinc_w*exp_iu_re;
1278 f2_re = hj_re*denom_inv;
1279 f2_im = hj_im*denom_inv;
1306 temp1 = f0_c * UnitM;
1315 temp2 = f2_c * temp1;
1325 template <
typename Float> __device__ __host__
void expsu3(
Matrix<complex<Float>, 3> &q)
1327 typedef complex<Float>
Complex;
1329 Complex a2 = (q(3) * q(1) + q(7) * q(5) + q(6) * q(2) - (q(0) * q(4) + (q(0) + q(4)) * q(8))) / (Float)3.0;
1330 Complex a3 = q(0) * q(4) * q(8) + q(1) * q(5) * q(6) + q(2) * q(3) * q(7) - q(6) * q(4) * q(2)
1331 - q(3) * q(1) * q(8) - q(0) * q(7) * q(5);
1333 Complex sg2h3 =
sqrt(a3 * a3 - (Float)4. * a2 * a2 * a2);
1334 Complex cp =
exp(
log((Float)0.5 * (a3 + sg2h3)) / (Float)3.0);
1335 Complex cm = a2 / cp;
1337 Complex r1 =
exp(
Complex(0.0, 1.0) * (Float)(2.0 * M_PI / 3.0));
1338 Complex r2 =
exp(-
Complex(0.0, 1.0) * (Float)(2.0 * M_PI / 3.0));
1343 w1[1] = r1 * cp + r2 * cm;
1344 w1[2] = r2 * cp + r1 * cm;
1345 Complex z1 = q(1) * q(6) - q(0) * q(7);
1346 Complex z2 = q(3) * q(7) - q(4) * q(6);
1349 Complex wr21 = (z1 + al * q(7)) / (z2 + al * q(6));
1350 Complex wr31 = (al - q(0) - wr21 * q(3)) / q(6);
1353 Complex wr22 = (z1 + al * q(7)) / (z2 + al * q(6));
1354 Complex wr32 = (al - q(0) - wr22 * q(3)) / q(6);
1357 Complex wr23 = (z1 + al * q(7)) / (z2 + al * q(6));
1358 Complex wr33 = (al - q(0) - wr23 * q(3)) / q(6);
1360 z1 = q(3) * q(2) - q(0) * q(5);
1361 z2 = q(1) * q(5) - q(4) * q(2);
1364 Complex wl21 =
conj((z1 + al * q(5)) / (z2 + al * q(2)));
1365 Complex wl31 =
conj((al - q(0) -
conj(wl21) * q(1)) / q(2));
1368 Complex wl22 =
conj((z1 + al * q(5)) / (z2 + al * q(2)));
1369 Complex wl32 =
conj((al - q(0) -
conj(wl22) * q(1)) / q(2));
1372 Complex wl23 =
conj((z1 + al * q(5)) / (z2 + al * q(2)));
1373 Complex wl33 =
conj((al - q(0) -
conj(wl23) * q(1)) / q(2));
1375 Complex xn1 = (Float)1. + wr21 *
conj(wl21) + wr31 *
conj(wl31);
1376 Complex xn2 = (Float)1. + wr22 *
conj(wl22) + wr32 *
conj(wl32);
1377 Complex xn3 = (Float)1. + wr23 *
conj(wl23) + wr33 *
conj(wl33);
1379 Complex d1 =
exp(w1[0]);
1380 Complex d2 =
exp(w1[1]);
1381 Complex d3 =
exp(w1[2]);
1382 Complex y11 = d1 / xn1;
1383 Complex y12 = d2 / xn2;
1384 Complex y13 = d3 / xn3;
1385 Complex y21 = wr21 * d1 / xn1;
1386 Complex y22 = wr22 * d2 / xn2;
1387 Complex y23 = wr23 * d3 / xn3;
1388 Complex y31 = wr31 * d1 / xn1;
1389 Complex y32 = wr32 * d2 / xn2;
1390 Complex y33 = wr33 * d3 / xn3;
1391 q(0) = y11 + y12 + y13;
1392 q(1) = y21 + y22 + y23;
1393 q(2) = y31 + y32 + y33;
1394 q(3) = y11 *
conj(wl21) + y12 *
conj(wl22) + y13 *
conj(wl23);
1395 q(4) = y21 *
conj(wl21) + y22 *
conj(wl22) + y23 *
conj(wl23);
1396 q(5) = y31 *
conj(wl21) + y32 *
conj(wl22) + y33 *
conj(wl23);
1397 q(6) = y11 *
conj(wl31) + y12 *
conj(wl32) + y13 *
conj(wl33);
1398 q(7) = y21 *
conj(wl31) + y22 *
conj(wl32) + y23 *
conj(wl33);
1399 q(8) = y31 *
conj(wl31) + y32 *
conj(wl32) + y33 *
conj(wl33);
__host__ __device__ float4 operator-=(float4 &x, const float4 y)
gauge_wrapper is an internal class that is used to wrap instances of gauge accessors, currying in a specific location on the field. The operator() accessors in gauge-field accessors return instances to this class, allowing us to then use operator overloading upon this class to interact with the Matrix class. As a result we can include gauge-field accessors directly in Matrix expressions in kernels without having to declare temporaries with explicit calls to the load/save methods in the gauge-field accessors.
__device__ __host__ bool isUnitary(double max_error) const
__device__ __host__ double ErrorSU3(const Matrix< Cmplx, 3 > &matrix)
__device__ __host__ void setZero(Matrix< T, N > *m)
__device__ __host__ constexpr int size() const
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
__device__ static __host__ T val()
__device__ __host__ T max() const
Compute the absolute max element of the Hermitian matrix.
__host__ __device__ ValueType exp(ValueType x)
__host__ __device__ ValueType sqrt(ValueType x)
cudaColorSpinorField * tmp
__device__ __host__ double getRealTraceUVdagger(const Matrix< T, 3 > &a, const Matrix< T, 3 > &b)
__device__ __host__ Matrix(const T data_[])
double l2(float *a, float *b, int N)
__device__ __host__ void operator=(const complex< T > &a)
void Mat(sFloat *out, gFloat **link, sFloat *in, int daggerBit, int mu)
__device__ void writeMomentumToArray(const Matrix< T, 3 > &mom, const int dir, const int idx, const U coeff, const int stride, T *const array)
__host__ __device__ float2 operator*=(float2 &x, const float a)
__host__ __device__ void sum(double &a, double &b)
__device__ __host__ uint64_t checksum() const
__device__ __host__ void operator=(const Matrix< U, N > &b)
__device__ void loadLinkVariableFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< U, 3 > *link)
This is just a dummy structure we use for trove to define the required structure size.
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
void copyLinkToArray(float *array, const Matrix< float2, 3 > &link)
__device__ __host__ T const & operator()(int i, int j) const
__device__ __host__ void outerProd(const Array< T, N > &a, const Array< T, N > &b, Matrix< T, N > *m)
__device__ __host__ T const & operator[](int i) const
wrapper class that enables us to write to Hmatrices in packed format
__device__ __host__ complex< T > const operator()(int i, int j) const
__host__ __device__ ValueType sin(ValueType x)
__device__ __host__ HMatrix()
gauge_ghost_wrapper is an internal class that is used to wrap instances of gauge ghost accessors...
__device__ static __host__ T val()
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator-(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor subtraction operator.
clover_wrapper is an internal class that is used to wrap instances of colorspinor accessors...
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Specialized container for Hermitian matrices (e.g., used for wrapping clover matrices) ...
__device__ __host__ real L2()
Compute the matrix L2 norm. We actually compute the Frobenius norm which is an upper bound on the L2 ...
__device__ __host__ void copyColumn(const Matrix< T, N > &m, int c, Array< T, N > *a)
Provides precision abstractions and defines the register precision given the storage precision using ...
std::complex< double > Complex
__device__ __host__ T & operator()(int i)
__device__ void writeLinkVariableToArray(const Matrix< T, 3 > &link, const int dir, const int idx, const int stride, U *const array)
__device__ __host__ void SubTraceUnit(Matrix< T, 3 > &a)
__device__ void loadMatrixFromArray(const T *const array, const int idx, const int stride, Matrix< U, N > *mat)
__device__ __host__ real Linf()
Compute the matrix Linfinity norm - this is the maximum of the absolute row sums. ...
void copyArrayToLink(Matrix< float2, 3 > *link, float *array)
__device__ __host__ void setIdentity(Matrix< T, N > *m)
__device__ __host__ int index(int i, int j) const
__host__ __device__ ValueType log(ValueType x)
static int index(int ndim, const int *dims, const int *x)
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
__device__ __host__ Matrix(const Matrix< U, N > &a)
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator+(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor addition operator.
__device__ __host__ int index(int i, int j) const
__device__ void appendMatrixToArray(const Matrix< complex< double >, 3 > &mat, const int idx, const int stride, double2 *const array)
__device__ void loadMomentumFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *mom)
__device__ __host__ Matrix< T, 3 > getSubTraceUnit(const Matrix< T, 3 > &a)
__device__ __host__ T & operator()(int i, int j)
__device__ __host__ void operator=(const HMatrix< U, N > &b)
__device__ __host__ void computeLinkInverse(Matrix< Cmplx, 3 > *uinv, const Matrix< Cmplx, 3 > &u)
__device__ __host__ void print() const
__device__ __host__ real L1()
Compute the matrix L1 norm - this is the maximum of the absolute column sums.
__device__ __host__ HMatrix(const T data_[])
__device__ __host__ HMatrix< T, N > square() const
Hermitian matrix square.
__device__ __host__ Matrix()
__device__ __host__ void makeAntiHerm(Matrix< Complex, N > &m)
__device__ __host__ Matrix(const Matrix< T, N > &a)
__host__ __device__ ValueType cos(ValueType x)
__host__ __device__ ValueType abs(ValueType x)
__device__ __host__ HMatrix_wrapper(Hmat &mat, int i, int j, int idx)
__device__ __host__ void operator+=(const complex< T > &a)
__host__ __device__ float4 operator+=(float4 &x, const float4 y)
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator*(const S &a, const ColorSpinor< Float, Nc, Ns > &x)
Compute the scalar-vector product y = a * x.
__host__ __device__ ValueType acos(ValueType x)
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
__host__ __device__ ValueType conj(ValueType x)
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
__device__ __host__ void expsu3(Matrix< complex< Float >, 3 > &q)
__device__ __host__ void zero(vector_type< scalar, n > &v)
__device__ void writeMatrixToArray(const Matrix< T, N > &mat, const int idx, const int stride, U *const array)
__device__ __host__ HMatrix(const HMatrix< T, N > &a)
__device__ __host__ void exponentiate_iQ(const Matrix< T, 3 > &Q, Matrix< T, 3 > *exp_iQ)
__device__ __host__ T & operator[](int i)
__device__ __host__ T const & operator()(int i) const