1 #ifndef _QUDA_MATRIX_H_
2 #define _QUDA_MATRIX_H_
44 template<
class T,
class U>
105 template<
class Cmplx>
106 __device__ __host__
inline
115 __device__ __host__
inline
117 return make_double2(a,b);
120 __device__ __host__
inline
122 return make_float2(a,b);
126 template<
class Cmplx>
127 __device__ __host__
inline Cmplx &
operator+=(Cmplx & a,
const Cmplx & b){
134 template<
class Cmplx>
135 __device__ __host__
inline Cmplx
operator+(
const Cmplx & a,
const Cmplx & b){
139 template<
class Cmplx>
140 __device__ __host__
inline Cmplx
operator-(
const Cmplx & a,
const Cmplx & b)
145 template<
class Cmplx>
151 template<
class Cmplx>
157 template<
class Cmplx>
163 template<
class Cmplx>
169 template<
class Cmplx>
175 template<
class Cmplx>
182 template<
class Cmplx>
188 template<
class Cmplx>
189 __device__ __host__
inline Cmplx
operator*(
const Cmplx & a,
const Cmplx & b)
191 return makeComplex(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
194 template<
class Cmplx>
195 __device__ __host__
inline Cmplx
conj(
const Cmplx & a)
200 __device__ __host__
inline double conj(
const double & a)
205 __device__ __host__
inline float conj(
const float & a)
212 template<
class Cmplx>
213 __device__ __host__
inline
216 if( fabs(z.x) > fabs(z.y) ){ max = z.x; ratio = z.y/max; }
else{ max=z.y; ratio = z.x/max; }
217 denom = max*max*(1 + ratio*ratio);
223 inline std::ostream &
operator << (std::ostream & os,
const float2 & z){
224 os <<
"(" << z.x <<
"," << z.y <<
")";
228 inline std::ostream &
operator << (std::ostream & os,
const double2 & z){
229 os <<
"(" << z.x <<
"," << z.y <<
")";
239 __device__ __host__
inline
244 __device__ __host__
inline
247 return make_float2(0.,0.);
251 __device__ __host__
inline
254 return make_double2(0.,0.);
262 __device__ __host__
inline
267 __device__ __host__
inline
269 return make_float2(1.,0.);
273 __device__ __host__
inline
275 return make_double2(1.,0.);
279 __device__ __host__
inline
285 template<
class T,
int N>
291 __device__ __host__
inline T
const &
operator()(
int i,
int j)
const{
292 return data[index<N>(i,j)];
296 return data[index<N>(i,j)];
303 return a(0,0) + a(1,1) + a(2,2);
311 result = a(0,0)*(a(1,1)*a(2,2) - a(2,1)*a(1,2))
312 - a(0,1)*(a(1,0)*a(2,2) - a(1,2)*a(2,0))
313 + a(0,2)*(a(1,0)*a(2,1) - a(1,1)*a(2,0));
318 template<
class T,
int N>
322 for(
int i=0; i<N*N; i++){
329 template<
class T,
int N>
332 for(
int i=0; i<N*N; i++){
340 template<
class T,
int N>
344 for(
int i=0; i<N*N; i++){
352 template<
class T,
int N,
class S>
355 for(
int i=0; i<N*N; ++i){
362 template<
class T,
int N,
class S>
370 __device__ __host__
inline
377 result(0,0) = a(0,0)*b(0,0) + a(0,1)*b(1,0) + a(0,2)*b(2,0);
378 result(0,1) = a(0,0)*b(0,1) + a(0,1)*b(1,1) + a(0,2)*b(2,1);
379 result(0,2) = a(0,0)*b(0,2) + a(0,1)*b(1,2) + a(0,2)*b(2,2);
380 result(1,0) = a(1,0)*b(0,0) + a(1,1)*b(1,0) + a(1,2)*b(2,0);
381 result(1,1) = a(1,0)*b(0,1) + a(1,1)*b(1,1) + a(1,2)*b(2,1);
382 result(1,2) = a(1,0)*b(0,2) + a(1,1)*b(1,2) + a(1,2)*b(2,2);
383 result(2,0) = a(2,0)*b(0,0) + a(2,1)*b(1,0) + a(2,2)*b(2,0);
384 result(2,1) = a(2,0)*b(0,1) + a(2,1)*b(1,1) + a(2,2)*b(2,1);
385 result(2,2) = a(2,0)*b(0,2) + a(2,1)*b(1,2) + a(2,2)*b(2,2);
391 template<
class T,
class U>
392 __device__ __host__
inline
396 result(0,0) = a(0,0)*b(0,0) + a(0,1)*b(1,0) + a(0,2)*b(2,0);
397 result(0,1) = a(0,0)*b(0,1) + a(0,1)*b(1,1) + a(0,2)*b(2,1);
398 result(0,2) = a(0,0)*b(0,2) + a(0,1)*b(1,2) + a(0,2)*b(2,2);
399 result(1,0) = a(1,0)*b(0,0) + a(1,1)*b(1,0) + a(1,2)*b(2,0);
400 result(1,1) = a(1,0)*b(0,1) + a(1,1)*b(1,1) + a(1,2)*b(2,1);
401 result(1,2) = a(1,0)*b(0,2) + a(1,1)*b(1,2) + a(1,2)*b(2,2);
402 result(2,0) = a(2,0)*b(0,0) + a(2,1)*b(1,0) + a(2,2)*b(2,0);
403 result(2,1) = a(2,0)*b(0,1) + a(2,1)*b(1,1) + a(2,2)*b(2,1);
404 result(2,2) = a(2,0)*b(0,2) + a(2,1)*b(1,2) + a(2,2)*b(2,2);
411 __device__ __host__
inline
415 result(0,0) = a(0,0)*b(0,0) + a(0,1)*b(1,0);
416 result(0,1) = a(0,0)*b(0,1) + a(0,1)*b(1,1);
417 result(1,0) = a(1,0)*b(0,0) + a(1,1)*b(1,0);
418 result(1,1) = a(1,0)*b(0,1) + a(1,1)*b(1,1);
423 template<
class T,
int N>
424 __device__ __host__
inline
427 for(
int i=0; i<N; ++i){
428 for(
int j=0; j<N; ++j){
429 result(i,j) =
conj(other(j,i));
437 __device__ __host__
inline
446 temp = u(1,1)*u(2,2) - u(1,2)*u(2,1);
447 (*uinv)(0,0) = (det_inv*temp);
449 temp = u(0,2)*u(2,1) - u(0,1)*u(2,2);
450 (*uinv)(0,1) = (temp*det_inv);
452 temp = u(0,1)*u(1,2) - u(0,2)*u(1,1);
453 (*uinv)(0,2) = (temp*det_inv);
455 temp = u(1,2)*u(2,0) - u(1,0)*u(2,2);
456 (*uinv)(1,0) = (det_inv*temp);
458 temp = u(0,0)*u(2,2) - u(0,2)*u(2,0);
459 (*uinv)(1,1) = (temp*det_inv);
461 temp = u(0,2)*u(1,0) - u(0,0)*u(1,2);
462 (*uinv)(1,2) = (temp*det_inv);
464 temp = u(1,0)*u(2,1) - u(1,1)*u(2,0);
465 (*uinv)(2,0) = (det_inv*temp);
467 temp = u(0,1)*u(2,0) - u(0,0)*u(2,1);
468 (*uinv)(2,1) = (temp*det_inv);
470 temp = u(0,0)*u(1,1) - u(0,1)*u(1,0);
471 (*uinv)(2,2) = (temp*det_inv);
478 template<
class T,
int N>
479 __device__ __host__
inline
482 for(
int i=0; i<N; ++i){
484 for(
int j=i+1; j<N; ++j){
485 (*m)(i,j) = (*m)(j,i) = 0;
493 __device__ __host__
inline
496 for(
int i=0; i<N; ++i){
497 (*m)(i,i) = make_float2(1,0);
498 for(
int j=i+1; j<N; ++j){
499 (*m)(i,j) = (*m)(j,i) = make_float2(0.,0.);
507 __device__ __host__
inline
510 for(
int i=0; i<N; ++i){
511 (*m)(i,i) = make_double2(1,0);
512 for(
int j=i+1; j<N; ++j){
513 (*m)(i,j) = (*m)(j,i) = make_double2(0.,0.);
521 template<
class T,
int N>
522 __device__ __host__
inline
525 for(
int i=0; i<N; ++i){
526 for(
int j=0; j<N; ++j){
535 __device__ __host__
inline
538 for(
int i=0; i<N; ++i){
539 for(
int j=0; j<N; ++j){
540 (*m)(i,j) = make_float2(0.,0.);
548 __device__ __host__
inline
551 for(
int i=0; i<N; ++i){
552 for(
int j=0; j<N; ++j){
553 (*m)(i,j) = make_double2(0.,0.);
569 template<
class T,
int N>
577 __device__ __host__
inline
583 __device__ __host__
inline
590 template<
class T,
int N>
591 __device__ __host__
inline
594 for(
int i=0; i<N; ++i){
601 template<
class T,
int N>
602 __device__ __host__
inline
604 for(
int i=0; i<N; ++i){
605 const T conjb_i =
conj(b[i]);
606 for(
int j=0; j<N; ++j){
607 (*m)(j,i) = a[j]*conjb_i;
615 template<
class T,
int N>
616 std::ostream & operator << (std::ostream & os, const Matrix<T,N> & m){
617 for(
int i=0; i<N; ++i){
618 for(
int j=0; j<N; ++j){
621 if(i<N-1) os << std::endl;
627 template<
class T,
int N>
628 std::ostream & operator << (std::ostream & os, const Array<T,N> & a){
629 for(
int i=0; i<N; ++i){
640 for(
int i=0; i<9; ++i){
641 link->
data[i] = array[idx + (dir*9 + i)*stride];
651 for(
int i=0; i<9; ++i){
652 single_temp = array[idx + (dir*9 + i)*stride];
653 link->
data[i].x = single_temp.x;
654 link->
data[i].y = single_temp.y;
667 for(
int i=0; i<9; ++i){
668 array[idx + (dir*9 + i)*stride] = link.
data[i];
681 for(
int i=0; i<9; ++i){
682 single_temp.x = link.
data[i].x;
683 single_temp.y = link.
data[i].y;
684 array[idx + (dir*9 + i)*stride] = single_temp;
690 template<
class Cmplx>
691 __device__ __host__
inline
700 temp = u(1,1)*u(2,2) - u(1,2)*u(2,1);
701 (*uinv)(0,0) = (det_inv*temp);
703 temp = u(0,2)*u(2,1) - u(0,1)*u(2,2);
704 (*uinv)(0,1) = (temp*det_inv);
706 temp = u(0,1)*u(1,2) - u(0,2)*u(1,1);
707 (*uinv)(0,2) = (temp*det_inv);
709 temp = u(1,2)*u(2,0) - u(1,0)*u(2,2);
710 (*uinv)(1,0) = (det_inv*temp);
712 temp = u(0,0)*u(2,2) - u(0,2)*u(2,0);
713 (*uinv)(1,1) = (temp*det_inv);
715 temp = u(0,2)*u(1,0) - u(0,0)*u(1,2);
716 (*uinv)(1,2) = (temp*det_inv);
718 temp = u(1,0)*u(2,1) - u(1,1)*u(2,0);
719 (*uinv)(2,0) = (det_inv*temp);
721 temp = u(0,1)*u(2,0) - u(0,0)*u(2,1);
722 (*uinv)(2,1) = (temp*det_inv);
724 temp = u(0,0)*u(1,1) - u(0,1)*u(1,0);
725 (*uinv)(2,2) = (temp*det_inv);
731 for(
int i=0; i<3; ++i){
732 for(
int j=0; j<3; ++j){
733 (*link)(i,j).
x = array[(i*3+j)*2];
734 (*link)(i,j).y = array[(i*3+j)*2 + 1];
740 template<
class Cmplx,
class Real>
742 for(
int i=0; i<3; ++i){
743 for(
int j=0; j<3; ++j){
744 (*link)(i,j).
x = array[(i*3+j)*2];
745 (*link)(i,j).y = array[(i*3+j)*2 + 1];
754 for(
int i=0; i<3; ++i){
755 for(
int j=0; j<3; ++j){
756 array[(i*3+j)*2] = link(i,j).x;
757 array[(i*3+j)*2 + 1] = link(i,j).y;
764 template<
class Cmplx,
class Real>
766 for(
int i=0; i<3; ++i){
767 for(
int j=0; j<3; ++j){
768 array[(i*3+j)*2] = link(i,j).x;
769 array[(i*3+j)*2 + 1] = link(i,j).y;
778 template<
class Cmplx>
779 __host__ __device__
inline
781 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
782 printf(
"(%lf, %lf)\t", link(0,0).
x, link(0,0).y);
783 printf(
"(%lf, %lf)\t", link(0,1).
x, link(0,1).y);
784 printf(
"(%lf, %lf)\n", link(0,2).
x, link(0,2).y);
785 printf(
"(%lf, %lf)\t", link(1,0).
x, link(1,0).y);
786 printf(
"(%lf, %lf)\t", link(1,1).
x, link(1,1).y);
787 printf(
"(%lf, %lf)\n", link(1,2).
x, link(1,2).y);
788 printf(
"(%lf, %lf)\t", link(2,0).
x, link(2,0).y);
789 printf(
"(%lf, %lf)\t", link(2,1).
x, link(2,1).y);
790 printf(
"(%lf, %lf)\n", link(2,2).
x, link(2,2).y);
796 #endif // _QUDA_MATRIX_H_