QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
quda_matrix.h
Go to the documentation of this file.
1 #ifndef _QUDA_MATRIX_H_
2 #define _QUDA_MATRIX_H_
3 
4 #include <cstdio>
5 
6 #include <cstdlib>
7 #include <iostream>
8 #include <iomanip>
9 #include <cuda.h>
10 
11 #include <float_vector.h>
12 
13 #include <complex_quda.h>
14 
15 namespace quda{
16 
17  // Given a real type T, returns the corresponding complex type
18  template<class T>
19  struct ComplexTypeId;
20 
21  template<>
22  struct ComplexTypeId<float>
23  {
24  typedef float2 Type;
25  };
26 
27  template<>
28  struct ComplexTypeId<double>
29  {
30  typedef double2 Type;
31  };
32 
33  template<class T>
34  struct RealTypeId;
35 
36  template<>
37  struct RealTypeId<float>
38  {
39  typedef float Type;
40  };
41 
42  template<>
43  struct RealTypeId<double>
44  {
45  typedef double Type;
46  };
47 
48 
49  template<>
50  struct RealTypeId<float2>
51  {
52  typedef float Type;
53  };
54 
55  template<>
56  struct RealTypeId<double2>
57  {
58  typedef double Type;
59  };
60 
61 
62  template<class T, class U>
64  {
65  typedef T Type;
66  };
67 
68 
69  template<>
70  struct PromoteTypeId<float2, float>
71  {
72  typedef float2 Type;
73  };
74 
75 
76  template<>
77  struct PromoteTypeId<float, float2>
78  {
79  typedef float2 Type;
80  };
81 
82 
83  template<>
84  struct PromoteTypeId<double2, double>
85  {
86  typedef double2 Type;
87  };
88 
89 
90  template<>
91  struct PromoteTypeId<double, double2>
92  {
93  typedef double2 Type;
94  };
95 
96  template<>
97  struct PromoteTypeId<double,int>
98  {
99  typedef double Type;
100  };
101 
102  template<>
103  struct PromoteTypeId<int,double>
104  {
105  typedef double Type;
106  };
107 
108  template<>
109  struct PromoteTypeId<float,int>
110  {
111  typedef float Type;
112  };
113 
114  template<>
115  struct PromoteTypeId<int,float>
116  {
117  typedef float Type;
118  };
119 
120 
121 
122 
123  template<class Cmplx>
124  __device__ __host__ inline
125  Cmplx makeComplex(const typename RealTypeId<Cmplx>::Type & a, const typename RealTypeId<Cmplx>::Type & b)
126  {
127  Cmplx z;
128  z.x = a; z.y = b;
129  return z;
130  }
131 
132 
133  __device__ __host__ inline
134  double2 makeComplex(const double & a, const double & b){
135  return make_double2(a,b);
136  }
137 
138  __device__ __host__ inline
139  float2 makeComplex(const float & a, const float & b){
140  return make_float2(a,b);
141  }
142 
143  template<class Cmplx>
144  __device__ __host__ inline Cmplx operator-(const Cmplx &a){
145  return makeComplex(-a.x, -a.y);
146  }
147 
148  template<class Cmplx>
149  __device__ __host__ inline Cmplx & operator+=(Cmplx & a, const Cmplx & b){
150  a.x += b.x;
151  a.y += b.y;
152  return a;
153  }
154 
155  template<class Cmplx>
156  __device__ __host__ inline Cmplx & operator-=(Cmplx & a, const Cmplx & b){
157  a.x -= b.x;
158  a.y -= b.y;
159  return a;
160  }
161 
162 
163  template<class Cmplx>
164  __device__ __host__ inline Cmplx operator+(const Cmplx & a, const Cmplx & b){
165  return makeComplex(a.x+b.x,a.y+b.y);
166  }
167 
168  template<class Cmplx>
169  __device__ __host__ inline Cmplx operator-(const Cmplx & a, const Cmplx & b)
170  {
171  return makeComplex(a.x-b.x,a.y-b.y);
172  }
173 
174 #ifdef __GNU_C__
175  template<class Cmplx>
176  __device__ __host__ inline Cmplx operator*(const Cmplx & a, const typename RealTypeId<Cmplx>::Type & scalar)
177  {
178  return makeComplex(a.x*scalar,a.y*scalar);
179  }
180 
181  template<class Cmplx>
182  __device__ __host__ inline Cmplx operator+(const Cmplx & a, const typename RealTypeId<Cmplx>::Type & scalar)
183  {
184  return makeComplex(a.x+scalar,a.y);
185  }
186 
187  template<class Cmplx>
188  __device__ __host__ inline Cmplx operator*(const typename RealTypeId<Cmplx>::Type & scalar, const Cmplx & b)
189  {
190  return operator*(b,scalar);
191  }
192 #else
193  __device__ __host__ inline double2 operator*(const double2 & a, const double & scalar)
194  {
195  return makeComplex(a.x*scalar,a.y*scalar);
196  }
197 
198  __device__ __host__ inline float2 operator*(const float2 & a, const float & scalar)
199  {
200  return makeComplex(a.x*scalar,a.y*scalar);
201  }
202 
203  template<class Cmplx, class Float>
204  __device__ __host__ inline Cmplx operator+(const Cmplx & a, const Float & scalar)
205  {
206  return makeComplex(a.x+scalar,a.y);
207  }
208 
209  /*__device__ __host__ inline float2 operator*(const float & scalar, const float2 & b)
210  {
211  return makeComplex(b.x*scalar,b.y*scalar);
212  }
213 
214  __device__ __host__ inline double2 operator*(const double & scalar, const double2 & b)
215  {
216  return makeComplex(b.x*scalar,b.y*scalar);
217  }*/
218 #endif
219 
220  template<class Cmplx>
221  __device__ __host__ inline Cmplx operator/(const Cmplx & a, const typename RealTypeId<Cmplx>::Type & scalar)
222  {
223  return makeComplex(a.x/scalar,a.y/scalar);
224  }
225 
226  template<class Cmplx>
227  __device__ __host__ inline Cmplx operator+(const typename RealTypeId<Cmplx>::Type & scalar, const Cmplx & a)
228  {
229  return makeComplex(a.x+scalar,a.y);
230  }
231 
232  template<class Cmplx>
233  __device__ __host__ inline Cmplx operator-(const Cmplx & a, const typename RealTypeId<Cmplx>::Type & scalar)
234  {
235  return makeComplex(a.x-scalar,a.y);
236  }
237 
238  template<class Cmplx>
239  __device__ __host__ inline Cmplx operator-(const typename RealTypeId<Cmplx>::Type & scalar, const Cmplx & a)
240  {
241  return makeComplex(scalar-a.x,-a.y);
242  }
243 
244  template<class Cmplx>
245  __device__ __host__ inline Cmplx operator*(const Cmplx & a, const Cmplx & b)
246  {
247  return makeComplex(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
248  }
249 
250  template<class Cmplx>
251  __device__ __host__ inline Cmplx conj(const Cmplx & a)
252  {
253  return makeComplex(a.x,-a.y);
254  }
255 
256  __device__ __host__ inline double conj(const double & a)
257  {
258  return a;
259  }
260 
261  __device__ __host__ inline float conj(const float & a)
262  {
263  return a;
264  }
265 
266  template<typename Cmplx>
267  __device__ __host__ inline Cmplx Conj(const Cmplx & a)
268  {
269  return makeComplex(a.x,-a.y);
270  }
271 
272 
273 
274  template<class Cmplx>
275  __device__ __host__ inline
276  Cmplx getPreciseInverse(const Cmplx & z){
277  typename RealTypeId<Cmplx>::Type ratio, max, denom;
278  if( fabs(z.x) > fabs(z.y) ){ max = z.x; ratio = z.y/max; }else{ max=z.y; ratio = z.x/max; }
279  denom = max*max*(1 + ratio*ratio);
280  return makeComplex(z.x/denom, -z.y/denom);
281  }
282 
283 
284  // for printing
285  inline std::ostream & operator << (std::ostream & os, const float2 & z){
286  os << "(" << z.x << "," << z.y << ")";
287  return os;
288  }
289 
290  inline std::ostream & operator << (std::ostream & os, const double2 & z){
291  os << "(" << z.x << "," << z.y << ")";
292  return os;
293  }
294 
295 
296 
297  template<class T>
298  struct Zero
299  {
300  //static const T val;
301  __device__ __host__ inline
302  static T val();
303  };
304 
305  template<>
306  __device__ __host__ inline
308  {
309  return make_float2(0.,0.);
310  }
311 
312  template<>
313  __device__ __host__ inline
315  {
316  return make_double2(0.,0.);
317  }
318 
319 
320 
321  template<class T>
322  struct Identity
323  {
324  __device__ __host__ inline
325  static T val();
326  };
327 
328  template<>
329  __device__ __host__ inline
331  return make_float2(1.,0.);
332  }
333 
334  template<>
335  __device__ __host__ inline
337  return make_double2(1.,0.);
338  }
339 
340  template<int N>
341  __device__ __host__ inline
342  int index(int i, int j)
343  {
344  return i*N + j;
345  }
346 
347  template<class T, int N>
348  class Matrix
349  {
350  public:
351  T data[N*N];
352 
353  __device__ __host__ inline T const & operator()(int i, int j) const{
354  return data[index<N>(i,j)];
355  }
356 
357  __device__ __host__ inline T & operator()(int i, int j){
358  return data[index<N>(i,j)];
359  }
360 
361  __device__ __host__ inline complex<typename RealTypeId<T>::Type> const & operator()(int i) const{
362  int j = i % N;
363  int k = i / N;
364  return static_cast<complex<typename RealTypeId<T>::Type> >
365  (data[index<N>(j,k)]);
366  }
367 
368  __device__ __host__ inline complex<typename RealTypeId<T>::Type>& operator()(int i) {
369  int j = i % N;
370  int k = i / N;
371  return static_cast<complex<typename RealTypeId<T>::Type>& >
372  (data[index<N>(j,k)]);
373  }
374 
375  };
376 
377  template<class T>
378  __device__ __host__ inline T getTrace(const Matrix<T,3>& a)
379  {
380  return a(0,0) + a(1,1) + a(2,2);
381  }
382 
383 
384  template<class T>
385  __device__ __host__ inline T getDeterminant(const Matrix<T,3> & a){
386 
387  T result;
388  result = a(0,0)*(a(1,1)*a(2,2) - a(2,1)*a(1,2))
389  - a(0,1)*(a(1,0)*a(2,2) - a(1,2)*a(2,0))
390  + a(0,2)*(a(1,0)*a(2,1) - a(1,1)*a(2,0));
391 
392  return result;
393  }
394 
395  template<class T, int N>
396  __device__ __host__ inline Matrix<T,N> operator+(const Matrix<T,N> & a, const Matrix<T,N> & b)
397  {
398  Matrix<T,N> result;
399  for(int i=0; i<N*N; i++){
400  result.data[i] = a.data[i] + b.data[i];
401  }
402  return result;
403  }
404 
405 
406  template<class T, int N>
407  __device__ __host__ inline Matrix<T,N> operator+=(Matrix<T,N> & a, const Matrix<T,N> & b)
408  {
409  for(int i=0; i<N*N; i++){
410  a.data[i] += b.data[i];
411  }
412  return a;
413  }
414 
415 
416  template<class T, int N>
417  __device__ __host__ inline Matrix<T,N> operator-=(Matrix<T,N> & a, const Matrix<T,N> & b)
418  {
419  for(int i=0; i<N*N; i++){
420  a.data[i] -= b.data[i];
421  }
422  return a;
423  }
424 
425 
426  template<class T, int N>
427  __device__ __host__ inline Matrix<T,N> operator-(const Matrix<T,N> & a, const Matrix<T,N> & b)
428  {
429  Matrix<T,N> result;
430  for(int i=0; i<N*N; i++){
431  result.data[i] = a.data[i] - b.data[i];
432  }
433  return result;
434  }
435 
436 
437 
438  template<class T, int N, class S>
439  __device__ __host__ inline Matrix<T,N> operator*(const S & scalar, const Matrix<T,N> & a){
440  Matrix<T,N> result;
441  for(int i=0; i<N*N; ++i){
442  result.data[i] = scalar*a.data[i];
443  }
444  return result;
445  }
446 
447 
448  template<class T, int N, class S>
449  __device__ __host__ inline Matrix<T,N> operator*(const Matrix<T,N> & a, const S & scalar){
450  return scalar*a;
451  }
452 
453  template<class T, int N, class S>
454  __device__ __host__ inline Matrix<T,N> operator *=(Matrix<T,N> & a, const S & scalar){
455  a = scalar*a;
456  return a;
457  }
458 
459  template<class T, int N>
460  __device__ __host__ inline Matrix<T,N> operator-(const Matrix<T,N> & a){
461  Matrix<T,N> result;
462  for(int i=0; i<(N*N); ++i){
463  result.data[i] = -1*a.data[i];
464  }
465  return result;
466  }
467 
468 
469 
470  template<class T>
471  __device__ __host__ inline
473  {
474  // The compiler has a hard time unrolling nested loops,
475  // so here I do it by hand.
476  // I could do something more sophisticated in the future.
477  Matrix<T,3> result;
478  result(0,0) = a(0,0)*b(0,0) + a(0,1)*b(1,0) + a(0,2)*b(2,0);
479  result(0,1) = a(0,0)*b(0,1) + a(0,1)*b(1,1) + a(0,2)*b(2,1);
480  result(0,2) = a(0,0)*b(0,2) + a(0,1)*b(1,2) + a(0,2)*b(2,2);
481  result(1,0) = a(1,0)*b(0,0) + a(1,1)*b(1,0) + a(1,2)*b(2,0);
482  result(1,1) = a(1,0)*b(0,1) + a(1,1)*b(1,1) + a(1,2)*b(2,1);
483  result(1,2) = a(1,0)*b(0,2) + a(1,1)*b(1,2) + a(1,2)*b(2,2);
484  result(2,0) = a(2,0)*b(0,0) + a(2,1)*b(1,0) + a(2,2)*b(2,0);
485  result(2,1) = a(2,0)*b(0,1) + a(2,1)*b(1,1) + a(2,2)*b(2,1);
486  result(2,2) = a(2,0)*b(0,2) + a(2,1)*b(1,2) + a(2,2)*b(2,2);
487  return result;
488  }
489 
490  template<class T, int N>
491  __device__ __host__ inline Matrix<T,N> operator *=(Matrix<T,N> & a, const Matrix<T,N>& b){
492 
493  Matrix<T,N> c = a;
494  a = c*b;
495  return a;
496  }
497 
498 
499 
500 
501 
502 
503 
504 
505  // This is so that I can multiply real and complex matrices
506  template<class T, class U>
507  __device__ __host__ inline
509  {
511  result(0,0) = a(0,0)*b(0,0) + a(0,1)*b(1,0) + a(0,2)*b(2,0);
512  result(0,1) = a(0,0)*b(0,1) + a(0,1)*b(1,1) + a(0,2)*b(2,1);
513  result(0,2) = a(0,0)*b(0,2) + a(0,1)*b(1,2) + a(0,2)*b(2,2);
514  result(1,0) = a(1,0)*b(0,0) + a(1,1)*b(1,0) + a(1,2)*b(2,0);
515  result(1,1) = a(1,0)*b(0,1) + a(1,1)*b(1,1) + a(1,2)*b(2,1);
516  result(1,2) = a(1,0)*b(0,2) + a(1,1)*b(1,2) + a(1,2)*b(2,2);
517  result(2,0) = a(2,0)*b(0,0) + a(2,1)*b(1,0) + a(2,2)*b(2,0);
518  result(2,1) = a(2,0)*b(0,1) + a(2,1)*b(1,1) + a(2,2)*b(2,1);
519  result(2,2) = a(2,0)*b(0,2) + a(2,1)*b(1,2) + a(2,2)*b(2,2);
520  return result;
521  }
522 
523 
524 
525  template<class T>
526  __device__ __host__ inline
528  {
529  Matrix<T,2> result;
530  result(0,0) = a(0,0)*b(0,0) + a(0,1)*b(1,0);
531  result(0,1) = a(0,0)*b(0,1) + a(0,1)*b(1,1);
532  result(1,0) = a(1,0)*b(0,0) + a(1,1)*b(1,0);
533  result(1,1) = a(1,0)*b(0,1) + a(1,1)*b(1,1);
534  return result;
535  }
536 
537 
538  template<class T, int N>
539  __device__ __host__ inline
540  Matrix<T,N> conj(const Matrix<T,N> & other){
541  Matrix<T,N> result;
542  for(int i=0; i<N; ++i){
543  for(int j=0; j<N; ++j){
544  result(i,j) =
546  (other(j,i)));
547  }
548  }
549  return result;
550  }
551 
552 
553  template<class T>
554  __device__ __host__ inline
556  {
557 
558  const T & det = getDeterminant(u);
559  const T & det_inv = getPreciseInverse(det);
560 
561  T temp;
562 
563  temp = u(1,1)*u(2,2) - u(1,2)*u(2,1);
564  (*uinv)(0,0) = (det_inv*temp);
565 
566  temp = u(0,2)*u(2,1) - u(0,1)*u(2,2);
567  (*uinv)(0,1) = (temp*det_inv);
568 
569  temp = u(0,1)*u(1,2) - u(0,2)*u(1,1);
570  (*uinv)(0,2) = (temp*det_inv);
571 
572  temp = u(1,2)*u(2,0) - u(1,0)*u(2,2);
573  (*uinv)(1,0) = (det_inv*temp);
574 
575  temp = u(0,0)*u(2,2) - u(0,2)*u(2,0);
576  (*uinv)(1,1) = (temp*det_inv);
577 
578  temp = u(0,2)*u(1,0) - u(0,0)*u(1,2);
579  (*uinv)(1,2) = (temp*det_inv);
580 
581  temp = u(1,0)*u(2,1) - u(1,1)*u(2,0);
582  (*uinv)(2,0) = (det_inv*temp);
583 
584  temp = u(0,1)*u(2,0) - u(0,0)*u(2,1);
585  (*uinv)(2,1) = (temp*det_inv);
586 
587  temp = u(0,0)*u(1,1) - u(0,1)*u(1,0);
588  (*uinv)(2,2) = (temp*det_inv);
589 
590  return;
591  }
592 
593 
594 
595  template<class T, int N>
596  __device__ __host__ inline
598 
599  for(int i=0; i<N; ++i){
600  (*m)(i,i) = 1;
601  for(int j=i+1; j<N; ++j){
602  (*m)(i,j) = (*m)(j,i) = 0;
603  }
604  }
605  return;
606  }
607 
608 
609  template<int N>
610  __device__ __host__ inline
612 
613  for(int i=0; i<N; ++i){
614  (*m)(i,i) = make_float2(1,0);
615  for(int j=i+1; j<N; ++j){
616  (*m)(i,j) = (*m)(j,i) = make_float2(0.,0.);
617  }
618  }
619  return;
620  }
621 
622 
623  template<int N>
624  __device__ __host__ inline
626 
627  for(int i=0; i<N; ++i){
628  (*m)(i,i) = make_double2(1,0);
629  for(int j=i+1; j<N; ++j){
630  (*m)(i,j) = (*m)(j,i) = make_double2(0.,0.);
631  }
632  }
633  return;
634  }
635 
636 
637  // Need to write more generic code for this!
638  template<class T, int N>
639  __device__ __host__ inline
641 
642  for(int i=0; i<N; ++i){
643  for(int j=0; j<N; ++j){
644  (*m)(i,j) = 0;
645  }
646  }
647  return;
648  }
649 
650 
651  template<int N>
652  __device__ __host__ inline
654 
655  for(int i=0; i<N; ++i){
656  for(int j=0; j<N; ++j){
657  (*m)(i,j) = make_float2(0.,0.);
658  }
659  }
660  return;
661  }
662 
663 
664  template<int N>
665  __device__ __host__ inline
667 
668  for(int i=0; i<N; ++i){
669  for(int j=0; j<N; ++j){
670  (*m)(i,j) = make_double2(0.,0.);
671  }
672  }
673  return;
674  }
675 
676 
677 
678 
679 
680  // Matrix and array are very similar
681  // Maybe I should factor out the similar
682  // code. However, I want to make sure that
683  // the compiler knows to store the
684  // data elements in registers, so I won't do
685  // it right now.
686  template<class T, int N>
687  class Array
688  {
689  private:
690  T data[N];
691 
692  public:
693  // access function
694  __device__ __host__ inline
695  T const & operator[](int i) const{
696  return data[i];
697  }
698 
699  // assignment function
700  __device__ __host__ inline
701  T & operator[](int i){
702  return data[i];
703  }
704  };
705 
706 
707  template<class T, int N>
708  __device__ __host__ inline
709  void copyColumn(const Matrix<T,N>& m, int c, Array<T,N>* a)
710  {
711  for(int i=0; i<N; ++i){
712  (*a)[i] = m(i,c); // c is the column index
713  }
714  return;
715  }
716 
717 
718  template<class T, int N>
719  __device__ __host__ inline
720  void outerProd(const Array<T,N>& a, const Array<T,N> & b, Matrix<T,N>* m){
721  for(int i=0; i<N; ++i){
722  const T conjb_i = Conj(b[i]);
723  for(int j=0; j<N; ++j){
724  (*m)(j,i) = a[j]*conjb_i; // we reverse the ordering of indices because it cuts down on the number of function calls
725  }
726  }
727  return;
728  }
729 
730  template<class T, int N>
731  __device__ __host__ inline
732  void outerProd(const T (&a)[N], const T (&b)[N], Matrix<T,N>* m){
733  for(int i=0; i<N; ++i){
734  const T conjb_i = conj(static_cast<complex<typename RealTypeId<T>::Type> >(b[i]));
735  for(int j=0; j<N; ++j){
736  (*m)(j,i) = a[j]*conjb_i; // we reverse the ordering of indices because it cuts down on the number of function calls
737  }
738  }
739  return;
740  }
741 
742 
743  // Need some print utilities
744  template<class T, int N>
745  std::ostream & operator << (std::ostream & os, const Matrix<T,N> & m){
746  for(int i=0; i<N; ++i){
747  for(int j=0; j<N; ++j){
748  os << m(i,j) << " ";
749  }
750  if(i<N-1) os << std::endl;
751  }
752  return os;
753  }
754 
755 
756  template<class T, int N>
757  std::ostream & operator << (std::ostream & os, const Array<T,N> & a){
758  for(int i=0; i<N; ++i){
759  os << a[i] << " ";
760  }
761  return os;
762  }
763 
764 
765  template<class T>
766  __device__ inline
767  void loadLinkVariableFromArray(const T* const array, const int dir, const int idx, const int stride, Matrix<T,3> *link)
768  {
769  for(int i=0; i<9; ++i){
770  link->data[i] = array[idx + (dir*9 + i)*stride];
771  }
772  return;
773  }
774 
775 
776  template<class T, int N>
777  __device__ inline
778  void loadMatrixFromArray(const T* const array, const int idx, const int stride, Matrix<T,N> *mat)
779  {
780  for(int i=0; i<(N*N); ++i){
781  mat->data[i] = array[idx + i*stride];
782  }
783  }
784 
785 
786  __device__ inline
787  void loadLinkVariableFromArray(const float2* const array, const int dir, const int idx, const int stride, Matrix<double2,3> *link)
788  {
789  float2 single_temp;
790  for(int i=0; i<9; ++i){
791  single_temp = array[idx + (dir*9 + i)*stride];
792  link->data[i].x = single_temp.x;
793  link->data[i].y = single_temp.y;
794  }
795  return;
796  }
797 
798 
799 
800  template<class T, int N>
801  __device__ inline
802  void writeMatrixToArray(const Matrix<T,N>& mat, const int idx, const int stride, T* const array)
803  {
804  for(int i=0; i<(N*N); ++i){
805  array[idx + i*stride] = mat.data[i];
806  }
807  }
808 
809  __device__ inline
810  void appendMatrixToArray(const Matrix<double2,3>& mat, const int idx, const int stride, double2* const array)
811  {
812  for(int i=0; i<9; ++i){
813  array[idx + i*stride].x += mat.data[i].x;
814  array[idx + i*stride].y += mat.data[i].y;
815  }
816  }
817 
818  __device__ inline
819  void appendMatrixToArray(const Matrix<float2,3>& mat, const int idx, const int stride, float2* const array)
820  {
821  for(int i=0; i<9; ++i){
822  array[idx + i*stride].x += mat.data[i].x;
823  array[idx + i*stride].y += mat.data[i].y;
824  }
825  }
826 
827 
828  template<class T>
829  __device__ inline
830  void writeLinkVariableToArray(const Matrix<T,3> & link, const int dir, const int idx, const int stride, T* const array)
831  {
832  for(int i=0; i<9; ++i){
833  array[idx + (dir*9 + i)*stride] = link.data[i];
834  }
835  return;
836  }
837 
838 
839 
840 
841  __device__ inline
842  void writeLinkVariableToArray(const Matrix<double2,3> & link, const int dir, const int idx, const int stride, float2* const array)
843  {
844  float2 single_temp;
845 
846  for(int i=0; i<9; ++i){
847  single_temp.x = link.data[i].x;
848  single_temp.y = link.data[i].y;
849  array[idx + (dir*9 + i)*stride] = single_temp;
850  }
851  return;
852  }
853 
854 
855  template<class T>
856  __device__ inline
857  void loadMomentumFromArray(const T* const array, const int dir, const int idx, const int stride, Matrix<T,3> *mom)
858  {
859  T temp2[5];
860  temp2[0] = array[idx + dir*stride*5];
861  temp2[1] = array[idx + dir*stride*5 + stride];
862  temp2[2] = array[idx + dir*stride*5 + 2*stride];
863  temp2[3] = array[idx + dir*stride*5 + 3*stride];
864  temp2[4] = array[idx + dir*stride*5 + 4*stride];
865 
866  mom->data[0].x = 0.;
867  mom->data[0].y = temp2[3].x;
868  mom->data[1] = temp2[0];
869  mom->data[2] = temp2[1];
870 
871  mom->data[3].x = -mom->data[1].x;
872  mom->data[3].y = mom->data[1].y;
873  mom->data[4].x = 0.;
874  mom->data[4].y = temp2[3].y;
875  mom->data[5] = temp2[2];
876 
877  mom->data[6].x = -mom->data[2].x;
878  mom->data[6].y = mom->data[2].y;
879 
880  mom->data[7].x = -mom->data[5].x;
881  mom->data[7].y = mom->data[5].y;
882 
883  mom->data[8].x = 0.;
884  mom->data[8].y = temp2[4].x;
885 
886  return;
887  }
888 
889 
890 
891  template<class T, class U>
892  __device__ inline
893  void writeMomentumToArray(const Matrix<T,3> & mom, const int dir, const int idx, const U coeff, const int stride, T* const array)
894  {
895  T temp2;
896  temp2.x = (mom.data[1].x - mom.data[3].x)*0.5*coeff;
897  temp2.y = (mom.data[1].y + mom.data[3].y)*0.5*coeff;
898  array[idx + dir*stride*5] = temp2;
899 
900  temp2.x = (mom.data[2].x - mom.data[6].x)*0.5*coeff;
901  temp2.y = (mom.data[2].y + mom.data[6].y)*0.5*coeff;
902  array[idx + dir*stride*5 + stride] = temp2;
903 
904  temp2.x = (mom.data[5].x - mom.data[7].x)*0.5*coeff;
905  temp2.y = (mom.data[5].y + mom.data[7].y)*0.5*coeff;
906  array[idx + dir*stride*5 + stride*2] = temp2;
907 
908  const typename RealTypeId<T>::Type temp = (mom.data[0].y + mom.data[4].y + mom.data[8].y)*0.3333333333333333333333333;
909  temp2.x = (mom.data[0].y-temp)*coeff;
910  temp2.y = (mom.data[4].y-temp)*coeff;
911  array[idx + dir*stride*5 + stride*3] = temp2;
912 
913  temp2.x = (mom.data[8].y - temp)*coeff;
914  temp2.y = 0.0;
915  array[idx + dir*stride*5 + stride*4] = temp2;
916 
917  return;
918  }
919 
920 
921 
922  template<class Cmplx>
923  __device__ __host__ inline
925  {
926 
927  const Cmplx & det = getDeterminant(u);
928  const Cmplx & det_inv = getPreciseInverse(det);
929 
930  Cmplx temp;
931 
932  temp = u(1,1)*u(2,2) - u(1,2)*u(2,1);
933  (*uinv)(0,0) = (det_inv*temp);
934 
935  temp = u(0,2)*u(2,1) - u(0,1)*u(2,2);
936  (*uinv)(0,1) = (temp*det_inv);
937 
938  temp = u(0,1)*u(1,2) - u(0,2)*u(1,1);
939  (*uinv)(0,2) = (temp*det_inv);
940 
941  temp = u(1,2)*u(2,0) - u(1,0)*u(2,2);
942  (*uinv)(1,0) = (det_inv*temp);
943 
944  temp = u(0,0)*u(2,2) - u(0,2)*u(2,0);
945  (*uinv)(1,1) = (temp*det_inv);
946 
947  temp = u(0,2)*u(1,0) - u(0,0)*u(1,2);
948  (*uinv)(1,2) = (temp*det_inv);
949 
950  temp = u(1,0)*u(2,1) - u(1,1)*u(2,0);
951  (*uinv)(2,0) = (det_inv*temp);
952 
953  temp = u(0,1)*u(2,0) - u(0,0)*u(2,1);
954  (*uinv)(2,1) = (temp*det_inv);
955 
956  temp = u(0,0)*u(1,1) - u(0,1)*u(1,0);
957  (*uinv)(2,2) = (temp*det_inv);
958 
959  return;
960  }
961  // template this!
962  inline void copyArrayToLink(Matrix<float2,3>* link, float* array){
963  for(int i=0; i<3; ++i){
964  for(int j=0; j<3; ++j){
965  (*link)(i,j).x = array[(i*3+j)*2];
966  (*link)(i,j).y = array[(i*3+j)*2 + 1];
967  }
968  }
969  return;
970  }
971 
972  template<class Cmplx, class Real>
973  inline void copyArrayToLink(Matrix<Cmplx,3>* link, Real* array){
974  for(int i=0; i<3; ++i){
975  for(int j=0; j<3; ++j){
976  (*link)(i,j).x = array[(i*3+j)*2];
977  (*link)(i,j).y = array[(i*3+j)*2 + 1];
978  }
979  }
980  return;
981  }
982 
983 
984  // and this!
985  inline void copyLinkToArray(float* array, const Matrix<float2,3>& link){
986  for(int i=0; i<3; ++i){
987  for(int j=0; j<3; ++j){
988  array[(i*3+j)*2] = link(i,j).x;
989  array[(i*3+j)*2 + 1] = link(i,j).y;
990  }
991  }
992  return;
993  }
994 
995  // and this!
996  template<class Cmplx, class Real>
997  inline void copyLinkToArray(Real* array, const Matrix<Cmplx,3>& link){
998  for(int i=0; i<3; ++i){
999  for(int j=0; j<3; ++j){
1000  array[(i*3+j)*2] = link(i,j).x;
1001  array[(i*3+j)*2 + 1] = link(i,j).y;
1002  }
1003  }
1004  return;
1005  }
1006 
1007 
1008 
1009  // and this!
1010  template<class Cmplx>
1011  __host__ __device__ inline
1012  void printLink(const Matrix<Cmplx,3>& link){
1013 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
1014  printf("(%lf, %lf)\t", link(0,0).x, link(0,0).y);
1015  printf("(%lf, %lf)\t", link(0,1).x, link(0,1).y);
1016  printf("(%lf, %lf)\n", link(0,2).x, link(0,2).y);
1017  printf("(%lf, %lf)\t", link(1,0).x, link(1,0).y);
1018  printf("(%lf, %lf)\t", link(1,1).x, link(1,1).y);
1019  printf("(%lf, %lf)\n", link(1,2).x, link(1,2).y);
1020  printf("(%lf, %lf)\t", link(2,0).x, link(2,0).y);
1021  printf("(%lf, %lf)\t", link(2,1).x, link(2,1).y);
1022  printf("(%lf, %lf)\n", link(2,2).x, link(2,2).y);
1023  printf("\n");
1024 #endif
1025  }
1026 
1027 } // end namespace quda
1028 #endif // _QUDA_MATRIX_H_
__host__ __device__ float4 operator-=(float4 &x, const float4 y)
Definition: float_vector.h:110
__device__ __host__ complex< typename RealTypeId< T >::Type > const & operator()(int i) const
Definition: quda_matrix.h:361
__device__ __host__ void setZero(Matrix< T, N > *m)
Definition: quda_matrix.h:640
int y[4]
__device__ __host__ T const & operator()(int i, int j) const
Definition: quda_matrix.h:353
__device__ static __host__ T val()
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
__device__ __host__ T getDeterminant(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:385
__device__ void writeMomentumToArray(const Matrix< T, 3 > &mom, const int dir, const int idx, const U coeff, const int stride, T *const array)
Definition: quda_matrix.h:893
__host__ __device__ float2 operator*=(float2 &x, const float a)
Definition: float_vector.h:130
std::ostream & operator<<(std::ostream &output, const CloverFieldParam &param)
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
Definition: quda_matrix.h:1012
void copyLinkToArray(float *array, const Matrix< float2, 3 > &link)
Definition: quda_matrix.h:985
__device__ __host__ void outerProd(const Array< T, N > &a, const Array< T, N > &b, Matrix< T, N > *m)
Definition: quda_matrix.h:720
__device__ __host__ Cmplx Conj(const Cmplx &a)
Definition: quda_matrix.h:267
__device__ __host__ T const & operator[](int i) const
Definition: quda_matrix.h:695
__device__ __host__ int index(int i, int j)
Definition: quda_matrix.h:342
FloatingPoint< float > Float
Definition: gtest.h:7350
__device__ __host__ Cmplx getPreciseInverse(const Cmplx &z)
Definition: quda_matrix.h:276
__host__ __device__ complex< ValueType > operator-(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
Definition: complex_quda.h:673
T data[N *N]
Definition: quda_matrix.h:351
__device__ static __host__ T val()
__constant__ double coeff
__device__ __host__ void copyColumn(const Matrix< T, N > &m, int c, Array< T, N > *a)
Definition: quda_matrix.h:709
int x[4]
void copyArrayToLink(Matrix< float2, 3 > *link, float *array)
Definition: quda_matrix.h:962
__device__ __host__ void setIdentity(Matrix< T, N > *m)
Definition: quda_matrix.h:597
__device__ void loadMatrixFromArray(const T *const array, const int idx, const int stride, Matrix< T, N > *mat)
Definition: quda_matrix.h:778
__host__ __device__ complex< ValueType > operator/(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
Definition: complex_quda.h:716
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:378
__host__ __device__ complex< ValueType > operator*(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
Definition: complex_quda.h:692
__device__ void writeMatrixToArray(const Matrix< T, N > &mat, const int idx, const int stride, T *const array)
Definition: quda_matrix.h:802
__device__ void loadMomentumFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *mom)
Definition: quda_matrix.h:857
__device__ __host__ T & operator()(int i, int j)
Definition: quda_matrix.h:357
__device__ void loadLinkVariableFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *link)
Definition: quda_matrix.h:767
__device__ __host__ void computeLinkInverse(Matrix< Cmplx, 3 > *uinv, const Matrix< Cmplx, 3 > &u)
Definition: quda_matrix.h:924
__device__ __host__ void computeMatrixInverse(const Matrix< T, 3 > &u, Matrix< T, 3 > *uinv)
Definition: quda_matrix.h:555
__host__ __device__ float4 operator+=(float4 &x, const float4 y)
Definition: float_vector.h:83
__device__ void appendMatrixToArray(const Matrix< double2, 3 > &mat, const int idx, const int stride, double2 *const array)
Definition: quda_matrix.h:810
__device__ void writeLinkVariableToArray(const Matrix< T, 3 > &link, const int dir, const int idx, const int stride, T *const array)
Definition: quda_matrix.h:830
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
__host__ __device__ complex< ValueType > operator+(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
Definition: complex_quda.h:644
__device__ __host__ complex< typename RealTypeId< T >::Type > & operator()(int i)
Definition: quda_matrix.h:368
__device__ __host__ T & operator[](int i)
Definition: quda_matrix.h:701
__device__ __host__ Cmplx makeComplex(const typename RealTypeId< Cmplx >::Type &a, const typename RealTypeId< Cmplx >::Type &b)
Definition: quda_matrix.h:125