QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
hisq_force_reference2.cpp
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <iostream>
4 #include <iomanip>
5 #include <complex>
6 
7 #include <quda.h>
8 #include <gauge_field.h>
9 
10 using namespace quda;
11 //namespace quda {
12 
13 #define RETURN_IF_ERR if(err) return;
14 
15 extern int gauge_order;
16 extern int Vh;
17 extern int Vh_ex;
18 
19  static int OPP_DIR(int dir){ return 7-dir; }
20  static bool GOES_FORWARDS(int dir){ return (dir<=3); }
21 
22  template<int N>
23  struct Sign{
24  static const int result = 1;
25  };
26 
27  template<>
28  struct Sign<1>
29  {
30  static const int result = -1;
31  };
32 
33 
34  template<class T, class U>
35  struct Promote
36  {
37  typedef T Type;
38  };
39 
40  template<>
41  struct Promote<int,float>
42  {
43  typedef float Type;
44  };
45 
46  template<>
47  struct Promote<float,int>
48  {
49  typedef float Type;
50  };
51 
52  template<>
53  struct Promote<int, double>
54  {
55  typedef double Type;
56  };
57 
58  template<>
59  struct Promote<double, int>
60  {
61  typedef double Type;
62  };
63 
64  template<>
65  struct Promote<float, double>
66  {
67  typedef double Type;
68  };
69 
70  template<>
71  struct Promote<double, float>
72  {
73  typedef double Type;
74  };
75 
76  template<>
77  struct Promote< int, std::complex<float> >
78  {
79  typedef std::complex<float> Type;
80  };
81 
82  template<>
83  struct Promote< std::complex<float>, int >
84  {
85  typedef std::complex<float> Type;
86  };
87 
88  template<>
89  struct Promote< float, std::complex<float> >
90  {
91  typedef std::complex<float> Type;
92  };
93 
94  template<>
95  struct Promote< int, std::complex<double> >
96  {
97  typedef std::complex<double> Type;
98  };
99 
100  template<>
101  struct Promote< std::complex<double>, int >
102  {
103  typedef std::complex<double> Type;
104  };
105 
106  template<>
107  struct Promote< float, std::complex<double> >
108  {
109  typedef std::complex<double> Type;
110  };
111 
112  template<>
113  struct Promote< std::complex<double>, float >
114  {
115  typedef std::complex<double> Type;
116  };
117 
118  template<>
119  struct Promote< double, std::complex<double> >
120  {
121  typedef std::complex<double> Type;
122  };
123 
124  template<>
125  struct Promote< std::complex<double>, double >
126  {
127  typedef std::complex<double> Type;
128  };
129 
130  template<int N, class T>
131  class Matrix
132  {
133  private:
134  T data[N][N];
135 
136  public:
137  Matrix(); // default constructor
138  Matrix(const Matrix<N,T>& mat); // copy constructor
139  Matrix & operator += (const Matrix<N,T>& mat);
140  Matrix & operator -= (const Matrix<N,T>& mat);
141  const T& operator()(int i, int j) const;
142  T& operator()(int i, int j);
143  };
144 
145  template<int N, class T>
147  {
148  for(int i=0; i<N; ++i){
149  for(int j=0; j<N; ++j){
150  data[i][j] = static_cast<T>(0);
151  }
152  }
153  }
154 
155  template<int N, class T>
157  {
158  for(int i=0; i<N; ++i){
159  for(int j=0; j<N; ++j){
160  data[i][j] = mat.data[i][j];
161  }
162  }
163  }
164 
165  template<int N, class T>
166  T& Matrix<N,T>::operator()(int i, int j)
167  {
168  return data[i][j];
169  }
170 
171  template<int N, class T>
172  const T& Matrix<N,T>::operator()(int i, int j) const
173  {
174  return data[i][j];
175  }
176 
177  template<int N, class T>
179  {
180  for(int i=0; i<N; ++i){
181  for(int j=0; j<N; ++j){
182  data[i][j] += mat.data[i][j];
183  }
184  }
185  return *this;
186  }
187 
188  template<int N, class T>
190  {
191  for(int i=0; i<N; ++i){
192  for(int j=0; j<N; ++j){
193  data[i][j] -= mat.data[i][j];
194  }
195  }
196  return *this;
197  }
198 
199  template<int N, class T>
201  {
202  Matrix<N,T> result(a);
203  result += b;
204  return result;
205  }
206 
207  template<int N, class T>
209  {
210  Matrix<N,T> result(a);
211  result -= b;
212  return result;
213  }
214 
215  template<int N, class T>
217  {
218  Matrix<N,T> result;
219  for(int i=0; i<N; ++i){
220  for(int j=0; j<N; ++j){
221  result(i,j) = static_cast<T>(0);
222  for(int k=0; k<N; ++k){
223  result(i,j) += a(i,k)*b(k,j);
224  }
225  }
226  }
227  return result;
228  }
229 
230  template<int N, class T>
231  Matrix<N,std::complex<T> > conj(const Matrix<N,std::complex<T> >& mat)
232  {
233  Matrix<N,std::complex<T> > result;
234  for(int i=0; i<N; ++i){
235  for(int j=0; j<N; ++j){
236  result(i,j) = std::conj(mat(j,i));
237  }
238  }
239  return result;
240  }
241 
242  template<int N, class T>
243  Matrix<N,T> transpose(const Matrix<N,std::complex<T> >& mat)
244  {
245  Matrix<N,T> result;
246  for(int i=0; i<N; ++i){
247  for(int j=0; j<N; ++j){
248  result(i,j) = mat(j,i);
249  }
250  }
251  return result;
252  }
253 
254  template<int N, class T, class U>
256  {
257  typedef typename Promote<T,U>::Type return_type;
258  Matrix<N,return_type> result;
259 
260  for(int i=0; i<N; ++i){
261  for(int j=0; j<N; ++j){
262  result(i,j) = scalar*mat(i,j);
263  }
264  }
265  return result;
266  }
267 
268  template<int N, class T, class U>
270  {
271  return mat*scalar;
272  }
273 
274  template<int N, class T>
275  struct Identity
276  {
278  Matrix<N,T> id;
279  for(int i=0; i<N; ++i){
280  id(i,i) = static_cast<T>(1);
281  }
282  return id;
283  } // operator()
284  };
285 
286  template<int N, class T>
287  struct Zero
288  {
289  // the default constructor zeros all matrix elements
291  return Matrix<N,T>();
292  }
293  };
294 
295 
296  template<int N, class T>
297  std::ostream & operator << (std::ostream & os, const Matrix<N,T> & m)
298  {
299  for(int i=0; i<N; ++i){
300  for(int j=0; j<N; ++j){
301  os << m(i,j) << " ";
302  }
303  if(i<N-1) os << std::endl;
304  }
305  return os;
306  }
307 
308 
309 
310  template<class Real>
311  class LoadStore{
312  private:
313  const int volume;
314  const int half_volume;
315  public:
316  LoadStore(int vol) : volume(vol), half_volume(vol/2) {}
317 
318  void loadMatrixFromField(const Real* const field, int oddBit, int half_lattice_index, Matrix<3, std::complex<Real> >* const mat) const;
319 
320  void loadMatrixFromField(const Real* const field, int oddBit, int dir, int half_lattice_index, Matrix<3, std::complex<Real> >* const mat) const;
321 
322  void storeMatrixToField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int half_lattice_index, Real* const field) const;
323 
324  void addMatrixToField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int half_lattice_index, Real coeff, Real* const) const;
325 
326  void addMatrixToField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real* const) const;
327 
328  void storeMatrixToMomentumField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real* const) const;
329  Real getData(const Real* const field, int idx, int dir, int oddBit, int offset, int hfv) const;
330  void addData(Real* const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const;
331  int half_idx_conversion_ex2normal(int half_lattice_index, const int* dim, int oddBit) const ;
332  int half_idx_conversion_normal2ex(int half_lattice_index, const int* dim, int oddBit) const ;
333  };
334 
335 template<class Real>
336 int LoadStore<Real>::half_idx_conversion_ex2normal(int half_lattice_index_ex, const int* dim, int oddBit) const
337 {
338  int X1=dim[0];
339  int X2=dim[1];
340  int X3=dim[2];
341  //int X4=dim[3];
342  int X1h=X1/2;
343 
344  int E1=dim[0]+4;
345  int E2=dim[1]+4;
346  int E3=dim[2]+4;
347  //int E4=dim[3]+4;
348  int E1h=E1/2;
349 
350  int sid = half_lattice_index_ex;
351 
352  int za = sid/E1h;
353  int x1h = sid - za*E1h;
354  int zb = za/E2;
355  int x2 = za - zb*E2;
356  int x4 = zb/E3;
357  int x3 = zb - x4*E3;
358  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
359  int x1 = 2*x1h + x1odd;
360 
361  int idx = ((x4-2)*X3*X2*X1 + (x3-2)*X2*X1+(x2-2)*X1+(x1-2))/2;
362  return idx;
363 }
364 
365 template<class Real>
366 int LoadStore<Real>::half_idx_conversion_normal2ex(int half_lattice_index, const int* dim, int oddBit) const
367 {
368  int X1=dim[0];
369  int X2=dim[1];
370  int X3=dim[2];
371  //int X4=dim[3];
372  int X1h=X1/2;
373 
374  int E1=dim[0]+4;
375  int E2=dim[1]+4;
376  int E3=dim[2]+4;
377  //int E4=dim[3]+4;
378  //int E1h=E1/2;
379 
380  int sid = half_lattice_index;
381 
382  int za = sid/X1h;
383  int x1h = sid - za*X1h;
384  int zb = za/X2;
385  int x2 = za - zb*X2;
386  int x4 = zb/X3;
387  int x3 = zb - x4*X3;
388  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
389  int x1 = 2*x1h + x1odd;
390 
391  int idx = ((x4+2)*E3*E2*E1 + (x3+2)*E2*E1+(x2+2)*E1+(x1+2))/2;
392 
393  return idx;
394 }
395 
396 template<class Real>
397 Real LoadStore<Real>::getData(const Real* const field, int idx, int dir, int oddBit, int offset, int hfv) const
398 {
400  return field[(4*hfv*oddBit +4*idx + dir)*18+offset];
401  }else{ //QDP format
402  return ((Real**)field)[dir][(hfv*oddBit+idx)*18 +offset];
403  }
404 }
405 template<class Real>
406 void LoadStore<Real>::addData(Real* const field, int idx, int dir, int oddBit, int offset, Real v, int hfv) const
407 {
409  field[(4*hfv*oddBit +4*idx + dir)*18+offset] += v;
410  }else{ //QDP format
411  ((Real**)field)[dir][(hfv*oddBit+idx)*18 +offset] += v;
412  }
413 }
414 
415 
416 
417 template<class Real>
418  void LoadStore<Real>::loadMatrixFromField(const Real* const field,
419  int oddBit,
420  int half_lattice_index,
421  Matrix<3, std::complex<Real> >* const mat
422  ) const
423 {
424 #ifdef MULTI_GPU
425  int hfv = Vh_ex;
426 #else
427  int hfv = Vh;
428 #endif
429 
430  int offset = 0;
431  for(int i=0; i<3; ++i){
432  for(int j=0; j<3; ++j){
433  (*mat)(i,j) = (*(field + (oddBit*hfv + half_lattice_index)*18 + offset++));
434  (*mat)(i,j) += std::complex<Real>(0, *(field + (oddBit*hfv + half_lattice_index)*18 + offset++));
435  }
436  }
437  return;
438  }
439 
440  template<class Real>
441  void LoadStore<Real>::loadMatrixFromField(const Real* const field,
442  int oddBit,
443  int dir,
444  int half_lattice_index,
445  Matrix<3, std::complex<Real> >* const mat
446  ) const
447  {
448 #ifdef MULTI_GPU
449  int hfv = Vh_ex;
450 #else
451  int hfv = Vh;
452 #endif
453 
454  //const Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18;
455  int offset = 0;
456  for(int i=0; i<3; ++i){
457  for(int j=0; j<3; ++j){
458  (*mat)(i,j) = (getData(field, half_lattice_index, dir, oddBit, offset++, hfv));
459  (*mat)(i,j) += std::complex<Real>(0, getData(field, half_lattice_index, dir, oddBit, offset++, hfv));
460  }
461  }
462  return;
463  }
464 
465  template<class Real>
466  void LoadStore<Real>::storeMatrixToField(const Matrix<3, std::complex<Real> >& mat,
467  int oddBit,
468  int half_lattice_index,
469  Real* const field) const
470  {
471 #ifdef MULTI_GPU
472  int hfv = Vh_ex;
473 #else
474  int hfv = Vh;
475 #endif
476 
477  int offset = 0;
478  for(int i=0; i<3; ++i){
479  for(int j=0; j<3; ++j){
480  *(field + (oddBit*hfv + half_lattice_index)*18 + offset++) = (mat)(i,j).real();
481  *(field + (oddBit*hfv + half_lattice_index)*18 + offset++) = (mat)(i,j).imag();
482  }
483  }
484  return;
485  }
486 
487  template<class Real>
488  void LoadStore<Real>::addMatrixToField(const Matrix<3, std::complex<Real> >& mat,
489  int oddBit,
490  int half_lattice_index,
491  Real coeff,
492  Real* const field) const
493  {
494 #ifdef MULTI_GPU
495  int hfv = Vh_ex;
496 #else
497  int hfv = Vh;
498 #endif
499  Real* const local_field = field + (oddBit*hfv + half_lattice_index)*18;
500 
501 
502  int offset = 0;
503  for(int i=0; i<3; ++i){
504  for(int j=0; j<3; ++j){
505  local_field[offset++] += coeff*mat(i,j).real();
506  local_field[offset++] += coeff*mat(i,j).imag();
507  }
508  }
509  return;
510  }
511 
512 
513  template<class Real>
514  void LoadStore<Real>::addMatrixToField(const Matrix<3, std::complex<Real> >& mat,
515  int oddBit,
516  int dir,
517  int half_lattice_index,
518  Real coeff,
519  Real* const field) const
520  {
521 
522 #ifdef MULTI_GPU
523  int hfv = Vh_ex;
524 #else
525  int hfv = Vh;
526 #endif
527 
528  //Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18;
529  int offset = 0;
530  for(int i=0; i<3; ++i){
531  for(int j=0; j<3; ++j){
532  //local_field[offset++] += coeff*mat(i,j).real();
533  addData(field, half_lattice_index, dir, oddBit, offset++, coeff*mat(i,j).real(), hfv);
534 
535  //local_field[offset++] += coeff*mat(i,j).imag();
536  addData(field, half_lattice_index, dir, oddBit, offset++, coeff*mat(i,j).imag(), hfv);
537  }
538  }
539  return;
540  }
541 
542 
543  template<class Real>
544  void LoadStore<Real>::storeMatrixToMomentumField(const Matrix<3, std::complex<Real> >& mat,
545  int oddBit,
546  int dir,
547  int half_lattice_index,
548  Real coeff,
549  Real* const field) const
550  {
551  Real* const mom_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*10;
552  mom_field[0] = (mat(0,1).real() - mat(1,0).real())*0.5*coeff;
553  mom_field[1] = (mat(0,1).imag() + mat(1,0).imag())*0.5*coeff;
554 
555  mom_field[2] = (mat(0,2).real() - mat(2,0).real())*0.5*coeff;
556  mom_field[3] = (mat(0,2).imag() + mat(2,0).imag())*0.5*coeff;
557 
558  mom_field[4] = (mat(1,2).real() - mat(2,1).real())*0.5*coeff;
559  mom_field[5] = (mat(1,2).imag() + mat(2,1).imag())*0.5*coeff;
560 
561  const Real temp = (mat(0,0).imag() + mat(1,1).imag() + mat(2,2).imag())*0.3333333333333333333;
562  mom_field[6] = (mat(0,0).imag() - temp)*coeff;
563  mom_field[7] = (mat(1,1).imag() - temp)*coeff;
564  mom_field[8] = (mat(2,2).imag() - temp)*coeff;
565  mom_field[9] = 0.0;
566  return;
567  }
568 
569 
570 
571  template<int oddBit>
572  struct Locator
573  {
574  private:
575  int local_dim[4];
576  int volume;
577  int half_index;
578  int full_index;
579  int full_coord[4];
580  void getCoordsFromHalfIndex(int half_index, int coord[4]);
581  void getCoordsFromFullIndex(int full_index, int coord[4]);
582  void cache(int half_lattice_index); // caches the half-lattice index, full-lattice index,
583  // and full-lattice coordinates
584 
585 
586  public:
587  Locator(const int dim[4]);
588  int getFullFromHalfIndex(int half_lattice_index);
589  int getNeighborFromFullIndex(int full_lattice_index, int dir, int* err=NULL);
590  };
591 
592  template<int oddBit>
593  Locator<oddBit>::Locator(const int dim[4]) : half_index(-1), full_index(-1){
594  volume = 1;
595  for(int dir=0; dir<4; ++dir){
596  local_dim[dir] = dim[dir];
597  volume *= local_dim[dir];
598  }
599  }
600 
601  // Store the half_lattice index, works out and stores the full lattice index
602  // and the coordinates.
603  template<int oddBit>
604  void Locator<oddBit>::getCoordsFromHalfIndex(int half_lattice_index, int coord[4])
605  {
606 #ifdef MULTI_GPU
607  int E1 = local_dim[0]+4;
608  int E2 = local_dim[1]+4;
609  int E3 = local_dim[2]+4;
610  //int E4 = local_dim[3]+4;
611  int E1h = E1/2;
612 
613  int z1 = half_lattice_index/E1h;
614  int x1h = half_lattice_index - z1*E1h;
615  int z2 = z1/E2;
616  coord[1] = z1 - z2*E2;
617  coord[3] = z2/E3;
618  coord[2] = z2 - coord[3]*E3;
619  int x1odd = (coord[1] + coord[2] + coord[3] + oddBit) & 1;
620  coord[0] = 2*x1h + x1odd;
621 #else
622  int half_dim_0 = local_dim[0]/2;
623  int z1 = half_lattice_index/half_dim_0;
624  int x1h = half_lattice_index - z1*half_dim_0;
625  int z2 = z1/local_dim[1];
626  coord[1] = z1 - z2*local_dim[1];
627  coord[3] = z2/local_dim[2];
628  coord[2] = z2 - coord[3]*local_dim[2];
629  int x1odd = (coord[1] + coord[2] + coord[3] + oddBit) & 1;
630  coord[0] = 2*x1h + x1odd;
631 #endif
632 
633  }
634 
635  template<int oddBit>
636  void Locator<oddBit>::getCoordsFromFullIndex(int full_lattice_index, int coord[4])
637  {
638 #ifdef MULTI_GPU
639  int D1=local_dim[0]+4;
640  int D2=local_dim[1]+4;
641  int D3=local_dim[2]+4;
642  //int D4=local_dim[3]+4;
643  //int D1h=D1/2;
644 #else
645  int D1=local_dim[0];
646  int D2=local_dim[1];
647  int D3=local_dim[2];
648  //int D4=local_dim[3];
649  //int D1h=D1/2;
650 #endif
651 
652  int z1 = full_lattice_index/D1;
653  coord[0] = full_lattice_index - z1*D1;
654  int z2 = z1/D2;
655  coord[1] = z1 - z2*D2;
656  coord[3] = z2/D3;
657  coord[2] = z2 - coord[3]*D3;
658  }
659 
660 
661 
662 
663 
664  template<int oddBit>
665  void Locator<oddBit>::cache(int half_lattice_index)
666  {
667  half_index = half_lattice_index;
668  getCoordsFromHalfIndex(half_lattice_index, full_coord);
669  int x1odd = (full_coord[1] + full_coord[2] + full_coord[3] + oddBit) & 1;
670  full_index = 2*half_lattice_index + x1odd;
671  return;
672  }
673 
674  template<int oddBit>
675  int Locator<oddBit>::getFullFromHalfIndex(int half_lattice_index)
676  {
677  if(half_index != half_lattice_index) cache(half_lattice_index);
678  return full_index;
679  }
680 
681  // From full index return the neighbouring full index
682  template<int oddBit>
683  int Locator<oddBit>::getNeighborFromFullIndex(int full_lattice_index, int dir, int* err)
684  {
685  if(err) *err = 0;
686 
687  int coord[4];
688  int neighbor_index;
689  getCoordsFromFullIndex(full_lattice_index, coord);
690 #ifdef MULTI_GPU
691  int E1 = local_dim[0] + 4;
692  int E2 = local_dim[1] + 4;
693  int E3 = local_dim[2] + 4;
694  int E4 = local_dim[3] + 4;
695  switch(dir){
696  case 0: //+X
697  neighbor_index = full_lattice_index + 1;
698  if(err && (coord[0] == E1-1) ) *err = 1;
699  break;
700  case 1: //+Y
701  neighbor_index = full_lattice_index + E1;
702  if(err && (coord[1] == E2-1) ) *err = 1;
703  break;
704  case 2: //+Z
705  neighbor_index = full_lattice_index + E2*E1;
706  if(err && (coord[2] == E3-1) ) *err = 1;
707  break;
708  case 3: //+T
709  neighbor_index = full_lattice_index + E3*E2*E1;
710  if(err && (coord[3] == E4-1) ) *err = 1;
711  break;
712  case 7: //-X
713  neighbor_index = full_lattice_index - 1;
714  if(err && (coord[0] == 0) ) *err = 1;
715  break;
716  case 6: //-Y
717  neighbor_index = full_lattice_index - E1;
718  if(err && (coord[1] == 0) ) *err = 1;
719  break;
720  case 5: //-Z
721  neighbor_index = full_lattice_index - E2*E1;
722  if(err && (coord[2] == 0) ) *err = 1;
723  break;
724  case 4: //-T
725  neighbor_index = full_lattice_index - E3*E2*E1;
726  if(err && (coord[3] == 0) ) *err = 1;
727  break;
728  default:
729  errorQuda("Neighbor index could not be determined\n");
730  exit(1);
731  break;
732  } // switch(dir)
733 
734 #else
735  switch(dir){
736  case 0:
737  neighbor_index = (coord[0] == local_dim[0]-1) ? full_lattice_index + 1 - local_dim[0] : full_lattice_index + 1;
738  break;
739  case 1:
740  neighbor_index = (coord[1] == local_dim[1]-1) ? full_lattice_index + local_dim[0]*(1 - local_dim[1]) : full_lattice_index + local_dim[0];
741  break;
742  case 2:
743  neighbor_index = (coord[2] == local_dim[2]-1) ? full_lattice_index + local_dim[0]*local_dim[1]*(1 - local_dim[2]) : full_lattice_index + local_dim[0]*local_dim[1];
744  break;
745  case 3:
746  neighbor_index = (coord[3] == local_dim[3]-1) ? full_lattice_index + local_dim[0]*local_dim[1]*local_dim[2]*(1-local_dim[3]) : full_lattice_index + local_dim[0]*local_dim[1]*local_dim[2];
747  break;
748  case 7:
749  neighbor_index = (coord[0] == 0) ? full_lattice_index - 1 + local_dim[0] : full_lattice_index - 1;
750  break;
751  case 6:
752  neighbor_index = (coord[1] == 0) ? full_lattice_index - local_dim[0]*(1 - local_dim[1]) : full_lattice_index - local_dim[0];
753  break;
754  case 5:
755  neighbor_index = (coord[2] == 0) ? full_lattice_index - local_dim[0]*local_dim[1]*(1 - local_dim[2]) : full_lattice_index - local_dim[0]*local_dim[1];
756  break;
757  case 4:
758  neighbor_index = (coord[3] == 0) ? full_lattice_index - local_dim[0]*local_dim[1]*local_dim[2]*(1 - local_dim[3]) : full_lattice_index - local_dim[0]*local_dim[1]*local_dim[2];
759  break;
760  default:
761  errorQuda("Neighbor index could not be determined\n");
762  exit(1);
763  break;
764  } // switch(dir)
765  if(err) *err = 0;
766 #endif
767  return neighbor_index;
768  }
769 
770 // Can't typedef a template
771 template<class Real>
773 {
775 };
776 
777  template<class Real, int oddBit>
778  void computeOneLinkSite(const int dim[4],
779  int half_lattice_index,
780  const Real* const oprod,
781  int sig, Real coeff,
782  const LoadStore<Real>& ls,
783  Real* const output)
784  {
785  if( GOES_FORWARDS(sig) ){
786  typename ColorMatrix<Real>::Type colorMatW;
787 #ifdef MULTI_GPU
788  int idx = ls.half_idx_conversion_normal2ex(half_lattice_index, dim, oddBit);
789 #else
790  int idx = half_lattice_index;
791 #endif
792  ls.loadMatrixFromField(oprod, oddBit, sig, idx, &colorMatW);
793  ls.addMatrixToField(colorMatW, oddBit, sig, idx, coeff, output);
794  }
795  return;
796  }
797 
798  template<class Real>
799  void computeOneLinkField(const int dim[4],
800  const Real* const oprod,
801  int sig, Real coeff,
802  Real* const output)
803  {
804  int volume = 1;
805  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
806  const int half_volume = volume/2;
807  LoadStore<Real> ls(volume);
808  for(int site=0; site<half_volume; ++site){
809  computeOneLinkSite<Real,0>(dim, site,
810  oprod,
811  sig, coeff, ls,
812  output);
813 
814  }
815  // Loop over odd lattice sites
816  for(int site=0; site<half_volume; ++site){
817  computeOneLinkSite<Real,1>(dim, site,
818  oprod,
819  sig, coeff, ls,
820  output);
821  }
822  return;
823  }
824 
825 
826 
827 
828 
829 
830 
831 
832 
833  // middleLinkKernel compiles for now, but lots of debugging to be done
834  template<class Real, int oddBit>
835  void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.
836  const int dim[4],
837  const Real* const oprod,
838  const Real* const Qprev,
839  const Real* const link,
840  int sig, int mu,
841  Real coeff,
842  const LoadStore<Real>& ls, // pass a function object to read from and write to matrix fields
843  Real* const Pmu,
844  Real* const P3,
845  Real* const Qmu,
846  Real* const newOprod
847  )
848  {
849  const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;
850  const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false;
851 
852 
853  Locator<oddBit> locator(dim);
854  int point_b, point_c, point_d;
856  int X = locator.getFullFromHalfIndex(half_lattice_index);
857 
858  int err;
859  int new_mem_idx = locator.getNeighborFromFullIndex(X,OPP_DIR(mu), &err); RETURN_IF_ERR;
860  point_d = new_mem_idx >> 1;
861  // getNeighborFromFullIndex will work on any site on the lattice, odd or even
862  new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx,sig, &err); RETURN_IF_ERR;
863  point_c = new_mem_idx >> 1;
864 
865  new_mem_idx = locator.getNeighborFromFullIndex(X,sig); RETURN_IF_ERR;
866  point_b = new_mem_idx >> 1;
867 
868  ad_link_nbr_idx = (mu_positive) ? point_d : half_lattice_index;
869  bc_link_nbr_idx = (mu_positive) ? point_c : point_b;
870  ab_link_nbr_idx = (sig_positive) ? half_lattice_index : point_b;
871 
872  typename ColorMatrix<Real>::Type ab_link, bc_link, ad_link;
873  typename ColorMatrix<Real>::Type colorMatW, colorMatX, colorMatY, colorMatZ;
874 
875 
876 
877 
878 
879  if(sig_positive){
880  ls.loadMatrixFromField(link, oddBit, sig, ab_link_nbr_idx, &ab_link);
881  }else{
882  ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(sig), ab_link_nbr_idx, &ab_link);
883  }
884 
885  if(mu_positive){
886  ls.loadMatrixFromField(link, oddBit, mu, bc_link_nbr_idx, &bc_link);
887  }else{
888  ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(mu), bc_link_nbr_idx, &bc_link);
889  }
890 
891  if(Qprev == NULL){
892  if(sig_positive){
893  ls.loadMatrixFromField(oprod, 1-oddBit, sig, point_d, &colorMatY);
894  }else{
895  ls.loadMatrixFromField(oprod, oddBit, OPP_DIR(sig), point_c, &colorMatY);
896  colorMatY = conj(colorMatY);
897  }
898  }else{ // Qprev != NULL
899  ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY);
900  }
901 
902  colorMatW = (!mu_positive) ? bc_link*colorMatY : conj(bc_link)*colorMatY;
903  if(Pmu) ls.storeMatrixToField(colorMatW, 1-oddBit, point_b, Pmu);
904 
905  colorMatY = (sig_positive) ? ab_link*colorMatW : conj(ab_link)*colorMatW;
906  ls.storeMatrixToField(colorMatY, oddBit, half_lattice_index, P3);
907 
908  if(mu_positive){
909  ls.loadMatrixFromField(link, 1-oddBit, mu, ad_link_nbr_idx, &ad_link);
910  }else{
911  ls.loadMatrixFromField(link, oddBit, OPP_DIR(mu), ad_link_nbr_idx, &ad_link);
912  ad_link = conj(ad_link);
913  }
914 
915 
916  if(Qprev == NULL){
917  if(sig_positive) colorMatY = colorMatW*ad_link;
918  if(Qmu) ls.storeMatrixToField(ad_link, oddBit, half_lattice_index, Qmu);
919  }else{ // Qprev != NULL
920  if(Qmu || sig_positive){
921  ls.loadMatrixFromField(Qprev, 1-oddBit, point_d, &colorMatY);
922  colorMatX= colorMatY*ad_link;
923  }
924  if(Qmu) ls.storeMatrixToField(colorMatX, oddBit, half_lattice_index, Qmu);
925  if(sig_positive) colorMatY = colorMatW*colorMatX;
926  }
927 
928  if(sig_positive) ls.addMatrixToField(colorMatY, oddBit, sig, half_lattice_index, coeff, newOprod);
929 
930  return;
931  } // computeMiddleLinkSite
932 
933 
934  template<class Real>
935  void computeMiddleLinkField(const int dim[4],
936  const Real* const oprod,
937  const Real* const Qprev,
938  const Real* const link,
939  int sig, int mu,
940  Real coeff,
941  Real* const Pmu,
942  Real* const P3,
943  Real* const Qmu,
944  Real* const newOprod
945  )
946  {
947 
948  int volume = 1;
949  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
950 #ifdef MULTI_GPU
951  const int loop_count = Vh_ex;
952 #else
953  const int loop_count = volume/2;
954 #endif
955  // loop over the lattice volume
956  // To keep the code as close to the GPU code as possible, we'll
957  // loop over the even sites first and then the odd sites
958  LoadStore<Real> ls(volume);
959  for(int site=0; site<loop_count; ++site){
960  computeMiddleLinkSite<Real, 0>(site, dim,
961  oprod, Qprev, link,
962  sig, mu, coeff,
963  ls,
964  Pmu, P3, Qmu, newOprod);
965  }
966  // Loop over odd lattice sites
967  for(int site=0; site<loop_count; ++site){
968  computeMiddleLinkSite<Real,1>(site, dim,
969  oprod, Qprev, link,
970  sig, mu, coeff,
971  ls,
972  Pmu, P3, Qmu, newOprod);
973  }
974  return;
975  }
976 
977 
978 
979 
980  template<class Real, int oddBit>
981  void computeSideLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.
982  const int dim[4],
983  const Real* const P3,
984  const Real* const Qprod, // why?
985  const Real* const link,
986  int sig, int mu,
987  Real coeff, Real accumu_coeff,
988  const LoadStore<Real>& ls, // pass a function object to read from and write to matrix fields
989  Real* const shortP,
990  Real* const newOprod
991  )
992  {
993 
994  const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;
995  const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false;
996 
997  Locator<oddBit> locator(dim);
998  int point_d;
999  int ad_link_nbr_idx;
1000  int X = locator.getFullFromHalfIndex(half_lattice_index);
1001 
1002  int err;
1003  int new_mem_idx = locator.getNeighborFromFullIndex(X,OPP_DIR(mu), &err); RETURN_IF_ERR;
1004  point_d = new_mem_idx >> 1;
1005  ad_link_nbr_idx = (mu_positive) ? point_d : half_lattice_index;
1006 
1007 
1008  typename ColorMatrix<Real>::Type ad_link;
1009  typename ColorMatrix<Real>::Type colorMatW, colorMatX, colorMatY;
1010 
1011  ls.loadMatrixFromField(P3, oddBit, half_lattice_index, &colorMatY);
1012 
1013  if(shortP){
1014  if(mu_positive){
1015  ad_link_nbr_idx = point_d;
1016  ls.loadMatrixFromField(link, 1-oddBit, mu, ad_link_nbr_idx, &ad_link);
1017  }else{
1018  ad_link_nbr_idx = half_lattice_index;
1019  ls.loadMatrixFromField(link, oddBit, OPP_DIR(mu), ad_link_nbr_idx, &ad_link);
1020  }
1021  colorMatW = (mu_positive) ? ad_link*colorMatY : conj(ad_link)*colorMatY;
1022  ls.addMatrixToField(colorMatW, 1-oddBit, point_d, accumu_coeff, shortP);
1023  } // if(shortP)
1024 
1025 
1026  Real mycoeff = ( (sig_positive && oddBit) || (!sig_positive && !oddBit) ) ? coeff : -coeff;
1027 
1028  if(Qprod){
1029  ls.loadMatrixFromField(Qprod, 1-oddBit, point_d, &colorMatX);
1030  if(mu_positive){
1031  colorMatW = colorMatY*colorMatX;
1032  if(!oddBit){ mycoeff = -mycoeff; }
1033  ls.addMatrixToField(colorMatW, 1-oddBit, mu, point_d, mycoeff, newOprod);
1034  }else{
1035  colorMatW = conj(colorMatX)*conj(colorMatY);
1036  if(oddBit){ mycoeff = -mycoeff; }
1037  ls.addMatrixToField(colorMatW, oddBit, OPP_DIR(mu), half_lattice_index, mycoeff, newOprod);
1038  }
1039  }
1040 
1041  if(!Qprod){
1042  if(mu_positive){
1043  if(!oddBit){ mycoeff = -mycoeff; }
1044  ls.addMatrixToField(colorMatY, 1-oddBit, mu, point_d, mycoeff, newOprod);
1045  }else{
1046  if(oddBit){ mycoeff = -mycoeff; }
1047  colorMatW = conj(colorMatY);
1048  ls.addMatrixToField(colorMatW, oddBit, OPP_DIR(mu), half_lattice_index, mycoeff, newOprod);
1049  }
1050  } // if !(Qprod)
1051 
1052  return;
1053  } // computeSideLinkSite
1054 
1055 
1056  // Maybe change to computeSideLinkField
1057  template<class Real>
1058  void computeSideLinkField(const int dim[4],
1059  const Real* const P3,
1060  const Real* const Qprod, // why?
1061  const Real* const link,
1062  int sig, int mu,
1063  Real coeff, Real accumu_coeff,
1064  Real* const shortP,
1065  Real* const newOprod
1066  )
1067  {
1068  // Need some way of setting half_volume
1069  int volume = 1;
1070  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
1071 #ifdef MULTI_GPU
1072  const int loop_count = Vh_ex;
1073 #else
1074  const int loop_count = volume/2;
1075 #endif
1076  LoadStore<Real> ls(volume);
1077 
1078  for(int site=0; site<loop_count; ++site){
1079  computeSideLinkSite<Real,0>(site, dim,
1080  P3, Qprod, link,
1081  sig, mu,
1082  coeff, accumu_coeff,
1083  ls, shortP, newOprod);
1084  }
1085 
1086  for(int site=0; site<loop_count; ++site){
1087  computeSideLinkSite<Real,1>(site, dim,
1088  P3, Qprod, link,
1089  sig, mu,
1090  coeff, accumu_coeff,
1091  ls, shortP, newOprod);
1092  }
1093 
1094  return;
1095  }
1096 
1097 
1098 
1099 
1100 
1101  template<class Real, int oddBit>
1102  void computeAllLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.
1103  const int dim[4],
1104  const Real* const oprod,
1105  const Real* const Qprev,
1106  const Real* const link,
1107  int sig, int mu,
1108  Real coeff, Real accumu_coeff,
1109  const LoadStore<Real>& ls, // pass a function object to read from and write to matrix fields
1110  Real* const shortP,
1111  Real* const newOprod)
1112  {
1113 
1114  const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;
1115  const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false;
1116 
1117  typename ColorMatrix<Real>::Type ab_link, bc_link, ad_link;
1118  typename ColorMatrix<Real>::Type colorMatW, colorMatX, colorMatY, colorMatZ;
1119 
1120 
1122 
1123  Locator<oddBit> locator(dim);
1124  int X = locator.getFullFromHalfIndex(half_lattice_index);
1125 
1126  int err;
1127  int new_mem_idx = locator.getNeighborFromFullIndex(X,OPP_DIR(mu), &err); RETURN_IF_ERR;
1128  point_d = new_mem_idx >> 1;
1129 
1130  new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx,sig, &err); RETURN_IF_ERR;
1131  point_c = new_mem_idx >> 1;
1132 
1133  new_mem_idx = locator.getNeighborFromFullIndex(X,sig, &err); RETURN_IF_ERR;
1134  point_b = new_mem_idx >> 1;
1135  ab_link_nbr_idx = (sig_positive) ? half_lattice_index : point_b;
1136 
1137 
1138 
1139  Real mycoeff = ( (sig_positive && oddBit) || (!sig_positive && !oddBit) ) ? coeff : -coeff;
1140 
1141  if(mu_positive){
1142  ls.loadMatrixFromField(Qprev, 1-oddBit, point_d, &colorMatX);
1143  ls.loadMatrixFromField(link, 1-oddBit, mu, point_d, &ad_link);
1144  // compute point_c
1145  ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY);
1146  ls.loadMatrixFromField(link, oddBit, mu, point_c, &bc_link);
1147  colorMatZ = conj(bc_link)*colorMatY; // okay
1148 
1149  if(sig_positive)
1150  {
1151  colorMatY = colorMatX*ad_link;
1152  colorMatW = colorMatZ*colorMatY;
1153  ls.addMatrixToField(colorMatW, oddBit, sig, half_lattice_index, Sign<oddBit>::result*mycoeff, newOprod);
1154  }
1155 
1156  if(sig_positive){
1157  ls.loadMatrixFromField(link, oddBit, sig, ab_link_nbr_idx, &ab_link);
1158  }else{
1159  ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(sig), ab_link_nbr_idx, &ab_link);
1160  }
1161  colorMatY = (sig_positive) ? ab_link*colorMatZ : conj(ab_link)*colorMatZ; // okay
1162  colorMatW = colorMatY*colorMatX;
1163  ls.addMatrixToField(colorMatW, 1-oddBit, mu, point_d, -Sign<oddBit>::result*mycoeff, newOprod);
1164  colorMatW = ad_link*colorMatY;
1165  ls.addMatrixToField(colorMatW, 1-oddBit, point_d, accumu_coeff, shortP); // Warning! Need to check this!
1166  }else{ //negative mu
1167  mu = OPP_DIR(mu);
1168  ls.loadMatrixFromField(Qprev, 1-oddBit, point_d, &colorMatX);
1169  ls.loadMatrixFromField(link, oddBit, mu, half_lattice_index, &ad_link);
1170  ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY);
1171  ls.loadMatrixFromField(link, 1-oddBit, mu, point_b, &bc_link);
1172 
1173  if(sig_positive) colorMatW = colorMatX*conj(ad_link);
1174  colorMatZ = bc_link*colorMatY;
1175  if(sig_positive){
1176  colorMatY = colorMatZ*colorMatW;
1177  ls.addMatrixToField(colorMatY, oddBit, sig, half_lattice_index, Sign<oddBit>::result*mycoeff, newOprod);
1178  }
1179 
1180  if(sig_positive){
1181  ls.loadMatrixFromField(link, oddBit, sig, ab_link_nbr_idx, &ab_link);
1182  }else{
1183  ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(sig), ab_link_nbr_idx, &ab_link);
1184  }
1185 
1186  colorMatY = (sig_positive) ? ab_link*colorMatZ : conj(ab_link)*colorMatZ; // 611
1187  colorMatW = conj(colorMatX)*conj(colorMatY);
1188 
1189  ls.addMatrixToField(colorMatW, oddBit, mu, half_lattice_index, Sign<oddBit>::result*mycoeff, newOprod);
1190  colorMatW = conj(ad_link)*colorMatY;
1191  ls.addMatrixToField(colorMatW, 1-oddBit, point_d, accumu_coeff, shortP);
1192  } // end mu
1193  return;
1194  } // allLinkKernel
1195 
1196 
1197 
1198  template<class Real>
1199  void computeAllLinkField(const int dim[4],
1200  const Real* const oprod,
1201  const Real* const Qprev,
1202  const Real* const link,
1203  int sig, int mu,
1204  Real coeff, Real accumu_coeff,
1205  Real* const shortP,
1206  Real* const newOprod)
1207  {
1208  int volume = 1;
1209  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
1210 #ifdef MULTI_GPU
1211  const int loop_count = Vh_ex;
1212 #else
1213  const int loop_count = volume/2;
1214 #endif
1215 
1216  LoadStore<Real> ls(volume);
1217  for(int site=0; site<loop_count; ++site){
1218 
1219  computeAllLinkSite<Real,0>(site, dim,
1220  oprod, Qprev, link,
1221  sig, mu,
1223  ls,
1224  shortP, newOprod);
1225  }
1226 
1227  for(int site=0; site<loop_count; ++site){
1228  computeAllLinkSite<Real, 1>(site, dim,
1229  oprod, Qprev, link,
1230  sig, mu,
1232  ls,
1233  shortP, newOprod);
1234  }
1235 
1236  return;
1237  }
1238 
1239 #define Pmu tempmat[0]
1240 #define P3 tempmat[1]
1241 #define P5 tempmat[2]
1242 #define Pnumu tempmat[3]
1243 #define Qmu tempmat[4]
1244 #define Qnumu tempmat[5]
1245 
1246  template<class Real>
1248  {
1249  Real one;
1250  Real three;
1251  Real five;
1252  Real seven;
1253  Real naik;
1254  Real lepage;
1255  };
1256 
1257 
1258  template<class Real>
1259  void doHisqStaplesForceCPU(const int dim[4],
1260  PathCoefficients<double> staple_coeff,
1261  Real* oprod,
1262  Real* link,
1263  Real** tempmat,
1264  Real* newOprod)
1265  {
1266  Real OneLink, ThreeSt, FiveSt, SevenSt, Lepage, coeff;
1267 
1268  OneLink = staple_coeff.one;
1269  ThreeSt = staple_coeff.three;
1270  FiveSt = staple_coeff.five;
1271  SevenSt = staple_coeff.seven;
1272  Lepage = staple_coeff.lepage;
1273 
1274  for(int sig=0; sig<4; ++sig){
1275  computeOneLinkField(dim,
1276  oprod,
1277  sig, OneLink,
1278  newOprod);
1279  }
1280 
1281 
1282  // sig labels the net displacement of the staple
1283  for(int sig=0; sig<8; ++sig){
1284  for(int mu=0; mu<8; ++mu){
1285  if( mu == sig || mu == OPP_DIR(sig) ) continue;
1286 
1287  computeMiddleLinkField<Real>(dim,
1288  oprod, NULL, link,
1289  sig, mu, -ThreeSt,
1290  Pmu, P3, Qmu,
1291  newOprod);
1292 
1293  for(int nu=0; nu<8; ++nu){
1294  if( nu==mu || nu==OPP_DIR(mu)
1295  || nu==sig || nu==OPP_DIR(sig) ) continue;
1296 
1297  computeMiddleLinkField<Real>(dim,
1298  Pmu, Qmu, link,
1299  sig, nu, staple_coeff.five,
1300  Pnumu, P5, Qnumu,
1301  newOprod);
1302 
1303 
1304  for(int rho=0; rho<8; ++rho){
1305  if( rho == sig || rho == OPP_DIR(sig)
1306  || rho == mu || rho == OPP_DIR(mu)
1307  || rho == nu || rho == OPP_DIR(nu) )
1308  {
1309  continue;
1310  }
1311 
1312  if(FiveSt != 0)coeff = SevenSt/FiveSt; else coeff = 0;
1313  computeAllLinkField<Real>(dim,
1314  Pnumu, Qnumu, link,
1315  sig, rho, staple_coeff.seven, coeff,
1316  P5, newOprod);
1317 
1318  } // rho
1319 
1320  // 5-staple: side link
1321  if(ThreeSt != 0)coeff = FiveSt/ThreeSt; else coeff = 0;
1322  computeSideLinkField<Real>(dim,
1323  P5, Qmu, link,
1324  sig, nu, -FiveSt, coeff,
1325  P3, newOprod);
1326 
1327 
1328  } // nu
1329 
1330 
1331  // lepage
1332  if(staple_coeff.lepage != 0.){
1333  computeMiddleLinkField<Real>(dim,
1334  Pmu, Qmu, link,
1335  sig, mu, Lepage,
1336  NULL, P5, NULL,
1337  newOprod);
1338 
1339  if(ThreeSt != 0)coeff = Lepage/ThreeSt; else coeff = 0;
1340  computeSideLinkField<Real>(dim,
1341  P5, Qmu, link,
1342  sig, mu, -Lepage, coeff,
1343  P3, newOprod);
1344  } // lepage != 0
1345 
1346 
1347  computeSideLinkField<Real>(dim,
1348  P3, NULL, link,
1349  sig, mu, ThreeSt, 0.,
1350  NULL, newOprod);
1351  } // mu
1352  } // sig
1353 
1354  // Need also to compute the one-link contribution
1355  return;
1356  }
1357 
1358 #undef Pmu
1359 #undef P3
1360 #undef P5
1361 #undef Pnumu
1362 #undef Qmu
1363 #undef Qnumu
1364 
1365 
1366  void hisqStaplesForceCPU(const double* path_coeff,
1367  const QudaGaugeParam &param,
1368  cpuGaugeField &oprod,
1369  cpuGaugeField &link,
1370  cpuGaugeField* newOprod)
1371  {
1372  int volume = 1;
1373  for(int dir=0; dir<4; ++dir) volume *= param.X[dir];
1374 
1375 #ifdef MULTI_GPU
1376  int len = Vh_ex*2;
1377 #else
1378  int len = volume;
1379 #endif
1380  // allocate memory for temporary fields
1381  void* tempmat[6];
1382  if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
1383  for(int i=0; i<6; ++i) tempmat[i] = malloc(len*18*sizeof(double));
1384  }else{
1385  for(int i=0; i<6; ++i) tempmat[i] = malloc(len*18*sizeof(float));
1386  }
1387 
1388  PathCoefficients<double> act_path_coeff;
1389  act_path_coeff.one = path_coeff[0];
1390  act_path_coeff.naik = path_coeff[1];
1391  act_path_coeff.three = path_coeff[2];
1392  act_path_coeff.five = path_coeff[3];
1393  act_path_coeff.seven = path_coeff[4];
1394  act_path_coeff.lepage = path_coeff[5];
1395 
1396  if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
1397  doHisqStaplesForceCPU<double>(param.X,
1398  act_path_coeff,
1399  (double*)oprod.Gauge_p(),
1400  (double*)link.Gauge_p(),
1401  (double**)tempmat,
1402  (double*)newOprod->Gauge_p()
1403  );
1404 
1405  }else if(param.cpu_prec == QUDA_SINGLE_PRECISION){
1406  doHisqStaplesForceCPU<float>(param.X,
1407  act_path_coeff,
1408  (float*)oprod.Gauge_p(),
1409  (float*)link.Gauge_p(),
1410  (float**)tempmat,
1411  (float*)newOprod->Gauge_p()
1412  );
1413  }else{
1414  errorQuda("Unsupported precision");
1415  }
1416 
1417  for(int i=0; i<6; ++i){
1418  free(tempmat[i]);
1419  }
1420  return;
1421  }
1422 
1423 
1424 
1425  template<class Real, int oddBit>
1426  void computeLongLinkSite(int half_lattice_index,
1427  const int dim[4],
1428  const Real* const oprod,
1429  const Real* const link,
1430  int sig, Real coeff,
1431  const LoadStore<Real>& ls,
1432  Real* const output)
1433  {
1434  if( GOES_FORWARDS(sig) ){
1435 
1436  Locator<oddBit> locator(dim);
1437 
1438  typename ColorMatrix<Real>::Type ab_link, bc_link, de_link, ef_link;
1439  typename ColorMatrix<Real>::Type colorMatU, colorMatV, colorMatW, colorMatX, colorMatY, colorMatZ;
1440 
1442 #ifdef MULTI_GPU
1443  int idx = ls.half_idx_conversion_normal2ex(half_lattice_index, dim, oddBit);
1444 #else
1445  int idx = half_lattice_index;
1446 #endif
1447 
1448  int X = locator.getFullFromHalfIndex(idx);
1449  point_c = idx;
1450 
1451  int new_mem_idx = locator.getNeighborFromFullIndex(X,sig);
1452  point_d = new_mem_idx >> 1;
1453 
1454  new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx, sig);
1455  point_e = new_mem_idx >> 1;
1456 
1457  new_mem_idx = locator.getNeighborFromFullIndex(X, OPP_DIR(sig));
1458  point_b = new_mem_idx >> 1;
1459 
1460  new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx, OPP_DIR(sig));
1461  point_a = new_mem_idx >> 1;
1462 
1463  ls.loadMatrixFromField(link, oddBit, sig, point_a, &ab_link);
1464  ls.loadMatrixFromField(link, 1-oddBit, sig, point_b, &bc_link);
1465  ls.loadMatrixFromField(link, 1-oddBit, sig, point_d, &de_link);
1466  ls.loadMatrixFromField(link, oddBit, sig, point_e, &ef_link);
1467 
1468  ls.loadMatrixFromField(oprod, oddBit, sig, point_c, &colorMatZ);
1469  ls.loadMatrixFromField(oprod, 1-oddBit, sig, point_b, &colorMatY);
1470  ls.loadMatrixFromField(oprod, oddBit, sig, point_a, &colorMatX);
1471 
1472  colorMatV = de_link*ef_link*colorMatZ
1473  - de_link*colorMatY*bc_link
1474  + colorMatX*ab_link*bc_link;
1475 
1476  ls.addMatrixToField(colorMatV, oddBit, sig, point_c, coeff, output);
1477  }
1478  return;
1479  }
1480 
1481  template<class Real>
1482  void computeLongLinkField(const int dim[4],
1483  const Real* const oprod,
1484  const Real* const link,
1485  int sig, Real coeff,
1486  Real* const output)
1487  {
1488  int volume = 1;
1489  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
1490  const int half_volume = volume/2;
1491 
1492  LoadStore<Real> ls(volume);
1493  for(int site=0; site<half_volume; ++site){
1494  computeLongLinkSite<Real,0>(site,
1495  dim,
1496  oprod,
1497  link,
1498  sig, coeff, ls,
1499  output);
1500 
1501  }
1502  // Loop over odd lattice sites
1503  for(int site=0; site<half_volume; ++site){
1504  computeLongLinkSite<Real,1>(site,
1505  dim,
1506  oprod,
1507  link,
1508  sig, coeff, ls,
1509  output);
1510  }
1511  return;
1512  }
1513 
1514 
1516  const QudaGaugeParam &param,
1517  cpuGaugeField &oprod,
1518  cpuGaugeField &link,
1519  cpuGaugeField *newOprod)
1520  {
1521  for(int sig=0; sig<4; ++sig){
1522  if(param.cpu_prec == QUDA_SINGLE_PRECISION){
1523  computeLongLinkField<float>(param.X,
1524  (float*)oprod.Gauge_p(),
1525  (float*)link.Gauge_p(),
1526  sig, coeff,
1527  (float*)newOprod->Gauge_p());
1528  }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
1529  computeLongLinkField<double>(param.X,
1530  (double*)oprod.Gauge_p(),
1531  (double*)link.Gauge_p(),
1532  sig, coeff,
1533  (double*)newOprod->Gauge_p());
1534  }else {
1535  errorQuda("Unrecognised precision\n");
1536  }
1537  } // sig
1538  return;
1539  }
1540 
1541 
1542 template<class Real, int oddBit>
1543 void completeForceSite(int half_lattice_index,
1544  const int dim[4],
1545  const Real* const oprod,
1546  const Real* const link,
1547  int sig,
1548  const LoadStore<Real>& ls,
1549  Real* const mom)
1550 {
1551 
1552  typename ColorMatrix<Real>::Type colorMatX, colorMatY, linkW;
1553 
1554 #ifdef MULTI_GPU
1555  int half_lattice_index_ex = ls.half_idx_conversion_normal2ex(half_lattice_index, dim, oddBit);
1556  int idx = half_lattice_index_ex;
1557 #else
1558  int idx = half_lattice_index;
1559 #endif
1560  ls.loadMatrixFromField(link, oddBit, sig, idx, &linkW);
1561  ls.loadMatrixFromField(oprod, oddBit, sig, idx, &colorMatX);
1562 
1563  const Real coeff = (oddBit) ? -1 : 1;
1564  colorMatY = linkW*colorMatX;
1565 
1566  ls.storeMatrixToMomentumField(colorMatY, oddBit, sig, half_lattice_index, coeff, mom);
1567  return;
1568 }
1569 
1570 template <class Real>
1571 void completeForceField(const int dim[4],
1572  const Real* const oprod,
1573  const Real* const link,
1574  int sig,
1575  Real* const mom)
1576 {
1577  int volume = dim[0]*dim[1]*dim[2]*dim[3];
1578  const int half_volume = volume/2;
1579  LoadStore<Real> ls(volume);
1580 
1581 
1582  for(int site=0; site<half_volume; ++site){
1583  completeForceSite<Real,0>(site,
1584  dim,
1585  oprod, link,
1586  sig,
1587  ls,
1588  mom);
1589 
1590  }
1591  for(int site=0; site<half_volume; ++site){
1592  completeForceSite<Real,1>(site,
1593  dim,
1594  oprod, link,
1595  sig,
1596  ls,
1597  mom);
1598  }
1599  return;
1600 }
1601 
1602 
1604  cpuGaugeField &oprod,
1605  cpuGaugeField &link,
1606  cpuGaugeField* mom)
1607  {
1608  for(int sig=0; sig<4; ++sig){
1609  if(param.cpu_prec == QUDA_SINGLE_PRECISION){
1610  completeForceField<float>(param.X,
1611  (float*)oprod.Gauge_p(),
1612  (float*)link.Gauge_p(),
1613  sig,
1614  (float*)mom->Gauge_p());
1615  }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
1616  completeForceField<double>(param.X,
1617  (double*)oprod.Gauge_p(),
1618  (double*)link.Gauge_p(),
1619  sig,
1620  (double*)mom->Gauge_p());
1621  }else{
1622  errorQuda("Unrecognised precision\n");
1623  }
1624  } // loop over sig
1625  return;
1626  }
1627 
1628 
1629 //} // namespace quda
1630 
1631 
1632 
1633 
__host__ __device__ float4 operator-=(float4 &x, const float4 y)
Definition: float_vector.h:110
#define D3
Definition: llfat_core.h:14
__constant__ int X1h
void completeForceSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const link, int sig, const LoadStore< Real > &ls, Real *const mom)
__constant__ int X2
int bc_link_nbr_idx
__device__ __host__ T const & operator()(int i, int j) const
Definition: quda_matrix.h:353
int point_d
void addMatrixToField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int half_lattice_index, Real coeff, Real *const) const
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
#define errorQuda(...)
Definition: util_quda.h:73
Matrix< N, T > operator()() const
Matrix & operator-=(const Matrix< N, T > &mat)
__constant__ int X1
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
int getFullFromHalfIndex(int half_lattice_index)
#define RETURN_IF_ERR
addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit, kparam.color_matrix_stride)
#define D2
Definition: llfat_core.h:13
int getNeighborFromFullIndex(int full_lattice_index, int dir, int *err=NULL)
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
void addData(Real *const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const
#define Pnumu
void completeForceField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real *const mom)
int point_b
__constant__ int E4
void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, Real *const newOprod)
int point_c
#define OPP_DIR(dir)
Definition: force_common.h:16
QudaGaugeParam param
Definition: pack_test.cpp:17
int Vh
void computeOneLinkField(const int dim[4], const Real *const oprod, int sig, Real coeff, Real *const output)
void computeSideLinkSite(int half_lattice_index, const int dim[4], const Real *const P3, const Real *const Qprod, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, const LoadStore< Real > &ls, Real *const shortP, Real *const newOprod)
void hisqStaplesForceCPU(const double *path_coeff, const QudaGaugeParam &param, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *newOprod)
#define Pmu
int z1
Definition: llfat_core.h:814
__host__ __device__ complex< ValueType > operator-(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
Definition: complex_quda.h:673
void computeMiddleLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, const LoadStore< Real > &ls, Real *const Pmu, Real *const P3, Real *const Qmu, Real *const newOprod)
int new_mem_idx
Definition: llfat_core.h:834
T data[N *N]
Definition: quda_matrix.h:351
void storeMatrixToMomentumField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real *const) const
__constant__ double coeff
void computeOneLinkSite(const int dim[4], int half_lattice_index, const Real *const oprod, int sig, Real coeff, const LoadStore< Real > &ls, Real *const output)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type accumu_coeff
#define Qnumu
int X[4]
Definition: quda.h:29
#define D1
Definition: llfat_core.h:12
int point_a
void hisqLongLinkForceCPU(double coeff, const QudaGaugeParam &param, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *newOprod)
short x1h
Definition: llfat_core.h:815
int half_idx_conversion_ex2normal(int half_lattice_index, const int *dim, int oddBit) const
Matrix< N, T > operator()() const
int gauge_order
int point_e
int ab_link_nbr_idx
storeMatrixToField(Oy.data, new_sid, P3Even, P3Odd, oddBit, kparam.color_matrix_stride)
__host__ __device__ complex< ValueType > operator*(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
Definition: complex_quda.h:692
Matrix< 3, std::complex< Real > > Type
Main header file for the QUDA library.
int Vh_ex
__constant__ int X3
int z2
Definition: llfat_core.h:816
Matrix & operator+=(const Matrix< N, T > &mat)
#define P3
void computeSideLinkField(const int dim[4], const Real *const P3, const Real *const Qprod, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, Real *const newOprod)
#define Qmu
short x1odd
Definition: llfat_core.h:821
void computeMiddleLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real *const Pmu, Real *const P3, Real *const Qmu, Real *const newOprod)
void loadMatrixFromField(const Real *const field, int oddBit, int half_lattice_index, Matrix< 3, std::complex< Real > > *const mat) const
Real getData(const Real *const field, int idx, int dir, int oddBit, int offset, int hfv) const
void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const link, int sig, Real coeff, const LoadStore< Real > &ls, Real *const output)
Matrix< N, T > transpose(const Matrix< N, std::complex< T > > &mat)
__host__ __device__ float4 operator+=(float4 &x, const float4 y)
Definition: float_vector.h:83
#define GOES_FORWARDS(dir)
Definition: force_common.h:17
__constant__ int E1
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int sig
void computeLongLinkField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real coeff, Real *const output)
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
RealTypeId< RealA >::Type mycoeff
__host__ __device__ complex< ValueType > operator+(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
Definition: complex_quda.h:644
void computeAllLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, const LoadStore< Real > &ls, Real *const shortP, Real *const newOprod)
int ad_link_nbr_idx
loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride)
#define P5
Locator(const int dim[4])
void hisqCompleteForceCPU(const QudaGaugeParam &param, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *mom)
__constant__ int E1h
void storeMatrixToField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int half_lattice_index, Real *const field) const
int oddBit
__constant__ int E3
QudaPrecision cpu_prec
Definition: quda.h:40
__constant__ int E2
void doHisqStaplesForceCPU(const int dim[4], PathCoefficients< double > staple_coeff, Real *oprod, Real *link, Real **tempmat, Real *newOprod)
int half_idx_conversion_normal2ex(int half_lattice_index, const int *dim, int oddBit) const