QUDA  v1.1.0
A library for QCD on GPUs
hisq_force_reference2.cpp
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <iostream>
4 #include <iomanip>
5 #include <complex>
6 
7 #include <quda.h>
8 #include <gauge_field.h>
9 
10 using namespace quda;
11 //namespace quda {
12 
13 #define RETURN_IF_ERR if(err) return;
14 
15 extern int gauge_order;
16 extern int Vh;
17 extern int Vh_ex;
18 
19  static int OPP_DIR(int dir){ return 7-dir; }
20  static bool GOES_FORWARDS(int dir){ return (dir<=3); }
21 
22  template<int N>
23  struct Sign{
24  static const int result = 1;
25  };
26 
27  template<>
28  struct Sign<1>
29  {
30  static const int result = -1;
31  };
32 
33 
34  template<class T, class U>
35  struct Promote
36  {
37  typedef T Type;
38  };
39 
40  template<>
41  struct Promote<int,float>
42  {
43  typedef float Type;
44  };
45 
46  template<>
47  struct Promote<float,int>
48  {
49  typedef float Type;
50  };
51 
52  template<>
53  struct Promote<int, double>
54  {
55  typedef double Type;
56  };
57 
58  template<>
59  struct Promote<double, int>
60  {
61  typedef double Type;
62  };
63 
64  template<>
65  struct Promote<float, double>
66  {
67  typedef double Type;
68  };
69 
70  template<>
71  struct Promote<double, float>
72  {
73  typedef double Type;
74  };
75 
76  template<>
77  struct Promote< int, std::complex<float> >
78  {
79  typedef std::complex<float> Type;
80  };
81 
82  template<>
83  struct Promote< std::complex<float>, int >
84  {
85  typedef std::complex<float> Type;
86  };
87 
88  template<>
89  struct Promote< float, std::complex<float> >
90  {
91  typedef std::complex<float> Type;
92  };
93 
94  template<>
95  struct Promote< int, std::complex<double> >
96  {
97  typedef std::complex<double> Type;
98  };
99 
100  template<>
101  struct Promote< std::complex<double>, int >
102  {
103  typedef std::complex<double> Type;
104  };
105 
106  template<>
107  struct Promote< float, std::complex<double> >
108  {
109  typedef std::complex<double> Type;
110  };
111 
112  template<>
113  struct Promote< std::complex<double>, float >
114  {
115  typedef std::complex<double> Type;
116  };
117 
118  template<>
119  struct Promote< double, std::complex<double> >
120  {
121  typedef std::complex<double> Type;
122  };
123 
124  template<>
125  struct Promote< std::complex<double>, double >
126  {
127  typedef std::complex<double> Type;
128  };
129 
130  template<int N, class T>
131  class Matrix
132  {
133  private:
134  T data[N][N];
135 
136  public:
137  Matrix(); // default constructor
138  Matrix(const Matrix<N,T>& mat); // copy constructor
141  const T& operator()(int i, int j) const;
142  T& operator()(int i, int j);
143  };
144 
145  template<int N, class T>
147  {
148  for(int i=0; i<N; ++i){
149  for(int j=0; j<N; ++j){
150  data[i][j] = static_cast<T>(0);
151  }
152  }
153  }
154 
155  template<int N, class T>
157  {
158  for(int i=0; i<N; ++i){
159  for(int j=0; j<N; ++j){
160  data[i][j] = mat.data[i][j];
161  }
162  }
163  }
164 
165  template<int N, class T>
166  T& Matrix<N,T>::operator()(int i, int j)
167  {
168  return data[i][j];
169  }
170 
171  template<int N, class T>
172  const T& Matrix<N,T>::operator()(int i, int j) const
173  {
174  return data[i][j];
175  }
176 
177  template<int N, class T>
179  {
180  for(int i=0; i<N; ++i){
181  for(int j=0; j<N; ++j){
182  data[i][j] += mat.data[i][j];
183  }
184  }
185  return *this;
186  }
187 
188  template<int N, class T>
190  {
191  for(int i=0; i<N; ++i){
192  for(int j=0; j<N; ++j){
193  data[i][j] -= mat.data[i][j];
194  }
195  }
196  return *this;
197  }
198 
199  template<int N, class T>
201  {
202  Matrix<N,T> result(a);
203  result += b;
204  return result;
205  }
206 
207  template<int N, class T>
209  {
210  Matrix<N,T> result(a);
211  result -= b;
212  return result;
213  }
214 
215  template<int N, class T>
217  {
218  Matrix<N,T> result;
219  for(int i=0; i<N; ++i){
220  for(int j=0; j<N; ++j){
221  result(i,j) = static_cast<T>(0);
222  for(int k=0; k<N; ++k){
223  result(i,j) += a(i,k)*b(k,j);
224  }
225  }
226  }
227  return result;
228  }
229 
230  template<int N, class T>
231  Matrix<N,std::complex<T> > conj(const Matrix<N,std::complex<T> >& mat)
232  {
233  Matrix<N,std::complex<T> > result;
234  for(int i=0; i<N; ++i){
235  for(int j=0; j<N; ++j){
236  result(i,j) = std::conj(mat(j,i));
237  }
238  }
239  return result;
240  }
241 
242  template<int N, class T>
243  Matrix<N,T> transpose(const Matrix<N,std::complex<T> >& mat)
244  {
245  Matrix<N,T> result;
246  for(int i=0; i<N; ++i){
247  for(int j=0; j<N; ++j){
248  result(i,j) = mat(j,i);
249  }
250  }
251  return result;
252  }
253 
254  template<int N, class T, class U>
256  {
257  typedef typename Promote<T,U>::Type return_type;
258  Matrix<N,return_type> result;
259 
260  for(int i=0; i<N; ++i){
261  for(int j=0; j<N; ++j){
262  result(i,j) = scalar*mat(i,j);
263  }
264  }
265  return result;
266  }
267 
268  template<int N, class T, class U>
270  {
271  return mat*scalar;
272  }
273 
274  template<int N, class T>
275  struct Identity
276  {
278  Matrix<N,T> id;
279  for(int i=0; i<N; ++i){
280  id(i,i) = static_cast<T>(1);
281  }
282  return id;
283  } // operator()
284  };
285 
286  template<int N, class T>
287  struct Zero
288  {
289  // the default constructor zeros all matrix elements
291  return Matrix<N,T>();
292  }
293  };
294 
295 
296  template<int N, class T>
297  std::ostream & operator << (std::ostream & os, const Matrix<N,T> & m)
298  {
299  for(int i=0; i<N; ++i){
300  for(int j=0; j<N; ++j){
301  os << m(i,j) << " ";
302  }
303  if(i<N-1) os << std::endl;
304  }
305  return os;
306  }
307 
308 
309 
310  template<class Real>
311  class LoadStore{
312  private:
313  const int volume;
314  const int half_volume;
315  public:
316  LoadStore(int vol) : volume(vol), half_volume(vol/2) {}
317 
318  void loadMatrixFromField(const Real* const field, int oddBit, int half_lattice_index, Matrix<3, std::complex<Real> >* const mat) const;
319 
320  void loadMatrixFromField(const Real* const field, int oddBit, int dir, int half_lattice_index, Matrix<3, std::complex<Real> >* const mat) const;
321 
322  void storeMatrixToField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int half_lattice_index, Real* const field) const;
323 
324  void addMatrixToField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int half_lattice_index, Real coeff, Real* const) const;
325 
326  void addMatrixToField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real* const) const;
327 
328  void storeMatrixToMomentumField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real* const) const;
329  Real getData(const Real* const field, int idx, int dir, int oddBit, int offset, int hfv) const;
330  void addData(Real* const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const;
331  int half_idx_conversion_ex2normal(int half_lattice_index, const int* dim, int oddBit) const ;
332  int half_idx_conversion_normal2ex(int half_lattice_index, const int* dim, int oddBit) const ;
333  };
334 
335 template<class Real>
336 int LoadStore<Real>::half_idx_conversion_ex2normal(int half_lattice_index_ex, const int* dim, int oddBit) const
337 {
338  int X1=dim[0];
339  int X2=dim[1];
340  int X3=dim[2];
341  //int X4=dim[3];
342 
343  int E1=dim[0]+4;
344  int E2=dim[1]+4;
345  int E3=dim[2]+4;
346  //int E4=dim[3]+4;
347  int E1h=E1/2;
348 
349  int sid = half_lattice_index_ex;
350 
351  int za = sid/E1h;
352  int x1h = sid - za*E1h;
353  int zb = za/E2;
354  int x2 = za - zb*E2;
355  int x4 = zb/E3;
356  int x3 = zb - x4*E3;
357  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
358  int x1 = 2*x1h + x1odd;
359 
360  int idx = ((x4-2)*X3*X2*X1 + (x3-2)*X2*X1+(x2-2)*X1+(x1-2))/2;
361  return idx;
362 }
363 
364 template<class Real>
365 int LoadStore<Real>::half_idx_conversion_normal2ex(int half_lattice_index, const int* dim, int oddBit) const
366 {
367  int X1=dim[0];
368  int X2=dim[1];
369  int X3=dim[2];
370  //int X4=dim[3];
371  int X1h=X1/2;
372 
373  int E1=dim[0]+4;
374  int E2=dim[1]+4;
375  int E3=dim[2]+4;
376  //int E4=dim[3]+4;
377  //int E1h=E1/2;
378 
379  int sid = half_lattice_index;
380 
381  int za = sid/X1h;
382  int x1h = sid - za*X1h;
383  int zb = za/X2;
384  int x2 = za - zb*X2;
385  int x4 = zb/X3;
386  int x3 = zb - x4*X3;
387  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
388  int x1 = 2*x1h + x1odd;
389 
390  int idx = ((x4+2)*E3*E2*E1 + (x3+2)*E2*E1+(x2+2)*E1+(x1+2))/2;
391 
392  return idx;
393 }
394 
395 template<class Real>
396 Real LoadStore<Real>::getData(const Real* const field, int idx, int dir, int oddBit, int offset, int hfv) const
397 {
399  return field[(4*hfv*oddBit +4*idx + dir)*18+offset];
400  }else{ //QDP format
401  return ((Real**)field)[dir][(hfv*oddBit+idx)*18 +offset];
402  }
403 }
404 template<class Real>
405 void LoadStore<Real>::addData(Real* const field, int idx, int dir, int oddBit, int offset, Real v, int hfv) const
406 {
408  field[(4*hfv*oddBit +4*idx + dir)*18+offset] += v;
409  }else{ //QDP format
410  ((Real**)field)[dir][(hfv*oddBit+idx)*18 +offset] += v;
411  }
412 }
413 
414 
415 
416 template<class Real>
417  void LoadStore<Real>::loadMatrixFromField(const Real* const field,
418  int oddBit,
419  int half_lattice_index,
420  Matrix<3, std::complex<Real> >* const mat
421  ) const
422 {
423 #ifdef MULTI_GPU
424  int hfv = Vh_ex;
425 #else
426  int hfv = Vh;
427 #endif
428 
429  int offset = 0;
430  for(int i=0; i<3; ++i){
431  for(int j=0; j<3; ++j){
432  (*mat)(i,j) = (*(field + (oddBit*hfv + half_lattice_index)*18 + offset++));
433  (*mat)(i,j) += std::complex<Real>(0, *(field + (oddBit*hfv + half_lattice_index)*18 + offset++));
434  }
435  }
436  return;
437  }
438 
439  template<class Real>
440  void LoadStore<Real>::loadMatrixFromField(const Real* const field,
441  int oddBit,
442  int dir,
443  int half_lattice_index,
444  Matrix<3, std::complex<Real> >* const mat
445  ) const
446  {
447 #ifdef MULTI_GPU
448  int hfv = Vh_ex;
449 #else
450  int hfv = Vh;
451 #endif
452 
453  //const Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18;
454  int offset = 0;
455  for(int i=0; i<3; ++i){
456  for(int j=0; j<3; ++j){
457  (*mat)(i,j) = (getData(field, half_lattice_index, dir, oddBit, offset++, hfv));
458  (*mat)(i,j) += std::complex<Real>(0, getData(field, half_lattice_index, dir, oddBit, offset++, hfv));
459  }
460  }
461  return;
462  }
463 
464  template<class Real>
465  void LoadStore<Real>::storeMatrixToField(const Matrix<3, std::complex<Real> >& mat,
466  int oddBit,
467  int half_lattice_index,
468  Real* const field) const
469  {
470 #ifdef MULTI_GPU
471  int hfv = Vh_ex;
472 #else
473  int hfv = Vh;
474 #endif
475 
476  int offset = 0;
477  for(int i=0; i<3; ++i){
478  for(int j=0; j<3; ++j){
479  *(field + (oddBit*hfv + half_lattice_index)*18 + offset++) = (mat)(i,j).real();
480  *(field + (oddBit*hfv + half_lattice_index)*18 + offset++) = (mat)(i,j).imag();
481  }
482  }
483  return;
484  }
485 
486  template<class Real>
487  void LoadStore<Real>::addMatrixToField(const Matrix<3, std::complex<Real> >& mat,
488  int oddBit,
489  int half_lattice_index,
490  Real coeff,
491  Real* const field) const
492  {
493 #ifdef MULTI_GPU
494  int hfv = Vh_ex;
495 #else
496  int hfv = Vh;
497 #endif
498  Real* const local_field = field + (oddBit*hfv + half_lattice_index)*18;
499 
500 
501  int offset = 0;
502  for(int i=0; i<3; ++i){
503  for(int j=0; j<3; ++j){
504  local_field[offset++] += coeff*mat(i,j).real();
505  local_field[offset++] += coeff*mat(i,j).imag();
506  }
507  }
508  return;
509  }
510 
511 
512  template<class Real>
513  void LoadStore<Real>::addMatrixToField(const Matrix<3, std::complex<Real> >& mat,
514  int oddBit,
515  int dir,
516  int half_lattice_index,
517  Real coeff,
518  Real* const field) const
519  {
520 
521 #ifdef MULTI_GPU
522  int hfv = Vh_ex;
523 #else
524  int hfv = Vh;
525 #endif
526 
527  //Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18;
528  int offset = 0;
529  for(int i=0; i<3; ++i){
530  for(int j=0; j<3; ++j){
531  //local_field[offset++] += coeff*mat(i,j).real();
532  addData(field, half_lattice_index, dir, oddBit, offset++, coeff*mat(i,j).real(), hfv);
533 
534  //local_field[offset++] += coeff*mat(i,j).imag();
535  addData(field, half_lattice_index, dir, oddBit, offset++, coeff*mat(i,j).imag(), hfv);
536  }
537  }
538  return;
539  }
540 
541 
542  template<class Real>
543  void LoadStore<Real>::storeMatrixToMomentumField(const Matrix<3, std::complex<Real> >& mat,
544  int oddBit,
545  int dir,
546  int half_lattice_index,
547  Real coeff,
548  Real* const field) const
549  {
550  Real* const mom_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*10;
551  mom_field[0] = (mat(0,1).real() - mat(1,0).real())*0.5*coeff;
552  mom_field[1] = (mat(0,1).imag() + mat(1,0).imag())*0.5*coeff;
553 
554  mom_field[2] = (mat(0,2).real() - mat(2,0).real())*0.5*coeff;
555  mom_field[3] = (mat(0,2).imag() + mat(2,0).imag())*0.5*coeff;
556 
557  mom_field[4] = (mat(1,2).real() - mat(2,1).real())*0.5*coeff;
558  mom_field[5] = (mat(1,2).imag() + mat(2,1).imag())*0.5*coeff;
559 
560  const Real temp = (mat(0,0).imag() + mat(1,1).imag() + mat(2,2).imag())*0.3333333333333333333;
561  mom_field[6] = (mat(0,0).imag() - temp)*coeff;
562  mom_field[7] = (mat(1,1).imag() - temp)*coeff;
563  mom_field[8] = (mat(2,2).imag() - temp)*coeff;
564  mom_field[9] = 0.0;
565  return;
566  }
567 
568 
569 
570  template<int oddBit>
571  struct Locator
572  {
573  private:
574  int local_dim[4];
575  int volume;
576  int half_index;
577  int full_index;
578  int full_coord[4];
579  void getCoordsFromHalfIndex(int half_index, int coord[4]);
580  void getCoordsFromFullIndex(int full_index, int coord[4]);
581  void cache(int half_lattice_index); // caches the half-lattice index, full-lattice index,
582  // and full-lattice coordinates
583 
584 
585  public:
586  Locator(const int dim[4]);
587  int getFullFromHalfIndex(int half_lattice_index);
588  int getNeighborFromFullIndex(int full_lattice_index, int dir, int* err=NULL);
589  };
590 
591  template<int oddBit>
592  Locator<oddBit>::Locator(const int dim[4]) : half_index(-1), full_index(-1){
593  volume = 1;
594  for(int dir=0; dir<4; ++dir){
595  local_dim[dir] = dim[dir];
596  volume *= local_dim[dir];
597  }
598  }
599 
600  // Store the half_lattice index, works out and stores the full lattice index
601  // and the coordinates.
602  template<int oddBit>
603  void Locator<oddBit>::getCoordsFromHalfIndex(int half_lattice_index, int coord[4])
604  {
605 #ifdef MULTI_GPU
606  int E1 = local_dim[0]+4;
607  int E2 = local_dim[1]+4;
608  int E3 = local_dim[2]+4;
609  //int E4 = local_dim[3]+4;
610  int E1h = E1/2;
611 
612  int z1 = half_lattice_index/E1h;
613  int x1h = half_lattice_index - z1*E1h;
614  int z2 = z1/E2;
615  coord[1] = z1 - z2*E2;
616  coord[3] = z2/E3;
617  coord[2] = z2 - coord[3]*E3;
618  int x1odd = (coord[1] + coord[2] + coord[3] + oddBit) & 1;
619  coord[0] = 2*x1h + x1odd;
620 #else
621  int half_dim_0 = local_dim[0]/2;
622  int z1 = half_lattice_index/half_dim_0;
623  int x1h = half_lattice_index - z1*half_dim_0;
624  int z2 = z1/local_dim[1];
625  coord[1] = z1 - z2*local_dim[1];
626  coord[3] = z2/local_dim[2];
627  coord[2] = z2 - coord[3]*local_dim[2];
628  int x1odd = (coord[1] + coord[2] + coord[3] + oddBit) & 1;
629  coord[0] = 2*x1h + x1odd;
630 #endif
631 
632  }
633 
634  template<int oddBit>
635  void Locator<oddBit>::getCoordsFromFullIndex(int full_lattice_index, int coord[4])
636  {
637 #ifdef MULTI_GPU
638  int D1=local_dim[0]+4;
639  int D2=local_dim[1]+4;
640  int D3=local_dim[2]+4;
641  //int D4=local_dim[3]+4;
642  //int D1h=D1/2;
643 #else
644  int D1=local_dim[0];
645  int D2=local_dim[1];
646  int D3=local_dim[2];
647  //int D4=local_dim[3];
648  //int D1h=D1/2;
649 #endif
650 
651  int z1 = full_lattice_index/D1;
652  coord[0] = full_lattice_index - z1*D1;
653  int z2 = z1/D2;
654  coord[1] = z1 - z2*D2;
655  coord[3] = z2/D3;
656  coord[2] = z2 - coord[3]*D3;
657  }
658 
659 
660 
661 
662 
663  template<int oddBit>
664  void Locator<oddBit>::cache(int half_lattice_index)
665  {
666  half_index = half_lattice_index;
667  getCoordsFromHalfIndex(half_lattice_index, full_coord);
668  int x1odd = (full_coord[1] + full_coord[2] + full_coord[3] + oddBit) & 1;
669  full_index = 2*half_lattice_index + x1odd;
670  return;
671  }
672 
673  template<int oddBit>
674  int Locator<oddBit>::getFullFromHalfIndex(int half_lattice_index)
675  {
676  if(half_index != half_lattice_index) cache(half_lattice_index);
677  return full_index;
678  }
679 
680  // From full index return the neighbouring full index
681  template<int oddBit>
682  int Locator<oddBit>::getNeighborFromFullIndex(int full_lattice_index, int dir, int* err)
683  {
684  if(err) *err = 0;
685 
686  int coord[4];
687  int neighbor_index;
688  getCoordsFromFullIndex(full_lattice_index, coord);
689 #ifdef MULTI_GPU
690  int E1 = local_dim[0] + 4;
691  int E2 = local_dim[1] + 4;
692  int E3 = local_dim[2] + 4;
693  int E4 = local_dim[3] + 4;
694  switch(dir){
695  case 0: //+X
696  neighbor_index = full_lattice_index + 1;
697  if(err && (coord[0] == E1-1) ) *err = 1;
698  break;
699  case 1: //+Y
700  neighbor_index = full_lattice_index + E1;
701  if(err && (coord[1] == E2-1) ) *err = 1;
702  break;
703  case 2: //+Z
704  neighbor_index = full_lattice_index + E2*E1;
705  if(err && (coord[2] == E3-1) ) *err = 1;
706  break;
707  case 3: //+T
708  neighbor_index = full_lattice_index + E3*E2*E1;
709  if(err && (coord[3] == E4-1) ) *err = 1;
710  break;
711  case 7: //-X
712  neighbor_index = full_lattice_index - 1;
713  if(err && (coord[0] == 0) ) *err = 1;
714  break;
715  case 6: //-Y
716  neighbor_index = full_lattice_index - E1;
717  if(err && (coord[1] == 0) ) *err = 1;
718  break;
719  case 5: //-Z
720  neighbor_index = full_lattice_index - E2*E1;
721  if(err && (coord[2] == 0) ) *err = 1;
722  break;
723  case 4: //-T
724  neighbor_index = full_lattice_index - E3*E2*E1;
725  if(err && (coord[3] == 0) ) *err = 1;
726  break;
727  default:
728  errorQuda("Neighbor index could not be determined\n");
729  exit(1);
730  break;
731  } // switch(dir)
732 
733 #else
734  switch(dir){
735  case 0:
736  neighbor_index = (coord[0] == local_dim[0]-1) ? full_lattice_index + 1 - local_dim[0] : full_lattice_index + 1;
737  break;
738  case 1:
739  neighbor_index = (coord[1] == local_dim[1]-1) ? full_lattice_index + local_dim[0]*(1 - local_dim[1]) : full_lattice_index + local_dim[0];
740  break;
741  case 2:
742  neighbor_index = (coord[2] == local_dim[2]-1) ? full_lattice_index + local_dim[0]*local_dim[1]*(1 - local_dim[2]) : full_lattice_index + local_dim[0]*local_dim[1];
743  break;
744  case 3:
745  neighbor_index = (coord[3] == local_dim[3]-1) ? full_lattice_index + local_dim[0]*local_dim[1]*local_dim[2]*(1-local_dim[3]) : full_lattice_index + local_dim[0]*local_dim[1]*local_dim[2];
746  break;
747  case 7:
748  neighbor_index = (coord[0] == 0) ? full_lattice_index - 1 + local_dim[0] : full_lattice_index - 1;
749  break;
750  case 6:
751  neighbor_index = (coord[1] == 0) ? full_lattice_index - local_dim[0]*(1 - local_dim[1]) : full_lattice_index - local_dim[0];
752  break;
753  case 5:
754  neighbor_index = (coord[2] == 0) ? full_lattice_index - local_dim[0]*local_dim[1]*(1 - local_dim[2]) : full_lattice_index - local_dim[0]*local_dim[1];
755  break;
756  case 4:
757  neighbor_index = (coord[3] == 0) ? full_lattice_index - local_dim[0]*local_dim[1]*local_dim[2]*(1 - local_dim[3]) : full_lattice_index - local_dim[0]*local_dim[1]*local_dim[2];
758  break;
759  default:
760  errorQuda("Neighbor index could not be determined\n");
761  exit(1);
762  break;
763  } // switch(dir)
764  if(err) *err = 0;
765 #endif
766  return neighbor_index;
767  }
768 
769 // Can't typedef a template
770 template<class Real>
772 {
774 };
775 
776  template<class Real, int oddBit>
777  void computeOneLinkSite(const int dim[4],
778  int half_lattice_index,
779  const Real* const oprod,
780  int sig, Real coeff,
781  const LoadStore<Real>& ls,
782  Real* const output)
783  {
784  if( GOES_FORWARDS(sig) ){
785  typename ColorMatrix<Real>::Type colorMatW;
786 #ifdef MULTI_GPU
787  int idx = ls.half_idx_conversion_normal2ex(half_lattice_index, dim, oddBit);
788 #else
789  int idx = half_lattice_index;
790 #endif
791  ls.loadMatrixFromField(oprod, oddBit, sig, idx, &colorMatW);
792  ls.addMatrixToField(colorMatW, oddBit, sig, idx, coeff, output);
793  }
794  return;
795  }
796 
797  template<class Real>
798  void computeOneLinkField(const int dim[4],
799  const Real* const oprod,
800  int sig, Real coeff,
801  Real* const output)
802  {
803  int volume = 1;
804  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
805  const int half_volume = volume/2;
806  LoadStore<Real> ls(volume);
807  for(int site=0; site<half_volume; ++site){
808  computeOneLinkSite<Real,0>(dim, site,
809  oprod,
810  sig, coeff, ls,
811  output);
812 
813  }
814  // Loop over odd lattice sites
815  for(int site=0; site<half_volume; ++site){
816  computeOneLinkSite<Real,1>(dim, site,
817  oprod,
818  sig, coeff, ls,
819  output);
820  }
821  return;
822  }
823 
824 
825 
826 
827 
828 
829 
830 
831 
832  // middleLinkKernel compiles for now, but lots of debugging to be done
833  template<class Real, int oddBit>
834  void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.
835  const int dim[4],
836  const Real* const oprod,
837  const Real* const Qprev,
838  const Real* const link,
839  int sig, int mu,
840  Real coeff,
841  const LoadStore<Real>& ls, // pass a function object to read from and write to matrix fields
842  Real* const Pmu,
843  Real* const P3,
844  Real* const Qmu,
845  Real* const newOprod
846  )
847  {
848  const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;
849  const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false;
850 
851 
852  Locator<oddBit> locator(dim);
853  int point_b, point_c, point_d;
854  int ad_link_nbr_idx, ab_link_nbr_idx, bc_link_nbr_idx;
855  int X = locator.getFullFromHalfIndex(half_lattice_index);
856 
857  int err;
858  int new_mem_idx = locator.getNeighborFromFullIndex(X,OPP_DIR(mu), &err); RETURN_IF_ERR;
859  point_d = new_mem_idx >> 1;
860  // getNeighborFromFullIndex will work on any site on the lattice, odd or even
861  new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx,sig, &err); RETURN_IF_ERR;
862  point_c = new_mem_idx >> 1;
863 
864  new_mem_idx = locator.getNeighborFromFullIndex(X,sig); RETURN_IF_ERR;
865  point_b = new_mem_idx >> 1;
866 
867  ad_link_nbr_idx = (mu_positive) ? point_d : half_lattice_index;
868  bc_link_nbr_idx = (mu_positive) ? point_c : point_b;
869  ab_link_nbr_idx = (sig_positive) ? half_lattice_index : point_b;
870 
871  typename ColorMatrix<Real>::Type ab_link, bc_link, ad_link;
872  typename ColorMatrix<Real>::Type colorMatW, colorMatX, colorMatY, colorMatZ;
873 
874 
875 
876 
877 
878  if(sig_positive){
879  ls.loadMatrixFromField(link, oddBit, sig, ab_link_nbr_idx, &ab_link);
880  }else{
881  ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(sig), ab_link_nbr_idx, &ab_link);
882  }
883 
884  if(mu_positive){
885  ls.loadMatrixFromField(link, oddBit, mu, bc_link_nbr_idx, &bc_link);
886  }else{
887  ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(mu), bc_link_nbr_idx, &bc_link);
888  }
889 
890  if(Qprev == NULL){
891  if(sig_positive){
892  ls.loadMatrixFromField(oprod, 1-oddBit, sig, point_d, &colorMatY);
893  }else{
894  ls.loadMatrixFromField(oprod, oddBit, OPP_DIR(sig), point_c, &colorMatY);
895  colorMatY = conj(colorMatY);
896  }
897  }else{ // Qprev != NULL
898  ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY);
899  }
900 
901  colorMatW = (!mu_positive) ? bc_link*colorMatY : conj(bc_link)*colorMatY;
902  if(Pmu) ls.storeMatrixToField(colorMatW, 1-oddBit, point_b, Pmu);
903 
904  colorMatY = (sig_positive) ? ab_link*colorMatW : conj(ab_link)*colorMatW;
905  ls.storeMatrixToField(colorMatY, oddBit, half_lattice_index, P3);
906 
907  if(mu_positive){
908  ls.loadMatrixFromField(link, 1-oddBit, mu, ad_link_nbr_idx, &ad_link);
909  }else{
910  ls.loadMatrixFromField(link, oddBit, OPP_DIR(mu), ad_link_nbr_idx, &ad_link);
911  ad_link = conj(ad_link);
912  }
913 
914 
915  if(Qprev == NULL){
916  if(sig_positive) colorMatY = colorMatW*ad_link;
917  if(Qmu) ls.storeMatrixToField(ad_link, oddBit, half_lattice_index, Qmu);
918  }else{ // Qprev != NULL
919  if(Qmu || sig_positive){
920  ls.loadMatrixFromField(Qprev, 1-oddBit, point_d, &colorMatY);
921  colorMatX= colorMatY*ad_link;
922  }
923  if(Qmu) ls.storeMatrixToField(colorMatX, oddBit, half_lattice_index, Qmu);
924  if(sig_positive) colorMatY = colorMatW*colorMatX;
925  }
926 
927  if(sig_positive) ls.addMatrixToField(colorMatY, oddBit, sig, half_lattice_index, coeff, newOprod);
928 
929  return;
930  } // computeMiddleLinkSite
931 
932 
933  template<class Real>
934  void computeMiddleLinkField(const int dim[4],
935  const Real* const oprod,
936  const Real* const Qprev,
937  const Real* const link,
938  int sig, int mu,
939  Real coeff,
940  Real* const Pmu,
941  Real* const P3,
942  Real* const Qmu,
943  Real* const newOprod
944  )
945  {
946 
947  int volume = 1;
948  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
949 #ifdef MULTI_GPU
950  const int loop_count = Vh_ex;
951 #else
952  const int loop_count = volume/2;
953 #endif
954  // loop over the lattice volume
955  // To keep the code as close to the GPU code as possible, we'll
956  // loop over the even sites first and then the odd sites
957  LoadStore<Real> ls(volume);
958  for(int site=0; site<loop_count; ++site){
959  computeMiddleLinkSite<Real, 0>(site, dim,
960  oprod, Qprev, link,
961  sig, mu, coeff,
962  ls,
963  Pmu, P3, Qmu, newOprod);
964  }
965  // Loop over odd lattice sites
966  for(int site=0; site<loop_count; ++site){
967  computeMiddleLinkSite<Real,1>(site, dim,
968  oprod, Qprev, link,
969  sig, mu, coeff,
970  ls,
971  Pmu, P3, Qmu, newOprod);
972  }
973  return;
974  }
975 
976 
977 
978 
979  template<class Real, int oddBit>
980  void computeSideLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.
981  const int dim[4],
982  const Real* const P3,
983  const Real* const Qprod, // why?
984  const Real* const link,
985  int sig, int mu,
986  Real coeff, Real accumu_coeff,
987  const LoadStore<Real>& ls, // pass a function object to read from and write to matrix fields
988  Real* const shortP,
989  Real* const newOprod
990  )
991  {
992 
993  const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;
994  const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false;
995 
996  Locator<oddBit> locator(dim);
997  int point_d;
998  int ad_link_nbr_idx;
999  int X = locator.getFullFromHalfIndex(half_lattice_index);
1000 
1001  int err;
1002  int new_mem_idx = locator.getNeighborFromFullIndex(X,OPP_DIR(mu), &err); RETURN_IF_ERR;
1003  point_d = new_mem_idx >> 1;
1004  ad_link_nbr_idx = (mu_positive) ? point_d : half_lattice_index;
1005 
1006 
1007  typename ColorMatrix<Real>::Type ad_link;
1008  typename ColorMatrix<Real>::Type colorMatW, colorMatX, colorMatY;
1009 
1010  ls.loadMatrixFromField(P3, oddBit, half_lattice_index, &colorMatY);
1011 
1012  if(shortP){
1013  if(mu_positive){
1014  ad_link_nbr_idx = point_d;
1015  ls.loadMatrixFromField(link, 1-oddBit, mu, ad_link_nbr_idx, &ad_link);
1016  }else{
1017  ad_link_nbr_idx = half_lattice_index;
1018  ls.loadMatrixFromField(link, oddBit, OPP_DIR(mu), ad_link_nbr_idx, &ad_link);
1019  }
1020  colorMatW = (mu_positive) ? ad_link*colorMatY : conj(ad_link)*colorMatY;
1021  ls.addMatrixToField(colorMatW, 1-oddBit, point_d, accumu_coeff, shortP);
1022  } // if(shortP)
1023 
1024 
1025  Real mycoeff = ( (sig_positive && oddBit) || (!sig_positive && !oddBit) ) ? coeff : -coeff;
1026 
1027  if(Qprod){
1028  ls.loadMatrixFromField(Qprod, 1-oddBit, point_d, &colorMatX);
1029  if(mu_positive){
1030  colorMatW = colorMatY*colorMatX;
1031  if(!oddBit){ mycoeff = -mycoeff; }
1032  ls.addMatrixToField(colorMatW, 1-oddBit, mu, point_d, mycoeff, newOprod);
1033  }else{
1034  colorMatW = conj(colorMatX)*conj(colorMatY);
1035  if(oddBit){ mycoeff = -mycoeff; }
1036  ls.addMatrixToField(colorMatW, oddBit, OPP_DIR(mu), half_lattice_index, mycoeff, newOprod);
1037  }
1038  }
1039 
1040  if(!Qprod){
1041  if(mu_positive){
1042  if(!oddBit){ mycoeff = -mycoeff; }
1043  ls.addMatrixToField(colorMatY, 1-oddBit, mu, point_d, mycoeff, newOprod);
1044  }else{
1045  if(oddBit){ mycoeff = -mycoeff; }
1046  colorMatW = conj(colorMatY);
1047  ls.addMatrixToField(colorMatW, oddBit, OPP_DIR(mu), half_lattice_index, mycoeff, newOprod);
1048  }
1049  } // if !(Qprod)
1050 
1051  return;
1052  } // computeSideLinkSite
1053 
1054 
1055  // Maybe change to computeSideLinkField
1056  template<class Real>
1057  void computeSideLinkField(const int dim[4],
1058  const Real* const P3,
1059  const Real* const Qprod, // why?
1060  const Real* const link,
1061  int sig, int mu,
1062  Real coeff, Real accumu_coeff,
1063  Real* const shortP,
1064  Real* const newOprod
1065  )
1066  {
1067  // Need some way of setting half_volume
1068  int volume = 1;
1069  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
1070 #ifdef MULTI_GPU
1071  const int loop_count = Vh_ex;
1072 #else
1073  const int loop_count = volume/2;
1074 #endif
1075  LoadStore<Real> ls(volume);
1076 
1077  for(int site=0; site<loop_count; ++site){
1078  computeSideLinkSite<Real,0>(site, dim,
1079  P3, Qprod, link,
1080  sig, mu,
1081  coeff, accumu_coeff,
1082  ls, shortP, newOprod);
1083  }
1084 
1085  for(int site=0; site<loop_count; ++site){
1086  computeSideLinkSite<Real,1>(site, dim,
1087  P3, Qprod, link,
1088  sig, mu,
1089  coeff, accumu_coeff,
1090  ls, shortP, newOprod);
1091  }
1092 
1093  return;
1094  }
1095 
1096 
1097 
1098 
1099 
1100  template<class Real, int oddBit>
1101  void computeAllLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.
1102  const int dim[4],
1103  const Real* const oprod,
1104  const Real* const Qprev,
1105  const Real* const link,
1106  int sig, int mu,
1107  Real coeff, Real accumu_coeff,
1108  const LoadStore<Real>& ls, // pass a function object to read from and write to matrix fields
1109  Real* const shortP,
1110  Real* const newOprod)
1111  {
1112 
1113  const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;
1114  const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false;
1115 
1116  typename ColorMatrix<Real>::Type ab_link, bc_link, ad_link;
1117  typename ColorMatrix<Real>::Type colorMatW, colorMatX, colorMatY, colorMatZ;
1118 
1119 
1120  int ab_link_nbr_idx, point_b, point_c, point_d;
1121 
1122  Locator<oddBit> locator(dim);
1123  int X = locator.getFullFromHalfIndex(half_lattice_index);
1124 
1125  int err;
1126  int new_mem_idx = locator.getNeighborFromFullIndex(X,OPP_DIR(mu), &err); RETURN_IF_ERR;
1127  point_d = new_mem_idx >> 1;
1128 
1129  new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx,sig, &err); RETURN_IF_ERR;
1130  point_c = new_mem_idx >> 1;
1131 
1132  new_mem_idx = locator.getNeighborFromFullIndex(X,sig, &err); RETURN_IF_ERR;
1133  point_b = new_mem_idx >> 1;
1134  ab_link_nbr_idx = (sig_positive) ? half_lattice_index : point_b;
1135 
1136 
1137 
1138  Real mycoeff = ( (sig_positive && oddBit) || (!sig_positive && !oddBit) ) ? coeff : -coeff;
1139 
1140  if(mu_positive){
1141  ls.loadMatrixFromField(Qprev, 1-oddBit, point_d, &colorMatX);
1142  ls.loadMatrixFromField(link, 1-oddBit, mu, point_d, &ad_link);
1143  // compute point_c
1144  ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY);
1145  ls.loadMatrixFromField(link, oddBit, mu, point_c, &bc_link);
1146  colorMatZ = conj(bc_link)*colorMatY; // okay
1147 
1148  if(sig_positive)
1149  {
1150  colorMatY = colorMatX*ad_link;
1151  colorMatW = colorMatZ*colorMatY;
1152  ls.addMatrixToField(colorMatW, oddBit, sig, half_lattice_index, Sign<oddBit>::result*mycoeff, newOprod);
1153  }
1154 
1155  if(sig_positive){
1156  ls.loadMatrixFromField(link, oddBit, sig, ab_link_nbr_idx, &ab_link);
1157  }else{
1158  ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(sig), ab_link_nbr_idx, &ab_link);
1159  }
1160  colorMatY = (sig_positive) ? ab_link*colorMatZ : conj(ab_link)*colorMatZ; // okay
1161  colorMatW = colorMatY*colorMatX;
1162  ls.addMatrixToField(colorMatW, 1-oddBit, mu, point_d, -Sign<oddBit>::result*mycoeff, newOprod);
1163  colorMatW = ad_link*colorMatY;
1164  ls.addMatrixToField(colorMatW, 1-oddBit, point_d, accumu_coeff, shortP); // Warning! Need to check this!
1165  }else{ //negative mu
1166  mu = OPP_DIR(mu);
1167  ls.loadMatrixFromField(Qprev, 1-oddBit, point_d, &colorMatX);
1168  ls.loadMatrixFromField(link, oddBit, mu, half_lattice_index, &ad_link);
1169  ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY);
1170  ls.loadMatrixFromField(link, 1-oddBit, mu, point_b, &bc_link);
1171 
1172  if(sig_positive) colorMatW = colorMatX*conj(ad_link);
1173  colorMatZ = bc_link*colorMatY;
1174  if(sig_positive){
1175  colorMatY = colorMatZ*colorMatW;
1176  ls.addMatrixToField(colorMatY, oddBit, sig, half_lattice_index, Sign<oddBit>::result*mycoeff, newOprod);
1177  }
1178 
1179  if(sig_positive){
1180  ls.loadMatrixFromField(link, oddBit, sig, ab_link_nbr_idx, &ab_link);
1181  }else{
1182  ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(sig), ab_link_nbr_idx, &ab_link);
1183  }
1184 
1185  colorMatY = (sig_positive) ? ab_link*colorMatZ : conj(ab_link)*colorMatZ; // 611
1186  colorMatW = conj(colorMatX)*conj(colorMatY);
1187 
1188  ls.addMatrixToField(colorMatW, oddBit, mu, half_lattice_index, Sign<oddBit>::result*mycoeff, newOprod);
1189  colorMatW = conj(ad_link)*colorMatY;
1190  ls.addMatrixToField(colorMatW, 1-oddBit, point_d, accumu_coeff, shortP);
1191  } // end mu
1192  return;
1193  } // allLinkKernel
1194 
1195 
1196 
1197  template<class Real>
1198  void computeAllLinkField(const int dim[4],
1199  const Real* const oprod,
1200  const Real* const Qprev,
1201  const Real* const link,
1202  int sig, int mu,
1203  Real coeff, Real accumu_coeff,
1204  Real* const shortP,
1205  Real* const newOprod)
1206  {
1207  int volume = 1;
1208  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
1209 #ifdef MULTI_GPU
1210  const int loop_count = Vh_ex;
1211 #else
1212  const int loop_count = volume/2;
1213 #endif
1214 
1215  LoadStore<Real> ls(volume);
1216  for(int site=0; site<loop_count; ++site){
1217 
1218  computeAllLinkSite<Real,0>(site, dim,
1219  oprod, Qprev, link,
1220  sig, mu,
1221  coeff, accumu_coeff,
1222  ls,
1223  shortP, newOprod);
1224  }
1225 
1226  for(int site=0; site<loop_count; ++site){
1227  computeAllLinkSite<Real, 1>(site, dim,
1228  oprod, Qprev, link,
1229  sig, mu,
1230  coeff, accumu_coeff,
1231  ls,
1232  shortP, newOprod);
1233  }
1234 
1235  return;
1236  }
1237 
1238 #define Pmu tempmat[0]
1239 #define P3 tempmat[1]
1240 #define P5 tempmat[2]
1241 #define Pnumu tempmat[3]
1242 #define Qmu tempmat[4]
1243 #define Qnumu tempmat[5]
1244 
1245  template<class Real>
1247  {
1248  Real one;
1249  Real three;
1250  Real five;
1251  Real seven;
1252  Real naik;
1253  Real lepage;
1254  };
1255 
1256 
1257  template<class Real>
1258  void doHisqStaplesForceCPU(const int dim[4],
1259  PathCoefficients<double> staple_coeff,
1260  Real* oprod,
1261  Real* link,
1262  Real** tempmat,
1263  Real* newOprod)
1264  {
1265  Real OneLink, ThreeSt, FiveSt, SevenSt, Lepage, coeff;
1266 
1267  OneLink = staple_coeff.one;
1268  ThreeSt = staple_coeff.three;
1269  FiveSt = staple_coeff.five;
1270  SevenSt = staple_coeff.seven;
1271  Lepage = staple_coeff.lepage;
1272 
1273  for(int sig=0; sig<4; ++sig){
1275  oprod,
1276  sig, OneLink,
1277  newOprod);
1278  }
1279 
1280 
1281  // sig labels the net displacement of the staple
1282  for(int sig=0; sig<8; ++sig){
1283  for(int mu=0; mu<8; ++mu){
1284  if( mu == sig || mu == OPP_DIR(sig) ) continue;
1285 
1286  computeMiddleLinkField<Real>(dim,
1287  oprod, NULL, link,
1288  sig, mu, -ThreeSt,
1289  Pmu, P3, Qmu,
1290  newOprod);
1291 
1292  for(int nu=0; nu<8; ++nu){
1293  if( nu==mu || nu==OPP_DIR(mu)
1294  || nu==sig || nu==OPP_DIR(sig) ) continue;
1295 
1296  computeMiddleLinkField<Real>(dim,
1297  Pmu, Qmu, link,
1298  sig, nu, staple_coeff.five,
1299  Pnumu, P5, Qnumu,
1300  newOprod);
1301 
1302 
1303  for(int rho=0; rho<8; ++rho){
1304  if( rho == sig || rho == OPP_DIR(sig)
1305  || rho == mu || rho == OPP_DIR(mu)
1306  || rho == nu || rho == OPP_DIR(nu) )
1307  {
1308  continue;
1309  }
1310 
1311  if(FiveSt != 0)coeff = SevenSt/FiveSt; else coeff = 0;
1312  computeAllLinkField<Real>(dim,
1313  Pnumu, Qnumu, link,
1314  sig, rho, staple_coeff.seven, coeff,
1315  P5, newOprod);
1316 
1317  } // rho
1318 
1319  // 5-staple: side link
1320  if(ThreeSt != 0)coeff = FiveSt/ThreeSt; else coeff = 0;
1321  computeSideLinkField<Real>(dim,
1322  P5, Qmu, link,
1323  sig, nu, -FiveSt, coeff,
1324  P3, newOprod);
1325 
1326 
1327  } // nu
1328 
1329 
1330  // lepage
1331  if(staple_coeff.lepage != 0.){
1332  computeMiddleLinkField<Real>(dim,
1333  Pmu, Qmu, link,
1334  sig, mu, Lepage,
1335  NULL, P5, NULL,
1336  newOprod);
1337 
1338  if(ThreeSt != 0)coeff = Lepage/ThreeSt; else coeff = 0;
1339  computeSideLinkField<Real>(dim,
1340  P5, Qmu, link,
1341  sig, mu, -Lepage, coeff,
1342  P3, newOprod);
1343  } // lepage != 0
1344 
1345 
1346  computeSideLinkField<Real>(dim,
1347  P3, NULL, link,
1348  sig, mu, ThreeSt, 0.,
1349  NULL, newOprod);
1350  } // mu
1351  } // sig
1352 
1353  // Need also to compute the one-link contribution
1354  return;
1355  }
1356 
1357 #undef Pmu
1358 #undef P3
1359 #undef P5
1360 #undef Pnumu
1361 #undef Qmu
1362 #undef Qnumu
1363 
1364 
1365  void hisqStaplesForceCPU(const double* path_coeff,
1366  const QudaGaugeParam &param,
1367  cpuGaugeField &oprod,
1368  cpuGaugeField &link,
1369  cpuGaugeField* newOprod)
1370  {
1371  int volume = 1;
1372  for(int dir=0; dir<4; ++dir) volume *= param.X[dir];
1373 
1374 #ifdef MULTI_GPU
1375  int len = Vh_ex*2;
1376 #else
1377  int len = volume;
1378 #endif
1379  // allocate memory for temporary fields
1380  void* tempmat[6];
1382  for(int i=0; i<6; ++i) tempmat[i] = malloc(len*18*sizeof(double));
1383  }else{
1384  for(int i=0; i<6; ++i) tempmat[i] = malloc(len*18*sizeof(float));
1385  }
1386 
1387  PathCoefficients<double> act_path_coeff;
1388  act_path_coeff.one = path_coeff[0];
1389  act_path_coeff.naik = path_coeff[1];
1390  act_path_coeff.three = path_coeff[2];
1391  act_path_coeff.five = path_coeff[3];
1392  act_path_coeff.seven = path_coeff[4];
1393  act_path_coeff.lepage = path_coeff[5];
1394 
1396  doHisqStaplesForceCPU<double>(param.X,
1397  act_path_coeff,
1398  (double*)oprod.Gauge_p(),
1399  (double*)link.Gauge_p(),
1400  (double**)tempmat,
1401  (double*)newOprod->Gauge_p()
1402  );
1403 
1404  }else if(param.cpu_prec == QUDA_SINGLE_PRECISION){
1405  doHisqStaplesForceCPU<float>(param.X,
1406  act_path_coeff,
1407  (float*)oprod.Gauge_p(),
1408  (float*)link.Gauge_p(),
1409  (float**)tempmat,
1410  (float*)newOprod->Gauge_p()
1411  );
1412  }else{
1413  errorQuda("Unsupported precision");
1414  }
1415 
1416  for(int i=0; i<6; ++i){
1417  free(tempmat[i]);
1418  }
1419  return;
1420  }
1421 
1422 
1423 
1424  template<class Real, int oddBit>
1425  void computeLongLinkSite(int half_lattice_index,
1426  const int dim[4],
1427  const Real* const oprod,
1428  const Real* const link,
1429  int sig, Real coeff,
1430  const LoadStore<Real>& ls,
1431  Real* const output)
1432  {
1433  if( GOES_FORWARDS(sig) ){
1434 
1435  Locator<oddBit> locator(dim);
1436 
1437  typename ColorMatrix<Real>::Type ab_link, bc_link, de_link, ef_link;
1438  typename ColorMatrix<Real>::Type colorMatU, colorMatV, colorMatW, colorMatX, colorMatY, colorMatZ;
1439 
1440  int point_a, point_b, point_c, point_d, point_e;
1441 #ifdef MULTI_GPU
1442  int idx = ls.half_idx_conversion_normal2ex(half_lattice_index, dim, oddBit);
1443 #else
1444  int idx = half_lattice_index;
1445 #endif
1446 
1447  int X = locator.getFullFromHalfIndex(idx);
1448  point_c = idx;
1449 
1450  int new_mem_idx = locator.getNeighborFromFullIndex(X,sig);
1451  point_d = new_mem_idx >> 1;
1452 
1453  new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx, sig);
1454  point_e = new_mem_idx >> 1;
1455 
1456  new_mem_idx = locator.getNeighborFromFullIndex(X, OPP_DIR(sig));
1457  point_b = new_mem_idx >> 1;
1458 
1459  new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx, OPP_DIR(sig));
1460  point_a = new_mem_idx >> 1;
1461 
1462  ls.loadMatrixFromField(link, oddBit, sig, point_a, &ab_link);
1463  ls.loadMatrixFromField(link, 1-oddBit, sig, point_b, &bc_link);
1464  ls.loadMatrixFromField(link, 1-oddBit, sig, point_d, &de_link);
1465  ls.loadMatrixFromField(link, oddBit, sig, point_e, &ef_link);
1466 
1467  ls.loadMatrixFromField(oprod, oddBit, sig, point_c, &colorMatZ);
1468  ls.loadMatrixFromField(oprod, 1-oddBit, sig, point_b, &colorMatY);
1469  ls.loadMatrixFromField(oprod, oddBit, sig, point_a, &colorMatX);
1470 
1471  colorMatV = de_link*ef_link*colorMatZ
1472  - de_link*colorMatY*bc_link
1473  + colorMatX*ab_link*bc_link;
1474 
1475  ls.addMatrixToField(colorMatV, oddBit, sig, point_c, coeff, output);
1476  }
1477  return;
1478  }
1479 
1480  template<class Real>
1481  void computeLongLinkField(const int dim[4],
1482  const Real* const oprod,
1483  const Real* const link,
1484  int sig, Real coeff,
1485  Real* const output)
1486  {
1487  int volume = 1;
1488  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
1489  const int half_volume = volume/2;
1490 
1491  LoadStore<Real> ls(volume);
1492  for(int site=0; site<half_volume; ++site){
1493  computeLongLinkSite<Real,0>(site,
1494  dim,
1495  oprod,
1496  link,
1497  sig, coeff, ls,
1498  output);
1499 
1500  }
1501  // Loop over odd lattice sites
1502  for(int site=0; site<half_volume; ++site){
1503  computeLongLinkSite<Real,1>(site,
1504  dim,
1505  oprod,
1506  link,
1507  sig, coeff, ls,
1508  output);
1509  }
1510  return;
1511  }
1512 
1513 
1514  void hisqLongLinkForceCPU(double coeff,
1515  const QudaGaugeParam &param,
1516  cpuGaugeField &oprod,
1517  cpuGaugeField &link,
1518  cpuGaugeField *newOprod)
1519  {
1520  for(int sig=0; sig<4; ++sig){
1522  computeLongLinkField<float>(param.X,
1523  (float*)oprod.Gauge_p(),
1524  (float*)link.Gauge_p(),
1525  sig, coeff,
1526  (float*)newOprod->Gauge_p());
1527  }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
1528  computeLongLinkField<double>(param.X,
1529  (double*)oprod.Gauge_p(),
1530  (double*)link.Gauge_p(),
1531  sig, coeff,
1532  (double*)newOprod->Gauge_p());
1533  }else {
1534  errorQuda("Unrecognised precision\n");
1535  }
1536  } // sig
1537  return;
1538  }
1539 
1540 
1541 template<class Real, int oddBit>
1542 void completeForceSite(int half_lattice_index,
1543  const int dim[4],
1544  const Real* const oprod,
1545  const Real* const link,
1546  int sig,
1547  const LoadStore<Real>& ls,
1548  Real* const mom)
1549 {
1550 
1551  typename ColorMatrix<Real>::Type colorMatX, colorMatY, linkW;
1552 
1553 #ifdef MULTI_GPU
1554  int half_lattice_index_ex = ls.half_idx_conversion_normal2ex(half_lattice_index, dim, oddBit);
1555  int idx = half_lattice_index_ex;
1556 #else
1557  int idx = half_lattice_index;
1558 #endif
1559  ls.loadMatrixFromField(link, oddBit, sig, idx, &linkW);
1560  ls.loadMatrixFromField(oprod, oddBit, sig, idx, &colorMatX);
1561 
1562  const Real coeff = (oddBit) ? -1 : 1;
1563  colorMatY = linkW*colorMatX;
1564 
1565  ls.storeMatrixToMomentumField(colorMatY, oddBit, sig, half_lattice_index, coeff, mom);
1566  return;
1567 }
1568 
1569 template <class Real>
1570 void completeForceField(const int dim[4],
1571  const Real* const oprod,
1572  const Real* const link,
1573  int sig,
1574  Real* const mom)
1575 {
1576  int volume = dim[0]*dim[1]*dim[2]*dim[3];
1577  const int half_volume = volume/2;
1578  LoadStore<Real> ls(volume);
1579 
1580 
1581  for(int site=0; site<half_volume; ++site){
1582  completeForceSite<Real,0>(site,
1583  dim,
1584  oprod, link,
1585  sig,
1586  ls,
1587  mom);
1588 
1589  }
1590  for(int site=0; site<half_volume; ++site){
1591  completeForceSite<Real,1>(site,
1592  dim,
1593  oprod, link,
1594  sig,
1595  ls,
1596  mom);
1597  }
1598  return;
1599 }
1600 
1601 
1603  cpuGaugeField &oprod,
1604  cpuGaugeField &link,
1605  cpuGaugeField* mom)
1606  {
1607  for(int sig=0; sig<4; ++sig){
1609  completeForceField<float>(param.X,
1610  (float*)oprod.Gauge_p(),
1611  (float*)link.Gauge_p(),
1612  sig,
1613  (float*)mom->Gauge_p());
1614  }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
1615  completeForceField<double>(param.X,
1616  (double*)oprod.Gauge_p(),
1617  (double*)link.Gauge_p(),
1618  sig,
1619  (double*)mom->Gauge_p());
1620  }else{
1621  errorQuda("Unrecognised precision\n");
1622  }
1623  } // loop over sig
1624  return;
1625  }
1626 
1627 
1628 //} // namespace quda
1629 
1630 
1631 
1632 
void storeMatrixToMomentumField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real *const) const
Real getData(const Real *const field, int idx, int dir, int oddBit, int offset, int hfv) const
void loadMatrixFromField(const Real *const field, int oddBit, int half_lattice_index, Matrix< 3, std::complex< Real > > *const mat) const
int half_idx_conversion_ex2normal(int half_lattice_index, const int *dim, int oddBit) const
int half_idx_conversion_normal2ex(int half_lattice_index, const int *dim, int oddBit) const
void storeMatrixToField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int half_lattice_index, Real *const field) const
void addData(Real *const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const
void addMatrixToField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int half_lattice_index, Real coeff, Real *const) const
T & operator()(int i, int j)
Matrix & operator-=(const Matrix< N, T > &mat)
Matrix & operator+=(const Matrix< N, T > &mat)
const T & operator()(int i, int j) const
T data[N *N]
Definition: quda_matrix.h:70
__device__ __host__ Matrix()
Definition: quda_matrix.h:74
__device__ __host__ T const & operator()(int i, int j) const
Definition: quda_matrix.h:96
std::array< int, 4 > dim
double mu
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
@ QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:65
@ QUDA_SINGLE_PRECISION
Definition: enum_quda.h:64
@ QUDA_MILC_GAUGE_ORDER
Definition: enum_quda.h:47
#define P3
void completeForceField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real *const mom)
#define Qmu
void computeLongLinkField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real coeff, Real *const output)
int Vh
Definition: host_utils.cpp:38
void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, Real *const newOprod)
void computeAllLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, const LoadStore< Real > &ls, Real *const shortP, Real *const newOprod)
void completeForceSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const link, int sig, const LoadStore< Real > &ls, Real *const mom)
void computeSideLinkSite(int half_lattice_index, const int dim[4], const Real *const P3, const Real *const Qprod, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, const LoadStore< Real > &ls, Real *const shortP, Real *const newOprod)
#define P5
void doHisqStaplesForceCPU(const int dim[4], PathCoefficients< double > staple_coeff, Real *oprod, Real *link, Real **tempmat, Real *newOprod)
#define Pnumu
void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const link, int sig, Real coeff, const LoadStore< Real > &ls, Real *const output)
void hisqStaplesForceCPU(const double *path_coeff, const QudaGaugeParam &param, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *newOprod)
void computeMiddleLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real *const Pmu, Real *const P3, Real *const Qmu, Real *const newOprod)
Matrix< N, T > transpose(const Matrix< N, std::complex< T > > &mat)
#define RETURN_IF_ERR
#define Pmu
void computeMiddleLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, const LoadStore< Real > &ls, Real *const Pmu, Real *const P3, Real *const Qmu, Real *const newOprod)
int gauge_order
#define Qnumu
void computeOneLinkField(const int dim[4], const Real *const oprod, int sig, Real coeff, Real *const output)
int Vh_ex
Definition: host_utils.cpp:46
void computeSideLinkField(const int dim[4], const Real *const P3, const Real *const Qprod, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, Real *const newOprod)
void hisqCompleteForceCPU(const QudaGaugeParam &param, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *mom)
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
void hisqLongLinkForceCPU(double coeff, const QudaGaugeParam &param, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *newOprod)
void computeOneLinkSite(const int dim[4], int half_lattice_index, const Real *const oprod, int sig, Real coeff, const LoadStore< Real > &ls, Real *const output)
int E1h
Definition: host_utils.cpp:44
int E1
Definition: host_utils.cpp:44
int E2
Definition: host_utils.cpp:44
int E4
Definition: host_utils.cpp:44
int E3
Definition: host_utils.cpp:44
#define OPP_DIR(dir)
Definition: misc.h:33
#define GOES_FORWARDS(dir)
Definition: misc.h:34
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
__host__ __device__ float4 operator+=(float4 &x, const float4 &y)
Definition: float_vector.h:122
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator*(const S &a, const ColorSpinor< Float, Nc, Ns > &x)
Compute the scalar-vector product y = a * x.
__host__ __device__ float4 operator-=(float4 &x, const float4 &y)
Definition: float_vector.h:169
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator+(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor addition operator.
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator-(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor subtraction operator.
std::ostream & operator<<(std::ostream &output, const CloverFieldParam &param)
QudaGaugeParam param
Definition: pack_test.cpp:18
Main header file for the QUDA library.
Matrix< 3, std::complex< Real > > Type
Matrix< N, T > operator()() const
int getNeighborFromFullIndex(int full_lattice_index, int dir, int *err=NULL)
Locator(const int dim[4])
int getFullFromHalfIndex(int half_lattice_index)
int X[4]
Definition: quda.h:35
QudaPrecision cpu_prec
Definition: quda.h:46
Matrix< N, T > operator()() const
#define errorQuda(...)
Definition: util_quda.h:120