QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
hisq_force_reference2.cpp
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <iostream>
4 #include <iomanip>
5 #include <complex>
6 
7 #include <quda.h>
8 #include <gauge_field.h>
9 
10 using namespace quda;
11 //namespace quda {
12 
13 #define RETURN_IF_ERR if(err) return;
14 
15 extern int gauge_order;
16 extern int Vh;
17 extern int Vh_ex;
18 
19  static int OPP_DIR(int dir){ return 7-dir; }
20  static bool GOES_FORWARDS(int dir){ return (dir<=3); }
21  static bool GOES_BACKWARDS(int dir){ return (dir>3); }
22 
23  template<int N>
24  struct Sign{
25  static const int result = 1;
26  };
27 
28  template<>
29  struct Sign<1>
30  {
31  static const int result = -1;
32  };
33 
34 
35  template<class T, class U>
36  struct Promote
37  {
38  typedef T Type;
39  };
40 
41  template<>
42  struct Promote<int,float>
43  {
44  typedef float Type;
45  };
46 
47  template<>
48  struct Promote<float,int>
49  {
50  typedef float Type;
51  };
52 
53  template<>
54  struct Promote<int, double>
55  {
56  typedef double Type;
57  };
58 
59  template<>
60  struct Promote<double, int>
61  {
62  typedef double Type;
63  };
64 
65  template<>
66  struct Promote<float, double>
67  {
68  typedef double Type;
69  };
70 
71  template<>
72  struct Promote<double, float>
73  {
74  typedef double Type;
75  };
76 
77  template<>
78  struct Promote< int, std::complex<float> >
79  {
80  typedef std::complex<float> Type;
81  };
82 
83  template<>
84  struct Promote< std::complex<float>, int >
85  {
86  typedef std::complex<float> Type;
87  };
88 
89  template<>
90  struct Promote< float, std::complex<float> >
91  {
92  typedef std::complex<float> Type;
93  };
94 
95  template<>
96  struct Promote< int, std::complex<double> >
97  {
98  typedef std::complex<double> Type;
99  };
100 
101  template<>
102  struct Promote< std::complex<double>, int >
103  {
104  typedef std::complex<double> Type;
105  };
106 
107  template<>
108  struct Promote< float, std::complex<double> >
109  {
110  typedef std::complex<double> Type;
111  };
112 
113  template<>
114  struct Promote< std::complex<double>, float >
115  {
116  typedef std::complex<double> Type;
117  };
118 
119  template<>
120  struct Promote< double, std::complex<double> >
121  {
122  typedef std::complex<double> Type;
123  };
124 
125  template<>
126  struct Promote< std::complex<double>, double >
127  {
128  typedef std::complex<double> Type;
129  };
130 
131  template<int N, class T>
132  class Matrix
133  {
134  private:
135  T data[N][N];
136 
137  public:
138  Matrix(); // default constructor
139  Matrix(const Matrix<N,T>& mat); // copy constructor
140  Matrix & operator += (const Matrix<N,T>& mat);
141  Matrix & operator -= (const Matrix<N,T>& mat);
142  const T& operator()(int i, int j) const;
143  T& operator()(int i, int j);
144  };
145 
146  template<int N, class T>
148  {
149  for(int i=0; i<N; ++i){
150  for(int j=0; j<N; ++j){
151  data[i][j] = static_cast<T>(0);
152  }
153  }
154  }
155 
156  template<int N, class T>
158  {
159  for(int i=0; i<N; ++i){
160  for(int j=0; j<N; ++j){
161  data[i][j] = mat.data[i][j];
162  }
163  }
164  }
165 
166  template<int N, class T>
167  T& Matrix<N,T>::operator()(int i, int j)
168  {
169  return data[i][j];
170  }
171 
172  template<int N, class T>
173  const T& Matrix<N,T>::operator()(int i, int j) const
174  {
175  return data[i][j];
176  }
177 
178  template<int N, class T>
180  {
181  for(int i=0; i<N; ++i){
182  for(int j=0; j<N; ++j){
183  data[i][j] += mat.data[i][j];
184  }
185  }
186  return *this;
187  }
188 
189  template<int N, class T>
191  {
192  for(int i=0; i<N; ++i){
193  for(int j=0; j<N; ++j){
194  data[i][j] -= mat.data[i][j];
195  }
196  }
197  return *this;
198  }
199 
200  template<int N, class T>
202  {
203  Matrix<N,T> result(a);
204  result += b;
205  return result;
206  }
207 
208  template<int N, class T>
210  {
211  Matrix<N,T> result(a);
212  result -= b;
213  return result;
214  }
215 
216  template<int N, class T>
218  {
219  Matrix<N,T> result;
220  for(int i=0; i<N; ++i){
221  for(int j=0; j<N; ++j){
222  result(i,j) = static_cast<T>(0);
223  for(int k=0; k<N; ++k){
224  result(i,j) += a(i,k)*b(k,j);
225  }
226  }
227  }
228  return result;
229  }
230 
231  template<int N, class T>
232  Matrix<N,std::complex<T> > conj(const Matrix<N,std::complex<T> >& mat)
233  {
234  Matrix<N,std::complex<T> > result;
235  for(int i=0; i<N; ++i){
236  for(int j=0; j<N; ++j){
237  result(i,j) = std::conj(mat(j,i));
238  }
239  }
240  return result;
241  }
242 
243  template<int N, class T>
244  Matrix<N,T> transpose(const Matrix<N,std::complex<T> >& mat)
245  {
246  Matrix<N,T> result;
247  for(int i=0; i<N; ++i){
248  for(int j=0; j<N; ++j){
249  result(i,j) = mat(j,i);
250  }
251  }
252  return result;
253  }
254 
255  template<int N, class T, class U>
257  {
258  typedef typename Promote<T,U>::Type return_type;
259  Matrix<N,return_type> result;
260 
261  for(int i=0; i<N; ++i){
262  for(int j=0; j<N; ++j){
263  result(i,j) = scalar*mat(i,j);
264  }
265  }
266  return result;
267  }
268 
269  template<int N, class T, class U>
271  {
272  return mat*scalar;
273  }
274 
275  template<int N, class T>
276  struct Identity
277  {
279  Matrix<N,T> id;
280  for(int i=0; i<N; ++i){
281  id(i,i) = static_cast<T>(1);
282  }
283  return id;
284  } // operator()
285  };
286 
287  template<int N, class T>
288  struct Zero
289  {
290  // the default constructor zeros all matrix elements
292  return Matrix<N,T>();
293  }
294  };
295 
296 
297  template<int N, class T>
298  std::ostream & operator << (std::ostream & os, const Matrix<N,T> & m)
299  {
300  for(int i=0; i<N; ++i){
301  for(int j=0; j<N; ++j){
302  os << m(i,j) << " ";
303  }
304  if(i<N-1) os << std::endl;
305  }
306  return os;
307  }
308 
309 
310 
311  template<class Real>
312  class LoadStore{
313  private:
314  const int volume;
315  const int half_volume;
316  public:
317  LoadStore(int vol) : volume(vol), half_volume(vol/2) {}
318 
319  void loadMatrixFromField(const Real* const field, int oddBit, int half_lattice_index, Matrix<3, std::complex<Real> >* const mat) const;
320 
321  void loadMatrixFromField(const Real* const field, int oddBit, int dir, int half_lattice_index, Matrix<3, std::complex<Real> >* const mat) const;
322 
323  void storeMatrixToField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int half_lattice_index, Real* const field) const;
324 
325  void addMatrixToField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int half_lattice_index, Real coeff, Real* const) const;
326 
327  void addMatrixToField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real* const) const;
328 
329  void storeMatrixToMomentumField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real* const) const;
330  Real getData(const Real* const field, int idx, int dir, int oddBit, int offset, int hfv) const;
331  void addData(Real* const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const;
332  int half_idx_conversion_ex2normal(int half_lattice_index, const int* dim, int oddBit) const ;
333  int half_idx_conversion_normal2ex(int half_lattice_index, const int* dim, int oddBit) const ;
334  };
335 
336 template<class Real>
337 int LoadStore<Real>::half_idx_conversion_ex2normal(int half_lattice_index_ex, const int* dim, int oddBit) const
338 {
339  int X1=dim[0];
340  int X2=dim[1];
341  int X3=dim[2];
342  //int X4=dim[3];
343  int X1h=X1/2;
344 
345  int E1=dim[0]+4;
346  int E2=dim[1]+4;
347  int E3=dim[2]+4;
348  //int E4=dim[3]+4;
349  int E1h=E1/2;
350 
351  int sid = half_lattice_index_ex;
352 
353  int za = sid/E1h;
354  int x1h = sid - za*E1h;
355  int zb = za/E2;
356  int x2 = za - zb*E2;
357  int x4 = zb/E3;
358  int x3 = zb - x4*E3;
359  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
360  int x1 = 2*x1h + x1odd;
361 
362  int idx = ((x4-2)*X3*X2*X1 + (x3-2)*X2*X1+(x2-2)*X1+(x1-2))/2;
363  return idx;
364 }
365 
366 template<class Real>
367 int LoadStore<Real>::half_idx_conversion_normal2ex(int half_lattice_index, const int* dim, int oddBit) const
368 {
369  int X1=dim[0];
370  int X2=dim[1];
371  int X3=dim[2];
372  //int X4=dim[3];
373  int X1h=X1/2;
374 
375  int E1=dim[0]+4;
376  int E2=dim[1]+4;
377  int E3=dim[2]+4;
378  //int E4=dim[3]+4;
379  //int E1h=E1/2;
380 
381  int sid = half_lattice_index;
382 
383  int za = sid/X1h;
384  int x1h = sid - za*X1h;
385  int zb = za/X2;
386  int x2 = za - zb*X2;
387  int x4 = zb/X3;
388  int x3 = zb - x4*X3;
389  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
390  int x1 = 2*x1h + x1odd;
391 
392  int idx = ((x4+2)*E3*E2*E1 + (x3+2)*E2*E1+(x2+2)*E1+(x1+2))/2;
393 
394  return idx;
395 }
396 
397 template<class Real>
398 Real LoadStore<Real>::getData(const Real* const field, int idx, int dir, int oddBit, int offset, int hfv) const
399 {
401  return field[(4*hfv*oddBit +4*idx + dir)*18+offset];
402  }else{ //QDP format
403  return ((Real**)field)[dir][(hfv*oddBit+idx)*18 +offset];
404  }
405 }
406 template<class Real>
407 void LoadStore<Real>::addData(Real* const field, int idx, int dir, int oddBit, int offset, Real v, int hfv) const
408 {
410  field[(4*hfv*oddBit +4*idx + dir)*18+offset] += v;
411  }else{ //QDP format
412  ((Real**)field)[dir][(hfv*oddBit+idx)*18 +offset] += v;
413  }
414 }
415 
416 
417 
418 template<class Real>
419  void LoadStore<Real>::loadMatrixFromField(const Real* const field,
420  int oddBit,
421  int half_lattice_index,
422  Matrix<3, std::complex<Real> >* const mat
423  ) const
424 {
425 #ifdef MULTI_GPU
426  int hfv = Vh_ex;
427 #else
428  int hfv = Vh;
429 #endif
430 
431  int offset = 0;
432  for(int i=0; i<3; ++i){
433  for(int j=0; j<3; ++j){
434  (*mat)(i,j) = (*(field + (oddBit*hfv + half_lattice_index)*18 + offset++));
435  (*mat)(i,j) += std::complex<Real>(0, *(field + (oddBit*hfv + half_lattice_index)*18 + offset++));
436  }
437  }
438  return;
439  }
440 
441  template<class Real>
442  void LoadStore<Real>::loadMatrixFromField(const Real* const field,
443  int oddBit,
444  int dir,
445  int half_lattice_index,
446  Matrix<3, std::complex<Real> >* const mat
447  ) const
448  {
449 #ifdef MULTI_GPU
450  int hfv = Vh_ex;
451 #else
452  int hfv = Vh;
453 #endif
454 
455  //const Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18;
456  int offset = 0;
457  for(int i=0; i<3; ++i){
458  for(int j=0; j<3; ++j){
459  (*mat)(i,j) = (getData(field, half_lattice_index, dir, oddBit, offset++, hfv));
460  (*mat)(i,j) += std::complex<Real>(0, getData(field, half_lattice_index, dir, oddBit, offset++, hfv));
461  }
462  }
463  return;
464  }
465 
466  template<class Real>
467  void LoadStore<Real>::storeMatrixToField(const Matrix<3, std::complex<Real> >& mat,
468  int oddBit,
469  int half_lattice_index,
470  Real* const field) const
471  {
472 #ifdef MULTI_GPU
473  int hfv = Vh_ex;
474 #else
475  int hfv = Vh;
476 #endif
477 
478  int offset = 0;
479  for(int i=0; i<3; ++i){
480  for(int j=0; j<3; ++j){
481  *(field + (oddBit*hfv + half_lattice_index)*18 + offset++) = (mat)(i,j).real();
482  *(field + (oddBit*hfv + half_lattice_index)*18 + offset++) = (mat)(i,j).imag();
483  }
484  }
485  return;
486  }
487 
488  template<class Real>
489  void LoadStore<Real>::addMatrixToField(const Matrix<3, std::complex<Real> >& mat,
490  int oddBit,
491  int half_lattice_index,
492  Real coeff,
493  Real* const field) const
494  {
495 #ifdef MULTI_GPU
496  int hfv = Vh_ex;
497 #else
498  int hfv = Vh;
499 #endif
500  Real* const local_field = field + (oddBit*hfv + half_lattice_index)*18;
501 
502 
503  int offset = 0;
504  for(int i=0; i<3; ++i){
505  for(int j=0; j<3; ++j){
506  local_field[offset++] += coeff*mat(i,j).real();
507  local_field[offset++] += coeff*mat(i,j).imag();
508  }
509  }
510  return;
511  }
512 
513 
514  template<class Real>
515  void LoadStore<Real>::addMatrixToField(const Matrix<3, std::complex<Real> >& mat,
516  int oddBit,
517  int dir,
518  int half_lattice_index,
519  Real coeff,
520  Real* const field) const
521  {
522 
523 #ifdef MULTI_GPU
524  int hfv = Vh_ex;
525 #else
526  int hfv = Vh;
527 #endif
528 
529  //Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18;
530  int offset = 0;
531  for(int i=0; i<3; ++i){
532  for(int j=0; j<3; ++j){
533  //local_field[offset++] += coeff*mat(i,j).real();
534  addData(field, half_lattice_index, dir, oddBit, offset++, coeff*mat(i,j).real(), hfv);
535 
536  //local_field[offset++] += coeff*mat(i,j).imag();
537  addData(field, half_lattice_index, dir, oddBit, offset++, coeff*mat(i,j).imag(), hfv);
538  }
539  }
540  return;
541  }
542 
543 
544  template<class Real>
545  void LoadStore<Real>::storeMatrixToMomentumField(const Matrix<3, std::complex<Real> >& mat,
546  int oddBit,
547  int dir,
548  int half_lattice_index,
549  Real coeff,
550  Real* const field) const
551  {
552  Real* const mom_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*10;
553  mom_field[0] = (mat(0,1).real() - mat(1,0).real())*0.5*coeff;
554  mom_field[1] = (mat(0,1).imag() + mat(1,0).imag())*0.5*coeff;
555 
556  mom_field[2] = (mat(0,2).real() - mat(2,0).real())*0.5*coeff;
557  mom_field[3] = (mat(0,2).imag() + mat(2,0).imag())*0.5*coeff;
558 
559  mom_field[4] = (mat(1,2).real() - mat(2,1).real())*0.5*coeff;
560  mom_field[5] = (mat(1,2).imag() + mat(2,1).imag())*0.5*coeff;
561 
562  const Real temp = (mat(0,0).imag() + mat(1,1).imag() + mat(2,2).imag())*0.3333333333333333333;
563  mom_field[6] = (mat(0,0).imag() - temp)*coeff;
564  mom_field[7] = (mat(1,1).imag() - temp)*coeff;
565  mom_field[8] = (mat(2,2).imag() - temp)*coeff;
566  mom_field[9] = 0.0;
567  return;
568  }
569 
570 
571 
572  template<int oddBit>
573  struct Locator
574  {
575  private:
576  int local_dim[4];
577  int volume;
578  int half_index;
579  int full_index;
580  int full_coord[4];
581  void getCoordsFromHalfIndex(int half_index, int coord[4]);
582  void getCoordsFromFullIndex(int full_index, int coord[4]);
583  void cache(int half_lattice_index); // caches the half-lattice index, full-lattice index,
584  // and full-lattice coordinates
585 
586 
587  public:
588  Locator(const int dim[4]);
589  int getFullFromHalfIndex(int half_lattice_index);
590  int getNeighborFromFullIndex(int full_lattice_index, int dir, int* err=NULL);
591  };
592 
593  template<int oddBit>
594  Locator<oddBit>::Locator(const int dim[4]) : half_index(-1), full_index(-1){
595  volume = 1;
596  for(int dir=0; dir<4; ++dir){
597  local_dim[dir] = dim[dir];
598  volume *= local_dim[dir];
599  }
600  }
601 
602  // Store the half_lattice index, works out and stores the full lattice index
603  // and the coordinates.
604  template<int oddBit>
605  void Locator<oddBit>::getCoordsFromHalfIndex(int half_lattice_index, int coord[4])
606  {
607 #ifdef MULTI_GPU
608  int E1 = local_dim[0]+4;
609  int E2 = local_dim[1]+4;
610  int E3 = local_dim[2]+4;
611  //int E4 = local_dim[3]+4;
612  int E1h = E1/2;
613 
614  int z1 = half_lattice_index/E1h;
615  int x1h = half_lattice_index - z1*E1h;
616  int z2 = z1/E2;
617  coord[1] = z1 - z2*E2;
618  coord[3] = z2/E3;
619  coord[2] = z2 - coord[3]*E3;
620  int x1odd = (coord[1] + coord[2] + coord[3] + oddBit) & 1;
621  coord[0] = 2*x1h + x1odd;
622 #else
623  int half_dim_0 = local_dim[0]/2;
624  int z1 = half_lattice_index/half_dim_0;
625  int x1h = half_lattice_index - z1*half_dim_0;
626  int z2 = z1/local_dim[1];
627  coord[1] = z1 - z2*local_dim[1];
628  coord[3] = z2/local_dim[2];
629  coord[2] = z2 - coord[3]*local_dim[2];
630  int x1odd = (coord[1] + coord[2] + coord[3] + oddBit) & 1;
631  coord[0] = 2*x1h + x1odd;
632 #endif
633 
634  }
635 
636  template<int oddBit>
637  void Locator<oddBit>::getCoordsFromFullIndex(int full_lattice_index, int coord[4])
638  {
639 #ifdef MULTI_GPU
640  int D1=local_dim[0]+4;
641  int D2=local_dim[1]+4;
642  int D3=local_dim[2]+4;
643  //int D4=local_dim[3]+4;
644  //int D1h=D1/2;
645 #else
646  int D1=local_dim[0];
647  int D2=local_dim[1];
648  int D3=local_dim[2];
649  //int D4=local_dim[3];
650  //int D1h=D1/2;
651 #endif
652 
653  int z1 = full_lattice_index/D1;
654  coord[0] = full_lattice_index - z1*D1;
655  int z2 = z1/D2;
656  coord[1] = z1 - z2*D2;
657  coord[3] = z2/D3;
658  coord[2] = z2 - coord[3]*D3;
659  }
660 
661 
662 
663 
664 
665  template<int oddBit>
666  void Locator<oddBit>::cache(int half_lattice_index)
667  {
668  half_index = half_lattice_index;
669  getCoordsFromHalfIndex(half_lattice_index, full_coord);
670  int x1odd = (full_coord[1] + full_coord[2] + full_coord[3] + oddBit) & 1;
671  full_index = 2*half_lattice_index + x1odd;
672  return;
673  }
674 
675  template<int oddBit>
676  int Locator<oddBit>::getFullFromHalfIndex(int half_lattice_index)
677  {
678  if(half_index != half_lattice_index) cache(half_lattice_index);
679  return full_index;
680  }
681 
682  // From full index return the neighbouring full index
683  template<int oddBit>
684  int Locator<oddBit>::getNeighborFromFullIndex(int full_lattice_index, int dir, int* err)
685  {
686  if(err) *err = 0;
687 
688  int coord[4];
689  int neighbor_index;
690  getCoordsFromFullIndex(full_lattice_index, coord);
691 #ifdef MULTI_GPU
692  int E1 = local_dim[0] + 4;
693  int E2 = local_dim[1] + 4;
694  int E3 = local_dim[2] + 4;
695  int E4 = local_dim[3] + 4;
696  switch(dir){
697  case 0: //+X
698  neighbor_index = full_lattice_index + 1;
699  if(err && (coord[0] == E1-1) ) *err = 1;
700  break;
701  case 1: //+Y
702  neighbor_index = full_lattice_index + E1;
703  if(err && (coord[1] == E2-1) ) *err = 1;
704  break;
705  case 2: //+Z
706  neighbor_index = full_lattice_index + E2*E1;
707  if(err && (coord[2] == E3-1) ) *err = 1;
708  break;
709  case 3: //+T
710  neighbor_index = full_lattice_index + E3*E2*E1;
711  if(err && (coord[3] == E4-1) ) *err = 1;
712  break;
713  case 7: //-X
714  neighbor_index = full_lattice_index - 1;
715  if(err && (coord[0] == 0) ) *err = 1;
716  break;
717  case 6: //-Y
718  neighbor_index = full_lattice_index - E1;
719  if(err && (coord[1] == 0) ) *err = 1;
720  break;
721  case 5: //-Z
722  neighbor_index = full_lattice_index - E2*E1;
723  if(err && (coord[2] == 0) ) *err = 1;
724  break;
725  case 4: //-T
726  neighbor_index = full_lattice_index - E3*E2*E1;
727  if(err && (coord[3] == 0) ) *err = 1;
728  break;
729  default:
730  errorQuda("Neighbor index could not be determined\n");
731  exit(1);
732  break;
733  } // switch(dir)
734 
735 #else
736  switch(dir){
737  case 0:
738  neighbor_index = (coord[0] == local_dim[0]-1) ? full_lattice_index + 1 - local_dim[0] : full_lattice_index + 1;
739  break;
740  case 1:
741  neighbor_index = (coord[1] == local_dim[1]-1) ? full_lattice_index + local_dim[0]*(1 - local_dim[1]) : full_lattice_index + local_dim[0];
742  break;
743  case 2:
744  neighbor_index = (coord[2] == local_dim[2]-1) ? full_lattice_index + local_dim[0]*local_dim[1]*(1 - local_dim[2]) : full_lattice_index + local_dim[0]*local_dim[1];
745  break;
746  case 3:
747  neighbor_index = (coord[3] == local_dim[3]-1) ? full_lattice_index + local_dim[0]*local_dim[1]*local_dim[2]*(1-local_dim[3]) : full_lattice_index + local_dim[0]*local_dim[1]*local_dim[2];
748  break;
749  case 7:
750  neighbor_index = (coord[0] == 0) ? full_lattice_index - 1 + local_dim[0] : full_lattice_index - 1;
751  break;
752  case 6:
753  neighbor_index = (coord[1] == 0) ? full_lattice_index - local_dim[0]*(1 - local_dim[1]) : full_lattice_index - local_dim[0];
754  break;
755  case 5:
756  neighbor_index = (coord[2] == 0) ? full_lattice_index - local_dim[0]*local_dim[1]*(1 - local_dim[2]) : full_lattice_index - local_dim[0]*local_dim[1];
757  break;
758  case 4:
759  neighbor_index = (coord[3] == 0) ? full_lattice_index - local_dim[0]*local_dim[1]*local_dim[2]*(1 - local_dim[3]) : full_lattice_index - local_dim[0]*local_dim[1]*local_dim[2];
760  break;
761  default:
762  errorQuda("Neighbor index could not be determined\n");
763  exit(1);
764  break;
765  } // switch(dir)
766  if(err) *err = 0;
767 #endif
768  return neighbor_index;
769  }
770 
771 // Can't typedef a template
772 template<class Real>
774 {
776 };
777 
778  template<class Real, int oddBit>
779  void computeOneLinkSite(const int dim[4],
780  int half_lattice_index,
781  const Real* const oprod,
782  int sig, Real coeff,
783  const LoadStore<Real>& ls,
784  Real* const output)
785  {
786  if( GOES_FORWARDS(sig) ){
787  typename ColorMatrix<Real>::Type colorMatW;
788 #ifdef MULTI_GPU
789  int idx = ls.half_idx_conversion_normal2ex(half_lattice_index, dim, oddBit);
790 #else
791  int idx = half_lattice_index;
792 #endif
793  ls.loadMatrixFromField(oprod, oddBit, sig, idx, &colorMatW);
794  ls.addMatrixToField(colorMatW, oddBit, sig, idx, coeff, output);
795  }
796  return;
797  }
798 
799  template<class Real>
800  void computeOneLinkField(const int dim[4],
801  const Real* const oprod,
802  int sig, Real coeff,
803  Real* const output)
804  {
805  int volume = 1;
806  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
807  const int half_volume = volume/2;
808  LoadStore<Real> ls(volume);
809  for(int site=0; site<half_volume; ++site){
810  computeOneLinkSite<Real,0>(dim, site,
811  oprod,
812  sig, coeff, ls,
813  output);
814 
815  }
816  // Loop over odd lattice sites
817  for(int site=0; site<half_volume; ++site){
818  computeOneLinkSite<Real,1>(dim, site,
819  oprod,
820  sig, coeff, ls,
821  output);
822  }
823  return;
824  }
825 
826 
827 
828 
829 
830 
831 
832 
833 
834  // middleLinkKernel compiles for now, but lots of debugging to be done
835  template<class Real, int oddBit>
836  void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.
837  const int dim[4],
838  const Real* const oprod,
839  const Real* const Qprev,
840  const Real* const link,
841  int sig, int mu,
842  Real coeff,
843  const LoadStore<Real>& ls, // pass a function object to read from and write to matrix fields
844  Real* const Pmu,
845  Real* const P3,
846  Real* const Qmu,
847  Real* const newOprod
848  )
849  {
850  const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;
851  const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false;
852 
853 
854  Locator<oddBit> locator(dim);
855  int point_b, point_c, point_d;
857  int X = locator.getFullFromHalfIndex(half_lattice_index);
858 
859  int err;
860  int new_mem_idx = locator.getNeighborFromFullIndex(X,OPP_DIR(mu), &err); RETURN_IF_ERR;
861  point_d = new_mem_idx >> 1;
862  // getNeighborFromFullIndex will work on any site on the lattice, odd or even
863  new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx,sig, &err); RETURN_IF_ERR;
864  point_c = new_mem_idx >> 1;
865 
866  new_mem_idx = locator.getNeighborFromFullIndex(X,sig); RETURN_IF_ERR;
867  point_b = new_mem_idx >> 1;
868 
869  ad_link_nbr_idx = (mu_positive) ? point_d : half_lattice_index;
870  bc_link_nbr_idx = (mu_positive) ? point_c : point_b;
871  ab_link_nbr_idx = (sig_positive) ? half_lattice_index : point_b;
872 
874  typename ColorMatrix<Real>::Type colorMatW, colorMatX, colorMatY, colorMatZ;
875 
876 
877 
878 
879 
880  if(sig_positive){
881  ls.loadMatrixFromField(link, oddBit, sig, ab_link_nbr_idx, &ab_link);
882  }else{
883  ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(sig), ab_link_nbr_idx, &ab_link);
884  }
885 
886  if(mu_positive){
887  ls.loadMatrixFromField(link, oddBit, mu, bc_link_nbr_idx, &bc_link);
888  }else{
889  ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(mu), bc_link_nbr_idx, &bc_link);
890  }
891 
892  if(Qprev == NULL){
893  if(sig_positive){
894  ls.loadMatrixFromField(oprod, 1-oddBit, sig, point_d, &colorMatY);
895  }else{
896  ls.loadMatrixFromField(oprod, oddBit, OPP_DIR(sig), point_c, &colorMatY);
897  colorMatY = conj(colorMatY);
898  }
899  }else{ // Qprev != NULL
900  ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY);
901  }
902 
903  colorMatW = (!mu_positive) ? bc_link*colorMatY : conj(bc_link)*colorMatY;
904  if(Pmu) ls.storeMatrixToField(colorMatW, 1-oddBit, point_b, Pmu);
905 
906  colorMatY = (sig_positive) ? ab_link*colorMatW : conj(ab_link)*colorMatW;
907  ls.storeMatrixToField(colorMatY, oddBit, half_lattice_index, P3);
908 
909  if(mu_positive){
910  ls.loadMatrixFromField(link, 1-oddBit, mu, ad_link_nbr_idx, &ad_link);
911  }else{
912  ls.loadMatrixFromField(link, oddBit, OPP_DIR(mu), ad_link_nbr_idx, &ad_link);
913  ad_link = conj(ad_link);
914  }
915 
916 
917  if(Qprev == NULL){
918  if(sig_positive) colorMatY = colorMatW*ad_link;
919  if(Qmu) ls.storeMatrixToField(ad_link, oddBit, half_lattice_index, Qmu);
920  }else{ // Qprev != NULL
921  if(Qmu || sig_positive){
922  ls.loadMatrixFromField(Qprev, 1-oddBit, point_d, &colorMatY);
923  colorMatX= colorMatY*ad_link;
924  }
925  if(Qmu) ls.storeMatrixToField(colorMatX, oddBit, half_lattice_index, Qmu);
926  if(sig_positive) colorMatY = colorMatW*colorMatX;
927  }
928 
929  if(sig_positive) ls.addMatrixToField(colorMatY, oddBit, sig, half_lattice_index, coeff, newOprod);
930 
931  return;
932  } // computeMiddleLinkSite
933 
934 
935  template<class Real>
936  void computeMiddleLinkField(const int dim[4],
937  const Real* const oprod,
938  const Real* const Qprev,
939  const Real* const link,
940  int sig, int mu,
941  Real coeff,
942  Real* const Pmu,
943  Real* const P3,
944  Real* const Qmu,
945  Real* const newOprod
946  )
947  {
948 
949  int volume = 1;
950  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
951 #ifdef MULTI_GPU
952  const int loop_count = Vh_ex;
953 #else
954  const int loop_count = volume/2;
955 #endif
956  // loop over the lattice volume
957  // To keep the code as close to the GPU code as possible, we'll
958  // loop over the even sites first and then the odd sites
959  LoadStore<Real> ls(volume);
960  for(int site=0; site<loop_count; ++site){
961  computeMiddleLinkSite<Real, 0>(site, dim,
962  oprod, Qprev, link,
963  sig, mu, coeff,
964  ls,
965  Pmu, P3, Qmu, newOprod);
966  }
967  // Loop over odd lattice sites
968  for(int site=0; site<loop_count; ++site){
969  computeMiddleLinkSite<Real,1>(site, dim,
970  oprod, Qprev, link,
971  sig, mu, coeff,
972  ls,
973  Pmu, P3, Qmu, newOprod);
974  }
975  return;
976  }
977 
978 
979 
980 
981  template<class Real, int oddBit>
982  void computeSideLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.
983  const int dim[4],
984  const Real* const P3,
985  const Real* const Qprod, // why?
986  const Real* const link,
987  int sig, int mu,
988  Real coeff, Real accumu_coeff,
989  const LoadStore<Real>& ls, // pass a function object to read from and write to matrix fields
990  Real* const shortP,
991  Real* const newOprod
992  )
993  {
994 
995  const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;
996  const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false;
997 
998  Locator<oddBit> locator(dim);
999  int point_d;
1000  int ad_link_nbr_idx;
1001  int X = locator.getFullFromHalfIndex(half_lattice_index);
1002 
1003  int err;
1004  int new_mem_idx = locator.getNeighborFromFullIndex(X,OPP_DIR(mu), &err); RETURN_IF_ERR;
1005  point_d = new_mem_idx >> 1;
1006  ad_link_nbr_idx = (mu_positive) ? point_d : half_lattice_index;
1007 
1008 
1010  typename ColorMatrix<Real>::Type colorMatW, colorMatX, colorMatY;
1011 
1012  ls.loadMatrixFromField(P3, oddBit, half_lattice_index, &colorMatY);
1013 
1014  if(shortP){
1015  if(mu_positive){
1016  ad_link_nbr_idx = point_d;
1017  ls.loadMatrixFromField(link, 1-oddBit, mu, ad_link_nbr_idx, &ad_link);
1018  }else{
1019  ad_link_nbr_idx = half_lattice_index;
1020  ls.loadMatrixFromField(link, oddBit, OPP_DIR(mu), ad_link_nbr_idx, &ad_link);
1021  }
1022  colorMatW = (mu_positive) ? ad_link*colorMatY : conj(ad_link)*colorMatY;
1023  ls.addMatrixToField(colorMatW, 1-oddBit, point_d, accumu_coeff, shortP);
1024  } // if(shortP)
1025 
1026 
1027  Real mycoeff = ( (sig_positive && oddBit) || (!sig_positive && !oddBit) ) ? coeff : -coeff;
1028 
1029  if(Qprod){
1030  ls.loadMatrixFromField(Qprod, 1-oddBit, point_d, &colorMatX);
1031  if(mu_positive){
1032  colorMatW = colorMatY*colorMatX;
1033  if(!oddBit){ mycoeff = -mycoeff; }
1034  ls.addMatrixToField(colorMatW, 1-oddBit, mu, point_d, mycoeff, newOprod);
1035  }else{
1036  colorMatW = conj(colorMatX)*conj(colorMatY);
1037  if(oddBit){ mycoeff = -mycoeff; }
1038  ls.addMatrixToField(colorMatW, oddBit, OPP_DIR(mu), half_lattice_index, mycoeff, newOprod);
1039  }
1040  }
1041 
1042  if(!Qprod){
1043  if(mu_positive){
1044  if(!oddBit){ mycoeff = -mycoeff; }
1045  ls.addMatrixToField(colorMatY, 1-oddBit, mu, point_d, mycoeff, newOprod);
1046  }else{
1047  if(oddBit){ mycoeff = -mycoeff; }
1048  colorMatW = conj(colorMatY);
1049  ls.addMatrixToField(colorMatW, oddBit, OPP_DIR(mu), half_lattice_index, mycoeff, newOprod);
1050  }
1051  } // if !(Qprod)
1052 
1053  return;
1054  } // computeSideLinkSite
1055 
1056 
1057  // Maybe change to computeSideLinkField
1058  template<class Real>
1059  void computeSideLinkField(const int dim[4],
1060  const Real* const P3,
1061  const Real* const Qprod, // why?
1062  const Real* const link,
1063  int sig, int mu,
1064  Real coeff, Real accumu_coeff,
1065  Real* const shortP,
1066  Real* const newOprod
1067  )
1068  {
1069  // Need some way of setting half_volume
1070  int volume = 1;
1071  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
1072 #ifdef MULTI_GPU
1073  const int loop_count = Vh_ex;
1074 #else
1075  const int loop_count = volume/2;
1076 #endif
1077  LoadStore<Real> ls(volume);
1078 
1079  for(int site=0; site<loop_count; ++site){
1080  computeSideLinkSite<Real,0>(site, dim,
1081  P3, Qprod, link,
1082  sig, mu,
1083  coeff, accumu_coeff,
1084  ls, shortP, newOprod);
1085  }
1086 
1087  for(int site=0; site<loop_count; ++site){
1088  computeSideLinkSite<Real,1>(site, dim,
1089  P3, Qprod, link,
1090  sig, mu,
1091  coeff, accumu_coeff,
1092  ls, shortP, newOprod);
1093  }
1094 
1095  return;
1096  }
1097 
1098 
1099 
1100 
1101 
1102  template<class Real, int oddBit>
1103  void computeAllLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.
1104  const int dim[4],
1105  const Real* const oprod,
1106  const Real* const Qprev,
1107  const Real* const link,
1108  int sig, int mu,
1109  Real coeff, Real accumu_coeff,
1110  const LoadStore<Real>& ls, // pass a function object to read from and write to matrix fields
1111  Real* const shortP,
1112  Real* const newOprod)
1113  {
1114 
1115  const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;
1116  const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false;
1117 
1119  typename ColorMatrix<Real>::Type colorMatW, colorMatX, colorMatY, colorMatZ;
1120 
1121 
1123 
1124  Locator<oddBit> locator(dim);
1125  int X = locator.getFullFromHalfIndex(half_lattice_index);
1126 
1127  int err;
1128  int new_mem_idx = locator.getNeighborFromFullIndex(X,OPP_DIR(mu), &err); RETURN_IF_ERR;
1129  point_d = new_mem_idx >> 1;
1130 
1131  new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx,sig, &err); RETURN_IF_ERR;
1132  point_c = new_mem_idx >> 1;
1133 
1134  new_mem_idx = locator.getNeighborFromFullIndex(X,sig, &err); RETURN_IF_ERR;
1135  point_b = new_mem_idx >> 1;
1136  ab_link_nbr_idx = (sig_positive) ? half_lattice_index : point_b;
1137 
1138 
1139 
1140  Real mycoeff = ( (sig_positive && oddBit) || (!sig_positive && !oddBit) ) ? coeff : -coeff;
1141 
1142  if(mu_positive){
1143  ls.loadMatrixFromField(Qprev, 1-oddBit, point_d, &colorMatX);
1144  ls.loadMatrixFromField(link, 1-oddBit, mu, point_d, &ad_link);
1145  // compute point_c
1146  ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY);
1147  ls.loadMatrixFromField(link, oddBit, mu, point_c, &bc_link);
1148  colorMatZ = conj(bc_link)*colorMatY; // okay
1149 
1150  if(sig_positive)
1151  {
1152  colorMatY = colorMatX*ad_link;
1153  colorMatW = colorMatZ*colorMatY;
1154  ls.addMatrixToField(colorMatW, oddBit, sig, half_lattice_index, Sign<oddBit>::result*mycoeff, newOprod);
1155  }
1156 
1157  if(sig_positive){
1158  ls.loadMatrixFromField(link, oddBit, sig, ab_link_nbr_idx, &ab_link);
1159  }else{
1160  ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(sig), ab_link_nbr_idx, &ab_link);
1161  }
1162  colorMatY = (sig_positive) ? ab_link*colorMatZ : conj(ab_link)*colorMatZ; // okay
1163  colorMatW = colorMatY*colorMatX;
1164  ls.addMatrixToField(colorMatW, 1-oddBit, mu, point_d, -Sign<oddBit>::result*mycoeff, newOprod);
1165  colorMatW = ad_link*colorMatY;
1166  ls.addMatrixToField(colorMatW, 1-oddBit, point_d, accumu_coeff, shortP); // Warning! Need to check this!
1167  }else{ //negative mu
1168  mu = OPP_DIR(mu);
1169  ls.loadMatrixFromField(Qprev, 1-oddBit, point_d, &colorMatX);
1170  ls.loadMatrixFromField(link, oddBit, mu, half_lattice_index, &ad_link);
1171  ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY);
1172  ls.loadMatrixFromField(link, 1-oddBit, mu, point_b, &bc_link);
1173 
1174  if(sig_positive) colorMatW = colorMatX*conj(ad_link);
1175  colorMatZ = bc_link*colorMatY;
1176  if(sig_positive){
1177  colorMatY = colorMatZ*colorMatW;
1178  ls.addMatrixToField(colorMatY, oddBit, sig, half_lattice_index, Sign<oddBit>::result*mycoeff, newOprod);
1179  }
1180 
1181  if(sig_positive){
1182  ls.loadMatrixFromField(link, oddBit, sig, ab_link_nbr_idx, &ab_link);
1183  }else{
1184  ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(sig), ab_link_nbr_idx, &ab_link);
1185  }
1186 
1187  colorMatY = (sig_positive) ? ab_link*colorMatZ : conj(ab_link)*colorMatZ; // 611
1188  colorMatW = conj(colorMatX)*conj(colorMatY);
1189 
1190  ls.addMatrixToField(colorMatW, oddBit, mu, half_lattice_index, Sign<oddBit>::result*mycoeff, newOprod);
1191  colorMatW = conj(ad_link)*colorMatY;
1192  ls.addMatrixToField(colorMatW, 1-oddBit, point_d, accumu_coeff, shortP);
1193  } // end mu
1194  return;
1195  } // allLinkKernel
1196 
1197 
1198 
1199  template<class Real>
1200  void computeAllLinkField(const int dim[4],
1201  const Real* const oprod,
1202  const Real* const Qprev,
1203  const Real* const link,
1204  int sig, int mu,
1205  Real coeff, Real accumu_coeff,
1206  Real* const shortP,
1207  Real* const newOprod)
1208  {
1209  int volume = 1;
1210  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
1211 #ifdef MULTI_GPU
1212  const int loop_count = Vh_ex;
1213 #else
1214  const int loop_count = volume/2;
1215 #endif
1216 
1217  LoadStore<Real> ls(volume);
1218  for(int site=0; site<loop_count; ++site){
1219 
1220  computeAllLinkSite<Real,0>(site, dim,
1221  oprod, Qprev, link,
1222  sig, mu,
1224  ls,
1225  shortP, newOprod);
1226  }
1227 
1228  for(int site=0; site<loop_count; ++site){
1229  computeAllLinkSite<Real, 1>(site, dim,
1230  oprod, Qprev, link,
1231  sig, mu,
1233  ls,
1234  shortP, newOprod);
1235  }
1236 
1237  return;
1238  }
1239 
1240 #define Pmu tempmat[0]
1241 #define P3 tempmat[1]
1242 #define P5 tempmat[2]
1243 #define Pnumu tempmat[3]
1244 #define Qmu tempmat[4]
1245 #define Qnumu tempmat[5]
1246 
1247  template<class Real>
1249  {
1250  Real one;
1251  Real three;
1252  Real five;
1253  Real seven;
1254  Real naik;
1255  Real lepage;
1256  };
1257 
1258 
1259  template<class Real>
1260  void doHisqStaplesForceCPU(const int dim[4],
1261  PathCoefficients<double> staple_coeff,
1262  Real* oprod,
1263  Real* link,
1264  Real** tempmat,
1265  Real* newOprod)
1266  {
1267  Real OneLink, ThreeSt, FiveSt, SevenSt, Lepage, coeff;
1268 
1269  OneLink = staple_coeff.one;
1270  ThreeSt = staple_coeff.three;
1271  FiveSt = staple_coeff.five;
1272  SevenSt = staple_coeff.seven;
1273  Lepage = staple_coeff.lepage;
1274 
1275  for(int sig=0; sig<4; ++sig){
1276  computeOneLinkField(dim,
1277  oprod,
1278  sig, OneLink,
1279  newOprod);
1280  }
1281 
1282 
1283  // sig labels the net displacement of the staple
1284  for(int sig=0; sig<8; ++sig){
1285  for(int mu=0; mu<8; ++mu){
1286  if( mu == sig || mu == OPP_DIR(sig) ) continue;
1287 
1288  computeMiddleLinkField<Real>(dim,
1289  oprod, NULL, link,
1290  sig, mu, -ThreeSt,
1291  Pmu, P3, Qmu,
1292  newOprod);
1293 
1294  for(int nu=0; nu<8; ++nu){
1295  if( nu==mu || nu==OPP_DIR(mu)
1296  || nu==sig || nu==OPP_DIR(sig) ) continue;
1297 
1298  computeMiddleLinkField<Real>(dim,
1299  Pmu, Qmu, link,
1300  sig, nu, staple_coeff.five,
1301  Pnumu, P5, Qnumu,
1302  newOprod);
1303 
1304 
1305  for(int rho=0; rho<8; ++rho){
1306  if( rho == sig || rho == OPP_DIR(sig)
1307  || rho == mu || rho == OPP_DIR(mu)
1308  || rho == nu || rho == OPP_DIR(nu) )
1309  {
1310  continue;
1311  }
1312 
1313  if(FiveSt != 0)coeff = SevenSt/FiveSt; else coeff = 0;
1314  computeAllLinkField<Real>(dim,
1315  Pnumu, Qnumu, link,
1316  sig, rho, staple_coeff.seven, coeff,
1317  P5, newOprod);
1318 
1319  } // rho
1320 
1321  // 5-staple: side link
1322  if(ThreeSt != 0)coeff = FiveSt/ThreeSt; else coeff = 0;
1323  computeSideLinkField<Real>(dim,
1324  P5, Qmu, link,
1325  sig, nu, -FiveSt, coeff,
1326  P3, newOprod);
1327 
1328 
1329  } // nu
1330 
1331 
1332  // lepage
1333  if(staple_coeff.lepage != 0.){
1334  computeMiddleLinkField<Real>(dim,
1335  Pmu, Qmu, link,
1336  sig, mu, Lepage,
1337  NULL, P5, NULL,
1338  newOprod);
1339 
1340  if(ThreeSt != 0)coeff = Lepage/ThreeSt; else coeff = 0;
1341  computeSideLinkField<Real>(dim,
1342  P5, Qmu, link,
1343  sig, mu, -Lepage, coeff,
1344  P3, newOprod);
1345  } // lepage != 0
1346 
1347 
1348  computeSideLinkField<Real>(dim,
1349  P3, NULL, link,
1350  sig, mu, ThreeSt, 0.,
1351  NULL, newOprod);
1352  } // mu
1353  } // sig
1354 
1355  // Need also to compute the one-link contribution
1356  return;
1357  }
1358 
1359 #undef Pmu
1360 #undef P3
1361 #undef P5
1362 #undef Pnumu
1363 #undef Qmu
1364 #undef Qnumu
1365 
1366 
1367  void hisqStaplesForceCPU(const double* path_coeff,
1368  const QudaGaugeParam &param,
1369  cpuGaugeField &oprod,
1370  cpuGaugeField &link,
1371  cpuGaugeField* newOprod)
1372  {
1373  int volume = 1;
1374  for(int dir=0; dir<4; ++dir) volume *= param.X[dir];
1375 
1376 #ifdef MULTI_GPU
1377  int len = Vh_ex*2;
1378 #else
1379  int len = volume;
1380 #endif
1381  // allocate memory for temporary fields
1382  void* tempmat[6];
1383  if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
1384  for(int i=0; i<6; ++i) tempmat[i] = malloc(len*18*sizeof(double));
1385  }else{
1386  for(int i=0; i<6; ++i) tempmat[i] = malloc(len*18*sizeof(float));
1387  }
1388 
1389  PathCoefficients<double> act_path_coeff;
1390  act_path_coeff.one = path_coeff[0];
1391  act_path_coeff.naik = path_coeff[1];
1392  act_path_coeff.three = path_coeff[2];
1393  act_path_coeff.five = path_coeff[3];
1394  act_path_coeff.seven = path_coeff[4];
1395  act_path_coeff.lepage = path_coeff[5];
1396 
1397  if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
1398  doHisqStaplesForceCPU<double>(param.X,
1399  act_path_coeff,
1400  (double*)oprod.Gauge_p(),
1401  (double*)link.Gauge_p(),
1402  (double**)tempmat,
1403  (double*)newOprod->Gauge_p()
1404  );
1405 
1406  }else if(param.cpu_prec == QUDA_SINGLE_PRECISION){
1407  doHisqStaplesForceCPU<float>(param.X,
1408  act_path_coeff,
1409  (float*)oprod.Gauge_p(),
1410  (float*)link.Gauge_p(),
1411  (float**)tempmat,
1412  (float*)newOprod->Gauge_p()
1413  );
1414  }else{
1415  errorQuda("Unsupported precision");
1416  }
1417 
1418  for(int i=0; i<6; ++i){
1419  free(tempmat[i]);
1420  }
1421  return;
1422  }
1423 
1424 
1425 
1426  template<class Real, int oddBit>
1427  void computeLongLinkSite(int half_lattice_index,
1428  const int dim[4],
1429  const Real* const oprod,
1430  const Real* const link,
1431  int sig, Real coeff,
1432  const LoadStore<Real>& ls,
1433  Real* const output)
1434  {
1435  if( GOES_FORWARDS(sig) ){
1436 
1437  Locator<oddBit> locator(dim);
1438 
1440  typename ColorMatrix<Real>::Type colorMatU, colorMatV, colorMatW, colorMatX, colorMatY, colorMatZ;
1441 
1443 #ifdef MULTI_GPU
1444  int idx = ls.half_idx_conversion_normal2ex(half_lattice_index, dim, oddBit);
1445 #else
1446  int idx = half_lattice_index;
1447 #endif
1448 
1449  int X = locator.getFullFromHalfIndex(idx);
1450  point_c = idx;
1451 
1452  int new_mem_idx = locator.getNeighborFromFullIndex(X,sig);
1453  point_d = new_mem_idx >> 1;
1454 
1455  new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx, sig);
1456  point_e = new_mem_idx >> 1;
1457 
1458  new_mem_idx = locator.getNeighborFromFullIndex(X, OPP_DIR(sig));
1459  point_b = new_mem_idx >> 1;
1460 
1461  new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx, OPP_DIR(sig));
1462  point_a = new_mem_idx >> 1;
1463 
1464  ls.loadMatrixFromField(link, oddBit, sig, point_a, &ab_link);
1465  ls.loadMatrixFromField(link, 1-oddBit, sig, point_b, &bc_link);
1466  ls.loadMatrixFromField(link, 1-oddBit, sig, point_d, &de_link);
1467  ls.loadMatrixFromField(link, oddBit, sig, point_e, &ef_link);
1468 
1469  ls.loadMatrixFromField(oprod, oddBit, sig, point_c, &colorMatZ);
1470  ls.loadMatrixFromField(oprod, 1-oddBit, sig, point_b, &colorMatY);
1471  ls.loadMatrixFromField(oprod, oddBit, sig, point_a, &colorMatX);
1472 
1473  colorMatV = de_link*ef_link*colorMatZ
1474  - de_link*colorMatY*bc_link
1475  + colorMatX*ab_link*bc_link;
1476 
1477  ls.addMatrixToField(colorMatV, oddBit, sig, point_c, coeff, output);
1478  }
1479  return;
1480  }
1481 
1482  template<class Real>
1483  void computeLongLinkField(const int dim[4],
1484  const Real* const oprod,
1485  const Real* const link,
1486  int sig, Real coeff,
1487  Real* const output)
1488  {
1489  int volume = 1;
1490  for(int dir=0; dir<4; ++dir) volume *= dim[dir];
1491  const int half_volume = volume/2;
1492 
1493  LoadStore<Real> ls(volume);
1494  for(int site=0; site<half_volume; ++site){
1495  computeLongLinkSite<Real,0>(site,
1496  dim,
1497  oprod,
1498  link,
1499  sig, coeff, ls,
1500  output);
1501 
1502  }
1503  // Loop over odd lattice sites
1504  for(int site=0; site<half_volume; ++site){
1505  computeLongLinkSite<Real,1>(site,
1506  dim,
1507  oprod,
1508  link,
1509  sig, coeff, ls,
1510  output);
1511  }
1512  return;
1513  }
1514 
1515 
1517  const QudaGaugeParam &param,
1518  cpuGaugeField &oprod,
1519  cpuGaugeField &link,
1520  cpuGaugeField *newOprod)
1521  {
1522  for(int sig=0; sig<4; ++sig){
1523  if(param.cpu_prec == QUDA_SINGLE_PRECISION){
1524  computeLongLinkField<float>(param.X,
1525  (float*)oprod.Gauge_p(),
1526  (float*)link.Gauge_p(),
1527  sig, coeff,
1528  (float*)newOprod->Gauge_p());
1529  }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
1530  computeLongLinkField<double>(param.X,
1531  (double*)oprod.Gauge_p(),
1532  (double*)link.Gauge_p(),
1533  sig, coeff,
1534  (double*)newOprod->Gauge_p());
1535  }else {
1536  errorQuda("Unrecognised precision\n");
1537  }
1538  } // sig
1539  return;
1540  }
1541 
1542 
1543 template<class Real, int oddBit>
1544 void completeForceSite(int half_lattice_index,
1545  const int dim[4],
1546  const Real* const oprod,
1547  const Real* const link,
1548  int sig,
1549  const LoadStore<Real>& ls,
1550  Real* const mom)
1551 {
1552 
1553  typename ColorMatrix<Real>::Type colorMatX, colorMatY, linkW;
1554 
1555 #ifdef MULTI_GPU
1556  int half_lattice_index_ex = ls.half_idx_conversion_normal2ex(half_lattice_index, dim, oddBit);
1557  int idx = half_lattice_index_ex;
1558 #else
1559  int idx = half_lattice_index;
1560 #endif
1561  ls.loadMatrixFromField(link, oddBit, sig, idx, &linkW);
1562  ls.loadMatrixFromField(oprod, oddBit, sig, idx, &colorMatX);
1563 
1564  const Real coeff = (oddBit) ? -1 : 1;
1565  colorMatY = linkW*colorMatX;
1566 
1567  ls.storeMatrixToMomentumField(colorMatY, oddBit, sig, half_lattice_index, coeff, mom);
1568  return;
1569 }
1570 
1571 template <class Real>
1572 void completeForceField(const int dim[4],
1573  const Real* const oprod,
1574  const Real* const link,
1575  int sig,
1576  Real* const mom)
1577 {
1578  int volume = dim[0]*dim[1]*dim[2]*dim[3];
1579  const int half_volume = volume/2;
1580  LoadStore<Real> ls(volume);
1581 
1582 
1583  for(int site=0; site<half_volume; ++site){
1584  completeForceSite<Real,0>(site,
1585  dim,
1586  oprod, link,
1587  sig,
1588  ls,
1589  mom);
1590 
1591  }
1592  for(int site=0; site<half_volume; ++site){
1593  completeForceSite<Real,1>(site,
1594  dim,
1595  oprod, link,
1596  sig,
1597  ls,
1598  mom);
1599  }
1600  return;
1601 }
1602 
1603 
1605  cpuGaugeField &oprod,
1606  cpuGaugeField &link,
1607  cpuGaugeField* mom)
1608  {
1609  for(int sig=0; sig<4; ++sig){
1610  if(param.cpu_prec == QUDA_SINGLE_PRECISION){
1611  completeForceField<float>(param.X,
1612  (float*)oprod.Gauge_p(),
1613  (float*)link.Gauge_p(),
1614  sig,
1615  (float*)mom->Gauge_p());
1616  }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
1617  completeForceField<double>(param.X,
1618  (double*)oprod.Gauge_p(),
1619  (double*)link.Gauge_p(),
1620  sig,
1621  (double*)mom->Gauge_p());
1622  }else{
1623  errorQuda("Unrecognised precision\n");
1624  }
1625  } // loop over sig
1626  return;
1627  }
1628 
1629 
1630 //} // namespace quda
1631 
1632 
1633 
1634