QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
llfat_reference.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 
5 #include <quda.h>
6 #include "gauge_field.h"
7 #include <test_util.h>
8 #include <unitarization_links.h>
9 #include "misc.h"
10 #include <string.h>
11 
12 #include <llfat_reference.h>
13 
14 #include <quda_internal.h>
15 #include <complex>
16 
17 #define XUP 0
18 #define YUP 1
19 #define ZUP 2
20 #define TUP 3
21 
22 using namespace quda;
23 
24 
template<typename real> struct su3_matrix { std::complex<real> e[3][3]; }; // 3x3 complex color (SU(3) link) matrix, row-major: e[row][col]
template<typename real> struct su3_vector { std::complex<real> e[3]; }; // 3-component complex color vector (unused in the visible portion of this file)
27 
28 static int Vs[4];
29 static int Vsh[4];
30 
/**
 * Scale a 3x3 complex matrix: b = s * a.
 * @param a  input matrix
 * @param s  real scale factor
 * @param b  output matrix; may alias a
 * (The signature line was lost in extraction; restored from the call sites,
 *  e.g. llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1).)
 */
template <typename su3_matrix, typename Real>
void llfat_scalar_mult_su3_matrix(su3_matrix *a, Real s, su3_matrix *b)
{
  for (int i = 0; i < 3; i++) {
    for (int j = 0; j < 3; j++) { b->e[i][j] = s * a->e[i][j]; }
  }
}
43 
/**
 * Scaled accumulate: c = a + s * b.
 * @param a  first input matrix
 * @param b  matrix to be scaled
 * @param s  real scale factor applied to b
 * @param c  output matrix; may alias a (callers pass c == a when
 *           accumulating into the fat link)
 * (Signature restored; callers use the order (a, b, s, c), e.g.
 *  llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1).)
 */
template <typename su3_matrix, typename Real>
void llfat_scalar_mult_add_su3_matrix(su3_matrix *a, su3_matrix *b, Real s, su3_matrix *c)
{
  for (int i = 0; i < 3; i++) {
    for (int j = 0; j < 3; j++) { c->e[i][j] = a->e[i][j] + s * b->e[i][j]; }
  }
}
54 
/**
 * Multiply by the adjoint on the right: c = a * b^dagger, i.e.
 * c[i][j] = sum_k a[i][k] * conj(b[j][k]).
 * c should not alias a or b (each output entry reads full rows of both).
 * (Signature restored from call sites: llfat_mult_su3_na(&tmat1, C, &staple[i]).)
 */
template <typename su3_matrix>
void llfat_mult_su3_na(su3_matrix *a, su3_matrix *b, su3_matrix *c)
{
  typedef typename std::remove_reference<decltype(a->e[0][0])>::type complex_t;
  for (int i = 0; i < 3; i++) {
    for (int j = 0; j < 3; j++) {
      complex_t x = 0.0;
      for (int k = 0; k < 3; k++) { x += a->e[i][k] * conj(b->e[j][k]); }
      c->e[i][j] = x;
    }
  }
}
70 
/**
 * Plain matrix product: c = a * b, i.e. c[i][j] = sum_k a[i][k] * b[k][j].
 * c should not alias a or b.
 * (Signature restored from call sites: llfat_mult_su3_nn(A, B, &tmat1).)
 */
template <typename su3_matrix>
void llfat_mult_su3_nn(su3_matrix *a, su3_matrix *b, su3_matrix *c)
{
  typedef typename std::remove_reference<decltype(a->e[0][0])>::type complex_t;
  for (int i = 0; i < 3; i++) {
    for (int j = 0; j < 3; j++) {
      complex_t x = 0.0;
      for (int k = 0; k < 3; k++) { x += a->e[i][k] * b->e[k][j]; }
      c->e[i][j] = x;
    }
  }
}
86 
/**
 * Multiply by the adjoint on the left: c = a^dagger * b, i.e.
 * c[i][j] = sum_k conj(a[k][i]) * b[k][j].
 * c should not alias a or b.
 * (Signature restored from call sites: llfat_mult_su3_an(A, B, &tmat1).)
 */
template <typename su3_matrix>
void llfat_mult_su3_an(su3_matrix *a, su3_matrix *b, su3_matrix *c)
{
  typedef typename std::remove_reference<decltype(a->e[0][0])>::type complex_t;
  for (int i = 0; i < 3; i++) {
    for (int j = 0; j < 3; j++) {
      complex_t x = 0.0;
      for (int k = 0; k < 3; k++) { x += conj(a->e[k][i]) * b->e[k][j]; }
      c->e[i][j] = x;
    }
  }
}
102 
103 
104 
105 
106 
/**
 * Element-wise sum: c = a + b.
 * c may alias a or b (callers pass c == a when accumulating a staple).
 * (Signature restored from call sites:
 *  llfat_add_su3_matrix(&staple[i], &tmat2, &staple[i]).)
 */
template <typename su3_matrix>
void llfat_add_su3_matrix(su3_matrix *a, su3_matrix *b, su3_matrix *c)
{
  for (int i = 0; i < 3; i++) {
    for (int j = 0; j < 3; j++) { c->e[i][j] = a->e[i][j] + b->e[i][j]; }
  }
}
116 
117 
118 
119 template<typename su3_matrix, typename Real>
120  void
122  su3_matrix* mulink, su3_matrix** sitelink, void** fatlink, Real coef,
123  int use_staple)
124 {
125  su3_matrix tmat1,tmat2;
126  int i ;
127  su3_matrix *fat1;
128 
129  /* Upper staple */
130  /* Computes the staple :
131  * mu (B)
132  * +-------+
133  * nu | |
134  * (A) | |(C)
135  * X X
136  *
137  * Where the mu link can be any su3_matrix. The result is saved in staple.
138  * if staple==NULL then the result is not saved.
139  * It also adds the computed staple to the fatlink[mu] with weight coef.
140  */
141 
142  int dx[4];
143 
144  /* upper staple */
145 
146  for(i=0;i < V;i++){
147 
148  fat1 = ((su3_matrix*)fatlink[mu]) + i;
149  su3_matrix* A = sitelink[nu] + i;
150 
151  memset(dx, 0, sizeof(dx));
152  dx[nu] =1;
153  int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
154  su3_matrix* B;
155  if (use_staple){
156  B = mulink + nbr_idx;
157  }else{
158  B = mulink + nbr_idx;
159  }
160 
161  memset(dx, 0, sizeof(dx));
162  dx[mu] =1;
163  nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2],dx[1],dx[0]);
164  su3_matrix* C = sitelink[nu] + nbr_idx;
165 
166  llfat_mult_su3_nn( A, B,&tmat1);
167 
168  if(staple!=NULL){/* Save the staple */
169  llfat_mult_su3_na( &tmat1, C, &staple[i]);
170  } else{ /* No need to save the staple. Add it to the fatlinks */
171  llfat_mult_su3_na( &tmat1, C, &tmat2);
172  llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1);
173  }
174  }
175  /***************lower staple****************
176  *
177  * X X
178  * nu | |
179  * (A) | |(C)
180  * +-------+
181  * mu (B)
182  *
183  *********************************************/
184 
185  for(i=0;i < V;i++){
186 
187  fat1 = ((su3_matrix*)fatlink[mu]) + i;
188  memset(dx, 0, sizeof(dx));
189  dx[nu] = -1;
190  int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
191  if (nbr_idx >= V || nbr_idx <0){
192  fprintf(stderr, "ERROR: invliad nbr_idx(%d), line=%d\n", nbr_idx, __LINE__);
193  exit(1);
194  }
195  su3_matrix* A = sitelink[nu] + nbr_idx;
196 
197  su3_matrix* B;
198  if (use_staple){
199  B = mulink + nbr_idx;
200  }else{
201  B = mulink + nbr_idx;
202  }
203 
204  memset(dx, 0, sizeof(dx));
205  dx[mu] = 1;
206  nbr_idx = neighborIndexFullLattice(nbr_idx, dx[3], dx[2],dx[1],dx[0]);
207  su3_matrix* C = sitelink[nu] + nbr_idx;
208 
209  llfat_mult_su3_an( A, B,&tmat1);
210  llfat_mult_su3_nn( &tmat1, C,&tmat2);
211 
212  if(staple!=NULL){/* Save the staple */
213  llfat_add_su3_matrix(&staple[i], &tmat2, &staple[i]);
214  llfat_scalar_mult_add_su3_matrix(fat1, &staple[i], coef, fat1);
215 
216  } else{ /* No need to save the staple. Add it to the fatlinks */
217  llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1);
218  }
219  }
220 
221 } /* compute_gen_staple_site */
222 
223 
224 
225 /* Optimized fattening code for the Asq and Asqtad actions.
226  * I assume that:
227  * path 0 is the one link
228  * path 2 the 3-staple
229  * path 3 the 5-staple
230  * path 4 the 7-staple
231  * path 5 the Lapage term.
232  * Path 1 is the Naik term
233  *
234  */
/* Reference CPU fat-link construction (single-process version).
 *
 * fatlink[dir]     : output fat links, V su3_matrix entries per direction
 * sitelink[dir]    : input thin gauge links
 * act_path_coeff[] : action path coefficients; [0] one-link, [2] 3-staple,
 *                    [3] 5-staple, [4] 7-staple, [5] Lepage
 *
 * NOTE(review): the pass ordering below is load-bearing — `staple' from the
 * 3-staple pass is reused as input to both the Lepage and 5-staple passes,
 * and `tempmat1' from the 5-staple pass feeds the 7-staple pass.
 */
template <typename su3_matrix, typename Float>
void llfat_cpu(void** fatlink, su3_matrix** sitelink, Float* act_path_coeff)
{

  // scratch buffer holding the current staple field for all V sites
  su3_matrix* staple = (su3_matrix *)malloc(V*sizeof(su3_matrix));
  if(staple == NULL){
    fprintf(stderr, "Error: malloc failed for staple in function %s\n", __FUNCTION__);
    exit(1);
  }

  // second scratch buffer: 5-staple results consumed by the 7-staple pass
  su3_matrix* tempmat1 = (su3_matrix *)malloc(V*sizeof(su3_matrix));
  if(tempmat1 == NULL){
    fprintf(stderr, "ERROR: malloc failed for tempmat1 in function %s\n", __FUNCTION__);
    exit(1);
  }

  /* to fix up the Lepage term, included by a trick below */
  Float one_link = (act_path_coeff[0] - 6.0*act_path_coeff[5]);

  for (int dir=XUP; dir<=TUP; dir++){

    /* Intialize fat links with c_1*U_\mu(x) */
    for(int i=0;i < V;i ++){
      su3_matrix* fat1 = ((su3_matrix*)fatlink[dir]) + i;
      llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1 );
    }
  }

  for (int dir=XUP; dir<=TUP; dir++){
    for(int nu=XUP; nu<=TUP; nu++){
      if(nu!=dir){
        // 3-staple: saved in `staple' and accumulated into fatlink[dir]
        llfat_compute_gen_staple_field(staple,dir,nu,sitelink[dir], sitelink,fatlink, act_path_coeff[2], 0);

        /* The Lepage term */
        /* Note this also involves modifying c_1 (above) */

        llfat_compute_gen_staple_field((su3_matrix*)NULL,dir,nu,staple,sitelink, fatlink, act_path_coeff[5],1);

        for(int rho=XUP; rho<=TUP; rho++) {
          if((rho!=dir)&&(rho!=nu)){
            // 5-staple: built from the 3-staple, saved in tempmat1
            llfat_compute_gen_staple_field( tempmat1, dir, rho, staple,sitelink,fatlink, act_path_coeff[3], 1);

            for(int sig=XUP; sig<=TUP; sig++){
              if((sig!=dir)&&(sig!=nu)&&(sig!=rho)){
                // 7-staple: built from the 5-staple, accumulated only
                llfat_compute_gen_staple_field((su3_matrix*)NULL,dir,sig,tempmat1,sitelink,fatlink, act_path_coeff[4], 1);
              }
            }/* sig */

          }

        }/* rho */
      }

    }/* nu */

  }/* dir */


  free(staple);
  free(tempmat1);

}
301 
302 #ifndef MULTI_GPU
/* Reference long-link (Naik term) construction, single-process version:
 * for each site and direction,
 *   longlink[dir](x) = act_path_coeff[1] * U_dir(x) U_dir(x+dir) U_dir(x+2*dir)
 * The coefficient is folded into the first factor, then two link
 * multiplications extend the product by one and two sites.
 */
 template<typename su3_matrix, typename Float>
void computeLongLinkCPU(void** longlink, su3_matrix** sitelink,
    Float* act_path_coeff)
{

  su3_matrix temp;
  for(int dir=XUP; dir<=TUP; ++dir){
    int dx[4] = {0,0,0,0};
    for(int i=0; i<V; ++i){
      // Initialize the longlinks
      su3_matrix* llink = ((su3_matrix*)longlink[dir]) + i;
      // llink = coeff * U_dir(x)
      llfat_scalar_mult_su3_matrix(sitelink[dir]+i, act_path_coeff[1], llink);
      dx[dir] = 1;  // dx is reset implicitly: only dx[dir] is ever non-zero here
      int nbr_idx = neighborIndexFullLattice(Z, i, dx);
      // temp = llink * U_dir(x+dir)
      llfat_mult_su3_nn(llink, sitelink[dir]+nbr_idx, &temp);
      dx[dir] = 2;
      nbr_idx = neighborIndexFullLattice(Z, i, dx);
      // llink = temp * U_dir(x+2*dir)
      llfat_mult_su3_nn(&temp, sitelink[dir]+nbr_idx, llink);
    }
  }
  return;
}
325 #else
/* Reference long-link construction, multi-GPU version.  The input gauge
 * field `sitelinkEx' is an extended field with a 2-site halo in every
 * direction (E[dir] = Z[dir]+4), so all three links of the Naik product can
 * be read without communication.  The result is written at the local
 * (non-extended) even/odd index `little_index'.
 */
 template<typename su3_matrix, typename Float>
void computeLongLinkCPU(void** longlink, su3_matrix** sitelinkEx, Float* act_path_coeff)
{
  int E[4];
  for(int dir=0; dir<4; ++dir) E[dir] = Z[dir]+4;  // extended dimensions


  const int extended_volume = E[3]*E[2]*E[1]*E[0];

  su3_matrix temp;
  for(int t=0; t<Z[3]; ++t){
    for(int z=0; z<Z[2]; ++z){
      for(int y=0; y<Z[1]; ++y){
        for(int x=0; x<Z[0]; ++x){
          const int oddBit = (x+y+z+t)&1;
          // even/odd (checkerboard) index into the local field
          int little_index = ((((t*Z[2] + z)*Z[1] + y)*Z[0] + x)/2) + oddBit*Vh;
          // same site shifted by the 2-site halo, indexed into the extended field
          int large_index = (((((t+2)*E[2] + (z+2))*E[1] + (y+2))*E[0] + x+2)/2) + oddBit*(extended_volume/2);


          for(int dir=XUP; dir<=TUP; ++dir){
            int dx[4] = {0,0,0,0};
            su3_matrix* llink = ((su3_matrix*)longlink[dir]) + little_index;
            // llink = coeff * U_dir(x), then extend by U_dir(x+dir), U_dir(x+2*dir)
            llfat_scalar_mult_su3_matrix(sitelinkEx[dir]+large_index, act_path_coeff[1], llink);
            dx[dir] = 1;
            int nbr_index = neighborIndexFullLattice(E, large_index, dx);
            llfat_mult_su3_nn(llink, sitelinkEx[dir]+nbr_index, &temp);
            dx[dir] = 2;
            nbr_index = neighborIndexFullLattice(E, large_index, dx);
            llfat_mult_su3_nn(&temp, sitelinkEx[dir]+nbr_index, llink);
          }
        } // x
      } // y
    } // z
  } // t
  return;
}
362 #endif
363 
364 
365 void computeLongLinkCPU(void** longlink, void** sitelink, QudaPrecision prec, void* act_path_coeff)
366 {
367  if(longlink){
368  switch(prec){
369  case QUDA_DOUBLE_PRECISION:
370  computeLongLinkCPU((void**)longlink, (su3_matrix<double>**)sitelink, (double*)act_path_coeff);
371  break;
372 
373  case QUDA_SINGLE_PRECISION:
374  computeLongLinkCPU((void**)longlink, (su3_matrix<float>**)sitelink, (float*)act_path_coeff);
375  break;
376  default:
377  fprintf(stderr, "ERROR: unsupported precision(%d)\n", prec);
378  exit(1);
379  break;
380  }
381  } // if(longlink)
382 
383 }
384 
385 
386  void
387 llfat_reference(void** fatlink, void** sitelink, QudaPrecision prec, void* act_path_coeff)
388 {
389  Vs[0] = Vs_x;
390  Vs[1] = Vs_y;
391  Vs[2] = Vs_z;
392  Vs[3] = Vs_t;
393 
394  Vsh[0] = Vsh_x;
395  Vsh[1] = Vsh_y;
396  Vsh[2] = Vsh_z;
397  Vsh[3] = Vsh_t;
398 
399 
400  switch(prec){
402  llfat_cpu((void**)fatlink, (su3_matrix<double>**)sitelink, (double*) act_path_coeff);
403  break;
404 
406  llfat_cpu((void**)fatlink, (su3_matrix<float>**)sitelink, (float*) act_path_coeff);
407  break;
408 
409  default:
410  fprintf(stderr, "ERROR: unsupported precision(%d)\n", prec);
411  exit(1);
412  break;
413 
414  }
415 
416  return;
417 
418 }
419 
420 #ifdef MULTI_GPU
421 
/* Multi-GPU variant of llfat_compute_gen_staple_field: identical staple
 * geometry, but any link whose site falls outside the local sub-lattice is
 * read from the ghost buffers (ghost_mulink / ghost_sitelink /
 * ghost_sitelink_diag) instead of being wrapped around.
 *
 * Layout notes (from the indexing below):
 *  - sites are addressed in even/odd order: i in [0,Vh) is even, [Vh,2Vh) odd;
 *  - space_con[dir] is the half-volume index of the site within the surface
 *    orthogonal to dir, used to address ghost buffers;
 *  - ghost_sitelink[dir] holds back then forward faces, 4 link directions
 *    each (hence the 4*Vs[dir] offset for the forward face);
 *  - ghost_sitelink_diag[nu*4+mu] holds the edge (corner) sites needed when
 *    the neighbour is outside the boundary in both nu and mu.
 */
template<typename su3_matrix, typename Real>
  void
llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu,
    su3_matrix* mulink, su3_matrix** ghost_mulink,
    su3_matrix** sitelink, su3_matrix** ghost_sitelink, su3_matrix** ghost_sitelink_diag,
    void** fatlink, Real coef,
    int use_staple)
{
  su3_matrix tmat1,tmat2;
  int i ;
  su3_matrix *fat1;


  int X1 = Z[0];
  int X2 = Z[1];
  int X3 = Z[2];
  //int X4 = Z[3];
  int X1h =X1/2;

  int X2X1 = X1*X2;
  int X3X2 = X3*X2;
  int X3X1 = X3*X1;

  /* Upper staple */
  /* Computes the staple :
   *                 mu (B)
   *               +-------+
   *       nu      |       |
   *         (A)   |       |(C)
   *               X       X
   *
   * Where the mu link can be any su3_matrix. The result is saved in staple.
   * if staple==NULL then the result is not saved.
   * It also adds the computed staple to the fatlink[mu] with weight coef.
   */

  int dx[4];

  /* upper staple */

  for(i=0;i < V;i++){

    int half_index = i;
    int oddBit =0;
    if (i >= Vh){
      oddBit = 1;
      half_index = i -Vh;
    }
    //int x4 = x4_from_full_index(i);



    // decode (x1,x2,x3,x4) from the even/odd site index
    int sid =half_index;
    int za = sid/X1h;
    int x1h = sid - za*X1h;
    int zb = za/X2;
    int x2 = za - zb*X2;
    int x4 = zb/X3;
    int x3 = zb - x4*X3;
    int x1odd = (x2 + x3 + x4 + oddBit) & 1;
    int x1 = 2*x1h + x1odd;
    int x[4] = {x1,x2,x3,x4};
    // half-volume surface index of this site, per face direction
    int space_con[4]={
      (x4*X3X2+x3*X2+x2)/2,
      (x4*X3X1+x3*X1+x1)/2,
      (x4*X2X1+x2*X1+x1)/2,
      (x3*X2X1+x2*X1+x1)/2
    };

    fat1 = ((su3_matrix*)fatlink[mu]) + i;
    su3_matrix* A = sitelink[nu] + i;

    memset(dx, 0, sizeof(dx));
    dx[nu] =1;
    int nbr_idx;

    su3_matrix* B;
    if (use_staple){
      if (x[nu] + dx[nu] >= Z[nu]){
        // forward face of the staple ghost buffer (neighbour has opposite parity)
        B = ghost_mulink[nu] + Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
      }else{
        nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
        B = mulink + nbr_idx;
      }
    }else{
      if(x[nu]+dx[nu] >= Z[nu]){ //out of boundary, use ghost data
        B = ghost_sitelink[nu] + 4*Vs[nu] + mu*Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
      }else{
        nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
        B = sitelink[mu] + nbr_idx;
      }
    }


    //we could be in the ghost link area if mu is T and we are at high T boundary
    su3_matrix* C;
    memset(dx, 0, sizeof(dx));
    dx[mu] =1;
    if(x[mu] + dx[mu] >= Z[mu]){ //out of boundary, use ghost data
      C = ghost_sitelink[mu] + 4*Vs[mu] + nu*Vs[mu] + (1-oddBit)*Vsh[mu] + space_con[mu];
    }else{
      nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2],dx[1],dx[0]);
      C = sitelink[nu] + nbr_idx;
    }

    // staple product: A(x,nu) * B(x+nu,mu) * C(x+mu,nu)^dagger
    llfat_mult_su3_nn( A, B,&tmat1);

    if(staple!=NULL){/* Save the staple */
      llfat_mult_su3_na( &tmat1, C, &staple[i]);
    } else{ /* No need to save the staple. Add it to the fatlinks */
      llfat_mult_su3_na( &tmat1, C, &tmat2);
      llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1);
    }
  }
  /***************lower staple****************
   *
   *               X       X
   *       nu      |       |
   *         (A)   |       |(C)
   *               +-------+
   *                 mu (B)
   *
   *********************************************/

  for(i=0;i < V;i++){

    int half_index = i;
    int oddBit =0;
    if (i >= Vh){
      oddBit = 1;
      half_index = i -Vh;
    }

    // decode coordinates exactly as in the upper-staple loop
    int sid =half_index;
    int za = sid/X1h;
    int x1h = sid - za*X1h;
    int zb = za/X2;
    int x2 = za - zb*X2;
    int x4 = zb/X3;
    int x3 = zb - x4*X3;
    int x1odd = (x2 + x3 + x4 + oddBit) & 1;
    int x1 = 2*x1h + x1odd;
    int x[4] = {x1,x2,x3,x4};
    int space_con[4]={
      (x4*X3X2+x3*X2+x2)/2,
      (x4*X3X1+x3*X1+x1)/2,
      (x4*X2X1+x2*X1+x1)/2,
      (x3*X2X1+x2*X1+x1)/2
    };

    //int x4 = x4_from_full_index(i);

    fat1 = ((su3_matrix*)fatlink[mu]) + i;

    //we could be in the ghost link area if nu is T and we are at low T boundary
    su3_matrix* A;
    memset(dx, 0, sizeof(dx));
    dx[nu] = -1;

    int nbr_idx;
    if(x[nu] + dx[nu] < 0){ //out of boundary, use ghost data
      A = ghost_sitelink[nu] + nu*Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
    }else{
      nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
      A = sitelink[nu] + nbr_idx;
    }

    su3_matrix* B;
    if (use_staple){
      // NOTE(review): nbr_idx is computed before the boundary check here;
      // it is only consumed on the in-boundary branch below.
      nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
      if (x[nu] + dx[nu] < 0){
        B = ghost_mulink[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
      }else{
        B = mulink + nbr_idx;
      }
    }else{
      if(x[nu] + dx[nu] < 0){ //out of boundary, use ghost data
        B = ghost_sitelink[nu] + mu*Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
      }else{
        nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
        B = sitelink[mu] + nbr_idx;
      }
    }

    //we could be in the ghost link area if nu is T and we are at low T boundary
    // or mu is T and we are on high T boundary
    su3_matrix* C;
    memset(dx, 0, sizeof(dx));
    dx[nu] = -1;
    dx[mu] = 1;
    nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2],dx[1],dx[0]);

    //space con must be recomputed because we have coodinates change in 2 directions
    int new_x1, new_x2, new_x3, new_x4;
    new_x1 = (x[0] + dx[0] + Z[0])%Z[0];
    new_x2 = (x[1] + dx[1] + Z[1])%Z[1];
    new_x3 = (x[2] + dx[2] + Z[2])%Z[2];
    new_x4 = (x[3] + dx[3] + Z[3])%Z[3];
    int new_x[4] = {new_x1, new_x2, new_x3, new_x4};
    space_con[0] = (new_x4*X3X2 + new_x3*X2 + new_x2)/2;
    space_con[1] = (new_x4*X3X1 + new_x3*X1 + new_x1)/2;
    space_con[2] = (new_x4*X2X1 + new_x2*X1 + new_x1)/2;
    space_con[3] = (new_x3*X2X1 + new_x2*X1 + new_x1)/2;

    if( (x[nu] + dx[nu]) < 0 && (x[mu] + dx[mu] >= Z[mu])){
      // corner case: outside the boundary in both nu and mu -> edge ghost
      //find the other 2 directions, dir1, dir2
      //with dir2 the slowest changing direction
      int dir1, dir2; //other two dimensions
      for(dir1=0; dir1 < 4; dir1 ++){
        if(dir1 != nu && dir1 != mu){
          break;
        }
      }
      for(dir2=0; dir2 < 4; dir2 ++){
        if(dir2 != nu && dir2 != mu && dir2 != dir1){
          break;
        }
      }
      C = ghost_sitelink_diag[nu*4+mu] + oddBit*Z[dir1]*Z[dir2]/2 + (new_x[dir2]*Z[dir1]+new_x[dir1])/2;
    }else if (x[nu] + dx[nu] < 0){
      C = ghost_sitelink[nu] + nu*Vs[nu] + oddBit*Vsh[nu]+ space_con[nu];
    }else if (x[mu] + dx[mu] >= Z[mu]){
      C = ghost_sitelink[mu] + 4*Vs[mu] + nu*Vs[mu] + oddBit*Vsh[mu]+space_con[mu];
    }else{
      C = sitelink[nu] + nbr_idx;
    }
    // staple product: A(x-nu,nu)^dagger * B(x-nu,mu) * C(x-nu+mu,nu)
    llfat_mult_su3_an( A, B,&tmat1);
    llfat_mult_su3_nn( &tmat1, C,&tmat2);

    if(staple!=NULL){/* Save the staple */
      llfat_add_su3_matrix(&staple[i], &tmat2, &staple[i]);
      llfat_scalar_mult_add_su3_matrix(fat1, &staple[i], coef, fat1);

    } else{ /* No need to save the staple. Add it to the fatlinks */
      llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1);
    }
  }

} /* compute_gen_staple_site */
661 
662 
663  template <typename su3_matrix, typename Float>
664 void llfat_cpu_mg(void** fatlink, su3_matrix** sitelink, su3_matrix** ghost_sitelink,
665  su3_matrix** ghost_sitelink_diag, Float* act_path_coeff)
666 {
668  if (sizeof(Float) == 4){
669  prec = QUDA_SINGLE_PRECISION;
670  }else{
671  prec = QUDA_DOUBLE_PRECISION;
672  }
673 
674  su3_matrix* staple = (su3_matrix *)malloc(V*sizeof(su3_matrix));
675  if(staple == NULL){
676  fprintf(stderr, "Error: malloc failed for staple in function %s\n", __FUNCTION__);
677  exit(1);
678  }
679 
680 
681  su3_matrix* ghost_staple[4];
682  su3_matrix* ghost_staple1[4];
683 
684  for(int i=0;i < 4;i++){
685  ghost_staple[i] = (su3_matrix*)malloc(2*Vs[i]*sizeof(su3_matrix));
686  if (ghost_staple[i] == NULL){
687  fprintf(stderr, "Error: malloc failed for ghost staple in function %s\n", __FUNCTION__);
688  exit(1);
689  }
690 
691  ghost_staple1[i] = (su3_matrix*)malloc(2*Vs[i]*sizeof(su3_matrix));
692  if (ghost_staple1[i] == NULL){
693  fprintf(stderr, "Error: malloc failed for ghost staple1 in function %s\n", __FUNCTION__);
694  exit(1);
695  }
696  }
697 
698  su3_matrix* tempmat1 = (su3_matrix *)malloc(V*sizeof(su3_matrix));
699  if(tempmat1 == NULL){
700  fprintf(stderr, "ERROR: malloc failed for tempmat1 in function %s\n", __FUNCTION__);
701  exit(1);
702  }
703 
704  /* to fix up the Lepage term, included by a trick below */
705  Float one_link = (act_path_coeff[0] - 6.0*act_path_coeff[5]);
706 
707 
708  for (int dir=XUP; dir<=TUP; dir++){
709 
710  /* Intialize fat links with c_1*U_\mu(x) */
711  for(int i=0;i < V;i ++){
712  su3_matrix* fat1 = ((su3_matrix*)fatlink[dir]) + i;
713  llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1 );
714  }
715  }
716 
717  for (int dir=XUP; dir<=TUP; dir++){
718  for(int nu=XUP; nu<=TUP; nu++){
719  if(nu!=dir){
720  llfat_compute_gen_staple_field_mg(staple,dir,nu,
721  sitelink[dir], (su3_matrix**)NULL,
722  sitelink, ghost_sitelink, ghost_sitelink_diag,
723  fatlink, act_path_coeff[2], 0);
724  /* The Lepage term */
725  /* Note this also involves modifying c_1 (above) */
726 
727  exchange_cpu_staple(Z, staple, (void**)ghost_staple, prec);
728 
729  llfat_compute_gen_staple_field_mg((su3_matrix*)NULL,dir,nu,
730  staple,ghost_staple,
731  sitelink, ghost_sitelink, ghost_sitelink_diag,
732  fatlink, act_path_coeff[5],1);
733 
734  for(int rho=XUP; rho<=TUP; rho++) {
735  if((rho!=dir)&&(rho!=nu)){
736  llfat_compute_gen_staple_field_mg( tempmat1, dir, rho,
737  staple,ghost_staple,
738  sitelink, ghost_sitelink, ghost_sitelink_diag,
739  fatlink, act_path_coeff[3], 1);
740 
741 
742  exchange_cpu_staple(Z, tempmat1, (void**)ghost_staple1, prec);
743 
744  for(int sig=XUP; sig<=TUP; sig++){
745  if((sig!=dir)&&(sig!=nu)&&(sig!=rho)){
746 
747  llfat_compute_gen_staple_field_mg((su3_matrix*)NULL,dir,sig,
748  tempmat1, ghost_staple1,
749  sitelink, ghost_sitelink, ghost_sitelink_diag,
750  fatlink, act_path_coeff[4], 1);
751  //FIXME
752  //return;
753 
754  }
755  }/* sig */
756  }
757  }/* rho */
758  }
759 
760  }/* nu */
761 
762  }/* dir */
763 
764  free(staple);
765  for(int i=0;i < 4;i++){
766  free(ghost_staple[i]);
767  free(ghost_staple1[i]);
768  }
769  free(tempmat1);
770 
771 }
772 
773 
774 
775  void
776 llfat_reference_mg(void** fatlink, void** sitelink, void** ghost_sitelink,
777  void** ghost_sitelink_diag, QudaPrecision prec, void* act_path_coeff)
778 {
779 
780  Vs[0] = Vs_x;
781  Vs[1] = Vs_y;
782  Vs[2] = Vs_z;
783  Vs[3] = Vs_t;
784 
785  Vsh[0] = Vsh_x;
786  Vsh[1] = Vsh_y;
787  Vsh[2] = Vsh_z;
788  Vsh[3] = Vsh_t;
789 
790  switch(prec){
791  case QUDA_DOUBLE_PRECISION:{
792  llfat_cpu_mg((void**)fatlink, (su3_matrix<double>**)sitelink, (su3_matrix<double>**)ghost_sitelink,
793  (su3_matrix<double>**)ghost_sitelink_diag, (double*) act_path_coeff);
794  break;
795  }
796  case QUDA_SINGLE_PRECISION:{
797  llfat_cpu_mg((void**)fatlink, (su3_matrix<float>**)sitelink, (su3_matrix<float>**)ghost_sitelink,
798  (su3_matrix<float>**)ghost_sitelink_diag, (float*) act_path_coeff);
799  break;
800  }
801  default:
802  fprintf(stderr, "ERROR: unsupported precision(%d)\n", prec);
803  exit(1);
804  break;
805 
806  }
807 
808  return;
809 
810 }
811 
812 
813 #endif
814 
815 // CPU-style BLAS routines
816 void cpu_axy(QudaPrecision prec, double a, void* x, void* y, int size)
817 {
818  if (prec == QUDA_DOUBLE_PRECISION) {
819  double* dst = (double*)y;
820  double* src = (double*)x;
821  for (int i = 0; i < size; i++)
822  {
823  dst[i] = a*src[i];
824  }
825  } else { // QUDA_SINGLE_PRECISION
826  float* dst = (float*)y;
827  float* src = (float*)x;
828  for (int i = 0; i < size; i++)
829  {
830  dst[i] = a*src[i];
831  }
832  }
833 }
834 
835 void cpu_xpy(QudaPrecision prec, void* x, void* y, int size)
836 {
837  if (prec == QUDA_DOUBLE_PRECISION) {
838  double* dst = (double*)y;
839  double* src = (double*)x;
840  for (int i = 0; i < size; i++)
841  {
842  dst[i] += src[i];
843  }
844  } else { // QUDA_SINGLE_PRECISION
845  float* dst = (float*)y;
846  float* src = (float*)x;
847  for (int i = 0; i < size; i++)
848  {
849  dst[i] += src[i];
850  }
851  }
852 }
853 
854  // data reordering routines
// Repack gauge data from QDP layout (one array per direction, site-major)
// into MILC layout (single array, site index outermost, direction next,
// then siteSize scalars), converting the scalar type via static_cast.
template <typename Out, typename In>
void reorderQDPtoMILC(Out* milc_out, In** qdp_in, int V, int siteSize) {
  for (int dir = 0; dir < 4; dir++) {
    const In* src = qdp_in[dir];
    for (int i = 0; i < V; i++) {
      Out* dst = milc_out + (i * 4 + dir) * siteSize;
      for (int j = 0; j < siteSize; j++) { dst[j] = static_cast<Out>(src[i * siteSize + j]); }
    }
  }
}
865 
866  void reorderQDPtoMILC(void* milc_out, void** qdp_in, int V, int siteSize, QudaPrecision out_precision, QudaPrecision in_precision) {
867  if (out_precision == QUDA_SINGLE_PRECISION) {
868  if (in_precision == QUDA_SINGLE_PRECISION) {
869  reorderQDPtoMILC<float,float>((float*)milc_out, (float**)qdp_in, V, siteSize);
870  } else if (in_precision == QUDA_DOUBLE_PRECISION) {
871  reorderQDPtoMILC<float,double>((float*)milc_out, (double**)qdp_in, V, siteSize);
872  }
873  } else if (out_precision == QUDA_DOUBLE_PRECISION) {
874  if (in_precision == QUDA_SINGLE_PRECISION) {
875  reorderQDPtoMILC<double,float>((double*)milc_out, (float**)qdp_in, V, siteSize);
876  } else if (in_precision == QUDA_DOUBLE_PRECISION) {
877  reorderQDPtoMILC<double,double>((double*)milc_out, (double**)qdp_in, V, siteSize);
878  }
879  }
880  }
881 
// Inverse of reorderQDPtoMILC: unpack MILC-layout gauge data (direction
// interleaved per site) into four per-direction QDP arrays, converting the
// scalar type via static_cast.
template <typename Out, typename In>
void reorderMILCtoQDP(Out** qdp_out, In* milc_in, int V, int siteSize) {
  for (int dir = 0; dir < 4; dir++) {
    Out* dst = qdp_out[dir];
    for (int i = 0; i < V; i++) {
      const In* src = milc_in + (i * 4 + dir) * siteSize;
      for (int j = 0; j < siteSize; j++) { dst[i * siteSize + j] = static_cast<Out>(src[j]); }
    }
  }
}
892 
893  void reorderMILCtoQDP(void** qdp_out, void* milc_in, int V, int siteSize, QudaPrecision out_precision, QudaPrecision in_precision) {
894  if (out_precision == QUDA_SINGLE_PRECISION) {
895  if (in_precision == QUDA_SINGLE_PRECISION) {
896  reorderMILCtoQDP<float,float>((float**)qdp_out, (float*)milc_in, V, siteSize);
897  } else if (in_precision == QUDA_DOUBLE_PRECISION) {
898  reorderMILCtoQDP<float,double>((float**)qdp_out, (double*)milc_in, V, siteSize);
899  }
900  } else if (out_precision == QUDA_DOUBLE_PRECISION) {
901  if (in_precision == QUDA_SINGLE_PRECISION) {
902  reorderMILCtoQDP<double,float>((double**)qdp_out, (float*)milc_in, V, siteSize);
903  } else if (in_precision == QUDA_DOUBLE_PRECISION) {
904  reorderMILCtoQDP<double,double>((double**)qdp_out, (double*)milc_in, V, siteSize);
905  }
906  }
907  }
908 
909 // Compute the full HISQ stencil on the CPU.
910 // If "eps_naik" is 0, there's no naik correction,
911 // and this routine skips building the paths in "act_path_coeffs[2]"
913  void** fatlink_eps, void** longlink_eps,
914  void** sitelink, void* qudaGaugeParamPtr,
915  double** act_path_coeffs, double eps_naik)
916 {
917  // Prepare various things
918  QudaGaugeParam& qudaGaugeParam = *((QudaGaugeParam*)qudaGaugeParamPtr);
919  // Needed for unitarization, following "unitarize_link_test.cpp"
920  GaugeFieldParam gParam(0, qudaGaugeParam);
921  gParam.pad = 0;
922  gParam.link_type = QUDA_GENERAL_LINKS;
924  gParam.order = QUDA_MILC_GAUGE_ORDER; // must be true!
925 
926  const QudaPrecision prec = qudaGaugeParam.cpu_prec;
927  const size_t gSize = prec;
928 
929  // Compute n_naiks
930  const int n_naiks = (eps_naik == 0.0 ? 1 : 2);
931 
933  // Create extended CPU field //
935 
936  void* sitelink_ex[4];
937  for(int i=0;i < 4;i++) sitelink_ex[i] = pinned_malloc(V_ex*gaugeSiteSize*gSize);
938 
939 
940 #ifdef MULTI_GPU
941  void* ghost_sitelink[4];
942  void* ghost_sitelink_diag[16];
943 #endif
944 
945  int X1=Z[0];
946  int X2=Z[1];
947  int X3=Z[2];
948  int X4=Z[3];
949 
950  for(int i=0; i < V_ex; i++){
951  int sid = i;
952  int oddBit=0;
953  if(i >= Vh_ex){
954  sid = i - Vh_ex;
955  oddBit = 1;
956  }
957 
958  int za = sid/E1h;
959  int x1h = sid - za*E1h;
960  int zb = za/E2;
961  int x2 = za - zb*E2;
962  int x4 = zb/E3;
963  int x3 = zb - x4*E3;
964  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
965  int x1 = 2*x1h + x1odd;
966 
967 
968  if( x1< 2 || x1 >= X1 +2
969  || x2< 2 || x2 >= X2 +2
970  || x3< 2 || x3 >= X3 +2
971  || x4< 2 || x4 >= X4 +2){
972 #ifdef MULTI_GPU
973  continue;
974 #endif
975  }
976 
977 
978 
979  x1 = (x1 - 2 + X1) % X1;
980  x2 = (x2 - 2 + X2) % X2;
981  x3 = (x3 - 2 + X3) % X3;
982  x4 = (x4 - 2 + X4) % X4;
983 
984  int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
985  if(oddBit){
986  idx += Vh;
987  }
988  for(int dir= 0; dir < 4; dir++){
989  char* src = (char*)sitelink[dir];
990  char* dst = (char*)sitelink_ex[dir];
991  memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
992  }//dir
993  }//i
994 
996  // Allocate all CPU intermediaries //
998 
999  void* v_reflink[4]; // V link -- fat7 smeared link
1000  void* w_reflink[4]; // unitarized V link
1001  void* w_reflink_ex[4]; // extended W link
1002  for(int i=0;i < 4;i++){
1003  v_reflink[i] = safe_malloc(V*gaugeSiteSize*gSize);
1004  w_reflink[i] = safe_malloc(V*gaugeSiteSize*gSize);
1005  w_reflink_ex[i] = safe_malloc(V_ex*gaugeSiteSize*gSize);
1006  }
1007 
1008 #ifdef MULTI_GPU
1009  void* ghost_wlink[4];
1010  void* ghost_wlink_diag[16];
1011 #endif
1012 
1013  // Copy of V link needed for CPU unitarization routines
1014  void* v_sitelink = pinned_malloc(4*V*gaugeSiteSize*gSize);
1015 
1016 
1017  //FIXME: we have this complication because references takes coeff as float/double
1018  // depending on the precision while the GPU code aways take coeff as double
1019  void* coeff;
1020  double coeff_dp[6];
1021  float coeff_sp[6];
1022 
1024  // Create V links (fat7 links), 1st path table set //
1026 
1027  for (int i=0; i < 6;i++) coeff_sp[i] = coeff_dp[i] = act_path_coeffs[0][i];
1028  coeff = (prec == QUDA_DOUBLE_PRECISION) ? (void*)coeff_dp : (void*)coeff_sp;
1029 
1030  // Only need fat links.
1031 #ifdef MULTI_GPU
1032  int optflag = 0;
1033  //we need x,y,z site links in the back and forward T slice
1034  // so it is 3*2*Vs_t
1035  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
1036  for (int i=0; i < 4; i++) ghost_sitelink[i] = safe_malloc(8*Vs[i]*gaugeSiteSize*gSize);
1037 
1038  /*
1039  nu | |
1040  |_____|
1041  mu
1042  */
1043 
1044  for(int nu=0;nu < 4;nu++){
1045  for(int mu=0; mu < 4;mu++){
1046  if(nu == mu){
1047  ghost_sitelink_diag[nu*4+mu] = NULL;
1048  }else{
1049  //the other directions
1050  int dir1, dir2;
1051  for(dir1= 0; dir1 < 4; dir1++){
1052  if(dir1 !=nu && dir1 != mu){
1053  break;
1054  }
1055  }
1056  for(dir2=0; dir2 < 4; dir2++){
1057  if(dir2 != nu && dir2 != mu && dir2 != dir1){
1058  break;
1059  }
1060  }
1061  ghost_sitelink_diag[nu*4+mu] = safe_malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
1062  memset(ghost_sitelink_diag[nu*4+mu], 0, Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
1063  }
1064 
1065  }
1066  }
1067  exchange_cpu_sitelink(gParam.x, sitelink, ghost_sitelink, ghost_sitelink_diag, prec, &qudaGaugeParam, optflag);
1068  llfat_reference_mg(v_reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, prec, coeff);
1069 #else
1070  llfat_reference(v_reflink, sitelink, prec, coeff);
1071 #endif
1072 
1074  // Create W links (unitarized V links) //
1076 
1077  // This is based on "unitarize_link_test.cpp"
1078 
1079  // Format change
1080  reorderQDPtoMILC(v_sitelink,v_reflink,V,gaugeSiteSize,prec,prec);
1081  /*if (prec == QUDA_DOUBLE_PRECISION){
1082  double* link = reinterpret_cast<double*>(v_sitelink);
1083  for(int dir=0; dir<4; ++dir){
1084  double* slink = reinterpret_cast<double*>(v_reflink[dir]);
1085  for(int i=0; i<V; ++i){
1086  for(int j=0; j<gaugeSiteSize; j++){
1087  link[(i*4 + dir)*gaugeSiteSize + j] = slink[i*gaugeSiteSize + j];
1088  }
1089  }
1090  }
1091  } else if(prec == QUDA_SINGLE_PRECISION){
1092  float* link = reinterpret_cast<float*>(v_sitelink);
1093  for(int dir=0; dir<4; ++dir){
1094  float* slink = reinterpret_cast<float*>(v_reflink[dir]);
1095  for(int i=0; i<V; ++i){
1096  for(int j=0; j<gaugeSiteSize; j++){
1097  link[(i*4 + dir)*gaugeSiteSize + j] = slink[i*gaugeSiteSize + j];
1098  }
1099  }
1100  }
1101  }*/
1102 
1103  // Prepare cpuGaugeFields for unitarization
1105  gParam.gauge = v_sitelink;
1106  cpuGaugeField* cpuVLink = new cpuGaugeField(gParam);
1107 
1108  gParam.create = QUDA_ZERO_FIELD_CREATE;
1109  cpuGaugeField* cpuWLink = new cpuGaugeField(gParam);
1110 
1111  // unitarize
1112  unitarizeLinksCPU(*cpuWLink, *cpuVLink);
1113 
1114  // Copy back into "w_reflink"
1115  reorderMILCtoQDP(w_reflink,cpuWLink->Gauge_p(),V,gaugeSiteSize,prec,prec);
1116  /*if (prec == QUDA_DOUBLE_PRECISION){
1117  double* link = reinterpret_cast<double*>(cpuWLink->Gauge_p());
1118  for(int dir=0; dir<4; ++dir){
1119  double* slink = reinterpret_cast<double*>(w_reflink[dir]);
1120  for(int i=0; i<V; ++i){
1121  for(int j=0; j<gaugeSiteSize; j++){
1122  slink[i*gaugeSiteSize + j] = link[(i*4 + dir)*gaugeSiteSize + j];
1123  }
1124  }
1125  }
1126  } else if(prec == QUDA_SINGLE_PRECISION){
1127  float* link = reinterpret_cast<float*>(cpuWLink->Gauge_p());
1128  for(int dir=0; dir<4; ++dir){
1129  float* slink = reinterpret_cast<float*>(w_reflink[dir]);
1130  for(int i=0; i<V; ++i){
1131  for(int j=0; j<gaugeSiteSize; j++){
1132  slink[i*gaugeSiteSize + j] = link[(i*4 + dir)*gaugeSiteSize + j];
1133  }
1134  }
1135  }
1136  }*/
1137 
1138 
1139  // Clean up cpuGaugeFields, we don't need them anymore.
1140 
1141  delete cpuVLink;
1142  delete cpuWLink;
1143 
1145  // Prepare for extended W fields //
1147 
1148  for(int i=0; i < V_ex; i++) {
1149  int sid = i;
1150  int oddBit=0;
1151  if(i >= Vh_ex){
1152  sid = i - Vh_ex;
1153  oddBit = 1;
1154  }
1155 
1156  int za = sid/E1h;
1157  int x1h = sid - za*E1h;
1158  int zb = za/E2;
1159  int x2 = za - zb*E2;
1160  int x4 = zb/E3;
1161  int x3 = zb - x4*E3;
1162  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
1163  int x1 = 2*x1h + x1odd;
1164 
1165 
1166  if( x1< 2 || x1 >= X1 +2
1167  || x2< 2 || x2 >= X2 +2
1168  || x3< 2 || x3 >= X3 +2
1169  || x4< 2 || x4 >= X4 +2){
1170 #ifdef MULTI_GPU
1171  continue;
1172 #endif
1173  }
1174 
1175 
1176 
1177  x1 = (x1 - 2 + X1) % X1;
1178  x2 = (x2 - 2 + X2) % X2;
1179  x3 = (x3 - 2 + X3) % X3;
1180  x4 = (x4 - 2 + X4) % X4;
1181 
1182  int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
1183  if(oddBit){
1184  idx += Vh;
1185  }
1186  for(int dir= 0; dir < 4; dir++){
1187  char* src = (char*)w_reflink[dir];
1188  char* dst = (char*)w_reflink_ex[dir];
1189  memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
1190  }//dir
1191  }//i
1192 
1194  // Create extended W fields //
1196 
1197 #ifdef MULTI_GPU
1198  optflag = 0;
 1199  // We need ghost site links in both the back and forward boundary slices
 1200  // for each of the 4 directions, hence the factor 8 = 4*2 in the allocation
1201  for (int i=0; i < 4; i++) ghost_wlink[i] = safe_malloc(8*Vs[i]*gaugeSiteSize*gSize);
1202 
1203  /*
1204  nu | |
1205  |_____|
1206  mu
1207  */
1208 
1209  for(int nu=0;nu < 4;nu++){
1210  for(int mu=0; mu < 4;mu++){
1211  if(nu == mu){
1212  ghost_wlink_diag[nu*4+mu] = NULL;
1213  }else{
1214  //the other directions
1215  int dir1, dir2;
1216  for(dir1= 0; dir1 < 4; dir1++){
1217  if(dir1 !=nu && dir1 != mu){
1218  break;
1219  }
1220  }
1221  for(dir2=0; dir2 < 4; dir2++){
1222  if(dir2 != nu && dir2 != mu && dir2 != dir1){
1223  break;
1224  }
1225  }
1226  ghost_wlink_diag[nu*4+mu] = safe_malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
1227  memset(ghost_wlink_diag[nu*4+mu], 0, Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
1228  }
1229 
1230  }
1231  }
1232 #endif
1233 
1235  // Prepare to create Naiks, 3rd table set //
1237 
1238  if (n_naiks > 1) {
1239 
1240  for (int i=0; i < 6;i++) coeff_sp[i] = coeff_dp[i] = act_path_coeffs[2][i];
1241  coeff = (prec == QUDA_DOUBLE_PRECISION) ? (void*)coeff_dp : (void*)coeff_sp;
1242 
1243 #ifdef MULTI_GPU
1244 
1245  exchange_cpu_sitelink(qudaGaugeParam.X, w_reflink, ghost_wlink, ghost_wlink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag);
1246  llfat_reference_mg(fatlink, w_reflink, ghost_wlink, ghost_wlink_diag, qudaGaugeParam.cpu_prec, coeff);
1247 
1248  {
1249  int R[4] = {2,2,2,2};
1250  exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, w_reflink_ex, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);
1251  computeLongLinkCPU(longlink, w_reflink_ex, qudaGaugeParam.cpu_prec, coeff);
1252  }
1253 #else
1254  llfat_reference(fatlink, w_reflink, qudaGaugeParam.cpu_prec, coeff);
1255  computeLongLinkCPU(longlink, w_reflink, qudaGaugeParam.cpu_prec, coeff);
1256 #endif
1257 
1258  // Rescale fat and long links into eps links
1259  for (int i = 0; i < 4; i++) {
1260  cpu_axy(prec, eps_naik, fatlink[i], fatlink_eps[i], V*gaugeSiteSize);
1261  cpu_axy(prec, eps_naik, longlink[i], longlink_eps[i], V*gaugeSiteSize);
1262  }
1263  }
1264 
1266  // Prepare to create X links and long links, 2nd table set //
1268 
1269  for (int i=0; i < 6;i++) coeff_sp[i] = coeff_dp[i] = act_path_coeffs[1][i];
1270  coeff = (prec == QUDA_DOUBLE_PRECISION) ? (void*)coeff_dp : (void*)coeff_sp;
1271 
1272 #ifdef MULTI_GPU
1273  optflag = 0;
1274 
1275  // We've already built the extended W fields.
1276 
1277  exchange_cpu_sitelink(qudaGaugeParam.X, w_reflink, ghost_wlink, ghost_wlink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag);
1278  llfat_reference_mg(fatlink, w_reflink, ghost_wlink, ghost_wlink_diag, qudaGaugeParam.cpu_prec, coeff);
1279 
1280  {
1281  int R[4] = {2,2,2,2};
1282  exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, w_reflink_ex, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);
1283  computeLongLinkCPU(longlink, w_reflink_ex, qudaGaugeParam.cpu_prec, coeff);
1284  }
1285 #else
1286  llfat_reference(fatlink, w_reflink, qudaGaugeParam.cpu_prec, coeff);
1287  computeLongLinkCPU(longlink, w_reflink, qudaGaugeParam.cpu_prec, coeff);
1288 #endif
1289 
1290  if (n_naiks > 1) {
1291  // Accumulate into eps links.
1292  for (int i = 0; i < 4; i++) {
1293  cpu_xpy(prec, fatlink[i], fatlink_eps[i], V*gaugeSiteSize);
1294  cpu_xpy(prec, longlink[i], longlink_eps[i], V*gaugeSiteSize);
1295  }
1296  }
1297 
1299  // Clean up //
1301 
1302  for(int i=0; i < 4; i++){
1303  host_free(sitelink_ex[i]);
1304  host_free(v_reflink[i]);
1305  host_free(w_reflink[i]);
1306  host_free(w_reflink_ex[i]);
1307  }
1308  host_free(v_sitelink);
1309 
1310 #ifdef MULTI_GPU
1311  for(int i=0; i<4; i++){
1312  host_free(ghost_sitelink[i]);
1313  host_free(ghost_wlink[i]);
1314  for(int j=0;j <4; j++){
1315  if (i==j) continue;
1316  host_free(ghost_sitelink_diag[i*4+j]);
1317  host_free(ghost_wlink_diag[i*4+j]);
1318  }
1319  }
1320 #endif
1321 
1322 }
1323 
static QudaGaugeParam qudaGaugeParam
static size_t gSize
QudaGhostExchange ghostExchange
Definition: lattice_field.h:76
int Z[4]
Definition: test_util.cpp:26
double mu
Definition: test_util.cpp:1648
#define pinned_malloc(size)
Definition: malloc_quda.h:67
enum QudaPrecision_s QudaPrecision
int Vs_z
Definition: test_util.cpp:29
static int X2
Definition: face_gauge.cpp:42
void llfat_mult_su3_nn(su3_matrix *a, su3_matrix *b, su3_matrix *c)
#define host_free(ptr)
Definition: malloc_quda.h:71
int V_ex
Definition: test_util.cpp:36
int Vs_y
Definition: test_util.cpp:29
int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:556
#define XUP
static int R[4]
void exchange_cpu_sitelink(int *X, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision gPrecision, QudaGaugeParam *param, int optflag)
Definition: face_gauge.cpp:512
int E[4]
Definition: test_util.cpp:35
int Vsh_y
Definition: test_util.cpp:30
void llfat_add_su3_matrix(su3_matrix *a, su3_matrix *b, su3_matrix *c)
void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matrix *mulink, su3_matrix **sitelink, void **fatlink, Real coef, int use_staple)
int Vsh_t
Definition: test_util.cpp:30
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:67
void llfat_scalar_mult_su3_matrix(su3_matrix *a, Real s, su3_matrix *b)
int Vs_x
Definition: test_util.cpp:29
double eps_naik
Definition: test_util.cpp:1652
void exchange_cpu_staple(int *X, void *staple, void **ghost_staple, QudaPrecision gPrecision)
Definition: face_gauge.cpp:980
static int Vs[4]
void reorderQDPtoMILC(Out *milc_out, In **qdp_in, int V, int siteSize)
int Vh_ex
Definition: test_util.cpp:36
void cpu_xpy(QudaPrecision prec, void *x, void *y, int size)
void llfat_scalar_mult_add_su3_matrix(su3_matrix *a, su3_matrix *b, Real s, su3_matrix *c)
constexpr int size
QudaGaugeFieldOrder order
Definition: gauge_field.h:17
void llfat_mult_su3_na(su3_matrix *a, su3_matrix *b, su3_matrix *c)
void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink, void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik)
int X[4]
Definition: quda.h:36
static int X3
Definition: face_gauge.cpp:42
#define safe_malloc(size)
Definition: malloc_quda.h:66
int V
Definition: test_util.cpp:27
void reorderMILCtoQDP(Out **qdp_out, In *milc_in, int V, int siteSize)
static int X1
Definition: face_gauge.cpp:42
static int n_naiks
std::complex< real > e[3][3]
void * memset(void *s, int c, size_t n)
#define TUP
void computeLongLinkCPU(void **longlink, su3_matrix **sitelink, Float *act_path_coeff)
int neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:601
void llfat_cpu(void **fatlink, su3_matrix **sitelink, Float *act_path_coeff)
void unitarizeLinksCPU(cpuGaugeField &outfield, const cpuGaugeField &infield)
GaugeFieldParam gParam
Main header file for the QUDA library.
void llfat_reference_mg(void **fatlink, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision prec, void *act_path_coeff)
int E2
Definition: test_util.cpp:34
__shared__ float s[]
QudaLinkType link_type
Definition: gauge_field.h:19
void cpu_axy(QudaPrecision prec, double a, void *x, void *y, int size)
void llfat_mult_su3_an(su3_matrix *a, su3_matrix *b, su3_matrix *c)
int Vsh_z
Definition: test_util.cpp:30
QudaFieldCreate create
Definition: gauge_field.h:26
void * longlink
int Vs_t
Definition: test_util.cpp:29
void * fatlink
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
int E3
Definition: test_util.cpp:34
QudaPrecision prec
Definition: test_util.cpp:1608
QudaPrecision cpu_prec
Definition: quda.h:47
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)
Definition: face_gauge.cpp:644
int Vsh_x
Definition: test_util.cpp:30
int E1h
Definition: test_util.cpp:34
void llfat_reference(void **fatlink, void **sitelink, QudaPrecision prec, void *act_path_coeff)
static int X4
Definition: face_gauge.cpp:42
#define gaugeSiteSize
Definition: face_gauge.cpp:34
static int Vsh[4]
int Vh
Definition: test_util.cpp:28