QUDA  0.9.0
llfat_reference.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 
5 #include <quda.h>
6 #include <test_util.h>
7 #include "llfat_reference.h"
8 #include "misc.h"
9 #include <string.h>
10 
11 #include <quda_internal.h>
12 #include <complex>
13 
14 #define XUP 0
15 #define YUP 1
16 #define ZUP 2
17 #define TUP 3
18 
19 
20 template<typename real> struct su3_matrix { std::complex<real> e[3][3]; };
21 template<typename real> struct su3_vector { std::complex<real> e[3]; };
22 
23 static int Vs[4];
24 static int Vsh[4];
25 
26 template<typename su3_matrix, typename Real>
27  void
29 {
30 
31  int i,j;
32  for(i=0;i<3;i++) for(j=0;j<3;j++){
33  b->e[i][j] = s*a->e[i][j];
34  }
35 
36  return;
37 }
38 
39 template<typename su3_matrix, typename Real>
40  void
42 {
43  int i,j;
44  for(i=0;i<3;i++) for(j=0;j<3;j++){
45  c->e[i][j] = a->e[i][j] + s*b->e[i][j];
46  }
47 
48 }
49 
50 template <typename su3_matrix>
51  void
53 {
54  int i,j,k;
55  typename std::remove_reference<decltype(a->e[0][0])>::type x,y;
56  for(i=0;i<3;i++)for(j=0;j<3;j++){
57  x=0.0;
58  for(k=0;k<3;k++){
59  y = a->e[i][k] * conj(b->e[j][k]);
60  x += y;
61  }
62  c->e[i][j] = x;
63  }
64 }
65 
66 template <typename su3_matrix>
67  void
69 {
70  int i,j,k;
71  typename std::remove_reference<decltype(a->e[0][0])>::type x,y;
72  for(i=0;i<3;i++)for(j=0;j<3;j++){
73  x=0.0;
74  for(k=0;k<3;k++){
75  y = a->e[i][k] * b->e[k][j];
76  x += y;
77  }
78  c->e[i][j] = x;
79  }
80 }
81 
82 template<typename su3_matrix>
83  void
85 {
86  int i,j,k;
87  typename std::remove_reference<decltype(a->e[0][0])>::type x,y;
88  for(i=0;i<3;i++)for(j=0;j<3;j++){
89  x=0.0;
90  for(k=0;k<3;k++){
91  y = conj(a->e[k][i]) * b->e[k][j];
92  x += y;
93  }
94  c->e[i][j] = x;
95  }
96 }
97 
98 
99 
100 
101 
102 template<typename su3_matrix>
103  void
105 {
106  int i,j;
107  for(i=0;i<3;i++)for(j=0;j<3;j++){
108  c->e[i][j] = a->e[i][j] + b->e[i][j];
109  }
110 }
111 
112 
113 
114 template<typename su3_matrix, typename Real>
115  void
117  su3_matrix* mulink, su3_matrix** sitelink, void** fatlink, Real coef,
118  int use_staple)
119 {
120  su3_matrix tmat1,tmat2;
121  int i ;
122  su3_matrix *fat1;
123 
124  /* Upper staple */
125  /* Computes the staple :
126  * mu (B)
127  * +-------+
128  * nu | |
129  * (A) | |(C)
130  * X X
131  *
132  * Where the mu link can be any su3_matrix. The result is saved in staple.
133  * if staple==NULL then the result is not saved.
134  * It also adds the computed staple to the fatlink[mu] with weight coef.
135  */
136 
137  int dx[4];
138 
139  /* upper staple */
140 
141  for(i=0;i < V;i++){
142 
143  fat1 = ((su3_matrix*)fatlink[mu]) + i;
144  su3_matrix* A = sitelink[nu] + i;
145 
146  memset(dx, 0, sizeof(dx));
147  dx[nu] =1;
148  int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
149  su3_matrix* B;
150  if (use_staple){
151  B = mulink + nbr_idx;
152  }else{
153  B = mulink + nbr_idx;
154  }
155 
156  memset(dx, 0, sizeof(dx));
157  dx[mu] =1;
158  nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2],dx[1],dx[0]);
159  su3_matrix* C = sitelink[nu] + nbr_idx;
160 
161  llfat_mult_su3_nn( A, B,&tmat1);
162 
163  if(staple!=NULL){/* Save the staple */
164  llfat_mult_su3_na( &tmat1, C, &staple[i]);
165  } else{ /* No need to save the staple. Add it to the fatlinks */
166  llfat_mult_su3_na( &tmat1, C, &tmat2);
167  llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1);
168  }
169  }
170  /***************lower staple****************
171  *
172  * X X
173  * nu | |
174  * (A) | |(C)
175  * +-------+
176  * mu (B)
177  *
178  *********************************************/
179 
180  for(i=0;i < V;i++){
181 
182  fat1 = ((su3_matrix*)fatlink[mu]) + i;
183  memset(dx, 0, sizeof(dx));
184  dx[nu] = -1;
185  int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
186  if (nbr_idx >= V || nbr_idx <0){
187  fprintf(stderr, "ERROR: invliad nbr_idx(%d), line=%d\n", nbr_idx, __LINE__);
188  exit(1);
189  }
190  su3_matrix* A = sitelink[nu] + nbr_idx;
191 
192  su3_matrix* B;
193  if (use_staple){
194  B = mulink + nbr_idx;
195  }else{
196  B = mulink + nbr_idx;
197  }
198 
199  memset(dx, 0, sizeof(dx));
200  dx[mu] = 1;
201  nbr_idx = neighborIndexFullLattice(nbr_idx, dx[3], dx[2],dx[1],dx[0]);
202  su3_matrix* C = sitelink[nu] + nbr_idx;
203 
204  llfat_mult_su3_an( A, B,&tmat1);
205  llfat_mult_su3_nn( &tmat1, C,&tmat2);
206 
207  if(staple!=NULL){/* Save the staple */
208  llfat_add_su3_matrix(&staple[i], &tmat2, &staple[i]);
209  llfat_scalar_mult_add_su3_matrix(fat1, &staple[i], coef, fat1);
210 
211  } else{ /* No need to save the staple. Add it to the fatlinks */
212  llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1);
213  }
214  }
215 
216 } /* compute_gen_staple_site */
217 
218 
219 
220 /* Optimized fattening code for the Asq and Asqtad actions.
221  * I assume that:
222  * path 0 is the one link
223  * path 2 the 3-staple
224  * path 3 the 5-staple
225  * path 4 the 7-staple
226  * path 5 the Lapage term.
227  * Path 1 is the Naik term
228  *
229  */
230  template <typename su3_matrix, typename Float>
231 void llfat_cpu(void** fatlink, su3_matrix** sitelink, Float* act_path_coeff)
232 {
233 
234  su3_matrix* staple = (su3_matrix *)malloc(V*sizeof(su3_matrix));
235  if(staple == NULL){
236  fprintf(stderr, "Error: malloc failed for staple in function %s\n", __FUNCTION__);
237  exit(1);
238  }
239 
240  su3_matrix* tempmat1 = (su3_matrix *)malloc(V*sizeof(su3_matrix));
241  if(tempmat1 == NULL){
242  fprintf(stderr, "ERROR: malloc failed for tempmat1 in function %s\n", __FUNCTION__);
243  exit(1);
244  }
245 
246  /* to fix up the Lepage term, included by a trick below */
247  Float one_link = (act_path_coeff[0] - 6.0*act_path_coeff[5]);
248 
249 
250  for (int dir=XUP; dir<=TUP; dir++){
251 
252  /* Intialize fat links with c_1*U_\mu(x) */
253  for(int i=0;i < V;i ++){
254  su3_matrix* fat1 = ((su3_matrix*)fatlink[dir]) + i;
255  llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1 );
256  }
257  }
258 
259 
260 
261 
262  for (int dir=XUP; dir<=TUP; dir++){
263  for(int nu=XUP; nu<=TUP; nu++){
264  if(nu!=dir){
265  llfat_compute_gen_staple_field(staple,dir,nu,sitelink[dir], sitelink,fatlink, act_path_coeff[2], 0);
266 
267  /* The Lepage term */
268  /* Note this also involves modifying c_1 (above) */
269 
270  llfat_compute_gen_staple_field((su3_matrix*)NULL,dir,nu,staple,sitelink, fatlink, act_path_coeff[5],1);
271 
272  for(int rho=XUP; rho<=TUP; rho++) {
273  if((rho!=dir)&&(rho!=nu)){
274  llfat_compute_gen_staple_field( tempmat1, dir, rho, staple,sitelink,fatlink, act_path_coeff[3], 1);
275 
276  for(int sig=XUP; sig<=TUP; sig++){
277  if((sig!=dir)&&(sig!=nu)&&(sig!=rho)){
278  llfat_compute_gen_staple_field((su3_matrix*)NULL,dir,sig,tempmat1,sitelink,fatlink, act_path_coeff[4], 1);
279  }
280  }/* sig */
281 
282  }
283 
284  }/* rho */
285  }
286 
287  }/* nu */
288 
289  }/* dir */
290 
291 
292  free(staple);
293  free(tempmat1);
294 
295 }
296 
297 #ifndef MULTI_GPU
298  template<typename su3_matrix, typename Float>
299 void computeLongLinkCPU(void** longlink, su3_matrix** sitelink,
300  Float* act_path_coeff)
301 {
302 
303  su3_matrix temp;
304  for(int dir=XUP; dir<=TUP; ++dir){
305  int dx[4] = {0,0,0,0};
306  for(int i=0; i<V; ++i){
307  // Initialize the longlinks
308  su3_matrix* llink = ((su3_matrix*)longlink[dir]) + i;
309  llfat_scalar_mult_su3_matrix(sitelink[dir]+i, act_path_coeff[1], llink);
310  dx[dir] = 1;
311  int nbr_idx = neighborIndexFullLattice(Z, i, dx);
312  llfat_mult_su3_nn(llink, sitelink[dir]+nbr_idx, &temp);
313  dx[dir] = 2;
314  nbr_idx = neighborIndexFullLattice(Z, i, dx);
315  llfat_mult_su3_nn(&temp, sitelink[dir]+nbr_idx, llink);
316  }
317  }
318  return;
319 }
320 #else
321  template<typename su3_matrix, typename Float>
322 void computeLongLinkCPU(void** longlink, su3_matrix** sitelinkEx, Float* act_path_coeff)
323 {
324  int E[4];
325  for(int dir=0; dir<4; ++dir) E[dir] = Z[dir]+4;
326 
327 
328  const int extended_volume = E[3]*E[2]*E[1]*E[0];
329 
330  su3_matrix temp;
331  for(int t=0; t<Z[3]; ++t){
332  for(int z=0; z<Z[2]; ++z){
333  for(int y=0; y<Z[1]; ++y){
334  for(int x=0; x<Z[0]; ++x){
335  const int oddBit = (x+y+z+t)&1;
336  int little_index = ((((t*Z[2] + z)*Z[1] + y)*Z[0] + x)/2) + oddBit*Vh;
337  int large_index = (((((t+2)*E[2] + (z+2))*E[1] + (y+2))*E[0] + x+2)/2) + oddBit*(extended_volume/2);
338 
339 
340  for(int dir=XUP; dir<=TUP; ++dir){
341  int dx[4] = {0,0,0,0};
342  su3_matrix* llink = ((su3_matrix*)longlink[dir]) + little_index;
343  llfat_scalar_mult_su3_matrix(sitelinkEx[dir]+large_index, act_path_coeff[1], llink);
344  dx[dir] = 1;
345  int nbr_index = neighborIndexFullLattice(E, large_index, dx);
346  llfat_mult_su3_nn(llink, sitelinkEx[dir]+nbr_index, &temp);
347  dx[dir] = 2;
348  nbr_index = neighborIndexFullLattice(E, large_index, dx);
349  llfat_mult_su3_nn(&temp, sitelinkEx[dir]+nbr_index, llink);
350  }
351  } // x
352  } // y
353  } // z
354  } // t
355  return;
356 }
357 #endif
358 
359 
360 void computeLongLinkCPU(void** longlink, void** sitelink, QudaPrecision prec, void* act_path_coeff)
361 {
362  if(longlink){
363  switch(prec){
364  case QUDA_DOUBLE_PRECISION:
365  computeLongLinkCPU((void**)longlink, (su3_matrix<double>**)sitelink, (double*)act_path_coeff);
366  break;
367 
368  case QUDA_SINGLE_PRECISION:
369  computeLongLinkCPU((void**)longlink, (su3_matrix<float>**)sitelink, (float*)act_path_coeff);
370  break;
371  default:
372  fprintf(stderr, "ERROR: unsupported precision(%d)\n", prec);
373  exit(1);
374  break;
375  }
376  } // if(longlink)
377 
378 }
379 
380 
381  void
382 llfat_reference(void** fatlink, void** sitelink, QudaPrecision prec, void* act_path_coeff)
383 {
384  Vs[0] = Vs_x;
385  Vs[1] = Vs_y;
386  Vs[2] = Vs_z;
387  Vs[3] = Vs_t;
388 
389  Vsh[0] = Vsh_x;
390  Vsh[1] = Vsh_y;
391  Vsh[2] = Vsh_z;
392  Vsh[3] = Vsh_t;
393 
394 
395  switch(prec){
397  llfat_cpu((void**)fatlink, (su3_matrix<double>**)sitelink, (double*) act_path_coeff);
398  break;
399 
401  llfat_cpu((void**)fatlink, (su3_matrix<float>**)sitelink, (float*) act_path_coeff);
402  break;
403 
404  default:
405  fprintf(stderr, "ERROR: unsupported precision(%d)\n", prec);
406  exit(1);
407  break;
408 
409  }
410 
411  return;
412 
413 }
414 
415 #ifdef MULTI_GPU
416 
417 template<typename su3_matrix, typename Real>
418  void
419 llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu,
420  su3_matrix* mulink, su3_matrix** ghost_mulink,
421  su3_matrix** sitelink, su3_matrix** ghost_sitelink, su3_matrix** ghost_sitelink_diag,
422  void** fatlink, Real coef,
423  int use_staple)
424 {
425  su3_matrix tmat1,tmat2;
426  int i ;
427  su3_matrix *fat1;
428 
429 
430  int X1 = Z[0];
431  int X2 = Z[1];
432  int X3 = Z[2];
433  //int X4 = Z[3];
434  int X1h =X1/2;
435 
436  int X2X1 = X1*X2;
437  int X3X2 = X3*X2;
438  int X3X1 = X3*X1;
439 
440  /* Upper staple */
441  /* Computes the staple :
442  * mu (B)
443  * +-------+
444  * nu | |
445  * (A) | |(C)
446  * X X
447  *
448  * Where the mu link can be any su3_matrix. The result is saved in staple.
449  * if staple==NULL then the result is not saved.
450  * It also adds the computed staple to the fatlink[mu] with weight coef.
451  */
452 
453  int dx[4];
454 
455  /* upper staple */
456 
457  for(i=0;i < V;i++){
458 
459  int half_index = i;
460  int oddBit =0;
461  if (i >= Vh){
462  oddBit = 1;
463  half_index = i -Vh;
464  }
465  //int x4 = x4_from_full_index(i);
466 
467 
468 
469  int sid =half_index;
470  int za = sid/X1h;
471  int x1h = sid - za*X1h;
472  int zb = za/X2;
473  int x2 = za - zb*X2;
474  int x4 = zb/X3;
475  int x3 = zb - x4*X3;
476  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
477  int x1 = 2*x1h + x1odd;
478  int x[4] = {x1,x2,x3,x4};
479  int space_con[4]={
480  (x4*X3X2+x3*X2+x2)/2,
481  (x4*X3X1+x3*X1+x1)/2,
482  (x4*X2X1+x2*X1+x1)/2,
483  (x3*X2X1+x2*X1+x1)/2
484  };
485 
486  fat1 = ((su3_matrix*)fatlink[mu]) + i;
487  su3_matrix* A = sitelink[nu] + i;
488 
489  memset(dx, 0, sizeof(dx));
490  dx[nu] =1;
491  int nbr_idx;
492 
493  su3_matrix* B;
494  if (use_staple){
495  if (x[nu] + dx[nu] >= Z[nu]){
496  B = ghost_mulink[nu] + Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
497  }else{
498  nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
499  B = mulink + nbr_idx;
500  }
501  }else{
502  if(x[nu]+dx[nu] >= Z[nu]){ //out of boundary, use ghost data
503  B = ghost_sitelink[nu] + 4*Vs[nu] + mu*Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
504  }else{
505  nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
506  B = sitelink[mu] + nbr_idx;
507  }
508  }
509 
510 
511  //we could be in the ghost link area if mu is T and we are at high T boundary
512  su3_matrix* C;
513  memset(dx, 0, sizeof(dx));
514  dx[mu] =1;
515  if(x[mu] + dx[mu] >= Z[mu]){ //out of boundary, use ghost data
516  C = ghost_sitelink[mu] + 4*Vs[mu] + nu*Vs[mu] + (1-oddBit)*Vsh[mu] + space_con[mu];
517  }else{
518  nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2],dx[1],dx[0]);
519  C = sitelink[nu] + nbr_idx;
520  }
521 
522  llfat_mult_su3_nn( A, B,&tmat1);
523 
524  if(staple!=NULL){/* Save the staple */
525  llfat_mult_su3_na( &tmat1, C, &staple[i]);
526  } else{ /* No need to save the staple. Add it to the fatlinks */
527  llfat_mult_su3_na( &tmat1, C, &tmat2);
528  llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1);
529  }
530  }
531  /***************lower staple****************
532  *
533  * X X
534  * nu | |
535  * (A) | |(C)
536  * +-------+
537  * mu (B)
538  *
539  *********************************************/
540 
541  for(i=0;i < V;i++){
542 
543  int half_index = i;
544  int oddBit =0;
545  if (i >= Vh){
546  oddBit = 1;
547  half_index = i -Vh;
548  }
549 
550  int sid =half_index;
551  int za = sid/X1h;
552  int x1h = sid - za*X1h;
553  int zb = za/X2;
554  int x2 = za - zb*X2;
555  int x4 = zb/X3;
556  int x3 = zb - x4*X3;
557  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
558  int x1 = 2*x1h + x1odd;
559  int x[4] = {x1,x2,x3,x4};
560  int space_con[4]={
561  (x4*X3X2+x3*X2+x2)/2,
562  (x4*X3X1+x3*X1+x1)/2,
563  (x4*X2X1+x2*X1+x1)/2,
564  (x3*X2X1+x2*X1+x1)/2
565  };
566 
567  //int x4 = x4_from_full_index(i);
568 
569  fat1 = ((su3_matrix*)fatlink[mu]) + i;
570 
571  //we could be in the ghost link area if nu is T and we are at low T boundary
572  su3_matrix* A;
573  memset(dx, 0, sizeof(dx));
574  dx[nu] = -1;
575 
576  int nbr_idx;
577  if(x[nu] + dx[nu] < 0){ //out of boundary, use ghost data
578  A = ghost_sitelink[nu] + nu*Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
579  }else{
580  nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
581  A = sitelink[nu] + nbr_idx;
582  }
583 
584  su3_matrix* B;
585  if (use_staple){
586  nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
587  if (x[nu] + dx[nu] < 0){
588  B = ghost_mulink[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
589  }else{
590  B = mulink + nbr_idx;
591  }
592  }else{
593  if(x[nu] + dx[nu] < 0){ //out of boundary, use ghost data
594  B = ghost_sitelink[nu] + mu*Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
595  }else{
596  nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
597  B = sitelink[mu] + nbr_idx;
598  }
599  }
600 
601  //we could be in the ghost link area if nu is T and we are at low T boundary
602  // or mu is T and we are on high T boundary
603  su3_matrix* C;
604  memset(dx, 0, sizeof(dx));
605  dx[nu] = -1;
606  dx[mu] = 1;
607  nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2],dx[1],dx[0]);
608 
609  //space con must be recomputed because we have coodinates change in 2 directions
610  int new_x1, new_x2, new_x3, new_x4;
611  new_x1 = (x[0] + dx[0] + Z[0])%Z[0];
612  new_x2 = (x[1] + dx[1] + Z[1])%Z[1];
613  new_x3 = (x[2] + dx[2] + Z[2])%Z[2];
614  new_x4 = (x[3] + dx[3] + Z[3])%Z[3];
615  int new_x[4] = {new_x1, new_x2, new_x3, new_x4};
616  space_con[0] = (new_x4*X3X2 + new_x3*X2 + new_x2)/2;
617  space_con[1] = (new_x4*X3X1 + new_x3*X1 + new_x1)/2;
618  space_con[2] = (new_x4*X2X1 + new_x2*X1 + new_x1)/2;
619  space_con[3] = (new_x3*X2X1 + new_x2*X1 + new_x1)/2;
620 
621  if( (x[nu] + dx[nu]) < 0 && (x[mu] + dx[mu] >= Z[mu])){
622  //find the other 2 directions, dir1, dir2
623  //with dir2 the slowest changing direction
624  int dir1, dir2; //other two dimensions
625  for(dir1=0; dir1 < 4; dir1 ++){
626  if(dir1 != nu && dir1 != mu){
627  break;
628  }
629  }
630  for(dir2=0; dir2 < 4; dir2 ++){
631  if(dir2 != nu && dir2 != mu && dir2 != dir1){
632  break;
633  }
634  }
635  C = ghost_sitelink_diag[nu*4+mu] + oddBit*Z[dir1]*Z[dir2]/2 + (new_x[dir2]*Z[dir1]+new_x[dir1])/2;
636  }else if (x[nu] + dx[nu] < 0){
637  C = ghost_sitelink[nu] + nu*Vs[nu] + oddBit*Vsh[nu]+ space_con[nu];
638  }else if (x[mu] + dx[mu] >= Z[mu]){
639  C = ghost_sitelink[mu] + 4*Vs[mu] + nu*Vs[mu] + oddBit*Vsh[mu]+space_con[mu];
640  }else{
641  C = sitelink[nu] + nbr_idx;
642  }
643  llfat_mult_su3_an( A, B,&tmat1);
644  llfat_mult_su3_nn( &tmat1, C,&tmat2);
645 
646  if(staple!=NULL){/* Save the staple */
647  llfat_add_su3_matrix(&staple[i], &tmat2, &staple[i]);
648  llfat_scalar_mult_add_su3_matrix(fat1, &staple[i], coef, fat1);
649 
650  } else{ /* No need to save the staple. Add it to the fatlinks */
651  llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1);
652  }
653  }
654 
655 } /* compute_gen_staple_site */
656 
657 
658  template <typename su3_matrix, typename Float>
659 void llfat_cpu_mg(void** fatlink, su3_matrix** sitelink, su3_matrix** ghost_sitelink,
660  su3_matrix** ghost_sitelink_diag, Float* act_path_coeff)
661 {
663  if (sizeof(Float) == 4){
665  }else{
667  }
668 
669  su3_matrix* staple = (su3_matrix *)malloc(V*sizeof(su3_matrix));
670  if(staple == NULL){
671  fprintf(stderr, "Error: malloc failed for staple in function %s\n", __FUNCTION__);
672  exit(1);
673  }
674 
675 
676  su3_matrix* ghost_staple[4];
677  su3_matrix* ghost_staple1[4];
678 
679  for(int i=0;i < 4;i++){
680  ghost_staple[i] = (su3_matrix*)malloc(2*Vs[i]*sizeof(su3_matrix));
681  if (ghost_staple[i] == NULL){
682  fprintf(stderr, "Error: malloc failed for ghost staple in function %s\n", __FUNCTION__);
683  exit(1);
684  }
685 
686  ghost_staple1[i] = (su3_matrix*)malloc(2*Vs[i]*sizeof(su3_matrix));
687  if (ghost_staple1[i] == NULL){
688  fprintf(stderr, "Error: malloc failed for ghost staple1 in function %s\n", __FUNCTION__);
689  exit(1);
690  }
691  }
692 
693  su3_matrix* tempmat1 = (su3_matrix *)malloc(V*sizeof(su3_matrix));
694  if(tempmat1 == NULL){
695  fprintf(stderr, "ERROR: malloc failed for tempmat1 in function %s\n", __FUNCTION__);
696  exit(1);
697  }
698 
699  /* to fix up the Lepage term, included by a trick below */
700  Float one_link = (act_path_coeff[0] - 6.0*act_path_coeff[5]);
701 
702 
703  for (int dir=XUP; dir<=TUP; dir++){
704 
705  /* Intialize fat links with c_1*U_\mu(x) */
706  for(int i=0;i < V;i ++){
707  su3_matrix* fat1 = ((su3_matrix*)fatlink[dir]) + i;
708  llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1 );
709  }
710  }
711 
712  for (int dir=XUP; dir<=TUP; dir++){
713  for(int nu=XUP; nu<=TUP; nu++){
714  if(nu!=dir){
715  llfat_compute_gen_staple_field_mg(staple,dir,nu,
716  sitelink[dir], (su3_matrix**)NULL,
717  sitelink, ghost_sitelink, ghost_sitelink_diag,
718  fatlink, act_path_coeff[2], 0);
719  /* The Lepage term */
720  /* Note this also involves modifying c_1 (above) */
721 
722  exchange_cpu_staple(Z, staple, (void**)ghost_staple, prec);
723 
724  llfat_compute_gen_staple_field_mg((su3_matrix*)NULL,dir,nu,
725  staple,ghost_staple,
726  sitelink, ghost_sitelink, ghost_sitelink_diag,
727  fatlink, act_path_coeff[5],1);
728 
729  for(int rho=XUP; rho<=TUP; rho++) {
730  if((rho!=dir)&&(rho!=nu)){
731  llfat_compute_gen_staple_field_mg( tempmat1, dir, rho,
732  staple,ghost_staple,
733  sitelink, ghost_sitelink, ghost_sitelink_diag,
734  fatlink, act_path_coeff[3], 1);
735 
736 
737  exchange_cpu_staple(Z, tempmat1, (void**)ghost_staple1, prec);
738 
739  for(int sig=XUP; sig<=TUP; sig++){
740  if((sig!=dir)&&(sig!=nu)&&(sig!=rho)){
741 
742  llfat_compute_gen_staple_field_mg((su3_matrix*)NULL,dir,sig,
743  tempmat1, ghost_staple1,
744  sitelink, ghost_sitelink, ghost_sitelink_diag,
745  fatlink, act_path_coeff[4], 1);
746  //FIXME
747  //return;
748 
749  }
750  }/* sig */
751  }
752  }/* rho */
753  }
754 
755  }/* nu */
756 
757  }/* dir */
758 
759  free(staple);
760  for(int i=0;i < 4;i++){
761  free(ghost_staple[i]);
762  free(ghost_staple1[i]);
763  }
764  free(tempmat1);
765 
766 }
767 
768 
769 
770  void
771 llfat_reference_mg(void** fatlink, void** sitelink, void** ghost_sitelink,
772  void** ghost_sitelink_diag, QudaPrecision prec, void* act_path_coeff)
773 {
774 
775  Vs[0] = Vs_x;
776  Vs[1] = Vs_y;
777  Vs[2] = Vs_z;
778  Vs[3] = Vs_t;
779 
780  Vsh[0] = Vsh_x;
781  Vsh[1] = Vsh_y;
782  Vsh[2] = Vsh_z;
783  Vsh[3] = Vsh_t;
784 
785  switch(prec){
786  case QUDA_DOUBLE_PRECISION:{
787  llfat_cpu_mg((void**)fatlink, (su3_matrix<double>**)sitelink, (su3_matrix<double>**)ghost_sitelink,
788  (su3_matrix<double>**)ghost_sitelink_diag, (double*) act_path_coeff);
789  break;
790  }
791  case QUDA_SINGLE_PRECISION:{
792  llfat_cpu_mg((void**)fatlink, (su3_matrix<float>**)sitelink, (su3_matrix<float>**)ghost_sitelink,
793  (su3_matrix<float>**)ghost_sitelink_diag, (float*) act_path_coeff);
794  break;
795  }
796  default:
797  fprintf(stderr, "ERROR: unsupported precision(%d)\n", prec);
798  exit(1);
799  break;
800 
801  }
802 
803  return;
804 
805 }
806 
807 
808 #endif
void free(void *)
double mu
Definition: test_util.cpp:1643
enum QudaPrecision_s QudaPrecision
int Vs_z
Definition: test_util.cpp:30
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
void llfat_mult_su3_nn(su3_matrix *a, su3_matrix *b, su3_matrix *c)
int Vs_y
Definition: test_util.cpp:30
int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:563
#define XUP
std::complex< real > e[3]
int E[4]
Definition: test_util.cpp:36
int Vsh_y
Definition: test_util.cpp:31
void llfat_add_su3_matrix(su3_matrix *a, su3_matrix *b, su3_matrix *c)
void exit(int) __attribute__((noreturn))
void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matrix *mulink, su3_matrix **sitelink, void **fatlink, Real coef, int use_staple)
#define b
int Vsh_t
Definition: test_util.cpp:31
void llfat_scalar_mult_su3_matrix(su3_matrix *a, Real s, su3_matrix *b)
void * longlink[4]
int Vs_x
Definition: test_util.cpp:30
static int Vs[4]
void * malloc(size_t __size) __attribute__((__warn_unused_result__)) __attribute__((alloc_size(1)))
void llfat_scalar_mult_add_su3_matrix(su3_matrix *a, su3_matrix *b, Real s, su3_matrix *c)
void exchange_cpu_staple(int *X, void *staple, void **ghost_staple, QudaPrecision gPrecision)
int V
Definition: test_util.cpp:28
int Z[4]
Definition: test_util.cpp:27
void llfat_mult_su3_na(su3_matrix *a, su3_matrix *b, su3_matrix *c)
int fprintf(FILE *, const char *,...) __attribute__((__format__(__printf__
std::complex< real > e[3][3]
#define TUP
void computeLongLinkCPU(void **longlink, su3_matrix **sitelink, Float *act_path_coeff)
void * memset(void *__b, int __c, size_t __len)
int neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:611
void llfat_cpu(void **fatlink, su3_matrix **sitelink, Float *act_path_coeff)
void * fatlink[4]
Main header file for the QUDA library.
void llfat_reference_mg(void **fatlink, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision prec, void *act_path_coeff)
void llfat_mult_su3_an(su3_matrix *a, su3_matrix *b, su3_matrix *c)
int Vh
Definition: test_util.cpp:29
int Vsh_z
Definition: test_util.cpp:31
const void * c
int Vs_t
Definition: test_util.cpp:30
QudaPrecision prec
Definition: test_util.cpp:1615
#define a
int Vsh_x
Definition: test_util.cpp:31
void llfat_reference(void **fatlink, void **sitelink, QudaPrecision prec, void *act_path_coeff)
static int Vsh[4]