QUDA v0.7.0
A library for QCD on GPUs
llfat_reference.cpp
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 
5 #include <quda.h>
6 #include <test_util.h>
7 #include "llfat_reference.h"
8 #include "misc.h"
9 #include <string.h>
10 
11 #include <quda_internal.h>
12 #include "face_quda.h"
13 
14 #define XUP 0
15 #define YUP 1
16 #define ZUP 2
17 #define TUP 3
18 
19 typedef struct {
20  float real;
21  float imag;
22 } fcomplex;
23 
24 /* specific for double complex */
25 typedef struct {
26  double real;
27  double imag;
28 } dcomplex;
29 
30 typedef struct { fcomplex e[3][3]; } fsu3_matrix;
31 typedef struct { fcomplex c[3]; } fsu3_vector;
32 typedef struct { dcomplex e[3][3]; } dsu3_matrix;
33 typedef struct { dcomplex c[3]; } dsu3_vector;
34 
35 
36 #define CADD(a,b,c) { (c).real = (a).real + (b).real; \
37  (c).imag = (a).imag + (b).imag; }
38 #define CMUL(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \
39  (c).imag = (a).real*(b).imag + (a).imag*(b).real; }
40 #define CSUM(a,b) { (a).real += (b).real; (a).imag += (b).imag; }
41 
42 /* c = a* * b */
43 #define CMULJ_(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \
44  (c).imag = (a).real*(b).imag - (a).imag*(b).real; }
45 
46 /* c = a * b* */
47 #define CMUL_J(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \
48  (c).imag = (a).imag*(b).real - (a).real*(b).imag; }
49 
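/* Summary of the complex-arithmetic conventions defined above:
 *   CADD(a,b,c)   : c = a + b
 *   CMUL(a,b,c)   : c = a * b
 *   CSUM(a,b)     : a += b
 *   CMULJ_(a,b,c) : c = conj(a) * b   (the "a* * b" case)
 *   CMUL_J(a,b,c) : c = a * conj(b)   (the "a * b*" case)
 *
 * A minimal sketch of their use with the fcomplex type above; cmulj_example
 * is a hypothetical helper shown only for illustration and is not referenced
 * elsewhere in this file. */
static inline fcomplex cmulj_example(fcomplex a, fcomplex b)
{
  fcomplex c;
  CMULJ_(a, b, c); /* c = conj(a) * b */
  return c;
}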
50 static int Vs[4];
51 static int Vsh[4];
52 
53 template<typename su3_matrix, typename Real>
54  void
55 llfat_scalar_mult_su3_matrix( su3_matrix *a, Real s, su3_matrix *b )
56 {
57 
58  int i,j;
59  for(i=0;i<3;i++)for(j=0;j<3;j++){
60  b->e[i][j].real = s*a->e[i][j].real;
61  b->e[i][j].imag = s*a->e[i][j].imag;
62  }
63 
64  return;
65 }
66 
67 template<typename su3_matrix, typename Real>
68  void
69 llfat_scalar_mult_add_su3_matrix(su3_matrix *a,su3_matrix *b, Real s, su3_matrix *c)
70 {
71  int i,j;
72  for(i=0;i<3;i++)for(j=0;j<3;j++){
73  c->e[i][j].real = a->e[i][j].real + s*b->e[i][j].real;
74  c->e[i][j].imag = a->e[i][j].imag + s*b->e[i][j].imag;
75  }
76 
77 }
78 
79 template <typename su3_matrix>
80  void
81 llfat_mult_su3_na( su3_matrix *a, su3_matrix *b, su3_matrix *c )
82 {
83  int i,j,k;
84  typeof(a->e[0][0]) x,y;
85  for(i=0;i<3;i++)for(j=0;j<3;j++){
86  x.real=x.imag=0.0;
87  for(k=0;k<3;k++){
88  CMUL_J( a->e[i][k] , b->e[j][k] , y );
89  CSUM( x , y );
90  }
91  c->e[i][j] = x;
92  }
93 }
94 
95 template <typename su3_matrix>
96  void
97 llfat_mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c )
98 {
99  int i,j,k;
100  typeof(a->e[0][0]) x,y;
101  for(i=0;i<3;i++)for(j=0;j<3;j++){
102  x.real=x.imag=0.0;
103  for(k=0;k<3;k++){
104  CMUL( a->e[i][k] , b->e[k][j] , y );
105  CSUM( x , y );
106  }
107  c->e[i][j] = x;
108  }
109 }
110 
111 template<typename su3_matrix>
112  void
113 llfat_mult_su3_an( su3_matrix *a, su3_matrix *b, su3_matrix *c )
114 {
115  int i,j,k;
116  typeof(a->e[0][0]) x,y;
117  for(i=0;i<3;i++)for(j=0;j<3;j++){
118  x.real=x.imag=0.0;
119  for(k=0;k<3;k++){
120  CMULJ_( a->e[k][i] , b->e[k][j], y );
121  CSUM( x , y );
122  }
123  c->e[i][j] = x;
124  }
125 }
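/* Naming of the three products above follows the MILC convention:
 *   llfat_mult_su3_nn : c = a * b
 *   llfat_mult_su3_na : c = a * adjoint(b)
 *   llfat_mult_su3_an : c = adjoint(a) * b
 * all for 3x3 complex matrices. */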
126 
127 
128 
129 
130 
131 template<typename su3_matrix>
132  void
133 llfat_add_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c )
134 {
135  int i,j;
136  for(i=0;i<3;i++)for(j=0;j<3;j++){
137  CADD( a->e[i][j], b->e[i][j], c->e[i][j] );
138  }
139 }
140 
141 
142 
143 template<typename su3_matrix, typename Real>
144  void
145 llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu,
146  su3_matrix* mulink, su3_matrix** sitelink, void** fatlink, Real coef,
147  int use_staple)
148 {
149  su3_matrix tmat1,tmat2;
150  int i ;
151  su3_matrix *fat1;
152 
153  /* Upper staple */
154  /* Computes the staple :
 155  *            mu (B)
 156  *          +-------+
 157  *   nu     |       |
 158  *  (A)     |       |(C)
 159  *          X       X
160  *
161  * Where the mu link can be any su3_matrix. The result is saved in staple.
162  * if staple==NULL then the result is not saved.
163  * It also adds the computed staple to the fatlink[mu] with weight coef.
164  */
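 /* In matrix notation, with M denoting the mulink field and x the current
  * site, the loop below forms
  *   U_nu(x) * M_mu(x + nu) * adjoint(U_nu(x + mu))
  * and either stores it in staple[x] or adds coef times it to fatlink[mu](x). */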
165 
166  int dx[4];
167 
168  /* upper staple */
169 
170  for(i=0;i < V;i++){
171 
172  fat1 = ((su3_matrix*)fatlink[mu]) + i;
173  su3_matrix* A = sitelink[nu] + i;
174 
175  memset(dx, 0, sizeof(dx));
176  dx[nu] =1;
177  int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
178  su3_matrix* B;
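 /* Both branches below pick the same pointer: in this single-GPU path
  * llfat_cpu passes sitelink[mu] as mulink when use_staple is 0, so no
  * separate lookup is needed (the same holds in the lower-staple loop). */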
179  if (use_staple){
180  B = mulink + nbr_idx;
181  }else{
182  B = mulink + nbr_idx;
183  }
184 
185  memset(dx, 0, sizeof(dx));
186  dx[mu] =1;
187  nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2],dx[1],dx[0]);
188  su3_matrix* C = sitelink[nu] + nbr_idx;
189 
190  llfat_mult_su3_nn( A, B,&tmat1);
191 
192  if(staple!=NULL){/* Save the staple */
193  llfat_mult_su3_na( &tmat1, C, &staple[i]);
194  } else{ /* No need to save the staple. Add it to the fatlinks */
195  llfat_mult_su3_na( &tmat1, C, &tmat2);
196  llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1);
197  }
198  }
199  /***************lower staple****************
200  *
 201  *          X       X
 202  *   nu     |       |
 203  *  (A)     |       |(C)
 204  *          +-------+
 205  *            mu (B)
206  *
207  *********************************************/
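 /* The loop below forms the lower staple
  *   adjoint(U_nu(x - nu)) * M_mu(x - nu) * U_nu(x - nu + mu)
  * and, when staple is non-NULL, first accumulates it into staple[x] (which
  * already holds the upper staple) before adding coef times the sum to
  * fatlink[mu](x). */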
208 
209  for(i=0;i < V;i++){
210 
211  fat1 = ((su3_matrix*)fatlink[mu]) + i;
212  memset(dx, 0, sizeof(dx));
213  dx[nu] = -1;
214  int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
215  if (nbr_idx >= V || nbr_idx <0){
 216  fprintf(stderr, "ERROR: invalid nbr_idx(%d), line=%d\n", nbr_idx, __LINE__);
217  exit(1);
218  }
219  su3_matrix* A = sitelink[nu] + nbr_idx;
220 
221  su3_matrix* B;
222  if (use_staple){
223  B = mulink + nbr_idx;
224  }else{
225  B = mulink + nbr_idx;
226  }
227 
228  memset(dx, 0, sizeof(dx));
229  dx[mu] = 1;
230  nbr_idx = neighborIndexFullLattice(nbr_idx, dx[3], dx[2],dx[1],dx[0]);
231  su3_matrix* C = sitelink[nu] + nbr_idx;
232 
233  llfat_mult_su3_an( A, B,&tmat1);
234  llfat_mult_su3_nn( &tmat1, C,&tmat2);
235 
236  if(staple!=NULL){/* Save the staple */
237  llfat_add_su3_matrix(&staple[i], &tmat2, &staple[i]);
238  llfat_scalar_mult_add_su3_matrix(fat1, &staple[i], coef, fat1);
239 
240  } else{ /* No need to save the staple. Add it to the fatlinks */
241  llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1);
242  }
243  }
244 
245 } /* compute_gen_staple_site */
246 
247 
248 
249 /* Optimized fattening code for the Asq and Asqtad actions.
250  * I assume that:
251  * path 0 is the one link
252  * path 2 the 3-staple
253  * path 3 the 5-staple
254  * path 4 the 7-staple
 255  * path 5 the Lepage term.
256  * Path 1 is the Naik term
257  *
258  */
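/* In terms of the act_path_coeff array used below:
 *   c[0] = one-link, c[1] = Naik, c[2] = 3-staple, c[3] = 5-staple,
 *   c[4] = 7-staple, c[5] = Lepage.
 * The one-link coefficient actually applied is c[0] - 6*c[5]; this shift is
 * the "trick" referred to below, compensating for the extra contribution
 * picked up when the Lepage term is built as a staple of the 3-staple. */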
259  template <typename su3_matrix, typename Float>
260 void llfat_cpu(void** fatlink, su3_matrix** sitelink, Float* act_path_coeff)
261 {
262 
263  su3_matrix* staple = (su3_matrix *)malloc(V*sizeof(su3_matrix));
264  if(staple == NULL){
265  fprintf(stderr, "Error: malloc failed for staple in function %s\n", __FUNCTION__);
266  exit(1);
267  }
268 
269  su3_matrix* tempmat1 = (su3_matrix *)malloc(V*sizeof(su3_matrix));
270  if(tempmat1 == NULL){
271  fprintf(stderr, "ERROR: malloc failed for tempmat1 in function %s\n", __FUNCTION__);
272  exit(1);
273  }
274 
275  /* to fix up the Lepage term, included by a trick below */
276  Float one_link = (act_path_coeff[0] - 6.0*act_path_coeff[5]);
277 
278 
279  for (int dir=XUP; dir<=TUP; dir++){
280 
 281  /* Initialize fat links with c_1*U_\mu(x) */
282  for(int i=0;i < V;i ++){
283  su3_matrix* fat1 = ((su3_matrix*)fatlink[dir]) + i;
284  llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1 );
285  }
286  }
287 
288 
289 
290 
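 /* Staple construction: for each fat-link direction dir and each nu != dir,
  * the 3-staple (coefficient c[2]) is stored in `staple`; feeding `staple`
  * back in as the mu-link gives the Lepage contribution (c[5]); for each
  * rho != dir,nu the 5-staple (c[3]) built from `staple` is kept in tempmat1,
  * from which the 7-staple (c[4]) is added for the remaining direction sig. */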
291  for (int dir=XUP; dir<=TUP; dir++){
292  for(int nu=XUP; nu<=TUP; nu++){
293  if(nu!=dir){
294  llfat_compute_gen_staple_field(staple,dir,nu,sitelink[dir], sitelink,fatlink, act_path_coeff[2], 0);
295 
296  /* The Lepage term */
297  /* Note this also involves modifying c_1 (above) */
298 
299  llfat_compute_gen_staple_field((su3_matrix*)NULL,dir,nu,staple,sitelink, fatlink, act_path_coeff[5],1);
300 
301  for(int rho=XUP; rho<=TUP; rho++) {
302  if((rho!=dir)&&(rho!=nu)){
303  llfat_compute_gen_staple_field( tempmat1, dir, rho, staple,sitelink,fatlink, act_path_coeff[3], 1);
304 
305  for(int sig=XUP; sig<=TUP; sig++){
306  if((sig!=dir)&&(sig!=nu)&&(sig!=rho)){
307  llfat_compute_gen_staple_field((su3_matrix*)NULL,dir,sig,tempmat1,sitelink,fatlink, act_path_coeff[4], 1);
308  }
309  }/* sig */
310 
311  }
312 
313  }/* rho */
314  }
315 
316  }/* nu */
317 
318  }/* dir */
319 
320 
321  free(staple);
322  free(tempmat1);
323 
324 }
325 
326 #ifndef MULTI_GPU
327  template<typename su3_matrix, typename Float>
328 void computeLongLinkCPU(void** longlink, su3_matrix** sitelink,
329  Float* act_path_coeff)
330 {
331 
332  su3_matrix temp;
333  for(int dir=XUP; dir<=TUP; ++dir){
334  int dx[4] = {0,0,0,0};
335  for(int i=0; i<V; ++i){
336  // Initialize the longlinks
337  su3_matrix* llink = ((su3_matrix*)longlink[dir]) + i;
338  llfat_scalar_mult_su3_matrix(sitelink[dir]+i, act_path_coeff[1], llink);
339  dx[dir] = 1;
340  int nbr_idx = neighborIndexFullLattice(Z, i, dx);
341  llfat_mult_su3_nn(llink, sitelink[dir]+nbr_idx, &temp);
342  dx[dir] = 2;
343  nbr_idx = neighborIndexFullLattice(Z, i, dx);
344  llfat_mult_su3_nn(&temp, sitelink[dir]+nbr_idx, llink);
345  }
346  }
347  return;
348 }
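/* The loop above builds the Naik (long) link
 *   L_mu(x) = c[1] * U_mu(x) * U_mu(x + mu) * U_mu(x + 2*mu)
 * with c[1] = act_path_coeff[1]; the multi-GPU version below forms the same
 * product on the two-site-padded extended lattice. */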
349 #else
350  template<typename su3_matrix, typename Float>
351 void computeLongLinkCPU(void** longlink, su3_matrix** sitelinkEx, Float* act_path_coeff)
352 {
353  int E[4];
354  for(int dir=0; dir<4; ++dir) E[dir] = Z[dir]+4;
355 
356 
357  const int extended_volume = E[3]*E[2]*E[1]*E[0];
358 
359  su3_matrix temp;
360  for(int t=0; t<Z[3]; ++t){
361  for(int z=0; z<Z[2]; ++z){
362  for(int y=0; y<Z[1]; ++y){
363  for(int x=0; x<Z[0]; ++x){
364  const int oddBit = (x+y+z+t)&1;
365  int little_index = ((((t*Z[2] + z)*Z[1] + y)*Z[0] + x)/2) + oddBit*Vh;
366  int large_index = (((((t+2)*E[2] + (z+2))*E[1] + (y+2))*E[0] + x+2)/2) + oddBit*(extended_volume/2);
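          /* sitelinkEx lives on a lattice padded by two sites in every
           * direction (E[d] = Z[d] + 4); little_index is the checkerboarded
           * index of (x,y,z,t) on the local lattice and large_index the
           * corresponding index of (x+2,y+2,z+2,t+2) on the padded one. */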
367 
368 
369  for(int dir=XUP; dir<=TUP; ++dir){
370  int dx[4] = {0,0,0,0};
371  su3_matrix* llink = ((su3_matrix*)longlink[dir]) + little_index;
372  llfat_scalar_mult_su3_matrix(sitelinkEx[dir]+large_index, act_path_coeff[1], llink);
373  dx[dir] = 1;
374  int nbr_index = neighborIndexFullLattice(E, large_index, dx);
375  llfat_mult_su3_nn(llink, sitelinkEx[dir]+nbr_index, &temp);
376  dx[dir] = 2;
377  nbr_index = neighborIndexFullLattice(E, large_index, dx);
378  llfat_mult_su3_nn(&temp, sitelinkEx[dir]+nbr_index, llink);
379  }
380  } // x
381  } // y
382  } // z
383  } // t
384  return;
385 }
386 #endif
387 
388 
389 void computeLongLinkCPU(void** longlink, void** sitelink, QudaPrecision prec, void* act_path_coeff)
390 {
391  if(longlink){
392  switch(prec){
393  case QUDA_DOUBLE_PRECISION:
394  computeLongLinkCPU((void**)longlink, (dsu3_matrix**)sitelink, (double*)act_path_coeff);
395  break;
396 
397  case QUDA_SINGLE_PRECISION:
398  computeLongLinkCPU((void**)longlink, (fsu3_matrix**)sitelink, (float*)act_path_coeff);
399  break;
400  default:
401  fprintf(stderr, "ERROR: unsupported precision(%d)\n", prec);
402  exit(1);
403  break;
404  }
405  } // if(longlink)
406 
407 }
408 
409 
410  void
411 llfat_reference(void** fatlink, void** sitelink, QudaPrecision prec, void* act_path_coeff)
412 {
413  Vs[0] = Vs_x;
414  Vs[1] = Vs_y;
415  Vs[2] = Vs_z;
416  Vs[3] = Vs_t;
417 
418  Vsh[0] = Vsh_x;
419  Vsh[1] = Vsh_y;
420  Vsh[2] = Vsh_z;
421  Vsh[3] = Vsh_t;
422 
423 
424  switch(prec){
 425  case QUDA_DOUBLE_PRECISION:
 426  llfat_cpu((void**)fatlink, (dsu3_matrix**)sitelink, (double*) act_path_coeff);
427  break;
428 
 429  case QUDA_SINGLE_PRECISION:
 430  llfat_cpu((void**)fatlink, (fsu3_matrix**)sitelink, (float*) act_path_coeff);
431  break;
432 
433  default:
434  fprintf(stderr, "ERROR: unsupported precision(%d)\n", prec);
435  exit(1);
436  break;
437 
438  }
439 
440  return;
441 
442 }
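/* A minimal usage sketch (hypothetical driver code; the buffer layout is the
 * one assumed throughout this file, i.e. sitelink[d] and fatlink[d] each hold
 * the V su3 matrices of direction d):
 *
 *   double act_path_coeff[6];  // {one-link, Naik, 3-staple, 5-staple, 7-staple, Lepage}
 *   // ... fill act_path_coeff and sitelink[0..3], allocate fatlink[0..3] ...
 *   llfat_reference(fatlink, sitelink, QUDA_DOUBLE_PRECISION, act_path_coeff);
 */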
443 
444 #ifdef MULTI_GPU
445 
446 template<typename su3_matrix, typename Real>
447  void
448 llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu,
449  su3_matrix* mulink, su3_matrix** ghost_mulink,
450  su3_matrix** sitelink, su3_matrix** ghost_sitelink, su3_matrix** ghost_sitelink_diag,
451  void** fatlink, Real coef,
452  int use_staple)
453 {
454  su3_matrix tmat1,tmat2;
455  int i ;
456  su3_matrix *fat1;
457 
458 
459  int X1 = Z[0];
460  int X2 = Z[1];
461  int X3 = Z[2];
462  //int X4 = Z[3];
463  int X1h =X1/2;
464 
465  int X2X1 = X1*X2;
466  int X3X2 = X3*X2;
467  int X3X1 = X3*X1;
468 
469  /* Upper staple */
470  /* Computes the staple :
 471  *            mu (B)
 472  *          +-------+
 473  *   nu     |       |
 474  *  (A)     |       |(C)
 475  *          X       X
476  *
477  * Where the mu link can be any su3_matrix. The result is saved in staple.
478  * if staple==NULL then the result is not saved.
479  * It also adds the computed staple to the fatlink[mu] with weight coef.
480  */
481 
482  int dx[4];
483 
484  /* upper staple */
485 
486  for(i=0;i < V;i++){
487 
488  int half_index = i;
489  int oddBit =0;
490  if (i >= Vh){
491  oddBit = 1;
492  half_index = i -Vh;
493  }
494  //int x4 = x4_from_full_index(i);
495 
496 
497 
498  int sid =half_index;
499  int za = sid/X1h;
500  int x1h = sid - za*X1h;
501  int zb = za/X2;
502  int x2 = za - zb*X2;
503  int x4 = zb/X3;
504  int x3 = zb - x4*X3;
505  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
506  int x1 = 2*x1h + x1odd;
507  int x[4] = {x1,x2,x3,x4};
508  int space_con[4]={
509  (x4*X3X2+x3*X2+x2)/2,
510  (x4*X3X1+x3*X1+x1)/2,
511  (x4*X2X1+x2*X1+x1)/2,
512  (x3*X2X1+x2*X1+x1)/2
513  };
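 /* Decode the checkerboarded index into full coordinates x = (x1,x2,x3,x4);
  * space_con[d] is the checkerboarded offset of this site within the surface
  * orthogonal to direction d, used to address the ghost buffers whenever a
  * neighbor falls outside the local volume. */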
514 
515  fat1 = ((su3_matrix*)fatlink[mu]) + i;
516  su3_matrix* A = sitelink[nu] + i;
517 
518  memset(dx, 0, sizeof(dx));
519  dx[nu] =1;
520  int nbr_idx;
521 
522  su3_matrix* B;
523  if (use_staple){
524  if (x[nu] + dx[nu] >= Z[nu]){
525  B = ghost_mulink[nu] + Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
526  }else{
527  nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
528  B = mulink + nbr_idx;
529  }
530  }else{
531  if(x[nu]+dx[nu] >= Z[nu]){ //out of boundary, use ghost data
532  B = ghost_sitelink[nu] + 4*Vs[nu] + mu*Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
533  }else{
534  nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
535  B = sitelink[mu] + nbr_idx;
536  }
537  }
538 
539 
540  //we could be in the ghost link area if mu is T and we are at high T boundary
541  su3_matrix* C;
542  memset(dx, 0, sizeof(dx));
543  dx[mu] =1;
544  if(x[mu] + dx[mu] >= Z[mu]){ //out of boundary, use ghost data
545  C = ghost_sitelink[mu] + 4*Vs[mu] + nu*Vs[mu] + (1-oddBit)*Vsh[mu] + space_con[mu];
546  }else{
547  nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2],dx[1],dx[0]);
548  C = sitelink[nu] + nbr_idx;
549  }
550 
551  llfat_mult_su3_nn( A, B,&tmat1);
552 
553  if(staple!=NULL){/* Save the staple */
554  llfat_mult_su3_na( &tmat1, C, &staple[i]);
555  } else{ /* No need to save the staple. Add it to the fatlinks */
556  llfat_mult_su3_na( &tmat1, C, &tmat2);
557  llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1);
558  }
559  }
560  /***************lower staple****************
561  *
 562  *          X       X
 563  *   nu     |       |
 564  *  (A)     |       |(C)
 565  *          +-------+
 566  *            mu (B)
567  *
568  *********************************************/
569 
570  for(i=0;i < V;i++){
571 
572  int half_index = i;
573  int oddBit =0;
574  if (i >= Vh){
575  oddBit = 1;
576  half_index = i -Vh;
577  }
578 
579  int sid =half_index;
580  int za = sid/X1h;
581  int x1h = sid - za*X1h;
582  int zb = za/X2;
583  int x2 = za - zb*X2;
584  int x4 = zb/X3;
585  int x3 = zb - x4*X3;
586  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
587  int x1 = 2*x1h + x1odd;
588  int x[4] = {x1,x2,x3,x4};
589  int space_con[4]={
590  (x4*X3X2+x3*X2+x2)/2,
591  (x4*X3X1+x3*X1+x1)/2,
592  (x4*X2X1+x2*X1+x1)/2,
593  (x3*X2X1+x2*X1+x1)/2
594  };
595 
596  //int x4 = x4_from_full_index(i);
597 
598  fat1 = ((su3_matrix*)fatlink[mu]) + i;
599 
600  //we could be in the ghost link area if nu is T and we are at low T boundary
601  su3_matrix* A;
602  memset(dx, 0, sizeof(dx));
603  dx[nu] = -1;
604 
605  int nbr_idx;
606  if(x[nu] + dx[nu] < 0){ //out of boundary, use ghost data
607  A = ghost_sitelink[nu] + nu*Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
608  }else{
609  nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
610  A = sitelink[nu] + nbr_idx;
611  }
612 
613  su3_matrix* B;
614  if (use_staple){
615  nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
616  if (x[nu] + dx[nu] < 0){
617  B = ghost_mulink[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
618  }else{
619  B = mulink + nbr_idx;
620  }
621  }else{
622  if(x[nu] + dx[nu] < 0){ //out of boundary, use ghost data
623  B = ghost_sitelink[nu] + mu*Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu];
624  }else{
625  nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]);
626  B = sitelink[mu] + nbr_idx;
627  }
628  }
629 
630  //we could be in the ghost link area if nu is T and we are at low T boundary
631  // or mu is T and we are on high T boundary
632  su3_matrix* C;
633  memset(dx, 0, sizeof(dx));
634  dx[nu] = -1;
635  dx[mu] = 1;
636  nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2],dx[1],dx[0]);
637 
 638  //space_con must be recomputed because the coordinates change in 2 directions
639  int new_x1, new_x2, new_x3, new_x4;
640  new_x1 = (x[0] + dx[0] + Z[0])%Z[0];
641  new_x2 = (x[1] + dx[1] + Z[1])%Z[1];
642  new_x3 = (x[2] + dx[2] + Z[2])%Z[2];
643  new_x4 = (x[3] + dx[3] + Z[3])%Z[3];
644  int new_x[4] = {new_x1, new_x2, new_x3, new_x4};
645  space_con[0] = (new_x4*X3X2 + new_x3*X2 + new_x2)/2;
646  space_con[1] = (new_x4*X3X1 + new_x3*X1 + new_x1)/2;
647  space_con[2] = (new_x4*X2X1 + new_x2*X1 + new_x1)/2;
648  space_con[3] = (new_x3*X2X1 + new_x2*X1 + new_x1)/2;
649 
650  if( (x[nu] + dx[nu]) < 0 && (x[mu] + dx[mu] >= Z[mu])){
651  //find the other 2 directions, dir1, dir2
652  //with dir2 the slowest changing direction
653  int dir1, dir2; //other two dimensions
654  for(dir1=0; dir1 < 4; dir1 ++){
655  if(dir1 != nu && dir1 != mu){
656  break;
657  }
658  }
659  for(dir2=0; dir2 < 4; dir2 ++){
660  if(dir2 != nu && dir2 != mu && dir2 != dir1){
661  break;
662  }
663  }
664  C = ghost_sitelink_diag[nu*4+mu] + oddBit*Z[dir1]*Z[dir2]/2 + (new_x[dir2]*Z[dir1]+new_x[dir1])/2;
665  }else if (x[nu] + dx[nu] < 0){
666  C = ghost_sitelink[nu] + nu*Vs[nu] + oddBit*Vsh[nu]+ space_con[nu];
667  }else if (x[mu] + dx[mu] >= Z[mu]){
668  C = ghost_sitelink[mu] + 4*Vs[mu] + nu*Vs[mu] + oddBit*Vsh[mu]+space_con[mu];
669  }else{
670  C = sitelink[nu] + nbr_idx;
671  }
672  llfat_mult_su3_an( A, B,&tmat1);
673  llfat_mult_su3_nn( &tmat1, C,&tmat2);
674 
675  if(staple!=NULL){/* Save the staple */
676  llfat_add_su3_matrix(&staple[i], &tmat2, &staple[i]);
677  llfat_scalar_mult_add_su3_matrix(fat1, &staple[i], coef, fat1);
678 
679  } else{ /* No need to save the staple. Add it to the fatlinks */
680  llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1);
681  }
682  }
683 
684 } /* compute_gen_staple_site */
685 
686 
687  template <typename su3_matrix, typename Float>
688 void llfat_cpu_mg(void** fatlink, su3_matrix** sitelink, su3_matrix** ghost_sitelink,
689  su3_matrix** ghost_sitelink_diag, Float* act_path_coeff)
690 {
 691  QudaPrecision prec;
 692  if (sizeof(Float) == 4){
693  prec = QUDA_SINGLE_PRECISION;
694  }else{
695  prec = QUDA_DOUBLE_PRECISION;
696  }
697 
698  su3_matrix* staple = (su3_matrix *)malloc(V*sizeof(su3_matrix));
699  if(staple == NULL){
700  fprintf(stderr, "Error: malloc failed for staple in function %s\n", __FUNCTION__);
701  exit(1);
702  }
703 
704 
705  su3_matrix* ghost_staple[4];
706  su3_matrix* ghost_staple1[4];
707 
708  for(int i=0;i < 4;i++){
709  ghost_staple[i] = (su3_matrix*)malloc(2*Vs[i]*sizeof(su3_matrix));
710  if (ghost_staple[i] == NULL){
711  fprintf(stderr, "Error: malloc failed for ghost staple in function %s\n", __FUNCTION__);
712  exit(1);
713  }
714 
715  ghost_staple1[i] = (su3_matrix*)malloc(2*Vs[i]*sizeof(su3_matrix));
716  if (ghost_staple1[i] == NULL){
717  fprintf(stderr, "Error: malloc failed for ghost staple1 in function %s\n", __FUNCTION__);
718  exit(1);
719  }
720  }
721 
722  su3_matrix* tempmat1 = (su3_matrix *)malloc(V*sizeof(su3_matrix));
723  if(tempmat1 == NULL){
724  fprintf(stderr, "ERROR: malloc failed for tempmat1 in function %s\n", __FUNCTION__);
725  exit(1);
726  }
727 
728  /* to fix up the Lepage term, included by a trick below */
729  Float one_link = (act_path_coeff[0] - 6.0*act_path_coeff[5]);
730 
731 
732  for (int dir=XUP; dir<=TUP; dir++){
733 
 734  /* Initialize fat links with c_1*U_\mu(x) */
735  for(int i=0;i < V;i ++){
736  su3_matrix* fat1 = ((su3_matrix*)fatlink[dir]) + i;
737  llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1 );
738  }
739  }
740 
741  for (int dir=XUP; dir<=TUP; dir++){
742  for(int nu=XUP; nu<=TUP; nu++){
743  if(nu!=dir){
744  llfat_compute_gen_staple_field_mg(staple,dir,nu,
745  sitelink[dir], (su3_matrix**)NULL,
746  sitelink, ghost_sitelink, ghost_sitelink_diag,
747  fatlink, act_path_coeff[2], 0);
748  /* The Lepage term */
749  /* Note this also involves modifying c_1 (above) */
750 
751  exchange_cpu_staple(Z, staple, (void**)ghost_staple, prec);
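 /* The freshly computed 3-staple must have its boundary slices exchanged
  * into ghost_staple before it can serve as the mu-link input of the Lepage
  * and 5-staple constructions below. */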
752 
753  llfat_compute_gen_staple_field_mg((su3_matrix*)NULL,dir,nu,
754  staple,ghost_staple,
755  sitelink, ghost_sitelink, ghost_sitelink_diag,
756  fatlink, act_path_coeff[5],1);
757 
758  for(int rho=XUP; rho<=TUP; rho++) {
759  if((rho!=dir)&&(rho!=nu)){
760  llfat_compute_gen_staple_field_mg( tempmat1, dir, rho,
761  staple,ghost_staple,
762  sitelink, ghost_sitelink, ghost_sitelink_diag,
763  fatlink, act_path_coeff[3], 1);
764 
765 
766  exchange_cpu_staple(Z, tempmat1, (void**)ghost_staple1, prec);
767 
768  for(int sig=XUP; sig<=TUP; sig++){
769  if((sig!=dir)&&(sig!=nu)&&(sig!=rho)){
770 
771  llfat_compute_gen_staple_field_mg((su3_matrix*)NULL,dir,sig,
772  tempmat1, ghost_staple1,
773  sitelink, ghost_sitelink, ghost_sitelink_diag,
774  fatlink, act_path_coeff[4], 1);
775  //FIXME
776  //return;
777 
778  }
779  }/* sig */
780  }
781  }/* rho */
782  }
783 
784  }/* nu */
785 
786  }/* dir */
787 
788  free(staple);
789  for(int i=0;i < 4;i++){
790  free(ghost_staple[i]);
791  free(ghost_staple1[i]);
792  }
793  free(tempmat1);
794 
795 }
796 
797 
798 
799  void
800 llfat_reference_mg(void** fatlink, void** sitelink, void** ghost_sitelink,
801  void** ghost_sitelink_diag, QudaPrecision prec, void* act_path_coeff)
802 {
803 
804  Vs[0] = Vs_x;
805  Vs[1] = Vs_y;
806  Vs[2] = Vs_z;
807  Vs[3] = Vs_t;
808 
809  Vsh[0] = Vsh_x;
810  Vsh[1] = Vsh_y;
811  Vsh[2] = Vsh_z;
812  Vsh[3] = Vsh_t;
813 
814  switch(prec){
815  case QUDA_DOUBLE_PRECISION:{
816  llfat_cpu_mg((void**)fatlink, (dsu3_matrix**)sitelink, (dsu3_matrix**)ghost_sitelink,
817  (dsu3_matrix**)ghost_sitelink_diag, (double*) act_path_coeff);
818  break;
819  }
820  case QUDA_SINGLE_PRECISION:{
821  llfat_cpu_mg((void**)fatlink, (fsu3_matrix**)sitelink, (fsu3_matrix**)ghost_sitelink,
822  (fsu3_matrix**)ghost_sitelink_diag, (float*) act_path_coeff);
823  break;
824  }
825  default:
826  fprintf(stderr, "ERROR: unsupported precision(%d)\n", prec);
827  exit(1);
828  break;
829 
830  }
831 
832  return;
833 
834 }
835 
836 
837 #endif