QUDA v1.1.0
A library for QCD on GPUs
gauge_force_reference.cpp
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <type_traits>

#include "quda.h"
#include "gauge_field.h"
#include "host_utils.h"
#include "misc.h"
#include "gauge_force_reference.h"

// lattice geometry set up by the host test utilities: local dimensions,
// full/half volumes, and the extended (halo) dimensions and half volume
extern int Z[4];
extern int V;
extern int Vh;
extern int Vh_ex;
extern int E[4];

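/* complex arithmetic helpers operating on the real/imag structs defined below:
   CADD: c = a + b, CMUL: c = a * b, CSUM: a += b */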
#define CADD(a, b, c) \
  { \
    (c).real = (a).real + (b).real; \
    (c).imag = (a).imag + (b).imag; \
  }
#define CMUL(a, b, c) \
  { \
    (c).real = (a).real * (b).real - (a).imag * (b).imag; \
    (c).imag = (a).real * (b).imag + (a).imag * (b).real; \
  }
#define CSUM(a, b) \
  { \
    (a).real += (b).real; \
    (a).imag += (b).imag; \
  }

/* c = a* * b */
#define CMULJ_(a, b, c) \
  { \
    (c).real = (a).real * (b).real + (a).imag * (b).imag; \
    (c).imag = (a).real * (b).imag - (a).imag * (b).real; \
  }

/* c = a * b* */
#define CMUL_J(a, b, c) \
  { \
    (c).real = (a).real * (b).real + (a).imag * (b).imag; \
    (c).imag = (a).imag * (b).real - (a).real * (b).imag; \
  }

/* b = a* */
#define CONJG(a, b) \
  { \
    (b).real = (a).real; \
    (b).imag = -(a).imag; \
  }

/* single-precision complex */
typedef struct {
  float real;
  float imag;
} fcomplex;

/* specific for double complex */
typedef struct {
  double real;
  double imag;
} dcomplex;

typedef struct {
  fcomplex e[3][3];
} fsu3_matrix;
typedef struct {
  fcomplex c[3];
} fsu3_vector;
typedef struct {
  dcomplex e[3][3];
} dsu3_matrix;
typedef struct {
  dcomplex c[3];
} dsu3_vector;

/* compressed (traceless anti-hermitian) matrix storage */
typedef struct {
  fcomplex m01, m02, m12;
  float m00im, m11im, m22im;
  float space;
} fanti_hermitmat;

typedef struct {
  dcomplex m01, m02, m12;
  double m00im, m11im, m22im;
  double space;
} danti_hermitmat;

extern int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1);

/* b = a^dagger */
template <typename su3_matrix> void su3_adjoint(su3_matrix *a, su3_matrix *b)
{
  for (int i = 0; i < 3; i++) {
    for (int j = 0; j < 3; j++) { CONJG(a->e[j][i], b->e[i][j]); }
  }
}

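/* Project m3 onto its traceless anti-hermitian part in compressed form:
   the average imaginary trace is subtracted from the diagonal and the
   off-diagonal entries are antisymmetrized. */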
template <typename su3_matrix, typename anti_hermitmat> void make_anti_hermitian(su3_matrix *m3, anti_hermitmat *ah3)
{
  auto temp = (m3->e[0][0].imag + m3->e[1][1].imag + m3->e[2][2].imag) * 0.33333333333333333;
  ah3->m00im = m3->e[0][0].imag - temp;
  ah3->m11im = m3->e[1][1].imag - temp;
  ah3->m22im = m3->e[2][2].imag - temp;
  ah3->m01.real = (m3->e[0][1].real - m3->e[1][0].real) * 0.5;
  ah3->m02.real = (m3->e[0][2].real - m3->e[2][0].real) * 0.5;
  ah3->m12.real = (m3->e[1][2].real - m3->e[2][1].real) * 0.5;
  ah3->m01.imag = (m3->e[0][1].imag + m3->e[1][0].imag) * 0.5;
  ah3->m02.imag = (m3->e[0][2].imag + m3->e[2][0].imag) * 0.5;
  ah3->m12.imag = (m3->e[1][2].imag + m3->e[2][1].imag) * 0.5;
}

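/* Expand the compressed anti-hermitian storage back into a full 3x3 matrix:
   zero real diagonal, imaginary diagonal from m00im/m11im/m22im, and the
   lower triangle reconstructed as the negative conjugate of the upper. */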
template <typename anti_hermitmat, typename su3_matrix>
static void uncompress_anti_hermitian(anti_hermitmat *mat_antihermit, su3_matrix *mat_su3)
{
  typename std::remove_reference<decltype(mat_antihermit->m00im)>::type temp1;
  mat_su3->e[0][0].imag = mat_antihermit->m00im;
  mat_su3->e[0][0].real = 0.;
  mat_su3->e[1][1].imag = mat_antihermit->m11im;
  mat_su3->e[1][1].real = 0.;
  mat_su3->e[2][2].imag = mat_antihermit->m22im;
  mat_su3->e[2][2].real = 0.;
  mat_su3->e[0][1].imag = mat_antihermit->m01.imag;
  temp1 = mat_antihermit->m01.real;
  mat_su3->e[0][1].real = temp1;
  mat_su3->e[1][0].real = -temp1;
  mat_su3->e[1][0].imag = mat_antihermit->m01.imag;
  mat_su3->e[0][2].imag = mat_antihermit->m02.imag;
  temp1 = mat_antihermit->m02.real;
  mat_su3->e[0][2].real = temp1;
  mat_su3->e[2][0].real = -temp1;
  mat_su3->e[2][0].imag = mat_antihermit->m02.imag;
  mat_su3->e[1][2].imag = mat_antihermit->m12.imag;
  temp1 = mat_antihermit->m12.real;
  mat_su3->e[1][2].real = temp1;
  mat_su3->e[2][1].real = -temp1;
  mat_su3->e[2][1].imag = mat_antihermit->m12.imag;
}

/* c = a - s * b (elementwise) */
template <typename su3_matrix, typename Float>
static void scalar_mult_sub_su3_matrix(su3_matrix *a, su3_matrix *b, Float s, su3_matrix *c)
{
  for (int i = 0; i < 3; i++) {
    for (int j = 0; j < 3; j++) {
      c->e[i][j].real = a->e[i][j].real - s * b->e[i][j].real;
      c->e[i][j].imag = a->e[i][j].imag - s * b->e[i][j].imag;
    }
  }
}

/* c = a + s * b (elementwise) */
template <typename su3_matrix, typename Float>
static void scalar_mult_add_su3_matrix(su3_matrix *a, su3_matrix *b, Float s, su3_matrix *c)
{
  for (int i = 0; i < 3; i++) {
    for (int j = 0; j < 3; j++) {
      c->e[i][j].real = a->e[i][j].real + s * b->e[i][j].real;
      c->e[i][j].imag = a->e[i][j].imag + s * b->e[i][j].imag;
    }
  }
}

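/* 3x3 matrix products: _nn computes c = a * b, _an computes c = a^dagger * b,
   and _na computes c = a * b^dagger */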
template <typename su3_matrix> static void mult_su3_nn(su3_matrix *a, su3_matrix *b, su3_matrix *c)
{
  typename std::remove_reference<decltype(a->e[0][0])>::type x, y;
  for (int i = 0; i < 3; i++) {
    for (int j = 0; j < 3; j++) {
      x.real = x.imag = 0.0;
      for (int k = 0; k < 3; k++) {
        CMUL(a->e[i][k], b->e[k][j], y);
        CSUM(x, y);
      }
      c->e[i][j] = x;
    }
  }
}

template <typename su3_matrix> static void mult_su3_an(su3_matrix *a, su3_matrix *b, su3_matrix *c)
{
  typename std::remove_reference<decltype(a->e[0][0])>::type x, y;
  for (int i = 0; i < 3; i++) {
    for (int j = 0; j < 3; j++) {
      x.real = x.imag = 0.0;
      for (int k = 0; k < 3; k++) {
        CMULJ_(a->e[k][i], b->e[k][j], y);
        CSUM(x, y);
      }
      c->e[i][j] = x;
    }
  }
}

template <typename su3_matrix> static void mult_su3_na(su3_matrix *a, su3_matrix *b, su3_matrix *c)
{
  typename std::remove_reference<decltype(a->e[0][0])>::type x, y;
  for (int i = 0; i < 3; i++) {
    for (int j = 0; j < 3; j++) {
      x.real = x.imag = 0.0;
      for (int k = 0; k < 3; k++) {
        CMUL_J(a->e[i][k], b->e[j][k], y);
        CSUM(x, y);
      }
      c->e[i][j] = x;
    }
  }
}

template <typename su3_matrix> void print_su3_matrix(su3_matrix *a)
{
  for (int i = 0; i < 3; i++) {
    for (int j = 0; j < 3; j++) { printf("(%f %f)\t", a->e[i][j].real, a->e[i][j].imag); }
    printf("\n");
  }
}

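/* Full-lattice index of the neighbor of site i displaced by (dx1, dx2, dx3, dx4).
   Site indices use even/odd (checkerboard) ordering; with MULTI_GPU the result
   indexes the extended volume, whose halo is two sites deep in each direction,
   otherwise the displacement wraps periodically. */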
int gf_neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
{
  int oddBit = 0;
  int half_idx = i;
  if (i >= Vh) {
    oddBit = 1;
    half_idx = i - Vh;
  }
  int X1 = Z[0];
  int X2 = Z[1];
  int X3 = Z[2];
  // int X4 = Z[3];
  int X1h = X1 / 2;

  int za = half_idx / X1h;
  int x1h = half_idx - za * X1h;
  int zb = za / X2;
  int x2 = za - zb * X2;
  int x4 = zb / X3;
  int x3 = zb - x4 * X3;
  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
  int x1 = 2 * x1h + x1odd;

#ifdef MULTI_GPU
  x4 = x4 + dx4;
  x3 = x3 + dx3;
  x2 = x2 + dx2;
  x1 = x1 + dx1;

  int nbr_half_idx = ((x4 + 2) * (E[2] * E[1] * E[0]) + (x3 + 2) * (E[1] * E[0]) + (x2 + 2) * (E[0]) + (x1 + 2)) / 2;
#else
  x4 = (x4 + dx4 + Z[3]) % Z[3];
  x3 = (x3 + dx3 + Z[2]) % Z[2];
  x2 = (x2 + dx2 + Z[1]) % Z[1];
  x1 = (x1 + dx1 + Z[0]) % Z[0];

  int nbr_half_idx = (x4 * (Z[2] * Z[1] * Z[0]) + x3 * (Z[1] * Z[0]) + x2 * (Z[0]) + x1) / 2;
#endif

  int oddBitChanged = (dx4 + dx3 + dx2 + dx1) % 2;
  if (oddBitChanged) { oddBit = 1 - oddBit; }
  int ret = nbr_half_idx;
  if (oddBit) {
#ifdef MULTI_GPU
    ret = Vh_ex + nbr_half_idx;
#else
    ret = Vh + nbr_half_idx;
#endif
  }

  return ret;
}

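/* A path is a list of link directions; GOES_FORWARDS selects forward links
   (indices 0-3), while backward entries are mapped through OPP_DIR and use
   the adjoint of the link starting at the displaced site. */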
// this function computes one path for all lattice sites
template <typename su3_matrix, typename Float>
static void compute_path_product(su3_matrix *staple, su3_matrix **sitelink, su3_matrix **sitelink_ex_2d, int *path,
                                 int len, Float loop_coeff, int dir)
{
  su3_matrix prev_matrix, curr_matrix, tmat;
  int dx[4];

  for (int i = 0; i < V; i++) {
    memset(dx, 0, sizeof(dx));
    memset(&curr_matrix, 0, sizeof(curr_matrix));

    // start from the identity matrix
    curr_matrix.e[0][0].real = 1.0;
    curr_matrix.e[1][1].real = 1.0;
    curr_matrix.e[2][2].real = 1.0;

    dx[dir] = 1;
    for (int j = 0; j < len; j++) {
      int lnkdir;

      prev_matrix = curr_matrix;
      if (GOES_FORWARDS(path[j])) {
        // dx[path[j]] += 1;
        lnkdir = path[j];
      } else {
        dx[OPP_DIR(path[j])] -= 1;
        lnkdir = OPP_DIR(path[j]);
      }

      int nbr_idx = gf_neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
#ifdef MULTI_GPU
      su3_matrix *lnk = sitelink_ex_2d[lnkdir] + nbr_idx;
#else
      su3_matrix *lnk = sitelink[lnkdir] + nbr_idx;
#endif
      if (GOES_FORWARDS(path[j])) {
        mult_su3_nn(&prev_matrix, lnk, &curr_matrix);
      } else {
        mult_su3_na(&prev_matrix, lnk, &curr_matrix);
      }

      if (GOES_FORWARDS(path[j])) {
        dx[path[j]] += 1;
      } else {
        // we already subtracted one in the code above
      }

    } // j

    su3_adjoint(&curr_matrix, &tmat);
    scalar_mult_add_su3_matrix(staple + i, &tmat, loop_coeff, staple + i);
  } // i
}

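/* Momentum update for one direction: at every site form U_dir(x) * staple^dagger(x),
   subtract eb3 times it from the current momentum, and store the traceless
   anti-hermitian part back in compressed form. */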
template <typename su3_matrix, typename anti_hermitmat, typename Float>
static void update_mom(anti_hermitmat *momentum, int dir, su3_matrix **sitelink, su3_matrix *staple, Float eb3)
{
  for (int i = 0; i < V; i++) {
    su3_matrix tmat1;
    su3_matrix tmat2;
    su3_matrix tmat3;

    su3_matrix *lnk = sitelink[dir] + i;
    su3_matrix *stp = staple + i;
    anti_hermitmat *mom = momentum + 4 * i + dir;

    mult_su3_na(lnk, stp, &tmat1);
    uncompress_anti_hermitian(mom, &tmat2);

    scalar_mult_sub_su3_matrix(&tmat2, &tmat1, eb3, &tmat3);
    make_anti_hermitian(&tmat3, mom);
  }
}

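/* The staple accumulator below is allocated on the host in the matching
   precision; every path adds loop_coeff times the adjoint of its link product,
   and the result is then fed into the momentum update. */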
/* This function only computes one direction @dir */
void gauge_force_reference_dir(void *refMom, int dir, double eb3, void **sitelink, void **sitelink_ex_2d,
                               QudaPrecision prec, int **path_dir, int *length, void *loop_coeff, int num_paths)
{
  void *staple;
  int gSize = prec; // the QudaPrecision enum value doubles as the word size in bytes

  staple = malloc(V * gauge_site_size * gSize);
  if (staple == NULL) {
    fprintf(stderr, "ERROR: malloc failed for staple in function %s\n", __FUNCTION__);
    exit(1);
  }

  memset(staple, 0, V * gauge_site_size * gSize);

  for (int i = 0; i < num_paths; i++) {
    if (prec == QUDA_DOUBLE_PRECISION) {
      double *my_loop_coeff = (double *)loop_coeff;
      compute_path_product((dsu3_matrix *)staple, (dsu3_matrix **)sitelink, (dsu3_matrix **)sitelink_ex_2d, path_dir[i],
                           length[i], my_loop_coeff[i], dir);
    } else {
      float *my_loop_coeff = (float *)loop_coeff;
      compute_path_product((fsu3_matrix *)staple, (fsu3_matrix **)sitelink, (fsu3_matrix **)sitelink_ex_2d, path_dir[i],
                           length[i], my_loop_coeff[i], dir);
    }
  }

  if (prec == QUDA_DOUBLE_PRECISION) {
    update_mom((danti_hermitmat *)refMom, dir, (dsu3_matrix **)sitelink, (dsu3_matrix *)staple, (double)eb3);
  } else {
    update_mom((fanti_hermitmat *)refMom, dir, (fsu3_matrix **)sitelink, (fsu3_matrix *)staple, (float)eb3);
  }

  free(staple);
}

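/* Top-level reference driver: build an extended gauge field with a two-site
   halo from the input links, then accumulate the force for each of the four
   directions. */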
void gauge_force_reference(void *refMom, double eb3, void **sitelink, QudaPrecision prec, int ***path_dir, int *length,
                           void *loop_coeff, int num_paths)
{
  // create the extended field
  int R[] = {2, 2, 2, 2};
  QudaGaugeParam param = newQudaGaugeParam();
  setGaugeParam(param);
  param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  param.t_boundary = QUDA_PERIODIC_T;

  auto qdp_ex = quda::createExtendedGauge((void **)sitelink, param, R);

  for (int dir = 0; dir < 4; dir++) {
    gauge_force_reference_dir(refMom, dir, eb3, sitelink, (void **)qdp_ex->Gauge_p(), prec, path_dir[dir], length,
                              loop_coeff, num_paths);
  }

  delete qdp_ex;
}