QUDA  v1.1.0
A library for QCD on GPUs
staggered_host_utils.cpp
Go to the documentation of this file.
1 #include <complex>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include <short.h>
6 
7 // QUDA headers
8 #include <unitarization_links.h>
9 
10 // External headers
11 #include <llfat_utils.h>
12 #include <staggered_gauge_utils.h>
13 #include <host_utils.h>
14 #include <command_line_params.h>
15 
16 #include <qio_field.h>
17 
18 #define MAX(a, b) ((a) > (b) ? (a) : (b))
19 
20 #define XUP 0
21 #define YUP 1
22 #define ZUP 2
23 #define TUP 3
24 
25 using namespace std;
26 
27 // Staggered gauge field utils
28 //------------------------------------------------------
// Wrap raw MILC-layout fat- and long-link host buffers in quda::GaugeField
// objects (presumably so ghost zones can be allocated/exchanged for them).
// NOTE(review): this Doxygen listing omits original lines 33-34, 37, 40 and 42,
// which presumably configure cpuFatParam/cpuLongParam (create type, ghost
// exchange, link type) before each Create() call -- confirm against full source.
// NOTE(review): as shown, cpuFat/cpuLong are pointers passed by value, so the
// assignments below cannot reach the caller; the full source likely takes them
// by reference (GaugeField *&) -- verify before relying on this listing.
29 void constructStaggeredHostGhostGaugeField(quda::GaugeField *cpuFat, quda::GaugeField *cpuLong, void *milc_fatlink,
30  void *milc_longlink, QudaGaugeParam &gauge_param)
31 {
32 
35 
36  GaugeFieldParam cpuFatParam(milc_fatlink, gauge_param);
38  cpuFat = GaugeField::Create(cpuFatParam);
39 
41  GaugeFieldParam cpuLongParam(milc_longlink, gauge_param);
43  cpuLong = GaugeField::Create(cpuLongParam);
44 }
45 
// Build (or load from file) a phase-free input gauge field, then populate the
// CPU- and GPU-side fat/long link buffers used by the staggered/asqtad tests.
// NOTE(review): the listing omits original lines 54-55, 61, 65, 71 and 82.
// The dangling "} else {" branches below imply the missing lines contain
// dslash_type conditionals (e.g. QUDA_LAPLACE_DSLASH vs. fat/long construction)
// and trailing call arguments -- do not treat the control flow shown here as
// complete; confirm against the full source.
46 void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu,
47  void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param,
48  int argc, char **argv, bool &gauge_loaded)
49 {
50  // load a field WITHOUT PHASES
51  if (strcmp(latfile, "")) {
52  if (!gauge_loaded) {
53  read_gauge_field(latfile, qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv);
56  }
57  gauge_loaded = true;
58  } // else it's already been loaded
59  } else {
60  int construct_type = (unit_gauge) ? 0 : 1;
62  constructQudaGaugeField(qdp_inlink, construct_type, gauge_param.cpu_prec, &gauge_param);
63  } else {
64  constructFatLongGaugeField(qdp_inlink, qdp_longlink_cpu, construct_type, gauge_param.cpu_prec, &gauge_param,
66  }
67  }
68 
69  // QUDA_STAGGERED_DSLASH follows the same codepath whether or not you
70  // "compute" the fat/long links or not.
// For plain staggered: fat links are just copies of the input links and the
// long links are zeroed on both CPU and GPU sides.
72  for (int dir = 0; dir < 4; dir++) {
73  memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
74  memcpy(qdp_fatlink_cpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
75  memset(qdp_longlink_gpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
76  memset(qdp_longlink_cpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
77  }
78  } else {
79  // QUDA_ASQTAD_DSLASH
80  if (compute_fatlong) {
81  computeFatLongGPUandCPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_fatlink_cpu, qdp_longlink_cpu, qdp_inlink,
83  } else {
84  // Not computing FatLong
85  for (int dir = 0; dir < 4; dir++) {
86  memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
87  memcpy(qdp_fatlink_cpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
88  memcpy(qdp_longlink_gpu[dir], qdp_longlink_cpu[dir], V * gauge_site_size * host_gauge_data_type_size);
89  }
90  }
91  }
92 }
93 
// Single-buffer variant of the routine above: build or load the input gauge
// field, then fill one set of fat/long link buffers for host-side use.
// NOTE(review): the listing omits original lines 97, 102-103, 107, 111 and 117;
// the dangling "} else {" branches imply missing dslash_type conditionals and
// trailing call arguments -- confirm against the full source.
94 void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink,
95  QudaGaugeParam &gauge_param, int argc, char **argv)
96 {
98 
99  if (strcmp(latfile, "")) {
100  // load in the command line supplied gauge field using QIO and LIME
101  read_gauge_field(latfile, qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv);
104  }
105  } else {
106  int construct_type = (unit_gauge) ? 0 : 1;
108  constructQudaGaugeField(qdp_inlink, construct_type, gauge_param.cpu_prec, &gauge_param);
109  } else {
110  constructFatLongGaugeField(qdp_inlink, qdp_longlink, construct_type, gauge_param.cpu_prec, &gauge_param,
112  }
113  }
114 
115  // QUDA_STAGGERED_DSLASH follows the same codepath whether or not you
116  // "compute" the fat/long links or not.
// Plain staggered: fat links copy the input links, long links are zeroed.
118  for (int dir = 0; dir < 4; dir++) {
119  memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
120  memset(qdp_longlink[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
121  }
122  } else {
123  // QUDA_ASQTAD_DSLASH
124  if (compute_fatlong) {
125  computeFatLongGPU(qdp_fatlink, qdp_longlink, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks, eps_naik);
126  } else {
127  for (int dir = 0; dir < 4; dir++) {
128  memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
129  }
130  }
131  }
132 }
133 
// Fill fat/long link buffers with test data: type 0 = unit gauge, otherwise
// random links (or staggered-scaled links when type == 3), with optional
// random U(1) phase applied to the long links and a zeroing pass used to
// emulate the 1-link operator for host comparison.
// NOTE(review): the listing omits original line 135 (the remaining parameters,
// per the symbol index: "QudaGaugeParam *param, QudaDslashType dslash_type")
// as well as lines 146-150, 154, 159-160, 167, 173-174, 182 and 206, which
// presumably contain applyGaugeFieldScaling calls and the dslash_type /
// long-link guards for the blocks below -- confirm against the full source.
134 void constructFatLongGaugeField(void **fatlink, void **longlink, int type, QudaPrecision precision,
136 {
137  if (type == 0) {
138  if (precision == QUDA_DOUBLE_PRECISION) {
139  constructUnitGaugeField((double **)fatlink, param);
140  constructUnitGaugeField((double **)longlink, param);
141  } else {
142  constructUnitGaugeField((float **)fatlink, param);
143  constructUnitGaugeField((float **)longlink, param);
144  } // apply phases
145 
147 
150 
151  } else {
152  if (precision == QUDA_DOUBLE_PRECISION) {
153  // if doing naive staggered then set to long links so that the staggered phase is applied
155  if (type != 3)
156  constructRandomGaugeField((double **)fatlink, param, dslash_type);
157  else
158  applyStaggeredScaling((double **)fatlink, param, type);
161  if (type != 3)
162  constructRandomGaugeField((double **)longlink, param, dslash_type);
163  else
164  applyStaggeredScaling((double **)longlink, param, type);
165  }
166  } else {
168  if (type != 3)
169  constructRandomGaugeField((float **)fatlink, param, dslash_type);
170  else
171  applyStaggeredScaling((float **)fatlink, param, type);
172 
175  if (type != 3)
176  constructRandomGaugeField((float **)longlink, param, dslash_type);
177  else
178  applyStaggeredScaling((float **)longlink, param, type);
179  }
180  }
181 
183  // incorporate non-trivial phase into long links
// Multiply every complex element of each long link by a fixed random phase
// z = exp(i*phi), phi uniform in [0, pi] (same phase for all links).
184  const double phase = (M_PI * rand()) / RAND_MAX;
185  const complex<double> z = polar(1.0, phase);
186  for (int dir = 0; dir < 4; ++dir) {
187  for (int i = 0; i < V; ++i) {
188  for (int j = 0; j < gauge_site_size; j += 2) {
189  if (precision == QUDA_DOUBLE_PRECISION) {
190  complex<double> *l = (complex<double> *)(&(((double *)longlink[dir])[i * gauge_site_size + j]));
191  *l *= z;
192  } else {
193  complex<float> *l = (complex<float> *)(&(((float *)longlink[dir])[i * gauge_site_size + j]));
194  *l *= z;
195  }
196  }
197  }
198  }
199  }
200 
201  if (type == 3) return;
202  }
203 
204  // set all links to zero to emulate the 1-link operator (needed for host comparison)
205  // FIXME: may break host comparison
207  for (int dir = 0; dir < 4; ++dir) {
208  for (int i = 0; i < V; ++i) {
209  for (int j = 0; j < gauge_site_size; j += 2) {
210  if (precision == QUDA_DOUBLE_PRECISION) {
211  ((double *)longlink[dir])[i * gauge_site_size + j] = 0.0;
212  ((double *)longlink[dir])[i * gauge_site_size + j + 1] = 0.0;
213  } else {
214  ((float *)longlink[dir])[i * gauge_site_size + j] = 0.0;
215  ((float *)longlink[dir])[i * gauge_site_size + j + 1] = 0.0;
216  }
217  }
218  }
219  }
220  }
221 }
222 
// Upload MILC-layout fat and long links to the device via loadGaugeQuda,
// computing the ghost-zone pad sizes needed for multi-GPU runs (largest
// half-face of the local lattice; long links use a 3x pad for the 3-hop term).
// NOTE(review): the listing omits original lines 240-242, 245-248, 250-254 and
// 258-265, which presumably set gauge_param.type / reconstruct / staggered
// phase fields and guard the long-link load on the dslash type -- the "} else {"
// and trailing "}" below belong to those missing conditionals; confirm against
// the full source.
223 void loadFatLongGaugeQuda(void *milc_fatlink, void *milc_longlink, QudaGaugeParam &gauge_param)
224 {
225  // Specific gauge parameters for MILC
226  int pad_size = 0;
227 #ifdef MULTI_GPU
228  int x_face_size = gauge_param.X[1] * gauge_param.X[2] * gauge_param.X[3] / 2;
229  int y_face_size = gauge_param.X[0] * gauge_param.X[2] * gauge_param.X[3] / 2;
230  int z_face_size = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[3] / 2;
231  int t_face_size = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[2] / 2;
232  pad_size = MAX(x_face_size, y_face_size);
233  pad_size = MAX(pad_size, z_face_size);
234  pad_size = MAX(pad_size, t_face_size);
235 #endif
236 
237  int fat_pad = pad_size;
238  int link_pad = 3 * pad_size;
239 
243 
244  gauge_param.ga_pad = fat_pad;
249  } else {
253  }
255 
256  loadGaugeQuda(milc_fatlink, &gauge_param);
257 
260  gauge_param.ga_pad = link_pad;
266  loadGaugeQuda(milc_longlink, &gauge_param);
267  }
268 }
269 
270 #ifndef MULTI_GPU
271 template <typename su3_matrix, typename Float>
272 void computeLongLinkCPU(void **longlink, su3_matrix **sitelink, Float *act_path_coeff)
273 {
274 
275  su3_matrix temp;
276  for (int dir = XUP; dir <= TUP; ++dir) {
277  int dx[4] = {0, 0, 0, 0};
278  for (int i = 0; i < V; ++i) {
279  // Initialize the longlinks
280  su3_matrix *llink = ((su3_matrix *)longlink[dir]) + i;
281  llfat_scalar_mult_su3_matrix(sitelink[dir] + i, act_path_coeff[1], llink);
282  dx[dir] = 1;
283  int nbr_idx = neighborIndexFullLattice(Z, i, dx);
284  llfat_mult_su3_nn(llink, sitelink[dir] + nbr_idx, &temp);
285  dx[dir] = 2;
286  nbr_idx = neighborIndexFullLattice(Z, i, dx);
287  llfat_mult_su3_nn(&temp, sitelink[dir] + nbr_idx, llink);
288  }
289  }
290  return;
291 }
292 #else
293 
294 template <typename su3_matrix, typename Float>
295 void computeLongLinkCPU(void **longlink, su3_matrix **sitelinkEx, Float *act_path_coeff)
296 {
297  int E[4];
298  for (int dir = 0; dir < 4; ++dir) E[dir] = Z[dir] + 4;
299  const int extended_volume = E[3] * E[2] * E[1] * E[0];
300 
301  su3_matrix temp;
302  for (int t = 0; t < Z[3]; ++t) {
303  for (int z = 0; z < Z[2]; ++z) {
304  for (int y = 0; y < Z[1]; ++y) {
305  for (int x = 0; x < Z[0]; ++x) {
306  const int oddBit = (x + y + z + t) & 1;
307  int little_index = ((((t * Z[2] + z) * Z[1] + y) * Z[0] + x) / 2) + oddBit * Vh;
308  int large_index
309  = (((((t + 2) * E[2] + (z + 2)) * E[1] + (y + 2)) * E[0] + x + 2) / 2) + oddBit * (extended_volume / 2);
310 
311  for (int dir = XUP; dir <= TUP; ++dir) {
312  int dx[4] = {0, 0, 0, 0};
313  su3_matrix *llink = ((su3_matrix *)longlink[dir]) + little_index;
314  llfat_scalar_mult_su3_matrix(sitelinkEx[dir] + large_index, act_path_coeff[1], llink);
315  dx[dir] = 1;
316  int nbr_index = neighborIndexFullLattice(E, large_index, dx);
317  llfat_mult_su3_nn(llink, sitelinkEx[dir] + nbr_index, &temp);
318  dx[dir] = 2;
319  nbr_index = neighborIndexFullLattice(E, large_index, dx);
320  llfat_mult_su3_nn(&temp, sitelinkEx[dir] + nbr_index, llink);
321  }
322  } // x
323  } // y
324  } // z
325  } // t
326  return;
327 }
328 #endif
329 
// Precision dispatcher for the templated computeLongLinkCPU routines above.
// A null longlink pointer is a no-op.
// NOTE(review): the listing omits original lines 334 and 338, which must be
// the switch case labels (presumably "case QUDA_DOUBLE_PRECISION:" and
// "case QUDA_SINGLE_PRECISION:") -- confirm against the full source.
330 void computeLongLinkCPU(void **longlink, void **sitelink, QudaPrecision prec, void *act_path_coeff)
331 {
332  if (longlink) {
333  switch (prec) {
335  computeLongLinkCPU((void **)longlink, (su3_matrix<double> **)sitelink, (double *)act_path_coeff);
336  break;
337 
339  computeLongLinkCPU((void **)longlink, (su3_matrix<float> **)sitelink, (float *)act_path_coeff);
340  break;
341  default:
342  fprintf(stderr, "ERROR: unsupported precision(%d)\n", prec);
343  exit(1);
344  break;
345  }
346  } // if(longlink)
347 }
348 
349 // Compute the full HISQ stencil on the CPU.
350 // If "eps_naik" is 0, there's no naik correction,
351 // and this routine skips building the paths in "act_path_coeffs[2]"
// Reference CPU construction of the full HISQ stencil:
//   1) fat7-smear the thin links into V links (act_path_coeffs[0]),
//   2) unitarize V -> W links,
//   3) optionally build the Naik-correction fat/long links from W
//      (act_path_coeffs[2], only when eps_naik != 0),
//   4) build the X (fat) and long links from W (act_path_coeffs[1]).
// Results go to fatlink/longlink; the eps_naik-rescaled variants go to
// fatlink_eps/longlink_eps when n_naiks == 2.
// NOTE(review): this Doxygen listing drops a number of original lines
// (360-361, 370-373, 424-427, 451-454, 494-497, 504-511, 522-525, 563-566,
// 597-600, 629-632, 663-665). Most appear to be banner comments and gParam
// field assignments, but lines 504-511 must also contain the construction of
// cpuVLink/cpuWLink used below -- confirm against the full source.
352 void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink,
353  void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik)
354 {
355  // Prepare various things
356  QudaGaugeParam &qudaGaugeParam = *((QudaGaugeParam *)qudaGaugeParamPtr);
357  // Needed for unitarization, following "unitarize_link_test.cpp"
358  quda::GaugeFieldParam gParam(0, qudaGaugeParam);
359  gParam.pad = 0;
362  gParam.order = QUDA_MILC_GAUGE_ORDER; // must be true!
363 
364  const QudaPrecision prec = qudaGaugeParam.cpu_prec;
// NOTE(review): gSize relies on QudaPrecision enum values equalling the
// per-real byte size (4/8) -- a QUDA-wide convention; verify before reuse.
365  const size_t gSize = prec;
366 
367  // Compute n_naiks
368  const int n_naiks = (eps_naik == 0.0 ? 1 : 2);
369 
371  // Create extended CPU field //
373 
374  void *sitelink_ex[4];
375  for (int i = 0; i < 4; i++) sitelink_ex[i] = pinned_malloc(V_ex * gauge_site_size * gSize);
376 
377 #ifdef MULTI_GPU
378  void *ghost_sitelink[4];
379  void *ghost_sitelink_diag[16];
380 #endif
381 
382  int X1 = Z[0];
383  int X2 = Z[1];
384  int X3 = Z[2];
385  int X4 = Z[3];
386 
// Scatter the local links into the 2-deep halo-extended field. Each extended
// checkerboard index i is decoded into coordinates; interior sites copy from
// the corresponding local site, and (single-process only) halo sites wrap
// around periodically. Under MULTI_GPU halo sites are skipped here and filled
// later by the ghost exchange.
387  for (int i = 0; i < V_ex; i++) {
388  int sid = i;
389  int oddBit = 0;
390  if (i >= Vh_ex) {
391  sid = i - Vh_ex;
392  oddBit = 1;
393  }
394 
395  int za = sid / E1h;
396  int x1h = sid - za * E1h;
397  int zb = za / E2;
398  int x2 = za - zb * E2;
399  int x4 = zb / E3;
400  int x3 = zb - x4 * E3;
401  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
402  int x1 = 2 * x1h + x1odd;
403 
404  if (x1 < 2 || x1 >= X1 + 2 || x2 < 2 || x2 >= X2 + 2 || x3 < 2 || x3 >= X3 + 2 || x4 < 2 || x4 >= X4 + 2) {
405 #ifdef MULTI_GPU
406  continue;
407 #endif
408  }
409 
410  x1 = (x1 - 2 + X1) % X1;
411  x2 = (x2 - 2 + X2) % X2;
412  x3 = (x3 - 2 + X3) % X3;
413  x4 = (x4 - 2 + X4) % X4;
414 
415  int idx = (x4 * X3 * X2 * X1 + x3 * X2 * X1 + x2 * X1 + x1) >> 1;
416  if (oddBit) { idx += Vh; }
417  for (int dir = 0; dir < 4; dir++) {
418  char *src = (char *)sitelink[dir];
419  char *dst = (char *)sitelink_ex[dir];
420  memcpy(dst + i * gauge_site_size * gSize, src + idx * gauge_site_size * gSize, gauge_site_size * gSize);
421  } // dir
422  } // i
423 
425  // Allocate all CPU intermediaries //
427 
428  void *v_reflink[4]; // V link -- fat7 smeared link
429  void *w_reflink[4]; // unitarized V link
430  void *w_reflink_ex[4]; // extended W link
431  for (int i = 0; i < 4; i++) {
432  v_reflink[i] = safe_malloc(V * gauge_site_size * gSize);
433  w_reflink[i] = safe_malloc(V * gauge_site_size * gSize);
434  w_reflink_ex[i] = safe_malloc(V_ex * gauge_site_size * gSize);
435  }
436 
437 #ifdef MULTI_GPU
438  void *ghost_wlink[4];
439  void *ghost_wlink_diag[16];
440 #endif
441 
442  // Copy of V link needed for CPU unitarization routines
443  void *v_sitelink = pinned_malloc(4 * V * gauge_site_size * gSize);
444 
445  // FIXME: we have this complication because references takes coeff as float/double
446  // depending on the precision while the GPU code aways take coeff as double
447  void *coeff;
448  double coeff_dp[6];
449  float coeff_sp[6];
450 
452  // Create V links (fat7 links), 1st path table set //
454 
455  for (int i = 0; i < 6; i++) coeff_sp[i] = coeff_dp[i] = act_path_coeffs[0][i];
456  coeff = (prec == QUDA_DOUBLE_PRECISION) ? (void *)coeff_dp : (void *)coeff_sp;
457 
458  // Only need fat links.
459 #ifdef MULTI_GPU
460  int optflag = 0;
461  // we need x,y,z site links in the back and forward T slice
462  // so it is 3*2*Vs_t
463  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
464  for (int i = 0; i < 4; i++) ghost_sitelink[i] = safe_malloc(8 * Vs[i] * gauge_site_size * gSize);
465 
466  // nu | |
467  // |_____|
468  // mu
469 
// Allocate the "diagonal" (corner) ghost buffers for each distinct (nu, mu)
// plane; dir1/dir2 pick out the two directions orthogonal to nu and mu.
470  for (int nu = 0; nu < 4; nu++) {
471  for (int mu = 0; mu < 4; mu++) {
472  if (nu == mu) {
473  ghost_sitelink_diag[nu * 4 + mu] = NULL;
474  } else {
475  // the other directions
476  int dir1, dir2;
477  for (dir1 = 0; dir1 < 4; dir1++) {
478  if (dir1 != nu && dir1 != mu) { break; }
479  }
480  for (dir2 = 0; dir2 < 4; dir2++) {
481  if (dir2 != nu && dir2 != mu && dir2 != dir1) { break; }
482  }
483  ghost_sitelink_diag[nu * 4 + mu] = safe_malloc(Z[dir1] * Z[dir2] * gauge_site_size * gSize);
484  memset(ghost_sitelink_diag[nu * 4 + mu], 0, Z[dir1] * Z[dir2] * gauge_site_size * gSize);
485  }
486  }
487  }
488  exchange_cpu_sitelink(gParam.x, sitelink, ghost_sitelink, ghost_sitelink_diag, prec, &qudaGaugeParam, optflag);
489  llfat_reference_mg(v_reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, prec, coeff);
490 #else
491  llfat_reference(v_reflink, sitelink, prec, coeff);
492 #endif
493 
495  // Create W links (unitarized V links) //
497 
498  // This is based on "unitarize_link_test.cpp"
499 
500  // Format change
501  reorderQDPtoMILC(v_sitelink, v_reflink, V, gauge_site_size, prec, prec);
502 
503  // Prepare cpuGaugeFields for unitarization
505  gParam.gauge = v_sitelink;
508 
511 
512  // unitarize
513  unitarizeLinksCPU(*cpuWLink, *cpuVLink);
514 
515  // Copy back into "w_reflink"
516  reorderMILCtoQDP(w_reflink, cpuWLink->Gauge_p(), V, gauge_site_size, prec, prec);
517 
518  // Clean up cpuGaugeFields, we don't need them anymore.
519  delete cpuVLink;
520  delete cpuWLink;
521 
523  // Prepare for extended W fields //
525 
// Same interior/halo scatter as above, now applied to the W links.
526  for (int i = 0; i < V_ex; i++) {
527  int sid = i;
528  int oddBit = 0;
529  if (i >= Vh_ex) {
530  sid = i - Vh_ex;
531  oddBit = 1;
532  }
533 
534  int za = sid / E1h;
535  int x1h = sid - za * E1h;
536  int zb = za / E2;
537  int x2 = za - zb * E2;
538  int x4 = zb / E3;
539  int x3 = zb - x4 * E3;
540  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
541  int x1 = 2 * x1h + x1odd;
542 
543  if (x1 < 2 || x1 >= X1 + 2 || x2 < 2 || x2 >= X2 + 2 || x3 < 2 || x3 >= X3 + 2 || x4 < 2 || x4 >= X4 + 2) {
544 #ifdef MULTI_GPU
545  continue;
546 #endif
547  }
548 
549  x1 = (x1 - 2 + X1) % X1;
550  x2 = (x2 - 2 + X2) % X2;
551  x3 = (x3 - 2 + X3) % X3;
552  x4 = (x4 - 2 + X4) % X4;
553 
554  int idx = (x4 * X3 * X2 * X1 + x3 * X2 * X1 + x2 * X1 + x1) >> 1;
555  if (oddBit) { idx += Vh; }
556  for (int dir = 0; dir < 4; dir++) {
557  char *src = (char *)w_reflink[dir];
558  char *dst = (char *)w_reflink_ex[dir];
559  memcpy(dst + i * gauge_site_size * gSize, src + idx * gauge_site_size * gSize, gauge_site_size * gSize);
560  } // dir
561  } // i
562 
564  // Create extended W fields //
566 
567 #ifdef MULTI_GPU
568  optflag = 0;
569  // we need x,y,z site links in the back and forward T slice
570  // so it is 3*2*Vs_t
571  for (int i = 0; i < 4; i++) ghost_wlink[i] = safe_malloc(8 * Vs[i] * gauge_site_size * gSize);
572 
573  // nu | |
574  // |_____|
575  // mu
576 
577  for (int nu = 0; nu < 4; nu++) {
578  for (int mu = 0; mu < 4; mu++) {
579  if (nu == mu) {
580  ghost_wlink_diag[nu * 4 + mu] = NULL;
581  } else {
582  // the other directions
583  int dir1, dir2;
584  for (dir1 = 0; dir1 < 4; dir1++) {
585  if (dir1 != nu && dir1 != mu) { break; }
586  }
587  for (dir2 = 0; dir2 < 4; dir2++) {
588  if (dir2 != nu && dir2 != mu && dir2 != dir1) { break; }
589  }
590  ghost_wlink_diag[nu * 4 + mu] = safe_malloc(Z[dir1] * Z[dir2] * gauge_site_size * gSize);
591  memset(ghost_wlink_diag[nu * 4 + mu], 0, Z[dir1] * Z[dir2] * gauge_site_size * gSize);
592  }
593  }
594  }
595 #endif
596 
598  // Prepare to create Naiks, 3rd table set //
600 
601  if (n_naiks > 1) {
602 
603  for (int i = 0; i < 6; i++) coeff_sp[i] = coeff_dp[i] = act_path_coeffs[2][i];
604  coeff = (prec == QUDA_DOUBLE_PRECISION) ? (void *)coeff_dp : (void *)coeff_sp;
605 
606 #ifdef MULTI_GPU
607 
608  exchange_cpu_sitelink(qudaGaugeParam.X, w_reflink, ghost_wlink, ghost_wlink_diag, qudaGaugeParam.cpu_prec,
609  &qudaGaugeParam, optflag);
610  llfat_reference_mg(fatlink, w_reflink, ghost_wlink, ghost_wlink_diag, qudaGaugeParam.cpu_prec, coeff);
611 
612  {
613  int R[4] = {2, 2, 2, 2};
614  exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, w_reflink_ex, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);
615  computeLongLinkCPU(longlink, w_reflink_ex, qudaGaugeParam.cpu_prec, coeff);
616  }
617 #else
618  llfat_reference(fatlink, w_reflink, qudaGaugeParam.cpu_prec, coeff);
619  computeLongLinkCPU(longlink, w_reflink, qudaGaugeParam.cpu_prec, coeff);
620 #endif
621 
622  // Rescale fat and long links into eps links
623  for (int i = 0; i < 4; i++) {
624  cpu_axy(prec, eps_naik, fatlink[i], fatlink_eps[i], V * gauge_site_size);
625  cpu_axy(prec, eps_naik, longlink[i], longlink_eps[i], V * gauge_site_size);
626  }
627  }
628 
630  // Prepare to create X links and long links, 2nd table set //
632 
633  for (int i = 0; i < 6; i++) coeff_sp[i] = coeff_dp[i] = act_path_coeffs[1][i];
634  coeff = (prec == QUDA_DOUBLE_PRECISION) ? (void *)coeff_dp : (void *)coeff_sp;
635 
636 #ifdef MULTI_GPU
637  optflag = 0;
638 
639  // We've already built the extended W fields.
640 
641  exchange_cpu_sitelink(qudaGaugeParam.X, w_reflink, ghost_wlink, ghost_wlink_diag, qudaGaugeParam.cpu_prec,
642  &qudaGaugeParam, optflag);
643  llfat_reference_mg(fatlink, w_reflink, ghost_wlink, ghost_wlink_diag, qudaGaugeParam.cpu_prec, coeff);
644 
645  {
646  int R[4] = {2, 2, 2, 2};
647  exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, w_reflink_ex, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);
648  computeLongLinkCPU(longlink, w_reflink_ex, qudaGaugeParam.cpu_prec, coeff);
649  }
650 #else
651  llfat_reference(fatlink, w_reflink, qudaGaugeParam.cpu_prec, coeff);
652  computeLongLinkCPU(longlink, w_reflink, qudaGaugeParam.cpu_prec, coeff);
653 #endif
654 
655  if (n_naiks > 1) {
656  // Accumulate into eps links.
657  for (int i = 0; i < 4; i++) {
658  cpu_xpy(prec, fatlink[i], fatlink_eps[i], V * gauge_site_size);
659  cpu_xpy(prec, longlink[i], longlink_eps[i], V * gauge_site_size);
660  }
661  }
662 
664  // Clean up //
666 
667  for (int i = 0; i < 4; i++) {
668  host_free(sitelink_ex[i]);
669  host_free(v_reflink[i]);
670  host_free(w_reflink[i]);
671  host_free(w_reflink_ex[i]);
672  }
673  host_free(v_sitelink);
674 
675 #ifdef MULTI_GPU
676  for (int i = 0; i < 4; i++) {
677  host_free(ghost_sitelink[i]);
678  host_free(ghost_wlink[i]);
679  for (int j = 0; j < 4; j++) {
680  if (i == j) continue;
681  host_free(ghost_sitelink_diag[i * 4 + j]);
682  host_free(ghost_wlink_diag[i * 4 + j]);
683  }
684  }
685 #endif
686 }
687 
// Populate a ColorSpinorParam for staggered test spinors: 3 colors, 1 spin
// component, 5th dimension of extent 1, zero-initialized on the CPU.
// NOTE(review): the listing omits original lines 688-689 (the signature --
// per the symbol index: constructStaggeredTestSpinorParam(quda::ColorSpinorParam
// *cs_param, const QudaInvertParam *inv_param, const QudaGaugeParam
// *gauge_param)) plus lines 696 (presumably computing the even/odd
// preconditioning flag "pc" used below), 699, and 704-705 (presumably
// siteSubset / field-order settings) -- confirm against the full source.
690 {
691  // Lattice vector spacetime/colour/spin/parity properties
692  cs_param->nColor = 3;
693  cs_param->nSpin = 1;
694  cs_param->nDim = 5;
695  for (int d = 0; d < 4; d++) cs_param->x[d] = gauge_param->X[d];
697  if (pc) cs_param->x[0] /= 2;
698  cs_param->x[4] = 1;
700 
701  // Lattice vector data properties
702  cs_param->setPrecision(inv_param->cpu_prec);
703  cs_param->pad = 0;
706  cs_param->gammaBasis = inv_param->gamma_basis;
707  cs_param->create = QUDA_ZERO_FIELD_CREATE;
708  cs_param->location = QUDA_CPU_FIELD_LOCATION;
709 }
710 
711 // data reordering routines
// Repack gauge data from QDP layout (one array per direction, site-major
// within each) into MILC layout (the four directions stored contiguously per
// site), converting the element type via static_cast:
//   milc[site][dir][elem] <- qdp[dir][site][elem]
template <typename Out, typename In> void reorderQDPtoMILC(Out *milc_out, In **qdp_in, int V, int siteSize)
{
  for (int dir = 0; dir < 4; dir++) {
    const In *src = qdp_in[dir];
    for (int site = 0; site < V; site++) {
      Out *dst = milc_out + (site * 4 + dir) * siteSize;
      for (int elem = 0; elem < siteSize; elem++) dst[elem] = static_cast<Out>(src[site * siteSize + elem]);
    }
  }
}
722 
723 void reorderQDPtoMILC(void *milc_out, void **qdp_in, int V, int siteSize, QudaPrecision out_precision,
724  QudaPrecision in_precision)
725 {
726  if (out_precision == QUDA_SINGLE_PRECISION) {
727  if (in_precision == QUDA_SINGLE_PRECISION) {
728  reorderQDPtoMILC<float, float>((float *)milc_out, (float **)qdp_in, V, siteSize);
729  } else if (in_precision == QUDA_DOUBLE_PRECISION) {
730  reorderQDPtoMILC<float, double>((float *)milc_out, (double **)qdp_in, V, siteSize);
731  }
732  } else if (out_precision == QUDA_DOUBLE_PRECISION) {
733  if (in_precision == QUDA_SINGLE_PRECISION) {
734  reorderQDPtoMILC<double, float>((double *)milc_out, (float **)qdp_in, V, siteSize);
735  } else if (in_precision == QUDA_DOUBLE_PRECISION) {
736  reorderQDPtoMILC<double, double>((double *)milc_out, (double **)qdp_in, V, siteSize);
737  }
738  }
739 }
740 
// Inverse of reorderQDPtoMILC: unpack MILC layout (four directions contiguous
// per site) into per-direction QDP arrays, converting the element type:
//   qdp[dir][site][elem] <- milc[site][dir][elem]
template <typename Out, typename In> void reorderMILCtoQDP(Out **qdp_out, In *milc_in, int V, int siteSize)
{
  for (int dir = 0; dir < 4; dir++) {
    Out *dst = qdp_out[dir];
    for (int site = 0; site < V; site++) {
      const In *src = milc_in + (site * 4 + dir) * siteSize;
      for (int elem = 0; elem < siteSize; elem++) dst[site * siteSize + elem] = static_cast<Out>(src[elem]);
    }
  }
}
751 
752 void reorderMILCtoQDP(void **qdp_out, void *milc_in, int V, int siteSize, QudaPrecision out_precision,
753  QudaPrecision in_precision)
754 {
755  if (out_precision == QUDA_SINGLE_PRECISION) {
756  if (in_precision == QUDA_SINGLE_PRECISION) {
757  reorderMILCtoQDP<float, float>((float **)qdp_out, (float *)milc_in, V, siteSize);
758  } else if (in_precision == QUDA_DOUBLE_PRECISION) {
759  reorderMILCtoQDP<float, double>((float **)qdp_out, (double *)milc_in, V, siteSize);
760  }
761  } else if (out_precision == QUDA_DOUBLE_PRECISION) {
762  if (in_precision == QUDA_SINGLE_PRECISION) {
763  reorderMILCtoQDP<double, float>((double **)qdp_out, (float *)milc_in, V, siteSize);
764  } else if (in_precision == QUDA_DOUBLE_PRECISION) {
765  reorderMILCtoQDP<double, double>((double **)qdp_out, (double *)milc_in, V, siteSize);
766  }
767  }
768 }
769 
// Apply staggered scaling/phases to a gauge field for the given construct
// "type".
// NOTE(review): the listing omits original lines 773-774, which must contain
// the actual work (presumably a guarded call to applyGaugeFieldScaling_long
// when type == 3) -- as shown, the function body is empty; confirm against the
// full source.
770 template <typename Float> void applyStaggeredScaling(Float **res, QudaGaugeParam *param, int type)
771 {
772 
774 
775  return;
776 }
777 
// Apply the staggered (Kogut-Susskind) phase factors and temporal boundary
// sign to a QDP-layout gauge field, optionally rescaling long links by the
// asqtad tadpole factor.
// NOTE(review): the listing omits original line 779 (the parameter list --
// per the symbol index: "(Float **gauge, int Vh, QudaGaugeParam *param,
// QudaDslashType dslash_type)") and the guard lines 788, 847 and 850
// (presumably the QUDA_ASQTAD_DSLASH rescale guard, the QUDA_ANTI_PERIODIC_T
// boundary guard, and a last_node_in_t() guard) whose closing braces appear
// below -- confirm against the full source.
778 template <typename Float>
780 {
781  int X1h = param->X[0] / 2;
782  int X1 = param->X[0];
783  int X2 = param->X[1];
784  int X3 = param->X[2];
785  int X4 = param->X[3];
786 
787  // rescale long links by the appropriate coefficient
789  for (int d = 0; d < 4; d++) {
790  for (int i = 0; i < V * gauge_site_size; i++) {
791  gauge[d][i] /= (-24 * param->tadpole_coeff * param->tadpole_coeff);
792  }
793  }
794  }
795 
796  // apply the staggered phases
// Only the three spatial directions carry a site-dependent sign; the phase
// for direction d depends on the coordinates of lower-numbered directions
// (x-phase on t, y-phase on t+x, z-phase on t+x+y), applied separately to the
// even and odd checkerboards.
797  for (int d = 0; d < 3; d++) {
798 
799  // even
800  for (int i = 0; i < Vh; i++) {
801 
802  int index = fullLatticeIndex(i, 0);
803  int i4 = index / (X3 * X2 * X1);
804  int i3 = (index - i4 * (X3 * X2 * X1)) / (X2 * X1);
805  int i2 = (index - i4 * (X3 * X2 * X1) - i3 * (X2 * X1)) / X1;
806  int i1 = index - i4 * (X3 * X2 * X1) - i3 * (X2 * X1) - i2 * X1;
807  int sign = 1;
808 
809  if (d == 0) {
810  if (i4 % 2 == 1) { sign = -1; }
811  }
812 
813  if (d == 1) {
814  if ((i4 + i1) % 2 == 1) { sign = -1; }
815  }
816  if (d == 2) {
817  if ((i4 + i1 + i2) % 2 == 1) { sign = -1; }
818  }
819 
820  for (int j = 0; j < 18; j++) { gauge[d][i * gauge_site_size + j] *= sign; }
821  }
822  // odd
823  for (int i = 0; i < Vh; i++) {
824  int index = fullLatticeIndex(i, 1);
825  int i4 = index / (X3 * X2 * X1);
826  int i3 = (index - i4 * (X3 * X2 * X1)) / (X2 * X1);
827  int i2 = (index - i4 * (X3 * X2 * X1) - i3 * (X2 * X1)) / X1;
828  int i1 = index - i4 * (X3 * X2 * X1) - i3 * (X2 * X1) - i2 * X1;
829  int sign = 1;
830 
831  if (d == 0) {
832  if (i4 % 2 == 1) { sign = -1; }
833  }
834 
835  if (d == 1) {
836  if ((i4 + i1) % 2 == 1) { sign = -1; }
837  }
838  if (d == 2) {
839  if ((i4 + i1 + i2) % 2 == 1) { sign = -1; }
840  }
841 
842  for (int j = 0; j < 18; j++) { gauge[d][(Vh + i) * gauge_site_size + j] *= sign; }
843  }
844  }
845 
846  // Apply boundary conditions to temporal links
848  for (int j = 0; j < Vh; j++) {
849  int sign = 1;
851  if (j >= (X4 - 3) * X1h * X2 * X3) { sign = -1; }
852  } else {
853  if (j >= (X4 - 1) * X1h * X2 * X3) { sign = -1; }
854  }
855 
856  for (int i = 0; i < 18; i++) {
857  gauge[3][j * gauge_site_size + i] *= sign;
858  gauge[3][(Vh + j) * gauge_site_size + i] *= sign;
859  }
860  }
861  }
862 }
863 
// Precision dispatcher for the templated applyGaugeFieldScaling_long above,
// selecting on the global local_prec.
// NOTE(review): the listing omits original lines 863-865 (the signature --
// per the symbol index: applyGaugeFieldScaling_long(void **gauge, int Vh,
// QudaGaugeParam *param, QudaDslashType dslash_type, QudaPrecision
// local_prec) or similar) and line 867 (presumably
// "if (local_prec == QUDA_DOUBLE_PRECISION) {") -- confirm against the full
// source.
866 {
868  applyGaugeFieldScaling_long((double **)gauge, Vh, param, dslash_type);
869  } else if (local_prec == QUDA_SINGLE_PRECISION) {
870  applyGaugeFieldScaling_long((float **)gauge, Vh, param, dslash_type);
871  } else {
872  errorQuda("Invalid type %d for applyGaugeFieldScaling_long\n", local_prec);
873  }
874 }
QudaFieldLocation location
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
static GaugeField * Create(const GaugeFieldParam &param)
Create the gauge field, with meta data specified in the parameter struct.
virtual void * Gauge_p()
Definition: gauge_field.h:358
QudaReconstructType link_recon_sloppy
QudaReconstructType link_recon
QudaReconstructType link_recon_precondition
char latfile[256]
double mu
double eps_naik
QudaDslashType dslash_type
bool compute_fatlong
bool unit_gauge
QudaPrecision prec
int n_naiks
int Vh
Definition: host_utils.cpp:38
int Z[4]
Definition: host_utils.cpp:36
int V
Definition: host_utils.cpp:37
void * memset(void *s, int c, size_t n)
QudaGaugeParam gauge_param
Definition: covdev_test.cpp:26
QudaInvertParam inv_param
Definition: covdev_test.cpp:27
enum QudaPrecision_s QudaPrecision
@ QUDA_STAGGERED_PHASE_NO
Definition: enum_quda.h:515
@ QUDA_STAGGERED_DSLASH
Definition: enum_quda.h:97
@ QUDA_ASQTAD_DSLASH
Definition: enum_quda.h:98
@ QUDA_LAPLACE_DSLASH
Definition: enum_quda.h:101
@ QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:325
@ QUDA_FULL_SITE_SUBSET
Definition: enum_quda.h:333
@ QUDA_PARITY_SITE_SUBSET
Definition: enum_quda.h:332
@ QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:70
@ QUDA_ANTI_PERIODIC_T
Definition: enum_quda.h:56
enum QudaDslashType_s QudaDslashType
@ QUDA_GHOST_EXCHANGE_NO
Definition: enum_quda.h:508
@ QUDA_GHOST_EXCHANGE_PAD
Definition: enum_quda.h:509
@ QUDA_EVEN_ODD_SITE_ORDER
Definition: enum_quda.h:340
@ QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:65
@ QUDA_SINGLE_PRECISION
Definition: enum_quda.h:64
@ QUDA_QDP_GAUGE_ORDER
Definition: enum_quda.h:44
@ QUDA_MILC_GAUGE_ORDER
Definition: enum_quda.h:47
@ QUDA_SPACE_SPIN_COLOR_FIELD_ORDER
Definition: enum_quda.h:351
@ QUDA_ZERO_FIELD_CREATE
Definition: enum_quda.h:361
@ QUDA_REFERENCE_FIELD_CREATE
Definition: enum_quda.h:363
@ QUDA_SU3_LINKS
Definition: enum_quda.h:24
@ QUDA_ASQTAD_LONG_LINKS
Definition: enum_quda.h:32
@ QUDA_GENERAL_LINKS
Definition: enum_quda.h:25
@ QUDA_ASQTAD_FAT_LINKS
Definition: enum_quda.h:31
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)
Definition: face_gauge.cpp:644
#define gauge_site_size
Definition: face_gauge.cpp:34
void exchange_cpu_sitelink(int *X, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision gPrecision, QudaGaugeParam *param, int optflag)
Definition: face_gauge.cpp:512
int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
Definition: host_utils.cpp:490
int E[4]
Definition: host_utils.cpp:45
int Vh_ex
Definition: host_utils.cpp:46
GaugeFieldParam gParam
void cpu_axy(QudaPrecision prec, double a, void *x, void *y, int size)
Definition: host_blas.cpp:74
void cpu_xpy(QudaPrecision prec, void *x, void *y, int size)
Definition: host_blas.cpp:87
int Vs_y
Definition: host_utils.cpp:39
size_t host_gauge_data_type_size
Definition: host_utils.cpp:65
int E1h
Definition: host_utils.cpp:44
void constructUnitGaugeField(Float **res, QudaGaugeParam *param)
int Vs_x
Definition: host_utils.cpp:39
int Vs_t
Definition: host_utils.cpp:39
int Vs_z
Definition: host_utils.cpp:39
int E2
Definition: host_utils.cpp:44
int E3
Definition: host_utils.cpp:44
void constructRandomGaugeField(Float **res, QudaGaugeParam *param, QudaDslashType dslash_type)
int V_ex
Definition: host_utils.cpp:46
int fullLatticeIndex(int dim[4], int index, int oddBit)
Definition: host_utils.cpp:591
bool last_node_in_t()
Definition: host_utils.cpp:378
void constructQudaGaugeField(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
Definition: host_utils.cpp:146
QudaPrecision local_prec
Definition: host_utils.cpp:56
bool isPCSolution(QudaSolutionType solution_type)
Definition: host_utils.h:112
void llfat_reference(void **fatlink, void **sitelink, QudaPrecision prec, void *act_path_coeff)
void llfat_mult_su3_nn(su3_matrix *a, su3_matrix *b, su3_matrix *c)
Definition: llfat_utils.h:41
void llfat_scalar_mult_su3_matrix(su3_matrix *a, Real s, su3_matrix *b)
Definition: llfat_utils.h:14
void llfat_reference_mg(void **fatlink, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision prec, void *act_path_coeff)
#define safe_malloc(size)
Definition: malloc_quda.h:106
#define pinned_malloc(size)
Definition: malloc_quda.h:107
#define host_free(ptr)
Definition: malloc_quda.h:115
__host__ __device__ complex< ValueType > polar(const ValueType &m, const ValueType &theta=0)
Returns the complex with magnitude m and angle theta in radians.
void unitarizeLinksCPU(GaugeField &outfield, const GaugeField &infield)
FloatingPoint< float > Float
QudaGaugeParam param
Definition: pack_test.cpp:18
void read_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, const int *X, int argc, char *argv[])
Definition: qio_field.h:12
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
bool gauge_loaded
void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gauge_data_type_size, int n_naiks, double eps_naik)
void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gauge_data_type_size, int n_naiks, double eps_naik)
void applyStaggeredScaling(Float **res, QudaGaugeParam *param, int type)
void constructStaggeredHostGhostGaugeField(quda::GaugeField *cpuFat, quda::GaugeField *cpuLong, void *milc_fatlink, void *milc_longlink, QudaGaugeParam &gauge_param)
void constructFatLongGaugeField(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *param, QudaDslashType dslash_type)
void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param, int argc, char **argv, bool &gauge_loaded)
void computeLongLinkCPU(void **longlink, su3_matrix **sitelink, Float *act_path_coeff)
void reorderQDPtoMILC(Out *milc_out, In **qdp_in, int V, int siteSize)
void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink, void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik)
void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type)
#define XUP
void constructStaggeredTestSpinorParam(quda::ColorSpinorParam *cs_param, const QudaInvertParam *inv_param, const QudaGaugeParam *gauge_param)
#define TUP
void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink, QudaGaugeParam &gauge_param, int argc, char **argv)
void loadFatLongGaugeQuda(void *milc_fatlink, void *milc_longlink, QudaGaugeParam &gauge_param)
#define MAX(a, b)
void reorderMILCtoQDP(Out **qdp_out, In *milc_in, int V, int siteSize)
double tadpole_coeff
Definition: quda.h:38
QudaReconstructType reconstruct_precondition
Definition: quda.h:58
QudaReconstructType reconstruct
Definition: quda.h:49
int ga_pad
Definition: quda.h:65
QudaLinkType type
Definition: quda.h:41
QudaFieldLocation location
Definition: quda.h:33
QudaReconstructType reconstruct_sloppy
Definition: quda.h:52
QudaStaggeredPhase staggered_phase_type
Definition: quda.h:73
int X[4]
Definition: quda.h:35
QudaPrecision cpu_prec
Definition: quda.h:46
QudaReconstructType reconstruct_refinement_sloppy
Definition: quda.h:55
QudaTboundary t_boundary
Definition: quda.h:44
QudaSolutionType solution_type
Definition: quda.h:228
QudaPrecision cpu_prec
Definition: quda.h:237
QudaGammaBasis gamma_basis
Definition: quda.h:246
QudaGaugeFieldOrder order
Definition: gauge_field.h:51
QudaLinkType link_type
Definition: gauge_field.h:53
QudaFieldCreate create
Definition: gauge_field.h:60
QudaFieldLocation location
Definition: gauge_field.h:46
QudaGhostExchange ghostExchange
Definition: lattice_field.h:77
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:68
QudaSiteSubset siteSubset
Definition: lattice_field.h:72
#define errorQuda(...)
Definition: util_quda.h:120