QUDA  v1.1.0
A library for QCD on GPUs
host_utils.cpp
Go to the documentation of this file.
1 #include <limits>
2 #include <complex>
3 #include <stdlib.h>
4 #include <stdio.h>
5 #include <string.h>
6 #include <short.h>
7 
8 #include <comm_quda.h>
9 
10 // This contains the appropriate ifdef guards already
11 #include <mpi_comm_handle.h>
12 
13 // QUDA headers
14 #include <color_spinor_field.h>
15 #include <unitarization_links.h>
16 #include <dslash_quda.h>
17 
18 // External headers
19 #include <llfat_utils.h>
20 #include <staggered_gauge_utils.h>
21 #include <host_utils.h>
22 #include <command_line_params.h>
23 
24 #include <misc.h>
25 #include <qio_field.h>
26 
27 #define MAX(a, b) ((a) > (b) ? (a) : (b))
28 
29 using namespace std;
30 
31 #define XUP 0
32 #define YUP 1
33 #define ZUP 2
34 #define TUP 3
35 
36 int Z[4];
37 int V;
38 int Vh;
39 int Vs_x, Vs_y, Vs_z, Vs_t;
41 int faceVolume[4];
42 
43 // extended volume, +4
44 int E1, E1h, E2, E3, E4;
45 int E[4];
46 int V_ex, Vh_ex;
47 
48 int Ls;
49 int V5;
50 int V5h;
51 double kappa5;
52 
53 extern float fat_link_max;
54 
55 // Set some local QUDA precision variables
64 
65 size_t host_gauge_data_type_size = (cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
66 size_t host_spinor_data_type_size = (cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
67 size_t host_clover_data_type_size = (cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
68 
70 {
79 }
80 
82 {
83  for (int i = 0; i < QUDA_MAX_MG_LEVEL; i++) {
86  }
87 }
88 
90 {
91  // We give here some default values
92  for (int i = 0; i < QUDA_MAX_MG_LEVEL; i++) {
95  num_setup_iter[i] = 1;
96  setup_tol[i] = 5e-6;
97  setup_maxiter[i] = 500;
98  setup_maxiter_refresh[i] = 20;
99  mu_factor[i] = 1.;
103  mg_schwarz_cycle[i] = 1;
105  smoother_tol[i] = 0.25;
107  coarse_solver_tol[i] = 0.25;
108  coarse_solver_maxiter[i] = 100;
109  nu_pre[i] = 2;
110  nu_post[i] = 2;
111  n_block_ortho[i] = 1;
112 
113  // Default eigensolver params
114  mg_eig[i] = false;
115  mg_eig_tol[i] = 1e-3;
116  mg_eig_n_ev[i] = nvec[i];
117  mg_eig_n_kr[i] = 3 * nvec[i];
121  mg_eig_check_interval[i] = 5;
122  mg_eig_max_restarts[i] = 100;
126  mg_eig_poly_deg[i] = 100;
127  mg_eig_amin[i] = 1.0;
128  mg_eig_amax[i] = -1.0; // use power iterations
130 
132  setup_ca_basis_size[i] = 4;
133  setup_ca_lambda_min[i] = 0.0;
134  setup_ca_lambda_max[i] = -1.0; // use power iterations
135 
139  coarse_solver_ca_lambda_max[i] = -1.0;
140 
141  strcpy(mg_vec_infile[i], "");
142  strcpy(mg_vec_outfile[i], "");
143  }
144 }
145 
146 void constructQudaGaugeField(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
147 {
148  if (type == 0) {
149  if (precision == QUDA_DOUBLE_PRECISION)
150  constructUnitGaugeField((double **)gauge, param);
151  else
152  constructUnitGaugeField((float **)gauge, param);
153  } else if (type == 1) {
154  if (precision == QUDA_DOUBLE_PRECISION)
155  constructRandomGaugeField((double **)gauge, param);
156  else
157  constructRandomGaugeField((float **)gauge, param);
158  } else {
159  if (precision == QUDA_DOUBLE_PRECISION)
160  applyGaugeFieldScaling((double **)gauge, Vh, param);
161  else
162  applyGaugeFieldScaling((float **)gauge, Vh, param);
163  }
164 }
165 
166 void constructHostGaugeField(void **gauge, QudaGaugeParam &gauge_param, int argc, char **argv)
167 {
168 
169  // 0 = unit gauge
170  // 1 = random SU(3)
171  // 2 = supplied field
172  int construct_type = 0;
173  if (strcmp(latfile, "")) {
174  // load in the command line supplied gauge field using QIO and LIME
176  construct_type = 2;
177  } else {
178  if (unit_gauge)
179  construct_type = 0;
180  else
181  construct_type = 1;
182  }
183  constructQudaGaugeField(gauge, construct_type, gauge_param.cpu_prec, &gauge_param);
184 }
185 
186 void constructHostCloverField(void *clover, void *clover_inv, QudaInvertParam &inv_param)
187 {
188  double norm = 0.01; // clover components are random numbers in the range (-norm, norm)
189  double diag = 1.0; // constant added to the diagonal
190 
192 
197 }
198 
199 void constructQudaCloverField(void *clover, double norm, double diag, QudaPrecision precision)
200 {
201  if (precision == QUDA_DOUBLE_PRECISION)
202  constructCloverField((double *)clover, norm, diag);
203  else
204  constructCloverField((float *)clover, norm, diag);
205 }
206 
209 {
210  // Lattice vector spacetime/colour/spin/parity properties
211  cs_param->nColor = 3;
212  cs_param->nSpin = 4;
215  cs_param->nDim = 5;
216  cs_param->x[4] = inv_param->Ls;
217  } else {
218  cs_param->nDim = 4;
219  }
220  for (int d = 0; d < 4; d++) cs_param->x[d] = gauge_param->X[d];
222  if (pc) cs_param->x[0] /= 2;
224 
225  // Lattice vector data properties
226  cs_param->setPrecision(inv_param->cpu_prec);
227  cs_param->pad = 0;
230  cs_param->gammaBasis = inv_param->gamma_basis;
231  cs_param->create = QUDA_ZERO_FIELD_CREATE;
232  cs_param->location = QUDA_CPU_FIELD_LOCATION;
233 }
234 
235 void constructRandomSpinorSource(void *v, int nSpin, int nColor, QudaPrecision precision, QudaSolutionType sol_type,
236  const int *const x, quda::RNG &rng)
237 {
239  param.v = v;
240  param.nColor = nColor;
241  param.nSpin = nSpin;
242  param.setPrecision(precision);
245  param.nDim = 4;
247  param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
248  param.location = QUDA_CPU_FIELD_LOCATION; // DMH FIXME so one can construct device noise
249  for (int d = 0; d < 4; d++) param.x[d] = x[d];
250  if (isPCSolution(sol_type)) param.x[0] /= 2;
251  quda::cpuColorSpinorField spinor_in(param);
252  quda::spinorNoise(spinor_in, rng, QUDA_NOISE_UNIFORM);
253 }
254 
255 void initComms(int argc, char **argv, std::array<int, 4> &commDims) { initComms(argc, argv, commDims.data()); }
256 
257 void initComms(int argc, char **argv, int *const commDims)
258 {
259  if (getenv("QUDA_TEST_GRID_SIZE")) { get_size_from_env(commDims, "QUDA_TEST_GRID_SIZE"); }
260  if (getenv("QUDA_TEST_GRID_PARTITION")) { get_size_from_env(grid_partition.data(), "QUDA_TEST_GRID_PARTITION"); }
261 
262 #if defined(QMP_COMMS)
263  QMP_thread_level_t tl;
264  QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
265 
266  // make sure the QMP logical ordering matches QUDA's
267  if (rank_order == 0) {
268  int map[] = {3, 2, 1, 0};
269  QMP_declare_logical_topology_map(commDims, 4, map, 4);
270  } else {
271  int map[] = {0, 1, 2, 3};
272  QMP_declare_logical_topology_map(commDims, 4, map, 4);
273  }
274 #elif defined(MPI_COMMS)
275  MPI_Init(&argc, &argv);
276 #endif
277 
279 
280  initCommsGridQuda(4, commDims, func, NULL);
281 
282  for (int d = 0; d < 4; d++) {
283  if (dim_partitioned[d]) { commDimPartitionedSet(d); }
284  }
285 
286  initRand();
287 
288  printfQuda("Rank order is %s major (%s running fastest)\n", rank_order == 0 ? "column" : "row",
289  rank_order == 0 ? "t" : "x");
290 }
291 
293 {
294  comm_finalize();
295 #if defined(QMP_COMMS)
296  QMP_finalize_msg_passing();
297 #elif defined(MPI_COMMS)
298  MPI_Finalize();
299 #endif
300 }
301 
302 void initRand()
303 {
304  int rank = 0;
305 
306 #if defined(QMP_COMMS)
307  rank = QMP_get_node_number();
308 #elif defined(MPI_COMMS)
309  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
310 #endif
311 
312  srand(17 * rank + 137);
313 }
314 
315 void setDims(int *X)
316 {
317  V = 1;
318  for (int d = 0; d < 4; d++) {
319  V *= X[d];
320  Z[d] = X[d];
321 
322  faceVolume[d] = 1;
323  for (int i = 0; i < 4; i++) {
324  if (i == d) continue;
325  faceVolume[d] *= X[i];
326  }
327  }
328  Vh = V / 2;
329 
330  Vs_x = X[1] * X[2] * X[3];
331  Vs_y = X[0] * X[2] * X[3];
332  Vs_z = X[0] * X[1] * X[3];
333  Vs_t = X[0] * X[1] * X[2];
334 
335  Vsh_x = Vs_x / 2;
336  Vsh_y = Vs_y / 2;
337  Vsh_z = Vs_z / 2;
338  Vsh_t = Vs_t / 2;
339 
340  E1 = X[0] + 4;
341  E2 = X[1] + 4;
342  E3 = X[2] + 4;
343  E4 = X[3] + 4;
344  E1h = E1 / 2;
345  E[0] = E1;
346  E[1] = E2;
347  E[2] = E3;
348  E[3] = E4;
349  V_ex = E1 * E2 * E3 * E4;
350  Vh_ex = V_ex / 2;
351 }
352 
353 void dw_setDims(int *X, const int L5)
354 {
355  V = 1;
356  for (int d = 0; d < 4; d++) {
357  V *= X[d];
358  Z[d] = X[d];
359 
360  faceVolume[d] = 1;
361  for (int i = 0; i < 4; i++) {
362  if (i == d) continue;
363  faceVolume[d] *= X[i];
364  }
365  }
366  Vh = V / 2;
367 
368  Ls = L5;
369  V5 = V * Ls;
370  V5h = Vh * Ls;
371 
372  Vs_t = Z[0] * Z[1] * Z[2] * Ls; //?
373  Vsh_t = Vs_t / 2; //?
374 }
375 
377 
379 {
380  // only apply T-boundary at edge nodes
381 #ifdef MULTI_GPU
382  return commCoords(3) == commDim(3) - 1;
383 #else
384  return true;
385 #endif
386 }
387 
388 int index_4d_cb_from_coordinate_4d(const int coordinate[4], const int dim[4])
389 {
390  return (((coordinate[3] * dim[2] + coordinate[2]) * dim[1] + coordinate[1]) * dim[0] + coordinate[0]) >> 1;
391 }
392 
393 void coordinate_from_shrinked_index(int coordinate[4], int shrinked_index, const int shrinked_dim[4],
394  const int shift[4], int parity)
395 {
396  int aux[4];
397  aux[0] = shrinked_index * 2;
398 
399  for (int i = 0; i < 3; i++) { aux[i + 1] = aux[i] / shrinked_dim[i]; }
400 
401  coordinate[0] = aux[0] - aux[1] * shrinked_dim[0];
402  coordinate[1] = aux[1] - aux[2] * shrinked_dim[1];
403  coordinate[2] = aux[2] - aux[3] * shrinked_dim[2];
404  coordinate[3] = aux[3];
405 
406  // Find the full coordinate in the shrinked volume.
407  coordinate[0] += (parity + coordinate[3] + coordinate[2] + coordinate[1]) & 1;
408 
409  // if(shrinked_index == 3691) printfQuda("coordinate[0] = %d\n", coordinate[0]);
410  for (int d = 0; d < 4; d++) { coordinate[d] += shift[d]; }
411 }
412 
413 // i represents a "half index" into an even or odd "half lattice".
414 // when oddBit={0,1} the half lattice is {even,odd}.
415 //
416 // the displacements, such as dx, refer to the full lattice coordinates.
417 //
418 // neighborIndex() takes a "half index", displaces it, and returns the
419 // new "half index", which can be an index into either the even or odd lattices.
420 // displacements of magnitude one always interchange odd and even lattices.
421 //
422 
423 int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
424 {
425  int Y = fullLatticeIndex(i, oddBit);
426  int x4 = Y / (Z[2] * Z[1] * Z[0]);
427  int x3 = (Y / (Z[1] * Z[0])) % Z[2];
428  int x2 = (Y / Z[0]) % Z[1];
429  int x1 = Y % Z[0];
430 
431  // assert (oddBit == (x+y+z+t)%2);
432 
433  x4 = (x4 + dx4 + Z[3]) % Z[3];
434  x3 = (x3 + dx3 + Z[2]) % Z[2];
435  x2 = (x2 + dx2 + Z[1]) % Z[1];
436  x1 = (x1 + dx1 + Z[0]) % Z[0];
437 
438  return (x4 * (Z[2] * Z[1] * Z[0]) + x3 * (Z[1] * Z[0]) + x2 * (Z[0]) + x1) / 2;
439 }
440 
441 int neighborIndex(int dim[4], int index, int oddBit, int dx[4])
442 {
443 
444  const int fullIndex = fullLatticeIndex(dim, index, oddBit);
445 
446  int x[4];
447  x[3] = fullIndex / (dim[2] * dim[1] * dim[0]);
448  x[2] = (fullIndex / (dim[1] * dim[0])) % dim[2];
449  x[1] = (fullIndex / dim[0]) % dim[1];
450  x[0] = fullIndex % dim[0];
451 
452  for (int dir = 0; dir < 4; ++dir) x[dir] = (x[dir] + dx[dir] + dim[dir]) % dim[dir];
453 
454  return (((x[3] * dim[2] + x[2]) * dim[1] + x[1]) * dim[0] + x[0]) / 2;
455 }
456 
457 int neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
458 {
459  int ret;
460 
461  int Y = fullLatticeIndex(i, oddBit);
462  int x4 = Y / (Z[2] * Z[1] * Z[0]);
463  int x3 = (Y / (Z[1] * Z[0])) % Z[2];
464  int x2 = (Y / Z[0]) % Z[1];
465  int x1 = Y % Z[0];
466 
467  int ghost_x4 = x4 + dx4;
468 
469  // assert (oddBit == (x+y+z+t)%2);
470 
471  x4 = (x4 + dx4 + Z[3]) % Z[3];
472  x3 = (x3 + dx3 + Z[2]) % Z[2];
473  x2 = (x2 + dx2 + Z[1]) % Z[1];
474  x1 = (x1 + dx1 + Z[0]) % Z[0];
475 
476  if ((ghost_x4 >= 0 && ghost_x4 < Z[3]) || !comm_dim_partitioned(3)) {
477  ret = (x4 * (Z[2] * Z[1] * Z[0]) + x3 * (Z[1] * Z[0]) + x2 * (Z[0]) + x1) / 2;
478  } else {
479  ret = (x3 * (Z[1] * Z[0]) + x2 * (Z[0]) + x1) / 2;
480  }
481 
482  return ret;
483 }
484 
485 /*
486  * This is a computation of neighbor using the full index and the displacement in each direction
487  *
488  */
489 
490 int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
491 {
492  int oddBit = 0;
493  int half_idx = i;
494  if (i >= Vh) {
495  oddBit = 1;
496  half_idx = i - Vh;
497  }
498 
499  int nbr_half_idx = neighborIndex(half_idx, oddBit, dx4, dx3, dx2, dx1);
500  int oddBitChanged = (dx4 + dx3 + dx2 + dx1) % 2;
501  if (oddBitChanged) { oddBit = 1 - oddBit; }
502  int ret = nbr_half_idx;
503  if (oddBit) { ret = Vh + nbr_half_idx; }
504 
505  return ret;
506 }
507 
508 int neighborIndexFullLattice(int dim[4], int index, int dx[4])
509 {
510  const int volume = dim[0] * dim[1] * dim[2] * dim[3];
511  const int halfVolume = volume / 2;
512  int oddBit = 0;
513  int halfIndex = index;
514 
515  if (index >= halfVolume) {
516  oddBit = 1;
517  halfIndex = index - halfVolume;
518  }
519 
520  int neighborHalfIndex = neighborIndex(dim, halfIndex, oddBit, dx);
521 
522  int oddBitChanged = (dx[0] + dx[1] + dx[2] + dx[3]) % 2;
523  if (oddBitChanged) { oddBit = 1 - oddBit; }
524 
525  return neighborHalfIndex + oddBit * halfVolume;
526 }
527 
528 int neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1)
529 {
530  int ret;
531  int oddBit = 0;
532  int half_idx = i;
533  if (i >= Vh) {
534  oddBit = 1;
535  half_idx = i - Vh;
536  }
537 
538  int Y = fullLatticeIndex(half_idx, oddBit);
539  int x4 = Y / (Z[2] * Z[1] * Z[0]);
540  int x3 = (Y / (Z[1] * Z[0])) % Z[2];
541  int x2 = (Y / Z[0]) % Z[1];
542  int x1 = Y % Z[0];
543  int ghost_x4 = x4 + dx4;
544 
545  x4 = (x4 + dx4 + Z[3]) % Z[3];
546  x3 = (x3 + dx3 + Z[2]) % Z[2];
547  x2 = (x2 + dx2 + Z[1]) % Z[1];
548  x1 = (x1 + dx1 + Z[0]) % Z[0];
549 
550  if (ghost_x4 >= 0 && ghost_x4 < Z[3]) {
551  ret = (x4 * (Z[2] * Z[1] * Z[0]) + x3 * (Z[1] * Z[0]) + x2 * (Z[0]) + x1) / 2;
552  } else {
553  ret = (x3 * (Z[1] * Z[0]) + x2 * (Z[0]) + x1) / 2;
554  return ret;
555  }
556 
557  int oddBitChanged = (dx4 + dx3 + dx2 + dx1) % 2;
558  if (oddBitChanged) { oddBit = 1 - oddBit; }
559 
560  if (oddBit) { ret += Vh; }
561 
562  return ret;
563 }
564 
565 // X indexes the lattice site
566 void printSpinorElement(void *spinor, int X, QudaPrecision precision)
567 {
568  if (precision == QUDA_DOUBLE_PRECISION)
569  for (int s = 0; s < 4; s++) printVector((double *)spinor + X * 24 + s * 6);
570  else
571  for (int s = 0; s < 4; s++) printVector((float *)spinor + X * 24 + s * 6);
572 }
573 
574 // X indexes the full lattice
575 void printGaugeElement(void *gauge, int X, QudaPrecision precision)
576 {
577  if (getOddBit(X) == 0) {
578  if (precision == QUDA_DOUBLE_PRECISION)
579  for (int m = 0; m < 3; m++) printVector((double *)gauge + (X / 2) * gauge_site_size + m * 3 * 2);
580  else
581  for (int m = 0; m < 3; m++) printVector((float *)gauge + (X / 2) * gauge_site_size + m * 3 * 2);
582 
583  } else {
584  if (precision == QUDA_DOUBLE_PRECISION)
585  for (int m = 0; m < 3; m++) printVector((double *)gauge + (X / 2 + Vh) * gauge_site_size + m * 3 * 2);
586  else
587  for (int m = 0; m < 3; m++) printVector((float *)gauge + (X / 2 + Vh) * gauge_site_size + m * 3 * 2);
588  }
589 }
590 
591 int fullLatticeIndex(int dim[4], int index, int oddBit)
592 {
593 
594  int za = index / (dim[0] >> 1);
595  int zb = za / dim[1];
596  int x2 = za - zb * dim[1];
597  int x4 = zb / dim[2];
598  int x3 = zb - x4 * dim[2];
599 
600  return 2 * index + ((x2 + x3 + x4 + oddBit) & 1);
601 }
602 
603 // given a "half index" i into either an even or odd half lattice (corresponding
604 // to oddBit = {0, 1}), returns the corresponding full lattice index.
605 int fullLatticeIndex(int i, int oddBit)
606 {
607  /*
608  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
609  return 2*i + (boundaryCrossings + oddBit) % 2;
610  */
611 
612  int X1 = Z[0];
613  int X2 = Z[1];
614  int X3 = Z[2];
615  // int X4 = Z[3];
616  int X1h = X1 / 2;
617 
618  int sid = i;
619  int za = sid / X1h;
620  // int x1h = sid - za*X1h;
621  int zb = za / X2;
622  int x2 = za - zb * X2;
623  int x4 = zb / X3;
624  int x3 = zb - x4 * X3;
625  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
626  // int x1 = 2*x1h + x1odd;
627  int X = 2 * sid + x1odd;
628 
629  return X;
630 }
631 
632 extern "C" {
638 const char *__asan_default_options() { return "protect_shadow_gap=0"; }
639 }
640 
645 void get_size_from_env(int *const dims, const char env[])
646 {
647  char *grid_size_env = getenv(env);
648  if (grid_size_env) {
649  std::stringstream grid_list(grid_size_env);
650 
651  int dim;
652  int i = 0;
653  while (grid_list >> dim) {
654  if (i >= 4) errorQuda("Unexpected grid size array length");
655  dims[i] = dim;
656  if (grid_list.peek() == ',') grid_list.ignore();
657  i++;
658  }
659  }
660 }
661 
662 int lex_rank_from_coords_t(const int *coords, void *fdata)
663 {
664  int rank = coords[0];
665  for (int i = 1; i < 4; i++) { rank = gridsize_from_cmdline[i] * rank + coords[i]; }
666  return rank;
667 }
668 
669 int lex_rank_from_coords_x(const int *coords, void *fdata)
670 {
671  int rank = coords[3];
672  for (int i = 2; i >= 0; i--) { rank = gridsize_from_cmdline[i] * rank + coords[i]; }
673  return rank;
674 }
675 
676 template <typename Float> void printVector(Float *v)
677 {
678  printfQuda("{(%f %f) (%f %f) (%f %f)}\n", v[0], v[1], v[2], v[3], v[4], v[5]);
679 }
680 
681 // returns 0 or 1 if the full lattice index X is even or odd
682 int getOddBit(int Y)
683 {
684  int x4 = Y / (Z[2] * Z[1] * Z[0]);
685  int x3 = (Y / (Z[1] * Z[0])) % Z[2];
686  int x2 = (Y / Z[0]) % Z[1];
687  int x1 = Y % Z[0];
688  return (x4 + x3 + x2 + x1) % 2;
689 }
690 
691 // a+=b
692 template <typename Float> inline void complexAddTo(Float *a, Float *b)
693 {
694  a[0] += b[0];
695  a[1] += b[1];
696 }
697 
698 // a = b*c
699 template <typename Float> inline void complexProduct(Float *a, Float *b, Float *c)
700 {
701  a[0] = b[0] * c[0] - b[1] * c[1];
702  a[1] = b[0] * c[1] + b[1] * c[0];
703 }
704 
705 // a = conj(b)*conj(c)
706 template <typename Float> inline void complexConjugateProduct(Float *a, Float *b, Float *c)
707 {
708  a[0] = b[0] * c[0] - b[1] * c[1];
709  a[1] = -b[0] * c[1] - b[1] * c[0];
710 }
711 
712 // a = conj(b)*c
713 template <typename Float> inline void complexDotProduct(Float *a, Float *b, Float *c)
714 {
715  a[0] = b[0] * c[0] + b[1] * c[1];
716  a[1] = b[0] * c[1] - b[1] * c[0];
717 }
718 
719 // a += b*c
720 template <typename Float> inline void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign)
721 {
722  a[0] += sign * (b[0] * c[0] - b[1] * c[1]);
723  a[1] += sign * (b[0] * c[1] + b[1] * c[0]);
724 }
725 
726 // a += conj(b)*c)
727 template <typename Float> inline void accumulateComplexDotProduct(Float *a, Float *b, Float *c)
728 {
729  a[0] += b[0] * c[0] + b[1] * c[1];
730  a[1] += b[0] * c[1] - b[1] * c[0];
731 }
732 
733 template <typename Float> inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign)
734 {
735  a[0] += sign * (b[0] * c[0] - b[1] * c[1]);
736  a[1] -= sign * (b[0] * c[1] + b[1] * c[0]);
737 }
738 
739 template <typename Float> inline void su3Construct12(Float *mat)
740 {
741  Float *w = mat + 12;
742  w[0] = 0.0;
743  w[1] = 0.0;
744  w[2] = 0.0;
745  w[3] = 0.0;
746  w[4] = 0.0;
747  w[5] = 0.0;
748 }
749 
750 // Stabilized Bunk and Sommer
751 template <typename Float> inline void su3Construct8(Float *mat)
752 {
753  mat[0] = atan2(mat[1], mat[0]);
754  mat[1] = atan2(mat[13], mat[12]);
755  for (int i = 8; i < 18; i++) mat[i] = 0.0;
756 }
757 
758 void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision)
759 {
760  if (reconstruct == QUDA_RECONSTRUCT_12) {
761  if (precision == QUDA_DOUBLE_PRECISION)
762  su3Construct12((double *)mat);
763  else
764  su3Construct12((float *)mat);
765  } else {
766  if (precision == QUDA_DOUBLE_PRECISION)
767  su3Construct8((double *)mat);
768  else
769  su3Construct8((float *)mat);
770  }
771 }
772 
773 // given first two rows (u,v) of SU(3) matrix mat, reconstruct the third row
774 // as the cross product of the conjugate vectors: w = u* x v*
775 //
776 // 48 flops
777 template <typename Float> static void su3Reconstruct12(Float *mat, int dir, int ga_idx, QudaGaugeParam *param)
778 {
779  Float *u = &mat[0 * (3 * 2)];
780  Float *v = &mat[1 * (3 * 2)];
781  Float *w = &mat[2 * (3 * 2)];
782  w[0] = 0.0;
783  w[1] = 0.0;
784  w[2] = 0.0;
785  w[3] = 0.0;
786  w[4] = 0.0;
787  w[5] = 0.0;
788  accumulateConjugateProduct(w + 0 * (2), u + 1 * (2), v + 2 * (2), +1);
789  accumulateConjugateProduct(w + 0 * (2), u + 2 * (2), v + 1 * (2), -1);
790  accumulateConjugateProduct(w + 1 * (2), u + 2 * (2), v + 0 * (2), +1);
791  accumulateConjugateProduct(w + 1 * (2), u + 0 * (2), v + 2 * (2), -1);
792  accumulateConjugateProduct(w + 2 * (2), u + 0 * (2), v + 1 * (2), +1);
793  accumulateConjugateProduct(w + 2 * (2), u + 1 * (2), v + 0 * (2), -1);
794  Float u0 = (dir < 3 ? param->anisotropy : (ga_idx >= (Z[3] - 1) * Z[0] * Z[1] * Z[2] / 2 ? param->t_boundary : 1));
795  w[0] *= u0;
796  w[1] *= u0;
797  w[2] *= u0;
798  w[3] *= u0;
799  w[4] *= u0;
800  w[5] *= u0;
801 }
802 
803 template <typename Float> static void su3Reconstruct8(Float *mat, int dir, int ga_idx, QudaGaugeParam *param)
804 {
805  // First reconstruct first row
806  Float row_sum = 0.0;
807  row_sum += mat[2] * mat[2];
808  row_sum += mat[3] * mat[3];
809  row_sum += mat[4] * mat[4];
810  row_sum += mat[5] * mat[5];
811  Float u0 = (dir < 3 ? param->anisotropy : (ga_idx >= (Z[3] - 1) * Z[0] * Z[1] * Z[2] / 2 ? param->t_boundary : 1));
812  Float U00_mag = sqrt(1.f / (u0 * u0) - row_sum);
813 
814  mat[14] = mat[0];
815  mat[15] = mat[1];
816 
817  mat[0] = U00_mag * cos(mat[14]);
818  mat[1] = U00_mag * sin(mat[14]);
819 
820  Float column_sum = 0.0;
821  for (int i = 0; i < 2; i++) column_sum += mat[i] * mat[i];
822  for (int i = 6; i < 8; i++) column_sum += mat[i] * mat[i];
823  Float U20_mag = sqrt(1.f / (u0 * u0) - column_sum);
824 
825  mat[12] = U20_mag * cos(mat[15]);
826  mat[13] = U20_mag * sin(mat[15]);
827 
828  // First column now restored
829 
830  // finally reconstruct last elements from SU(2) rotation
831  Float r_inv2 = 1.0 / (u0 * row_sum);
832 
833  // U11
834  Float A[2];
835  complexDotProduct(A, mat + 0, mat + 6);
836  complexConjugateProduct(mat + 8, mat + 12, mat + 4);
837  accumulateComplexProduct(mat + 8, A, mat + 2, u0);
838  mat[8] *= -r_inv2;
839  mat[9] *= -r_inv2;
840 
841  // U12
842  complexConjugateProduct(mat + 10, mat + 12, mat + 2);
843  accumulateComplexProduct(mat + 10, A, mat + 4, -u0);
844  mat[10] *= r_inv2;
845  mat[11] *= r_inv2;
846 
847  // U21
848  complexDotProduct(A, mat + 0, mat + 12);
849  complexConjugateProduct(mat + 14, mat + 6, mat + 4);
850  accumulateComplexProduct(mat + 14, A, mat + 2, -u0);
851  mat[14] *= r_inv2;
852  mat[15] *= r_inv2;
853 
854  // U12
855  complexConjugateProduct(mat + 16, mat + 6, mat + 2);
856  accumulateComplexProduct(mat + 16, A, mat + 4, u0);
857  mat[16] *= -r_inv2;
858  mat[17] *= -r_inv2;
859 }
860 
861 void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision,
863 {
864  if (reconstruct == QUDA_RECONSTRUCT_12) {
865  if (precision == QUDA_DOUBLE_PRECISION)
866  su3Reconstruct12((double *)mat, dir, ga_idx, param);
867  else
868  su3Reconstruct12((float *)mat, dir, ga_idx, param);
869  } else {
870  if (precision == QUDA_DOUBLE_PRECISION)
871  su3Reconstruct8((double *)mat, dir, ga_idx, param);
872  else
873  su3Reconstruct8((float *)mat, dir, ga_idx, param);
874  }
875 }
876 
877 template <typename Float> static int compareFloats(Float *a, Float *b, int len, double epsilon)
878 {
879  for (int i = 0; i < len; i++) {
880  double diff = fabs(a[i] - b[i]);
881  if (diff > epsilon || std::isnan(diff)) {
882  printfQuda("ERROR: i=%d, a[%d]=%f, b[%d]=%f\n", i, i, a[i], i, b[i]);
883  return 0;
884  }
885  }
886  return 1;
887 }
888 
889 int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
890 {
891  if (precision == QUDA_DOUBLE_PRECISION)
892  return compareFloats((double *)a, (double *)b, len, epsilon);
893  else
894  return compareFloats((float *)a, (float *)b, len, epsilon);
895 }
896 
897 // 4d checkerboard.
898 // given a "half index" i into either an even or odd half lattice (corresponding
899 // to oddBit = {0, 1}), returns the corresponding full lattice index.
900 // Cf. GPGPU code in dslash_core_ante.h.
901 // There, i is the thread index.
902 int fullLatticeIndex_4d(int i, int oddBit)
903 {
904  if (i >= Vh || i < 0) {
905  printf("i out of range in fullLatticeIndex_4d");
906  exit(-1);
907  }
908  /*
909  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
910  return 2*i + (boundaryCrossings + oddBit) % 2;
911  */
912 
913  int X1 = Z[0];
914  int X2 = Z[1];
915  int X3 = Z[2];
916  // int X4 = Z[3];
917  int X1h = X1 / 2;
918 
919  int sid = i;
920  int za = sid / X1h;
921  // int x1h = sid - za*X1h;
922  int zb = za / X2;
923  int x2 = za - zb * X2;
924  int x4 = zb / X3;
925  int x3 = zb - x4 * X3;
926  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
927  // int x1 = 2*x1h + x1odd;
928  int X = 2 * sid + x1odd;
929 
930  return X;
931 }
932 
933 // 5d checkerboard.
934 // given a "half index" i into either an even or odd half lattice (corresponding
935 // to oddBit = {0, 1}), returns the corresponding full lattice index.
936 // Cf. GPGPU code in dslash_core_ante.h.
937 // There, i is the thread index sid.
938 // This function is used by neighborIndex_5d in dslash_reference.cpp.
939 // ok
940 int fullLatticeIndex_5d(int i, int oddBit)
941 {
942  int boundaryCrossings
943  = i / (Z[0] / 2) + i / (Z[1] * Z[0] / 2) + i / (Z[2] * Z[1] * Z[0] / 2) + i / (Z[3] * Z[2] * Z[1] * Z[0] / 2);
944  return 2 * i + (boundaryCrossings + oddBit) % 2;
945 }
946 
947 int fullLatticeIndex_5d_4dpc(int i, int oddBit)
948 {
949  int boundaryCrossings = i / (Z[0] / 2) + i / (Z[1] * Z[0] / 2) + i / (Z[2] * Z[1] * Z[0] / 2);
950  return 2 * i + (boundaryCrossings + oddBit) % 2;
951 }
952 
954 {
955  int oddBit = 0;
956  int half_idx = i;
957  if (i >= Vh) {
958  oddBit = 1;
959  half_idx = i - Vh;
960  }
961 
962  int Y = fullLatticeIndex(half_idx, oddBit);
963  int x4 = Y / (Z[2] * Z[1] * Z[0]);
964 
965  return x4;
966 }
967 
968 template <typename Float> void applyGaugeFieldScaling(Float **gauge, int Vh, QudaGaugeParam *param)
969 {
970  // Apply spatial scaling factor (u0) to spatial links
971  for (int d = 0; d < 3; d++) {
972  for (int i = 0; i < gauge_site_size * Vh * 2; i++) { gauge[d][i] /= param->anisotropy; }
973  }
974 
975  // Apply boundary conditions to temporal links
977  for (int j = (Z[0] / 2) * Z[1] * Z[2] * (Z[3] - 1); j < Vh; j++) {
978  for (int i = 0; i < gauge_site_size; i++) {
979  gauge[3][j * gauge_site_size + i] *= -1.0;
980  gauge[3][(Vh + j) * gauge_site_size + i] *= -1.0;
981  }
982  }
983  }
984 
985  if (param->gauge_fix) {
986  // set all gauge links (except for the last Z[0]*Z[1]*Z[2]/2) to the identity,
987  // to simulate fixing to the temporal gauge.
988  int iMax = (last_node_in_t() ? (Z[0] / 2) * Z[1] * Z[2] * (Z[3] - 1) : Vh);
989  int dir = 3; // time direction only
990  Float *even = gauge[dir];
991  Float *odd = gauge[dir] + Vh * gauge_site_size;
992  for (int i = 0; i < iMax; i++) {
993  for (int m = 0; m < 3; m++) {
994  for (int n = 0; n < 3; n++) {
995  even[i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = (m == n) ? 1 : 0;
996  even[i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = 0.0;
997  odd[i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = (m == n) ? 1 : 0;
998  odd[i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = 0.0;
999  }
1000  }
1001  }
1002  }
1003 }
1004 
1005 // static void constructUnitGaugeField(Float **res, QudaGaugeParam *param) {
1006 template <typename Float> void constructUnitGaugeField(Float **res, QudaGaugeParam *param)
1007 {
1008  Float *resOdd[4], *resEven[4];
1009  for (int dir = 0; dir < 4; dir++) {
1010  resEven[dir] = res[dir];
1011  resOdd[dir] = res[dir] + Vh * gauge_site_size;
1012  }
1013 
1014  for (int dir = 0; dir < 4; dir++) {
1015  for (int i = 0; i < Vh; i++) {
1016  for (int m = 0; m < 3; m++) {
1017  for (int n = 0; n < 3; n++) {
1018  resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = (m == n) ? 1 : 0;
1019  resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = 0.0;
1020  resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = (m == n) ? 1 : 0;
1021  resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = 0.0;
1022  }
1023  }
1024  }
1025  }
1026 
1028 }
1029 
1030 template void constructUnitGaugeField(float **res, QudaGaugeParam *param);
1031 template void constructUnitGaugeField(double **res, QudaGaugeParam *param);
1032 
1033 // normalize the vector a
1034 template <typename Float> static void normalize(complex<Float> *a, int len)
1035 {
1036  double sum = 0.0;
1037  for (int i = 0; i < len; i++) sum += norm(a[i]);
1038  for (int i = 0; i < len; i++) a[i] /= sqrt(sum);
1039 }
1040 
1041 // orthogonalize vector b to vector a
1042 template <typename Float> static void orthogonalize(complex<Float> *a, complex<Float> *b, int len)
1043 {
1044  complex<double> dot = 0.0;
1045  for (int i = 0; i < len; i++) dot += conj(a[i]) * b[i];
1046  for (int i = 0; i < len; i++) b[i] -= (complex<Float>)dot * a[i];
1047 }
1048 
1050 {
1051  Float *resOdd[4], *resEven[4];
1052  for (int dir = 0; dir < 4; dir++) {
1053  resEven[dir] = res[dir];
1054  resOdd[dir] = res[dir] + Vh * gauge_site_size;
1055  }
1056 
1057  for (int dir = 0; dir < 4; dir++) {
1058  for (int i = 0; i < Vh; i++) {
1059  for (int m = 1; m < 3; m++) { // last 2 rows
1060  for (int n = 0; n < 3; n++) { // 3 columns
1061  resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = rand() / (Float)RAND_MAX;
1062  resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = rand() / (Float)RAND_MAX;
1063  resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = rand() / (Float)RAND_MAX;
1064  resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = rand() / (Float)RAND_MAX;
1065  }
1066  }
1067  normalize((complex<Float> *)(resEven[dir] + (i * 3 + 1) * 3 * 2), 3);
1068  orthogonalize((complex<Float> *)(resEven[dir] + (i * 3 + 1) * 3 * 2),
1069  (complex<Float> *)(resEven[dir] + (i * 3 + 2) * 3 * 2), 3);
1070  normalize((complex<Float> *)(resEven[dir] + (i * 3 + 2) * 3 * 2), 3);
1071 
1072  normalize((complex<Float> *)(resOdd[dir] + (i * 3 + 1) * 3 * 2), 3);
1073  orthogonalize((complex<Float> *)(resOdd[dir] + (i * 3 + 1) * 3 * 2),
1074  (complex<Float> *)(resOdd[dir] + (i * 3 + 2) * 3 * 2), 3);
1075  normalize((complex<Float> *)(resOdd[dir] + (i * 3 + 2) * 3 * 2), 3);
1076 
1077  {
1078  Float *w = resEven[dir] + (i * 3 + 0) * 3 * 2;
1079  Float *u = resEven[dir] + (i * 3 + 1) * 3 * 2;
1080  Float *v = resEven[dir] + (i * 3 + 2) * 3 * 2;
1081 
1082  for (int n = 0; n < 6; n++) w[n] = 0.0;
1083  accumulateConjugateProduct(w + 0 * (2), u + 1 * (2), v + 2 * (2), +1);
1084  accumulateConjugateProduct(w + 0 * (2), u + 2 * (2), v + 1 * (2), -1);
1085  accumulateConjugateProduct(w + 1 * (2), u + 2 * (2), v + 0 * (2), +1);
1086  accumulateConjugateProduct(w + 1 * (2), u + 0 * (2), v + 2 * (2), -1);
1087  accumulateConjugateProduct(w + 2 * (2), u + 0 * (2), v + 1 * (2), +1);
1088  accumulateConjugateProduct(w + 2 * (2), u + 1 * (2), v + 0 * (2), -1);
1089  }
1090 
1091  {
1092  Float *w = resOdd[dir] + (i * 3 + 0) * 3 * 2;
1093  Float *u = resOdd[dir] + (i * 3 + 1) * 3 * 2;
1094  Float *v = resOdd[dir] + (i * 3 + 2) * 3 * 2;
1095 
1096  for (int n = 0; n < 6; n++) w[n] = 0.0;
1097  accumulateConjugateProduct(w + 0 * (2), u + 1 * (2), v + 2 * (2), +1);
1098  accumulateConjugateProduct(w + 0 * (2), u + 2 * (2), v + 1 * (2), -1);
1099  accumulateConjugateProduct(w + 1 * (2), u + 2 * (2), v + 0 * (2), +1);
1100  accumulateConjugateProduct(w + 1 * (2), u + 0 * (2), v + 2 * (2), -1);
1101  accumulateConjugateProduct(w + 2 * (2), u + 0 * (2), v + 1 * (2), +1);
1102  accumulateConjugateProduct(w + 2 * (2), u + 1 * (2), v + 0 * (2), -1);
1103  }
1104  }
1105  }
1106 
1107  if (param->type == QUDA_WILSON_LINKS) {
1109  } else if (param->type == QUDA_ASQTAD_LONG_LINKS) {
1111  } else if (param->type == QUDA_ASQTAD_FAT_LINKS) {
1112  for (int dir = 0; dir < 4; dir++) {
1113  for (int i = 0; i < Vh; i++) {
1114  for (int m = 0; m < 3; m++) { // last 2 rows
1115  for (int n = 0; n < 3; n++) { // 3 columns
1116  resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = 1.0 * rand() / (Float)RAND_MAX;
1117  resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = 2.0 * rand() / (Float)RAND_MAX;
1118  resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = 3.0 * rand() / (Float)RAND_MAX;
1119  resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = 4.0 * rand() / (Float)RAND_MAX;
1120  }
1121  }
1122  }
1123  }
1124  }
1125 }
1126 
1129 
1130 template <typename Float> void constructUnitaryGaugeField(Float **res)
1131 {
1132  Float *resOdd[4], *resEven[4];
1133  for (int dir = 0; dir < 4; dir++) {
1134  resEven[dir] = res[dir];
1135  resOdd[dir] = res[dir] + Vh * gauge_site_size;
1136  }
1137 
1138  for (int dir = 0; dir < 4; dir++) {
1139  for (int i = 0; i < Vh; i++) {
1140  for (int m = 1; m < 3; m++) { // last 2 rows
1141  for (int n = 0; n < 3; n++) { // 3 columns
1142  resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = rand() / (Float)RAND_MAX;
1143  resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = rand() / (Float)RAND_MAX;
1144  resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = rand() / (Float)RAND_MAX;
1145  resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = rand() / (Float)RAND_MAX;
1146  }
1147  }
1148  normalize((complex<Float> *)(resEven[dir] + (i * 3 + 1) * 3 * 2), 3);
1149  orthogonalize((complex<Float> *)(resEven[dir] + (i * 3 + 1) * 3 * 2),
1150  (complex<Float> *)(resEven[dir] + (i * 3 + 2) * 3 * 2), 3);
1151  normalize((complex<Float> *)(resEven[dir] + (i * 3 + 2) * 3 * 2), 3);
1152 
1153  normalize((complex<Float> *)(resOdd[dir] + (i * 3 + 1) * 3 * 2), 3);
1154  orthogonalize((complex<Float> *)(resOdd[dir] + (i * 3 + 1) * 3 * 2),
1155  (complex<Float> *)(resOdd[dir] + (i * 3 + 2) * 3 * 2), 3);
1156  normalize((complex<Float> *)(resOdd[dir] + (i * 3 + 2) * 3 * 2), 3);
1157 
1158  {
1159  Float *w = resEven[dir] + (i * 3 + 0) * 3 * 2;
1160  Float *u = resEven[dir] + (i * 3 + 1) * 3 * 2;
1161  Float *v = resEven[dir] + (i * 3 + 2) * 3 * 2;
1162 
1163  for (int n = 0; n < 6; n++) w[n] = 0.0;
1164  accumulateConjugateProduct(w + 0 * (2), u + 1 * (2), v + 2 * (2), +1);
1165  accumulateConjugateProduct(w + 0 * (2), u + 2 * (2), v + 1 * (2), -1);
1166  accumulateConjugateProduct(w + 1 * (2), u + 2 * (2), v + 0 * (2), +1);
1167  accumulateConjugateProduct(w + 1 * (2), u + 0 * (2), v + 2 * (2), -1);
1168  accumulateConjugateProduct(w + 2 * (2), u + 0 * (2), v + 1 * (2), +1);
1169  accumulateConjugateProduct(w + 2 * (2), u + 1 * (2), v + 0 * (2), -1);
1170  }
1171 
1172  {
1173  Float *w = resOdd[dir] + (i * 3 + 0) * 3 * 2;
1174  Float *u = resOdd[dir] + (i * 3 + 1) * 3 * 2;
1175  Float *v = resOdd[dir] + (i * 3 + 2) * 3 * 2;
1176 
1177  for (int n = 0; n < 6; n++) w[n] = 0.0;
1178  accumulateConjugateProduct(w + 0 * (2), u + 1 * (2), v + 2 * (2), +1);
1179  accumulateConjugateProduct(w + 0 * (2), u + 2 * (2), v + 1 * (2), -1);
1180  accumulateConjugateProduct(w + 1 * (2), u + 2 * (2), v + 0 * (2), +1);
1181  accumulateConjugateProduct(w + 1 * (2), u + 0 * (2), v + 2 * (2), -1);
1182  accumulateConjugateProduct(w + 2 * (2), u + 0 * (2), v + 1 * (2), +1);
1183  accumulateConjugateProduct(w + 2 * (2), u + 1 * (2), v + 0 * (2), -1);
1184  }
1185  }
1186  }
1187 }
1188 
1189 template <typename Float> void constructCloverField(Float *res, double norm, double diag)
1190 {
1191 
1192  Float c = 2.0 * norm / RAND_MAX;
1193 
1194  for (int i = 0; i < V; i++) {
1195  for (int j = 0; j < 72; j++) { res[i * 72 + j] = c * rand() - norm; }
1196 
1197  // impose clover symmetry on each chiral block
1198  for (int ch = 0; ch < 2; ch++) {
1199  res[i * 72 + 3 + 36 * ch] = -res[i * 72 + 0 + 36 * ch];
1200  res[i * 72 + 4 + 36 * ch] = -res[i * 72 + 1 + 36 * ch];
1201  res[i * 72 + 5 + 36 * ch] = -res[i * 72 + 2 + 36 * ch];
1202  res[i * 72 + 30 + 36 * ch] = -res[i * 72 + 6 + 36 * ch];
1203  res[i * 72 + 31 + 36 * ch] = -res[i * 72 + 7 + 36 * ch];
1204  res[i * 72 + 32 + 36 * ch] = -res[i * 72 + 8 + 36 * ch];
1205  res[i * 72 + 33 + 36 * ch] = -res[i * 72 + 9 + 36 * ch];
1206  res[i * 72 + 34 + 36 * ch] = -res[i * 72 + 16 + 36 * ch];
1207  res[i * 72 + 35 + 36 * ch] = -res[i * 72 + 17 + 36 * ch];
1208  }
1209 
1210  for (int j = 0; j < 6; j++) {
1211  res[i * 72 + j] += diag;
1212  res[i * 72 + j + 36] += diag;
1213  }
1214  }
1215 }
1216 
1217 template <typename Float> static void checkGauge(Float **oldG, Float **newG, double epsilon)
1218 {
1219 
1220  const int fail_check = 17;
1221  int fail[4][fail_check];
1222  int iter[4][18];
1223  for (int d = 0; d < 4; d++)
1224  for (int i = 0; i < fail_check; i++) fail[d][i] = 0;
1225  for (int d = 0; d < 4; d++)
1226  for (int i = 0; i < 18; i++) iter[d][i] = 0;
1227 
1228  for (int d = 0; d < 4; d++) {
1229  for (int eo = 0; eo < 2; eo++) {
1230  for (int i = 0; i < Vh; i++) {
1231  int ga_idx = (eo * Vh + i);
1232  for (int j = 0; j < 18; j++) {
1233  double diff = fabs(newG[d][ga_idx * 18 + j] - oldG[d][ga_idx * 18 + j]);
1234 
1235  for (int f = 0; f < fail_check; f++)
1236  if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[d][f]++;
1237  if (diff > epsilon || std::isnan(diff)) iter[d][j]++;
1238  }
1239  }
1240  }
1241  }
1242 
1243  printf("Component fails (X, Y, Z, T)\n");
1244  for (int i = 0; i < 18; i++)
1245  printf("%d fails = (%8d, %8d, %8d, %8d)\n", i, iter[0][i], iter[1][i], iter[2][i], iter[3][i]);
1246 
1247  printf("\nDeviation Failures = (X, Y, Z, T)\n");
1248  for (int f = 0; f < fail_check; f++) {
1249  printf("%e Failures = (%9d, %9d, %9d, %9d) = (%6.5f, %6.5f, %6.5f, %6.5f)\n", pow(10.0, -(f + 1)), fail[0][f],
1250  fail[1][f], fail[2][f], fail[3][f], fail[0][f] / (double)(V * 18), fail[1][f] / (double)(V * 18),
1251  fail[2][f] / (double)(V * 18), fail[3][f] / (double)(V * 18));
1252  }
1253 }
1254 
1255 void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision)
1256 {
1257  if (precision == QUDA_DOUBLE_PRECISION)
1258  checkGauge((double **)oldG, (double **)newG, epsilon);
1259  else
1260  checkGauge((float **)oldG, (float **)newG, epsilon);
1261 }
1262 
1263 void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
1264 {
1265  if (precision == QUDA_DOUBLE_PRECISION) {
1266  constructUnitaryGaugeField((double **)link);
1267  } else {
1268  constructUnitaryGaugeField((float **)link);
1269  }
1270 
1271  if (phase) {
1272 
1273  for (int i = 0; i < V; i++) {
1274  for (int dir = XUP; dir <= TUP; dir++) {
1275  int idx = i;
1276  int oddBit = 0;
1277  if (i >= Vh) {
1278  idx = i - Vh;
1279  oddBit = 1;
1280  }
1281 
1282  int X1 = Z[0];
1283  int X2 = Z[1];
1284  int X3 = Z[2];
1285  int X4 = Z[3];
1286 
1287  int full_idx = fullLatticeIndex(idx, oddBit);
1288  int i4 = full_idx / (X3 * X2 * X1);
1289  int i3 = (full_idx - i4 * (X3 * X2 * X1)) / (X2 * X1);
1290  int i2 = (full_idx - i4 * (X3 * X2 * X1) - i3 * (X2 * X1)) / X1;
1291  int i1 = full_idx - i4 * (X3 * X2 * X1) - i3 * (X2 * X1) - i2 * X1;
1292 
1293  double coeff = 1.0;
1294  switch (dir) {
1295  case XUP:
1296  if ((i4 & 1) != 0) { coeff *= -1; }
1297  break;
1298 
1299  case YUP:
1300  if (((i4 + i1) & 1) != 0) { coeff *= -1; }
1301  break;
1302 
1303  case ZUP:
1304  if (((i4 + i1 + i2) & 1) != 0) { coeff *= -1; }
1305  break;
1306 
1307  case TUP:
1308  if (last_node_in_t() && i4 == (X4 - 1)) { coeff *= -1; }
1309  break;
1310 
1311  default: printf("ERROR: wrong dir(%d)\n", dir); exit(1);
1312  }
1313 
1314  if (precision == QUDA_DOUBLE_PRECISION) {
1315  // double* mylink = (double*)link;
1316  // mylink = mylink + (4*i + dir)*gauge_site_size;
1317  double *mylink = (double *)link[dir];
1318  mylink = mylink + i * gauge_site_size;
1319 
1320  mylink[12] *= coeff;
1321  mylink[13] *= coeff;
1322  mylink[14] *= coeff;
1323  mylink[15] *= coeff;
1324  mylink[16] *= coeff;
1325  mylink[17] *= coeff;
1326 
1327  } else {
1328  // float* mylink = (float*)link;
1329  // mylink = mylink + (4*i + dir)*gauge_site_size;
1330  float *mylink = (float *)link[dir];
1331  mylink = mylink + i * gauge_site_size;
1332 
1333  mylink[12] *= coeff;
1334  mylink[13] *= coeff;
1335  mylink[14] *= coeff;
1336  mylink[15] *= coeff;
1337  mylink[16] *= coeff;
1338  mylink[17] *= coeff;
1339  }
1340  }
1341  }
1342  }
1343 
1344 #if 1
1345  for (int dir = 0; dir < 4; dir++) {
1346  for (int i = 0; i < V * gauge_site_size; i++) {
1347  if (precision == QUDA_SINGLE_PRECISION) {
1348  float *f = (float *)link[dir];
1349  if (f[i] != f[i] || (fabsf(f[i]) > 1.e+3)) {
1350  fprintf(stderr, "ERROR: %dth: bad number(%f) in function %s \n", i, f[i], __FUNCTION__);
1351  exit(1);
1352  }
1353  } else {
1354  double *f = (double *)link[dir];
1355  if (f[i] != f[i] || (fabs(f[i]) > 1.e+3)) {
1356  fprintf(stderr, "ERROR: %dth: bad number(%f) in function %s \n", i, f[i], __FUNCTION__);
1357  exit(1);
1358  }
1359  }
1360  }
1361  }
1362 #endif
1363 
1364  return;
1365 }
1366 
1367 template <typename Float> int compareLink(Float **linkA, Float **linkB, int len)
1368 {
1369  const int fail_check = 16;
1370  int fail[fail_check];
1371  for (int f = 0; f < fail_check; f++) fail[f] = 0;
1372 
1373  int iter[18];
1374  for (int i = 0; i < 18; i++) iter[i] = 0;
1375 
1376  for (int dir = 0; dir < 4; dir++) {
1377  for (int i = 0; i < len; i++) {
1378  for (int j = 0; j < 18; j++) {
1379  int is = i * 18 + j;
1380  double diff = fabs(linkA[dir][is] - linkB[dir][is]);
1381  for (int f = 0; f < fail_check; f++)
1382  if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[f]++;
1383  // if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
1384  if (diff > 1e-3 || std::isnan(diff)) iter[j]++;
1385  }
1386  }
1387  }
1388 
1389  for (int i = 0; i < 18; i++) printfQuda("%d fails = %d\n", i, iter[i]);
1390 
1391  int accuracy_level = 0;
1392  for (int f = 0; f < fail_check; f++) {
1393  if (fail[f] == 0) { accuracy_level = f; }
1394  }
1395 
1396  for (int f = 0; f < fail_check; f++) {
1397  printfQuda("%e Failures: %d / %d = %e\n", pow(10.0, -(f + 1)), fail[f], 4 * len * 18,
1398  fail[f] / (double)(4 * len * 18));
1399  }
1400 
1401  return accuracy_level;
1402 }
1403 
1404 static int compare_link(void **linkA, void **linkB, int len, QudaPrecision precision)
1405 {
1406  int ret;
1407 
1408  if (precision == QUDA_DOUBLE_PRECISION) {
1409  ret = compareLink((double **)linkA, (double **)linkB, len);
1410  } else {
1411  ret = compareLink((float **)linkA, (float **)linkB, len);
1412  }
1413 
1414  return ret;
1415 }
1416 
1417 // X indexes the lattice site
1418 static void printLinkElement(void *link, int X, QudaPrecision precision)
1419 {
1420  if (precision == QUDA_DOUBLE_PRECISION) {
1421  for (int i = 0; i < 3; i++) { printVector((double *)link + X * gauge_site_size + i * 6); }
1422 
1423  } else {
1424  for (int i = 0; i < 3; i++) { printVector((float *)link + X * gauge_site_size + i * 6); }
1425  }
1426 }
1427 
1428 int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec)
1429 {
1430  printfQuda("%s\n", msgA);
1431  printLinkElement(linkA[0], 0, prec);
1432  printfQuda("\n");
1433  printLinkElement(linkA[0], 1, prec);
1434  printfQuda("...\n");
1435  printLinkElement(linkA[3], len - 1, prec);
1436  printfQuda("\n");
1437 
1438  printfQuda("\n%s\n", msgB);
1439  printLinkElement(linkB[0], 0, prec);
1440  printfQuda("\n");
1441  printLinkElement(linkB[0], 1, prec);
1442  printfQuda("...\n");
1443  printLinkElement(linkB[3], len - 1, prec);
1444  printfQuda("\n");
1445 
1446  int ret = compare_link(linkA, linkB, len, prec);
1447  return ret;
1448 }
1449 
1450 void createMomCPU(void *mom, QudaPrecision precision)
1451 {
1452  void *temp;
1453 
1454  size_t gSize = (precision == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
1455  temp = malloc(4 * V * gauge_site_size * gSize);
1456  if (temp == NULL) {
1457  fprintf(stderr, "Error: malloc failed for temp in function %s\n", __FUNCTION__);
1458  exit(1);
1459  }
1460 
1461  for (int i = 0; i < V; i++) {
1462  if (precision == QUDA_DOUBLE_PRECISION) {
1463  for (int dir = 0; dir < 4; dir++) {
1464  double *thismom = (double *)mom;
1465  for (int k = 0; k < mom_site_size; k++) {
1466  thismom[(4 * i + dir) * mom_site_size + k] = 1.0 * rand() / RAND_MAX;
1467  if (k == mom_site_size - 1) thismom[(4 * i + dir) * mom_site_size + k] = 0.0;
1468  }
1469  }
1470  } else {
1471  for (int dir = 0; dir < 4; dir++) {
1472  float *thismom = (float *)mom;
1473  for (int k = 0; k < mom_site_size; k++) {
1474  thismom[(4 * i + dir) * mom_site_size + k] = 1.0 * rand() / RAND_MAX;
1475  if (k == mom_site_size - 1) thismom[(4 * i + dir) * mom_site_size + k] = 0.0;
1476  }
1477  }
1478  }
1479  }
1480 
1481  free(temp);
1482  return;
1483 }
1484 
1485 void createHwCPU(void *hw, QudaPrecision precision)
1486 {
1487  for (int i = 0; i < V; i++) {
1488  if (precision == QUDA_DOUBLE_PRECISION) {
1489  for (int dir = 0; dir < 4; dir++) {
1490  double *thishw = (double *)hw;
1491  for (int k = 0; k < hw_site_size; k++) { thishw[(4 * i + dir) * hw_site_size + k] = 1.0 * rand() / RAND_MAX; }
1492  }
1493  } else {
1494  for (int dir = 0; dir < 4; dir++) {
1495  float *thishw = (float *)hw;
1496  for (int k = 0; k < hw_site_size; k++) { thishw[(4 * i + dir) * hw_site_size + k] = 1.0 * rand() / RAND_MAX; }
1497  }
1498  }
1499  }
1500 
1501  return;
1502 }
1503 
1504 template <typename Float> int compare_mom(Float *momA, Float *momB, int len)
1505 {
1506  const int fail_check = 16;
1507  int fail[fail_check];
1508  for (int f = 0; f < fail_check; f++) fail[f] = 0;
1509 
1510  int iter[mom_site_size];
1511  for (int i = 0; i < mom_site_size; i++) iter[i] = 0;
1512 
1513  for (int i = 0; i < len; i++) {
1514  for (int j = 0; j < mom_site_size - 1; j++) {
1515  int is = i * mom_site_size + j;
1516  double diff = fabs(momA[is] - momB[is]);
1517  for (int f = 0; f < fail_check; f++)
1518  if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[f]++;
1519  // if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
1520  if (diff > 1e-3 || std::isnan(diff)) iter[j]++;
1521  }
1522  }
1523 
1524  int accuracy_level = 0;
1525  for (int f = 0; f < fail_check; f++) {
1526  if (fail[f] == 0) { accuracy_level = f + 1; }
1527  }
1528 
1529  for (int i = 0; i < mom_site_size; i++) printfQuda("%d fails = %d\n", i, iter[i]);
1530 
1531  for (int f = 0; f < fail_check; f++) {
1532  printfQuda("%e Failures: %d / %d = %e\n", pow(10.0, -(f + 1)), fail[f], len * 9, fail[f] / (double)(len * 9));
1533  }
1534 
1535  return accuracy_level;
1536 }
1537 
1538 static void printMomElement(void *mom, int X, QudaPrecision precision)
1539 {
1540  if (precision == QUDA_DOUBLE_PRECISION) {
1541  double *thismom = ((double *)mom) + X * mom_site_size;
1542  printVector(thismom);
1543  printfQuda("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1544  } else {
1545  float *thismom = ((float *)mom) + X * mom_site_size;
1546  printVector(thismom);
1547  printfQuda("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1548  }
1549 }
1550 
1551 int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec)
1552 {
1553  printfQuda("mom:\n");
1554  printMomElement(momA, 0, prec);
1555  printfQuda("\n");
1556  printMomElement(momA, 1, prec);
1557  printfQuda("\n");
1558  printMomElement(momA, 2, prec);
1559  printfQuda("\n");
1560  printMomElement(momA, 3, prec);
1561  printfQuda("...\n");
1562 
1563  printfQuda("\nreference mom:\n");
1564  printMomElement(momB, 0, prec);
1565  printfQuda("\n");
1566  printMomElement(momB, 1, prec);
1567  printfQuda("\n");
1568  printMomElement(momB, 2, prec);
1569  printfQuda("\n");
1570  printMomElement(momB, 3, prec);
1571  printfQuda("\n");
1572 
1573  int ret;
1574  if (prec == QUDA_DOUBLE_PRECISION) {
1575  ret = compare_mom((double *)momA, (double *)momB, len);
1576  } else {
1577  ret = compare_mom((float *)momA, (float *)momB, len);
1578  }
1579 
1580  return ret;
1581 }
1582 
1583 // compute the magnitude squared anti-Hermitian matrix, including the
1584 // MILC convention of subtracting 4 from each site norm to improve
1585 // stability
1586 template <typename real> double mom_action(real *mom_, int len)
1587 {
1588  double action = 0.0;
1589  for (int i = 0; i < len; i++) {
1590  real *mom = mom_ + i * mom_site_size;
1591  double local = 0.0;
1592  for (int j = 0; j < 6; j++) local += mom[j] * mom[j];
1593  for (int j = 6; j < 9; j++) local += 0.5 * mom[j] * mom[j];
1594  local -= 4.0;
1595  action += local;
1596  }
1597 
1598  return action;
1599 }
1600 
1601 double mom_action(void *mom, QudaPrecision prec, int len)
1602 {
1603  double action = 0.0;
1604  if (prec == QUDA_DOUBLE_PRECISION) {
1605  action = mom_action<double>((double *)mom, len);
1606  } else if (prec == QUDA_SINGLE_PRECISION) {
1607  action = mom_action<float>((float *)mom, len);
1608  }
1609  comm_allreduce(&action);
1610  return action;
1611 }
1612 
1613 static struct timeval startTime;
1614 
1615 void stopwatchStart() { gettimeofday(&startTime, NULL); }
1616 
1618 {
1619  struct timeval endTime;
1620  gettimeofday(&endTime, NULL);
1621 
1622  long ds = endTime.tv_sec - startTime.tv_sec;
1623  long dus = endTime.tv_usec - startTime.tv_usec;
1624  return ds + 0.000001 * dus;
1625 }
1626 
1627 void performanceStats(std::vector<double> &time, std::vector<double> &gflops, std::vector<int> &iter)
1628 {
1629  auto mean_time = 0.0;
1630  auto mean_time2 = 0.0;
1631  auto mean_gflops = 0.0;
1632  auto mean_gflops2 = 0.0;
1633  auto mean_iter = 0.0;
1634  auto mean_iter2 = 0.0;
1635  // skip first solve due to allocations, potential UVM swapping overhead
1636  for (int i = 1; i < Nsrc; i++) {
1637  mean_time += time[i];
1638  mean_time2 += time[i] * time[i];
1639  mean_gflops += gflops[i];
1640  mean_gflops2 += gflops[i] * gflops[i];
1641  mean_iter += iter[i];
1642  mean_iter2 += iter[i] * iter[i];
1643  }
1644 
1645  auto NsrcM1 = Nsrc - 1;
1646 
1647  mean_time /= NsrcM1;
1648  mean_time2 /= NsrcM1;
1649  auto stddev_time = NsrcM1 > 1 ? sqrt((NsrcM1 / ((double)NsrcM1 - 1.0)) * (mean_time2 - mean_time * mean_time)) :
1650  std::numeric_limits<double>::infinity();
1651  mean_gflops /= NsrcM1;
1652  mean_gflops2 /= NsrcM1;
1653  auto stddev_gflops = NsrcM1 > 1 ? sqrt((NsrcM1 / ((double)NsrcM1 - 1.0)) * (mean_gflops2 - mean_gflops * mean_gflops)) :
1654  std::numeric_limits<double>::infinity();
1655 
1656  mean_iter /= NsrcM1;
1657  mean_iter2 /= NsrcM1;
1658  auto stddev_iter = NsrcM1 > 1 ? sqrt((NsrcM1 / ((double)NsrcM1 - 1.0)) * (mean_iter2 - mean_iter * mean_iter)) :
1659  std::numeric_limits<double>::infinity();
1660 
1661  printfQuda("%d solves, mean iteration count %g (stddev = %g), with mean solve time %g (stddev = %g), mean GFLOPS %g "
1662  "(stddev = %g) [excluding first solve]\n",
1663  Nsrc, mean_iter, stddev_iter, mean_time, stddev_time, mean_gflops, stddev_gflops);
1664 }
QudaFieldLocation location
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
Class declaration to initialize and hold CURAND RNG states.
Definition: random_quda.h:23
int commCoords(int)
int commDim(int)
int comm_dim_partitioned(int dim)
int(* QudaCommsMap)(const int *coords, void *fdata)
Definition: comm_quda.h:12
void comm_finalize(void)
void comm_allreduce(double *data)
void commDimPartitionedSet(int dir)
QudaSolveType solve_type
QudaPrecision prec_refinement_sloppy
quda::mgarray< int > num_setup_iter
quda::mgarray< int > mg_eig_poly_deg
quda::mgarray< QudaEigType > mg_eig_type
QudaReconstructType link_recon_sloppy
quda::mgarray< int > mg_eig_check_interval
quda::mgarray< char[256]> mg_vec_outfile
quda::mgarray< double > setup_ca_lambda_max
QudaPrecision prec_null
QudaReconstructType link_recon
quda::mgarray< bool > mg_eig
std::array< int, 4 > dim
quda::mgarray< QudaInverterType > coarse_solver
QudaReconstructType link_recon_precondition
quda::mgarray< QudaCABasis > coarse_solver_ca_basis
quda::mgarray< int > n_block_ortho
std::array< int, 4 > grid_partition
QudaPrecision prec_ritz
quda::mgarray< int > coarse_solver_maxiter
double epsilon
bool compute_clover
quda::mgarray< char[256]> mg_vec_infile
quda::mgarray< int > nu_post
quda::mgarray< int > nu_pre
quda::mgarray< int > setup_ca_basis_size
int rank_order
quda::mgarray< int > mg_eig_n_kr
quda::mgarray< int > coarse_solver_ca_basis_size
quda::mgarray< double > mg_eig_amin
quda::mgarray< QudaVerbosity > mg_verbosity
quda::mgarray< double > setup_ca_lambda_min
char latfile[256]
quda::mgarray< int > setup_maxiter
quda::mgarray< int > nvec
quda::mgarray< QudaCABasis > setup_ca_basis
quda::mgarray< QudaSchwarzType > mg_schwarz_type
quda::mgarray< QudaEigSpectrumType > mg_eig_spectrum
quda::mgarray< int > mg_eig_max_restarts
QudaDslashType dslash_type
quda::mgarray< bool > mg_eig_use_dagger
quda::mgarray< double > mu_factor
quda::mgarray< double > coarse_solver_ca_lambda_max
std::array< int, 4 > dim_partitioned
quda::mgarray< int > mg_eig_n_ev
quda::mgarray< double > mg_eig_amax
quda::mgarray< QudaInverterType > setup_inv
quda::mgarray< double > setup_tol
quda::mgarray< double > coarse_solver_ca_lambda_min
quda::mgarray< bool > mg_eig_use_poly_acc
QudaPrecision prec_eigensolver
quda::mgarray< QudaSolveType > coarse_solve_type
int Nsrc
quda::mgarray< double > mg_eig_tol
bool unit_gauge
QudaPrecision prec_precondition
quda::mgarray< bool > mg_eig_use_normop
QudaPrecision prec
QudaReconstructType link_recon_eigensolver
quda::mgarray< QudaPrecision > mg_eig_save_prec
quda::mgarray< double > smoother_tol
quda::mgarray< QudaSolveType > smoother_solve_type
quda::mgarray< bool > mg_eig_require_convergence
quda::mgarray< double > coarse_solver_tol
QudaPrecision smoother_halo_prec
quda::mgarray< QudaInverterType > smoother_type
quda::mgarray< int > setup_maxiter_refresh
quda::mgarray< int > mg_schwarz_cycle
std::array< int, 4 > gridsize_from_cmdline
QudaPrecision prec_sloppy
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
QudaParity parity
Definition: covdev_test.cpp:40
const int nColor
Definition: covdev_test.cpp:44
cpuColorSpinorField * spinor
Definition: covdev_test.cpp:31
QudaGaugeParam gauge_param
Definition: covdev_test.cpp:26
QudaInvertParam inv_param
Definition: covdev_test.cpp:27
enum QudaPrecision_s QudaPrecision
@ QUDA_NOISE_UNIFORM
Definition: enum_quda.h:385
@ QUDA_POWER_BASIS
Definition: enum_quda.h:201
@ QUDA_MOBIUS_DWF_DSLASH
Definition: enum_quda.h:95
@ QUDA_DOMAIN_WALL_DSLASH
Definition: enum_quda.h:93
@ QUDA_MOBIUS_DWF_EOFA_DSLASH
Definition: enum_quda.h:96
@ QUDA_DOMAIN_WALL_4D_DSLASH
Definition: enum_quda.h:94
@ QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:325
@ QUDA_SUMMARIZE
Definition: enum_quda.h:266
@ QUDA_FULL_SITE_SUBSET
Definition: enum_quda.h:333
@ QUDA_PARITY_SITE_SUBSET
Definition: enum_quda.h:332
@ QUDA_BOOLEAN_FALSE
Definition: enum_quda.h:460
@ QUDA_BOOLEAN_TRUE
Definition: enum_quda.h:461
@ QUDA_RECONSTRUCT_INVALID
Definition: enum_quda.h:76
@ QUDA_RECONSTRUCT_12
Definition: enum_quda.h:71
@ QUDA_ANTI_PERIODIC_T
Definition: enum_quda.h:56
enum QudaDslashType_s QudaDslashType
enum QudaSolutionType_s QudaSolutionType
@ QUDA_EIG_TR_LANCZOS
Definition: enum_quda.h:137
@ QUDA_GCR_INVERTER
Definition: enum_quda.h:109
@ QUDA_BICGSTAB_INVERTER
Definition: enum_quda.h:108
@ QUDA_EVEN_ODD_SITE_ORDER
Definition: enum_quda.h:340
enum QudaReconstructType_s QudaReconstructType
@ QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:65
@ QUDA_SINGLE_PRECISION
Definition: enum_quda.h:64
@ QUDA_INVALID_PRECISION
Definition: enum_quda.h:66
@ QUDA_INVALID_SCHWARZ
Definition: enum_quda.h:189
@ QUDA_SPECTRUM_SR_EIG
Definition: enum_quda.h:150
@ QUDA_SPACE_SPIN_COLOR_FIELD_ORDER
Definition: enum_quda.h:351
@ QUDA_ZERO_FIELD_CREATE
Definition: enum_quda.h:361
@ QUDA_REFERENCE_FIELD_CREATE
Definition: enum_quda.h:363
@ QUDA_INVALID_SOLVE
Definition: enum_quda.h:175
@ QUDA_DIRECT_PC_SOLVE
Definition: enum_quda.h:169
@ QUDA_ASQTAD_LONG_LINKS
Definition: enum_quda.h:32
@ QUDA_WILSON_LINKS
Definition: enum_quda.h:30
@ QUDA_ASQTAD_FAT_LINKS
Definition: enum_quda.h:31
#define gauge_site_size
Definition: face_gauge.cpp:34
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
int Vs_y
Definition: host_utils.cpp:39
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
Definition: host_utils.cpp:889
void constructHostCloverField(void *clover, void *clover_inv, QudaInvertParam &inv_param)
Definition: host_utils.cpp:186
double mom_action(real *mom_, int len)
int Vh
Definition: host_utils.cpp:38
size_t host_gauge_data_type_size
Definition: host_utils.cpp:65
int dimPartitioned(int dim)
Definition: host_utils.cpp:376
int V5h
Definition: host_utils.cpp:50
QudaPrecision & cuda_prec
Definition: host_utils.cpp:58
void initComms(int argc, char **argv, std::array< int, 4 > &commDims)
Definition: host_utils.cpp:255
size_t host_spinor_data_type_size
Definition: host_utils.cpp:66
QudaPrecision & cuda_prec_sloppy
Definition: host_utils.cpp:59
void setQudaMgSolveTypes()
Definition: host_utils.cpp:81
void complexProduct(Float *a, Float *b, Float *c)
Definition: host_utils.cpp:699
void applyGaugeFieldScaling(Float **gauge, int Vh, QudaGaugeParam *param)
Definition: host_utils.cpp:968
int Vsh_y
Definition: host_utils.cpp:40
void createMomCPU(void *mom, QudaPrecision precision)
int Vsh_x
Definition: host_utils.cpp:40
void printGaugeElement(void *gauge, int X, QudaPrecision precision)
Definition: host_utils.cpp:575
int E1h
Definition: host_utils.cpp:44
void constructUnitGaugeField(Float **res, QudaGaugeParam *param)
int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec)
int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
Definition: host_utils.cpp:490
int compareLink(Float **linkA, Float **linkB, int len)
void finalizeComms()
Definition: host_utils.cpp:292
int x4_from_full_index(int i)
Definition: host_utils.cpp:953
void performanceStats(std::vector< double > &time, std::vector< double > &gflops, std::vector< int > &iter)
QudaPrecision & cuda_prec_eigensolver
Definition: host_utils.cpp:62
int V5
Definition: host_utils.cpp:49
void setQudaDefaultMgTestParams()
Definition: host_utils.cpp:89
int fullLatticeIndex_5d(int i, int oddBit)
Definition: host_utils.cpp:940
void complexDotProduct(Float *a, Float *b, Float *c)
Definition: host_utils.cpp:713
int compare_mom(Float *momA, Float *momB, int len)
int Z[4]
Definition: host_utils.cpp:36
int fullLatticeIndex_4d(int i, int oddBit)
Definition: host_utils.cpp:902
size_t host_clover_data_type_size
Definition: host_utils.cpp:67
void su3Construct8(Float *mat)
Definition: host_utils.cpp:751
QudaPrecision & cuda_prec_precondition
Definition: host_utils.cpp:61
int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec)
int neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1)
Definition: host_utils.cpp:528
int Vs_x
Definition: host_utils.cpp:39
const char * __asan_default_options()
Set the default ASAN options. This ensures that QUDA just works when SANITIZE is enabled without requ...
Definition: host_utils.cpp:638
void su3Construct12(Float *mat)
Definition: host_utils.cpp:739
void constructQudaCloverField(void *clover, double norm, double diag, QudaPrecision precision)
Definition: host_utils.cpp:199
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
int Vs_t
Definition: host_utils.cpp:39
void complexAddTo(Float *a, Float *b)
Definition: host_utils.cpp:692
QudaPrecision & cuda_prec_refinement_sloppy
Definition: host_utils.cpp:60
int Vs_z
Definition: host_utils.cpp:39
void dw_setDims(int *X, const int L5)
Definition: host_utils.cpp:353
int E1
Definition: host_utils.cpp:44
void complexConjugateProduct(Float *a, Float *b, Float *c)
Definition: host_utils.cpp:706
QudaPrecision & cpu_prec
Definition: host_utils.cpp:57
int lex_rank_from_coords_x(const int *coords, void *fdata)
Definition: host_utils.cpp:669
void printSpinorElement(void *spinor, int X, QudaPrecision precision)
Definition: host_utils.cpp:566
void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision)
Definition: host_utils.cpp:758
#define ZUP
Definition: host_utils.cpp:33
int Vsh_z
Definition: host_utils.cpp:40
void constructRandomSpinorSource(void *v, int nSpin, int nColor, QudaPrecision precision, QudaSolutionType sol_type, const int *const x, quda::RNG &rng)
Definition: host_utils.cpp:235
int Vsh_t
Definition: host_utils.cpp:40
void setQudaPrecisions()
Definition: host_utils.cpp:69
int E2
Definition: host_utils.cpp:44
#define XUP
Definition: host_utils.cpp:31
void constructHostGaugeField(void **gauge, QudaGaugeParam &gauge_param, int argc, char **argv)
Definition: host_utils.cpp:166
int V
Definition: host_utils.cpp:37
int Ls
Definition: host_utils.cpp:48
void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign)
Definition: host_utils.cpp:720
void coordinate_from_shrinked_index(int coordinate[4], int shrinked_index, const int shrinked_dim[4], const int shift[4], int parity)
Definition: host_utils.cpp:393
void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision)
int E4
Definition: host_utils.cpp:44
int E3
Definition: host_utils.cpp:44
int E[4]
Definition: host_utils.cpp:45
void constructCloverField(Float *res, double norm, double diag)
double stopwatchReadSeconds()
#define TUP
Definition: host_utils.cpp:34
void constructRandomGaugeField(Float **res, QudaGaugeParam *param, QudaDslashType dslash_type)
int Vh_ex
Definition: host_utils.cpp:46
#define YUP
Definition: host_utils.cpp:32
void createHwCPU(void *hw, QudaPrecision precision)
int V_ex
Definition: host_utils.cpp:46
int faceVolume[4]
Definition: host_utils.cpp:41
int fullLatticeIndex(int dim[4], int index, int oddBit)
Definition: host_utils.cpp:591
void printVector(Float *v)
Definition: host_utils.cpp:676
int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
Definition: host_utils.cpp:423
int lex_rank_from_coords_t(const int *coords, void *fdata)
Definition: host_utils.cpp:662
void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign)
Definition: host_utils.cpp:733
float fat_link_max
double kappa5
Definition: host_utils.cpp:51
bool last_node_in_t()
Definition: host_utils.cpp:378
void stopwatchStart()
void constructQudaGaugeField(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
Definition: host_utils.cpp:146
void constructUnitaryGaugeField(Float **res)
QudaPrecision & cuda_prec_ritz
Definition: host_utils.cpp:63
QudaPrecision local_prec
Definition: host_utils.cpp:56
int neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
Definition: host_utils.cpp:457
void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param)
Definition: host_utils.cpp:861
int index_4d_cb_from_coordinate_4d(const int coordinate[4], const int dim[4])
Definition: host_utils.cpp:388
void accumulateComplexDotProduct(Float *a, Float *b, Float *c)
Definition: host_utils.cpp:727
void get_size_from_env(int *const dims, const char env[])
Definition: host_utils.cpp:645
int getOddBit(int Y)
Definition: host_utils.cpp:682
void constructWilsonTestSpinorParam(quda::ColorSpinorParam *cs_param, const QudaInvertParam *inv_param, const QudaGaugeParam *gauge_param)
Definition: host_utils.cpp:207
int fullLatticeIndex_5d_4dpc(int i, int oddBit)
Definition: host_utils.cpp:947
void setDims(int *X)
Definition: host_utils.cpp:315
void initRand()
Definition: host_utils.cpp:302
#define mom_site_size
Definition: host_utils.h:12
bool isPCSolution(QudaSolutionType solution_type)
Definition: host_utils.h:112
void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type)
#define hw_site_size
Definition: host_utils.h:13
quda::cudaGaugeField * checkGauge(QudaInvertParam *param)
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:46
__host__ __device__ ValueType atan2(ValueType x, ValueType y)
Definition: complex_quda.h:76
std::map< TuneKey, TuneParam > map
Definition: tune.cpp:34
__host__ __device__ ValueType sin(ValueType x)
Definition: complex_quda.h:51
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:111
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
void spinorNoise(ColorSpinorField &src, RNG &randstates, QudaNoiseType type)
Generate a random noise spinor. This variant allows the user to manage the RNG state.
FloatingPoint< float > Float
__host__ __device__ T sum(const array< T, s > &a)
Definition: utility.h:76
_EXTERN_C_ int MPI_Init(int *argc, char ***argv)
Definition: nvtx_pmpi.c:42
QudaGaugeParam param
Definition: pack_test.cpp:18
void read_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, const int *X, int argc, char *argv[])
Definition: qio_field.h:12
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
double anisotropy
Definition: quda.h:37
QudaLinkType type
Definition: quda.h:41
QudaFieldLocation location
Definition: quda.h:33
QudaGaugeFixed gauge_fix
Definition: quda.h:63
int X[4]
Definition: quda.h:35
QudaPrecision cpu_prec
Definition: quda.h:46
QudaTboundary t_boundary
Definition: quda.h:44
int compute_clover
Definition: quda.h:266
QudaSolutionType solution_type
Definition: quda.h:228
int return_clover
Definition: quda.h:268
int compute_clover_inverse
Definition: quda.h:267
QudaPrecision clover_cpu_prec
Definition: quda.h:249
QudaDslashType dslash_type
Definition: quda.h:106
int return_clover_inverse
Definition: quda.h:269
QudaPrecision cpu_prec
Definition: quda.h:237
QudaGammaBasis gamma_basis
Definition: quda.h:246
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:68
QudaSiteSubset siteSubset
Definition: lattice_field.h:72
#define printfQuda(...)
Definition: util_quda.h:114
#define errorQuda(...)
Definition: util_quda.h:120