QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
staggered_invert_test.cpp
Go to the documentation of this file.
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 
6 #include <quda.h>
7 #include <quda_internal.h>
8 #include <dirac_quda.h>
9 #include <dslash_quda.h>
10 #include <invert_quda.h>
11 #include <util_quda.h>
12 #include <blas_quda.h>
13 
14 #include <misc.h>
15 #include <test_util.h>
16 #include <dslash_util.h>
18 #include <staggered_gauge_utils.h>
19 #include <llfat_reference.h>
20 #include <gauge_field.h>
21 #include <unitarization_links.h>
22 #include <blas_reference.h>
23 #include <random_quda.h>
24 
25 #if defined(QMP_COMMS)
26 #include <qmp.h>
27 #elif defined(MPI_COMMS)
28 #include <mpi.h>
29 #endif
30 
31 #include <qio_field.h>
32 
33 #define MAX(a,b) ((a)>(b)?(a):(b))
34 
35 // In a typical application, quda.h is the only QUDA header required.
36 #include <quda.h>
37 
38 #define mySpinorSiteSize 6
39 
40 extern void usage(char** argv);
41 
43 
44 extern int device;
45 
47 size_t gSize = sizeof(double);
48 
49 extern double reliable_delta;
50 extern bool alternative_reliable;
51 extern int test_type;
52 extern int xdim;
53 extern int ydim;
54 extern int zdim;
55 extern int tdim;
56 extern int gridsize_from_cmdline[];
58 extern QudaPrecision prec;
63 extern double mass; // the mass of the Dirac operator
64 extern double kappa;
65 extern int laplace3D;
66 extern double tol; // tolerance for inverter
67 extern double tol_hq; // heavy-quark tolerance for inverter
68 extern char latfile[];
69 extern int Nsrc; // number of spinors to apply to simultaneously
70 extern int niter;
71 extern int gcrNkrylov;
72 extern int pipeline; // length of pipeline for fused operations in GCR or BiCGstab-l
73 extern int solution_accumulator_pipeline; // length of pipeline for fused solution update from the direction vectors
74 extern QudaCABasis ca_basis; // basis for CA-CG solves
75 extern double ca_lambda_min; // minimum eigenvalue for scaling Chebyshev CA-CG solves
76 extern double ca_lambda_max; // maximum eigenvalue for scaling Chebyshev CA-CG solves
77 
78 // Dirac operator type
80 extern QudaMatPCType matpc_type; // preconditioning type
81 extern QudaSolutionType solution_type; // solution type
83 
84 extern bool compute_fatlong; // build the true fat/long links or use random numbers
85 extern double tadpole_factor;
86 // relativistic correction for naik term
87 extern double eps_naik;
88 // Number of naiks. If eps_naik is 0.0, we only need
89 // to construct one naik.
90 static int n_naiks = 1;
91 
92 // For loading the gauge fields
94 char** argv_copy;
95 
96 int X[4];
97 
102 
103 static void end();
104 
106 {
107  gauge_param.X[0] = xdim;
108  gauge_param.X[1] = ydim;
109  gauge_param.X[2] = zdim;
110  gauge_param.X[3] = tdim;
111 
112  gauge_param.cpu_prec = cpu_prec;
113  gauge_param.cuda_prec = prec;
114  gauge_param.reconstruct = link_recon;
116  gauge_param.cuda_prec_sloppy = prec_sloppy;
118 
119  gauge_param.anisotropy = 1.0;
120 
121  // For HISQ, this must always be set to 1.0, since the tadpole
122  // correction is baked into the coefficients for the first fattening.
123  // The tadpole doesn't mean anything for the second fattening
124  // since the input fields are unitarized.
125  gauge_param.tadpole_coeff = 1.0;
126 
128  gauge_param.scale = -1.0 / 24.0;
129  if (eps_naik != 0) { gauge_param.scale *= (1.0 + eps_naik); }
130  } else {
131  gauge_param.scale = 1.0;
132  }
133  gauge_param.gauge_order = QUDA_MILC_GAUGE_ORDER;
134  gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
136  gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
137  gauge_param.type = QUDA_WILSON_LINKS;
138 
139  gauge_param.ga_pad = 0;
140 
141 #ifdef MULTI_GPU
142  int x_face_size = gauge_param.X[1] * gauge_param.X[2] * gauge_param.X[3] / 2;
143  int y_face_size = gauge_param.X[0] * gauge_param.X[2] * gauge_param.X[3] / 2;
144  int z_face_size = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[3] / 2;
145  int t_face_size = gauge_param.X[0] * gauge_param.X[1] * gauge_param.X[2] / 2;
146  int pad_size =MAX(x_face_size, y_face_size);
147  pad_size = MAX(pad_size, z_face_size);
148  pad_size = MAX(pad_size, t_face_size);
149  gauge_param.ga_pad = pad_size;
150 #endif
151 }
152 
154 {
155  // Solver params
156  inv_param.verbosity = QUDA_VERBOSE;
157  inv_param.mass = mass;
158  inv_param.kappa = kappa = 1.0 / (8.0 + mass); // for Laplace operator
159  inv_param.laplace3D = laplace3D; // for Laplace operator
160 
161  // outer solver parameters
162  inv_param.inv_type = inv_type;
163  inv_param.tol = tol;
164  inv_param.tol_restart = 1e-3; // now theoretical background for this parameter...
165  inv_param.maxiter = niter;
166  inv_param.reliable_delta = reliable_delta;
168  inv_param.use_sloppy_partial_accumulator = false;
170  inv_param.pipeline = pipeline;
171 
172  inv_param.Ls = Nsrc;
173 
174  if (tol_hq == 0 && tol == 0) {
175  errorQuda("qudaInvert: requesting zero residual\n");
176  exit(1);
177  }
178  // require both L2 relative and heavy quark residual to determine convergence
179  inv_param.residual_type = static_cast<QudaResidualType_s>(0);
180  inv_param.residual_type = (tol != 0) ?
181  static_cast<QudaResidualType_s>(inv_param.residual_type | QUDA_L2_RELATIVE_RESIDUAL) :
182  inv_param.residual_type;
183  inv_param.residual_type = (tol_hq != 0) ?
184  static_cast<QudaResidualType_s>(inv_param.residual_type | QUDA_HEAVY_QUARK_RESIDUAL) :
185  inv_param.residual_type;
186  inv_param.heavy_quark_check = (inv_param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? 5 : 0);
187 
188  inv_param.tol_hq = tol_hq; // specify a tolerance for the residual for heavy quark residual
189 
190  inv_param.Nsteps = 2;
191 
192  // domain decomposition preconditioner parameters
194  inv_param.tol_precondition = 1e-1;
195  inv_param.maxiter_precondition = 10;
197  inv_param.cuda_prec_precondition = inv_param.cuda_prec_sloppy;
198 
199  // Specify Krylov sub-size for GCR, BICGSTAB(L), basis size for CA-CG, CA-GCR
200  inv_param.gcrNkrylov = gcrNkrylov;
201 
202  // Specify basis for CA-CG, lambda min/max for Chebyshev basis
203  // lambda_max < lambda_max . use power iters to generate
204  inv_param.ca_basis = ca_basis;
205  inv_param.ca_lambda_min = ca_lambda_min;
206  inv_param.ca_lambda_max = ca_lambda_max;
207 
208  inv_param.solution_type = solution_type;
209  inv_param.solve_type = solve_type;
210  inv_param.matpc_type = matpc_type;
211  inv_param.dagger = QUDA_DAG_NO;
213 
214  inv_param.cpu_prec = cpu_prec;
215  inv_param.cuda_prec = prec;
216  inv_param.cuda_prec_sloppy = prec_sloppy;
219  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // this is meaningless, but must be thus set
220  inv_param.dirac_order = QUDA_DIRAC_ORDER;
221 
222  inv_param.dslash_type = dslash_type;
223 
226 
227  int tmpint = MAX(X[1] * X[2] * X[3], X[0] * X[2] * X[3]);
228  tmpint = MAX(tmpint, X[0] * X[1] * X[3]);
229  tmpint = MAX(tmpint, X[0] * X[1] * X[2]);
230 
231  inv_param.sp_pad = tmpint;
232 }
233 
235 {
236 
237  // Ensure that the default is improved staggered
240 
242  setGaugeParam(gauge_param);
244  setInvertParam(inv_param);
245 
246  // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME)
247  initQuda(device);
248 
249  setDims(gauge_param.X);
250  dw_setDims(gauge_param.X, Nsrc); // so we can use 5-d indexing from dwf
252 
253  size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
254 
255  void* qdp_inlink[4] = {nullptr,nullptr,nullptr,nullptr};
256  void* qdp_fatlink[4] = {nullptr,nullptr,nullptr,nullptr};
257  void* qdp_longlink[4] = {nullptr,nullptr,nullptr,nullptr};
258  void* milc_fatlink = nullptr;
259  void* milc_longlink = nullptr;
260 
261  for (int dir = 0; dir < 4; dir++) {
262  qdp_inlink[dir] = malloc(V*gaugeSiteSize*gSize);
263  qdp_fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
264  qdp_longlink[dir] = malloc(V*gaugeSiteSize*gSize);
265  }
266  milc_fatlink = malloc(4*V*gaugeSiteSize*gSize);
267  milc_longlink = malloc(4 * V * gaugeSiteSize * gSize);
268 
269  // for load, etc
270  gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
271 
272  // load a field WITHOUT PHASES
273  if (strcmp(latfile, "")) {
274  read_gauge_field(latfile, qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc_copy, argv_copy);
276  applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec);
277  }
278  } else {
280  construct_gauge_field(qdp_inlink, 1, gauge_param.cpu_prec, &gauge_param);
281  } else {
282  construct_fat_long_gauge_field(qdp_inlink, qdp_longlink, 1, gauge_param.cpu_prec, &gauge_param,
284  }
285  }
286 
287  // Compute plaquette. Routine is aware that the gauge fields already have the phases on them.
288  double plaq[3];
289  computeStaggeredPlaquetteQDPOrder(qdp_inlink, plaq, gauge_param, dslash_type);
290 
291  printfQuda("Computed plaquette is %e (spatial = %e, temporal = %e)\n", plaq[0], plaq[1], plaq[2]);
292 
293  // QUDA_STAGGERED_DSLASH follows the same codepath whether or not you
294  // "compute" the fat/long links or not.
296  for (int dir = 0; dir < 4; dir++) {
297  memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gaugeSiteSize * gSize);
298  memset(qdp_longlink[dir],0,V*gaugeSiteSize*gSize);
299  }
300  } else { // QUDA_ASQTAD_DSLASH
301 
302  if (compute_fatlong) {
303  computeFatLongGPU(qdp_fatlink, qdp_longlink, qdp_inlink, gauge_param, gSize, n_naiks, eps_naik);
304  } else {
305  for (int dir = 0; dir < 4; dir++) {
306  memcpy(qdp_fatlink[dir],qdp_inlink[dir], V*gaugeSiteSize*gSize);
307  }
308  }
309 
310  // Compute fat link plaquette
311  computeStaggeredPlaquetteQDPOrder(qdp_fatlink, plaq, gauge_param, dslash_type);
312 
313  printfQuda("Computed fat link plaquette is %e (spatial = %e, temporal = %e)\n", plaq[0], plaq[1], plaq[2]);
314  }
315 
316  // Alright, we've created all the void** links.
317  // Create the void* pointers
318  reorderQDPtoMILC(milc_fatlink, qdp_fatlink, V, gaugeSiteSize, gauge_param.cpu_prec, gauge_param.cpu_prec);
319  reorderQDPtoMILC(milc_longlink, qdp_longlink, V, gaugeSiteSize, gauge_param.cpu_prec, gauge_param.cpu_prec);
320 
322  csParam.nColor=3;
323  csParam.nSpin = 1;
324  csParam.nDim = 5;
325  for (int d = 0; d < 4; d++) csParam.x[d] = gauge_param.X[d];
326  bool pc = (inv_param.solution_type == QUDA_MATPC_SOLUTION || inv_param.solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
327  if (pc) csParam.x[0] /= 2;
328  csParam.x[4] = Nsrc;
329 
330  csParam.setPrecision(inv_param.cpu_prec);
331  csParam.pad = 0;
335  csParam.gammaBasis = inv_param.gamma_basis;
336  csParam.create = QUDA_ZERO_FIELD_CREATE;
337  in = new cpuColorSpinorField(csParam);
338  out = new cpuColorSpinorField(csParam);
339  ref = new cpuColorSpinorField(csParam);
340  tmp = new cpuColorSpinorField(csParam);
341 
342  // Construct source
343  auto *rng = new quda::RNG(quda::LatticeFieldParam(gauge_param), 1234);
344  rng->Init();
345 
346  construct_spinor_source(in->V(), 1, 3, inv_param.cpu_prec, csParam.x, *rng);
347 
348  rng->Release();
349  delete rng;
350 
351 #ifdef MULTI_GPU
352  int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2);
353  tmp_value = MAX(tmp_value, xdim * ydim * tdim / 2);
354  tmp_value = MAX(tmp_value, xdim * ydim * zdim / 2);
355 
356  int fat_pad = tmp_value;
357  int link_pad = 3*tmp_value;
358 
359  // FIXME: currently assume staggered is SU(3)
363  gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
364  GaugeFieldParam cpuFatParam(milc_fatlink, gauge_param);
366  cpuGaugeField *cpuFat = new cpuGaugeField(cpuFatParam);
367  ghost_fatlink = (void**)cpuFat->Ghost();
368 
369  gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
370  GaugeFieldParam cpuLongParam(milc_longlink, gauge_param);
371  cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
372  cpuGaugeField *cpuLong = new cpuGaugeField(cpuLongParam);
373  ghost_longlink = (void**)cpuLong->Ghost();
374 
375 #else
376  int fat_pad = 0;
377  int link_pad = 0;
378 #endif
379 
383  gauge_param.ga_pad = fat_pad;
385  gauge_param.reconstruct = link_recon;
387  } else {
388  gauge_param.reconstruct = gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
389  }
390  gauge_param.cuda_prec_precondition = gauge_param.cuda_prec_sloppy;
391  gauge_param.reconstruct_precondition = gauge_param.reconstruct_sloppy;
392  loadGaugeQuda(milc_fatlink, &gauge_param);
393 
395  gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
396  gauge_param.ga_pad = link_pad;
398  gauge_param.reconstruct = link_recon;
399  gauge_param.reconstruct_sloppy = link_recon_sloppy;
400  gauge_param.cuda_prec_precondition = gauge_param.cuda_prec_sloppy;
401  gauge_param.reconstruct_precondition = gauge_param.reconstruct_sloppy;
402  loadGaugeQuda(milc_longlink, &gauge_param);
403  }
404 
405  double time0 = -((double)clock()); // Start the timer
406 
407  double nrm2=0;
408  double src2=0;
409  int ret = 0;
410 
411  int len = 0;
413  len = V*Nsrc;
414  } else {
415  len = Vh*Nsrc;
416  }
417 
418  switch (test_type) {
419  case 0: // full parity solution
420  case 1: // solving prec system, reconstructing
421  case 2:
422 
423  invertQuda(out->V(), in->V(), &inv_param);
424  time0 += clock(); // stop the timer
425  time0 /= CLOCKS_PER_SEC;
426 
427  // In QUDA, the full staggered operator has the sign convention
428  //{{m, -D_eo},{-D_oe,m}}, while the CPU verify function does not
429  // have the minus sign. Passing in QUDA_DAG_YES solves this
430  // discrepancy
431  staggered_dslash(reinterpret_cast<cpuColorSpinorField *>(&ref->Even()), qdp_fatlink, qdp_longlink, ghost_fatlink,
432  ghost_longlink, reinterpret_cast<cpuColorSpinorField *>(&out->Odd()), QUDA_EVEN_PARITY,
433  QUDA_DAG_YES, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
434  staggered_dslash(reinterpret_cast<cpuColorSpinorField *>(&ref->Odd()), qdp_fatlink, qdp_longlink, ghost_fatlink,
435  ghost_longlink, reinterpret_cast<cpuColorSpinorField *>(&out->Even()), QUDA_ODD_PARITY,
436  QUDA_DAG_YES, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
437 
439  xpay(out->V(), kappa, ref->V(), ref->Length(), gauge_param.cpu_prec);
440  ax(0.5 / kappa, ref->V(), ref->Length(), gauge_param.cpu_prec);
441  } else {
442  axpy(2 * mass, out->V(), ref->V(), ref->Length(), gauge_param.cpu_prec);
443  }
444 
445  // Reference debugging code: print the first component
446  // of the even and odd partities within a solution vector.
447  /*
448  printfQuda("\nLength: %lu\n", ref->Length());
449 
450  // for verification
451  printfQuda("\n\nEven:\n");
452  printfQuda("CUDA: %f\n", ((double*)(in->Even().V()))[0]);
453  printfQuda("Soln: %f\n", ((double*)(out->Even().V()))[0]);
454  printfQuda("CPU: %f\n", ((double*)(ref->Even().V()))[0]);
455 
456  printfQuda("\n\nOdd:\n");
457  printfQuda("CUDA: %f\n", ((double*)(in->Odd().V()))[0]);
458  printfQuda("Soln: %f\n", ((double*)(out->Odd().V()))[0]);
459  printfQuda("CPU: %f\n", ((double*)(ref->Odd().V()))[0]);
460  printfQuda("\n\n");
461  */
462 
463  mxpy(in->V(), ref->V(), len * mySpinorSiteSize, inv_param.cpu_prec);
464  nrm2 = norm_2(ref->V(), len * mySpinorSiteSize, inv_param.cpu_prec);
465  src2 = norm_2(in->V(), len * mySpinorSiteSize, inv_param.cpu_prec);
466 
467  break;
468 
469  case 3: // even
470  case 4:
471 
472  invertQuda(out->V(), in->V(), &inv_param);
473 
474  time0 += clock();
475  time0 /= CLOCKS_PER_SEC;
476 
477  matdagmat(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec,
478  gauge_param.cpu_prec, tmp, test_type == 3 ? QUDA_EVEN_PARITY : QUDA_ODD_PARITY, dslash_type);
479 
480  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION) {
481  printfQuda("%f %f\n", ((float *)in->V())[12], ((float *)ref->V())[12]);
482  } else {
483  printfQuda("%f %f\n", ((double *)in->V())[12], ((double *)ref->V())[12]);
484  }
485 
486  mxpy(in->V(), ref->V(), len * mySpinorSiteSize, inv_param.cpu_prec);
487  nrm2 = norm_2(ref->V(), len * mySpinorSiteSize, inv_param.cpu_prec);
488  src2 = norm_2(in->V(), len * mySpinorSiteSize, inv_param.cpu_prec);
489 
490  break;
491 
492  case 5: // multi mass CG, even
493  case 6:
494 
495 #define NUM_OFFSETS 12
496 
497  {
498  double masses[NUM_OFFSETS] ={0.06, 0.061, 0.064, 0.070, 0.077, 0.081, 0.1, 0.11, 0.12, 0.13, 0.14, 0.205};
499  inv_param.num_offset = NUM_OFFSETS;
500  // these can be set independently
501  for (int i = 0; i < inv_param.num_offset; i++) {
502  inv_param.tol_offset[i] = inv_param.tol;
503  inv_param.tol_hq_offset[i] = inv_param.tol_hq;
504  }
505  void* outArray[NUM_OFFSETS];
506 
507  cpuColorSpinorField* spinorOutArray[NUM_OFFSETS];
508  spinorOutArray[0] = out;
509  for (int i = 1; i < inv_param.num_offset; i++) { spinorOutArray[i] = new cpuColorSpinorField(csParam); }
510 
511  for (int i = 0; i < inv_param.num_offset; i++) {
512  outArray[i] = spinorOutArray[i]->V();
513  inv_param.offset[i] = 4*masses[i]*masses[i];
514  }
515 
516  invertMultiShiftQuda(outArray, in->V(), &inv_param);
517 
518  cudaDeviceSynchronize();
519  time0 += clock(); // stop the timer
520  time0 /= CLOCKS_PER_SEC;
521 
522  printfQuda("done: total time = %g secs, compute time = %g, %i iter / %g secs = %g gflops\n", time0,
523  inv_param.secs, inv_param.iter, inv_param.secs, inv_param.gflops / inv_param.secs);
524 
525  printfQuda("checking the solution\n");
527  if (inv_param.solve_type == QUDA_NORMOP_SOLVE) {
528  //parity = QUDA_EVENODD_PARITY;
529  errorQuda("full parity not supported\n");
530  } else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN) {
531  parity = QUDA_EVEN_PARITY;
532  } else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD) {
533  parity = QUDA_ODD_PARITY;
534  } else {
535  errorQuda("ERROR: invalid spinor parity \n");
536  }
537  for(int i=0;i < inv_param.num_offset;i++){
538  printfQuda("%dth solution: mass=%f, ", i, masses[i]);
539  matdagmat(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, spinorOutArray[i], masses[i], 0,
540  inv_param.cpu_prec, gauge_param.cpu_prec, tmp, parity, dslash_type);
541 
542  mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
543  double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
544  double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
545  double hqr = sqrt(blas::HeavyQuarkResidualNorm(*spinorOutArray[i], *ref).z);
546  double l2r = sqrt(nrm2/src2);
547 
548  printfQuda("Shift %d residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, "
549  "host = %g\n",
550  i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r, inv_param.tol_hq_offset[i],
551  inv_param.true_res_hq_offset[i], hqr);
552 
553  //emperical, if the cpu residue is more than 1 order the target accuracy, the it fails to converge
554  if (sqrt(nrm2/src2) > 10*inv_param.tol_offset[i]){
555  ret |=1;
556  }
557  }
558 
559  for(int i=1; i < inv_param.num_offset;i++) delete spinorOutArray[i];
560  } break;
561 
562  default:
563  errorQuda("Unsupported test type");
564 
565  } // switch
566 
567  if (test_type <=4){
568 
569  double hqr = sqrt(blas::HeavyQuarkResidualNorm(*out, *ref).z);
570  double l2r = sqrt(nrm2/src2);
571 
572  printfQuda("Residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n",
573  inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr);
574 
575  printfQuda("done: total time = %g secs, compute time = %g secs, %i iter / %g secs = %g gflops, \n", time0,
576  inv_param.secs, inv_param.iter, inv_param.secs, inv_param.gflops / inv_param.secs);
577  }
578 
579  // Clean up gauge fields, at least
580  for (int dir = 0; dir < 4; dir++) {
581  if (qdp_inlink[dir] != nullptr) { free(qdp_inlink[dir]); qdp_inlink[dir] = nullptr; }
582  if (qdp_fatlink[dir] != nullptr) { free(qdp_fatlink[dir]); qdp_fatlink[dir] = nullptr; }
583  if (qdp_longlink[dir] != nullptr) { free(qdp_longlink[dir]); qdp_longlink[dir] = nullptr; }
584  }
585  if (milc_fatlink != nullptr) { free(milc_fatlink); milc_fatlink = nullptr; }
586  if (milc_longlink != nullptr) { free(milc_longlink); milc_longlink = nullptr; }
587 
588 #ifdef MULTI_GPU
589  if (cpuFat != nullptr) { delete cpuFat; cpuFat = nullptr; }
590  if (cpuLong != nullptr) { delete cpuLong; cpuLong = nullptr; }
591 #endif
592 
593  end();
594  return ret;
595 }
596 
597 static void end(void)
598 {
599  delete in;
600  delete out;
601  delete ref;
602  delete tmp;
603 
604  endQuda();
605 }
606 
608 {
609  printfQuda("running the following test:\n");
610 
611  printfQuda("prec sloppy_prec link_recon sloppy_link_recon test_type S_dimension T_dimension\n");
612  printfQuda("%s %s %s %s %s %d/%d/%d %d \n", get_prec_str(prec),
615 
616  printfQuda("Grid partition info: X Y Z T\n");
617  printfQuda(" %d %d %d %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2),
618  dimPartitioned(3));
619 
620  return ;
621 
622 }
623 
624  void
625 usage_extra(char** argv )
626 {
627  printfQuda("Extra options:\n");
628  printfQuda(" --test <0/1/2/3/4/5/6> # Test method\n");
629  printfQuda(" 0: Full parity inverter\n");
630  printfQuda(" 1: Even even spinor CG inverter, reconstruct to full parity\n");
631  printfQuda(" 2: Odd odd spinor CG inverter, reconstruct to full parity\n");
632  printfQuda(" 3: Even even spinor CG inverter\n");
633  printfQuda(" 4: Odd odd spinor CG inverter\n");
634  printfQuda(" 5: Even even spinor multishift CG inverter\n");
635  printfQuda(" 6: Odd odd spinor multishift CG inverter\n");
636  printfQuda(" --cpu-prec <double/single/half> # Set CPU precision\n");
637 
638  return ;
639 }
640 int main(int argc, char **argv)
641 {
642 
643  // Set a default
645 
646  for (int i = 1; i < argc; i++) {
647 
648  if (process_command_line_option(argc, argv, &i) == 0) { continue; }
649 
650  if (strcmp(argv[i], "--cpu-prec") == 0) {
651  if (i+1 >= argc){
652  usage(argv);
653  }
654  cpu_prec= get_prec(argv[i+1]);
655  i++;
656  continue;
657  }
658 
659  printf("ERROR: Invalid option:%s\n", argv[i]);
660  usage(argv);
661  }
662 
663  // initialize QMP/MPI, QUDA comms grid and RNG (test_util.cpp)
664  initComms(argc, argv, gridsize_from_cmdline);
665 
666  if (test_type < 0 || test_type > 6) {
667  errorQuda("Test type %d is outside the valid range.\n", test_type);
668  }
669 
670  // Ensure a reasonable default
671  // ensure that the default is improved staggered
673  warningQuda("The dslash_type %d isn't staggered, asqtad, or laplace. Defaulting to asqtad.\n", dslash_type);
675  }
676 
678  if (test_type != 0) {
679  errorQuda("Test type %d is not supported for the Laplace operator.\n", test_type);
680  }
681 
684  matpc_type = QUDA_MATPC_EVEN_EVEN; // doesn't matter
685 
686  } else {
687 
690  warningQuda("The full spinor staggered operator (test 0) can't be inverted with (P)CG. Switching to BiCGstab.\n");
692  }
693 
695  if (test_type == 0) {
697  } else {
699  }
700  }
701 
702  if (test_type == 1 || test_type == 3 || test_type == 5) {
704  } else if (test_type == 2 || test_type == 4 || test_type == 6) {
706  } else if (test_type == 0) {
707  matpc_type = QUDA_MATPC_EVEN_EVEN; // it doesn't matter
708  }
709 
710  if (test_type == 0 || test_type == 1 || test_type == 2) {
712  } else {
714  }
715  }
716 
718  prec_sloppy = prec;
719  }
720 
723  }
726  }
727 
728  if(inv_type != QUDA_CG_INVERTER && (test_type == 5 || test_type == 6)) {
729  errorQuda("Preconditioning is currently not supported in multi-shift solver solvers");
730  }
731 
732 
733  // Set n_naiks to 2 if eps_naik != 0.0
735  if (eps_naik != 0.0) {
736  if (compute_fatlong) {
737  n_naiks = 2;
738  printfQuda("Note: epsilon-naik != 0, testing epsilon correction links.\n");
739  } else {
740  eps_naik = 0.0;
741  printfQuda("Not computing fat-long, ignoring epsilon correction.\n");
742  }
743  } else {
744  printfQuda("Note: epsilon-naik = 0, testing original HISQ links.\n");
745  }
746  }
747 
749 
750  printfQuda("dslash_type = %d\n", dslash_type);
751 
752  argc_copy = argc;
753  argv_copy = argv;
754 
755  int ret = invert_test();
756 
757  // finalize the communications layer
758  finalizeComms();
759 
760  return ret;
761 }
int maxiter_precondition
Definition: quda.h:292
int zdim
Definition: test_util.cpp:1617
double secs
Definition: quda.h:251
void * qdp_longlink[4]
char ** argv_copy
int dimPartitioned(int dim)
Definition: test_util.cpp:1776
void ax(double a, ColorSpinorField &x)
Definition: blas_quda.cu:508
QudaDiracFieldOrder dirac_order
Definition: quda.h:219
void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type)
Definition: test_util.cpp:747
QudaMassNormalization mass_normalization
Definition: quda.h:208
double tol_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:182
int niter
Definition: test_util.cpp:1629
size_t gSize
QudaReconstructType reconstruct_sloppy
Definition: quda.h:53
double anisotropy
Definition: quda.h:38
QudaGhostExchange ghostExchange
Definition: lattice_field.h:76
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
QudaCABasis ca_basis
Definition: test_util.cpp:1631
void endQuda(void)
void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
Definition: test_util.cpp:1047
QudaCABasis ca_basis
Definition: quda.h:298
QudaSolveType solve_type
Definition: quda.h:205
QudaVerbosity verbosity_precondition
Definition: quda.h:286
enum QudaPrecision_s QudaPrecision
int ga_pad
Definition: quda.h:63
void dw_setDims(int *X, const int L5)
Definition: test_util.cpp:187
QudaPrecision prec_sloppy
Definition: test_util.cpp:1609
QudaGaugeFixed gauge_fix
Definition: quda.h:61
QudaSolveType solve_type
Definition: test_util.cpp:1663
QudaInverterType inv_type_precondition
Definition: quda.h:270
QudaLinkType type
Definition: quda.h:42
void ** ghost_fatlink
double kappa
Definition: quda.h:106
void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
void usage_extra(char **argv)
#define errorQuda(...)
Definition: util_quda.h:121
double ca_lambda_min
Definition: test_util.cpp:1632
double tol
Definition: quda.h:121
QudaDslashType dslash_type
Definition: quda.h:102
QudaReconstructType reconstruct_precondition
Definition: quda.h:59
QudaInverterType inv_type
Definition: quda.h:103
QudaPrecision cuda_prec
Definition: quda.h:214
int ydim
Definition: test_util.cpp:1616
QudaReconstructType link_recon_sloppy
Definition: test_util.cpp:1606
enum QudaSolveType_s QudaSolveType
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
int argc_copy
int device
Definition: test_util.cpp:1602
QudaPrecision cpu_prec
Definition: quda.h:213
int process_command_line_option(int argc, char **argv, int *idx)
Definition: test_util.cpp:2019
const ColorSpinorField & Even() const
const ColorSpinorField & Odd() const
QudaStaggeredPhase staggered_phase_type
Definition: quda.h:71
int solution_accumulator_pipeline
Definition: test_util.cpp:1635
QudaDagType dagger
Definition: quda.h:207
QudaMatPCType matpc_type
Definition: test_util.cpp:1662
void finalizeComms()
Definition: test_util.cpp:128
int pipeline
Definition: test_util.cpp:1634
QudaGaugeParam gauge_param
QudaPrecision cuda_prec_refinement_sloppy
Definition: quda.h:216
QudaReconstructType link_recon
Definition: test_util.cpp:1605
int invert_test()
QudaGaugeFieldOrder gauge_order
Definition: quda.h:43
double true_res
Definition: quda.h:126
const char * get_staggered_test_type(int t)
Definition: misc.cpp:827
int test_type
Definition: test_util.cpp:1636
double tadpole_factor
Definition: test_util.cpp:1651
const char * get_prec_str(QudaPrecision prec)
Definition: misc.cpp:701
int tdim
Definition: test_util.cpp:1618
void construct_spinor_source(void *v, int nSpin, int nColor, QudaPrecision precision, const int *const x, quda::RNG &rng)
Definition: test_util.cpp:1342
QudaInverterType inv_type
Definition: test_util.cpp:1640
double ca_lambda_max
Definition: quda.h:304
static int n_naiks
double tol
Definition: test_util.cpp:1656
QudaSiteSubset siteSubset
Definition: lattice_field.h:71
bool compute_fatlong
Definition: test_util.cpp:1655
void * qdp_fatlink[4]
void setDims(int *)
Definition: test_util.cpp:151
QudaFieldLocation input_location
Definition: quda.h:99
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:191
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
Definition: blas_quda.h:37
double reliable_delta
Definition: quda.h:129
int solution_accumulator_pipeline
Definition: quda.h:142
double ca_lambda_min
Definition: quda.h:301
QudaPrecision prec_refinement_sloppy
Definition: test_util.cpp:1610
QudaSolutionType solution_type
Definition: quda.h:204
int use_alternative_reliable
Definition: quda.h:131
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:67
void computeStaggeredPlaquetteQDPOrder(void **qdp_link, double plaq[3], const QudaGaugeParam &gauge_param_in, const QudaDslashType dslash_type)
double scale
Definition: quda.h:40
void initQuda(int device)
QudaPrecision prec
Definition: test_util.cpp:1608
QudaFieldLocation output_location
Definition: quda.h:100
QudaPrecision cpu_prec
QudaInvertParam inv_param
Definition: covdev_test.cpp:37
QudaPrecision cuda_prec_sloppy
Definition: quda.h:215
void reorderQDPtoMILC(Out *milc_out, In **qdp_in, int V, int siteSize)
QudaVerbosity verbosity
Definition: quda.h:244
void setSpinorSiteSize(int n)
Definition: test_util.cpp:211
ColorSpinorParam csParam
Definition: pack_test.cpp:24
double tol_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:179
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:185
void axpy(double a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:35
cpuColorSpinorField * in
QudaInvertParam newQudaInvertParam(void)
double gflops
Definition: quda.h:250
const char * get_recon_str(QudaReconstructType recon)
Definition: misc.cpp:768
cpuColorSpinorField * tmp
QudaPrecision cuda_prec_precondition
Definition: quda.h:58
#define mySpinorSiteSize
Class declaration to initialize and hold CURAND RNG states.
Definition: random_quda.h:23
double tol_hq
Definition: quda.h:123
enum QudaMatPCType_s QudaMatPCType
int xdim
Definition: test_util.cpp:1615
#define warningQuda(...)
Definition: util_quda.h:133
double true_res_hq
Definition: quda.h:127
enum QudaSolutionType_s QudaSolutionType
void matdagmat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, void *tmp, QudaParity parity)
QudaGammaBasis gamma_basis
Definition: quda.h:221
void staggered_dslash(cpuColorSpinorField *out, void **fatlink, void **longlink, void **ghost_fatlink, void **ghost_longlink, cpuColorSpinorField *in, int oddBit, int daggerBit, QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type)
QudaPrecision cuda_prec_sloppy
Definition: quda.h:52
const void ** Ghost() const
Definition: gauge_field.h:323
double tol_precondition
Definition: quda.h:289
double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r)
Definition: reduce_quda.cu:809
double offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:176
int use_sloppy_partial_accumulator
Definition: quda.h:132
int heavy_quark_check
Definition: quda.h:165
enum QudaParity_s QudaParity
double reliable_delta
Definition: test_util.cpp:1658
QudaReconstructType reconstruct
Definition: quda.h:50
QudaPrecision cuda_prec
Definition: quda.h:49
int X[4]
Definition: quda.h:36
char latfile[]
Definition: test_util.cpp:1623
double mass
Definition: quda.h:105
void ** ghost_longlink
int gcrNkrylov
Definition: quda.h:259
double tol_hq
Definition: test_util.cpp:1657
#define NUM_OFFSETS
double mass
Definition: test_util.cpp:1646
QudaPrecision cuda_prec_refinement_sloppy
Definition: quda.h:55
int V
Definition: test_util.cpp:27
double norm_2(void *v, int len, QudaPrecision precision)
double eps_naik
Definition: test_util.cpp:1652
void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, int n_naiks, double eps_naik)
void * memset(void *s, int c, size_t n)
int main(int argc, char **argv)
cpuGaugeField * cpuLong
void construct_fat_long_gauge_field(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *param, QudaDslashType dslash_type)
Definition: test_util.cpp:1062
int laplace3D
Definition: test_util.cpp:1622
double ca_lambda_max
Definition: test_util.cpp:1633
void display_test_info()
QudaResidualType_s
Definition: enum_quda.h:186
QudaDslashType dslash_type
Definition: test_util.cpp:1621
double tadpole_coeff
Definition: quda.h:39
cpuColorSpinorField * out
QudaPrecision cuda_prec_precondition
Definition: quda.h:217
cpuColorSpinorField * ref
bool alternative_reliable
Definition: test_util.cpp:1659
double tol_restart
Definition: quda.h:122
void setGaugeParam(QudaGaugeParam &gauge_param)
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
#define MAX(a, b)
enum QudaCABasis_s QudaCABasis
void * qdp_inlink[4]
double kappa
Definition: test_util.cpp:1647
cpuGaugeField * cpuFat
#define printfQuda(...)
Definition: util_quda.h:115
QudaTboundary t_boundary
Definition: quda.h:45
enum QudaDslashType_s QudaDslashType
void setInvertParam(QudaInvertParam &inv_param)
int X[4]
void mxpy(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:34
QudaResidualType residual_type
Definition: quda.h:320
int gridsize_from_cmdline[]
Definition: test_util.cpp:49
int num_offset
Definition: quda.h:169
int Nsrc
Definition: test_util.cpp:1627
int gcrNkrylov
Definition: test_util.cpp:1630
void initComms(int argc, char **argv, int *const commDims)
Definition: test_util.cpp:88
void read_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, const int *X, int argc, char *argv[])
Definition: qio_field.h:14
QudaParity parity
Definition: covdev_test.cpp:54
QudaSolutionType solution_type
Definition: test_util.cpp:1664
QudaMatPCType matpc_type
Definition: quda.h:206
void usage(char **argv)
Definition: test_util.cpp:1783
enum QudaInverterType_s QudaInverterType
QudaPrecision get_prec(QIO_Reader *infile)
Definition: qio_field.cpp:69
static void end()
QudaPrecision cpu_prec
Definition: quda.h:47
#define gaugeSiteSize
Definition: face_gauge.cpp:34
QudaGaugeParam newQudaGaugeParam(void)
QudaPreserveSource preserve_source
Definition: quda.h:211
int Vh
Definition: test_util.cpp:28