QUDA  v1.1.0
A library for QCD on GPUs
milc_interface.cpp
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <iostream>
4 #include <quda.h>
5 #include <quda_milc_interface.h>
6 #include <quda_internal.h>
7 #include <color_spinor_field.h>
8 #include <string.h>
9 #include <unitarization_links.h>
10 #include <ks_improved_force.h>
11 #include <dslash_quda.h>
12 
13 #include <vector>
14 #include <fstream>
15 
16 #define MAX(a, b) ((a) > (b) ? (a) : (b))
17 
18 // code for NVTX taken from Jiri Kraus' blog post:
19 // http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/
20 
21 #ifdef INTERFACE_NVTX
22 
23 #if QUDA_NVTX_VERSION == 3
24 #include "nvtx3/nvToolsExt.h"
25 #else
26 #include "nvToolsExt.h"
27 #endif
28 
29 static const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff };
30 static const int num_colors = sizeof(colors)/sizeof(uint32_t);
31 
32 #define PUSH_RANGE(name,cid) { \
33  int color_id = cid; \
34  color_id = color_id%num_colors;\
35  nvtxEventAttributes_t eventAttrib = {0}; \
36  eventAttrib.version = NVTX_VERSION; \
37  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
38  eventAttrib.colorType = NVTX_COLOR_ARGB; \
39  eventAttrib.color = colors[color_id]; \
40  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
41  eventAttrib.message.ascii = name; \
42  nvtxRangePushEx(&eventAttrib); \
43 }
44 #define POP_RANGE nvtxRangePop();
45 #else
46 #define PUSH_RANGE(name,cid)
47 #define POP_RANGE
48 #endif
49 
50 
51 static bool initialized = false;
52 static int gridDim[4];
53 static int localDim[4];
54 
55 static bool invalidate_quda_gauge = true;
56 static bool create_quda_gauge = false;
57 
58 static bool have_resident_gauge = false;
59 
60 static bool invalidate_quda_mom = true;
61 
62 static bool invalidate_quda_mg = true;
63 
64 static void *df_preconditioner = nullptr;
65 
66 using namespace quda;
67 using namespace quda::fermion_force;
68 
69 
70 #define QUDAMILC_VERBOSE 1
71 
72 template <bool start> void inline qudamilc_called(const char *func, QudaVerbosity verb)
73 {
74  // add NVTX markup if enabled
75  if (start) {
76  PUSH_RANGE(func, 1);
77  } else {
78  POP_RANGE;
79  }
80 
81  #ifdef QUDAMILC_VERBOSE
82  if (verb >= QUDA_VERBOSE) {
83  if (start) {
84  printfQuda("QUDA_MILC_INTERFACE: %s (called) \n", func);
85  } else {
86  printfQuda("QUDA_MILC_INTERFACE: %s (return) \n", func);
87  }
88  }
89 #endif
90 }
91 
92 template <bool start> void inline qudamilc_called(const char *func) { qudamilc_called<start>(func, getVerbosity()); }
93 
94 void qudaSetMPICommHandle(void *mycomm) { setMPICommHandleQuda(mycomm); }
95 
97 {
98  if (initialized) return;
99  setVerbosityQuda(input.verbosity, "", stdout);
100  qudamilc_called<true>(__func__);
101  qudaSetLayout(input.layout);
102  initialized = true;
103  qudamilc_called<false>(__func__);
104 }
105 
107 {
108  qudamilc_called<true>(__func__);
109  endQuda();
110  qudamilc_called<false>(__func__);
111 }
112 #if defined(MULTI_GPU) && !defined(QMP_COMMS)
117 static int rankFromCoords(const int *coords, void *fdata)
118 {
119  int *dims = static_cast<int *>(fdata);
120 
121  int rank = coords[3];
122  for (int i = 2; i >= 0; i--) {
123  rank = dims[i] * rank + coords[i];
124  }
125  return rank;
126 }
127 #endif
128 
130 {
131  int local_dim[4];
132  for(int dir=0; dir<4; ++dir){ local_dim[dir] = input.latsize[dir]; }
133 #ifdef MULTI_GPU
134  for(int dir=0; dir<4; ++dir){ local_dim[dir] /= input.machsize[dir]; }
135 #endif
136  for(int dir=0; dir<4; ++dir){
137  if(local_dim[dir]%2 != 0){
138  printf("Error: Odd lattice dimensions are not supported\n");
139  exit(1);
140  }
141  }
142 
143  for(int dir=0; dir<4; ++dir) localDim[dir] = local_dim[dir];
144 
145 #ifdef MULTI_GPU
146  for(int dir=0; dir<4; ++dir) gridDim[dir] = input.machsize[dir];
147 #ifdef QMP_COMMS
148  initCommsGridQuda(4, gridDim, nullptr, nullptr);
149 #else
150  initCommsGridQuda(4, gridDim, rankFromCoords, (void *)(gridDim));
151 #endif
152  static int device = -1;
153 #else
154  for(int dir=0; dir<4; ++dir) gridDim[dir] = 1;
155  static int device = input.device;
156 #endif
157 
158  initQuda(device);
159 }
160 
162 
163 void qudaFreePinned(void *ptr) { pool_pinned_free(ptr); }
164 
165 void *qudaAllocateManaged(size_t bytes) { return managed_malloc(bytes); }
166 
167 void qudaFreeManaged(void *ptr) { managed_free(ptr); }
168 
170 {
171  static bool initialized = false;
172 
173  if(initialized) return;
174  qudamilc_called<true>(__func__);
175 
176 #if defined(GPU_HISQ_FORCE) || defined(GPU_UNITARIZE)
177  const bool reunit_allow_svd = (params.reunit_allow_svd) ? true : false;
178  const bool reunit_svd_only = (params.reunit_svd_only) ? true : false;
179  const double unitarize_eps = 1e-14;
180  const double max_error = 1e-10;
181 #endif
182 
183 #ifdef GPU_HISQ_FORCE
185  params.force_filter,
186  max_error,
187  reunit_allow_svd,
188  reunit_svd_only,
189  params.reunit_svd_rel_error,
190  params.reunit_svd_abs_error);
191 #endif
192 
193 #ifdef GPU_UNITARIZE
194  setUnitarizeLinksConstants(unitarize_eps,
195  max_error,
196  reunit_allow_svd,
197  reunit_svd_only,
198  params.reunit_svd_rel_error,
199  params.reunit_svd_abs_error);
200 #endif // UNITARIZE_GPU
201 
202  initialized = true;
203  qudamilc_called<false>(__func__);
204  return;
205 }
206 
207 
208 
209 static QudaGaugeParam newMILCGaugeParam(const int* dim, QudaPrecision prec, QudaLinkType link_type)
210 {
212  for(int dir=0; dir<4; ++dir) gParam.X[dir] = dim[dir];
213  gParam.cuda_prec_sloppy = gParam.cpu_prec = gParam.cuda_prec = prec;
214  gParam.type = link_type;
215 
216  gParam.reconstruct_sloppy = gParam.reconstruct = ((link_type == QUDA_SU3_LINKS) ? QUDA_RECONSTRUCT_12 : QUDA_RECONSTRUCT_NO);
217  gParam.gauge_order = QUDA_MILC_GAUGE_ORDER;
219  gParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
220  gParam.scale = 1.0;
221  gParam.anisotropy = 1.0;
222  gParam.tadpole_coeff = 1.0;
223  gParam.scale = 0;
224  gParam.ga_pad = 0;
225  gParam.site_ga_pad = 0;
226  gParam.mom_ga_pad = 0;
227  gParam.llfat_ga_pad = 0;
228  return gParam;
229 }
230 
231 static void invalidateGaugeQuda() {
232  qudamilc_called<true>(__func__);
233  freeGaugeQuda();
234  invalidate_quda_gauge = true;
235  have_resident_gauge = false;
236  qudamilc_called<false>(__func__);
237 }
238 
239 void qudaLoadKSLink(int prec, QudaFatLinkArgs_t fatlink_args,
240  const double act_path_coeff[6], void* inlink, void* fatlink, void* longlink)
241 {
242  qudamilc_called<true>(__func__);
243 
244  QudaGaugeParam param = newMILCGaugeParam(localDim,
247 
250 
251  computeKSLinkQuda(fatlink, longlink, nullptr, inlink, const_cast<double*>(act_path_coeff), &param);
252 
253  // requires loadGaugeQuda to be called in subequent solver
254  invalidateGaugeQuda();
255 
256  // this flags that we are using QUDA to create the HISQ links
257  create_quda_gauge = true;
258  qudamilc_called<false>(__func__);
259 }
260 
261 
262 
264  const double act_path_coeff[6], void* inlink, void* fatlink, void* ulink)
265 {
266  qudamilc_called<true>(__func__);
267 
268  QudaGaugeParam param = newMILCGaugeParam(localDim,
271 
272  computeKSLinkQuda(fatlink, nullptr, ulink, inlink, const_cast<double*>(act_path_coeff), &param);
273 
274  // requires loadGaugeQuda to be called in subequent solver
275  invalidateGaugeQuda();
276 
277  // this flags that we are using QUDA to create the HISQ links
278  create_quda_gauge = true;
279  qudamilc_called<false>(__func__);
280 }
281 
282 
283 void qudaHisqForce(int prec, int num_terms, int num_naik_terms, double dt, double** coeff, void** quark_field,
284  const double level2_coeff[6], const double fat7_coeff[6],
285  const void* const w_link, const void* const v_link, const void* const u_link,
286  void* const milc_momentum)
287 {
288  qudamilc_called<true>(__func__);
289 
291 
292  if (!invalidate_quda_mom) {
293  gParam.use_resident_mom = true;
294  gParam.make_resident_mom = true;
295  gParam.return_result_mom = false;
296  } else {
297  gParam.use_resident_mom = false;
298  gParam.make_resident_mom = false;
299  gParam.return_result_mom = true;
300  }
301 
302  computeHISQForceQuda(milc_momentum, dt, level2_coeff, fat7_coeff,
303  w_link, v_link, u_link,
304  quark_field, num_terms, num_naik_terms, coeff,
305  &gParam);
306 
307  have_resident_gauge = false;
308  qudamilc_called<false>(__func__);
309  return;
310 }
311 
312 
313 void qudaAsqtadForce(int prec, const double act_path_coeff[6],
314  const void* const one_link_src[4], const void* const naik_src[4],
315  const void* const link, void* const milc_momentum)
316 {
317  errorQuda("This interface has been removed and is no longer supported");
318 }
319 
320 
321 
322 void qudaComputeOprod(int prec, int num_terms, int num_naik_terms, double** coeff, double scale,
323  void** quark_field, void* oprod[3])
324 {
325  errorQuda("This interface has been removed and is no longer supported");
326 }
327 
328 void qudaUpdateUPhasedPipeline(int prec, double eps, QudaMILCSiteArg_t *arg, int phase_in, int want_gaugepipe)
329 {
330  qudamilc_called<true>(__func__);
331  QudaGaugeParam qudaGaugeParam
332  = newMILCGaugeParam(localDim, (prec == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION, QUDA_GENERAL_LINKS);
333  void *gauge = arg->site ? arg->site : arg->link;
334  void *mom = arg->site ? arg->site : arg->mom;
335 
336  qudaGaugeParam.gauge_offset = arg->link_offset;
337  qudaGaugeParam.mom_offset = arg->mom_offset;
338  qudaGaugeParam.site_size = arg->size;
340 
341  qudaGaugeParam.staggered_phase_applied = phase_in;
343  if (phase_in) qudaGaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
344  if (want_gaugepipe) {
345  qudaGaugeParam.make_resident_gauge = true;
346  qudaGaugeParam.return_result_gauge = true;
347  if (!have_resident_gauge) {
348  qudaGaugeParam.use_resident_gauge = false;
349  have_resident_gauge = true;
350  if (getVerbosity() >= QUDA_VERBOSE) { printfQuda("QUDA_MILC_INTERFACE: Using gauge pipeline \n"); }
351  } else {
352  qudaGaugeParam.use_resident_gauge = true;
353  }
354  }
355 
356  if (!invalidate_quda_mom) {
357  qudaGaugeParam.use_resident_mom = true;
358  qudaGaugeParam.make_resident_mom = true;
359  } else {
360  qudaGaugeParam.use_resident_mom = false;
361  qudaGaugeParam.make_resident_mom = false;
362  }
363 
364  updateGaugeFieldQuda(gauge, mom, eps, 0, 0, &qudaGaugeParam);
365  qudamilc_called<false>(__func__);
366  return;
367 }
368 
369 void qudaUpdateUPhased(int prec, double eps, QudaMILCSiteArg_t *arg, int phase_in)
370 {
371  qudaUpdateUPhasedPipeline(prec, eps, arg, phase_in, 0);
372 }
373 
374 void qudaUpdateU(int prec, double eps, QudaMILCSiteArg_t *arg) { qudaUpdateUPhased(prec, eps, arg, 0); }
375 
376 void qudaRephase(int prec, void *gauge, int flag, double i_mu)
377 {
378  qudamilc_called<true>(__func__);
379  QudaGaugeParam qudaGaugeParam
380  = newMILCGaugeParam(localDim, (prec == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION, QUDA_GENERAL_LINKS);
381 
382  qudaGaugeParam.staggered_phase_applied = 1 - flag;
384  qudaGaugeParam.i_mu = i_mu;
385  qudaGaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
386 
387  staggeredPhaseQuda(gauge, &qudaGaugeParam);
388  qudamilc_called<false>(__func__);
389  return;
390 }
391 
392 void qudaUnitarizeSU3Phased(int prec, double tol, QudaMILCSiteArg_t *arg, int phase_in)
393 {
394  qudamilc_called<true>(__func__);
395  QudaGaugeParam qudaGaugeParam
396  = newMILCGaugeParam(localDim, (prec == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION, QUDA_GENERAL_LINKS);
397 
398  void *gauge = arg->site ? arg->site : arg->link;
399  qudaGaugeParam.gauge_offset = arg->link_offset;
400  qudaGaugeParam.site_size = arg->size;
402  qudaGaugeParam.staggered_phase_applied = phase_in;
404  // when we take care of phases in QUDA we need to respect MILC boundary conditions.
405  if (phase_in) qudaGaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
406 
407  if (!have_resident_gauge) {
408  qudaGaugeParam.make_resident_gauge = false;
409  qudaGaugeParam.use_resident_gauge = false;
410  } else {
411  qudaGaugeParam.use_resident_gauge = true;
412  qudaGaugeParam.make_resident_gauge = true;
413  }
414  qudaGaugeParam.return_result_gauge = true;
415  have_resident_gauge = false;
416 
417  projectSU3Quda(gauge, tol, &qudaGaugeParam);
418  invalidateGaugeQuda();
419  qudamilc_called<false>(__func__);
420  return;
421 }
422 
424 
425 // download the momentum from MILC and place into the resident mom field
427 {
428  qudamilc_called<true>(__func__);
429 
431  = newMILCGaugeParam(localDim, (prec == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION, QUDA_GENERAL_LINKS);
432 
433  void *mom = arg->site ? arg->site : arg->mom;
434  param.mom_offset = arg->mom_offset;
435  param.site_size = arg->size;
439 
440  momResidentQuda(mom, &param);
441  invalidate_quda_mom = false;
442 
443  qudamilc_called<false>(__func__);
444 }
445 
446 // upload the momentum to MILC and invalidate the current resident momentum
448 {
449  qudamilc_called<true>(__func__);
450 
452  = newMILCGaugeParam(localDim, (prec == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION, QUDA_GENERAL_LINKS);
453 
454  void *mom = arg->site ? arg->site : arg->mom;
455  param.mom_offset = arg->mom_offset;
456  param.site_size = arg->size;
460 
461  momResidentQuda(mom, &param);
462  invalidate_quda_mom = true;
463 
464  qudamilc_called<false>(__func__);
465 }
466 
468 {
469  qudamilc_called<true>(__func__);
470 
472  = newMILCGaugeParam(localDim, (prec == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION, QUDA_GENERAL_LINKS);
473 
474  void *mom = arg->site ? arg->site : arg->mom;
475  param.mom_offset = arg->mom_offset;
476  param.site_size = arg->size;
479 
480  if (!invalidate_quda_mom) {
481  param.use_resident_mom = true;
482  param.make_resident_mom = true;
483  invalidate_quda_mom = false;
484  } else { // no momentum residency
485  param.use_resident_mom = false;
486  param.make_resident_mom = false;
487  invalidate_quda_mom = true;
488  }
489 
490  double action = momActionQuda(mom, &param);
491 
492  qudamilc_called<false>(__func__);
493 
494  return action;
495 }
496 
497 static inline int opp(int dir){
498  return 7-dir;
499 }
500 
501 
502 static void createGaugeForcePaths(int **paths, int dir, int num_loop_types){
503 
504  int index=0;
505  // Plaquette paths
506  if (num_loop_types >= 1)
507  for(int i=0; i<4; ++i){
508  if(i==dir) continue;
509  paths[index][0] = i; paths[index][1] = opp(dir); paths[index++][2] = opp(i);
510  paths[index][0] = opp(i); paths[index][1] = opp(dir); paths[index++][2] = i;
511  }
512 
513  // Rectangle Paths
514  if (num_loop_types >= 2)
515  for(int i=0; i<4; ++i){
516  if(i==dir) continue;
517  paths[index][0] = paths[index][1] = i; paths[index][2] = opp(dir); paths[index][3] = paths[index][4] = opp(i);
518  index++;
519  paths[index][0] = paths[index][1] = opp(i); paths[index][2] = opp(dir); paths[index][3] = paths[index][4] = i;
520  index++;
521  paths[index][0] = dir; paths[index][1] = i; paths[index][2] = paths[index][3] = opp(dir); paths[index][4] = opp(i);
522  index++;
523  paths[index][0] = dir; paths[index][1] = opp(i); paths[index][2] = paths[index][3] = opp(dir); paths[index][4] = i;
524  index++;
525  paths[index][0] = i; paths[index][1] = paths[index][2] = opp(dir); paths[index][3] = opp(i); paths[index][4] = dir;
526  index++;
527  paths[index][0] = opp(i); paths[index][1] = paths[index][2] = opp(dir); paths[index][3] = i; paths[index][4] = dir;
528  index++;
529  }
530 
531  if (num_loop_types >= 3) {
532  // Staple paths
533  for(int i=0; i<4; ++i){
534  for(int j=0; j<4; ++j){
535  if(i==dir || j==dir || i==j) continue;
536  paths[index][0] = i; paths[index][1] = j; paths[index][2] = opp(dir); paths[index][3] = opp(i), paths[index][4] = opp(j);
537  index++;
538  paths[index][0] = i; paths[index][1] = opp(j); paths[index][2] = opp(dir); paths[index][3] = opp(i), paths[index][4] = j;
539  index++;
540  paths[index][0] = opp(i); paths[index][1] = j; paths[index][2] = opp(dir); paths[index][3] = i, paths[index][4] = opp(j);
541  index++;
542  paths[index][0] = opp(i); paths[index][1] = opp(j); paths[index][2] = opp(dir); paths[index][3] = i, paths[index][4] = j;
543  index++;
544  }
545  }
546  }
547 
548 }
549 
550 void qudaGaugeForcePhased(int precision, int num_loop_types, double milc_loop_coeff[3], double eb3,
551  QudaMILCSiteArg_t *arg, int phase_in)
552 {
553  qudamilc_called<true>(__func__);
554 
555  int numPaths = 0;
556  switch (num_loop_types) {
557  case 1:
558  numPaths = 6;
559  break;
560  case 2:
561  numPaths = 24;
562  break;
563  case 3:
564  numPaths = 48;
565  break;
566  default:
567  errorQuda("Invalid num_loop_types = %d\n", num_loop_types);
568  }
569 
570  QudaGaugeParam qudaGaugeParam = newMILCGaugeParam(localDim,
571  (precision==1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION,
573  void *gauge = arg->site ? arg->site : arg->link;
574  void *mom = arg->site ? arg->site : arg->mom;
575 
576  qudaGaugeParam.gauge_offset = arg->link_offset;
577  qudaGaugeParam.mom_offset = arg->mom_offset;
578  qudaGaugeParam.site_size = arg->size;
580  qudaGaugeParam.staggered_phase_applied = phase_in;
582  if (phase_in) qudaGaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
583  if (phase_in) qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
584 
585  if (!have_resident_gauge) {
586  qudaGaugeParam.make_resident_gauge = true;
587  qudaGaugeParam.use_resident_gauge = false;
588  // have_resident_gauge = true;
589  } else {
590  qudaGaugeParam.make_resident_gauge = true;
591  qudaGaugeParam.use_resident_gauge = true;
592  }
593 
594  double *loop_coeff = static_cast<double*>(safe_malloc(numPaths*sizeof(double)));
595  int *length = static_cast<int*>(safe_malloc(numPaths*sizeof(int)));
596 
597  if (num_loop_types >= 1) for(int i= 0; i< 6; ++i) {
598  loop_coeff[i] = milc_loop_coeff[0];
599  length[i] = 3;
600  }
601  if (num_loop_types >= 2) for(int i= 6; i<24; ++i) {
602  loop_coeff[i] = milc_loop_coeff[1];
603  length[i] = 5;
604  }
605  if (num_loop_types >= 3) for(int i=24; i<48; ++i) {
606  loop_coeff[i] = milc_loop_coeff[2];
607  length[i] = 5;
608  }
609 
610  int** input_path_buf[4];
611  for(int dir=0; dir<4; ++dir){
612  input_path_buf[dir] = static_cast<int**>(safe_malloc(numPaths*sizeof(int*)));
613  for(int i=0; i<numPaths; ++i){
614  input_path_buf[dir][i] = static_cast<int*>(safe_malloc(length[i]*sizeof(int)));
615  }
616  createGaugeForcePaths(input_path_buf[dir], dir, num_loop_types);
617  }
618 
619  if (!invalidate_quda_mom) {
620  qudaGaugeParam.use_resident_mom = true;
621  qudaGaugeParam.make_resident_mom = true;
622  qudaGaugeParam.return_result_mom = false;
623 
624  // this means when we compute the momentum, we acummulate to the
625  // preexisting resident momentum instead of overwriting it
626  qudaGaugeParam.overwrite_mom = false;
627  } else {
628  qudaGaugeParam.use_resident_mom = false;
629  qudaGaugeParam.make_resident_mom = false;
630  qudaGaugeParam.return_result_mom = true;
631 
632  // this means we compute momentum into a fresh field, copy it back
633  // and sum to current momentum in MILC. This saves an initial
634  // CPU->GPU download of the current momentum.
635  qudaGaugeParam.overwrite_mom = false;
636  }
637 
638  int max_length = 6;
639 
640  computeGaugeForceQuda(mom, gauge, input_path_buf, length,
641  loop_coeff, numPaths, max_length, eb3, &qudaGaugeParam);
642 
643  for(int dir=0; dir<4; ++dir){
644  for(int i=0; i<numPaths; ++i) host_free(input_path_buf[dir][i]);
645  host_free(input_path_buf[dir]);
646  }
647 
648  host_free(length);
649  host_free(loop_coeff);
650 
651  qudamilc_called<false>(__func__);
652  return;
653 }
654 
655 void qudaGaugeForce(int precision, int num_loop_types, double milc_loop_coeff[3], double eb3, QudaMILCSiteArg_t *arg)
656 {
657  qudaGaugeForcePhased(precision, num_loop_types, milc_loop_coeff, eb3, arg, 0);
658 }
659 
660 static int getLinkPadding(const int dim[4])
661 {
662  int padding = MAX(dim[1]*dim[2]*dim[3]/2, dim[0]*dim[2]*dim[3]/2);
663  padding = MAX(padding, dim[0]*dim[1]*dim[3]/2);
664  padding = MAX(padding, dim[0]*dim[1]*dim[2]/2);
665  return padding;
666 }
667 
668 // set the params for the single mass solver
669 static void setInvertParams(const int dim[4], QudaPrecision cpu_prec, QudaPrecision cuda_prec,
670  QudaPrecision cuda_prec_sloppy, double mass, double target_residual,
671  double target_residual_hq, int maxiter, double reliable_delta, QudaParity parity,
673 {
674  invertParam->verbosity = verbosity;
675  invertParam->mass = mass;
676  invertParam->tol = target_residual;
677  invertParam->tol_hq = target_residual_hq;
678 
679  invertParam->residual_type = static_cast<QudaResidualType_s>(0);
680  invertParam->residual_type = (target_residual != 0) ?
681  static_cast<QudaResidualType_s>(invertParam->residual_type | QUDA_L2_RELATIVE_RESIDUAL) :
682  invertParam->residual_type;
683  invertParam->residual_type = (target_residual_hq != 0) ?
684  static_cast<QudaResidualType_s>(invertParam->residual_type | QUDA_HEAVY_QUARK_RESIDUAL) :
685  invertParam->residual_type;
686 
687  invertParam->heavy_quark_check = (invertParam->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? 1 : 0);
688  if (invertParam->heavy_quark_check) {
689  invertParam->max_hq_res_increase = 5; // this caps the number of consecutive hq residual increases
690  invertParam->max_hq_res_restart_total = 10; // this caps the number of hq restarts in case of solver stalling
691  }
692 
693  invertParam->use_sloppy_partial_accumulator = 0;
694  invertParam->num_offset = 0;
695 
696  invertParam->inv_type = inverter;
697  invertParam->maxiter = maxiter;
698  invertParam->reliable_delta = reliable_delta;
699 
701  invertParam->cpu_prec = cpu_prec;
702  invertParam->cuda_prec = cuda_prec;
703  invertParam->cuda_prec_sloppy = invertParam->heavy_quark_check ? cuda_prec : cuda_prec_sloppy;
705 
706  invertParam->gcrNkrylov = 10;
707 
708  invertParam->solution_type = QUDA_MATPC_SOLUTION;
709  invertParam->solve_type = QUDA_DIRECT_PC_SOLVE;
711  invertParam->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // not used, but required by the code.
712  invertParam->dirac_order = QUDA_DIRAC_ORDER;
713 
714  invertParam->dslash_type = QUDA_ASQTAD_DSLASH;
715  invertParam->Ls = 1;
716  invertParam->gflops = 0.0;
717 
720 
721  if (parity == QUDA_EVEN_PARITY) { // even parity
722  invertParam->matpc_type = QUDA_MATPC_EVEN_EVEN;
723  } else if (parity == QUDA_ODD_PARITY) {
724  invertParam->matpc_type = QUDA_MATPC_ODD_ODD;
725  } else {
726  errorQuda("Invalid parity\n");
727  }
728 
729  invertParam->dagger = QUDA_DAG_NO;
730  invertParam->sp_pad = 0;
732 
733  // for the preconditioner
735  invertParam->tol_precondition = 1e-1;
736  invertParam->maxiter_precondition = 2;
737  invertParam->verbosity_precondition = QUDA_SILENT;
738 
739  invertParam->compute_action = 0;
740 }
741 
742 
743 // Set params for the multi-mass solver.
744 static void setInvertParams(const int dim[4], QudaPrecision cpu_prec, QudaPrecision cuda_prec,
745  QudaPrecision cuda_prec_sloppy, int num_offset, const double offset[],
746  const double target_residual_offset[], const double target_residual_hq_offset[],
748  QudaInverterType inverter, QudaInvertParam *invertParam)
749 {
750  const double null_mass = -1;
751 
752  setInvertParams(dim, cpu_prec, cuda_prec, cuda_prec_sloppy, null_mass, target_residual_offset[0],
753  target_residual_hq_offset[0], maxiter, reliable_delta, parity, verbosity, inverter, invertParam);
754 
755  invertParam->num_offset = num_offset;
756  for (int i = 0; i < num_offset; ++i) {
757  invertParam->offset[i] = offset[i];
758  invertParam->tol_offset[i] = target_residual_offset[i];
759  invertParam->tol_hq_offset[i] = target_residual_hq_offset[i];
760  }
761 }
762 
763 static void getReconstruct(QudaReconstructType &reconstruct, QudaReconstructType &reconstruct_sloppy)
764 {
765  {
766  char *reconstruct_env = getenv("QUDA_MILC_HISQ_RECONSTRUCT");
767  if (!reconstruct_env || strcmp(reconstruct_env, "18") == 0) {
768  reconstruct = QUDA_RECONSTRUCT_NO;
769  } else if (strcmp(reconstruct_env, "13") == 0) {
770  reconstruct = QUDA_RECONSTRUCT_13;
771  } else if (strcmp(reconstruct_env, "9") == 0) {
772  reconstruct = QUDA_RECONSTRUCT_9;
773  } else {
774  errorQuda("QUDA_MILC_HISQ_RECONSTRUCT=%s not supported", reconstruct_env);
775  }
776  }
777 
778  {
779  char *reconstruct_sloppy_env = getenv("QUDA_MILC_HISQ_RECONSTRUCT_SLOPPY");
780  if (!reconstruct_sloppy_env) { // if env is not set, default to using outer reconstruct type
781  reconstruct_sloppy = reconstruct;
782  } else if (strcmp(reconstruct_sloppy_env, "18") == 0) {
783  reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
784  } else if (strcmp(reconstruct_sloppy_env, "13") == 0) {
785  reconstruct_sloppy = QUDA_RECONSTRUCT_13;
786  } else if (strcmp(reconstruct_sloppy_env, "9") == 0) {
787  reconstruct_sloppy = QUDA_RECONSTRUCT_9;
788  } else {
789  errorQuda("QUDA_MILC_HISQ_RECONSTRUCT_SLOPPY=%s not supported", reconstruct_sloppy_env);
790  }
791  }
792 }
793 
794 static void setGaugeParams(QudaGaugeParam &fat_param, QudaGaugeParam &long_param, const void *const fatlink,
795  const void *const longlink, const int dim[4], QudaPrecision cpu_prec,
796  QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, double tadpole, double naik_epsilon)
797 {
798  for (int dir = 0; dir < 4; ++dir) fat_param.X[dir] = dim[dir];
799 
800  fat_param.cpu_prec = cpu_prec;
801  fat_param.cuda_prec = cuda_prec;
804  fat_param.reconstruct = QUDA_RECONSTRUCT_NO;
807  fat_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
808  fat_param.anisotropy = 1.0;
809  fat_param.t_boundary = QUDA_PERIODIC_T; // anti-periodic boundary conditions are built into the gauge field
811  fat_param.ga_pad = getLinkPadding(dim);
812 
813  if (longlink != nullptr) {
814  // improved staggered parameters
815  fat_param.type = QUDA_ASQTAD_FAT_LINKS;
816 
817  // now set the long link parameters needed
818  long_param = fat_param;
819  long_param.tadpole_coeff = tadpole;
820  long_param.scale = -(1.0 + naik_epsilon) / (24.0 * long_param.tadpole_coeff * long_param.tadpole_coeff);
821  long_param.type = QUDA_THREE_LINKS;
822  long_param.ga_pad = 3*fat_param.ga_pad;
823  getReconstruct(long_param.reconstruct, long_param.reconstruct_sloppy);
824  long_param.reconstruct_precondition = long_param.reconstruct_sloppy;
825  } else {
826  // naive staggered parameters
827  fat_param.type = QUDA_SU3_LINKS;
829  }
830 
831 }
832 
833 static void setColorSpinorParams(const int dim[4], QudaPrecision precision, ColorSpinorParam *param)
834 {
835  param->nColor = 3;
836  param->nSpin = 1;
837  param->nDim = 4;
838 
839  for (int dir = 0; dir < 4; ++dir) param->x[dir] = dim[dir];
840  param->x[0] /= 2;
841 
842  param->setPrecision(precision);
843  param->pad = 0;
844  param->siteSubset = QUDA_PARITY_SITE_SUBSET;
845  param->siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
847  param->gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // meaningless, but required by the code.
848  param->create = QUDA_ZERO_FIELD_CREATE;
849 }
850 
852  QudaExtLibType deflation_ext_lib, char vec_infile[], char vec_outfile[], QudaEigParam *df_param)
853 {
854  df_param->import_vectors = strcmp(vec_infile,"") ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
855 
856  df_param->cuda_prec_ritz = ritz_prec;
857  df_param->location = location_ritz;
858  df_param->mem_type_ritz = mem_type_ritz;
859 
860  df_param->run_verify = QUDA_BOOLEAN_FALSE;
861 
862  df_param->nk = df_param->invert_param->n_ev;
863  df_param->np = df_param->invert_param->n_ev * df_param->invert_param->deflation_grid;
864 
865  // set file i/o parameters
866  strcpy(df_param->vec_infile, vec_infile);
867  strcpy(df_param->vec_outfile, vec_outfile);
869 }
870 
871 static size_t getColorVectorOffset(QudaParity local_parity, bool even_odd_exchange, const int dim[4])
872 {
873  size_t offset;
874  int volume = dim[0]*dim[1]*dim[2]*dim[3];
875 
876  if(local_parity == QUDA_EVEN_PARITY){
877  offset = even_odd_exchange ? volume*6/2 : 0;
878  }else{
879  offset = even_odd_exchange ? 0 : volume*6/2;
880  }
881  return offset;
882 }
883 
884 void qudaMultishiftInvert(int external_precision, int quda_precision, int num_offsets, double *const offset,
885  QudaInvertArgs_t inv_args, const double target_residual[],
886  const double target_fermilab_residual[], const void *const fatlink,
887  const void *const longlink, void *source, void **solutionArray, double *const final_residual,
888  double *const final_fermilab_residual, int *num_iters)
889 {
890  static const QudaVerbosity verbosity = getVerbosity();
891  qudamilc_called<true>(__func__, verbosity);
892 
893  if (target_residual[0] == 0) errorQuda("qudaMultishiftInvert: zeroth target residual cannot be zero\n");
894 
895  QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
896 
897  static bool force_double_queried = false;
898  static bool do_not_force_double = false;
899  if (!force_double_queried) {
900  char *donotusedouble_env = getenv("QUDA_MILC_OVERRIDE_DOUBLE_MULTISHIFT"); // disable forcing outer double precision
901  if (donotusedouble_env && (!(strcmp(donotusedouble_env, "0") == 0))) {
902  do_not_force_double = true;
903  printfQuda("Disabling always using double as fine precision for MILC multishift\n");
904  }
905  force_double_queried = true;
906  }
907 
908  QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
909  bool use_mixed_precision = (((quda_precision == 2) && inv_args.mixed_precision)
910  || ((quda_precision == 1) && (inv_args.mixed_precision == 2))) ?
911  true :
912  false;
913 
914  QudaPrecision device_precision_sloppy;
915  switch(inv_args.mixed_precision) {
916  case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break;
917  case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break;
918  default: device_precision_sloppy = device_precision;
919  }
920 
921  // override fine precision to double, switch to mixed as necessary
922  if (!do_not_force_double && device_precision == QUDA_SINGLE_PRECISION) {
923  // force outer double
924  device_precision = QUDA_DOUBLE_PRECISION;
925  if (device_precision_sloppy == QUDA_SINGLE_PRECISION) use_mixed_precision = true;
926  }
927 
928  QudaGaugeParam fat_param = newQudaGaugeParam();
929  QudaGaugeParam long_param = newQudaGaugeParam();
930  setGaugeParams(fat_param, long_param, fatlink, longlink, localDim, host_precision, device_precision,
931  device_precision_sloppy, inv_args.tadpole, inv_args.naik_epsilon);
932 
933  QudaInvertParam invertParam = newQudaInvertParam();
934 
935  QudaParity local_parity = inv_args.evenodd;
936  const double reliable_delta = (use_mixed_precision ? 1e-1 : 0.0);
937  setInvertParams(localDim, host_precision, device_precision, device_precision_sloppy, num_offsets, offset,
938  target_residual, target_fermilab_residual, inv_args.max_iter, reliable_delta, local_parity, verbosity,
939  QUDA_CG_INVERTER, &invertParam);
940 
941  if (inv_args.mixed_precision == 1) {
944  long_param.reconstruct_refinement_sloppy = long_param.reconstruct_sloppy;
946  invertParam.reliable_delta_refinement = 0.1;
947  }
948 
950  setColorSpinorParams(localDim, host_precision, &csParam);
951 
952  // dirty hack to invalidate the cached gauge field without breaking interface compatability
953  if (*num_iters == -1 || !canReuseResidentGauge(&invertParam)) invalidateGaugeQuda();
954 
955  // set the solver
956  if (invalidate_quda_gauge || !create_quda_gauge) {
957  loadGaugeQuda(const_cast<void *>(fatlink), &fat_param);
958  if (longlink != nullptr) loadGaugeQuda(const_cast<void *>(longlink), &long_param);
959  invalidate_quda_gauge = false;
960  }
961 
962  if (longlink == nullptr) invertParam.dslash_type = QUDA_STAGGERED_DSLASH;
963 
964  void** sln_pointer = (void**)malloc(num_offsets*sizeof(void*));
965  int quark_offset = getColorVectorOffset(local_parity, false, localDim) * host_precision;
966  void* src_pointer = static_cast<char*>(source) + quark_offset;
967 
968  for (int i = 0; i < num_offsets; ++i) sln_pointer[i] = static_cast<char *>(solutionArray[i]) + quark_offset;
969 
970  invertMultiShiftQuda(sln_pointer, src_pointer, &invertParam);
971  free(sln_pointer);
972 
973  // return the number of iterations taken by the inverter
974  *num_iters = invertParam.iter;
975  for (int i = 0; i < num_offsets; ++i) {
976  final_residual[i] = invertParam.true_res_offset[i];
977  final_fermilab_residual[i] = invertParam.true_res_hq_offset[i];
978  } // end loop over number of offsets
979 
980  if (!create_quda_gauge) invalidateGaugeQuda();
981 
982  qudamilc_called<false>(__func__, verbosity);
983 } // qudaMultiShiftInvert
984 
985 void qudaInvert(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args,
986  double target_residual, double target_fermilab_residual, const void *const fatlink,
987  const void *const longlink, void *source, void *solution, double *const final_residual,
988  double *const final_fermilab_residual, int *num_iters)
989 {
990  static const QudaVerbosity verbosity = getVerbosity();
991  qudamilc_called<true>(__func__, verbosity);
992 
993  if (target_fermilab_residual == 0 && target_residual == 0) errorQuda("qudaInvert: requesting zero residual\n");
994 
995  QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
996 
997  static bool force_double_queried = false;
998  static bool do_not_force_double = false;
999  if (!force_double_queried) {
1000  char *donotusedouble_env = getenv("QUDA_MILC_OVERRIDE_DOUBLE_MULTISHIFT"); // disable forcing outer double precision
1001  if (donotusedouble_env && (!(strcmp(donotusedouble_env, "0") == 0))) {
1002  do_not_force_double = true;
1003  printfQuda("Disabling always using double as fine precision for MILC multishift\n");
1004  }
1005  force_double_queried = true;
1006  }
1007 
1008  QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
1009 
1010  QudaPrecision device_precision_sloppy;
1011  switch(inv_args.mixed_precision) {
1012  case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break;
1013  case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break;
1014  default: device_precision_sloppy = device_precision;
1015  }
1016 
1017  // override fine precision to double, switch to mixed as necessary
1018  if (!do_not_force_double && device_precision == QUDA_SINGLE_PRECISION) {
1019  // force outer double
1020  device_precision = QUDA_DOUBLE_PRECISION;
1021  }
1022 
1023  QudaGaugeParam fat_param = newQudaGaugeParam();
1024  QudaGaugeParam long_param = newQudaGaugeParam();
1025  setGaugeParams(fat_param, long_param, fatlink, longlink, localDim, host_precision, device_precision,
1026  device_precision_sloppy, inv_args.tadpole, inv_args.naik_epsilon);
1027 
1028  QudaInvertParam invertParam = newQudaInvertParam();
1029 
1030  QudaParity local_parity = inv_args.evenodd;
1031  const double reliable_delta = 1e-1;
1032 
1033  setInvertParams(localDim, host_precision, device_precision, device_precision_sloppy, mass, target_residual,
1034  target_fermilab_residual, inv_args.max_iter, reliable_delta, local_parity, verbosity,
1035  QUDA_CG_INVERTER, &invertParam);
1036 
1038  setColorSpinorParams(localDim, host_precision, &csParam);
1039 
1040  // dirty hack to invalidate the cached gauge field without breaking interface compatability
1041  if (*num_iters == -1 || !canReuseResidentGauge(&invertParam)) invalidateGaugeQuda();
1042 
1043  if (invalidate_quda_gauge || !create_quda_gauge) {
1044  loadGaugeQuda(const_cast<void *>(fatlink), &fat_param);
1045  if (longlink != nullptr) loadGaugeQuda(const_cast<void *>(longlink), &long_param);
1046  invalidate_quda_gauge = false;
1047  }
1048 
1049  if (longlink == nullptr) invertParam.dslash_type = QUDA_STAGGERED_DSLASH;
1050 
1051  int quark_offset = getColorVectorOffset(local_parity, false, localDim) * host_precision;
1052 
1053  invertQuda(static_cast<char *>(solution) + quark_offset, static_cast<char *>(source) + quark_offset, &invertParam);
1054 
1055  // return the number of iterations taken by the inverter
1056  *num_iters = invertParam.iter;
1057  *final_residual = invertParam.true_res;
1058  *final_fermilab_residual = invertParam.true_res_hq;
1059 
1060  if (!create_quda_gauge) invalidateGaugeQuda();
1061 
1062  qudamilc_called<false>(__func__, verbosity);
1063 } // qudaInvert
1064 
1065 
1066 void qudaDslash(int external_precision, int quda_precision, QudaInvertArgs_t inv_args, const void *const fatlink,
1067  const void *const longlink, void* src, void* dst, int* num_iters)
1068 {
1069  static const QudaVerbosity verbosity = getVerbosity();
1070  qudamilc_called<true>(__func__, verbosity);
1071 
1072  // static const QudaVerbosity verbosity = getVerbosity();
1073  QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
1074  QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
1075  QudaPrecision device_precision_sloppy = device_precision;
1076 
1077  QudaGaugeParam fat_param = newQudaGaugeParam();
1078  QudaGaugeParam long_param = newQudaGaugeParam();
1079  setGaugeParams(fat_param, long_param, fatlink, longlink, localDim, host_precision, device_precision,
1080  device_precision_sloppy, inv_args.tadpole, inv_args.naik_epsilon);
1081 
1082  QudaInvertParam invertParam = newQudaInvertParam();
1083 
1084  QudaParity local_parity = inv_args.evenodd;
1085  QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? QUDA_ODD_PARITY : QUDA_EVEN_PARITY;
1086 
1087  setInvertParams(localDim, host_precision, device_precision, device_precision_sloppy, 0.0, 0, 0, 0, 0.0, local_parity,
1088  verbosity, QUDA_CG_INVERTER, &invertParam);
1089 
1091  setColorSpinorParams(localDim, host_precision, &csParam);
1092 
1093  // dirty hack to invalidate the cached gauge field without breaking interface compatability
1094  if (*num_iters == -1 || !canReuseResidentGauge(&invertParam)) invalidateGaugeQuda();
1095 
1096  if (invalidate_quda_gauge || !create_quda_gauge) {
1097  loadGaugeQuda(const_cast<void *>(fatlink), &fat_param);
1098  if (longlink != nullptr) loadGaugeQuda(const_cast<void *>(longlink), &long_param);
1099  invalidate_quda_gauge = false;
1100  }
1101 
1102  if (longlink == nullptr) invertParam.dslash_type = QUDA_STAGGERED_DSLASH;
1103 
1104  int src_offset = getColorVectorOffset(other_parity, false, localDim);
1105  int dst_offset = getColorVectorOffset(local_parity, false, localDim);
1106 
1107  dslashQuda(static_cast<char*>(dst) + dst_offset*host_precision,
1108  static_cast<char*>(src) + src_offset*host_precision,
1109  &invertParam, local_parity);
1110 
1111  if (!create_quda_gauge) invalidateGaugeQuda();
1112 
1113  qudamilc_called<false>(__func__, verbosity);
1114 } // qudaDslash
1115 
1116 void qudaInvertMsrc(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args,
1117  double target_residual, double target_fermilab_residual, const void *const fatlink,
1118  const void *const longlink, void **sourceArray, void **solutionArray, double *const final_residual,
1119  double *const final_fermilab_residual, int *num_iters, int num_src)
1120 {
1121  static const QudaVerbosity verbosity = getVerbosity();
1122  qudamilc_called<true>(__func__, verbosity);
1123 
1124  if (target_fermilab_residual == 0 && target_residual == 0) errorQuda("qudaInvert: requesting zero residual\n");
1125 
1126  // static const QudaVerbosity verbosity = getVerbosity();
1127  QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
1128  QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
1129  QudaPrecision device_precision_sloppy;
1130 
1131  switch(inv_args.mixed_precision) {
1132  case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break;
1133  case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break;
1134  default: device_precision_sloppy = device_precision;
1135  }
1136 
1137  QudaGaugeParam fat_param = newQudaGaugeParam();
1138  QudaGaugeParam long_param = newQudaGaugeParam();
1139  setGaugeParams(fat_param, long_param, fatlink, longlink, localDim, host_precision, device_precision,
1140  device_precision_sloppy, inv_args.tadpole, inv_args.naik_epsilon);
1141 
1142  QudaInvertParam invertParam = newQudaInvertParam();
1143 
1144  QudaParity local_parity = inv_args.evenodd;
1145  const double reliable_delta = 1e-1;
1146 
1147  setInvertParams(localDim, host_precision, device_precision, device_precision_sloppy, mass, target_residual,
1148  target_fermilab_residual, inv_args.max_iter, reliable_delta, local_parity, verbosity,
1149  QUDA_CG_INVERTER, &invertParam);
1150  invertParam.num_src = num_src;
1151 
1153  setColorSpinorParams(localDim, host_precision, &csParam);
1154 
1155  // dirty hack to invalidate the cached gauge field without breaking interface compatability
1156  if (*num_iters == -1 || !canReuseResidentGauge(&invertParam)) invalidateGaugeQuda();
1157 
1158  if (invalidate_quda_gauge || !create_quda_gauge) {
1159  loadGaugeQuda(const_cast<void *>(fatlink), &fat_param);
1160  if (longlink != nullptr) loadGaugeQuda(const_cast<void *>(longlink), &long_param);
1161  invalidate_quda_gauge = false;
1162  }
1163 
1164  if (longlink == nullptr) invertParam.dslash_type = QUDA_STAGGERED_DSLASH;
1165 
1166  int quark_offset = getColorVectorOffset(local_parity, false, localDim) * host_precision;
1167  void** sln_pointer = (void**)malloc(num_src*sizeof(void*));
1168  void** src_pointer = (void**)malloc(num_src*sizeof(void*));
1169 
1170  for (int i = 0; i < num_src; ++i) sln_pointer[i] = static_cast<char *>(solutionArray[i]) + quark_offset;
1171  for (int i = 0; i < num_src; ++i) src_pointer[i] = static_cast<char *>(sourceArray[i]) + quark_offset;
1172 
1173  invertMultiSrcQuda(sln_pointer, src_pointer, &invertParam, nullptr, nullptr);
1174 
1175  free(sln_pointer);
1176  free(src_pointer);
1177 
1178  // return the number of iterations taken by the inverter
1179  *num_iters = invertParam.iter;
1180  *final_residual = invertParam.true_res;
1181  *final_fermilab_residual = invertParam.true_res_hq;
1182 
1183  if (!create_quda_gauge) invalidateGaugeQuda();
1184 
1185  qudamilc_called<false>(__func__, verbosity);
1186 } // qudaInvert
1187 
1188 void qudaEigCGInvert(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args,
1189  double target_residual, double target_fermilab_residual, const void *const fatlink,
1190  const void *const longlink,
1191  void *source, // array of source vectors -> overwritten on exit
1192  void *solution, // temporary
1193  QudaEigArgs_t eig_args,
1194  const int rhs_idx, // current rhs
1195  const int last_rhs_flag, // is this the last rhs to solve
1196  double *const final_residual, double *const final_fermilab_residual, int *num_iters)
1197 {
1198  static const QudaVerbosity verbosity = getVerbosity();
1199  qudamilc_called<true>(__func__, verbosity);
1200 
1201  if (target_fermilab_residual == 0 && target_residual == 0) errorQuda("qudaInvert: requesting zero residual\n");
1202 
1203  QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
1204  QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
1205  QudaPrecision device_precision_sloppy;
1206 
1207  switch(inv_args.mixed_precision) {
1208  case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break;
1209  case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break;
1210  default: device_precision_sloppy = device_precision;
1211  }
1212 
1213  QudaGaugeParam fat_param = newQudaGaugeParam();
1214  QudaGaugeParam long_param = newQudaGaugeParam();
1215  setGaugeParams(fat_param, long_param, fatlink, longlink, localDim, host_precision, device_precision,
1216  device_precision_sloppy, inv_args.tadpole, inv_args.naik_epsilon);
1217 
1218  QudaInvertParam invertParam = newQudaInvertParam();
1219 
1220  QudaParity local_parity = inv_args.evenodd;
1221  double& target_res = target_residual;
1222  double& target_res_hq = target_fermilab_residual;
1223  const double reliable_delta = 1e-1;
1224 
1225  setInvertParams(localDim, host_precision, device_precision, device_precision_sloppy, mass, target_res, target_res_hq,
1226  inv_args.max_iter, reliable_delta, local_parity, verbosity, QUDA_CG_INVERTER, &invertParam);
1227 
1228  QudaEigParam df_param = newQudaEigParam();
1229  df_param.invert_param = &invertParam;
1230 
1231  invertParam.n_ev = eig_args.nev;
1232  invertParam.max_search_dim = eig_args.max_search_dim;
1233  invertParam.deflation_grid = eig_args.deflation_grid;
1234  invertParam.tol_restart = eig_args.tol_restart;
1235  invertParam.eigcg_max_restarts = eig_args.eigcg_max_restarts;
1236  invertParam.max_restart_num = eig_args.max_restart_num;
1237  invertParam.inc_tol = eig_args.inc_tol;
1238  invertParam.eigenval_tol = eig_args.eigenval_tol;
1239  invertParam.rhs_idx = rhs_idx;
1240 
1241  if ((inv_args.solver_type != QUDA_INC_EIGCG_INVERTER) && (inv_args.solver_type != QUDA_EIGCG_INVERTER))
1242  errorQuda("Incorrect inverter type.\n");
1243  invertParam.inv_type = inv_args.solver_type;
1244 
1246 
1247  setDeflationParam(eig_args.prec_ritz, eig_args.location_ritz, eig_args.mem_type_ritz, eig_args.deflation_ext_lib, eig_args.vec_infile, eig_args.vec_outfile, &df_param);
1248 
1250  setColorSpinorParams(localDim, host_precision, &csParam);
1251 
1252  // dirty hack to invalidate the cached gauge field without breaking interface compatability
1253  if (*num_iters == -1 || !canReuseResidentGauge(&invertParam)) invalidateGaugeQuda();
1254 
1255  if ((invalidate_quda_gauge || !create_quda_gauge) && (rhs_idx == 0)) { // do this for the first RHS
1256  loadGaugeQuda(const_cast<void *>(fatlink), &fat_param);
1257  if (longlink != nullptr) loadGaugeQuda(const_cast<void *>(longlink), &long_param);
1258  invalidate_quda_gauge = false;
1259  }
1260 
1261  if (longlink == nullptr) invertParam.dslash_type = QUDA_STAGGERED_DSLASH;
1262 
1263  int quark_offset = getColorVectorOffset(local_parity, false, localDim) * host_precision;
1264 
1265  if(rhs_idx == 0) df_preconditioner = newDeflationQuda(&df_param);
1266 
1267  invertParam.deflation_op = df_preconditioner;
1268 
1269  invertQuda(static_cast<char *>(solution) + quark_offset, static_cast<char *>(source) + quark_offset, &invertParam);
1270 
1271  if (last_rhs_flag) destroyDeflationQuda(df_preconditioner);
1272 
1273  // return the number of iterations taken by the inverter
1274  *num_iters = invertParam.iter;
1275  *final_residual = invertParam.true_res;
1276  *final_fermilab_residual = invertParam.true_res_hq;
1277 
1278  if (!create_quda_gauge && last_rhs_flag) invalidateGaugeQuda();
1279 
1280  qudamilc_called<false>(__func__, verbosity);
1281 } // qudaEigCGInvert
1282 
1283 // Structure used to handle loading from input file
1285 
1288  QudaPrecision preconditioner_precision; // precision for near-nulls, coarse links
1289 
1290  // Setup
1291  int nvec[QUDA_MAX_MG_LEVEL]; // ignored on first level, if non-zero on last level we deflate
1292  QudaInverterType setup_inv[QUDA_MAX_MG_LEVEL]; // ignored on first and last level
1293  double setup_tol[QUDA_MAX_MG_LEVEL]; // ignored on first and last level
1294  double setup_maxiter[QUDA_MAX_MG_LEVEL]; // ignored on first and last level
1295  char mg_vec_infile[QUDA_MAX_MG_LEVEL][256]; // ignored on first and last level
1296  char mg_vec_outfile[QUDA_MAX_MG_LEVEL][256]; // ignored on first and last level
1297  int geo_block_size[QUDA_MAX_MG_LEVEL][4]; // ignored on first and last level (first is 2 2 2 2)
1298 
1299  // Solve
1300  QudaSolveType coarse_solve_type[QUDA_MAX_MG_LEVEL]; // ignored on first and second level
1302  double coarse_solver_tol[QUDA_MAX_MG_LEVEL]; // ignored on first level
1303  int coarse_solver_maxiter[QUDA_MAX_MG_LEVEL]; // ignored on first level
1305  int nu_pre[QUDA_MAX_MG_LEVEL]; // all but last level
1306  int nu_post[QUDA_MAX_MG_LEVEL]; // all but last level
1307 
1308  // Misc
1310 
1311  // Coarsest level deflation
1315  double deflate_tol;
1317  double deflate_a_min; // ignored if no polynomial acceleration
1318  int deflate_poly_deg; // ignored if no polynomial acceleration
1319 
1321  {
1322  // set dummy values so all elements are initialized
1323  // some of these values get immediately overriden in the
1324  // constructor, in some cases with identical values:
1325  // this is to separate "initializing" with "best practices"
1326  for (int i = 0; i < QUDA_MAX_MG_LEVEL; i++) {
1327  nvec[i] = 24;
1329  setup_tol[i] = 1e-5;
1330  setup_maxiter[i] = 500;
1331  mg_vec_infile[i][0] = 0;
1332  mg_vec_outfile[i][0] = 0;
1333  for (int d = 0; d < 4; d++) { geo_block_size[i][d] = 2; }
1334 
1337  coarse_solver_tol[i] = 0.25;
1338  coarse_solver_maxiter[i] = 16;
1340  nu_pre[i] = 0;
1341  nu_post[i] = 2;
1342 
1344  }
1345  }
1346 
1347  // set defaults
1349  mg_levels(4),
1350  verify_results(true),
1351  preconditioner_precision(QUDA_HALF_PRECISION),
1352  deflate_n_ev(66),
1353  deflate_n_kr(128),
1354  deflate_max_restarts(50),
1355  deflate_tol(1e-5),
1356  deflate_use_poly_acc(false),
1357  deflate_a_min(1e-2),
1358  deflate_poly_deg(50)
1359  {
1360  /* initialize internal arrays */
1361  setArrayDefaults();
1362 
1363  /* required or best-practice values for typical solves */
1364  nvec[0] = 24; // must be this
1365  geo_block_size[0][0] = 2; // must be this...
1366  geo_block_size[0][1] = 2; // "
1367  geo_block_size[0][2] = 2; // "
1368  geo_block_size[0][3] = 2; // "
1369 
1370  nvec[1] = 64;
1371  geo_block_size[1][0] = 2;
1372  geo_block_size[1][1] = 2;
1373  geo_block_size[1][2] = 2;
1374  geo_block_size[1][3] = 2;
1375 
1376  nvec[2] = 96;
1377  geo_block_size[2][0] = 2;
1378  geo_block_size[2][1] = 2;
1379  geo_block_size[2][2] = 2;
1380  geo_block_size[2][3] = 2;
1381 
1382  /* Setup */
1383 
1384  /* level 0 -> 1 is K-D, no customization */
1385 
1386  /* Level 1 (pseudo-fine) to 2 (intermediate) */
1388  setup_tol[1] = 1e-5;
1389  setup_maxiter[1] = 500;
1390  mg_vec_infile[1][0] = 0;
1391  mg_vec_outfile[1][0] = 0;
1392 
1393  /* Level 2 (intermediate) to 3 (coarsest) */
1395  setup_tol[2] = 1e-5;
1396  setup_maxiter[2] = 500;
1397  mg_vec_infile[2][0] = 0;
1398  mg_vec_outfile[2][0] = 0;
1399 
1400  /* Solve info */
1401 
1402  /* Level 0 only needs a smoother */
1404  nu_pre[0] = 0;
1405  nu_post[0] = 4;
1406 
1407  /* Level 1 */
1409  coarse_solver_tol[1] = 5e-2;
1410  coarse_solver_maxiter[1] = 4;
1412  nu_pre[1] = 0;
1413  nu_post[1] = 2;
1414 
1415  /* Level 2 */
1418  coarse_solver_tol[2] = 0.25;
1419  coarse_solver_maxiter[2] = 4;
1421  nu_pre[2] = 0;
1422  nu_post[2] = 2;
1423 
1424  /* Level 3 */
1426  coarse_solver[3] = QUDA_CA_GCR_INVERTER; // use CGNR for non-deflated... sometimes
1427  coarse_solver_tol[3] = 0.25;
1428  coarse_solver_maxiter[3] = 16; // use larger for non-deflated
1429 
1430  /* Misc */
1435 
1436  /* Deflation */
1437  nvec[3] = 0; // 64; // do not deflate
1438  mg_vec_infile[3][0] = 0;
1439  mg_vec_outfile[3][0] = 0;
1440  deflate_n_ev = 66;
1441  deflate_n_kr = 128;
1442  deflate_tol = 1e-3;
1443  deflate_max_restarts = 50;
1444  deflate_use_poly_acc = false;
1445  deflate_a_min = 1e-2;
1446  deflate_poly_deg = 20;
1447  }
1448 
1450  {
1451  if (strcmp(name, "gcr") == 0) {
1452  return QUDA_GCR_INVERTER;
1453  } else if (strcmp(name, "cgnr") == 0) {
1454  return QUDA_CGNR_INVERTER;
1455  } else if (strcmp(name, "cgne") == 0) {
1456  return QUDA_CGNE_INVERTER;
1457  } else if (strcmp(name, "bicgstab") == 0) {
1458  return QUDA_BICGSTAB_INVERTER;
1459  } else if (strcmp(name, "ca-gcr") == 0) {
1460  return QUDA_CA_GCR_INVERTER;
1461  } else {
1462  return QUDA_INVALID_INVERTER;
1463  }
1464  }
1465 
1467  {
1468  if (strcmp(name, "single") == 0) {
1469  return QUDA_SINGLE_PRECISION;
1470  } else if (strcmp(name, "half") == 0) {
1471  return QUDA_HALF_PRECISION;
1472  } else {
1473  return QUDA_INVALID_PRECISION;
1474  }
1475  }
1476 
1478  {
1479  if (strcmp(name, "direct") == 0) {
1480  return QUDA_DIRECT_SOLVE;
1481  } else if (strcmp(name, "direct-pc") == 0) {
1482  return QUDA_DIRECT_PC_SOLVE;
1483  } else {
1484  return QUDA_INVALID_SOLVE;
1485  }
1486  }
1487 
1489  {
1490  if (strcmp(name, "silent") == 0) {
1491  return QUDA_SILENT;
1492  } else if (strcmp(name, "summarize") == 0 || strcmp(name, "false") == 0) {
1493  // false == summary is for backwards compatibility
1494  return QUDA_SUMMARIZE;
1495  } else if (strcmp(name, "verbose") == 0 || strcmp(name, "true") == 0) {
1496  // true == verbose is for backwards compatibility
1497  return QUDA_VERBOSE;
1498  } else if (strcmp(name, "debug") == 0) {
1499  return QUDA_DEBUG_VERBOSE;
1500  } else {
1501  return QUDA_INVALID_VERBOSITY;
1502  }
1503  }
1504 
1505  // parse out a line
1506  bool update(std::vector<std::string> &input_line)
1507  {
1508 
1509  int error_code = 0; // no error
1510  // 1 = wrong number of arguments
1511 
1512  if (strcmp(input_line[0].c_str(), "mg_levels") == 0) {
1513  if (input_line.size() < 2) {
1514  error_code = 1;
1515  } else {
1516  mg_levels = atoi(input_line[1].c_str());
1517  }
1518 
1519  } else if (strcmp(input_line[0].c_str(), "verify_results") == 0) {
1520  if (input_line.size() < 2) {
1521  error_code = 1;
1522  } else {
1523  verify_results = input_line[1][0] == 't' ? true : false;
1524  }
1525 
1526  } else if (strcmp(input_line[0].c_str(), "preconditioner_precision") == 0) {
1527  if (input_line.size() < 2) {
1528  error_code = 1;
1529  } else {
1530  preconditioner_precision = getQudaPrecision(input_line[1].c_str());
1531  }
1532 
1533  } else if (strcmp(input_line[0].c_str(), "mg_verbosity") == 0) {
1534  if (input_line.size() < 3) {
1535  error_code = 1;
1536  } else {
1537  mg_verbosity[atoi(input_line[1].c_str())] = getQudaVerbosity(input_line[2].c_str());
1538  }
1539 
1540  } else /* Begin Setup */
1541  if (strcmp(input_line[0].c_str(), "nvec") == 0) {
1542  if (input_line.size() < 3) {
1543  error_code = 1;
1544  } else {
1545  nvec[atoi(input_line[1].c_str())] = atoi(input_line[2].c_str());
1546  }
1547 
1548  } else if (strcmp(input_line[0].c_str(), "geo_block_size") == 0) {
1549  if (input_line.size() < 6) {
1550  error_code = 1;
1551  } else {
1552  for (int d = 0; d < 4; d++) geo_block_size[atoi(input_line[1].c_str())][d] = atoi(input_line[2 + d].c_str());
1553  }
1554 
1555  } else if (strcmp(input_line[0].c_str(), "setup_inv") == 0) {
1556  if (input_line.size() < 3) {
1557  error_code = 1;
1558  } else {
1559  setup_inv[atoi(input_line[1].c_str())] = getQudaInverterType(input_line[2].c_str());
1560  }
1561 
1562  } else if (strcmp(input_line[0].c_str(), "setup_tol") == 0) {
1563  if (input_line.size() < 3) {
1564  error_code = 1;
1565  } else {
1566  setup_tol[atoi(input_line[1].c_str())] = atof(input_line[2].c_str());
1567  }
1568 
1569  } else if (strcmp(input_line[0].c_str(), "setup_maxiter") == 0) {
1570  if (input_line.size() < 3) {
1571  error_code = 1;
1572  } else {
1573  setup_maxiter[atoi(input_line[1].c_str())] = atoi(input_line[2].c_str());
1574  }
1575 
1576  } else if (strcmp(input_line[0].c_str(), "mg_vec_infile") == 0) {
1577  if (input_line.size() < 3) {
1578  error_code = 1;
1579  } else {
1580  strcpy(mg_vec_infile[atoi(input_line[1].c_str())], input_line[2].c_str());
1581  }
1582 
1583  } else if (strcmp(input_line[0].c_str(), "mg_vec_outfile") == 0) {
1584  if (input_line.size() < 3) {
1585  error_code = 1;
1586  } else {
1587  strcpy(mg_vec_outfile[atoi(input_line[1].c_str())], input_line[2].c_str());
1588  }
1589 
1590  } else /* Begin Solvers */
1591  if (strcmp(input_line[0].c_str(), "coarse_solve_type") == 0) {
1592  if (input_line.size() < 3) {
1593  error_code = 1;
1594  } else {
1595  coarse_solve_type[atoi(input_line[1].c_str())] = getQudaSolveType(input_line[2].c_str());
1596  }
1597 
1598  } else if (strcmp(input_line[0].c_str(), "coarse_solver") == 0) {
1599  if (input_line.size() < 3) {
1600  error_code = 1;
1601  } else {
1602  coarse_solver[atoi(input_line[1].c_str())] = getQudaInverterType(input_line[2].c_str());
1603  }
1604 
1605  } else if (strcmp(input_line[0].c_str(), "coarse_solver_tol") == 0) {
1606  if (input_line.size() < 3) {
1607  error_code = 1;
1608  } else {
1609  coarse_solver_tol[atoi(input_line[1].c_str())] = atof(input_line[2].c_str());
1610  }
1611 
1612  } else if (strcmp(input_line[0].c_str(), "coarse_solver_maxiter") == 0) {
1613  if (input_line.size() < 3) {
1614  error_code = 1;
1615  } else {
1616  coarse_solver_maxiter[atoi(input_line[1].c_str())] = atoi(input_line[2].c_str());
1617  }
1618 
1619  } else if (strcmp(input_line[0].c_str(), "smoother_type") == 0) {
1620  if (input_line.size() < 3) {
1621  error_code = 1;
1622  } else {
1623  smoother_type[atoi(input_line[1].c_str())] = getQudaInverterType(input_line[2].c_str());
1624  }
1625 
1626  } else if (strcmp(input_line[0].c_str(), "nu_pre") == 0) {
1627  if (input_line.size() < 3) {
1628  error_code = 1;
1629  } else {
1630  nu_pre[atoi(input_line[1].c_str())] = atoi(input_line[2].c_str());
1631  }
1632 
1633  } else if (strcmp(input_line[0].c_str(), "nu_post") == 0) {
1634  if (input_line.size() < 3) {
1635  error_code = 1;
1636  } else {
1637  nu_post[atoi(input_line[1].c_str())] = atoi(input_line[2].c_str());
1638  }
1639 
1640  } else /* Begin Deflation */
1641  if (strcmp(input_line[0].c_str(), "deflate_n_ev") == 0) {
1642  if (input_line.size() < 2) {
1643  error_code = 1;
1644  } else {
1645  deflate_n_ev = atoi(input_line[1].c_str());
1646  }
1647 
1648  } else if (strcmp(input_line[0].c_str(), "deflate_n_kr") == 0) {
1649  if (input_line.size() < 2) {
1650  error_code = 1;
1651  } else {
1652  deflate_n_kr = atoi(input_line[1].c_str());
1653  }
1654 
1655  } else if (strcmp(input_line[0].c_str(), "deflate_max_restarts") == 0) {
1656  if (input_line.size() < 2) {
1657  error_code = 1;
1658  } else {
1659  deflate_max_restarts = atoi(input_line[1].c_str());
1660  }
1661 
1662  } else if (strcmp(input_line[0].c_str(), "deflate_tol") == 0) {
1663  if (input_line.size() < 2) {
1664  error_code = 1;
1665  } else {
1666  deflate_tol = atof(input_line[1].c_str());
1667  }
1668 
1669  } else if (strcmp(input_line[0].c_str(), "deflate_use_poly_acc") == 0) {
1670  if (input_line.size() < 2) {
1671  error_code = 1;
1672  } else {
1673  deflate_use_poly_acc = input_line[1][0] == 't' ? true : false;
1674  }
1675 
1676  } else if (strcmp(input_line[0].c_str(), "deflate_a_min") == 0) {
1677  if (input_line.size() < 2) {
1678  error_code = 1;
1679  } else {
1680  deflate_a_min = atof(input_line[1].c_str());
1681  }
1682 
1683  } else if (strcmp(input_line[0].c_str(), "deflate_poly_deg") == 0) {
1684  if (input_line.size() < 2) {
1685  error_code = 1;
1686  } else {
1687  deflate_poly_deg = atoi(input_line[1].c_str());
1688  }
1689 
1690  } else {
1691  printf("Invalid option %s\n", input_line[0].c_str());
1692  return false;
1693  }
1694 
1695  if (error_code == 1) {
1696  printf("Input option %s has an invalid number of arguments\n", input_line[0].c_str());
1697  return false;
1698  }
1699 
1700  return true;
1701  }
1702 };
1703 
1704 // Internal structure that maintains `QudaMultigridParam`,
1705 // `QudaInvertParam`, `QudaEigParam`s, and the traditional
1706 // void* returned by `newMultigridQuda`.
1707 // last_mass tracks rebuilds based on changing the mass.
1713  double last_mass;
1715 };
1716 
1717 // Parameters defining the eigensolver
1718 void milcSetMultigridEigParam(QudaEigParam &mg_eig_param, mgInputStruct &input_struct, int level)
1719 {
1720  mg_eig_param.eig_type = QUDA_EIG_TR_LANCZOS; // mg_eig_type[level];
1721  mg_eig_param.spectrum = QUDA_SPECTRUM_SR_EIG; // mg_eig_spectrum[level];
1722  if ((mg_eig_param.eig_type == QUDA_EIG_TR_LANCZOS || mg_eig_param.eig_type)
1723  && !(mg_eig_param.spectrum == QUDA_SPECTRUM_LR_EIG || mg_eig_param.spectrum == QUDA_SPECTRUM_SR_EIG)) {
1724  errorQuda("Only real spectrum type (LR or SR) can be passed to the a Lanczos type solver");
1725  }
1726 
1727  mg_eig_param.n_ev = input_struct.deflate_n_ev; // mg_eig_n_ev[level];
1728  mg_eig_param.n_kr = input_struct.deflate_n_kr; // mg_eig_n_kr[level];
1729  mg_eig_param.n_conv = input_struct.nvec[level];
1730  mg_eig_param.n_ev_deflate = -1; // deflate everything that converged
1731  mg_eig_param.batched_rotate = 0; // mg_eig_batched_rotate[level];
1732  mg_eig_param.require_convergence
1733  = QUDA_BOOLEAN_TRUE; // mg_eig_require_convergence[level] ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
1734 
1735  mg_eig_param.tol = input_struct.deflate_tol; // mg_eig_tol[level];
1736  mg_eig_param.check_interval = 10; // mg_eig_check_interval[level];
1737  mg_eig_param.max_restarts = input_struct.deflate_max_restarts; // mg_eig_max_restarts[level];
1738 
1739  mg_eig_param.compute_svd = QUDA_BOOLEAN_FALSE;
1740  mg_eig_param.use_norm_op = QUDA_BOOLEAN_TRUE; // mg_eig_use_normop[level] ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
1741  mg_eig_param.use_dagger = QUDA_BOOLEAN_FALSE; // mg_eig_use_dagger[level] ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
1742 
1744  mg_eig_param.poly_deg = input_struct.deflate_poly_deg; // mg_eig_poly_deg[level];
1745  mg_eig_param.a_min = input_struct.deflate_a_min; // mg_eig_amin[level];
1746  mg_eig_param.a_max = 0.0; // compute estimate // mg_eig_amax[level];
1747 
1748  // set file i/o parameters
1749  // Give empty strings, Multigrid will handle IO.
1750  strcpy(mg_eig_param.vec_infile, "");
1751  strcpy(mg_eig_param.vec_outfile, "");
1752  mg_eig_param.io_parity_inflate = QUDA_BOOLEAN_FALSE; // do not inflate coarse vectors
1753  mg_eig_param.save_prec = QUDA_SINGLE_PRECISION; // cannot save in fixed point
1754 
1755  strcpy(mg_eig_param.QUDA_logfile, "" /*eig_QUDA_logfile*/);
1756 }
1757 
1758 void milcSetMultigridParam(milcMultigridPack *mg_pack, QudaPrecision host_precision, QudaPrecision device_precision,
1759  QudaPrecision device_precision_sloppy, double mass, const char *const mg_param_file)
1760 {
1761  static const QudaVerbosity verbosity = getVerbosity();
1762  QudaMultigridParam &mg_param = mg_pack->mg_param;
1763 
1764  // Create an input struct
1765  mgInputStruct input_struct;
1766 
1767  // Load input struct on rank 0
1768  if (comm_rank() == 0) {
1769  std::ifstream input_file(mg_param_file, std::ios_base::in);
1770 
1771  if (!input_file.is_open()) { errorQuda("MILC interface MG input file %s does not exist!", mg_param_file); }
1772 
1773  // enter parameter loop
1774  char buffer[1024];
1775  std::vector<std::string> elements;
1776  while (!input_file.eof()) {
1777 
1778  elements.clear();
1779 
1780  // get line
1781  input_file.getline(buffer, 1024);
1782 
1783  // split on spaces, tabs
1784  char *pch = strtok(buffer, " \t");
1785  while (pch != nullptr) {
1786  elements.emplace_back(std::string(pch));
1787  pch = strtok(nullptr, " \t");
1788  }
1789 
1790  // skip empty lines, comments
1791  if (elements.size() == 0 || elements[0][0] == '#') continue;
1792 
1793  // debug: print back out
1794  if (verbosity == QUDA_VERBOSE) {
1795  for (auto elem : elements) { printf("%s ", elem.c_str()); }
1796  printf("\n");
1797  }
1798 
1799  input_struct.update(elements);
1800  }
1801  }
1802 
1803  comm_barrier();
1804  comm_broadcast((void *)&input_struct, sizeof(mgInputStruct));
1805 
1806  auto mg_levels = input_struct.mg_levels;
1807 
1808  // Prepare eigenvector params
1809  for (int i = 0; i < mg_levels; i++) {
1810  mg_pack->mg_eig_param[i] = newQudaEigParam();
1811  milcSetMultigridEigParam(mg_pack->mg_eig_param[i], input_struct, i);
1812  }
1813 
1814  mg_pack->mg_inv_param = newQudaInvertParam();
1815  mg_pack->mg_param = newQudaMultigridParam();
1816  mg_pack->last_mass = mass;
1817 
1818  mg_pack->mg_param.invert_param = &mg_pack->mg_inv_param;
1819  for (int i = 0; i < mg_levels; i++) { mg_pack->mg_param.eig_param[i] = &mg_pack->mg_eig_param[i]; }
1820 
1821  QudaInvertParam &inv_param = *mg_param.invert_param; // this will be used to setup SolverParam parent in MGParam class
1822 
1823  inv_param.Ls = 1;
1824 
1825  inv_param.sp_pad = 0;
1826  inv_param.cl_pad = 0;
1827 
1828  inv_param.cpu_prec = host_precision;
1829  inv_param.cuda_prec = device_precision;
1830  inv_param.cuda_prec_sloppy = device_precision_sloppy;
1835 
1838 
1839  inv_param.dslash_type = QUDA_ASQTAD_DSLASH; // dslash_type;
1840 
1841  inv_param.mass = mass;
1842  inv_param.kappa = 1.0 / (2.0 * (4.0 + inv_param.mass));
1843 
1846 
1847  // this gets ignored
1848  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; // matpc_type;
1849 
1850  // req'd for staggered/hisq
1852 
1855 
1856  mg_param.invert_param = &inv_param;
1857  mg_param.n_level = mg_levels; // set from file
1858 
1859  mg_param.use_mma = QUDA_BOOLEAN_FALSE; // TODO: set to false, for now.
1860 
1861  for (int i = 0; i < mg_param.n_level; i++) {
1862 
1863  if (i == 0) {
1864  for (int j = 0; j < 4; j++) {
1865  mg_param.geo_block_size[i][j] = 2; // Kahler-Dirac blocking
1866  }
1867  } else {
1868  for (int j = 0; j < 4; j++) { mg_param.geo_block_size[i][j] = input_struct.geo_block_size[i][j]; }
1869  }
1870 
1871  // mg_param.use_eig_solver[i] = QUDA_BOOLEAN_FALSE; //mg_eig[i] ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
1872  if (i == mg_param.n_level - 1 && input_struct.nvec[i] > 0) {
1873  mg_param.use_eig_solver[i] = QUDA_BOOLEAN_TRUE;
1874  } else {
1875  mg_param.use_eig_solver[i] = QUDA_BOOLEAN_FALSE;
1876  }
1877 
1878  mg_param.verbosity[i] = input_struct.mg_verbosity[i];
1879  mg_param.setup_inv_type[i] = input_struct.setup_inv[i];
1880  mg_param.num_setup_iter[i] = 1; // num_setup_iter[i];
1881  mg_param.setup_tol[i] = input_struct.setup_tol[i];
1882  mg_param.setup_maxiter[i] = input_struct.setup_maxiter[i];
1883 
1884  // Basis to use for CA-CGN(E/R) setup
1885  mg_param.setup_ca_basis[i] = QUDA_POWER_BASIS; // setup_ca_basis[i];
1886 
1887  // Basis size for CACG setup
1888  mg_param.setup_ca_basis_size[i] = 4; // setup_ca_basis_size[i];
1889 
1890  // Minimum and maximum eigenvalue for Chebyshev CA basis setup
1891  mg_param.setup_ca_lambda_min[i] = 0.0; // setup_ca_lambda_min[i];
1892  mg_param.setup_ca_lambda_max[i] = -1.0; // use power iterations // setup_ca_lambda_max[i];
1893 
1894  mg_param.spin_block_size[i] = 1;
1895  // change this to refresh fields when mass or links change
1896  mg_param.setup_maxiter_refresh[i] = 0; // setup_maxiter_refresh[i];
1897  mg_param.n_vec[i] = (i == 0) ? 24 : input_struct.nvec[i];
1898  mg_param.n_block_ortho[i] = 2; // n_block_ortho[i]; // number of times to Gram-Schmidt
1899  mg_param.precision_null[i] = input_struct.preconditioner_precision; // precision to store the null-space basis
1900  mg_param.smoother_halo_precision[i]
1901  = input_struct.preconditioner_precision; // precision of the halo exchange in the smoother
1902  mg_param.nu_pre[i] = input_struct.nu_pre[i];
1903  mg_param.nu_post[i] = input_struct.nu_post[i];
1904  mg_param.mu_factor[i] = 1.; // mu_factor[i];
1905 
1906  mg_param.cycle_type[i] = QUDA_MG_CYCLE_RECURSIVE;
1907 
1908  // top level: coarse vs optimized KD, otherwise standard
1909  // aggregation. FIXME optimized
1911 
1912  // set the coarse solver wrappers including bottom solver
1913  mg_param.coarse_solver[i] = input_struct.coarse_solver[i];
1914  mg_param.coarse_solver_tol[i] = input_struct.coarse_solver_tol[i];
1915  mg_param.coarse_solver_maxiter[i] = input_struct.coarse_solver_maxiter[i];
1916 
1917  // Basis to use for CA-CGN(E/R) coarse solver
1918  mg_param.coarse_solver_ca_basis[i] = QUDA_POWER_BASIS; // coarse_solver_ca_basis[i];
1919 
1920  // Basis size for CACG coarse solver/
1921  mg_param.coarse_solver_ca_basis_size[i] = 16; // coarse_solver_ca_basis_size[i];
1922 
1923  // Minimum and maximum eigenvalue for Chebyshev CA basis
1924  mg_param.coarse_solver_ca_lambda_min[i] = 0.0; // coarse_solver_ca_lambda_min[i];
1925  mg_param.coarse_solver_ca_lambda_max[i] = -1.0; // use power iterations // coarse_solver_ca_lambda_max[i];
1926 
1927  mg_param.smoother[i] = input_struct.smoother_type[i];
1928 
1929  // set the smoother / bottom solver tolerance (for MR smoothing this will be ignored)
1930  mg_param.smoother_tol[i] = 1e-10; // smoother_tol[i];
1931 
1932  // set to QUDA_DIRECT_SOLVE for no even/odd preconditioning on the smoother
1933  // set to QUDA_DIRECT_PC_SOLVE for to enable even/odd preconditioning on the smoother
1934  // from test routines: // smoother_solve_type[i];
1935  switch (i) {
1936  case 0: mg_param.smoother_solve_type[0] = QUDA_DIRECT_SOLVE; break;
1937  case 1: mg_param.smoother_solve_type[1] = QUDA_DIRECT_PC_SOLVE; break;
1938  default: mg_param.smoother_solve_type[i] = input_struct.coarse_solve_type[i]; break;
1939  }
1940 
1941  // set to QUDA_ADDITIVE_SCHWARZ for Additive Schwarz precondioned smoother (presently only impelemented for MR)
1942  mg_param.smoother_schwarz_type[i] = QUDA_INVALID_SCHWARZ; // schwarz_type[i];
1943 
1944  // if using Schwarz preconditioning then use local reductions only
1945  mg_param.global_reduction[i]
1946  = QUDA_BOOLEAN_TRUE; // (schwarz_type[i] == QUDA_INVALID_SCHWARZ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
1947 
1948  // set number of Schwarz cycles to apply
1949  mg_param.smoother_schwarz_cycle[i] = 1; // schwarz_cycle[i];
1950 
1951  // Set set coarse_grid_solution_type: this defines which linear
1952  // system we are solving on a given level
1953  // * QUDA_MAT_SOLUTION - we are solving the full system and inject
1954  // a full field into coarse grid
1955  // * QUDA_MATPC_SOLUTION - we are solving the e/o-preconditioned
1956  // system, and only inject single parity field into coarse grid
1957  //
1958  // Multiple possible scenarios here
1959  //
1960  // 1. **Direct outer solver and direct smoother**: here we use
1961  // full-field residual coarsening, and everything involves the
1962  // full system so coarse_grid_solution_type = QUDA_MAT_SOLUTION
1963  //
1964  // 2. **Direct outer solver and preconditioned smoother**: here,
1965  // only the smoothing uses e/o preconditioning, so
1966  // coarse_grid_solution_type = QUDA_MAT_SOLUTION_TYPE.
1967  // We reconstruct the full residual prior to coarsening after the
1968  // pre-smoother, and then need to project the solution for post
1969  // smoothing.
1970  //
1971  // 3. **Preconditioned outer solver and preconditioned smoother**:
1972  // here we use single-parity residual coarsening throughout, so
1973  // coarse_grid_solution_type = QUDA_MATPC_SOLUTION. This is a bit
1974  // questionable from a theoretical point of view, since we don't
1975  // coarsen the preconditioned operator directly, rather we coarsen
1976  // the full operator and preconditioned that, but it just works.
1977  // This is the optimal combination in general for Wilson-type
1978  // operators: although there is an occasional increase in
1979  // iteration or two), by working completely in the preconditioned
1980  // space, we save the cost of reconstructing the full residual
1981  // from the preconditioned smoother, and re-projecting for the
1982  // subsequent smoother, as well as reducing the cost of the
1983  // ancillary blas operations in the coarse-grid solve.
1984  //
1985  // Note, we cannot use preconditioned outer solve with direct
1986  // smoother
1987  //
1988  // Finally, we have to treat the top level carefully: for all
1989  // other levels the entry into and out of the grid will be a
1990  // full-field, which we can then work in Schur complement space or
1991  // not (e.g., freedom to choose coarse_grid_solution_type). For
1992  // the top level, if the outer solver is for the preconditioned
1993  // system, then we must use preconditoning, e.g., option 3.) above.
1994 
1995  if (i == 0) { // top-level treatment
1996 
1997  // Always this for now
1998  if (solve_type == QUDA_DIRECT_SOLVE) {
2000  } else if (solve_type == QUDA_DIRECT_PC_SOLVE) {
2002  } else {
2003  errorQuda("Unexpected solve_type = %d\n", solve_type);
2004  }
2005 
2006  } else if (i == 1) {
2007 
2008  // Always this for now.
2010  } else {
2011 
2012  if (input_struct.coarse_solve_type[i] == QUDA_DIRECT_SOLVE) {
2014  } else if (input_struct.coarse_solve_type[i] == QUDA_DIRECT_PC_SOLVE) {
2016  } else {
2017  errorQuda("unexpected solve type = %d\n", input_struct.coarse_solve_type[i]);
2018  }
2019  }
2020 
2021  mg_param.omega[i] = 0.85; // ignored // omega; // over/under relaxation factor
2022 
2023  mg_param.location[i] = QUDA_CUDA_FIELD_LOCATION; // solver_location[i];
2024  mg_param.setup_location[i] = QUDA_CUDA_FIELD_LOCATION; // setup_location[i];
2025  }
2026 
2027  // whether to run GPU setup but putting temporaries into mapped (slow CPU) memory
2029 
2030  // coarsening the spin on the first restriction is undefined for staggered fields.
2031  mg_param.spin_block_size[0] = 0;
2032 
2033  mg_param.setup_type = QUDA_NULL_VECTOR_SETUP; // setup_type;
2034  mg_param.pre_orthonormalize = QUDA_BOOLEAN_FALSE; // pre_orthonormalize ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
2035  mg_param.post_orthonormalize = QUDA_BOOLEAN_TRUE; // post_orthonormalize ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
2036 
2037  mg_param.compute_null_vector
2038  = QUDA_COMPUTE_NULL_VECTOR_YES; // generate_nullspace ? QUDA_COMPUTE_NULL_VECTOR_YES : QUDA_COMPUTE_NULL_VECTOR_NO;
2039 
2040  mg_param.generate_all_levels = QUDA_BOOLEAN_TRUE; // generate_all_levels ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
2041 
2042  mg_param.run_verify = input_struct.verify_results ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
2043  mg_param.run_low_mode_check = QUDA_BOOLEAN_FALSE; // low_mode_check ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
2044  mg_param.run_oblique_proj_check = QUDA_BOOLEAN_FALSE; // oblique_proj_check ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
2045  mg_param.preserve_deflation = QUDA_BOOLEAN_TRUE; // FIXME, controversial, should update if mass changes?
2046 
2047  // set file i/o parameters
2048  for (int i = 0; i < mg_param.n_level; i++) {
2049  strcpy(mg_param.vec_infile[i], input_struct.mg_vec_infile[i]);
2050  strcpy(mg_param.vec_outfile[i], input_struct.mg_vec_outfile[i]);
2051  if (strcmp(mg_param.vec_infile[i], "") != 0) mg_param.vec_load[i] = QUDA_BOOLEAN_TRUE;
2052  if (strcmp(mg_param.vec_outfile[i], "") != 0) mg_param.vec_store[i] = QUDA_BOOLEAN_TRUE;
2053  }
2054 
2055  mg_param.coarse_guess = QUDA_BOOLEAN_FALSE; // mg_eig_coarse_guess ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
2056 
2057  // these need to tbe set for now but are actually ignored by the MG setup
2058  // needed to make it pass the initialization test
2060  inv_param.tol = 1e-10;
2061  inv_param.maxiter = 1000;
2062  inv_param.reliable_delta = 1e-6; // reliable_delta;
2063  inv_param.gcrNkrylov = 10;
2064 
2066 
2067  inv_param.verbosity = input_struct.mg_verbosity[0];
2068 
2069  // We need to pass this back to the fat/long links for the outer-most level.
2070  mg_pack->preconditioner_precision = input_struct.preconditioner_precision;
2071 }
2072 
2073 void *qudaMultigridCreate(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args,
2074  const void *const fatlink, const void *const longlink, const char *const mg_param_file)
2075 {
2076  static const QudaVerbosity verbosity = getVerbosity();
2077  qudamilc_called<true>(__func__, verbosity);
2078 
2079  // Flip the sign of the mass to fix a consistency issue between MILC, QUDA full
2080  // parity dslash operator
2081  mass = -mass;
2082 
2083  // static const QudaVerbosity verbosity = getVerbosity();
2084  QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
2085  QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
2086  QudaPrecision device_precision_sloppy = QUDA_SINGLE_PRECISION;
2087 
2088  QudaGaugeParam fat_param = newQudaGaugeParam();
2089  QudaGaugeParam long_param = newQudaGaugeParam();
2090  setGaugeParams(fat_param, long_param, fatlink, longlink, localDim, host_precision, device_precision,
2091  device_precision_sloppy, inv_args.tadpole, inv_args.naik_epsilon);
2092 
2093  // Set some other smart defaults
2094  fat_param.type = QUDA_ASQTAD_FAT_LINKS;
2095  fat_param.cuda_prec_refinement_sloppy = fat_param.cuda_prec_sloppy;
2097 
2098  long_param.type = QUDA_ASQTAD_LONG_LINKS;
2099  long_param.cuda_prec_refinement_sloppy = long_param.cuda_prec_sloppy;
2100  long_param.reconstruct_refinement_sloppy = long_param.reconstruct_sloppy;
2101 
2102  // Prepare a multigrid pack
2103  milcMultigridPack *mg_pack = new milcMultigridPack;
2104 
2105  // Set parameters incl. loading from the parameter file here.
2106  milcSetMultigridParam(mg_pack, host_precision, device_precision, device_precision_sloppy, mass, mg_param_file);
2107 
2108  fat_param.cuda_prec_precondition = mg_pack->preconditioner_precision;
2109  long_param.cuda_prec_precondition = mg_pack->preconditioner_precision;
2110 
2111  // dirty hack to invalidate the cached gauge field without breaking interface compatability
2112  // compounding hack: *num_iters == 1 is always true here
2113  // if (*num_iters == -1 || !canReuseResidentGauge(&invertParam)) invalidateGaugeQuda();
2114  invalidateGaugeQuda();
2115 
2116  if (invalidate_quda_gauge || !create_quda_gauge) {
2117  loadGaugeQuda(const_cast<void *>(fatlink), &fat_param);
2118  if (longlink != nullptr) loadGaugeQuda(const_cast<void *>(longlink), &long_param);
2119  invalidate_quda_gauge = false;
2120  }
2121 
2122  mg_pack->mg_preconditioner = newMultigridQuda(&mg_pack->mg_param);
2123  mg_pack->last_mass = mass;
2124 
2125  invalidate_quda_mg = false;
2126 
2127  if (!create_quda_gauge) invalidateGaugeQuda();
2128 
2129  qudamilc_called<false>(__func__, verbosity);
2130 
2131  return (void *)mg_pack;
2132 }
2133 
2134 void qudaInvertMG(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args,
2135  double target_residual, double target_fermilab_residual, const void *const fatlink,
2136  const void *const longlink, void *mg_pack_ptr, int mg_rebuild_type, void *source, void *solution,
2137  double *const final_residual, double *const final_fermilab_residual, int *num_iters)
2138 {
2139  static const QudaVerbosity verbosity = getVerbosity();
2140  qudamilc_called<true>(__func__, verbosity);
2141 
2142  // FIXME: Flip the sign of the mass to fix a consistency issue between
2143  // MILC, QUDA full parity dslash operator
2144  mass = -mass;
2145 
2146  milcMultigridPack *mg_pack = (milcMultigridPack *)(mg_pack_ptr);
2147 
2148  if (target_fermilab_residual == 0 && target_residual == 0) errorQuda("qudaInvert: requesting zero residual\n");
2149 
2150  // static const QudaVerbosity verbosity = getVerbosity();
2151  QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
2152  QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
2153  QudaPrecision device_precision_sloppy = QUDA_SINGLE_PRECISION; // required for MG
2154 
2155  QudaGaugeParam fat_param = newQudaGaugeParam();
2156  QudaGaugeParam long_param = newQudaGaugeParam();
2157  setGaugeParams(fat_param, long_param, fatlink, longlink, localDim, host_precision, device_precision,
2158  device_precision_sloppy, inv_args.tadpole, inv_args.naik_epsilon);
2159 
2160  fat_param.cuda_prec_refinement_sloppy = fat_param.cuda_prec_sloppy;
2161  fat_param.cuda_prec_precondition = mg_pack->preconditioner_precision;
2163 
2164  long_param.type = QUDA_ASQTAD_LONG_LINKS;
2165  long_param.cuda_prec_refinement_sloppy = long_param.cuda_prec_sloppy;
2166  long_param.cuda_prec_precondition = mg_pack->preconditioner_precision;
2168 
2169  QudaInvertParam invertParam = newQudaInvertParam();
2170 
2171  QudaParity local_parity = inv_args.evenodd; // ignored, just needed to set some defaults
2172  const double reliable_delta = 1e-4;
2173 
2174  setInvertParams(localDim, host_precision, device_precision, device_precision_sloppy, mass, target_residual,
2175  target_fermilab_residual, inv_args.max_iter, reliable_delta, local_parity, verbosity,
2176  QUDA_GCR_INVERTER, &invertParam);
2177 
2178  invertParam.inv_type = QUDA_GCR_INVERTER;
2179  invertParam.preconditioner = mg_pack->mg_preconditioner;
2181  invertParam.solution_type = QUDA_MAT_SOLUTION;
2182  invertParam.solve_type = QUDA_DIRECT_SOLVE;
2183  invertParam.verbosity_precondition = QUDA_VERBOSE;
2184 
2185  invertParam.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; // req'd
2186  invertParam.cuda_prec_precondition = mg_pack->preconditioner_precision;
2187  invertParam.gcrNkrylov = 15;
2188  invertParam.pipeline = 16; // pipeline, get from file
2189 
2191  setColorSpinorParams(localDim, host_precision, &csParam);
2192 
2193  // dirty hack to invalidate the cached gauge field without breaking interface compatability
2194  if (*num_iters == -1 || !canReuseResidentGauge(&invertParam)) {
2195  invalidateGaugeQuda();
2196  invalidate_quda_mg = true;
2197  }
2198 
2199  if (mass != mg_pack->last_mass) {
2200  mg_pack->mg_param.invert_param->mass = mass;
2201  mg_pack->last_mass = mass;
2202  invalidateGaugeQuda();
2203  invalidate_quda_mg = true;
2204  }
2205 
2206  if (invalidate_quda_gauge || !create_quda_gauge || invalidate_quda_mg) {
2207  loadGaugeQuda(const_cast<void *>(fatlink), &fat_param);
2208  if (longlink != nullptr) loadGaugeQuda(const_cast<void *>(longlink), &long_param);
2209  invalidate_quda_gauge = false;
2210 
2211  // FIXME: hack to reset gaugeFatPrecise (see interface_quda.cpp), etc.
2212  // Solution is to have a version of this that _only_
2213  // rebuilds the Dirac matrices, I believe.
2214  if (mg_rebuild_type == 1) {
2215  if (verbosity >= QUDA_VERBOSE) printfQuda("Performing a full MG solver update\n");
2217  } else {
2218  if (verbosity >= QUDA_VERBOSE) printfQuda("Performing a thin MG solver update\n");
2220  }
2221  updateMultigridQuda(mg_pack->mg_preconditioner, &mg_pack->mg_param);
2222  invalidate_quda_mg = false;
2223  }
2224 
2225  if (longlink == nullptr) invertParam.dslash_type = QUDA_STAGGERED_DSLASH;
2226 
2227  int quark_offset = getColorVectorOffset(local_parity, false, localDim) * host_precision;
2228 
2229  // FIXME: due to sign convention woes passing in an initial
2230  // guess is currently broken. Needs a sign flip to fix.
2231  // MG is fast enough we won't worry...
2232 
2233  invertQuda(static_cast<char *>(solution) + quark_offset, static_cast<char *>(source) + quark_offset, &invertParam);
2234 
2235  // FIXME: Flip sign on solution to correct for mass convention
2236  int cv_size = localDim[0] * localDim[1] * localDim[2] * localDim[3] * 3 * 2; // (dimension * Nc = 3 * cplx)
2237  if (host_precision == QUDA_DOUBLE_PRECISION) {
2238  auto soln = (double *)(solution);
2239  for (long i = 0; i < cv_size; i++) { soln[i] = -soln[i]; }
2240  } else {
2241  auto soln = (float *)(solution);
2242  for (long i = 0; i < cv_size; i++) { soln[i] = -soln[i]; }
2243  }
2244 
2245  // return the number of iterations taken by the inverter
2246  *num_iters = invertParam.iter;
2247  *final_residual = invertParam.true_res;
2248  *final_fermilab_residual = invertParam.true_res_hq;
2249 
2250  if (!create_quda_gauge) invalidateGaugeQuda();
2251 
2252  qudamilc_called<false>(__func__, verbosity);
2253 }
2254 
2255 void qudaMultigridDestroy(void *mg_pack_ptr)
2256 {
2257  static const QudaVerbosity verbosity = getVerbosity();
2258  qudamilc_called<true>(__func__, verbosity);
2259 
2260  if (mg_pack_ptr != 0) {
2261  milcMultigridPack *mg_pack = (milcMultigridPack *)(mg_pack_ptr);
2263  delete mg_pack;
2264  }
2265 
2266  qudamilc_called<false>(__func__, verbosity);
2267 }
2268 
2269 static int clover_alloc = 0;
2270 
2271 void* qudaCreateGaugeField(void* gauge, int geometry, int precision)
2272 {
2273  qudamilc_called<true>(__func__);
2274  QudaPrecision qudaPrecision = (precision==2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
2275  QudaGaugeParam qudaGaugeParam
2276  = newMILCGaugeParam(localDim, qudaPrecision, (geometry == 1) ? QUDA_GENERAL_LINKS : QUDA_SU3_LINKS);
2277  qudamilc_called<false>(__func__);
2278  return createGaugeFieldQuda(gauge, geometry, &qudaGaugeParam);
2279 }
2280 
2281 
2282 void qudaSaveGaugeField(void* gauge, void* inGauge)
2283 {
2284  qudamilc_called<true>(__func__);
2285  cudaGaugeField* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);
2286  QudaGaugeParam qudaGaugeParam = newMILCGaugeParam(localDim, cudaGauge->Precision(), QUDA_GENERAL_LINKS);
2287  saveGaugeFieldQuda(gauge, inGauge, &qudaGaugeParam);
2288  qudamilc_called<false>(__func__);
2289 }
2290 
2291 
2292 void qudaDestroyGaugeField(void* gauge)
2293 {
2294  qudamilc_called<true>(__func__);
2295  destroyGaugeFieldQuda(gauge);
2296  qudamilc_called<false>(__func__);
2297 }
2298 
2299 
2300 void setInvertParam(QudaInvertParam &invertParam, QudaInvertArgs_t &inv_args,
2301  int external_precision, int quda_precision, double kappa, double reliable_delta);
2302 
2303 void qudaCloverForce(void *mom, double dt, void **x, void **p, double *coeff, double kappa, double ck,
2304  int nvec, double multiplicity, void *gauge, int precision, QudaInvertArgs_t inv_args)
2305 {
2306  qudamilc_called<true>(__func__);
2307  QudaGaugeParam qudaGaugeParam
2308  = newMILCGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION, QUDA_GENERAL_LINKS);
2309  qudaGaugeParam.gauge_order = QUDA_MILC_GAUGE_ORDER; // refers to momentum gauge order
2310 
2311  QudaInvertParam invertParam = newQudaInvertParam();
2312  setInvertParam(invertParam, inv_args, precision, precision, kappa, 0);
2313  invertParam.num_offset = nvec;
2314  for (int i=0; i<nvec; ++i) invertParam.offset[i] = 0.0; // not needed
2315  invertParam.clover_coeff = 0.0; // not needed
2316 
2317  // solution types
2319  invertParam.solve_type = QUDA_NORMOP_PC_SOLVE;
2320  invertParam.inv_type = QUDA_CG_INVERTER;
2322 
2323  invertParam.verbosity = getVerbosity();
2324  invertParam.verbosity_precondition = QUDA_SILENT;
2325  invertParam.use_resident_solution = inv_args.use_resident_solution;
2326 
2327  computeCloverForceQuda(mom, dt, x, p, coeff, -kappa * kappa, ck, nvec, multiplicity, gauge, &qudaGaugeParam,
2328  &invertParam);
2329  qudamilc_called<false>(__func__);
2330 }
2331 
2332 void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], QudaInvertArgs_t &inv_args,
2333  int external_precision, int quda_precision)
2334 {
2335 
2336  const QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
2337  const QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
2338  QudaPrecision device_precision_sloppy;
2339 
2340  switch(inv_args.mixed_precision) {
2341  case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break;
2342  case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break;
2343  default: device_precision_sloppy = device_precision;
2344  }
2345 
2346  for (int dir = 0; dir < 4; ++dir) qudaGaugeParam.X[dir] = dim[dir];
2347 
2348  qudaGaugeParam.anisotropy = 1.0;
2349  qudaGaugeParam.type = QUDA_WILSON_LINKS;
2350  qudaGaugeParam.gauge_order = QUDA_MILC_GAUGE_ORDER;
2351 
2352  // Check the boundary conditions
2353  // Can't have twisted or anti-periodic boundary conditions in the spatial
2354  // directions with 12 reconstruct at the moment.
2355  bool trivial_phase = true;
2356  for(int dir=0; dir<3; ++dir){
2357  if(inv_args.boundary_phase[dir] != 0) trivial_phase = false;
2358  }
2359  if(inv_args.boundary_phase[3] != 0 && inv_args.boundary_phase[3] != 1) trivial_phase = false;
2360 
2361  if(trivial_phase){
2362  qudaGaugeParam.t_boundary = (inv_args.boundary_phase[3]) ? QUDA_ANTI_PERIODIC_T : QUDA_PERIODIC_T;
2363  qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
2364  qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
2365  }else{
2366  qudaGaugeParam.t_boundary = QUDA_PERIODIC_T;
2367  qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
2368  qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
2369  }
2370 
2371  qudaGaugeParam.cpu_prec = host_precision;
2372  qudaGaugeParam.cuda_prec = device_precision;
2373  qudaGaugeParam.cuda_prec_sloppy = device_precision_sloppy;
2374  qudaGaugeParam.cuda_prec_precondition = device_precision_sloppy;
2375  qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
2376  qudaGaugeParam.ga_pad = getLinkPadding(dim);
2377 }
2378 
2379 void setInvertParam(QudaInvertParam &invertParam, QudaInvertArgs_t &inv_args,
2380  int external_precision, int quda_precision, double kappa, double reliable_delta) {
2381 
2382  const QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
2383  const QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
2384  QudaPrecision device_precision_sloppy;
2385  switch(inv_args.mixed_precision) {
2386  case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break;
2387  case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break;
2388  default: device_precision_sloppy = device_precision;
2389  }
2390 
2391  static const QudaVerbosity verbosity = getVerbosity();
2392 
2393  invertParam.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
2394  invertParam.kappa = kappa;
2395  invertParam.dagger = QUDA_DAG_NO;
2397  invertParam.gcrNkrylov = 30;
2398  invertParam.reliable_delta = reliable_delta;
2399  invertParam.maxiter = inv_args.max_iter;
2400 
2401  invertParam.cuda_prec_precondition = device_precision_sloppy;
2402  invertParam.verbosity_precondition = verbosity;
2403  invertParam.verbosity = verbosity;
2404  invertParam.cpu_prec = host_precision;
2405  invertParam.cuda_prec = device_precision;
2406  invertParam.cuda_prec_sloppy = device_precision_sloppy;
2409  invertParam.dirac_order = QUDA_DIRAC_ORDER;
2410  invertParam.sp_pad = 0;
2411  invertParam.cl_pad = 0;
2412  invertParam.clover_cpu_prec = host_precision;
2413  invertParam.clover_cuda_prec = device_precision;
2414  invertParam.clover_cuda_prec_sloppy = device_precision_sloppy;
2415  invertParam.clover_cuda_prec_precondition = device_precision_sloppy;
2416  invertParam.clover_order = QUDA_PACKED_CLOVER_ORDER;
2417 
2418  invertParam.compute_action = 0;
2419 }
2420 
2421 
2422 void qudaLoadGaugeField(int external_precision,
2423  int quda_precision,
2424  QudaInvertArgs_t inv_args,
2425  const void* milc_link) {
2426  qudamilc_called<true>(__func__);
2427  QudaGaugeParam qudaGaugeParam = newQudaGaugeParam();
2428  setGaugeParams(qudaGaugeParam, localDim, inv_args, external_precision, quda_precision);
2429 
2430  loadGaugeQuda(const_cast<void *>(milc_link), &qudaGaugeParam);
2431  qudamilc_called<false>(__func__);
2432 } // qudaLoadGaugeField
2433 
2434 
2436  qudamilc_called<true>(__func__);
2437  freeGaugeQuda();
2438  qudamilc_called<false>(__func__);
2439 } // qudaFreeGaugeField
2440 
2441 void qudaLoadCloverField(int external_precision, int quda_precision, QudaInvertArgs_t inv_args, void *milc_clover,
2442  void *milc_clover_inv, QudaSolutionType solution_type, QudaSolveType solve_type, QudaInverterType inverter,
2443  double clover_coeff, int compute_trlog, double *trlog)
2444 {
2445  qudamilc_called<true>(__func__);
2446  QudaInvertParam invertParam = newQudaInvertParam();
2447  setInvertParam(invertParam, inv_args, external_precision, quda_precision, 0.0, 0.0);
2448  invertParam.solution_type = solution_type;
2449  invertParam.solve_type = solve_type;
2450  invertParam.inv_type = inverter;
2452  invertParam.compute_clover_trlog = compute_trlog;
2453  invertParam.clover_coeff = clover_coeff;
2454 
2455  // Hacks to mollify checkInvertParams which is called from
2456  // loadCloverQuda. These "required" parameters are irrelevant here.
2457  // Better procedure: invertParam should be defined in
2458  // qudaCloverInvert and qudaEigCGCloverInvert and passed here
2459  // instead of redefining a partial version here
2460  invertParam.tol = 0.;
2461  invertParam.tol_hq = 0.;
2462  invertParam.residual_type = static_cast<QudaResidualType_s>(0);
2463 
2464  if(invertParam.dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
2465  if (clover_alloc == 0) {
2466  loadCloverQuda(milc_clover, milc_clover_inv, &invertParam);
2467  clover_alloc = 1;
2468  } else {
2469  errorQuda("Clover term already allocated");
2470  }
2471  }
2472 
2473  if (compute_trlog) {
2474  trlog[0] = invertParam.trlogA[0];
2475  trlog[1] = invertParam.trlogA[1];
2476  }
2477  qudamilc_called<false>(__func__);
2478 } // qudaLoadCoverField
2479 
2481  qudamilc_called<true>(__func__);
2482  if (clover_alloc==1) {
2483  freeCloverQuda();
2484  clover_alloc = 0;
2485  } else {
2486  errorQuda("Trying to free non-allocated clover term");
2487  }
2488  qudamilc_called<false>(__func__);
2489 } // qudaFreeCloverField
2490 
2491 
2492 void qudaCloverInvert(int external_precision,
2493  int quda_precision,
2494  double kappa,
2495  double clover_coeff,
2496  QudaInvertArgs_t inv_args,
2497  double target_residual,
2498  double target_fermilab_residual,
2499  const void* link,
2500  void* clover, // could be stored in Milc format
2501  void* cloverInverse,
2502  void* source,
2503  void* solution,
2504  double* const final_residual,
2505  double* const final_fermilab_residual,
2506  int* num_iters)
2507 {
2508  qudamilc_called<true>(__func__);
2509  if (target_fermilab_residual == 0 && target_residual == 0) errorQuda("qudaCloverInvert: requesting zero residual\n");
2510 
2511  if (link) qudaLoadGaugeField(external_precision, quda_precision, inv_args, link);
2512 
2513  if (clover || cloverInverse) {
2514  qudaLoadCloverField(external_precision, quda_precision, inv_args, clover, cloverInverse, QUDA_MAT_SOLUTION,
2516  }
2517 
2518  double reliable_delta = 1e-1;
2519 
2520  QudaInvertParam invertParam = newQudaInvertParam();
2521  setInvertParam(invertParam, inv_args, external_precision, quda_precision, kappa, reliable_delta);
2522  invertParam.residual_type = static_cast<QudaResidualType_s>(0);
2523  invertParam.residual_type = (target_residual != 0) ? static_cast<QudaResidualType_s> ( invertParam.residual_type | QUDA_L2_RELATIVE_RESIDUAL) : invertParam.residual_type;
2524  invertParam.residual_type = (target_fermilab_residual != 0) ? static_cast<QudaResidualType_s> (invertParam.residual_type | QUDA_HEAVY_QUARK_RESIDUAL) : invertParam.residual_type;
2525 
2526  invertParam.tol = target_residual;
2527  invertParam.tol_hq = target_fermilab_residual;
2528  invertParam.heavy_quark_check = (invertParam.residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? 1 : 0);
2529  invertParam.clover_coeff = clover_coeff;
2530 
2531  // solution types
2532  invertParam.solution_type = QUDA_MAT_SOLUTION;
2535  invertParam.matpc_type = QUDA_MATPC_ODD_ODD;
2536 
2537  invertQuda(solution, source, &invertParam);
2538 
2539  *num_iters = invertParam.iter;
2540  *final_residual = invertParam.true_res;
2541  *final_fermilab_residual = invertParam.true_res_hq;
2542 
2543  if (clover || cloverInverse) qudaFreeCloverField();
2544  if (link) qudaFreeGaugeField();
2545  qudamilc_called<false>(__func__);
2546 } // qudaCloverInvert
2547 
2548 void qudaEigCGCloverInvert(int external_precision, int quda_precision, double kappa, double clover_coeff,
2549  QudaInvertArgs_t inv_args, double target_residual, double target_fermilab_residual,
2550  const void *link,
2551  void *clover, // could be stored in Milc format
2552  void *cloverInverse,
2553  void *source, // array of source vectors -> overwritten on exit!
2554  void *solution, // temporary
2555  QudaEigArgs_t eig_args,
2556  const int rhs_idx, // current rhs
2557  const int last_rhs_flag, // is this the last rhs to solve?
2558  double *const final_residual, double *const final_fermilab_residual, int *num_iters)
2559 {
2560  qudamilc_called<true>(__func__);
2561  if (target_fermilab_residual == 0 && target_residual == 0) errorQuda("qudaCloverInvert: requesting zero residual\n");
2562 
2563  if (link && (rhs_idx == 0)) qudaLoadGaugeField(external_precision, quda_precision, inv_args, link);
2564 
2565  if ( (clover || cloverInverse) && (rhs_idx == 0)) {
2566  qudaLoadCloverField(external_precision, quda_precision, inv_args, clover, cloverInverse, QUDA_MAT_SOLUTION,
2568  }
2569 
2570  double reliable_delta = 1e-1;
2571 
2572  QudaInvertParam invertParam = newQudaInvertParam();
2573  setInvertParam(invertParam, inv_args, external_precision, quda_precision, kappa, reliable_delta);
2574  invertParam.residual_type = static_cast<QudaResidualType_s>(0);
2575  invertParam.residual_type = (target_residual != 0) ? static_cast<QudaResidualType_s> ( invertParam.residual_type | QUDA_L2_RELATIVE_RESIDUAL) : invertParam.residual_type;
2576  invertParam.residual_type = (target_fermilab_residual != 0) ? static_cast<QudaResidualType_s> (invertParam.residual_type | QUDA_HEAVY_QUARK_RESIDUAL) : invertParam.residual_type;
2577 
2578  invertParam.tol = target_residual;
2579  invertParam.tol_hq = target_fermilab_residual;
2580  invertParam.heavy_quark_check = (invertParam.residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? 1 : 0);
2581  invertParam.clover_coeff = clover_coeff;
2582 
2583  // solution types
2584  invertParam.solution_type = QUDA_MAT_SOLUTION;
2585  invertParam.matpc_type = QUDA_MATPC_ODD_ODD;
2586 
2588  QudaEigParam df_param = newQudaEigParam();
2589  df_param.invert_param = &invertParam;
2590 
2591  invertParam.solve_type = QUDA_NORMOP_PC_SOLVE;
2592  invertParam.n_ev = eig_args.nev;
2593  invertParam.max_search_dim = eig_args.max_search_dim;
2594  invertParam.deflation_grid = eig_args.deflation_grid;
2595  invertParam.cuda_prec_ritz = eig_args.prec_ritz;
2596  invertParam.tol_restart = eig_args.tol_restart;
2597  invertParam.eigcg_max_restarts = eig_args.eigcg_max_restarts;
2598  invertParam.max_restart_num = eig_args.max_restart_num;
2599  invertParam.inc_tol = eig_args.inc_tol;
2600  invertParam.eigenval_tol = eig_args.eigenval_tol;
2601  invertParam.rhs_idx = rhs_idx;
2602 
2603 
2604  if((inv_args.solver_type != QUDA_INC_EIGCG_INVERTER) && (inv_args.solver_type != QUDA_EIGCG_INVERTER)) errorQuda("Incorrect inverter type.\n");
2605  invertParam.inv_type = inv_args.solver_type;
2606 
2608 
2609  setDeflationParam(eig_args.prec_ritz, eig_args.location_ritz, eig_args.mem_type_ritz, eig_args.deflation_ext_lib, eig_args.vec_infile, eig_args.vec_outfile, &df_param);
2610 
2611  if(rhs_idx == 0) df_preconditioner = newDeflationQuda(&df_param);
2612  invertParam.deflation_op = df_preconditioner;
2613 
2614  invertQuda(solution, source, &invertParam);
2615 
2616  if (last_rhs_flag) destroyDeflationQuda(df_preconditioner);
2617 
2618  *num_iters = invertParam.iter;
2619  *final_residual = invertParam.true_res;
2620  *final_fermilab_residual = invertParam.true_res_hq;
2621 
2622  if ( (clover || cloverInverse) && last_rhs_flag) qudaFreeCloverField();
2623  if (link && last_rhs_flag) qudaFreeGaugeField();
2624  qudamilc_called<false>(__func__);
2625 } // qudaEigCGCloverInvert
2626 
2627 
2628 void qudaCloverMultishiftInvert(int external_precision,
2629  int quda_precision,
2630  int num_offsets,
2631  double* const offset,
2632  double kappa,
2633  double clover_coeff,
2634  QudaInvertArgs_t inv_args,
2635  const double* target_residual_offset,
2636  const void* milc_link,
2637  void* milc_clover,
2638  void* milc_clover_inv,
2639  void* source,
2640  void** solutionArray,
2641  double* const final_residual,
2642  int* num_iters)
2643 {
2644  static const QudaVerbosity verbosity = getVerbosity();
2645  qudamilc_called<true>(__func__, verbosity);
2646 
2647  for (int i = 0; i < num_offsets; ++i) {
2648  if (target_residual_offset[i] == 0) errorQuda("qudaCloverMultishiftInvert: target residual cannot be zero\n");
2649  }
2650 
2651  // if doing a pure double-precision multi-shift solve don't use reliable updates
2652  const bool use_mixed_precision = (((quda_precision==2) && inv_args.mixed_precision) ||
2653  ((quda_precision==1) && (inv_args.mixed_precision==2)) ) ? true : false;
2654  double reliable_delta = (use_mixed_precision) ? 1e-2 : 0.0;
2655  QudaInvertParam invertParam = newQudaInvertParam();
2656  setInvertParam(invertParam, inv_args, external_precision, quda_precision, kappa, reliable_delta);
2658  invertParam.num_offset = num_offsets;
2659  for(int i=0; i<num_offsets; ++i){
2660  invertParam.offset[i] = offset[i];
2661  invertParam.tol_offset[i] = target_residual_offset[i];
2662  }
2663  invertParam.tol = target_residual_offset[0];
2664  invertParam.clover_coeff = clover_coeff;
2665 
2666  // solution types
2668  invertParam.solve_type = QUDA_NORMOP_PC_SOLVE;
2669  invertParam.inv_type = QUDA_CG_INVERTER;
2671 
2672  invertParam.verbosity = verbosity;
2673  invertParam.verbosity_precondition = QUDA_SILENT;
2674 
2675  invertParam.make_resident_solution = inv_args.make_resident_solution;
2676  invertParam.compute_true_res = 0;
2677 
2678  if (num_offsets==1 && offset[0] == 0) {
2679  // set the solver
2680  char *quda_solver = getenv("QUDA_MILC_CLOVER_SOLVER");
2681 
2682  // default is chronological CG
2683  if (!quda_solver || strcmp(quda_solver,"CHRONO_CG_SOLVER")==0) {
2684  // use CG with chronological forecasting
2685  invertParam.chrono_use_resident = 1;
2686  invertParam.chrono_make_resident = 1;
2687  invertParam.chrono_max_dim = 10;
2688  } else if (strcmp(quda_solver,"BICGSTAB_SOLVER")==0){
2689  // use two-step BiCGStab
2690  invertParam.inv_type = QUDA_BICGSTAB_INVERTER;
2691  invertParam.solve_type = QUDA_DIRECT_PC_SOLVE;
2692  } else if (strcmp(quda_solver,"CG_SOLVER")==0){
2693  // regular CG
2694  invertParam.chrono_use_resident = 0;
2695  invertParam.chrono_make_resident = 0;
2696  }
2697 
2698  invertQuda(solutionArray[0], source, &invertParam);
2699  *final_residual = invertParam.true_res;
2700  } else {
2701  invertMultiShiftQuda(solutionArray, source, &invertParam);
2702  for (int i=0; i<num_offsets; ++i) final_residual[i] = invertParam.true_res_offset[i];
2703  }
2704 
2705  // return the number of iterations taken by the inverter
2706  *num_iters = invertParam.iter;
2707 
2708  qudamilc_called<false>(__func__, verbosity);
2709 } // qudaCloverMultishiftInvert
2710 
2711 void qudaGaugeFixingOVR(int precision, unsigned int gauge_dir, int Nsteps, int verbose_interval, double relax_boost,
2712  double tolerance, unsigned int reunit_interval, unsigned int stopWtheta, void *milc_sitelink)
2713 {
2714  QudaGaugeParam qudaGaugeParam = newMILCGaugeParam(localDim,
2715  (precision==1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION,
2716  QUDA_SU3_LINKS);
2717  qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
2718  //qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
2719 
2720  double timeinfo[3];
2721  computeGaugeFixingOVRQuda(milc_sitelink, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta, \
2722  &qudaGaugeParam, timeinfo);
2723 
2724  printfQuda("Time H2D: %lf\n", timeinfo[0]);
2725  printfQuda("Time to Compute: %lf\n", timeinfo[1]);
2726  printfQuda("Time D2H: %lf\n", timeinfo[2]);
2727  printfQuda("Time all: %lf\n", timeinfo[0]+timeinfo[1]+timeinfo[2]);
2728 }
2729 
2730 void qudaGaugeFixingFFT( int precision,
2731  unsigned int gauge_dir,
2732  int Nsteps,
2733  int verbose_interval,
2734  double alpha,
2735  unsigned int autotune,
2736  double tolerance,
2737  unsigned int stopWtheta,
2738  void* milc_sitelink
2739  )
2740 {
2741  QudaGaugeParam qudaGaugeParam = newMILCGaugeParam(localDim,
2742  (precision==1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION,
2744  qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
2745  //qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
2746 
2747 
2748  double timeinfo[3];
2749  computeGaugeFixingFFTQuda(milc_sitelink, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta, \
2750  &qudaGaugeParam, timeinfo);
2751 
2752  printfQuda("Time H2D: %lf\n", timeinfo[0]);
2753  printfQuda("Time to Compute: %lf\n", timeinfo[1]);
2754  printfQuda("Time D2H: %lf\n", timeinfo[2]);
2755  printfQuda("Time all: %lf\n", timeinfo[0]+timeinfo[1]+timeinfo[2]);
2756 }
QudaPrecision Precision() const
void comm_barrier(void)
int comm_rank(void)
void comm_broadcast(void *data, size_t nbytes)
double kappa
double reliable_delta
QudaSolveType solve_type
quda::mgarray< char[256]> mg_vec_outfile
double mass
double tol
std::array< int, 4 > dim
QudaVerbosity verbosity
quda::mgarray< QudaInverterType > coarse_solver
quda::mgarray< int > coarse_solver_maxiter
bool verify_results
quda::mgarray< char[256]> mg_vec_infile
quda::mgarray< int > nu_post
quda::mgarray< int > nu_pre
quda::mgarray< QudaVerbosity > mg_verbosity
quda::mgarray< int > setup_maxiter
quda::mgarray< int > nvec
QudaMemoryType mem_type_ritz
QudaSolutionType solution_type
QudaExtLibType deflation_ext_lib
int mg_levels
quda::mgarray< QudaInverterType > setup_inv
double clover_coeff
quda::mgarray< double > setup_tol
QudaFieldLocation location_ritz
quda::mgarray< QudaSolveType > coarse_solve_type
QudaPrecision prec
quda::mgarray< double > coarse_solver_tol
quda::mgarray< QudaInverterType > smoother_type
quda::mgarray< std::array< int, 4 > > geo_block_size
QudaParity parity
Definition: covdev_test.cpp:40
QudaInvertParam inv_param
Definition: covdev_test.cpp:27
@ QUDA_MG_CYCLE_RECURSIVE
Definition: enum_quda.h:182
@ QUDA_PACKED_CLOVER_ORDER
Definition: enum_quda.h:258
enum QudaSolveType_s QudaSolveType
enum QudaPrecision_s QudaPrecision
@ QUDA_NULL_VECTOR_SETUP
Definition: enum_quda.h:447
@ QUDA_POWER_BASIS
Definition: enum_quda.h:201
@ QUDA_STAGGERED_PHASE_MILC
Definition: enum_quda.h:516
@ QUDA_STAGGERED_DSLASH
Definition: enum_quda.h:97
@ QUDA_CLOVER_WILSON_DSLASH
Definition: enum_quda.h:91
@ QUDA_ASQTAD_DSLASH
Definition: enum_quda.h:98
@ QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:326
@ QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:325
@ QUDA_KAPPA_NORMALIZATION
Definition: enum_quda.h:226
@ QUDA_MASS_NORMALIZATION
Definition: enum_quda.h:227
@ QUDA_DAG_NO
Definition: enum_quda.h:223
@ QUDA_USE_INIT_GUESS_YES
Definition: enum_quda.h:430
@ QUDA_SILENT
Definition: enum_quda.h:265
@ QUDA_INVALID_VERBOSITY
Definition: enum_quda.h:269
@ QUDA_DEBUG_VERBOSE
Definition: enum_quda.h:268
@ QUDA_SUMMARIZE
Definition: enum_quda.h:266
@ QUDA_VERBOSE
Definition: enum_quda.h:267
@ QUDA_PARITY_SITE_SUBSET
Definition: enum_quda.h:332
@ QUDA_BOOLEAN_FALSE
Definition: enum_quda.h:460
@ QUDA_BOOLEAN_TRUE
Definition: enum_quda.h:461
@ QUDA_DEGRAND_ROSSI_GAMMA_BASIS
Definition: enum_quda.h:368
@ QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:70
@ QUDA_RECONSTRUCT_12
Definition: enum_quda.h:71
@ QUDA_RECONSTRUCT_13
Definition: enum_quda.h:74
@ QUDA_RECONSTRUCT_9
Definition: enum_quda.h:73
@ QUDA_ANTI_PERIODIC_T
Definition: enum_quda.h:56
@ QUDA_PERIODIC_T
Definition: enum_quda.h:57
@ QUDA_EVEN_PARITY
Definition: enum_quda.h:284
@ QUDA_ODD_PARITY
Definition: enum_quda.h:284
@ QUDA_DIRAC_ORDER
Definition: enum_quda.h:245
QudaResidualType_s
Definition: enum_quda.h:192
@ QUDA_HEAVY_QUARK_RESIDUAL
Definition: enum_quda.h:195
@ QUDA_L2_RELATIVE_RESIDUAL
Definition: enum_quda.h:193
@ QUDA_TRANSFER_COARSE_KD
Definition: enum_quda.h:454
@ QUDA_TRANSFER_AGGREGATE
Definition: enum_quda.h:453
enum QudaSolutionType_s QudaSolutionType
enum QudaInverterType_s QudaInverterType
@ QUDA_EIG_TR_LANCZOS
Definition: enum_quda.h:137
enum QudaFieldLocation_s QudaFieldLocation
enum QudaExtLibType_s QudaExtLibType
@ QUDA_GAUGE_FIXED_NO
Definition: enum_quda.h:80
@ QUDA_MATPC_EVEN_EVEN_ASYMMETRIC
Definition: enum_quda.h:218
@ QUDA_MATPC_ODD_ODD
Definition: enum_quda.h:217
@ QUDA_MATPC_EVEN_EVEN
Definition: enum_quda.h:216
@ QUDA_GCR_INVERTER
Definition: enum_quda.h:109
@ QUDA_CGNE_INVERTER
Definition: enum_quda.h:124
@ QUDA_INC_EIGCG_INVERTER
Definition: enum_quda.h:117
@ QUDA_CGNR_INVERTER
Definition: enum_quda.h:125
@ QUDA_CA_GCR_INVERTER
Definition: enum_quda.h:132
@ QUDA_CG_INVERTER
Definition: enum_quda.h:107
@ QUDA_INVALID_INVERTER
Definition: enum_quda.h:133
@ QUDA_EIGCG_INVERTER
Definition: enum_quda.h:116
@ QUDA_MG_INVERTER
Definition: enum_quda.h:122
@ QUDA_BICGSTAB_INVERTER
Definition: enum_quda.h:108
@ QUDA_PRESERVE_SOURCE_NO
Definition: enum_quda.h:238
@ QUDA_PRESERVE_SOURCE_YES
Definition: enum_quda.h:239
@ QUDA_EVEN_ODD_SITE_ORDER
Definition: enum_quda.h:340
enum QudaMemoryType_s QudaMemoryType
enum QudaReconstructType_s QudaReconstructType
@ QUDA_MATPC_SOLUTION
Definition: enum_quda.h:159
@ QUDA_MATPCDAG_MATPC_SOLUTION
Definition: enum_quda.h:161
@ QUDA_MAT_SOLUTION
Definition: enum_quda.h:157
@ QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:65
@ QUDA_SINGLE_PRECISION
Definition: enum_quda.h:64
@ QUDA_INVALID_PRECISION
Definition: enum_quda.h:66
@ QUDA_HALF_PRECISION
Definition: enum_quda.h:63
@ QUDA_INVALID_SCHWARZ
Definition: enum_quda.h:189
@ QUDA_MILC_SITE_GAUGE_ORDER
Definition: enum_quda.h:48
@ QUDA_MILC_GAUGE_ORDER
Definition: enum_quda.h:47
@ QUDA_SPECTRUM_LR_EIG
Definition: enum_quda.h:149
@ QUDA_SPECTRUM_SR_EIG
Definition: enum_quda.h:150
@ QUDA_SPACE_SPIN_COLOR_FIELD_ORDER
Definition: enum_quda.h:351
enum QudaVerbosity_s QudaVerbosity
@ QUDA_ZERO_FIELD_CREATE
Definition: enum_quda.h:361
@ QUDA_INVALID_SOLVE
Definition: enum_quda.h:175
@ QUDA_DIRECT_SOLVE
Definition: enum_quda.h:167
@ QUDA_NORMOP_PC_SOLVE
Definition: enum_quda.h:170
@ QUDA_DIRECT_PC_SOLVE
Definition: enum_quda.h:169
enum QudaParity_s QudaParity
@ QUDA_SU3_LINKS
Definition: enum_quda.h:24
@ QUDA_ASQTAD_LONG_LINKS
Definition: enum_quda.h:32
@ QUDA_GENERAL_LINKS
Definition: enum_quda.h:25
@ QUDA_THREE_LINKS
Definition: enum_quda.h:26
@ QUDA_WILSON_LINKS
Definition: enum_quda.h:30
@ QUDA_ASQTAD_FAT_LINKS
Definition: enum_quda.h:31
enum QudaLinkType_s QudaLinkType
@ QUDA_COMPUTE_NULL_VECTOR_YES
Definition: enum_quda.h:442
int length[]
GaugeFieldParam gParam
cudaGaugeField * cudaGauge
QudaPrecision & cuda_prec
Definition: host_utils.cpp:58
QudaPrecision & cuda_prec_sloppy
Definition: host_utils.cpp:59
QudaPrecision & cpu_prec
Definition: host_utils.cpp:57
#define pool_pinned_malloc(size)
Definition: malloc_quda.h:172
#define safe_malloc(size)
Definition: malloc_quda.h:106
#define pool_pinned_free(ptr)
Definition: malloc_quda.h:173
#define managed_free(ptr)
Definition: malloc_quda.h:114
#define managed_malloc(size)
Definition: malloc_quda.h:109
#define host_free(ptr)
Definition: malloc_quda.h:115
void qudaLoadCloverField(int external_precision, int quda_precision, QudaInvertArgs_t inv_args, void *milc_clover, void *milc_clover_inv, QudaSolutionType solution_type, QudaSolveType solve_type, QudaInverterType inverter, double clover_coeff, int compute_trlog, double *trlog)
void qudaLoadGaugeField(int external_precision, int quda_precision, QudaInvertArgs_t inv_args, const void *milc_link)
void qudaSetLayout(QudaLayout_t input)
void setDeflationParam(QudaPrecision ritz_prec, QudaFieldLocation location_ritz, QudaMemoryType mem_type_ritz, QudaExtLibType deflation_ext_lib, char vec_infile[], char vec_outfile[], QudaEigParam *df_param)
void qudaMomLoad(int prec, QudaMILCSiteArg_t *arg)
void qudaUnitarizeSU3Phased(int prec, double tol, QudaMILCSiteArg_t *arg, int phase_in)
void setInvertParam(QudaInvertParam &invertParam, QudaInvertArgs_t &inv_args, int external_precision, int quda_precision, double kappa, double reliable_delta)
void qudamilc_called(const char *func, QudaVerbosity verb)
void qudaMultigridDestroy(void *mg_pack_ptr)
void qudaInit(QudaInitArgs_t input)
void qudaInvertMsrc(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args, double target_residual, double target_fermilab_residual, const void *const fatlink, const void *const longlink, void **sourceArray, void **solutionArray, double *const final_residual, double *const final_fermilab_residual, int *num_iters, int num_src)
void qudaHisqForce(int prec, int num_terms, int num_naik_terms, double dt, double **coeff, void **quark_field, const double level2_coeff[6], const double fat7_coeff[6], const void *const w_link, const void *const v_link, const void *const u_link, void *const milc_momentum)
void qudaUnitarizeSU3(int prec, double tol, QudaMILCSiteArg_t *arg)
void qudaGaugeForce(int precision, int num_loop_types, double milc_loop_coeff[3], double eb3, QudaMILCSiteArg_t *arg)
#define POP_RANGE
void qudaRephase(int prec, void *gauge, int flag, double i_mu)
void qudaGaugeFixingOVR(int precision, unsigned int gauge_dir, int Nsteps, int verbose_interval, double relax_boost, double tolerance, unsigned int reunit_interval, unsigned int stopWtheta, void *milc_sitelink)
Gauge fixing with overrelaxation with support for single and multi GPU.
void qudaMomSave(int prec, QudaMILCSiteArg_t *arg)
void * qudaAllocateManaged(size_t bytes)
void qudaFreeManaged(void *ptr)
void milcSetMultigridEigParam(QudaEigParam &mg_eig_param, mgInputStruct &input_struct, int level)
void qudaMultishiftInvert(int external_precision, int quda_precision, int num_offsets, double *const offset, QudaInvertArgs_t inv_args, const double target_residual[], const double target_fermilab_residual[], const void *const fatlink, const void *const longlink, void *source, void **solutionArray, double *const final_residual, double *const final_fermilab_residual, int *num_iters)
void qudaFinalize()
void * qudaMultigridCreate(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args, const void *const fatlink, const void *const longlink, const char *const mg_param_file)
void qudaSetMPICommHandle(void *mycomm)
void qudaFreeGaugeField()
void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], QudaInvertArgs_t &inv_args, int external_precision, int quda_precision)
double qudaMomAction(int prec, QudaMILCSiteArg_t *arg)
void qudaComputeOprod(int prec, int num_terms, int num_naik_terms, double **coeff, double scale, void **quark_field, void *oprod[3])
void qudaGaugeFixingFFT(int precision, unsigned int gauge_dir, int Nsteps, int verbose_interval, double alpha, unsigned int autotune, double tolerance, unsigned int stopWtheta, void *milc_sitelink)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
void qudaFreePinned(void *ptr)
#define PUSH_RANGE(name, cid)
void qudaEigCGCloverInvert(int external_precision, int quda_precision, double kappa, double clover_coeff, QudaInvertArgs_t inv_args, double target_residual, double target_fermilab_residual, const void *link, void *clover, void *cloverInverse, void *source, void *solution, QudaEigArgs_t eig_args, const int rhs_idx, const int last_rhs_flag, double *const final_residual, double *const final_fermilab_residual, int *num_iters)
void qudaDslash(int external_precision, int quda_precision, QudaInvertArgs_t inv_args, const void *const fatlink, const void *const longlink, void *src, void *dst, int *num_iters)
void * qudaAllocatePinned(size_t bytes)
void qudaHisqParamsInit(QudaHisqParams_t params)
void qudaUpdateUPhased(int prec, double eps, QudaMILCSiteArg_t *arg, int phase_in)
void qudaAsqtadForce(int prec, const double act_path_coeff[6], const void *const one_link_src[4], const void *const naik_src[4], const void *const link, void *const milc_momentum)
void qudaUpdateU(int prec, double eps, QudaMILCSiteArg_t *arg)
void qudaEigCGInvert(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args, double target_residual, double target_fermilab_residual, const void *const fatlink, const void *const longlink, void *source, void *solution, QudaEigArgs_t eig_args, const int rhs_idx, const int last_rhs_flag, double *const final_residual, double *const final_fermilab_residual, int *num_iters)
void qudaLoadUnitarizedLink(int prec, QudaFatLinkArgs_t fatlink_args, const double act_path_coeff[6], void *inlink, void *fatlink, void *ulink)
void qudaCloverInvert(int external_precision, int quda_precision, double kappa, double clover_coeff, QudaInvertArgs_t inv_args, double target_residual, double target_fermilab_residual, const void *link, void *clover, void *cloverInverse, void *source, void *solution, double *const final_residual, double *const final_fermilab_residual, int *num_iters)
void qudaGaugeForcePhased(int precision, int num_loop_types, double milc_loop_coeff[3], double eb3, QudaMILCSiteArg_t *arg, int phase_in)
void qudaCloverForce(void *mom, double dt, void **x, void **p, double *coeff, double kappa, double ck, int nvec, double multiplicity, void *gauge, int precision, QudaInvertArgs_t inv_args)
void qudaDestroyGaugeField(void *gauge)
void qudaSaveGaugeField(void *gauge, void *inGauge)
void qudaFreeCloverField()
void qudaInvertMG(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args, double target_residual, double target_fermilab_residual, const void *const fatlink, const void *const longlink, void *mg_pack_ptr, int mg_rebuild_type, void *source, void *solution, double *const final_residual, double *const final_fermilab_residual, int *num_iters)
void qudaUpdateUPhasedPipeline(int prec, double eps, QudaMILCSiteArg_t *arg, int phase_in, int want_gaugepipe)
void qudaLoadKSLink(int prec, QudaFatLinkArgs_t fatlink_args, const double act_path_coeff[6], void *inlink, void *fatlink, void *longlink)
void * qudaCreateGaugeField(void *gauge, int geometry, int precision)
void qudaInvert(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args, double target_residual, double target_fermilab_residual, const void *const fatlink, const void *const longlink, void *source, void *solution, double *const final_residual, double *const final_fermilab_residual, int *num_iters)
#define MAX(a, b)
void qudaCloverMultishiftInvert(int external_precision, int quda_precision, int num_offsets, double *const offset, double kappa, double clover_coeff, QudaInvertArgs_t inv_args, const double *target_residual_offset, const void *milc_link, void *milc_clover, void *milc_clover_inv, void *source, void **solutionArray, double *const final_residual, int *num_iters)
void milcSetMultigridParam(milcMultigridPack *mg_pack, QudaPrecision host_precision, QudaPrecision device_precision, QudaPrecision device_precision_sloppy, double mass, const char *const mg_param_file)
unsigned long long bytes
void start()
Start profiling.
Definition: device.cpp:226
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
Set the constant parameters for the force unitarization.
bool canReuseResidentGauge(QudaInvertParam *inv_param)
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
::std::string string
Definition: gtest-port.h:891
ColorSpinorParam csParam
Definition: pack_test.cpp:25
QudaGaugeParam param
Definition: pack_test.cpp:18
Main header file for the QUDA library.
double momActionQuda(void *momentum, QudaGaugeParam *param)
void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, QudaGaugeParam *gauge_param)
Perform the solve like @invertQuda but for multiple rhs by spliting the comm grid into sub-partitions...
void * createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param)
void destroyGaugeFieldQuda(void *gauge)
void destroyDeflationQuda(void *df_instance)
void momResidentQuda(void *mom, QudaGaugeParam *param)
int computeGaugeForceQuda(void *mom, void *sitelink, int ***input_path_buf, int *path_length, double *loop_coeff, int num_paths, int max_length, double dt, QudaGaugeParam *qudaGaugeParam)
void setMPICommHandleQuda(void *mycomm)
void * newMultigridQuda(QudaMultigridParam *param)
void computeCloverForceQuda(void *mom, double dt, void **x, void **p, double *coeff, double kappa2, double ck, int nvector, double multiplicity, void *gauge, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
QudaGaugeParam newQudaGaugeParam(void)
void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
void saveGaugeFieldQuda(void *outGauge, void *inGauge, QudaGaugeParam *param)
void freeGaugeQuda(void)
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
void initQuda(int device)
int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
QudaMultigridParam newQudaMultigridParam(void)
void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with overrelaxation with support for single and multi GPU.
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void freeCloverQuda(void)
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
QudaInvertParam newQudaInvertParam(void)
QudaEigParam newQudaEigParam(void)
void endQuda(void)
void updateMultigridQuda(void *mg_instance, QudaMultigridParam *param)
Updates the multigrid preconditioner for the new gauge / clover field.
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
void destroyMultigridQuda(void *mg_instance)
Free resources allocated by the multigrid solver.
void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
void computeHISQForceQuda(void *momentum, double dt, const double level2_coeff[6], const double fat7_coeff[6], const void *const w_link, const void *const v_link, const void *const u_link, void **quark, int num, int num_naik, double **coeff, QudaGaugeParam *param)
void * newDeflationQuda(QudaEigParam *param)
void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
QudaExtLibType deflation_ext_lib
QudaMemoryType mem_type_ritz
QudaFieldLocation location_ritz
QudaPrecision prec_ritz
QudaEigSpectrumType spectrum
Definition: quda.h:466
QudaPrecision save_prec
Definition: quda.h:528
QudaEigType eig_type
Definition: quda.h:416
int check_interval
Definition: quda.h:483
QudaBoolean import_vectors
Definition: quda.h:507
QudaBoolean use_dagger
Definition: quda.h:449
int n_ev_deflate
Definition: quda.h:477
QudaBoolean compute_svd
Definition: quda.h:456
QudaBoolean io_parity_inflate
Definition: quda.h:533
int batched_rotate
Definition: quda.h:487
QudaBoolean use_poly_acc
Definition: quda.h:419
char vec_outfile[256]
Definition: quda.h:525
QudaBoolean use_norm_op
Definition: quda.h:450
QudaPrecision cuda_prec_ritz
Definition: quda.h:510
char vec_infile[256]
Definition: quda.h:522
QudaFieldLocation location
Definition: quda.h:516
int poly_deg
Definition: quda.h:422
double a_min
Definition: quda.h:425
char QUDA_logfile[512]
Definition: quda.h:497
double tol
Definition: quda.h:479
double a_max
Definition: quda.h:426
QudaBoolean run_verify
Definition: quda.h:519
QudaBoolean require_convergence
Definition: quda.h:463
int n_ev
Definition: quda.h:469
int max_restarts
Definition: quda.h:485
QudaInvertParam * invert_param
Definition: quda.h:413
QudaMemoryType mem_type_ritz
Definition: quda.h:513
int n_kr
Definition: quda.h:471
int n_conv
Definition: quda.h:475
double tadpole_coeff
Definition: quda.h:38
QudaReconstructType reconstruct_precondition
Definition: quda.h:58
double anisotropy
Definition: quda.h:37
size_t mom_offset
Definition: quda.h:90
QudaReconstructType reconstruct
Definition: quda.h:49
QudaPrecision cuda_prec_precondition
Definition: quda.h:57
int ga_pad
Definition: quda.h:65
QudaLinkType type
Definition: quda.h:41
double scale
Definition: quda.h:39
int make_resident_mom
Definition: quda.h:85
int return_result_mom
Definition: quda.h:87
size_t gauge_offset
Definition: quda.h:89
int use_resident_gauge
Definition: quda.h:82
QudaPrecision cuda_prec_refinement_sloppy
Definition: quda.h:54
QudaPrecision cuda_prec_sloppy
Definition: quda.h:51
QudaReconstructType reconstruct_sloppy
Definition: quda.h:52
QudaGaugeFixed gauge_fix
Definition: quda.h:63
QudaGaugeFieldOrder gauge_order
Definition: quda.h:42
int make_resident_gauge
Definition: quda.h:84
double i_mu
Definition: quda.h:76
QudaPrecision cuda_prec
Definition: quda.h:48
size_t site_size
Definition: quda.h:91
QudaStaggeredPhase staggered_phase_type
Definition: quda.h:73
int X[4]
Definition: quda.h:35
QudaPrecision cpu_prec
Definition: quda.h:46
int use_resident_mom
Definition: quda.h:83
QudaReconstructType reconstruct_refinement_sloppy
Definition: quda.h:55
int staggered_phase_applied
Definition: quda.h:74
QudaTboundary t_boundary
Definition: quda.h:44
int return_result_gauge
Definition: quda.h:86
int overwrite_mom
Definition: quda.h:80
QudaLayout_t layout
QudaVerbosity verbosity
QudaInverterType solver_type
int use_sloppy_partial_accumulator
Definition: quda.h:149
QudaSolveType solve_type
Definition: quda.h:229
double gflops
Definition: quda.h:277
QudaPrecision cuda_prec_refinement_sloppy
Definition: quda.h:240
int maxiter_precondition
Definition: quda.h:320
QudaSolutionType solution_type
Definition: quda.h:228
double tol_hq
Definition: quda.h:140
double tol_precondition
Definition: quda.h:317
QudaCloverFieldOrder clover_order
Definition: quda.h:256
int compute_true_res
Definition: quda.h:142
double eigenval_tol
Definition: quda.h:366
int chrono_make_resident
Definition: quda.h:381
QudaMassNormalization mass_normalization
Definition: quda.h:232
QudaPreserveSource preserve_source
Definition: quda.h:235
double tol_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:206
int chrono_use_resident
Definition: quda.h:387
double tol_restart
Definition: quda.h:139
QudaResidualType residual_type
Definition: quda.h:348
double true_res
Definition: quda.h:143
int heavy_quark_check
Definition: quda.h:182
double inc_tol
Definition: quda.h:372
int make_resident_solution
Definition: quda.h:375
int num_offset
Definition: quda.h:186
double mass
Definition: quda.h:109
QudaPrecision clover_cuda_prec
Definition: quda.h:250
int max_search_dim
Definition: quda.h:360
double reliable_delta_refinement
Definition: quda.h:147
int eigcg_max_restarts
Definition: quda.h:368
QudaPrecision clover_cpu_prec
Definition: quda.h:249
QudaPrecision cuda_prec_ritz
Definition: quda.h:352
int max_hq_res_restart_total
Definition: quda.h:179
QudaDslashType dslash_type
Definition: quda.h:106
int compute_clover_trlog
Definition: quda.h:263
QudaPrecision clover_cuda_prec_precondition
Definition: quda.h:253
int max_hq_res_increase
Definition: quda.h:174
QudaPrecision cuda_prec
Definition: quda.h:238
QudaVerbosity verbosity
Definition: quda.h:271
double true_res_hq
Definition: quda.h:144
double clover_coeff
Definition: quda.h:260
double trlogA[2]
Definition: quda.h:264
int gcrNkrylov
Definition: quda.h:286
double reliable_delta
Definition: quda.h:146
QudaVerbosity verbosity_precondition
Definition: quda.h:314
double offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:200
QudaDagType dagger
Definition: quda.h:231
QudaInverterType inv_type
Definition: quda.h:107
int max_restart_num
Definition: quda.h:370
int compute_action
Definition: quda.h:221
QudaPrecision cpu_prec
Definition: quda.h:237
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:209
int chrono_max_dim
Definition: quda.h:390
int deflation_grid
Definition: quda.h:364
QudaMatPCType matpc_type
Definition: quda.h:230
QudaPrecision clover_cuda_prec_sloppy
Definition: quda.h:251
double tol_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:203
QudaInverterType inv_type_precondition
Definition: quda.h:297
QudaGammaBasis gamma_basis
Definition: quda.h:246
double tol
Definition: quda.h:138
QudaPrecision cuda_prec_sloppy
Definition: quda.h:239
QudaFieldLocation input_location
Definition: quda.h:103
QudaFieldLocation output_location
Definition: quda.h:104
void * deflation_op
Definition: quda.h:303
QudaPrecision cuda_prec_precondition
Definition: quda.h:241
int use_resident_solution
Definition: quda.h:378
QudaDiracFieldOrder dirac_order
Definition: quda.h:244
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:215
QudaUseInitGuess use_init_guess
Definition: quda.h:257
double kappa
Definition: quda.h:110
void * preconditioner
Definition: quda.h:300
const int * machsize
const int * latsize
QudaBoolean pre_orthonormalize
Definition: quda.h:607
int smoother_schwarz_cycle[QUDA_MAX_MG_LEVEL]
Definition: quda.h:655
QudaBoolean run_verify
Definition: quda.h:691
double coarse_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL]
Definition: quda.h:628
double setup_ca_lambda_max[QUDA_MAX_MG_LEVEL]
Definition: quda.h:601
QudaSolutionType coarse_grid_solution_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:659
double coarse_solver_tol[QUDA_MAX_MG_LEVEL]
Definition: quda.h:616
QudaMultigridCycleType cycle_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:665
double omega[QUDA_MAX_MG_LEVEL]
Definition: quda.h:646
QudaBoolean thin_update_only
Definition: quda.h:733
int setup_maxiter_refresh[QUDA_MAX_MG_LEVEL]
Definition: quda.h:589
QudaPrecision precision_null[QUDA_MAX_MG_LEVEL]
Definition: quda.h:568
QudaBoolean use_mma
Definition: quda.h:730
char vec_infile[QUDA_MAX_MG_LEVEL][256]
Definition: quda.h:703
int nu_post[QUDA_MAX_MG_LEVEL]
Definition: quda.h:643
QudaBoolean use_eig_solver[QUDA_MAX_MG_LEVEL]
Definition: quda.h:677
int coarse_solver_maxiter[QUDA_MAX_MG_LEVEL]
Definition: quda.h:619
QudaEigParam * eig_param[QUDA_MAX_MG_LEVEL]
Definition: quda.h:553
int n_vec[QUDA_MAX_MG_LEVEL]
Definition: quda.h:565
int geo_block_size[QUDA_MAX_MG_LEVEL][QUDA_MAX_DIM]
Definition: quda.h:559
QudaInverterType coarse_solver[QUDA_MAX_MG_LEVEL]
Definition: quda.h:613
QudaBoolean coarse_guess
Definition: quda.h:712
int coarse_solver_ca_basis_size[QUDA_MAX_MG_LEVEL]
Definition: quda.h:625
QudaTransferType transfer_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:727
QudaFieldLocation setup_location[QUDA_MAX_MG_LEVEL]
Definition: quda.h:674
QudaBoolean post_orthonormalize
Definition: quda.h:610
int num_setup_iter[QUDA_MAX_MG_LEVEL]
Definition: quda.h:580
double mu_factor[QUDA_MAX_MG_LEVEL]
Definition: quda.h:724
QudaCABasis setup_ca_basis[QUDA_MAX_MG_LEVEL]
Definition: quda.h:592
QudaInverterType setup_inv_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:577
QudaInverterType smoother[QUDA_MAX_MG_LEVEL]
Definition: quda.h:634
int setup_maxiter[QUDA_MAX_MG_LEVEL]
Definition: quda.h:586
int setup_ca_basis_size[QUDA_MAX_MG_LEVEL]
Definition: quda.h:595
QudaBoolean preserve_deflation
Definition: quda.h:715
double coarse_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL]
Definition: quda.h:631
char vec_outfile[QUDA_MAX_MG_LEVEL][256]
Definition: quda.h:709
QudaBoolean run_low_mode_check
Definition: quda.h:694
double setup_ca_lambda_min[QUDA_MAX_MG_LEVEL]
Definition: quda.h:598
double smoother_tol[QUDA_MAX_MG_LEVEL]
Definition: quda.h:637
QudaCABasis coarse_solver_ca_basis[QUDA_MAX_MG_LEVEL]
Definition: quda.h:622
int nu_pre[QUDA_MAX_MG_LEVEL]
Definition: quda.h:640
QudaSetupType setup_type
Definition: quda.h:604
QudaBoolean vec_store[QUDA_MAX_MG_LEVEL]
Definition: quda.h:706
QudaSolveType smoother_solve_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:662
QudaFieldLocation location[QUDA_MAX_MG_LEVEL]
Definition: quda.h:671
QudaVerbosity verbosity[QUDA_MAX_MG_LEVEL]
Definition: quda.h:574
QudaBoolean setup_minimize_memory
Definition: quda.h:682
int spin_block_size[QUDA_MAX_MG_LEVEL]
Definition: quda.h:562
int n_block_ortho[QUDA_MAX_MG_LEVEL]
Definition: quda.h:571
QudaBoolean generate_all_levels
Definition: quda.h:688
QudaComputeNullVector compute_null_vector
Definition: quda.h:685
QudaSchwarzType smoother_schwarz_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:652
QudaBoolean run_oblique_proj_check
Definition: quda.h:697
QudaInvertParam * invert_param
Definition: quda.h:551
double setup_tol[QUDA_MAX_MG_LEVEL]
Definition: quda.h:583
QudaPrecision smoother_halo_precision[QUDA_MAX_MG_LEVEL]
Definition: quda.h:649
QudaBoolean global_reduction[QUDA_MAX_MG_LEVEL]
Definition: quda.h:668
QudaBoolean vec_load[QUDA_MAX_MG_LEVEL]
Definition: quda.h:700
double coarse_solver_tol[QUDA_MAX_MG_LEVEL]
QudaInverterType coarse_solver[QUDA_MAX_MG_LEVEL]
QudaInverterType getQudaInverterType(const char *name)
char mg_vec_infile[QUDA_MAX_MG_LEVEL][256]
bool update(std::vector< std::string > &input_line)
QudaVerbosity mg_verbosity[QUDA_MAX_MG_LEVEL]
double setup_maxiter[QUDA_MAX_MG_LEVEL]
QudaPrecision getQudaPrecision(const char *name)
QudaVerbosity getQudaVerbosity(const char *name)
int nvec[QUDA_MAX_MG_LEVEL]
char mg_vec_outfile[QUDA_MAX_MG_LEVEL][256]
QudaPrecision preconditioner_precision
double setup_tol[QUDA_MAX_MG_LEVEL]
int nu_post[QUDA_MAX_MG_LEVEL]
int coarse_solver_maxiter[QUDA_MAX_MG_LEVEL]
int geo_block_size[QUDA_MAX_MG_LEVEL][4]
int nu_pre[QUDA_MAX_MG_LEVEL]
QudaInverterType setup_inv[QUDA_MAX_MG_LEVEL]
QudaSolveType getQudaSolveType(const char *name)
QudaInverterType smoother_type[QUDA_MAX_MG_LEVEL]
QudaSolveType coarse_solve_type[QUDA_MAX_MG_LEVEL]
QudaEigParam mg_eig_param[QUDA_MAX_MG_LEVEL]
QudaMultigridParam mg_param
QudaInvertParam mg_inv_param
QudaPrecision preconditioner_precision
QudaReconstructType reconstruct
Definition: gauge_field.h:50
QudaTboundary t_boundary
Definition: gauge_field.h:54
#define printfQuda(...)
Definition: util_quda.h:114
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define errorQuda(...)
Definition: util_quda.h:120