QUDA  v1.1.0
A library for QCD on GPUs
interface_quda.cpp
Go to the documentation of this file.
1 #include <cmath>
2 #include <cstdio>
3 #include <cstdlib>
4 #include <cstring>
5 #include <iostream>
6 #include <sys/time.h>
7 #include <complex.h>
8 
9 #include <quda.h>
10 #include <quda_fortran.h>
11 #include <quda_internal.h>
12 #include <device.h>
13 #include <comm_quda.h>
14 #include <tune_quda.h>
15 #include <blas_quda.h>
16 #include <gauge_field.h>
17 #include <dirac_quda.h>
18 #include <dslash_quda.h>
19 #include <invert_quda.h>
20 #include <eigensolve_quda.h>
21 #include <color_spinor_field.h>
22 #include <clover_field.h>
23 #include <llfat_quda.h>
24 #include <unitarization_links.h>
25 #include <algorithm>
26 #include <staggered_oprod.h>
27 #include <ks_improved_force.h>
28 #include <ks_force_quda.h>
29 #include <random_quda.h>
30 #include <mpi_comm_handle.h>
31 
32 #include <multigrid.h>
33 #include <deflation.h>
34 
35 #include <split_grid.h>
36 
37 #include <ks_force_quda.h>
38 
39 #ifdef GPU_GAUGE_FORCE
40 #include <gauge_force_quda.h>
41 #endif
42 #include <gauge_update_quda.h>
43 
44 #define MAX(a,b) ((a)>(b)? (a):(b))
45 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
46 
47 // define newQudaGaugeParam() and newQudaInvertParam()
48 #define INIT_PARAM
49 #include "check_params.h"
50 #undef INIT_PARAM
51 
52 // define (static) checkGaugeParam() and checkInvertParam()
53 #define CHECK_PARAM
54 #include "check_params.h"
55 #undef CHECK_PARAM
57 
58 // define printQudaGaugeParam() and printQudaInvertParam()
59 #define PRINT_PARAM
60 #include "check_params.h"
61 #undef PRINT_PARAM
62 
63 #include <gauge_tools.h>
64 #include <contract_quda.h>
65 #include <momentum.h>
66 
67 using namespace quda;
68 
69 static int R[4] = {0, 0, 0, 0};
70 // setting this to false prevents redundant halo exchange but isn't yet compatible with HISQ / ASQTAD kernels
71 static bool redundant_comms = false;
72 
73 #include <blas_lapack.h>
74 
75 //for MAGMA lib:
76 #include <blas_magma.h>
77 
78 static bool InitMagma = false;
79 
80 void openMagma() {
81 
82  if (!InitMagma) {
83  OpenMagma();
84  InitMagma = true;
85  } else {
86  printfQuda("\nMAGMA library was already initialized..\n");
87  }
88 
89 }
90 
91 void closeMagma(){
92 
93  if (InitMagma) {
94  CloseMagma();
95  InitMagma = false;
96  } else {
97  printfQuda("\nMAGMA library was not initialized..\n");
98  }
99 
100 }
101 
108 
115 
122 
124 
130 
133 
134 std::vector<cudaColorSpinorField*> solutionResident;
135 
136 // vector of spinors used for forecasting solutions in HMC
137 #define QUDA_MAX_CHRONO 12
138 // each entry is one p
139 std::vector< std::vector<ColorSpinorField*> > chronoResident(QUDA_MAX_CHRONO);
140 
141 // Mapped memory buffer used to hold unitarization failures
142 static int *num_failures_h = nullptr;
143 static int *num_failures_d = nullptr;
144 
145 static bool initialized = false;
146 
148 static TimeProfile profileInit("initQuda");
149 
151 static TimeProfile profileGauge("loadGaugeQuda");
152 
154 static TimeProfile profileClover("loadCloverQuda");
155 
157 static TimeProfile profileDslash("dslashQuda");
158 
160 static TimeProfile profileInvert("invertQuda");
161 
163 static TimeProfile profileInvertMultiSrc("invertMultiSrcQuda");
164 
166 static TimeProfile profileMulti("invertMultiShiftQuda");
167 
169 static TimeProfile profileEigensolve("eigensolveQuda");
170 
172 static TimeProfile profileFatLink("computeKSLinkQuda");
173 
175 static TimeProfile profileGaugeForce("computeGaugeForceQuda");
176 
178 static TimeProfile profileGaugeUpdate("updateGaugeFieldQuda");
179 
181 static TimeProfile profileExtendedGauge("createExtendedGaugeField");
182 
184 static TimeProfile profileCloverForce("computeCloverForceQuda");
185 
187 static TimeProfile profileStaggeredForce("computeStaggeredForceQuda");
188 
190 static TimeProfile profileHISQForce("computeHISQForceQuda");
191 
193 static TimeProfile profilePlaq("plaqQuda");
194 
196 static TimeProfile profileWuppertal("wuppertalQuda");
197 
199 static TimeProfile profileGauss("gaussQuda");
200 
202 static TimeProfile profileGaugeObs("gaugeObservablesQuda");
203 
205 static TimeProfile profileAPE("APEQuda");
206 
208 static TimeProfile profileSTOUT("STOUTQuda");
209 
211 static TimeProfile profileOvrImpSTOUT("OvrImpSTOUTQuda");
212 
214 static TimeProfile profileWFlow("wFlowQuda");
215 
217 static TimeProfile profileProject("projectSU3Quda");
218 
220 static TimeProfile profilePhase("staggeredPhaseQuda");
221 
223 static TimeProfile profileContract("contractQuda");
224 
226 static TimeProfile profileBLAS("blasQuda");
227 TimeProfile &getProfileBLAS() { return profileBLAS; }
228 
230 static TimeProfile profileCovDev("covDevQuda");
231 
233 static TimeProfile profileMomAction("momActionQuda");
234 
236 static TimeProfile profileEnd("endQuda");
237 
239 static TimeProfile GaugeFixFFTQuda("GaugeFixFFTQuda");
240 static TimeProfile GaugeFixOVRQuda("GaugeFixOVRQuda");
241 
243 static TimeProfile profileInit2End("initQuda-endQuda",false);
244 
245 static bool enable_profiler = false;
246 static bool do_not_profile_quda = false;
247 
// Selectively enable external profiling for the solve identified by a global
// call counter.  Targets are read once from QUDA_ENABLE_TARGET_PROFILE (a
// comma-separated list of solve indices); QUDA_DO_NOT_PROFILE disables
// profiling of QUDA internals entirely.
// NOTE(review): this is a Doxygen source dump — original lines 284 and 291
// are missing here (presumably the actual profiler start/stop API calls);
// the visible code is incomplete.
248 static void profilerStart(const char *f)
249 {
250  static std::vector<int> target_list;
251  static bool enable = false;
252  static bool init = false;
253  if (!init) {
254  char *profile_target_env = getenv("QUDA_ENABLE_TARGET_PROFILE"); // selectively enable profiling for a given solve
255 
256  if ( profile_target_env ) {
257  std::stringstream target_stream(profile_target_env);
258 
259  int target;
260  while(target_stream >> target) {
261  target_list.push_back(target);
262  if (target_stream.peek() == ',') target_stream.ignore();
263  }
264 
// sort and deduplicate so the monotonically increasing call counter can be
// matched against target_list with a single forward index i
265  if (target_list.size() > 0) {
266  std::sort(target_list.begin(), target_list.end());
267  target_list.erase( unique( target_list.begin(), target_list.end() ), target_list.end() );
268  warningQuda("Targeted profiling enabled for %lu functions\n", target_list.size());
269  enable = true;
270  }
271  }
272 
// any value other than "0" (or unset) disables QUDA-internal profiling
273  char* donotprofile_env = getenv("QUDA_DO_NOT_PROFILE"); // disable profiling of QUDA parts
274  if (donotprofile_env && (!(strcmp(donotprofile_env, "0") == 0))) {
275  do_not_profile_quda=true;
276  printfQuda("Disabling profiling in QUDA\n");
277  }
278  init = true;
279  }
280 
// target_count counts every profiled entry point ever reached; i walks the
// sorted target list
281  static int target_count = 0;
282  static unsigned int i = 0;
283  if (do_not_profile_quda){
285  printfQuda("Stopping profiling in QUDA\n");
286  } else {
287  if (enable) {
288  if (i < target_list.size() && target_count++ == target_list[i]) {
289  enable_profiler = true;
290  printfQuda("Starting profiling for %s\n", f);
292  i++; // advance to next target
293  }
294  }
295 }
296 }
297 
// Companion to profilerStart(): clears the enable_profiler flag at the end of
// a targeted solve.  NOTE(review): original lines 300 and 305 are missing in
// this dump (presumably the profiler stop API calls) — code is incomplete.
298 static void profilerStop(const char *f) {
299  if (do_not_profile_quda) {
301  } else {
302 
303  if (enable_profiler) {
304  printfQuda("Stopping profiling for %s\n", f);
306  enable_profiler = false;
307  }
308  }
309 }
310 
311 
312 namespace quda {
313  void printLaunchTimer();
314 }
315 
// Configure QUDA's output: message prefix and destination stream.
// NOTE(review): original line 318 is missing from this dump — presumably
// setVerbosity(verbosity), since the verbosity argument is otherwise unused
// here; confirm against the upstream source.
316 void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
317 {
319  setOutputPrefix(prefix);
320  setOutputFile(outfile);
321 }
322 
323 
// User data handed to lex_rank_from_coords(): the dimensionality and extents
// of the communications grid used for lexicographical rank assignment.
324 typedef struct {
325  int ndim;
326  int dims[QUDA_MAX_DIM];
327 } LexMapData;
328 
332 static int lex_rank_from_coords(const int *coords, void *fdata)
333 {
334  auto *md = static_cast<LexMapData *>(fdata);
335 
336  int rank = coords[0];
337  for (int i = 1; i < md->ndim; i++) {
338  rank = md->dims[i] * rank + coords[i];
339  }
340  return rank;
341 }
342 
343 #ifdef QMP_COMMS
// Rank-from-coordinates callback that defers to QMP's declared logical
// topology (fdata is unused).  Only compiled in QMP builds.
347 static int qmp_rank_from_coords(const int *coords, void *fdata)
348 {
349  return QMP_get_node_number_from(coords);
350 }
351 #endif
352 
353 // Provision for user control over MPI comm handle
354 // Assumes an MPI implementation of QMP
355 
356 #if defined(QMP_COMMS) || defined(MPI_COMMS)
357 MPI_Comm MPI_COMM_HANDLE_USER;
358 static bool user_set_comm_handle = false;
359 #endif
360 
// Record a user-supplied MPI communicator for QUDA to use instead of
// MPI_COMM_WORLD.  mycomm must point at a valid MPI_Comm; in builds without
// QMP/MPI communications this call is a no-op.
void setMPICommHandleQuda(void *mycomm)
{
#if defined(QMP_COMMS) || defined(MPI_COMMS)
  MPI_COMM_HANDLE_USER = *static_cast<MPI_Comm *>(mycomm);
  user_set_comm_handle = true;
#endif
}
368 
369 static bool comms_initialized = false;
370 
// Declare the 4-d communications grid and initialize comms.  If no mapping
// callback is supplied, use QMP's declared topology when available, otherwise
// fall back to lexicographical ordering via lex_rank_from_coords().
// NOTE(review): the `if`/`else` opened inside the first `#if QMP_COMMS`
// region is deliberately closed inside the second — the preprocessor
// structure is load-bearing; do not reformat mechanically.
371 void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
372 {
373  if (comms_initialized) return;
374 
375  if (nDim != 4) {
376  errorQuda("Number of communication grid dimensions must be 4");
377  }
378 
// map_data must outlive the comm_init() call below, hence function scope
379  LexMapData map_data;
380  if (!func) {
381 
382 #if QMP_COMMS
383  if (QMP_logical_topology_is_declared()) {
384  if (QMP_get_logical_number_of_dimensions() != 4) {
385  errorQuda("QMP logical topology must have 4 dimensions");
386  }
387  for (int i=0; i<nDim; i++) {
388  int qdim = QMP_get_logical_dimensions()[i];
389  if(qdim != dims[i]) {
390  errorQuda("QMP logical dims[%d]=%d does not match dims[%d]=%d argument", i, qdim, i, dims[i]);
391  }
392  }
393  fdata = nullptr;
394  func = qmp_rank_from_coords;
395  } else {
396  warningQuda("QMP logical topology is undeclared; using default lexicographical ordering");
397 #endif
398 
// default: lexicographical rank assignment over the user-supplied dims
399  map_data.ndim = nDim;
400  for (int i=0; i<nDim; i++) {
401  map_data.dims[i] = dims[i];
402  }
403  fdata = (void *) &map_data;
404  func = lex_rank_from_coords;
405 
406 #if QMP_COMMS
407  }
408 #endif
409 
410  }
411 
412 #if defined(QMP_COMMS) || defined(MPI_COMMS)
413  comm_init(nDim, dims, func, fdata, user_set_comm_handle, (void *)&MPI_COMM_HANDLE_USER);
414 #else
415  comm_init(nDim, dims, func, fdata);
416 #endif
417 
418  comms_initialized = true;
419 }
420 
421 
// Initialize the comms grid when the user never called initCommsGridQuda():
// QMP builds can recover the topology from QMP; pure-MPI builds cannot and
// error out; single-process builds use a trivial 1x1x1x1 grid.
422 static void init_default_comms()
423 {
424 #if defined(QMP_COMMS)
425  if (QMP_logical_topology_is_declared()) {
426  int ndim = QMP_get_logical_number_of_dimensions();
427  const int *dims = QMP_get_logical_dimensions();
428  initCommsGridQuda(ndim, dims, nullptr, nullptr);
429  } else {
430  errorQuda("initQuda() called without prior call to initCommsGridQuda(),"
431  " and QMP logical topology has not been declared");
432  }
433 #elif defined(MPI_COMMS)
434  errorQuda("When using MPI for communications, initCommsGridQuda() must be called before initQuda()");
435 #else // single-GPU
436  const int dims[4] = {1, 1, 1, 1};
437  initCommsGridQuda(4, dims, nullptr, nullptr);
438 #endif
439 }
440 
441 
442 #define STR_(x) #x
443 #define STR(x) STR_(x)
444  static const std::string quda_version = STR(QUDA_VERSION_MAJOR) "." STR(QUDA_VERSION_MINOR) "." STR(QUDA_VERSION_SUBMINOR);
445 #undef STR
446 #undef STR_
447 
448 extern char* gitversion;
449 
450 /*
451  * Set the device that QUDA uses.
452  */
// Select and initialize the device QUDA will use.  Idempotent: only the first
// call has any effect.  A negative ordinal in multi-GPU builds defers the
// choice to the comms layer (comm_gpuid()), which requires comms to already
// be initialized.  NOTE(review): original lines 489 and 492 are missing from
// this dump (presumably the calls that record the chosen reorder location).
453 void initQudaDevice(int dev)
454 {
455  //static bool initialized = false;
456  if (initialized) return;
457  initialized = true;
458 
459  profileInit2End.TPSTART(QUDA_PROFILE_TOTAL);
460  profileInit.TPSTART(QUDA_PROFILE_TOTAL);
461  profileInit.TPSTART(QUDA_PROFILE_INIT);
462 
463  if (getVerbosity() >= QUDA_SUMMARIZE) {
464 #ifdef GITVERSION
465  printfQuda("QUDA %s (git %s)\n",quda_version.c_str(),gitversion);
466 #else
467  printfQuda("QUDA %s\n",quda_version.c_str());
468 #endif
469  }
470 
471 #ifdef MULTI_GPU
472  if (dev < 0) {
473  if (!comms_initialized) {
474  errorQuda("initDeviceQuda() called with a negative device ordinal, but comms have not been initialized");
475  }
476  dev = comm_gpuid();
477  }
478 #else
479  if (dev < 0 || dev >= 16) errorQuda("Invalid device number %d", dev);
480 #endif
481 
482  device::init(dev);
483 
484  { // determine if we will do CPU or GPU data reordering (default is GPU)
485  char *reorder_str = getenv("QUDA_REORDER_LOCATION");
486 
487  if (!reorder_str || (strcmp(reorder_str,"CPU") && strcmp(reorder_str,"cpu")) ) {
488  warningQuda("Data reordering done on GPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
490  } else {
491  warningQuda("Data reordering done on CPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
493  }
494  }
495 
496  profileInit.TPSTOP(QUDA_PROFILE_INIT);
497  profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
498 }
499 
500 /*
501  * Any persistent memory allocations that QUDA uses are done here.
502  */
// NOTE(review): the function signature (original line 503, presumably
// `void initQudaMemory()`) is missing from this dump — the body below
// belongs to that function; confirm against the upstream source.
504 {
505  profileInit.TPSTART(QUDA_PROFILE_TOTAL);
506  profileInit.TPSTART(QUDA_PROFILE_INIT);
507 
508  if (!comms_initialized) init_default_comms();
511 
512  loadTuneCache();
513 
514  // initalize the memory pool allocators
515  pool::init();
516 
518 
520  blas::init();
521 
// num_failures_h/_d alias the same mapped allocation: the host writes are
// visible through the device pointer, used to count unitarization failures
522  num_failures_h = static_cast<int *>(mapped_malloc(sizeof(int)));
523  num_failures_d = static_cast<int *>(get_mapped_device_pointer(num_failures_h));
524 
// halo-extension widths: 2 in each direction that is comm-partitioned
// (or in all directions when redundant_comms is set)
525  for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
526 
527  profileInit.TPSTOP(QUDA_PROFILE_INIT);
528  profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
529 }
529 }
530 
531 void updateR()
532 {
533  for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
534 }
535 
536 void initQuda(int dev)
537 {
538  // initialize communications topology, if not already done explicitly via initCommsGridQuda()
539  if (!comms_initialized) init_default_comms();
540 
541  // set the device that QUDA uses
542  initQudaDevice(dev);
543 
544  // set the persistant memory allocations that QUDA uses (Blas, streams, etc.)
545  initQudaMemory();
546 }
547 
548 // This is a flag used to signal when we have downloaded new gauge
549 // field. Set by loadGaugeQuda and consumed by loadCloverQuda as one
550 // possible flag to indicate we need to recompute the clover field
551 static bool invalidate_clover = true;
552 
// Download a host gauge field into QUDA's resident device fields, creating
// the precise / sloppy / preconditioner / refinement / eigensolver mirrors
// (aliasing rather than copying when precisions coincide) and, if requested,
// extended overlap fields.
// NOTE(review): this Doxygen dump omits many original lines (558, 567,
// 590–621, 623, 625, 628–629, 632–633, 636, 638, 652–653, 692, 695, 703,
// 706, 708–709, 717, 720–721, 729, 732, 734–735, 737–738, 765, 777, 799)
// — several if/ternary conditions and case labels are missing below; the
// visible code is incomplete and must be checked against upstream.
553 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
554 {
555  profileGauge.TPSTART(QUDA_PROFILE_TOTAL);
556 
557  if (!initialized) errorQuda("QUDA not initialized");
559 
560  checkGaugeParam(param);
561 
562  profileGauge.TPSTART(QUDA_PROFILE_INIT);
563  // Set the specific input parameters and create the cpu gauge field
564  GaugeFieldParam gauge_param(h_gauge, *param);
565 
566  if (gauge_param.order <= 4) gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
// NOTE(review): the head of this ternary (original line 567, assigning `in`)
// is missing from the dump
568  static_cast<GaugeField*>(new cpuGaugeField(gauge_param)) :
569  static_cast<GaugeField*>(new cudaGaugeField(gauge_param));
570 
// BQCD-order fields are checksummed so an unchanged field short-circuits the
// (expensive) reload and leaves any cached clover field valid
571  if (in->Order() == QUDA_BQCD_GAUGE_ORDER) {
572  static size_t checksum = SIZE_MAX;
573  size_t in_checksum = in->checksum(true);
574  if (in_checksum == checksum) {
575  if (getVerbosity() >= QUDA_VERBOSE)
576  printfQuda("Gauge field unchanged - using cached gauge field %lu\n", checksum);
577  profileGauge.TPSTOP(QUDA_PROFILE_INIT);
578  profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
579  delete in;
580  invalidate_clover = false;
581  return;
582  }
583  checksum = in_checksum;
584  invalidate_clover = true;
585  }
586 
587  // free any current gauge field before new allocations to reduce memory overhead
588  switch (param->type) {
589  case QUDA_WILSON_LINKS:
591  delete gaugeRefinement;
592 
595  delete gaugePrecondition;
596 
598  && gaugeEigensolver)
599  delete gaugeEigensolver;
600 
602 
604 
605  break;
// NOTE(review): the QUDA_ASQTAD_FAT_LINKS case label and its aliasing guards
// are among the lines missing from the dump here
608  delete gaugeFatRefinement;
609 
612  delete gaugeFatPrecondition;
613 
616  delete gaugeFatEigensolver;
617 
619 
621 
622  break;
624 
626  delete gaugeLongRefinement;
627 
630  delete gaugeLongPrecondition;
631 
634  delete gaugeLongEigensolver;
635 
637 
639 
640  break;
641  case QUDA_SMEARED_LINKS:
642  if (gaugeSmeared) delete gaugeSmeared;
643  break;
644  default:
645  errorQuda("Invalid gauge type %d", param->type);
646  }
647 
648  // if not preserving then copy the gauge field passed in
649  cudaGaugeField *precise = nullptr;
650 
651  // switch the parameters for creating the mirror precise cuda gauge field
654  gauge_param.setPrecision(param->cuda_prec, true);
655  gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
656  gauge_param.pad = param->ga_pad;
657 
658  precise = new cudaGaugeField(gauge_param);
659 
660  if (param->use_resident_gauge) {
661  if(gaugePrecise == nullptr) errorQuda("No resident gauge field");
662  // copy rather than point at to ensure that the padded region is filled in
663  precise->copy(*gaugePrecise);
664  precise->exchangeGhost();
665  delete gaugePrecise;
666  gaugePrecise = nullptr;
667  profileGauge.TPSTOP(QUDA_PROFILE_INIT);
668  } else {
669  profileGauge.TPSTOP(QUDA_PROFILE_INIT);
670  profileGauge.TPSTART(QUDA_PROFILE_H2D);
671  precise->copy(*in);
672  profileGauge.TPSTOP(QUDA_PROFILE_H2D);
673  }
674 
675  // for gaugeSmeared we are interested only in the precise version
676  if (param->type == QUDA_SMEARED_LINKS) {
677  gaugeSmeared = createExtendedGauge(*precise, R, profileGauge);
678 
679  profileGauge.TPSTART(QUDA_PROFILE_FREE);
680  delete precise;
681  delete in;
682  profileGauge.TPSTOP(QUDA_PROFILE_FREE);
683 
684  profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
685  return;
686  }
687 
688  // creating sloppy fields isn't really compute, but it is work done on the gpu
689  profileGauge.TPSTART(QUDA_PROFILE_COMPUTE);
690 
691  // switch the parameters for creating the mirror sloppy cuda gauge field
693  gauge_param.setPrecision(param->cuda_prec_sloppy, true);
694  cudaGaugeField *sloppy = nullptr;
// NOTE(review): the precision-equality condition guarding this aliasing
// (original line 695) is missing from the dump
696  sloppy = precise;
697  } else {
698  sloppy = new cudaGaugeField(gauge_param);
699  sloppy->copy(*precise);
700  }
701 
702  // switch the parameters for creating the mirror preconditioner cuda gauge field
704  gauge_param.setPrecision(param->cuda_prec_precondition, true);
705  cudaGaugeField *precondition = nullptr;
707  precondition = precise;
710  precondition = sloppy;
711  } else {
712  precondition = new cudaGaugeField(gauge_param);
713  precondition->copy(*precise);
714  }
715 
716  // switch the parameters for creating the refinement cuda gauge field
718  gauge_param.setPrecision(param->cuda_prec_refinement_sloppy, true);
719  cudaGaugeField *refinement = nullptr;
722  refinement = sloppy;
723  } else {
724  refinement = new cudaGaugeField(gauge_param);
725  refinement->copy(*sloppy);
726  }
727 
728  // switch the parameters for creating the eigensolver cuda gauge field
730  gauge_param.setPrecision(param->cuda_prec_eigensolver, true);
731  cudaGaugeField *eigensolver = nullptr;
733  eigensolver = precise;
736  eigensolver = precondition;
739  eigensolver = sloppy;
740  } else {
741  eigensolver = new cudaGaugeField(gauge_param);
742  eigensolver->copy(*precise);
743  }
744 
745  profileGauge.TPSTOP(QUDA_PROFILE_COMPUTE);
746 
747  // create an extended preconditioning field
748  cudaGaugeField* extended = nullptr;
749  if (param->overlap){
750  int R[4]; // domain-overlap widths in different directions
751  for (int i=0; i<4; ++i) R[i] = param->overlap*commDimPartitioned(i);
752  extended = createExtendedGauge(*precondition, R, profileGauge);
753  }
754 
// publish the new fields into the global resident pointers for this link type
755  switch (param->type) {
756  case QUDA_WILSON_LINKS:
757  gaugePrecise = precise;
758  gaugeSloppy = sloppy;
759  gaugePrecondition = precondition;
760  gaugeRefinement = refinement;
761  gaugeEigensolver = eigensolver;
762 
763  if(param->overlap) gaugeExtended = extended;
764  break;
766  gaugeFatPrecise = precise;
767  gaugeFatSloppy = sloppy;
768  gaugeFatPrecondition = precondition;
769  gaugeFatRefinement = refinement;
770  gaugeFatEigensolver = eigensolver;
771 
772  if(param->overlap){
773  if(gaugeFatExtended) errorQuda("Extended gauge fat field already allocated");
774  gaugeFatExtended = extended;
775  }
776  break;
778  gaugeLongPrecise = precise;
779  gaugeLongSloppy = sloppy;
780  gaugeLongPrecondition = precondition;
781  gaugeLongRefinement = refinement;
782  gaugeLongEigensolver = eigensolver;
783 
784  if(param->overlap){
785  if(gaugeLongExtended) errorQuda("Extended gauge long field already allocated");
786  gaugeLongExtended = extended;
787  }
788  break;
789  default:
790  errorQuda("Invalid gauge type %d", param->type);
791  }
792 
793  profileGauge.TPSTART(QUDA_PROFILE_FREE);
794  delete in;
795  profileGauge.TPSTOP(QUDA_PROFILE_FREE);
796 
797  if (extendedGaugeResident) {
798  // updated the resident gauge field if needed
800  delete extendedGaugeResident;
801  // Use the static R (which is defined at the very beginning of lib/interface_quda.cpp) here
802  extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileGauge, false, recon);
803  }
804 
805  profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
806 }
807 
// Copy the resident device gauge field back into host memory (CPU location
// only).  NOTE(review): this dump omits original lines 819, 822–824, 826–827,
// 831–832 and 838 — the cpu-field construction, the non-smeared case labels,
// the cudaGauge assignments and the actual D2H copy call are missing; the
// visible code is incomplete.
808 void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
809 {
810  profileGauge.TPSTART(QUDA_PROFILE_TOTAL);
811 
812  if (param->location != QUDA_CPU_FIELD_LOCATION) errorQuda("Non-cpu output location not yet supported");
813 
814  if (!initialized) errorQuda("QUDA not initialized");
815  checkGaugeParam(param);
816 
817  // Set the specific cpu parameters and create the cpu gauge field
818  GaugeFieldParam gauge_param(h_gauge, *param);
820  cudaGaugeField *cudaGauge = nullptr;
821  switch (param->type) {
825  case QUDA_SMEARED_LINKS:
828  gauge_param.setPrecision(param->cuda_prec, true);
829  gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
830  gauge_param.pad = param->ga_pad;
833  break;
834  default: errorQuda("Invalid gauge type");
835  }
836 
837  profileGauge.TPSTART(QUDA_PROFILE_D2H);
839  profileGauge.TPSTOP(QUDA_PROFILE_D2H);
840 
// only the smeared path creates a temporary device copy that must be freed
841  if (param->type == QUDA_SMEARED_LINKS) { delete cudaGauge; }
842 
843  profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
844 }
845 
847 void freeSloppyCloverQuda();
848 
// Download (or compute on-device) the clover term and/or its inverse, cache
// it in cloverPrecise, optionally invert it, and optionally copy the result
// back to the host.  A previously cached field is reused unless the gauge
// field or the clover parameters changed.
// NOTE(review): this Doxygen dump omits many original lines (857–858,
// 870–871, 877–878, 882, 885, 905, 909, 936, 946, 951–952, 965, 973–974,
// 988, 990–992, 995, 1015, 1032) — several if-conditions, the ternary head
// constructing `in`, the computeClover calls, and the loadSloppyClover call
// are missing; the visible code is incomplete.
849 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
850 {
851  profileClover.TPSTART(QUDA_PROFILE_TOTAL);
852  profileClover.TPSTART(QUDA_PROFILE_INIT);
853 
854  checkCloverParam(inv_param);
855  bool device_calc = false; // calculate clover and inverse on the device?
856 
859 
860  if (!initialized) errorQuda("QUDA not initialized");
861 
// no host pointers (or an explicit request) means we must build the clover
// term on the device from the resident gauge field
862  if ( (!h_clover && !h_clovinv) || inv_param->compute_clover ) {
863  device_calc = true;
864  if (inv_param->clover_coeff == 0.0 && inv_param->clover_csw == 0.0) errorQuda("called with neither clover term nor inverse and clover coefficient nor Csw not set");
865  if (gaugePrecise->Anisotropy() != 1.0) errorQuda("cannot compute anisotropic clover field");
866  }
867 
868  if (inv_param->clover_cpu_prec < QUDA_SINGLE_PRECISION) errorQuda("Fixed-point precision not supported on CPU");
869  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded before clover");
872  errorQuda("Wrong dslash_type %d in loadCloverQuda()", inv_param->dslash_type);
873  }
874 
875  // determines whether operator is preconditioned when calling invertQuda()
876  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE ||
879 
880  // determines whether operator is preconditioned when calling MatQuda() or MatDagMatQuda()
881  bool pc_solution = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
883 
884  bool asymmetric = (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
886 
887  // uninverted clover term is required when applying unpreconditioned operator,
888  // but note that dslashQuda() is always preconditioned
889  if (!h_clover && !pc_solve && !pc_solution) {
890  //warningQuda("Uninverted clover term not loaded");
891  }
892 
893  // uninverted clover term is also required for "asymmetric" preconditioning
894  if (!h_clover && pc_solve && pc_solution && asymmetric && !device_calc) {
895  warningQuda("Uninverted clover term not loaded");
896  }
897 
898  bool twisted = inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH ? true : false;
899 
900  CloverFieldParam clover_param;
901  clover_param.nDim = 4;
902  // If clover_coeff is not set manually, then it is the product Csw * kappa.
903  // If the user has set the clover_coeff manually, that value takes precedent.
904  clover_param.csw = inv_param->clover_csw;
906  // We must also adjust inv_param->clover_coeff here. If a user has set kappa and
907  // Csw, we must populate inv_param->clover_coeff for them as the computeClover
908  // routines uses that value
910  clover_param.twisted = twisted;
911  clover_param.mu2 = twisted ? 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu : 0.0;
912  clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;
913  for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i];
914  clover_param.pad = inv_param->cl_pad;
915  clover_param.create = QUDA_NULL_FIELD_CREATE;
916  clover_param.norm = nullptr;
917  clover_param.invNorm = nullptr;
918  clover_param.setPrecision(inv_param->clover_cuda_prec, true);
919  clover_param.direct = h_clover || device_calc ? true : false;
920  clover_param.inverse = (h_clovinv || pc_solve) && !dynamic_clover_inverse() ? true : false;
921  CloverField *in = nullptr;
922  profileClover.TPSTOP(QUDA_PROFILE_INIT);
923 
924  // FIXME do we need to make this more robust to changing other meta data (compare cloverPrecise against clover_param)
925  bool clover_update = false;
926  // If either of the clover params have changed, trigger a recompute
927  double csw_old = cloverPrecise ? cloverPrecise->Csw() : 0.0;
928  double coeff_old = cloverPrecise ? cloverPrecise->Coeff() : 0.0;
929  if (!cloverPrecise || invalidate_clover ||
930  inv_param->clover_coeff != coeff_old ||
931  inv_param->clover_csw != csw_old) clover_update = true;
932 
933  // compute or download clover field only if gauge field has been updated or clover field doesn't exist
934  if (clover_update) {
935  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating new clover field\n");
937  if (cloverPrecise) delete cloverPrecise;
938 
939  profileClover.TPSTART(QUDA_PROFILE_INIT);
940  cloverPrecise = new cudaCloverField(clover_param);
941 
942  if (!device_calc || inv_param->return_clover || inv_param->return_clover_inverse) {
943  // create a param for the cpu clover field
944  CloverFieldParam inParam(clover_param);
945  inParam.order = inv_param->clover_order;
947  inParam.direct = h_clover ? true : false;
948  inParam.inverse = h_clovinv ? true : false;
949  inParam.clover = h_clover;
950  inParam.cloverInv = h_clovinv;
// NOTE(review): the head of this ternary (original lines 951-952, assigning
// `in` based on the field location) is missing from the dump
953  static_cast<CloverField*>(new cpuCloverField(inParam)) :
954  static_cast<CloverField*>(new cudaCloverField(inParam));
955  }
956  profileClover.TPSTOP(QUDA_PROFILE_INIT);
957 
958  if (!device_calc) {
959  profileClover.TPSTART(QUDA_PROFILE_H2D);
960  bool inverse = (h_clovinv && !inv_param->compute_clover_inverse && !dynamic_clover_inverse());
961  cloverPrecise->copy(*in, inverse);
962  profileClover.TPSTOP(QUDA_PROFILE_H2D);
963  } else {
964  profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
966  profileClover.TPSTART(QUDA_PROFILE_TOTAL);
967  }
968 
969  // inverted clover term is required when applying preconditioned operator
970  if ((!h_clovinv || inv_param->compute_clover_inverse) && pc_solve) {
971  profileClover.TPSTART(QUDA_PROFILE_COMPUTE);
972  if (!dynamic_clover_inverse()) {
975  inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
976  inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
977  }
978  }
979  profileClover.TPSTOP(QUDA_PROFILE_COMPUTE);
980  }
981  } else {
982  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached clover field\n");
983  }
984 
985  clover_param.direct = true;
986  clover_param.inverse = dynamic_clover_inverse() ? false : true;
987 
989 
993 
994  // if requested, copy back the clover / inverse field
996  if (!h_clover && !h_clovinv) errorQuda("Requested clover field return but no clover host pointers set");
997 
998  // copy the inverted clover term into host application order on the device
999  clover_param.direct = (h_clover && inv_param->return_clover);
1000  clover_param.inverse = (h_clovinv && inv_param->return_clover_inverse);
1001 
1002  // this isn't really "epilogue" but this label suffices
1003  profileClover.TPSTART(QUDA_PROFILE_EPILOGUE);
1004  cudaCloverField *hack = nullptr;
1005  if (!dynamic_clover_inverse()) {
1006  clover_param.order = inv_param->clover_order;
1007  clover_param.setPrecision(inv_param->clover_cpu_prec);
1008  hack = new cudaCloverField(clover_param);
1009  hack->copy(*cloverPrecise); // FIXME this can lead to an redundant copies if we're not copying back direct + inverse
1010  } else {
// dynamic inverse: materialize the inverse explicitly before copying back
1011  clover_param.setPrecision(inv_param->clover_cuda_prec, true);
1012  auto *hackOfTheHack = new cudaCloverField(clover_param); // Hack of the hack
1013  hackOfTheHack->copy(*cloverPrecise, false);
1014  cloverInvert(*hackOfTheHack, inv_param->compute_clover_trlog);
1016  inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
1017  inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
1018  }
1019  clover_param.order = inv_param->clover_order;
1020  clover_param.setPrecision(inv_param->clover_cpu_prec);
1021  hack = new cudaCloverField(clover_param);
1022  hack->copy(*hackOfTheHack); // FIXME this can lead to an redundant copies if we're not copying back direct + inverse
1023  delete hackOfTheHack;
1024  }
1025  profileClover.TPSTOP(QUDA_PROFILE_EPILOGUE);
1026 
1027  // copy the field into the host application's clover field
1028  profileClover.TPSTART(QUDA_PROFILE_D2H);
1029  if (inv_param->return_clover) {
1030  qudaMemcpy((char*)(in->V(false)), (char*)(hack->V(false)), in->Bytes(), cudaMemcpyDeviceToHost);
1031  }
1033  qudaMemcpy((char*)(in->V(true)), (char*)(hack->V(true)), in->Bytes(), cudaMemcpyDeviceToHost);
1034  }
1035 
1036  profileClover.TPSTOP(QUDA_PROFILE_D2H);
1037 
1038  delete hack;
1039  }
1040 
1041  profileClover.TPSTART(QUDA_PROFILE_FREE);
1042  if (in) delete in; // delete object referencing input field
1043  profileClover.TPSTOP(QUDA_PROFILE_FREE);
1044 
1045  popVerbosity();
1046 
1047  profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
1048 }
1049 
1050 void freeSloppyCloverQuda();
1051 
// Create the sloppy / preconditioner / refinement / eigensolver mirrors of
// cloverPrecise at the requested precisions, aliasing an existing field when
// the precision matches rather than allocating a copy.
// NOTE(review): the function signature (original line 1052, presumably
// `void loadSloppyCloverQuda(const QudaPrecision prec[])`) and the aliasing
// assignments on lines 1073, 1081, 1083, 1097, 1104, 1106, 1108 are missing
// from this dump — the visible code is incomplete.
1053 {
1055 
1056  if (cloverPrecise) {
1057  // create the mirror sloppy clover field
1058  CloverFieldParam clover_param(*cloverPrecise);
1059  clover_param.setPrecision(prec[0], true);
1060 
// distinct direct/inverse storage implies both components must be mirrored
1061  if (cloverPrecise->V(false) != cloverPrecise->V(true)) {
1062  clover_param.direct = true;
1063  clover_param.inverse = true;
1064  } else {
1065  clover_param.direct = false;
1066  clover_param.inverse = true;
1067  }
1068 
1069  if (clover_param.Precision() != cloverPrecise->Precision()) {
1070  cloverSloppy = new cudaCloverField(clover_param);
1071  cloverSloppy->copy(*cloverPrecise, clover_param.inverse);
1072  } else {
1074  }
1075 
1076  // switch the parameters for creating the mirror preconditioner clover field
1077  clover_param.setPrecision(prec[1], true);
1078 
1079  // create the mirror preconditioner clover field
1080  if (clover_param.Precision() == cloverPrecise->Precision()) {
1082  } else if (clover_param.Precision() == cloverSloppy->Precision()) {
1084  } else {
1085  cloverPrecondition = new cudaCloverField(clover_param);
1086  cloverPrecondition->copy(*cloverPrecise, clover_param.inverse);
1087  }
1088 
1089  // switch the parameters for creating the mirror refinement clover field
1090  clover_param.setPrecision(prec[2], true);
1091 
1092  // create the mirror refinement clover field
1093  if (clover_param.Precision() != cloverSloppy->Precision()) {
1094  cloverRefinement = new cudaCloverField(clover_param);
1095  cloverRefinement->copy(*cloverSloppy, clover_param.inverse);
1096  } else {
1098  }
1099  // switch the parameters for creating the mirror eigensolver clover field
1100  clover_param.setPrecision(prec[3]);
1101 
1102  // create the mirror eigensolver clover field
1103  if (clover_param.Precision() == cloverPrecise->Precision()) {
1105  } else if (clover_param.Precision() == cloverSloppy->Precision()) {
1107  } else if (clover_param.Precision() == cloverPrecondition->Precision()) {
1109  } else {
1110  cloverEigensolver = new cudaCloverField(clover_param);
1111  cloverEigensolver->copy(*cloverPrecise, clover_param.inverse);
1112  }
1113  }
1114 
1115 }
1116 
1117 // just free the sloppy fields used in mixed-precision solvers
// Free only the reduced-precision gauge mirrors (sloppy, precondition,
// refinement, eigensolver) for the Wilson, long and fat link families,
// respecting aliasing: a pointer is deleted only when it does not alias a
// field that is kept (or freed separately), then all four slots are nulled.
// NOTE(review): the function signature (original line 1118, presumably
// `void freeSloppyGaugeQuda()`) and several guarded delete statements
// (lines 1125, 1128, 1133, 1147, 1150-1151, 1155, 1159, 1170, 1173-1174,
// 1178, 1182) are missing from this dump — the visible code is incomplete.
1119 {
1120  if (!initialized) errorQuda("QUDA not initialized");
1121 
1122  // Wilson gauges
1123  //---------------------------------------------------------------------------
1124  // Delete gaugeRefinement if it does not alias gaugeSloppy.
1126 
1127  // Delete gaugePrecondition if it does not alias gaugePrecise, gaugeSloppy, or gaugeEigensolver.
1129  && gaugePrecondition)
1130  delete gaugePrecondition;
1131 
1132  // Delete gaugeEigensolver if it does not alias gaugePrecise or gaugeSloppy.
1134 
1135  // Delete gaugeSloppy if it does not alias gaugePrecise.
1136  if (gaugeSloppy != gaugePrecise && gaugeSloppy) delete gaugeSloppy;
1137 
1138  gaugeEigensolver = nullptr;
1139  gaugeRefinement = nullptr;
1140  gaugePrecondition = nullptr;
1141  gaugeSloppy = nullptr;
1142  //---------------------------------------------------------------------------
1143 
1144  // Long gauges
1145  //---------------------------------------------------------------------------
1146  // Delete gaugeLongRefinement if it does not alias gaugeLongSloppy.
1148 
1149  // Delete gaugeLongPrecondition if it does not alias gaugeLongPrecise, gaugeLongSloppy, or gaugeLongEigensolver.
1152  delete gaugeLongPrecondition;
1153 
1154  // Delete gaugeLongEigensolver if it does not alias gaugeLongPrecise or gaugeLongSloppy.
1156  delete gaugeLongEigensolver;
1157 
1158  // Delete gaugeLongSloppy if it does not alias gaugeLongPrecise.
1160 
1161  gaugeLongEigensolver = nullptr;
1162  gaugeLongRefinement = nullptr;
1163  gaugeLongPrecondition = nullptr;
1164  gaugeLongSloppy = nullptr;
1165  //---------------------------------------------------------------------------
1166 
1167  // Fat gauges
1168  //---------------------------------------------------------------------------
1169  // Delete gaugeFatRefinement if it does not alias gaugeFatSloppy.
1171 
1172  // Delete gaugeFatPrecondition if it does not alias gaugeFatPrecise, gaugeFatSloppy, or gaugeFatEigensolver.
1175  delete gaugeFatPrecondition;
1176 
1177  // Delete gaugeFatEigensolver if it does not alias gaugeFatPrecise or gaugeFatSloppy.
1179  delete gaugeFatEigensolver;
1180 
1181  // Delete gaugeFatSloppy if it does not alias gaugeFatPrecise.
1183 
1184  gaugeFatEigensolver = nullptr;
1185  gaugeFatRefinement = nullptr;
1186  gaugeFatPrecondition = nullptr;
1187  gaugeFatSloppy = nullptr;
1188 }
1189 
// Free every resident gauge field (precise, extended, long, fat, smeared,
// and the cached extended field). Aborts if QUDA was never initialized.
// NOTE(review): doxygen extract — line 1194 (presumably the call that frees
// the sloppy-precision copies first, `freeSloppyGaugeQuda()`) and line 1203
// (presumably the guarded delete of gaugeLongExtended) were dropped; the
// corresponding pointers are nulled below, so a delete likely existed —
// verify against the upstream file.
 1190 void freeGaugeQuda(void)
 1191 {
 1192  if (!initialized) errorQuda("QUDA not initialized");
 1193 
 1195 
 1196  if (gaugePrecise) delete gaugePrecise;
 1197  if (gaugeExtended) delete gaugeExtended;
 1198 
 1199  gaugePrecise = nullptr;
 1200  gaugeExtended = nullptr;
 1201 
 1202  if (gaugeLongPrecise) delete gaugeLongPrecise;
 1204 
 1205  gaugeLongPrecise = nullptr;
 1206  gaugeLongExtended = nullptr;
 1207 
 1208  if (gaugeFatPrecise) delete gaugeFatPrecise;
 1209 
 1210  gaugeFatPrecise = nullptr;
 1211  gaugeFatExtended = nullptr;
 1212 
 1213  if (gaugeSmeared) delete gaugeSmeared;
 1214 
 1215  gaugeSmeared = nullptr;
 1216  // Need to merge extendedGaugeResident and gaugeFatPrecise/gaugePrecise
 1217  if (extendedGaugeResident) {
 1218  delete extendedGaugeResident;
 1219  extendedGaugeResident = nullptr;
 1220  }
 1221 }
1222 
// NOTE(review): doxygen extract — the signature at line 1223 (presumably
// `void loadSloppyGaugeQuda(QudaPrecision *prec, QudaReconstructType *recon)`)
// and the actual field-creation / aliasing statements inside the branches
// were dropped as hyperlinks. What remains shows the intent: for each of the
// SU(3), fat, and long resident gauge fields, build sloppy, preconditioner,
// refinement, and eigensolver copies at prec[0..3] / recon[0..3], aliasing an
// existing field whenever the requested precision already matches one.
 1224 {
 1225  // first do SU3 links (if they exist)
 1226  if (gaugePrecise) {
 1228  // switch the parameters for creating the mirror sloppy cuda gauge field
 1229 
 1230  gauge_param.reconstruct = recon[0];
 1231  gauge_param.setPrecision(prec[0], true);
 1232 
 1233  if (gaugeSloppy) errorQuda("gaugeSloppy already exists");
 1234 
 1237  } else {
 1240  }
 1241 
 1242  // switch the parameters for creating the mirror preconditioner cuda gauge field
 1243  gauge_param.reconstruct = recon[1];
 1244  gauge_param.setPrecision(prec[1], true);
 1245 
 1246  if (gaugePrecondition) errorQuda("gaugePrecondition already exists");
 1247 
 1250  } else if (gauge_param.Precision() == gaugeSloppy->Precision()
 1253  } else {
 1256  }
 1257 
 1258  // switch the parameters for creating the mirror refinement cuda gauge field
 1259  gauge_param.reconstruct = recon[2];
 1260  gauge_param.setPrecision(prec[2], true);
 1261 
 1262  if (gaugeRefinement) errorQuda("gaugeRefinement already exists");
 1263 
 1266  } else {
 1269  }
 1270 
 1271  // switch the parameters for creating the mirror eigensolver cuda gauge field
 1272  gauge_param.reconstruct = recon[3];
 1273  gauge_param.setPrecision(prec[3], true);
 1274 
 1275  if (gaugeEigensolver) errorQuda("gaugeEigensolver already exists");
 1276 
 1279  } else if (gauge_param.Precision() == gaugeSloppy->Precision()
 1282  } else if (gauge_param.Precision() == gaugePrecondition->Precision()
 1285  } else {
 1288  }
 1289  }
 1290 
 1291  // fat links (if they exist)
 1292  if (gaugeFatPrecise) {
 1294  // switch the parameters for creating the mirror sloppy cuda gauge field
 1295 
// Fat links never use reconstruction, hence no recon[] assignment here.
 1296  gauge_param.setPrecision(prec[0], true);
 1297 
 1298  if (gaugeFatSloppy) errorQuda("gaugeFatSloppy already exists");
 1299 
 1300  if (gauge_param.Precision() == gaugeFatPrecise->Precision()
 1303  } else {
 1306  }
 1307 
 1308  // switch the parameters for creating the mirror preconditioner cuda gauge field
 1309  gauge_param.setPrecision(prec[1], true);
 1310 
 1311  if (gaugeFatPrecondition) errorQuda("gaugeFatPrecondition already exists\n");
 1312 
 1313  if (gauge_param.Precision() == gaugeFatPrecise->Precision()
 1316  } else if (gauge_param.Precision() == gaugeFatSloppy->Precision()
 1319  } else {
 1322  }
 1323 
 1324  // switch the parameters for creating the mirror refinement cuda gauge field
 1325  gauge_param.setPrecision(prec[2], true);
 1326 
 1327  if (gaugeFatRefinement) errorQuda("gaugeFatRefinement already exists\n");
 1328 
 1329  if (gauge_param.Precision() == gaugeFatSloppy->Precision()
 1332  } else {
 1335  }
 1336 
 1337  // switch the parameters for creating the mirror eigensolver cuda gauge field
 1338  gauge_param.setPrecision(prec[3], true);
 1339 
 1340  if (gaugeFatEigensolver) errorQuda("gaugeFatEigensolver already exists");
 1341 
 1342  if (gauge_param.Precision() == gaugeFatPrecise->Precision()
 1345  } else if (gauge_param.Precision() == gaugeFatSloppy->Precision()
 1348  } else if (gauge_param.Precision() == gaugeFatPrecondition->Precision()
 1351  } else {
 1354  }
 1355  }
 1356 
 1357  // long links (if they exist)
 1358  if (gaugeLongPrecise) {
 1360  // switch the parameters for creating the mirror sloppy cuda gauge field
 1361 
 1362  gauge_param.reconstruct = recon[0];
 1363  gauge_param.setPrecision(prec[0], true);
 1364 
 1365  if (gaugeLongSloppy) errorQuda("gaugeLongSloppy already exists");
 1366 
 1367  if (gauge_param.Precision() == gaugeLongPrecise->Precision()
 1370  } else {
 1373  }
 1374 
 1375  // switch the parameters for creating the mirror preconditioner cuda gauge field
 1376  gauge_param.reconstruct = recon[1];
 1377  gauge_param.setPrecision(prec[1], true);
 1378 
 1379  if (gaugeLongPrecondition) errorQuda("gaugeLongPrecondition already exists\n");
 1380 
 1381  if (gauge_param.Precision() == gaugeLongPrecise->Precision()
 1384  } else if (gauge_param.Precision() == gaugeLongSloppy->Precision()
 1387  } else {
 1390  }
 1391 
 1392  // switch the parameters for creating the mirror refinement cuda gauge field
 1393  gauge_param.reconstruct = recon[2];
 1394  gauge_param.setPrecision(prec[2], true);
 1395 
 1396  if (gaugeLongRefinement) errorQuda("gaugeLongRefinement already exists\n");
 1397 
 1398  if (gauge_param.Precision() == gaugeLongSloppy->Precision()
 1401  } else {
 1404  }
 1405 
 1406  // switch the parameters for creating the mirror eigensolver cuda gauge field
 1407  gauge_param.reconstruct = recon[3];
 1408  gauge_param.setPrecision(prec[3], true);
 1409 
// NOTE(review): copy/paste defect — this guards gaugeLongEigensolver but the
// error message says "gaugePrecondition already exists"; it should read
// "gaugeLongEigensolver already exists". Fix when restoring the file.
 1410  if (gaugeLongEigensolver) errorQuda("gaugePrecondition already exists");
 1411 
 1412  if (gauge_param.Precision() == gaugeLongPrecise->Precision()
 1415  } else if (gauge_param.Precision() == gaugeLongSloppy->Precision()
 1418  } else if (gauge_param.Precision() == gaugeLongPrecondition->Precision()
 1421  } else {
 1424  }
 1425  }
 1426 }
1427 
// NOTE(review): doxygen extract — the signature at line 1428 (presumably
// `void freeSloppyCloverQuda(void)`) and the guarded delete statements
// (lines 1433, 1436-1437, 1441, 1445) were dropped. Intent from what
// remains: delete each sloppy-precision clover field unless it aliases
// another resident clover field, then null all pointers.
 1429 {
 1430  if (!initialized) errorQuda("QUDA not initialized");
 1431 
// NOTE(review): the comment below says "gaugeSloppy" but this function
// manages clover fields — presumably it should read "cloverSloppy".
 1432  // Delete cloverRefinement if it does not alias gaugeSloppy.
 1434 
 1435  // Delete cloverPrecondition if it does not alias cloverPrecise, cloverSloppy, or cloverEigensolver.
 1438  delete cloverPrecondition;
 1439 
 1440  // Delete cloverEigensolver if it does not alias cloverPrecise or cloverSloppy.
 1442  delete cloverEigensolver;
 1443 
 1444  // Delete cloverSloppy if it does not alias cloverPrecise.
 1446 
 1447  cloverEigensolver = nullptr;
 1448  cloverRefinement = nullptr;
 1449  cloverPrecondition = nullptr;
 1450  cloverSloppy = nullptr;
 1451 }
1452 
// Free the precise resident clover field.
// NOTE(review): line 1456 was dropped from this extract — presumably a call
// that frees the sloppy clover copies first (e.g. `freeSloppyCloverQuda();`);
// confirm against the upstream source.
 1453 void freeCloverQuda(void)
 1454 {
 1455  if (!initialized) errorQuda("QUDA not initialized");
 1457  if (cloverPrecise) delete cloverPrecise;
 1458  cloverPrecise = nullptr;
 1459 }
1460 
1461 void flushChronoQuda(int i)
1462 {
1463  if (i >= QUDA_MAX_CHRONO)
1464  errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO);
1465 
1466  auto &basis = chronoResident[i];
1467 
1468  for (auto v : basis) {
1469  if (v) delete v;
1470  }
1471  basis.clear();
1472 }
1473 
// Tear down the library: free all resident fields, chrono bases, solution
// and momentum caches, shut down blas and comms, persist the tuning cache
// and profile data, and print per-phase profiles at QUDA_SUMMARIZE verbosity.
// Safe to call when init never happened (early return on !initialized).
// NOTE(review): doxygen extract — lines 1490-1494, 1497-1499, 1504, 1510,
// 1552, and 1558 were dropped (e.g. momResident is deleted below but the
// line nulling it is missing); restore from the upstream source.
 1474 void endQuda(void)
 1475 {
 1476  profileEnd.TPSTART(QUDA_PROFILE_TOTAL);
 1477 
 1478  if (!initialized) return;
 1479 
 1480  freeGaugeQuda();
 1481  freeCloverQuda();
 1482 
 1483  for (int i = 0; i < QUDA_MAX_CHRONO; i++) flushChronoQuda(i);
 1484 
 1485  for (auto v : solutionResident) if (v) delete v;
 1486  solutionResident.clear();
 1487 
 1488  if(momResident) delete momResident;
 1489 
 1492 
 1495  blas::destroy();
 1496 
 1499 
 1500  host_free(num_failures_h);
 1501  num_failures_h = nullptr;
 1502  num_failures_d = nullptr;
 1503 
 1505 
 1506  saveTuneCache();
 1507  saveProfile();
 1508 
 1509  // flush any outstanding force monitoring (if enabled)
 1511 
 1512  initialized = false;
 1513 
 1514  comm_finalize();
 1515  comms_initialized = false;
 1516 
 1517  profileEnd.TPSTOP(QUDA_PROFILE_TOTAL);
 1518  profileInit2End.TPSTOP(QUDA_PROFILE_TOTAL);
 1519 
 1520  // print out the profile information of the lifetime of the library
 1521  if (getVerbosity() >= QUDA_SUMMARIZE) {
 1522  profileInit.Print();
 1523  profileGauge.Print();
 1524  profileClover.Print();
 1525  profileDslash.Print();
 1526  profileInvert.Print();
 1527  profileInvertMultiSrc.Print();
 1528  profileMulti.Print();
 1529  profileEigensolve.Print();
 1530  profileFatLink.Print();
 1531  profileGaugeForce.Print();
 1532  profileGaugeUpdate.Print();
 1533  profileExtendedGauge.Print();
 1534  profileCloverForce.Print();
 1535  profileStaggeredForce.Print();
 1536  profileHISQForce.Print();
 1537  profileContract.Print();
 1538  profileBLAS.Print();
 1539  profileCovDev.Print();
 1540  profilePlaq.Print();
 1541  profileGaugeObs.Print();
 1542  profileAPE.Print();
 1543  profileSTOUT.Print();
 1544  profileOvrImpSTOUT.Print();
 1545  profileWFlow.Print();
 1546  profileProject.Print();
 1547  profilePhase.Print();
 1548  profileMomAction.Print();
 1549  profileEnd.Print();
 1550 
 1551  profileInit2End.Print();
 1553 
 1554  printLaunchTimer();
 1555  printAPIProfile();
 1556 
 1557  printfQuda("\n");
 1559  printfQuda("\n");
 1560  }
 1561 
 1562  assertAllMemFree();
 1563 
 1564  device::destroy();
 1565 }
1566 
1567 
1568 namespace quda {
1569 
  // Translate the user-facing QudaInvertParam into the internal DiracParam
  // used to construct the precise Dirac operator: select the operator type
  // from dslash_type (PC variant when pc is true), copy masses/kappa/5d
  // parameters, bind the resident precise gauge/clover fields, enable comms
  // in all four dimensions, and verify the gauge precision matches
  // inv_param->cuda_prec.
  // NOTE(review): doxygen extract — several `case QUDA_..._DSLASH:` labels
  // and assignment lines were dropped as hyperlinks (e.g. lines 1581,
  // 1584-1588, 1595, 1599, 1612, 1639-1641, 1672), so some branches below
  // appear label-less; restore from the upstream source.
 1570  void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
 1571  {
 1572  double kappa = inv_param->kappa;
 1575  }
 1576 
 1577  switch (inv_param->dslash_type) {
 1578  case QUDA_WILSON_DSLASH:
 1579  diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC;
 1580  break;
 1582  diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC;
 1583  break;
 1586  break;
 1589  diracParam.Ls = inv_param->Ls;
 1590  break;
 1593  diracParam.Ls = inv_param->Ls;
 1594  break;
 1596  if (inv_param->Ls > QUDA_MAX_DWF_LS) {
 1597  errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS);
 1598  }
 1600  diracParam.Ls = inv_param->Ls;
// Interface passes b_5/c_5 as C99 double _Complex; the memcpy below is only
// valid if the internal Complex type has an identical layout.
 1601  if (sizeof(Complex) != sizeof(double _Complex)) {
 1602  errorQuda("Irreconcilable difference between interface and internal complex number conventions");
 1603  }
 1604  memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls);
 1605  memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls);
 1606  diracParam.eofa_shift = inv_param->eofa_shift;
 1607  diracParam.eofa_pm = inv_param->eofa_pm;
 1608  diracParam.mq1 = inv_param->mq1;
 1609  diracParam.mq2 = inv_param->mq2;
 1610  diracParam.mq3 = inv_param->mq3;
 1611  break;
 1613  if (inv_param->Ls > QUDA_MAX_DWF_LS)
 1614  errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS);
 1616  diracParam.Ls = inv_param->Ls;
 1617  if (sizeof(Complex) != sizeof(double _Complex)) {
 1618  errorQuda("Irreconcilable difference between interface and internal complex number conventions");
 1619  }
 1620  memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls);
 1621  memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls);
 1622  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
 1623  printfQuda("Printing b_5 and c_5 values\n");
 1624  for (int i = 0; i < diracParam.Ls; i++) {
 1625  printfQuda("fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n", i, diracParam.b_5[i].real(),
 1626  diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(), diracParam.c_5[i].imag());
 1627  // printfQuda("fromQUDA inv_param: b5[%d] = %f %f c5[%d] = %f %f\n", i, inv_param->b_5[i], i,
 1628  // inv_param->c_5[i] ); printfQuda("fromQUDA creal: b5[%d] = %f %f c5[%d] = %f %f \n", i,
 1629  // creal(inv_param->b_5[i]), cimag(inv_param->b_5[i]), i, creal(inv_param->c_5[i]), cimag(inv_param->c_5[i]) );
 1630  }
 1631  }
 1632  break;
 1633  case QUDA_STAGGERED_DSLASH:
 1634  diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
 1635  break;
 1636  case QUDA_ASQTAD_DSLASH:
 1637  diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC;
 1638  break;
 1642  diracParam.Ls = 1;
 1643  diracParam.epsilon = 0.0;
 1644  } else {
 1645  diracParam.Ls = 2;
 1647  }
 1648  break;
 1652  diracParam.Ls = 1;
 1653  diracParam.epsilon = 0.0;
 1654  } else {
 1655  diracParam.Ls = 2;
 1657  }
 1658  break;
 1659  case QUDA_LAPLACE_DSLASH:
 1661  diracParam.laplace3D = inv_param->laplace3D;
 1662  break;
 1663  case QUDA_COVDEV_DSLASH:
 1664  diracParam.type = QUDA_GAUGE_COVDEV_DIRAC;
 1665  break;
 1666  default:
 1667  errorQuda("Unsupported dslash_type %d", inv_param->dslash_type);
 1668  }
 1669 
 1670  diracParam.matpcType = inv_param->matpc_type;
 1671  diracParam.dagger = inv_param->dagger;
 1673  diracParam.fatGauge = gaugeFatPrecise;
 1674  diracParam.longGauge = gaugeLongPrecise;
 1675  diracParam.clover = cloverPrecise;
 1676  diracParam.kappa = kappa;
 1677  diracParam.mass = inv_param->mass;
 1678  diracParam.m5 = inv_param->m5;
 1679  diracParam.mu = inv_param->mu;
 1680 
 1681  for (int i=0; i<4; i++) diracParam.commDim[i] = 1; // comms are always on
 1682 
 1683  if (diracParam.gauge->Precision() != inv_param->cuda_prec)
 1684  errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
 1685  inv_param->cuda_prec);
 1686  }
1687 
1688 
  // As setDiracParam, but rebind the field pointers to the sloppy-precision
  // copies and check against inv_param->cuda_prec_sloppy.
  // NOTE(review): line 1693 (presumably `diracParam.gauge = gaugeSloppy;`)
  // and line 1704 (the second errorQuda argument line) were dropped from
  // this extract.
 1689  void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
 1690  {
 1691  setDiracParam(diracParam, inv_param, pc);
 1692 
 1694  diracParam.fatGauge = gaugeFatSloppy;
 1695  diracParam.longGauge = gaugeLongSloppy;
 1696  diracParam.clover = cloverSloppy;
 1697 
 1698  for (int i=0; i<4; i++) {
 1699  diracParam.commDim[i] = 1; // comms are always on
 1700  }
 1701 
 1702  if (diracParam.gauge->Precision() != inv_param->cuda_prec_sloppy)
 1703  errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
 1705  }
  // As setDiracParam, but rebind the field pointers to the refinement
  // (second-sloppy) copies and check against cuda_prec_refinement_sloppy.
  // NOTE(review): line 1711 (presumably `diracParam.gauge = gaugeRefinement;`)
  // and line 1722 (the second errorQuda argument line) were dropped from
  // this extract.
 1707  void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
 1708  {
 1709  setDiracParam(diracParam, inv_param, pc);
 1710 
 1712  diracParam.fatGauge = gaugeFatRefinement;
 1713  diracParam.longGauge = gaugeLongRefinement;
 1714  diracParam.clover = cloverRefinement;
 1715 
 1716  for (int i=0; i<4; i++) {
 1717  diracParam.commDim[i] = 1; // comms are always on
 1718  }
 1719 
 1720  if (diracParam.gauge->Precision() != inv_param->cuda_prec_refinement_sloppy)
 1721  errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
 1723  }
1724 
 1725  // The preconditioner currently mimicks the sloppy operator with no comms
  // As setDiracParam, but rebind to the preconditioner-precision fields
  // (or the extended fields when overlapping domains are requested) and set
  // commDim from the `comms` flag. Checks against cuda_prec_precondition.
  // NOTE(review): lines 1731, 1735, 1746-1747, and 1754 were dropped from
  // this extract (presumably the gauge-pointer assignments and the
  // staggered-preconditioner condition header).
 1726  void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
 1727  {
 1728  setDiracParam(diracParam, inv_param, pc);
 1729 
 1730  if (inv_param->overlap) {
 1732  diracParam.fatGauge = gaugeFatExtended;
 1733  diracParam.longGauge = gaugeLongExtended;
 1734  } else {
 1736  diracParam.fatGauge = gaugeFatPrecondition;
 1737  diracParam.longGauge = gaugeLongPrecondition;
 1738  }
 1739  diracParam.clover = cloverPrecondition;
 1740 
 1741  for (int i=0; i<4; i++) {
 1742  diracParam.commDim[i] = comms ? 1 : 0;
 1743  }
 1744 
 1745  // In the preconditioned staggered CG allow a different dslash type in the preconditioning
 1748  diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
 1749  diracParam.gauge = gaugeFatPrecondition;
 1750  }
 1751 
 1752  if (diracParam.gauge->Precision() != inv_param->cuda_prec_precondition)
 1753  errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
 1755  }
1756 
 1757  // The deflation preconditioner currently mimicks the sloppy operator with no comms
  // As setDiracParam, but rebind to the eigensolver-precision fields (or
  // the extended fields when overlapping domains are requested) and set
  // commDim from the `comms` flag. Checks against cuda_prec_eigensolver.
  // NOTE(review): lines 1763, 1767, 1776-1777, and 1784 were dropped from
  // this extract (presumably the gauge-pointer assignments and the
  // staggered condition header).
 1758  void setDiracEigParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
 1759  {
 1760  setDiracParam(diracParam, inv_param, pc);
 1761 
 1762  if (inv_param->overlap) {
 1764  diracParam.fatGauge = gaugeFatExtended;
 1765  diracParam.longGauge = gaugeLongExtended;
 1766  } else {
 1768  diracParam.fatGauge = gaugeFatEigensolver;
 1769  diracParam.longGauge = gaugeLongEigensolver;
 1770  }
 1771  diracParam.clover = cloverEigensolver;
 1772 
 1773  for (int i = 0; i < 4; i++) { diracParam.commDim[i] = comms ? 1 : 0; }
 1774 
 1775  // In the deflated staggered CG allow a different dslash type
 1778  diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
 1779  diracParam.gauge = gaugeFatEigensolver;
 1780  }
 1781 
 1782  if (diracParam.gauge->Precision() != inv_param->cuda_prec_eigensolver)
 1783  errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
 1785  }
1786 
1787  void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
1788  {
1789  DiracParam diracParam;
1790  DiracParam diracSloppyParam;
1791  DiracParam diracPreParam;
1792 
1793  setDiracParam(diracParam, &param, pc_solve);
1794  setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
1795  // eigCG and deflation need 2 sloppy precisions and do not use Schwarz
1796  bool comms_flag = (param.schwarz_type != QUDA_INVALID_SCHWARZ) ? false : true;
1797  setDiracPreParam(diracPreParam, &param, pc_solve, comms_flag);
1798 
1799  d = Dirac::create(diracParam); // create the Dirac operator
1800  dSloppy = Dirac::create(diracSloppyParam);
1801  dPre = Dirac::create(diracPreParam);
1802  }
1803 
1804  void createDiracWithRefine(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam &param,
1805  const bool pc_solve)
1806  {
1807  DiracParam diracParam;
1808  DiracParam diracSloppyParam;
1809  DiracParam diracPreParam;
1810  DiracParam diracRefParam;
1811 
1812  setDiracParam(diracParam, &param, pc_solve);
1813  setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
1814  setDiracRefineParam(diracRefParam, &param, pc_solve);
1815  // eigCG and deflation need 2 sloppy precisions and do not use Schwarz
1816  bool comms_flag = (param.inv_type == QUDA_INC_EIGCG_INVERTER || param.eig_param) ? true : false;
1817  setDiracPreParam(diracPreParam, &param, pc_solve, comms_flag);
1818 
1819  d = Dirac::create(diracParam); // create the Dirac operator
1820  dSloppy = Dirac::create(diracSloppyParam);
1821  dPre = Dirac::create(diracPreParam);
1822  dRef = Dirac::create(diracRefParam);
1823  }
1824 
1825  void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam &param,
1826  const bool pc_solve)
1827  {
1828  DiracParam diracParam;
1829  DiracParam diracSloppyParam;
1830  DiracParam diracPreParam;
1831  DiracParam diracEigParam;
1832 
1833  setDiracParam(diracParam, &param, pc_solve);
1834  setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
1835  // eigCG and deflation need 2 sloppy precisions and do not use Schwarz
1836  bool comms_flag = (param.inv_type == QUDA_INC_EIGCG_INVERTER || param.eig_param) ? true : false;
1837  setDiracPreParam(diracPreParam, &param, pc_solve, comms_flag);
1838  setDiracEigParam(diracEigParam, &param, pc_solve, comms_flag);
1839 
1840  d = Dirac::create(diracParam); // create the Dirac operator
1841  dSloppy = Dirac::create(diracSloppyParam);
1842  dPre = Dirac::create(diracPreParam);
1843  dEig = Dirac::create(diracEigParam);
1844  }
1845 
  // Rescale the source vector b (and, for multishift, the resident shifts in
  // param.offset) so that the solver sees the operator in the convention
  // selected by param.mass_normalization. Domain-wall variants use
  // kappa5 = 1/(2(5 + m5)) in place of param.kappa; staggered operators
  // only need rescaling under kappa normalization and return early.
  // NOTE(review): doxygen extract — the case labels at lines 1869-1870,
  // 1890, and 1909 were dropped (by position these are the MATDAG_MAT /
  // MATPCDAG_MATPC solution-type labels); restore from the upstream source.
 1846  void massRescale(cudaColorSpinorField &b, QudaInvertParam &param, bool for_multishift)
 1847  {
 1848 
 1849  double kappa5 = (0.5/(5.0 + param.m5));
 1850  double kappa = (param.dslash_type == QUDA_DOMAIN_WALL_DSLASH || param.dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH
 1851  || param.dslash_type == QUDA_MOBIUS_DWF_DSLASH || param.dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH) ?
 1852  kappa5 :
 1853  param.kappa;
 1854 
 1855  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
 1856  printfQuda("Mass rescale: Kappa is: %g\n", kappa);
 1857  printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);
 1858  double nin = blas::norm2(b);
 1859  printfQuda("Mass rescale: norm of source in = %g\n", nin);
 1860  }
 1861 
 1862  // staggered dslash uses mass normalization internally
 1863  if (param.dslash_type == QUDA_ASQTAD_DSLASH || param.dslash_type == QUDA_STAGGERED_DSLASH) {
 1864  switch (param.solution_type) {
 1865  case QUDA_MAT_SOLUTION:
 1866  case QUDA_MATPC_SOLUTION:
 1867  if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(2.0*param.mass, b);
 1868  break;
 1871  if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(4.0*param.mass*param.mass, b);
 1872  break;
 1873  default:
 1874  errorQuda("Not implemented");
 1875  }
 1876  return;
 1877  }
 1878 
 1879  // multiply the source to compensate for normalization of the Dirac operator, if necessary
 1880  // you are responsible for restoring what's in param.offset
 1881  switch (param.solution_type) {
 1882  case QUDA_MAT_SOLUTION:
 1883  if (param.mass_normalization == QUDA_MASS_NORMALIZATION ||
 1884  param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
 1885  blas::ax(2.0*kappa, b);
 1886  if (for_multishift)
 1887  for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 2.0 * kappa;
 1888  }
 1889  break;
 1891  if (param.mass_normalization == QUDA_MASS_NORMALIZATION ||
 1892  param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
 1893  blas::ax(4.0*kappa*kappa, b);
 1894  if (for_multishift)
 1895  for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa;
 1896  }
 1897  break;
 1898  case QUDA_MATPC_SOLUTION:
 1899  if (param.mass_normalization == QUDA_MASS_NORMALIZATION) {
 1900  blas::ax(4.0*kappa*kappa, b);
 1901  if (for_multishift)
 1902  for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa;
 1903  } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
 1904  blas::ax(2.0*kappa, b);
 1905  if (for_multishift)
 1906  for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 2.0 * kappa;
 1907  }
 1908  break;
 1910  if (param.mass_normalization == QUDA_MASS_NORMALIZATION) {
 1911  blas::ax(16.0*std::pow(kappa,4), b);
 1912  if (for_multishift)
 1913  for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 16.0 * std::pow(kappa, 4);
 1914  } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
 1915  blas::ax(4.0*kappa*kappa, b);
 1916  if (for_multishift)
 1917  for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa;
 1918  }
 1919  break;
 1920  default:
 1921  errorQuda("Solution type %d not supported", param.solution_type);
 1922  }
 1923 
 1924  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n");
 1925  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
 1926  printfQuda("Mass rescale: Kappa is: %g\n", kappa);
 1927  printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);
 1928  double nin = blas::norm2(b);
 1929  printfQuda("Mass rescale: norm of source out = %g\n", nin);
 1930  }
 1931  }
1932 }
1933 
// Apply the single-parity dslash to a host spinor: download h_in, apply the
// operator for the requested parity on the device, and upload the result to
// h_out. Each phase (init/H2D/compute/D2H/free) is timed via profileDslash.
// NOTE(review): doxygen extract — the guard-condition headers (lines
// 1941-1944), a pushVerbosity call implied by the trailing popVerbosity
// (around 1947-1948), the staggered/twisted branch headers (1980-1990),
// and the twisted-clover / 4-d-dslash branch headers (1995, 2000-2001)
// were dropped; restore from the upstream source.
 1934 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
 1935 {
 1936  profileDslash.TPSTART(QUDA_PROFILE_TOTAL);
 1937  profileDslash.TPSTART(QUDA_PROFILE_INIT);
 1938 
// ASQTAD reads the fat links; every other operator reads the standard field.
 1939  const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
 1940 
 1943  errorQuda("Gauge field not allocated");
 1945  errorQuda("Clover field not allocated");
 1946 
 1949 
 1950  ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), true, inv_param->input_location);
 1951  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
 1952  ColorSpinorParam cudaParam(cpuParam, *inv_param);
 1953 
 1954  cpuParam.v = h_out;
 1955  cpuParam.location = inv_param->output_location;
 1956  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
 1957 
 1958  cudaParam.create = QUDA_NULL_FIELD_CREATE;
 1959  cudaColorSpinorField in(*in_h, cudaParam);
 1960  cudaColorSpinorField out(in, cudaParam);
 1961 
 1962  bool pc = true;
 1963  DiracParam diracParam;
 1964  setDiracParam(diracParam, inv_param, pc);
 1965 
 1966  profileDslash.TPSTOP(QUDA_PROFILE_INIT);
 1967 
 1968  profileDslash.TPSTART(QUDA_PROFILE_H2D);
 1969  in = *in_h;
 1970  profileDslash.TPSTOP(QUDA_PROFILE_H2D);
 1971 
 1972  profileDslash.TPSTART(QUDA_PROFILE_COMPUTE);
 1973 
 1974  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
 1975  double cpu = blas::norm2(*in_h);
 1976  double gpu = blas::norm2(in);
 1977  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
 1978  }
 1979 
 1983  blas::ax(1.0/(2.0*inv_param->mass), in);
 1984 
 1986  if (parity == QUDA_EVEN_PARITY) {
 1988  } else {
 1990  }
 1991  blas::ax(gauge.Anisotropy(), in);
 1992  }
 1993 
 1994  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
 1996  cudaParam.create = QUDA_NULL_FIELD_CREATE;
 1997  cudaColorSpinorField tmp1(in, cudaParam);
 1998  ((DiracTwistedCloverPC*) dirac)->TwistCloverInv(tmp1, in, (parity+1)%2); // apply the clover-twist
 1999  dirac->Dslash(out, tmp1, parity); // apply the operator
 2002  dirac->Dslash4(out, in, parity);
 2003  } else {
 2004  dirac->Dslash(out, in, parity); // apply the operator
 2005  }
 2006  profileDslash.TPSTOP(QUDA_PROFILE_COMPUTE);
 2007 
 2008  profileDslash.TPSTART(QUDA_PROFILE_D2H);
 2009  *out_h = out;
 2010  profileDslash.TPSTOP(QUDA_PROFILE_D2H);
 2011 
 2012  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
 2013  double cpu = blas::norm2(*out_h);
 2014  double gpu = blas::norm2(out);
 2015  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
 2016  }
 2017 
 2018  profileDslash.TPSTART(QUDA_PROFILE_FREE);
 2019  delete dirac; // clean up
 2020 
 2021  delete out_h;
 2022  delete in_h;
 2023  profileDslash.TPSTOP(QUDA_PROFILE_FREE);
 2024 
 2025  popVerbosity();
 2026  profileDslash.TPSTOP(QUDA_PROFILE_TOTAL);
 2027 }
2028 
// Apply the full Dirac matrix M to a host spinor: download h_in, apply M on
// the device (preconditioned form when solution_type requests it), undo the
// kappa normalization where required, and upload the result to h_out.
// NOTE(review): doxygen extract — a pushVerbosity call (around line 2031),
// the gauge/clover guard headers (2035-2040), the second half of the `pc`
// condition (2043), and the mass-normalization branch headers inside the
// kappa-rescale section (2069, 2071, 2075-2076) were dropped; restore from
// the upstream source.
 2029 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
 2030 {
 2032 
 2033  const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
 2034 
 2037  errorQuda("Gauge field not allocated");
 2039  errorQuda("Clover field not allocated");
 2041 
 2042  bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
 2044 
 2045  ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), pc, inv_param->input_location);
 2046  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
 2047 
 2048  ColorSpinorParam cudaParam(cpuParam, *inv_param);
 2049  cudaColorSpinorField in(*in_h, cudaParam);
 2050 
 2051  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
 2052  double cpu = blas::norm2(*in_h);
 2053  double gpu = blas::norm2(in);
 2054  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
 2055  }
 2056 
 2057  cudaParam.create = QUDA_NULL_FIELD_CREATE;
 2058  cudaColorSpinorField out(in, cudaParam);
 2059 
 2060  DiracParam diracParam;
 2061  setDiracParam(diracParam, inv_param, pc);
 2062 
 2063  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
 2064  dirac->M(out, in); // apply the operator
 2065  delete dirac; // clean up
 2066 
// Undo the kappa normalization: 1/(4 kappa^2) for the preconditioned
// operator, 1/(2 kappa) otherwise (branch headers dropped, see NOTE above).
 2067  double kappa = inv_param->kappa;
 2068  if (pc) {
 2070  blas::ax(0.25/(kappa*kappa), out);
 2072  blas::ax(0.5/kappa, out);
 2073  }
 2074  } else {
 2077  blas::ax(0.5/kappa, out);
 2078  }
 2079  }
 2080 
 2081  cpuParam.v = h_out;
 2082  cpuParam.location = inv_param->output_location;
 2083  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
 2084  *out_h = out;
 2085 
 2086  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
 2087  double cpu = blas::norm2(*out_h);
 2088  double gpu = blas::norm2(out);
 2089  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
 2090  }
 2091 
 2092  delete out_h;
 2093  delete in_h;
 2094 
 2095  popVerbosity();
 2096 }
2097 
2098 
// Apply M^dag M to a host spinor: download h_in, apply the normal operator
// on the device, undo the kappa normalization (powers of 2*kappa, squared
// relative to MatQuda since the operator is applied twice), and upload to
// h_out.
// NOTE(review): doxygen extract — a pushVerbosity call (around line 2101),
// the gauge/clover guard headers (2105-2110), the second half of the `pc`
// condition (2113), and the mass-normalization branch headers in the
// rescale section (2142, 2144, 2148-2149) were dropped; restore from the
// upstream source.
 2099 void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
 2100 {
 2102 
 2103  const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
 2104 
 2107  errorQuda("Gauge field not allocated");
 2109  errorQuda("Clover field not allocated");
 2111 
 2112  bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
 2114 
 2115  ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), pc, inv_param->input_location);
 2116  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
 2117 
 2118  ColorSpinorParam cudaParam(cpuParam, *inv_param);
 2119  cudaColorSpinorField in(*in_h, cudaParam);
 2120 
 2121  if (getVerbosity() >= QUDA_DEBUG_VERBOSE){
 2122  double cpu = blas::norm2(*in_h);
 2123  double gpu = blas::norm2(in);
 2124  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
 2125  }
 2126 
 2127  cudaParam.create = QUDA_NULL_FIELD_CREATE;
 2128  cudaColorSpinorField out(in, cudaParam);
 2129 
 2130  // double kappa = inv_param->kappa;
 2131  // if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= gaugePrecise->anisotropy;
 2132 
 2133  DiracParam diracParam;
 2134  setDiracParam(diracParam, inv_param, pc);
 2135 
 2136  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
 2137  dirac->MdagM(out, in); // apply the operator
 2138  delete dirac; // clean up
 2139 
 2140  double kappa = inv_param->kappa;
 2141  if (pc) {
 2143  blas::ax(1.0/std::pow(2.0*kappa,4), out);
 2145  blas::ax(0.25/(kappa*kappa), out);
 2146  }
 2147  } else {
 2150  blas::ax(0.25/(kappa*kappa), out);
 2151  }
 2152  }
 2153 
 2154  cpuParam.v = h_out;
 2155  cpuParam.location = inv_param->output_location;
 2156  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
 2157  *out_h = out;
 2158 
 2159  if (getVerbosity() >= QUDA_DEBUG_VERBOSE){
 2160  double cpu = blas::norm2(*out_h);
 2161  double gpu = blas::norm2(out);
 2162  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
 2163  }
 2164 
 2165  delete out_h;
 2166  delete in_h;
 2167 
 2168  popVerbosity();
 2169 }
2170 
 2171 namespace quda
 2172 {
// NOTE(review): the signature at line 2173 was dropped from this extract —
// by the body it is a predicate taking a QudaInvertParam* (presumably
// `bool canReuseResidentGauge(QudaInvertParam *param)`).
// Returns true when the relevant resident gauge field (fat links for
// ASQTAD, standard links otherwise) exists and matches the solve precision.
 2174  {
 2175  if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
 2176  return (gaugePrecise != nullptr) and param->cuda_prec == gaugePrecise->Precision();
 2177  } else {
 2178  return (gaugeFatPrecise != nullptr) and param->cuda_prec == gaugeFatPrecise->Precision();
 2179  }
 2180  }
 2181 } // namespace quda
2182 
 2184 
// NOTE(review): the signature at line 2183 was dropped from this extract
// (presumably `void checkClover(QudaInvertParam *param) {`), as were lines
// 2193-2200 (presumably the precision comparison and loadSloppyCloverQuda
// call implied by the closing brace at 2201). Purpose from what remains:
// for clover-type operators, verify the resident clover fields exist and
// match the requested solve precisions, recreating sloppy copies if needed.
 2185  if (param->dslash_type != QUDA_CLOVER_WILSON_DSLASH && param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) {
 2186  return;
 2187  }
 2188 
 2189  if (param->cuda_prec != cloverPrecise->Precision()) {
 2190  errorQuda("Solve precision %d doesn't match clover precision %d", param->cuda_prec, cloverPrecise->Precision());
 2191  }
 2192 
 2201  }
 2202 
 2203  if (cloverPrecise == nullptr) errorQuda("Precise clover field doesn't exist");
 2204  if (cloverSloppy == nullptr) errorQuda("Sloppy clover field doesn't exist");
 2205  if (cloverPrecondition == nullptr) errorQuda("Precondition clover field doesn't exist");
 2206  if (cloverRefinement == nullptr) errorQuda("Refinement clover field doesn't exist");
 2207  if (cloverEigensolver == nullptr) errorQuda("Eigensolver clover field doesn't exist");
 2208 }
2209 
// NOTE(review): the function signature on the elided line above was dropped by
// the extraction — this body validates the resident gauge fields for the
// requested solve and returns a cudaGaugeField* (likely checkGauge(); verify).
{
  quda::cudaGaugeField *cudaGauge = nullptr;
  if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
    // Wilson-type path: validate the thin gauge field hierarchy.
    if (gaugePrecise == nullptr) errorQuda("Precise gauge field doesn't exist");

    if (param->cuda_prec != gaugePrecise->Precision()) {
      errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugePrecise->Precision());
    }

    // NOTE(review): lines elided here by the extraction — presumably the
    // computation of `precision`/`recon` used in the call below; verify.
    loadSloppyGaugeQuda(precision, recon);
  }

  if (gaugeSloppy == nullptr) errorQuda("Sloppy gauge field doesn't exist");
  if (gaugePrecondition == nullptr) errorQuda("Precondition gauge field doesn't exist");
  if (gaugeRefinement == nullptr) errorQuda("Refinement gauge field doesn't exist");
  // NOTE(review): this checks the eigensolver field but the message says
  // "Refinement" — looks like a copy/paste slip; should read "Eigensolver
  // gauge field doesn't exist" (message left unchanged here).
  if (gaugeEigensolver == nullptr) errorQuda("Refinement gauge field doesn't exist");
  if (param->overlap) {
    if (gaugeExtended == nullptr) errorQuda("Extended gauge field doesn't exist");
  }
  } else {
    // ASQTAD path: validate both the fat-link and long-link field hierarchies.
    if (gaugeFatPrecise == nullptr) errorQuda("Precise gauge fat field doesn't exist");
    if (gaugeLongPrecise == nullptr) errorQuda("Precise gauge long field doesn't exist");

    // NOTE(review): the enclosing precision-mismatch condition for this error
    // was elided by the extraction.
    errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugeFatPrecise->Precision());
    }

    // NOTE(review): lines elided here (sloppy-field setup for the staggered
    // path) — verify against upstream.
    // recon is always no for fat links, so just use long reconstructs here
    loadSloppyGaugeQuda(precision, recon);
  }

  if (gaugeFatSloppy == nullptr) errorQuda("Sloppy gauge fat field doesn't exist");
  if (gaugeFatPrecondition == nullptr) errorQuda("Precondition gauge fat field doesn't exist");
  if (gaugeFatRefinement == nullptr) errorQuda("Refinement gauge fat field doesn't exist");
  if (gaugeFatEigensolver == nullptr) errorQuda("Eigensolver gauge fat field doesn't exist");
  if (param->overlap) {
    if (gaugeFatExtended == nullptr) errorQuda("Extended gauge fat field doesn't exist");
  }

  if (gaugeLongSloppy == nullptr) errorQuda("Sloppy gauge long field doesn't exist");
  if (gaugeLongPrecondition == nullptr) errorQuda("Precondition gauge long field doesn't exist");
  if (gaugeLongRefinement == nullptr) errorQuda("Refinement gauge long field doesn't exist");
  if (gaugeLongEigensolver == nullptr) errorQuda("Eigensolver gauge long field doesn't exist");
  if (param->overlap) {
    if (gaugeLongExtended == nullptr) errorQuda("Extended gauge long field doesn't exist");
  }
  }

  // The clover field must also be consistent with the requested solve.
  checkClover(param);

  return cudaGauge;
}
2288 
// Apply the clover term (or its inverse, when `inverse` is non-zero) of the
// clover-improved Wilson / twisted-clover operator to the host field h_in,
// writing the result to h_out for the given checkerboard parity.
// NOTE(review): several interior lines of this function were elided by the
// extraction (marked below) — verify against upstream before relying on the
// exact control flow.
void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
{
  // [elided line: presumably pushVerbosity(inv_param->verbosity) — matches the
  //  popVerbosity() at the end of this function]

  if (!initialized) errorQuda("QUDA not initialized");
  if (gaugePrecise == nullptr) errorQuda("Gauge field not allocated");
  if (cloverPrecise == nullptr) errorQuda("Clover field not allocated");

  // [elided lines: the guard condition for the error below was dropped]
  errorQuda("Cannot apply the clover term for a non Wilson-clover or Twisted-mass-clover dslash");

  // Wrap the host input pointer as a single-parity CPU spinor field.
  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), true);

  // [elided line: the declaration head of in_h (location-dependent ternary)]
  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

  // Download the source to the device.
  ColorSpinorParam cudaParam(cpuParam, *inv_param);
  cudaColorSpinorField in(*in_h, cudaParam);

  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
    double cpu = blas::norm2(*in_h);
    double gpu = blas::norm2(in);
    printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
  }

  cudaParam.create = QUDA_NULL_FIELD_CREATE;
  cudaColorSpinorField out(in, cudaParam);

  // [elided lines: the bodies of this parity-dependent block were dropped]
  if (parity == QUDA_EVEN_PARITY) {
  } else {
  }
  }
  bool pc = true;

  DiracParam diracParam;
  setDiracParam(diracParam, inv_param, pc);
  //FIXME: Do we need this for twisted clover???
  DiracCloverPC dirac(diracParam); // create the Dirac operator
  if (!inverse) dirac.Clover(out, in, parity); // apply the clover operator
  else dirac.CloverInv(out, in, parity);

  // Upload the result into a wrapper around the host output pointer.
  cpuParam.v = h_out;
  cpuParam.location = inv_param->output_location;
  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
  *out_h = out;

  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
    double cpu = blas::norm2(*out_h);
    double gpu = blas::norm2(out);
    printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
  }

  /*for (int i=0; i<in_h->Volume(); i++) {
    ((cpuColorSpinorField*)out_h)->PrintVector(i);
  }*/

  delete out_h;
  delete in_h;

  popVerbosity();
}
2357 
// Compute eigenpairs of the Dirac operator selected by eig_param (M, g5M,
// Mdag, MdagM, or MMdag depending on use_norm_op/use_dagger/compute_gamma5),
// returning converged eigenvectors in host_evecs and eigenvalues in
// host_evals. Dispatches either to the native GPU eigensolver or to ARPACK
// when eig_param->arpack_check is set.
// NOTE(review): a few interior lines were elided by the extraction (marked
// below) — in particular the declaration of `pc_solve` and of `cudaGauge`.
void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param)
{
  profileEigensolve.TPSTART(QUDA_PROFILE_TOTAL);
  profileEigensolve.TPSTART(QUDA_PROFILE_INIT);

  // Transfer the inv param structure contained in eig_param
  QudaInvertParam *inv_param = eig_param->invert_param;

  // [elided lines: presumably a dslash-type condition guarding this call]
  setKernelPackT(true);

  if (!initialized) errorQuda("QUDA not initialized");

  // [elided line: presumably pushVerbosity(...) — matches popVerbosity() below]
  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
    // [elided line: presumably printQudaInvertParam(inv_param)]
    printQudaEigParam(eig_param);
  }

  checkInvertParam(inv_param);
  checkEigParam(eig_param);

  // [elided lines: presumably `cudaGaugeField *cudaGauge = checkGauge(...)`
  //  and the `pc_solve` definition used below — verify against upstream]

  // Reset cumulative timing/flop/iteration counters for this solve.
  inv_param->secs = 0;
  inv_param->gflops = 0;
  inv_param->iter = 0;

  // Define problem matrix
  //------------------------------------------------------
  Dirac *d = nullptr;
  Dirac *dSloppy = nullptr;
  Dirac *dPre = nullptr;

  // Create the dirac operator with a sloppy and a precon.
  createDirac(d, dSloppy, dPre, *inv_param, pc_solve);
  Dirac &dirac = *d;

  // Create device side ColorSpinorField vector space and to pass to the
  // compute function.
  const int *X = cudaGauge->X();
  ColorSpinorParam cpuParam(host_evecs[0], *inv_param, X, inv_param->solution_type, inv_param->input_location);

  // create wrappers around application vector set
  std::vector<ColorSpinorField *> host_evecs_;
  for (int i = 0; i < eig_param->n_conv; i++) {
    cpuParam.v = host_evecs[i];
    host_evecs_.push_back(ColorSpinorField::Create(cpuParam));
  }

  ColorSpinorParam cudaParam(cpuParam);
  cudaParam.location = QUDA_CUDA_FIELD_LOCATION;
  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
  // Ensure device vectors are in UKQCD basis for Wilson type fermions
  if (cudaParam.nSpin != 1) cudaParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;

  std::vector<Complex> evals(eig_param->n_conv, 0.0);
  std::vector<ColorSpinorField *> kSpace;
  for (int i = 0; i < eig_param->n_conv; i++) { kSpace.push_back(ColorSpinorField::Create(cudaParam)); }

  // If you attempt to compute part of the imaginary spectrum of a symmetric matrix,
  // the solver will fail.
  // [elided lines: the tail of this compound condition was dropped by the
  //  extraction]
  if ((eig_param->spectrum == QUDA_SPECTRUM_LI_EIG || eig_param->spectrum == QUDA_SPECTRUM_SI_EIG)
      && ((eig_param->use_norm_op || (inv_param->dslash_type == QUDA_LAPLACE_DSLASH))
    errorQuda("Cannot compute imaginary spectra with a hermitian operator");
  }

  // Gamma5 pre-multiplication is only supported for the M type operator
  if (eig_param->compute_gamma5) {
    if (eig_param->use_norm_op || eig_param->use_dagger) {
      errorQuda("gamma5 premultiplication is only supported for M type operators: dag = %s, normop = %s",
                eig_param->use_dagger ? "true" : "false", eig_param->use_norm_op ? "true" : "false");
    }
  }

  profileEigensolve.TPSTOP(QUDA_PROFILE_INIT);

  // Dispatch on the requested operator variant; each branch either runs the
  // ARPACK cross-check path or the native QUDA eigensolver.
  if (!eig_param->use_norm_op && !eig_param->use_dagger && eig_param->compute_gamma5) {
    DiracG5M m(dirac);
    if (eig_param->arpack_check) {
      arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
    } else {
      EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
      (*eig_solve)(kSpace, evals);
      delete eig_solve;
    }
  } else if (!eig_param->use_norm_op && !eig_param->use_dagger && !eig_param->compute_gamma5) {
    DiracM m(dirac);
    if (eig_param->arpack_check) {
      arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
    } else {
      EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
      (*eig_solve)(kSpace, evals);
      delete eig_solve;
    }
  } else if (!eig_param->use_norm_op && eig_param->use_dagger) {
    DiracMdag m(dirac);
    if (eig_param->arpack_check) {
      arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
    } else {
      EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
      (*eig_solve)(kSpace, evals);
      delete eig_solve;
    }
  } else if (eig_param->use_norm_op && !eig_param->use_dagger) {
    DiracMdagM m(dirac);
    if (eig_param->arpack_check) {
      arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
    } else {
      EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
      (*eig_solve)(kSpace, evals);
      delete eig_solve;
    }
  } else if (eig_param->use_norm_op && eig_param->use_dagger) {
    DiracMMdag m(dirac);
    if (eig_param->arpack_check) {
      arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
    } else {
      EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
      (*eig_solve)(kSpace, evals);
      delete eig_solve;
    }
  } else {
    errorQuda("Invalid use_norm_op and dagger combination");
  }

  // Copy eigen values back
  for (int i = 0; i < eig_param->n_conv; i++) { memcpy(host_evals + i, &evals[i], sizeof(Complex)); }

  // Transfer Eigenpairs back to host if using GPU eigensolver. The copy
  // will automatically rotate from device UKQCD gamma basis to the
  // host side gamma basis.
  if (!(eig_param->arpack_check)) {
    profileEigensolve.TPSTART(QUDA_PROFILE_D2H);
    for (int i = 0; i < eig_param->n_conv; i++) *host_evecs_[i] = *kSpace[i];
    profileEigensolve.TPSTOP(QUDA_PROFILE_D2H);
  }

  // Free host wrappers, Dirac operators and device Krylov space.
  profileEigensolve.TPSTART(QUDA_PROFILE_FREE);
  for (int i = 0; i < eig_param->n_conv; i++) delete host_evecs_[i];
  delete d;
  delete dSloppy;
  delete dPre;
  for (int i = 0; i < eig_param->n_conv; i++) delete kSpace[i];
  profileEigensolve.TPSTOP(QUDA_PROFILE_FREE);

  popVerbosity();

  // cache is written out even if a long benchmarking job gets interrupted
  saveTuneCache();

  profileEigensolve.TPSTOP(QUDA_PROFILE_TOTAL);
}
2517 
// NOTE(review): the constructor signature (multigrid_solver taking a
// QudaMultigridParam& and a TimeProfile&) was elided by the extraction —
// verify against upstream. This constructor validates the MG parameters,
// builds the three fine-grid Dirac operators (residual, smoother, sloppy
// smoother), allocates the near-null vector set B, and constructs the MG
// preconditioner hierarchy.
  : profile(profile) {
  profile.TPSTART(QUDA_PROFILE_INIT);
  QudaInvertParam *param = mg_param.invert_param;
  // set whether we are going to use native or generic blas
  blas_lapack::set_native(param->native_blas_lapack);

  checkMultigridParam(&mg_param);
  // [elided lines: presumably the checkGauge(...) call producing `cudaGauge`
  //  used further below]

  // check MG params (needs to go somewhere else)
  if (mg_param.n_level > QUDA_MAX_MG_LEVEL)
    errorQuda("Requested MG levels %d greater than allowed maximum %d", mg_param.n_level, QUDA_MAX_MG_LEVEL);
  for (int i=0; i<mg_param.n_level; i++) {
    // [elided line: the per-level smoother-solve-type condition guarding this
    //  error was dropped by the extraction]
    errorQuda("Unsupported smoother solve type %d on level %d", mg_param.smoother_solve_type[i], i);
  }
  if (param->solve_type != QUDA_DIRECT_SOLVE)
    errorQuda("Outer MG solver can only use QUDA_DIRECT_SOLVE at present");

  // Reset cumulative counters for this setup.
  mg_param.secs = 0;
  mg_param.gflops = 0;

  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
    (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);

  bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
    (param->solve_type == QUDA_NORMOP_PC_SOLVE);

  // create the dirac operators for the fine grid

  // this is the Dirac operator we use for inter-grid residual computation
  DiracParam diracParam;
  setDiracSloppyParam(diracParam, param, outer_pc_solve);
  d = Dirac::create(diracParam);
  m = new DiracM(*d);

  // this is the Dirac operator we use for smoothing
  DiracParam diracSmoothParam;
  bool fine_grid_pc_solve = (mg_param.smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE) ||
    (mg_param.smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE);
  setDiracSloppyParam(diracSmoothParam, param, fine_grid_pc_solve);
  diracSmoothParam.halo_precision = mg_param.smoother_halo_precision[0];
  dSmooth = Dirac::create(diracSmoothParam);
  mSmooth = new DiracM(*dSmooth);

  // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this)
  DiracParam diracSmoothSloppyParam;
  setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve,
                   mg_param.smoother_schwarz_type[0] == QUDA_INVALID_SCHWARZ ? true : false);
  diracSmoothSloppyParam.halo_precision = mg_param.smoother_halo_precision[0];

  dSmoothSloppy = Dirac::create(diracSmoothSloppyParam);
  // [elided line: presumably `mSmoothSloppy = new DiracM(*dSmoothSloppy);` —
  //  mSmoothSloppy is consumed by the MGParam constructed below; verify]

  if (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION)
    errorQuda("MG setup location %d disabled", mg_param.setup_location[0]);
  ColorSpinorParam csParam(nullptr, *param, cudaGauge->X(), pc_solution, mg_param.setup_location[0]);
  // [elided line: presumably csParam.create = QUDA_NULL_FIELD_CREATE]
  // Null-space vectors must be at least single precision on the host.
  QudaPrecision Bprec = mg_param.precision_null[0];
  Bprec = (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION && Bprec < QUDA_SINGLE_PRECISION ? QUDA_SINGLE_PRECISION : Bprec);
  csParam.setPrecision(Bprec, Bprec, true);
  B.resize(mg_param.n_vec[0]);

  if (mg_param.transfer_type[0] == QUDA_TRANSFER_COARSE_KD || mg_param.transfer_type[0] == QUDA_TRANSFER_OPTIMIZED_KD) {
    // Create the ColorSpinorField as a "container" for metadata.

    // These never get accessed, `nullptr` on its own leads to an error in texture binding
    csParam.v = (void *)std::numeric_limits<uint64_t>::max();
    csParam.norm = (void *)std::numeric_limits<uint64_t>::max();
  }

  for (int i = 0; i < mg_param.n_vec[0]; i++) { B[i] = ColorSpinorField::Create(csParam); }

  // fill out the MG parameters for the fine level
  mgParam = new MGParam(mg_param, B, m, mSmooth, mSmoothSloppy);

  mg = new MG(*mgParam, profile);

  // cache is written out even if a long benchmarking job gets interrupted
  saveTuneCache();
  profile.TPSTOP(QUDA_PROFILE_INIT);
}
2606 
// NOTE(review): the function signature on the elided line above was dropped by
// the extraction — this is the public entry point that allocates a
// multigrid_solver and returns it as an opaque handle (newMultigridQuda;
// verify). The handle is later consumed by updateMultigridQuda /
// dumpMultigridQuda and released by destroyMultigridQuda.
  profilerStart(__func__);

  pushVerbosity(mg_param->invert_param->verbosity);

  // The whole MG setup is timed under the invert profile.
  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
  auto *mg = new multigrid_solver(*mg_param, profileInvert);
  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);

  saveTuneCache();

  popVerbosity();

  profilerStop(__func__);
  return static_cast<void*>(mg);
}
2623 
2624 void destroyMultigridQuda(void *mg) {
2625  delete static_cast<multigrid_solver*>(mg);
2626 }
2627 
// Refresh an existing multigrid preconditioner after the gauge (and possibly
// clover) fields have changed. A "thin" update only repoints the fine-grid
// Dirac operators at the new resident fields; a full update rebuilds the
// fine-grid operators and resets the MG hierarchy.
// NOTE(review): one continuation line of an updateFields(...) call was elided
// by the extraction (marked below).
void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
{
  profilerStart(__func__);

  pushVerbosity(mg_param->invert_param->verbosity);

  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
  profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE);

  auto *mg = static_cast<multigrid_solver*>(mg_);
  checkMultigridParam(mg_param);

  QudaInvertParam *param = mg_param->invert_param;
  // check the gauge fields have been created and set the precision as needed
  checkGauge(param);

  // for reporting level 1 is the fine level but internally use level 0 for indexing
  // sprintf(mg->prefix,"MG level 1 (%s): ", param.location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU" );
  // setOutputPrefix(prefix);
  setOutputPrefix("MG level 1 (GPU): "); //fix me

  // Check if we're doing a thin update only
  if (mg_param->thin_update_only) {
    // FIXME: add support for updating kappa, mu as appropriate

    // FIXME: assumes gauge parameters haven't changed.
    // These routines will set gauge = gaugeFat for DiracImprovedStaggered
    mg->d->updateFields(gaugeSloppy, gaugeFatSloppy, gaugeLongSloppy, cloverSloppy);
    mg->d->setMass(param->mass);

    mg->dSmooth->updateFields(gaugeSloppy, gaugeFatSloppy, gaugeLongSloppy, cloverSloppy);
    mg->dSmooth->setMass(param->mass);

    if (mg->dSmoothSloppy != mg->dSmooth) {
      if (param->overlap) {
        mg->dSmoothSloppy->updateFields(gaugeExtended, gaugeFatExtended, gaugeLongExtended, cloverPrecondition);
      } else {
        // [elided line: the final argument of this call (presumably
        //  cloverPrecondition);) was dropped by the extraction]
        mg->dSmoothSloppy->updateFields(gaugePrecondition, gaugeFatPrecondition, gaugeLongPrecondition,
      }
      mg->dSmoothSloppy->setMass(param->mass);
    }
    // The above changes are propagated internally by use of references, pointers, etc, so
    // no further updates are needed.

  } else {

    bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);

    // free the previous dirac operators
    if (mg->m) delete mg->m;
    if (mg->mSmooth) delete mg->mSmooth;
    if (mg->mSmoothSloppy) delete mg->mSmoothSloppy;

    if (mg->d) delete mg->d;
    if (mg->dSmooth) delete mg->dSmooth;
    if (mg->dSmoothSloppy && mg->dSmoothSloppy != mg->dSmooth) delete mg->dSmoothSloppy;

    // create new fine dirac operators

    // this is the Dirac operator we use for inter-grid residual computation
    DiracParam diracParam;
    setDiracSloppyParam(diracParam, param, outer_pc_solve);
    mg->d = Dirac::create(diracParam);
    mg->m = new DiracM(*(mg->d));

    // this is the Dirac operator we use for smoothing
    DiracParam diracSmoothParam;
    bool fine_grid_pc_solve = (mg_param->smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE)
      || (mg_param->smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE);
    setDiracSloppyParam(diracSmoothParam, param, fine_grid_pc_solve);
    mg->dSmooth = Dirac::create(diracSmoothParam);
    mg->mSmooth = new DiracM(*(mg->dSmooth));

    // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this)
    DiracParam diracSmoothSloppyParam;
    setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve, true);
    mg->dSmoothSloppy = Dirac::create(diracSmoothSloppyParam);
    ; // (stray empty statement in the original — harmless)
    mg->mSmoothSloppy = new DiracM(*(mg->dSmoothSloppy));

    // Repoint the MG parameter struct at the freshly built operators.
    mg->mgParam->matResidual = mg->m;
    mg->mgParam->matSmooth = mg->mSmooth;
    mg->mgParam->matSmoothSloppy = mg->mSmoothSloppy;

    mg->mgParam->updateInvertParam(*param);
    if (mg->mgParam->mg_global.invert_param != param) mg->mgParam->mg_global.invert_param = param;

    // Rebuild the hierarchy (refresh=true re-generates coarse operators).
    bool refresh = true;
    mg->mg->reset(refresh);
  }

  setOutputPrefix("");

  // cache is written out even if a long benchmarking job gets interrupted
  saveTuneCache();

  profileInvert.TPSTOP(QUDA_PROFILE_PREAMBLE);
  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);

  popVerbosity();

  profilerStop(__func__);
}
2732 
2733 void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
2734 {
2735  profilerStart(__func__);
2736  pushVerbosity(mg_param->invert_param->verbosity);
2737  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
2738 
2739  auto *mg = static_cast<multigrid_solver*>(mg_);
2740  checkMultigridParam(mg_param);
2741  checkGauge(mg_param->invert_param);
2742 
2743  mg->mg->dumpNullVectors();
2744 
2745  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
2746  popVerbosity();
2747  profilerStop(__func__);
2748 }
2749 
// NOTE(review): the constructor signature (deflated_solver taking a
// QudaEigParam& and a TimeProfile&) was elided by the extraction — verify
// against upstream. Builds the deflation space for (incremental) eigCG:
// Dirac operator, composite Ritz-vector field RV, and the Deflation object.
  : d(nullptr), m(nullptr), RV(nullptr), deflParam(nullptr), defl(nullptr), profile(profile) {

  QudaInvertParam *param = eig_param.invert_param;

  // Deflation is only meaningful for the (incremental) eigCG inverters.
  if (param->inv_type != QUDA_EIGCG_INVERTER && param->inv_type != QUDA_INC_EIGCG_INVERTER) return;

  profile.TPSTART(QUDA_PROFILE_INIT);

  // [elided line: presumably the checkGauge(...) call producing `cudaGauge`
  //  used below — verify]
  eig_param.secs = 0;
  eig_param.gflops = 0;

  // Build the operator at Ritz precision: full-precision Dirac if it matches
  // the solve precision, otherwise the sloppy variant.
  DiracParam diracParam;
  if(eig_param.cuda_prec_ritz == param->cuda_prec)
  {
    setDiracParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE));
  } else {
    setDiracSloppyParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE));
  }

  const bool pc_solve = (param->solve_type == QUDA_NORMOP_PC_SOLVE);

  d = Dirac::create(diracParam);
  // Normal-op solves deflate MdagM; direct solves deflate M itself.
  m = pc_solve ? static_cast<DiracMatrix*>( new DiracMdagM(*d) ) : static_cast<DiracMatrix*>( new DiracM(*d));

  // Composite field holding all Ritz vectors (n_ev per deflation-grid step).
  ColorSpinorParam ritzParam(nullptr, *param, cudaGauge->X(), pc_solve, eig_param.location);

  ritzParam.create = QUDA_ZERO_FIELD_CREATE;
  ritzParam.is_composite = true;
  ritzParam.is_component = false;
  ritzParam.composite_dim = param->n_ev * param->deflation_grid;
  ritzParam.setPrecision(param->cuda_prec_ritz);

  if (ritzParam.location==QUDA_CUDA_FIELD_LOCATION) {
    ritzParam.setPrecision(param->cuda_prec_ritz, param->cuda_prec_ritz, true); // set native field order
    if (ritzParam.nSpin != 1) ritzParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;

    //select memory location here, by default ritz vectors will be allocated on the device
    //but if not sufficient device memory, then the user may choose mapped type of memory
    ritzParam.mem_type = eig_param.mem_type_ritz;
  } else { //host location
    ritzParam.mem_type = QUDA_MEMORY_PINNED;
  }

  int ritzVolume = 1;
  for(int d = 0; d < ritzParam.nDim; d++) ritzVolume *= ritzParam.x[d];

  if (getVerbosity() == QUDA_DEBUG_VERBOSE) {

    // Rough footprint of the Ritz-vector allocation, for debugging OOMs.
    size_t byte_estimate = (size_t)ritzParam.composite_dim*(size_t)ritzVolume*(ritzParam.nColor*ritzParam.nSpin*ritzParam.Precision());
    printfQuda("allocating bytes: %lu (lattice volume %d, prec %d)", byte_estimate, ritzVolume, ritzParam.Precision());
    if(ritzParam.mem_type == QUDA_MEMORY_DEVICE) printfQuda("Using device memory type.\n");
    else if (ritzParam.mem_type == QUDA_MEMORY_MAPPED)
      printfQuda("Using mapped memory type.\n");
  }

  RV = ColorSpinorField::Create(ritzParam);

  deflParam = new DeflationParam(eig_param, RV, *m);

  defl = new Deflation(*deflParam, profile);

  profile.TPSTOP(QUDA_PROFILE_INIT);
}
2815 
// Allocate a deflated_solver (eigCG deflation space) and return it as an
// opaque handle for later use/destruction via destroyDeflationQuda().
// NOTE(review): one line between the allocation and the TPSTOP below was
// dropped by the extraction (likely blank) — verify against upstream.
void* newDeflationQuda(QudaEigParam *eig_param) {
  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
#ifdef MAGMA_LIB
  // MAGMA backend must be opened before the solver constructor runs.
  openMagma();
#endif
  auto *defl = new deflated_solver(*eig_param, profileInvert);

  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);

  saveProfile(__func__);
  flushProfile();
  return static_cast<void*>(defl);
}
2829 
2830 void destroyDeflationQuda(void *df) {
2831 #ifdef MAGMA_LIB
2832  closeMagma();
2833 #endif
2834  delete static_cast<deflated_solver*>(df);
2835 }
2836 
2837 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
2838 {
2839  profilerStart(__func__);
2840 
2841  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH || param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH
2842  || param->dslash_type == QUDA_MOBIUS_DWF_DSLASH || param->dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH)
2843  setKernelPackT(true);
2844 
2845  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
2846 
2847  if (!initialized) errorQuda("QUDA not initialized");
2848 
2849  pushVerbosity(param->verbosity);
2851 
2852  checkInvertParam(param, hp_x, hp_b);
2853 
2854  // check the gauge fields have been created
2856 
2857  // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
2858  // solve_type and solution_type, rather than in separate members of QudaInvertParam. We're stuck with it
2859  // for now, though, so here we factorize everything for convenience.
2860 
2861  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
2862  (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
2863  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2864  (param->solve_type == QUDA_NORMOP_PC_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2865  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||
2866  (param->solution_type == QUDA_MATPC_SOLUTION);
2867  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||
2868  (param->solve_type == QUDA_DIRECT_PC_SOLVE);
2869  bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||
2870  (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2871 
2872  param->secs = 0;
2873  param->gflops = 0;
2874  param->iter = 0;
2875 
2876  Dirac *d = nullptr;
2877  Dirac *dSloppy = nullptr;
2878  Dirac *dPre = nullptr;
2879  Dirac *dEig = nullptr;
2880 
2881  // Create the dirac operator and operators for sloppy, precondition,
2882  // and an eigensolver
2883  createDiracWithEig(d, dSloppy, dPre, dEig, *param, pc_solve);
2884 
2885  Dirac &dirac = *d;
2886  Dirac &diracSloppy = *dSloppy;
2887  Dirac &diracPre = *dPre;
2888  Dirac &diracEig = *dEig;
2889 
2890  profileInvert.TPSTART(QUDA_PROFILE_H2D);
2891 
2892  ColorSpinorField *b = nullptr;
2893  ColorSpinorField *x = nullptr;
2894  ColorSpinorField *in = nullptr;
2895  ColorSpinorField *out = nullptr;
2896 
2897  const int *X = cudaGauge->X();
2898 
2899  // wrap CPU host side pointers
2900  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution, param->input_location);
2901  ColorSpinorField *h_b = ColorSpinorField::Create(cpuParam);
2902 
2903  cpuParam.v = hp_x;
2904  cpuParam.location = param->output_location;
2905  ColorSpinorField *h_x = ColorSpinorField::Create(cpuParam);
2906 
2907  // download source
2908  ColorSpinorParam cudaParam(cpuParam, *param);
2909  cudaParam.create = QUDA_COPY_FIELD_CREATE;
2910  b = new cudaColorSpinorField(*h_b, cudaParam);
2911 
2912  // now check if we need to invalidate the solutionResident vectors
2913  bool invalidate = false;
2914  if (param->use_resident_solution == 1) {
2915  for (auto v : solutionResident)
2916  if (b->Precision() != v->Precision() || b->SiteSubset() != v->SiteSubset()) { invalidate = true; break; }
2917 
2918  if (invalidate) {
2919  for (auto v : solutionResident) if (v) delete v;
2920  solutionResident.clear();
2921  }
2922 
2923  if (!solutionResident.size()) {
2924  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2925  solutionResident.push_back(new cudaColorSpinorField(cudaParam)); // solution
2926  }
2927  x = solutionResident[0];
2928  } else {
2929  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2930  x = new cudaColorSpinorField(cudaParam);
2931  }
2932 
2933  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
2934  // initial guess only supported for single-pass solvers
2935  if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) &&
2936  (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
2937  errorQuda("Initial guess not supported for two-pass solver");
2938  }
2939 
2940  *x = *h_x; // solution
2941  } else { // zero initial guess
2942  blas::zero(*x);
2943  }
2944 
2945  // if we're doing a managed memory MG solve and prefetching is
2946  // enabled, prefetch all the Dirac matrices. There's probably
2947  // a better place to put this...
2948  if (param->inv_type_precondition == QUDA_MG_INVERTER) {
2950  diracSloppy.prefetch(QUDA_CUDA_FIELD_LOCATION);
2952  }
2953 
2954  profileInvert.TPSTOP(QUDA_PROFILE_H2D);
2955  profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE);
2956 
2957  double nb = blas::norm2(*b);
2958  if (nb==0.0) errorQuda("Source has zero norm");
2959 
2960  if (getVerbosity() >= QUDA_VERBOSE) {
2961  double nh_b = blas::norm2(*h_b);
2962  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2963  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) {
2964  double nh_x = blas::norm2(*h_x);
2965  double nx = blas::norm2(*x);
2966  printfQuda("Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
2967  }
2968  }
2969 
2970  // rescale the source and solution vectors to help prevent the onset of underflow
2971  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
2972  blas::ax(1.0/sqrt(nb), *b);
2973  blas::ax(1.0/sqrt(nb), *x);
2974  }
2975 
2976  massRescale(*static_cast<cudaColorSpinorField *>(b), *param, false);
2977 
2978  dirac.prepare(in, out, *x, *b, param->solution_type);
2979 
2980  if (getVerbosity() >= QUDA_VERBOSE) {
2981  double nin = blas::norm2(*in);
2982  double nout = blas::norm2(*out);
2983  printfQuda("Prepared source = %g\n", nin);
2984  printfQuda("Prepared solution = %g\n", nout);
2985  }
2986 
2987  if (getVerbosity() >= QUDA_VERBOSE) {
2988  double nin = blas::norm2(*in);
2989  printfQuda("Prepared source post mass rescale = %g\n", nin);
2990  }
2991 
2992  // solution_type specifies *what* system is to be solved.
2993  // solve_type specifies *how* the system is to be solved.
2994  //
2995  // We have the following four cases (plus preconditioned variants):
2996  //
2997  // solution_type solve_type Effect
2998  // ------------- ---------- ------
2999  // MAT DIRECT Solve Ax=b
3000  // MATDAG_MAT DIRECT Solve A^dag y = b, followed by Ax=y
3001  // MAT NORMOP Solve (A^dag A) x = (A^dag b)
3002  // MATDAG_MAT NORMOP Solve (A^dag A) x = b
3003  // MAT NORMERR Solve (A A^dag) y = b, then x = A^dag y
3004  //
3005  // We generally require that the solution_type and solve_type
3006  // preconditioning match. As an exception, the unpreconditioned MAT
3007  // solution_type may be used with any solve_type, including
3008  // DIRECT_PC and NORMOP_PC. In these cases, preparation of the
3009  // preconditioned source and reconstruction of the full solution are
3010  // taken care of by Dirac::prepare() and Dirac::reconstruct(),
3011  // respectively.
3012 
3013  if (pc_solution && !pc_solve) {
3014  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
3015  }
3016 
3017  if (!mat_solution && !pc_solution && pc_solve) {
3018  errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
3019  }
3020 
3021  if (!mat_solution && norm_error_solve) {
3022  errorQuda("Normal-error solve requires Mat solution");
3023  }
3024 
3025  if (param->inv_type_precondition == QUDA_MG_INVERTER && (!direct_solve || !mat_solution)) {
3026  errorQuda("Multigrid preconditioning only supported for direct solves");
3027  }
3028 
3029  if (param->chrono_use_resident && ( norm_error_solve) ){
3030  errorQuda("Chronological forcasting only presently supported for M^dagger M solver");
3031  }
3032 
3033  profileInvert.TPSTOP(QUDA_PROFILE_PREAMBLE);
3034 
3035  if (mat_solution && !direct_solve && !norm_error_solve) { // prepare source: b' = A^dag b
3037  dirac.Mdag(*in, tmp);
3038  } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b
3039  DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre), mEig(diracEig);
3040  SolverParam solverParam(*param);
3041  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, mEig, profileInvert);
3042  (*solve)(*out, *in);
3043  blas::copy(*in, *out);
3044  delete solve;
3045  solverParam.updateInvertParam(*param);
3046  }
3047 
3048  if (direct_solve) {
3049  DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre), mEig(diracEig);
3050  SolverParam solverParam(*param);
3051  // chronological forecasting
3052  if (param->chrono_use_resident && chronoResident[param->chrono_index].size() > 0) {
3053  profileInvert.TPSTART(QUDA_PROFILE_CHRONO);
3054 
3055  auto &basis = chronoResident[param->chrono_index];
3056 
3057  ColorSpinorParam cs_param(*basis[0]);
3059  ColorSpinorField *tmp2 = (param->chrono_precision == out->Precision()) ? out : ColorSpinorField::Create(cs_param);
3060  std::vector<ColorSpinorField*> Ap;
3061  for (unsigned int k=0; k < basis.size(); k++) {
3062  Ap.emplace_back((ColorSpinorField::Create(cs_param)));
3063  }
3064 
3065  if (param->chrono_precision == param->cuda_prec) {
3066  for (unsigned int j=0; j<basis.size(); j++) m(*Ap[j], *basis[j], *tmp, *tmp2);
3067  } else if (param->chrono_precision == param->cuda_prec_sloppy) {
3068  for (unsigned int j=0; j<basis.size(); j++) mSloppy(*Ap[j], *basis[j], *tmp, *tmp2);
3069  } else {
3070  errorQuda("Unexpected precision %d for chrono vectors (doesn't match outer %d or sloppy precision %d)",
3071  param->chrono_precision, param->cuda_prec, param->cuda_prec_sloppy);
3072  }
3073 
3074  bool orthogonal = true;
3075  bool apply_mat = false;
3076  bool hermitian = false;
3077  MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert);
3078 
3079  blas::copy(*tmp, *in);
3080  mre(*out, *tmp, basis, Ap);
3081 
3082  for (auto ap: Ap) {
3083  if (ap) delete (ap);
3084  }
3085  delete tmp;
3086  if (tmp2 != out) delete tmp2;
3087 
3088  profileInvert.TPSTOP(QUDA_PROFILE_CHRONO);
3089  }
3090 
3091  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, mEig, profileInvert);
3092  (*solve)(*out, *in);
3093  delete solve;
3094  solverParam.updateInvertParam(*param);
3095  } else if (!norm_error_solve) {
3096  DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre), mEig(diracEig);
3097  SolverParam solverParam(*param);
3098 
3099  // chronological forecasting
3100  if (param->chrono_use_resident && chronoResident[param->chrono_index].size() > 0) {
3101  profileInvert.TPSTART(QUDA_PROFILE_CHRONO);
3102 
3103  auto &basis = chronoResident[param->chrono_index];
3104 
3105  ColorSpinorParam cs_param(*basis[0]);
3106  std::vector<ColorSpinorField*> Ap;
3108  ColorSpinorField *tmp2 = (param->chrono_precision == out->Precision()) ? out : ColorSpinorField::Create(cs_param);
3109  for (unsigned int k=0; k < basis.size(); k++) {
3110  Ap.emplace_back((ColorSpinorField::Create(cs_param)));
3111  }
3112 
3113  if (param->chrono_precision == param->cuda_prec) {
3114  for (unsigned int j=0; j<basis.size(); j++) m(*Ap[j], *basis[j], *tmp, *tmp2);
3115  } else if (param->chrono_precision == param->cuda_prec_sloppy) {
3116  for (unsigned int j=0; j<basis.size(); j++) mSloppy(*Ap[j], *basis[j], *tmp, *tmp2);
3117  } else {
3118  errorQuda("Unexpected precision %d for chrono vectors (doesn't match outer %d or sloppy precision %d)",
3119  param->chrono_precision, param->cuda_prec, param->cuda_prec_sloppy);
3120  }
3121 
3122  bool orthogonal = true;
3123  bool apply_mat = false;
3124  bool hermitian = true;
3125  MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert);
3126 
3127  blas::copy(*tmp, *in);
3128  mre(*out, *tmp, basis, Ap);
3129 
3130  for (auto ap: Ap) {
3131  if (ap) delete(ap);
3132  }
3133  delete tmp;
3134  if (tmp2 != out) delete tmp2;
3135 
3136  profileInvert.TPSTOP(QUDA_PROFILE_CHRONO);
3137  }
3138 
3139  // if using a Schwarz preconditioner with a normal operator then we must use the DiracMdagMLocal operator
3140  if (param->inv_type_precondition != QUDA_INVALID_INVERTER && param->schwarz_type != QUDA_INVALID_SCHWARZ) {
3141  DiracMdagMLocal mPreLocal(diracPre);
3142  Solver *solve = Solver::create(solverParam, m, mSloppy, mPreLocal, mEig, profileInvert);
3143  (*solve)(*out, *in);
3144  delete solve;
3145  solverParam.updateInvertParam(*param);
3146  } else {
3147  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, mEig, profileInvert);
3148  (*solve)(*out, *in);
3149  delete solve;
3150  solverParam.updateInvertParam(*param);
3151  }
3152  } else { // norm_error_solve
3153  DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre), mEig(diracEig);
3154  cudaColorSpinorField tmp(*out);
3155  SolverParam solverParam(*param);
3156  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, mEig, profileInvert);
3157  (*solve)(tmp, *in); // y = (M M^\dag) b
3158  dirac.Mdag(*out, tmp); // x = M^dag y
3159  delete solve;
3160  solverParam.updateInvertParam(*param);
3161  }
3162 
3163  if (getVerbosity() >= QUDA_VERBOSE){
3164  double nx = blas::norm2(*x);
3165  printfQuda("Solution = %g\n",nx);
3166  }
3167 
3168  profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE);
3169  if (param->chrono_make_resident) {
3170  if(param->chrono_max_dim < 1){
3171  errorQuda("Cannot chrono_make_resident with chrono_max_dim %i",param->chrono_max_dim);
3172  }
3173 
3174  const int i = param->chrono_index;
3175  if (i >= QUDA_MAX_CHRONO)
3176  errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO);
3177 
3178  auto &basis = chronoResident[i];
3179 
3180  if(param->chrono_max_dim < (int)basis.size()){
3181  errorQuda("Requested chrono_max_dim %i is smaller than already existing chroology %i",param->chrono_max_dim,(int)basis.size());
3182  }
3183 
3184  if(not param->chrono_replace_last){
3185  // if we have not filled the space yet just augment
3186  if ((int)basis.size() < param->chrono_max_dim) {
3187  ColorSpinorParam cs_param(*out);
3188  cs_param.setPrecision(param->chrono_precision);
3189  basis.emplace_back(ColorSpinorField::Create(cs_param));
3190  }
3191 
3192  // shuffle every entry down one and bring the last to the front
3193  ColorSpinorField *tmp = basis[basis.size()-1];
3194  for (unsigned int j=basis.size()-1; j>0; j--) basis[j] = basis[j-1];
3195  basis[0] = tmp;
3196  }
3197  *(basis[0]) = *out; // set first entry to new solution
3198  }
3199  dirac.reconstruct(*x, *b, param->solution_type);
3200 
3201  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
3202  // rescale the solution
3203  blas::ax(sqrt(nb), *x);
3204  }
3205  profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE);
3206 
3207  if (!param->make_resident_solution) {
3208  profileInvert.TPSTART(QUDA_PROFILE_D2H);
3209  *h_x = *x;
3210  profileInvert.TPSTOP(QUDA_PROFILE_D2H);
3211  }
3212 
3213  profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE);
3214 
3215  if (param->compute_action) {
3216  Complex action = blas::cDotProduct(*b, *x);
3217  param->action[0] = action.real();
3218  param->action[1] = action.imag();
3219  }
3220 
3221  if (getVerbosity() >= QUDA_VERBOSE){
3222  double nx = blas::norm2(*x);
3223  double nh_x = blas::norm2(*h_x);
3224  printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
3225  }
3226  profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE);
3227 
3228  profileInvert.TPSTART(QUDA_PROFILE_FREE);
3229 
3230  delete h_b;
3231  delete h_x;
3232  delete b;
3233 
3234  if (param->use_resident_solution && !param->make_resident_solution) {
3235  for (auto v: solutionResident) if (v) delete v;
3236  solutionResident.clear();
3237  } else if (!param->make_resident_solution) {
3238  delete x;
3239  }
3240 
3241  delete d;
3242  delete dSloppy;
3243  delete dPre;
3244  delete dEig;
3245 
3246  profileInvert.TPSTOP(QUDA_PROFILE_FREE);
3247 
3248  popVerbosity();
3249 
3250  // cache is written out even if a long benchmarking job gets interrupted
3251  saveTuneCache();
3252 
3253  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
3254 
3255  profilerStop(__func__);
3256 }
3257 
// NOTE(review): this doxygen listing elides the signature line (orig. 3258) and
// several conditional lines below (orig. 3261-3263, 3280-3283, 3285-3295,
// 3298-3305).  Based on the call sites in this file
// (loadFatLongGaugeQuda(param, gauge_param, fatlinks, longlinks)) the full
// signature is presumably
//   void loadFatLongGaugeQuda(QudaInvertParam *inv_param, QudaGaugeParam *gauge_param,
//                             void *milc_fatlinks, void *milc_longlinks)
// -- verify against the upstream interface_quda.cpp before relying on this.
3259  void *milc_longlinks)
3260 {
3264 
3265  // Specific gauge parameters for MILC
// In multi-GPU builds, size the pad from the largest checkerboarded
// (hence the /2) face volume of the local lattice; otherwise it stays 0.
3266  int pad_size = 0;
3267 #ifdef MULTI_GPU
3268  int x_face_size = gauge_param->X[1] * gauge_param->X[2] * gauge_param->X[3] / 2;
3269  int y_face_size = gauge_param->X[0] * gauge_param->X[2] * gauge_param->X[3] / 2;
3270  int z_face_size = gauge_param->X[0] * gauge_param->X[1] * gauge_param->X[3] / 2;
3271  int t_face_size = gauge_param->X[0] * gauge_param->X[1] * gauge_param->X[2] / 2;
3272  pad_size = MAX(x_face_size, y_face_size);
3273  pad_size = MAX(pad_size, z_face_size);
3274  pad_size = MAX(pad_size, t_face_size);
3275 #endif
3276 
// The long-link pad is 3x the fat-link pad -- presumably to accommodate the
// three-hop long links of the improved staggered action (TODO confirm).
3277  int fat_pad = pad_size;
3278  int link_pad = 3 * pad_size;
3279 
// NOTE(review): the conditional that selects between QUDA_SU3_LINKS and the
// fat/long link types is elided here (orig. 3280-3295); only fragments remain.
3281  QUDA_SU3_LINKS :
3283 
3284  gauge_param->ga_pad = fat_pad;
3289  } else {
3293  }
3295 
// Load the fat links (this is the plain gauge field in the SU(3)-links case).
3296  loadGaugeQuda(milc_fatlinks, gauge_param);
3297 
// Elided guard (orig. 3298-3305) apparently restricts the long-link load to
// the improved (ASQTAD/HISQ-type) case -- verify against upstream.
3300  gauge_param->ga_pad = link_pad;
3306  loadGaugeQuda(milc_longlinks, gauge_param);
3307  }
3308 }
3309 
// Generic driver behind the multi-source interface functions (the
// invert/dslash x Wilson/staggered/clover variants below).  When
// param->split_grid is trivial (product == 1) it simply applies `op` to each
// of the param->num_src (solution, source) pointer pairs.  Otherwise it:
//   1. wraps the host gauge (or MILC fat/long) and fermion pointers in cpu
//      fields,
//   2. collects ("splits") the gauge, clover and source fields onto each
//      communicator sub-partition,
//   3. pushes the split communicator, reloads gauge/clover under the split
//      topology, and runs `op` once per source per sub-partition,
//   4. joins the solutions back to the full grid and restores the original
//      communicator, gauge and clover fields.
// NOTE(review): this listing elides original source lines 3317-3322 and 3560;
// the gaps are flagged inline below -- verify against upstream.
3310 template <class Interface, class... Args>
3311 void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // color spinor field pointers, and inv_param
3312  void *h_gauge, void *milc_fatlinks, void *milc_longlinks,
3313  QudaGaugeParam *gauge_param, // gauge field pointers
3314  void *h_clover, void *h_clovinv, // clover field pointers
3315  Interface op, Args... args)
3316 {
// NOTE(review): orig. lines 3317-3322 are elided here (likely a doc comment
// or local declarations) -- verify against upstream.
3323  profilerStart(__func__);
3324 
3325  CommKey split_key = {param->split_grid[0], param->split_grid[1], param->split_grid[2], param->split_grid[3]};
3326  int num_sub_partition = quda::product(split_key);
3327 
3328  if (!split_key.is_valid()) {
3329  errorQuda("split_key = [%d,%d,%d,%d] is not valid.\n", split_key[0], split_key[1], split_key[2], split_key[3]);
3330  }
3331 
3332  if (num_sub_partition == 1) { // In this case we don't split the grid.
3333 
3334  for (int n = 0; n < param->num_src; n++) { op(_hp_x[n], _hp_b[n], param, args...); }
3335 
3336  } else {
3337 
3338  profileInvertMultiSrc.TPSTART(QUDA_PROFILE_TOTAL);
3339  profileInvertMultiSrc.TPSTART(QUDA_PROFILE_INIT);
3340 
3341  if (gauge_param == nullptr) { errorQuda("gauge_param == nullptr.\n"); }
3342 
3343  // Doing the sub-partition arithmetic
3344  if (param->num_src_per_sub_partition * num_sub_partition != param->num_src) {
3345  errorQuda("We need to have split_grid[0](=%d) * split_grid[1](=%d) * split_grid[2](=%d) * split_grid[3](=%d) * "
3346  "num_src_per_sub_partition(=%d) == num_src(=%d).",
3347  split_key[0], split_key[1], split_key[2], split_key[3], param->num_src_per_sub_partition, param->num_src);
3348  }
3349 
3350  // Determine if the color spinor field is using a 5d e/o preconditioning
3351  QudaPCType pc_type = QUDA_4D_PC;
3352  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH) { pc_type = QUDA_5D_PC; }
3353 
3354  // Doesn't work for MG yet.
3355  if (param->inv_type_precondition == QUDA_MG_INVERTER) { errorQuda("Split Grid does NOT work with MG yet."); }
3356 
3357  checkInvertParam(param, _hp_x[0], _hp_b[0]);
3358 
// h_gauge vs. milc_fatlinks decides the code path: exactly one of them must
// be non-null (Wilson-type single gauge field vs. staggered fat/long links).
3359  bool is_staggered;
3360  if (h_gauge) {
3361  is_staggered = false;
3362  } else if (milc_fatlinks) {
3363  is_staggered = true;
3364  } else {
3365  errorQuda("Both h_gauge and milc_fatlinks are null.");
3366  is_staggered = true; // to suppress compiler warning/error.
3367  }
3368 
3369  // Gauge fields/params
3370  GaugeFieldParam *gf_param = nullptr;
3371  GaugeField *in = nullptr;
3372  // Staggered gauge fields/params
3373  GaugeFieldParam *milc_fatlink_param = nullptr;
3374  GaugeFieldParam *milc_longlink_param = nullptr;
3375  GaugeField *milc_fatlink_field = nullptr;
3376  GaugeField *milc_longlink_field = nullptr;
3377 
3378  // set up the gauge field params.
3379  if (!is_staggered) { // not staggered
3380  gf_param = new GaugeFieldParam(h_gauge, *gauge_param);
3381  if (gf_param->order <= 4) { gf_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; }
3382  in = GaugeField::Create(*gf_param);
3383  } else { // staggered
3384  milc_fatlink_param = new GaugeFieldParam(milc_fatlinks, *gauge_param);
3385  if (milc_fatlink_param->order <= 4) { milc_fatlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; }
3386  milc_fatlink_field = GaugeField::Create(*milc_fatlink_param);
3387  milc_longlink_param = new GaugeFieldParam(milc_longlinks, *gauge_param);
3388  if (milc_longlink_param->order <= 4) { milc_longlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; }
3389  milc_longlink_field = GaugeField::Create(*milc_longlink_param);
3390  }
3391 
3392  // Create the temp host side helper fields, which are just wrappers of the input pointers.
3393  bool pc_solution
3394  = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
3395 
3396  const int *X = gauge_param->X;
3397  quda::CommKey field_dim = {X[0], X[1], X[2], X[3]};
3398  ColorSpinorParam cpuParam(_hp_b[0], *param, X, pc_solution, param->input_location);
3399  std::vector<ColorSpinorField *> _h_b(param->num_src);
3400  for (int i = 0; i < param->num_src; i++) {
3401  cpuParam.v = _hp_b[i];
3402  _h_b[i] = ColorSpinorField::Create(cpuParam);
3403  }
3404 
3405  cpuParam.location = param->output_location;
3406  std::vector<ColorSpinorField *> _h_x(param->num_src);
3407  for (int i = 0; i < param->num_src; i++) {
3408  cpuParam.v = _hp_x[i];
3409  _h_x[i] = ColorSpinorField::Create(cpuParam);
3410  }
3411 
3412  // Make the gauge param dimensions larger
// Each sub-partition sees a lattice enlarged by split_key[d] in direction d;
// the division is undone again after the solves (see the /= loop below).
3413  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
3414  printfQuda("Spliting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d).\n", comm_dim(0),
3415  comm_dim(1), comm_dim(2), comm_dim(3), split_key[0], split_key[1], split_key[2], split_key[3]);
3416  }
3417  for (int d = 0; d < CommKey::n_dim; d++) {
3418  if (comm_dim(d) % split_key[d] != 0) {
3419  errorQuda("Split not possible: %2d %% %2d != 0.", comm_dim(d), split_key[d]);
3420  }
3421  if (!is_staggered) {
3422  gf_param->x[d] *= split_key[d];
3423  gf_param->pad *= split_key[d];
3424  } else {
3425  milc_fatlink_param->x[d] *= split_key[d];
3426  milc_fatlink_param->pad *= split_key[d];
3427  milc_longlink_param->x[d] *= split_key[d];
3428  milc_longlink_param->pad *= split_key[d];
3429  }
3430  gauge_param->X[d] *= split_key[d];
3431  gauge_param->ga_pad *= split_key[d];
3432  }
3433 
3434  // Deal with clover field. For Multi source computations, clover field construction is done
3435  // exclusively on the GPU.
// NOTE(review): these two checks run for every dslash type, not only the
// clover ones guarded just below -- a staggered multi-source call with zero
// clover parameters would appear to hit the first errorQuda, and the second
// dereferences gaugePrecise unconditionally; verify intent against upstream.
3436  if (param->clover_coeff == 0.0 && param->clover_csw == 0.0) errorQuda("called with neither clover term nor inverse and clover coefficient nor Csw not set");
3437  if (gaugePrecise->Anisotropy() != 1.0) errorQuda("cannot compute anisotropic clover field");
3438 
3439  quda::CloverField *input_clover = nullptr;
3440  quda::CloverField *collected_clover = nullptr;
3441  if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH || param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH
3442  || param->dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) {
3443  if (h_clover || h_clovinv) {
3444  CloverFieldParam clover_param;
3445  clover_param.nDim = 4;
3446  // If clover_coeff is not set manually, then it is the product Csw * kappa.
3447  // If the user has set the clover_coeff manually, that value takes precedent.
3448  clover_param.csw = param->clover_csw;
3449  clover_param.coeff = param->clover_coeff == 0.0 ? param->kappa * param->clover_csw : param->clover_coeff;
3450  // We must also adjust param->clover_coeff here. If a user has set kappa and
3451  // Csw, we must populate param->clover_coeff for them as the computeClover
3452  // routines uses that value
3453  param->clover_coeff = (param->clover_coeff == 0.0 ? param->kappa * param->clover_csw : param->clover_coeff);
3454  clover_param.twisted = param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH;
3455  clover_param.mu2 = clover_param.twisted ? 4.0 * param->kappa * param->kappa * param->mu * param->mu : 0.0;
3456  clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;
3457  for (int d = 0; d < 4; d++) { clover_param.x[d] = field_dim[d]; }
3458  clover_param.pad = param->cl_pad;
3459  clover_param.create = QUDA_REFERENCE_FIELD_CREATE;
3460  clover_param.norm = nullptr;
3461  clover_param.invNorm = nullptr;
3462  clover_param.setPrecision(param->clover_cpu_prec);
3463  clover_param.direct = h_clover ? true : false;
3464  clover_param.inverse = h_clovinv ? true : false;
3465  clover_param.clover = h_clover;
3466  clover_param.cloverInv = h_clovinv;
3467  clover_param.order = param->clover_order;
3468  clover_param.location = param->clover_location;
3469 
3470  input_clover = CloverField::Create(clover_param);
3471 
// Re-use clover_param with enlarged dimensions for the collected field.
3472  for (int d = 0; d < CommKey::n_dim; d++) { clover_param.x[d] *= split_key[d]; }
3473  clover_param.create = QUDA_NULL_FIELD_CREATE;
3474  collected_clover = CloverField::Create(clover_param);
3475 
3476  std::vector<quda::CloverField *> v_c(1);
3477  v_c[0] = input_clover;
3478  quda::split_field(*collected_clover, v_c, split_key); // Clover uses 4d even-odd preconditioning.
3479  }
3480  }
3481 
// Collect the gauge field(s) across sub-partitions into an enlarged cpu field.
3482  quda::GaugeField *collected_gauge = nullptr;
3483  quda::GaugeField *collected_milc_fatlink_field = nullptr;
3484  quda::GaugeField *collected_milc_longlink_field = nullptr;
3485 
3486  if (!is_staggered) {
3487  gf_param->create = QUDA_NULL_FIELD_CREATE;
3488  collected_gauge = new quda::cpuGaugeField(*gf_param);
3489  std::vector<quda::GaugeField *> v_g(1);
3490  v_g[0] = in;
3491  quda::split_field(*collected_gauge, v_g, split_key);
3492  } else {
3493  milc_fatlink_param->create = QUDA_NULL_FIELD_CREATE;
3494  milc_longlink_param->create = QUDA_NULL_FIELD_CREATE;
3495  collected_milc_fatlink_field = new quda::cpuGaugeField(*milc_fatlink_param);
3496  collected_milc_longlink_field = new quda::cpuGaugeField(*milc_longlink_param);
3497  std::vector<quda::GaugeField *> v_g(1);
3498  v_g[0] = milc_fatlink_field;
3499  quda::split_field(*collected_milc_fatlink_field, v_g, split_key);
3500  v_g[0] = milc_longlink_field;
3501  quda::split_field(*collected_milc_longlink_field, v_g, split_key);
3502  }
3503 
3504  profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_INIT);
3505  profileInvertMultiSrc.TPSTART(QUDA_PROFILE_PREAMBLE);
3506 
3507  comm_barrier();
3508 
3509  // Split input fermion field
// Source n of a sub-partition gathers num_sub_partition host sources
// (_h_b[n*num_sub_partition .. (n+1)*num_sub_partition)) into one field.
3510  quda::ColorSpinorParam cpu_cs_param_split(*_h_x[0]);
3511  for (int d = 0; d < CommKey::n_dim; d++) { cpu_cs_param_split.x[d] *= split_key[d]; }
3512  std::vector<quda::ColorSpinorField *> _collect_b(param->num_src_per_sub_partition, nullptr);
3513  std::vector<quda::ColorSpinorField *> _collect_x(param->num_src_per_sub_partition, nullptr);
3514  for (int n = 0; n < param->num_src_per_sub_partition; n++) {
3515  _collect_b[n] = new quda::cpuColorSpinorField(cpu_cs_param_split);
3516  _collect_x[n] = new quda::cpuColorSpinorField(cpu_cs_param_split);
3517  auto first = _h_b.begin() + n * num_sub_partition;
3518  auto last = _h_b.begin() + (n + 1) * num_sub_partition;
3519  std::vector<ColorSpinorField *> _v_b(first, last);
3520  split_field(*_collect_b[n], _v_b, split_key, pc_type);
3521  }
3522  comm_barrier();
3523 
3524  push_communicator(split_key);
3525  updateR();
3526  comm_barrier();
3527 
3528  profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_PREAMBLE);
3529  profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_TOTAL);
3530 
3531  // Load gauge field after pushing the split communicator so the comm buffers, etc are setup according to
3532  // the split topology.
3533  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loading gauge field...\n"); }
3534  if (!is_staggered) {
3535  loadGaugeQuda(collected_gauge->Gauge_p(), gauge_param);
3536  } else {
3537  // freeGaugeQuda();
3538  loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->Gauge_p(),
3539  collected_milc_longlink_field->Gauge_p());
3540  }
3541  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loaded gauge field...\n"); }
3542 
3543  if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH || param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH
3544  || param->dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) {
3545  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loading clover field...\n"); }
3546  if (collected_clover) {
3547  loadCloverQuda(collected_clover->V(false), collected_clover->V(true), param);
3548  } else {
3549  loadCloverQuda(nullptr, nullptr, param);
3550  }
3551  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loaded clover field...\n"); }
3552  }
3553 
// Run the wrapped interface call once per collected source.
3554  for (int n = 0; n < param->num_src_per_sub_partition; n++) {
3555  op(_collect_x[n]->V(), _collect_b[n]->V(), param, args...);
3556  }
3557 
3558  profileInvertMultiSrc.TPSTART(QUDA_PROFILE_TOTAL);
3559  profileInvertMultiSrc.TPSTART(QUDA_PROFILE_EPILOGUE);
// NOTE(review): orig. line 3560 is elided here; given the matching
// push_communicator(split_key)/updateR() pair above, this is presumably where
// the parent communicator is popped/pushed back before updateR() -- verify
// against upstream.
3561  updateR();
3562  comm_barrier();
3563 
// Undo the dimension/pad scaling applied before the split.
3564  for (int d = 0; d < CommKey::n_dim; d++) {
3565  gauge_param->X[d] /= split_key[d];
3566  gauge_param->ga_pad /= split_key[d];
3567  }
3568 
// Scatter each collected solution back into its num_sub_partition host fields.
3569  for (int n = 0; n < param->num_src_per_sub_partition; n++) {
3570  auto first = _h_x.begin() + n * num_sub_partition;
3571  auto last = _h_x.begin() + (n + 1) * num_sub_partition;
3572  std::vector<ColorSpinorField *> _v_x(first, last);
3573  join_field(_v_x, *_collect_x[n], split_key, pc_type);
3574  }
3575 
3576  for (auto p : _collect_b) { delete p; }
3577  for (auto p : _collect_x) { delete p; }
3578 
3579  for (auto p : _h_x) { delete p; }
3580  for (auto p : _h_b) { delete p; }
3581 
3582  if (!is_staggered) {
3583  delete in;
3584  delete collected_gauge;
3585  } else {
3586  delete milc_fatlink_field;
3587  delete milc_longlink_field;
3588  delete collected_milc_fatlink_field;
3589  delete collected_milc_longlink_field;
3590  }
3591 
3592  if (input_clover) { delete input_clover; }
3593  if (collected_clover) { delete collected_clover; }
3594 
3595  profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_EPILOGUE);
3596  profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_TOTAL);
3597 
3598  // Restore the gauge field
3599  if (!is_staggered) {
3600  loadGaugeQuda(h_gauge, gauge_param);
3601  } else {
3602  freeGaugeQuda();
3603  loadFatLongGaugeQuda(param, gauge_param, milc_fatlinks, milc_longlinks);
3604  }
3605 
// NOTE(review): unlike the load path above, this restore check omits
// QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH, so the clover field is not restored
// for that dslash type -- verify whether that is intentional.
3606  if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH || param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
3607  loadCloverQuda(h_clover, h_clovinv, param);
3608  }
3609  }
3610 
3611  profilerStop(__func__);
3612 }
3613 
3614 void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, QudaGaugeParam *gauge_param)
3615 {
3616  auto op = [](void *_x, void *_b, QudaInvertParam *param) { invertQuda(_x, _b, param); };
3617  callMultiSrcQuda(_hp_x, _hp_b, param, h_gauge, nullptr, nullptr, gauge_param, nullptr, nullptr, op);
3618 }
3619 
3620 void invertMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *milc_fatlinks,
3621  void *milc_longlinks, QudaGaugeParam *gauge_param)
3622 {
3623  auto op = [](void *_x, void *_b, QudaInvertParam *param) { invertQuda(_x, _b, param); };
3624  callMultiSrcQuda(_hp_x, _hp_b, param, nullptr, milc_fatlinks, milc_longlinks, gauge_param, nullptr, nullptr, op);
3625 }
3626 
3627 void invertMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge,
3628  QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv)
3629 {
3630  auto op = [](void *_x, void *_b, QudaInvertParam *param) { invertQuda(_x, _b, param); };
3631  callMultiSrcQuda(_hp_x, _hp_b, param, h_gauge, nullptr, nullptr, gauge_param, h_clover, h_clovinv, op);
3632 }
3633 
3634 void dslashMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge,
3636 {
3637  auto op = [](void *_x, void *_b, QudaInvertParam *param, QudaParity parity) { dslashQuda(_x, _b, param, parity); };
3638  callMultiSrcQuda(_hp_x, _hp_b, param, h_gauge, nullptr, nullptr, gauge_param, nullptr, nullptr, op, parity);
3639 }
3640 
3642  void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param)
3643 {
3644  auto op = [](void *_x, void *_b, QudaInvertParam *param, QudaParity parity) { dslashQuda(_x, _b, param, parity); };
3645  callMultiSrcQuda(_hp_x, _hp_b, param, nullptr, milc_fatlinks, milc_longlinks, gauge_param, nullptr, nullptr, op,
3646  parity);
3647 }
3648 
3649 void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge,
3650  QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv)
3651 {
3652  auto op = [](void *_x, void *_b, QudaInvertParam *param, QudaParity parity) { dslashQuda(_x, _b, param, parity); };
3653  callMultiSrcQuda(_hp_x, _hp_b, param, h_gauge, nullptr, nullptr, gauge_param, h_clover, h_clovinv, op, parity);
3654 }
3655 
3668 void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
3669 {
3670  profilerStart(__func__);
3671 
3672  profileMulti.TPSTART(QUDA_PROFILE_TOTAL);
3673  profileMulti.TPSTART(QUDA_PROFILE_INIT);
3674 
3675  if (!initialized) errorQuda("QUDA not initialized");
3676 
3677  checkInvertParam(param, _hp_x[0], _hp_b);
3678 
3679  // check the gauge fields have been created
3680  checkGauge(param);
3681 
3682  if (param->num_offset > QUDA_MAX_MULTI_SHIFT)
3683  errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d", param->num_offset,
3685 
3686  pushVerbosity(param->verbosity);
3687 
3688  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
3689  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);
3690  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION);
3691  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE);
3692 
3693  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3694  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3695 
3696  if (param->solution_type != QUDA_MATPC_SOLUTION) {
3697  errorQuda("For Staggered-type fermions, multi-shift solver only suports MATPC solution type");
3698  }
3699 
3700  if (param->solve_type != QUDA_DIRECT_PC_SOLVE) {
3701  errorQuda("For Staggered-type fermions, multi-shift solver only supports DIRECT_PC solve types");
3702  }
3703 
3704  } else { // Wilson type
3705 
3706  if (mat_solution) {
3707  errorQuda("For Wilson-type fermions, multi-shift solver does not support MAT or MATPC solution types");
3708  }
3709  if (direct_solve) {
3710  errorQuda("For Wilson-type fermions, multi-shift solver does not support DIRECT or DIRECT_PC solve types");
3711  }
3712  if (pc_solution & !pc_solve) {
3713  errorQuda("For Wilson-type fermions, preconditioned (PC) solution_type requires a PC solve_type");
3714  }
3715  if (!pc_solution & pc_solve) {
3716  errorQuda("For Wilson-type fermions, in multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
3717  }
3718  }
3719 
3720  // Timing and FLOP counters
3721  param->secs = 0;
3722  param->gflops = 0;
3723  param->iter = 0;
3724 
3725  for (int i=0; i<param->num_offset-1; i++) {
3726  for (int j=i+1; j<param->num_offset; j++) {
3727  if (param->offset[i] > param->offset[j])
3728  errorQuda("Offsets must be ordered from smallest to largest");
3729  }
3730  }
3731 
3732  // Host pointers for x, take a copy of the input host pointers
3733  void** hp_x;
3734  hp_x = new void* [ param->num_offset ];
3735 
3736  void* hp_b = _hp_b;
3737  for(int i=0;i < param->num_offset;i++){
3738  hp_x[i] = _hp_x[i];
3739  }
3740 
3741  // Create the matrix.
3742  // The way this works is that createDirac will create 'd' and 'dSloppy'
3743  // which are global. We then grab these with references...
3744  //
3745  // Balint: Isn't there a nice construction pattern we could use here? This is
3746  // expedient but yucky.
3747  // DiracParam diracParam;
3748  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3749  param->dslash_type == QUDA_STAGGERED_DSLASH){
3750  param->mass = sqrt(param->offset[0]/4);
3751  }
3752 
3753  Dirac *d = nullptr;
3754  Dirac *dSloppy = nullptr;
3755  Dirac *dPre = nullptr;
3756  Dirac *dRefine = nullptr;
3757 
3758  // Create the dirac operator and a sloppy, precon, and refine.
3759  createDiracWithRefine(d, dSloppy, dPre, dRefine, *param, pc_solve);
3760  Dirac &dirac = *d;
3761  Dirac &diracSloppy = *dSloppy;
3762 
3763 
3764  cudaColorSpinorField *b = nullptr; // Cuda RHS
3765  std::vector<ColorSpinorField*> x; // Cuda Solutions
3766  x.resize(param->num_offset);
3767  std::vector<ColorSpinorField*> p;
3768  std::unique_ptr<double[]> r2_old(new double[param->num_offset]);
3769 
3770  // Grab the dimension array of the input gauge field.
3771  const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ?
3772  gaugeFatPrecise->X() : gaugePrecise->X();
3773 
3774  // This creates a ColorSpinorParam struct, from the host data
3775  // pointer, the definitions in param, the dimensions X, and whether
3776  // the solution is on a checkerboard instruction or not. These can
3777  // then be used as 'instructions' to create the actual
3778  // ColorSpinorField
3779  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution, param->input_location);
3780  ColorSpinorField *h_b = ColorSpinorField::Create(cpuParam);
3781 
3782  std::vector<ColorSpinorField*> h_x;
3783  h_x.resize(param->num_offset);
3784 
3785  cpuParam.location = param->output_location;
3786  for(int i=0; i < param->num_offset; i++) {
3787  cpuParam.v = hp_x[i];
3788  h_x[i] = ColorSpinorField::Create(cpuParam);
3789  }
3790 
3791  profileMulti.TPSTOP(QUDA_PROFILE_INIT);
3792  profileMulti.TPSTART(QUDA_PROFILE_H2D);
3793  // Now I need a colorSpinorParam for the device
3794  ColorSpinorParam cudaParam(cpuParam, *param);
3795  // This setting will download a host vector
3796  cudaParam.create = QUDA_COPY_FIELD_CREATE;
3797  cudaParam.location = QUDA_CUDA_FIELD_LOCATION;
3798  b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it
3799  profileMulti.TPSTOP(QUDA_PROFILE_H2D);
3800 
3801  profileMulti.TPSTART(QUDA_PROFILE_INIT);
3802  // Create the solution fields filled with zero
3803  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
3804 
3805  // now check if we need to invalidate the solutionResident vectors
3806  bool invalidate = false;
3807  for (auto v : solutionResident) {
3808  if (cudaParam.Precision() != v->Precision()) {
3809  invalidate = true;
3810  break;
3811  }
3812  }
3813 
3814  if (invalidate) {
3815  for (auto v : solutionResident) delete v;
3816  solutionResident.clear();
3817  }
3818 
3819  // grow resident solutions to be big enough
3820  for (int i=solutionResident.size(); i < param->num_offset; i++) {
3821  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Adding vector %d to solutionsResident\n", i);
3822  solutionResident.push_back(new cudaColorSpinorField(cudaParam));
3823  }
3824  for (int i=0; i < param->num_offset; i++) x[i] = solutionResident[i];
3825  profileMulti.TPSTOP(QUDA_PROFILE_INIT);
3826 
3827  profileMulti.TPSTART(QUDA_PROFILE_PREAMBLE);
3828 
3829  // Check source norms
3830  double nb = blas::norm2(*b);
3831  if (nb==0.0) errorQuda("Source has zero norm");
3832 
3833  if(getVerbosity() >= QUDA_VERBOSE ) {
3834  double nh_b = blas::norm2(*h_b);
3835  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
3836  }
3837 
3838  // rescale the source vector to help prevent the onset of underflow
3839  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
3840  blas::ax(1.0/sqrt(nb), *b);
3841  }
3842 
3843  // backup shifts
3844  double unscaled_shifts[QUDA_MAX_MULTI_SHIFT];
3845  for (int i = 0; i < param->num_offset; i++) { unscaled_shifts[i] = param->offset[i]; }
3846 
3847  // rescale
3848  massRescale(*b, *param, true);
3849  profileMulti.TPSTOP(QUDA_PROFILE_PREAMBLE);
3850 
3851  DiracMatrix *m, *mSloppy;
3852 
3853  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3854  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3855  m = new DiracM(dirac);
3856  mSloppy = new DiracM(diracSloppy);
3857  } else {
3858  m = new DiracMdagM(dirac);
3859  mSloppy = new DiracMdagM(diracSloppy);
3860  }
3861 
3862  SolverParam solverParam(*param);
3863  {
3864  MultiShiftCG cg_m(*m, *mSloppy, solverParam, profileMulti);
3865  cg_m(x, *b, p, r2_old.get());
3866  }
3867  solverParam.updateInvertParam(*param);
3868 
3869  delete m;
3870  delete mSloppy;
3871 
3872  if (param->compute_true_res) {
3873  // check each shift has the desired tolerance and use sequential CG to refine
3874  profileMulti.TPSTART(QUDA_PROFILE_INIT);
3875  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
3876  cudaColorSpinorField r(*b, cudaParam);
3877  profileMulti.TPSTOP(QUDA_PROFILE_INIT);
3878  QudaInvertParam refineparam = *param;
3880  Dirac &dirac = *d;
3881  Dirac &diracSloppy = *dRefine;
3882 
3883 #define REFINE_INCREASING_MASS
3884 #ifdef REFINE_INCREASING_MASS
3885  for(int i=0; i < param->num_offset; i++) {
3886 #else
3887  for(int i=param->num_offset-1; i >= 0; i--) {
3888 #endif
3889  double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
3890  param->true_res_hq_offset[i] : 0;
3891  double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
3892  param->tol_hq_offset[i] : 0;
3893 
3894  /*
3895  In the case where the shifted systems have zero tolerance
3896  specified, we refine these systems until either the limit of
3897  precision is reached (prec_tol) or until the tolerance reaches
3898  the iterated residual tolerance of the previous multi-shift
3899  solver (iter_res_offset[i]), which ever is greater.
3900  */
3901  const double prec_tol = std::pow(10.,(-2*(int)param->cuda_prec+4)); // implicit refinment limit of 1e-12
3902  const double iter_tol = (param->iter_res_offset[i] < prec_tol ? prec_tol : (param->iter_res_offset[i] *1.1));
3903  const double refine_tol = (param->tol_offset[i] == 0.0 ? iter_tol : param->tol_offset[i]);
3904  // refine if either L2 or heavy quark residual tolerances have not been met, only if desired residual is > 0
3905  if (param->true_res_offset[i] > refine_tol || rsd_hq > tol_hq) {
3906  if (getVerbosity() >= QUDA_SUMMARIZE)
3907  printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
3908  i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);
3909 
3910  // for staggered the shift is just a change in mass term (FIXME: for twisted mass also)
3911  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3912  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3913  dirac.setMass(sqrt(param->offset[i]/4));
3914  diracSloppy.setMass(sqrt(param->offset[i]/4));
3915  }
3916 
3917  DiracMatrix *m, *mSloppy;
3918 
3919  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3920  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3921  m = new DiracM(dirac);
3922  mSloppy = new DiracM(diracSloppy);
3923  } else {
3924  m = new DiracMdagM(dirac);
3925  mSloppy = new DiracMdagM(diracSloppy);
3926  }
3927 
3928  // need to curry in the shift if we are not doing staggered
3929  if (param->dslash_type != QUDA_ASQTAD_DSLASH && param->dslash_type != QUDA_STAGGERED_DSLASH) {
3930  m->shift = param->offset[i];
3931  mSloppy->shift = param->offset[i];
3932  }
3933 
3934  if (false) { // experimenting with Minimum residual extrapolation
3935  // only perform MRE using current and previously refined solutions
3936 #ifdef REFINE_INCREASING_MASS
3937  const int nRefine = i+1;
3938 #else
3939  const int nRefine = param->num_offset - i + 1;
3940 #endif
3941 
3942  std::vector<ColorSpinorField *> q;
3943  q.resize(nRefine);
3944  std::vector<ColorSpinorField *> z;
3945  z.resize(nRefine);
3946  cudaParam.create = QUDA_NULL_FIELD_CREATE;
3947  cudaColorSpinorField tmp(cudaParam);
3948 
3949  for (int j = 0; j < nRefine; j++) {
3950  q[j] = new cudaColorSpinorField(cudaParam);
3951  z[j] = new cudaColorSpinorField(cudaParam);
3952  }
3953 
3954  *z[0] = *x[0]; // zero solution already solved
3955 #ifdef REFINE_INCREASING_MASS
3956  for (int j=1; j<nRefine; j++) *z[j] = *x[j];
3957 #else
3958  for (int j=1; j<nRefine; j++) *z[j] = *x[param->num_offset-j];
3959 #endif
3960 
3961  bool orthogonal = true;
3962  bool apply_mat = true;
3963  bool hermitian = true;
3964  MinResExt mre(*m, orthogonal, apply_mat, hermitian, profileMulti);
3965  blas::copy(tmp, *b);
3966  mre(*x[i], tmp, z, q);
3967 
3968  for(int j=0; j < nRefine; j++) {
3969  delete q[j];
3970  delete z[j];
3971  }
3972  }
3973 
3974  SolverParam solverParam(refineparam);
3975  solverParam.iter = 0;
3977  solverParam.tol = (param->tol_offset[i] > 0.0 ? param->tol_offset[i] : iter_tol); // set L2 tolerance
3978  solverParam.tol_hq = param->tol_hq_offset[i]; // set heavy quark tolerance
3979  solverParam.delta = param->reliable_delta_refinement;
3980 
3981  {
3982  CG cg(*m, *mSloppy, *mSloppy, *mSloppy, solverParam, profileMulti);
3983  if (i==0)
3984  cg(*x[i], *b, p[i], r2_old[i]);
3985  else
3986  cg(*x[i], *b);
3987  }
3988 
3989  solverParam.true_res_offset[i] = solverParam.true_res;
3990  solverParam.true_res_hq_offset[i] = solverParam.true_res_hq;
3991  solverParam.updateInvertParam(*param,i);
3992 
3993  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3994  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3995  dirac.setMass(sqrt(param->offset[0]/4)); // restore just in case
3996  diracSloppy.setMass(sqrt(param->offset[0]/4)); // restore just in case
3997  }
3998 
3999  delete m;
4000  delete mSloppy;
4001  }
4002  }
4003  }
4004 
4005  // restore shifts
4006  for(int i=0; i < param->num_offset; i++) {
4007  param->offset[i] = unscaled_shifts[i];
4008  }
4009 
4010  profileMulti.TPSTART(QUDA_PROFILE_D2H);
4011 
4012  if (param->compute_action) {
4013  Complex action(0);
4014  for (int i=0; i<param->num_offset; i++) action += param->residue[i] * blas::cDotProduct(*b, *x[i]);
4015  param->action[0] = action.real();
4016  param->action[1] = action.imag();
4017  }
4018 
4019  for(int i=0; i < param->num_offset; i++) {
4020  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) { // rescale the solution
4021  blas::ax(sqrt(nb), *x[i]);
4022  }
4023 
4024  if (getVerbosity() >= QUDA_VERBOSE){
4025  double nx = blas::norm2(*x[i]);
4026  printfQuda("Solution %d = %g\n", i, nx);
4027  }
4028 
4029  if (!param->make_resident_solution) *h_x[i] = *x[i];
4030  }
4031  profileMulti.TPSTOP(QUDA_PROFILE_D2H);
4032 
4033  profileMulti.TPSTART(QUDA_PROFILE_EPILOGUE);
4034 
4035  if (!param->make_resident_solution) {
4036  for (auto v: solutionResident) if (v) delete v;
4037  solutionResident.clear();
4038  }
4039 
4040  profileMulti.TPSTOP(QUDA_PROFILE_EPILOGUE);
4041 
4042  profileMulti.TPSTART(QUDA_PROFILE_FREE);
4043  for(int i=0; i < param->num_offset; i++){
4044  delete h_x[i];
4045  //if (!param->make_resident_solution) delete x[i];
4046  }
4047 
4048  delete h_b;
4049  delete b;
4050 
4051  delete [] hp_x;
4052 
4053  delete d;
4054  delete dSloppy;
4055  delete dPre;
4056  delete dRefine;
4057  for (auto& pp : p) delete pp;
4058 
4059  profileMulti.TPSTOP(QUDA_PROFILE_FREE);
4060 
4061  popVerbosity();
4062 
4063  // cache is written out even if a long benchmarking job gets interrupted
4064  saveTuneCache();
4065 
4066  profileMulti.TPSTOP(QUDA_PROFILE_TOTAL);
4067 
4068  profilerStop(__func__);
4069 }
4070 
// Compute HISQ/ASQTAD link products from the input gauge field `inlink` using
// the coefficients in `path_coeff`: the fat links (saved to host `fatlink`
// when non-null), the long links (computed only when `longlink` is non-null),
// and the unitarized links (computed only when `ulink` is non-null).
// NOTE(review): this chunk is a lossy doxygen extraction -- original lines
// 4079, 4085, 4090-4092, 4103-4107 and 4126 are missing from this view,
// including the declarations of `gParam` and `cudaFatLink` used below;
// confirm against the upstream source before editing.
4071 void computeKSLinkQuda(void* fatlink, void* longlink, void* ulink, void* inlink, double *path_coeff, QudaGaugeParam *param)
4072 {
4073 #ifdef GPU_FATLINK
4074  profileFatLink.TPSTART(QUDA_PROFILE_TOTAL);
4075  profileFatLink.TPSTART(QUDA_PROFILE_INIT);
4076 
4077  checkGaugeParam(param);
4078 
4080  cpuGaugeField cpuFatLink(gParam); // create the host fatlink
4081  gParam.gauge = longlink;
4082  cpuGaugeField cpuLongLink(gParam); // create the host longlink
4083  gParam.gauge = ulink;
4084  cpuGaugeField cpuUnitarizedLink(gParam);
4086  gParam.gauge = inlink;
4087  cpuGaugeField cpuInLink(gParam); // create the host sitelink
4088 
4089  // create the device fields
4093  cudaGaugeField *cudaInLink = new cudaGaugeField(gParam);
4094  profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
4095 
// download the input links, then build an extended (halo-padded) copy for
// the stencil computations; the unextended copy is no longer needed
4096  cudaInLink->loadCPUField(cpuInLink, profileFatLink);
4097  cudaGaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink);
4098 
4099  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
4100  delete cudaInLink;
4101  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
4102 
4108 
// long links are optional -- only computed and saved when requested
4109  if (longlink) {
4110  profileFatLink.TPSTART(QUDA_PROFILE_INIT);
4111  cudaGaugeField *cudaLongLink = new cudaGaugeField(gParam);
4112  profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
4113 
4114  profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
4115  longKSLink(cudaLongLink, *cudaInLinkEx, path_coeff);
4116  profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
4117 
4118  cudaLongLink->saveCPUField(cpuLongLink, profileFatLink);
4119 
4120  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
4121  delete cudaLongLink;
4122  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
4123  }
4124 
4125  profileFatLink.TPSTART(QUDA_PROFILE_INIT);
4127  profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
4128 
// NOTE(review): `cudaFatLink` is presumably allocated on the missing line
// 4126 above -- it is always computed since `ulink` depends on it below
4129  profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
4130  fatKSLink(cudaFatLink, *cudaInLinkEx, path_coeff);
4131  profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
4132 
4133  if (fatlink) cudaFatLink->saveCPUField(cpuFatLink, profileFatLink);
4134 
4135  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
4136  delete cudaInLinkEx;
4137  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
4138 
// unitarized links: project the fat links back onto (near-)U(3) on the GPU,
// with SVD fallback enabled; failure count is checked via the mapped
// num_failures_h / num_failures_d pair
4139  if (ulink) {
4140  const double unitarize_eps = 1e-14;
4141  const double max_error = 1e-10;
4142  const int reunit_allow_svd = 1;
4143  const int reunit_svd_only = 0;
4144  const double svd_rel_error = 1e-6;
4145  const double svd_abs_error = 1e-6;
4146  quda::setUnitarizeLinksConstants(unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only,
4147  svd_rel_error, svd_abs_error);
4148 
4149  cudaGaugeField *cudaUnitarizedLink = new cudaGaugeField(gParam);
4150 
4151  profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
4152  *num_failures_h = 0;
4153  quda::unitarizeLinks(*cudaUnitarizedLink, *cudaFatLink, num_failures_d); // unitarize on the gpu
4154  if (*num_failures_h > 0) errorQuda("Error in unitarization component of the hisq fattening: %d failures", *num_failures_h);
4155  profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
4156 
4157  cudaUnitarizedLink->saveCPUField(cpuUnitarizedLink, profileFatLink);
4158 
4159  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
4160  delete cudaUnitarizedLink;
4161  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
4162  }
4163 
4164  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
4165  delete cudaFatLink;
4166  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
4167 
4168  profileFatLink.TPSTOP(QUDA_PROFILE_TOTAL);
4169 #else
4170  errorQuda("Fat-link has not been built");
4171 #endif // GPU_FATLINK
4172 }
4173 
// NOTE(review): the enclosing function signature (original line 4174) is
// missing from this extraction; judging by the body it computes a gauge-field
// pad length from a parameter struct `param` -- TODO confirm upstream.
// Under MULTI_GPU the pad is half the largest 3d face volume of the local
// lattice (checkerboarded, hence the /2); otherwise no padding is needed.
4175  int pad = 0;
4176 #ifdef MULTI_GPU
4177  int volume = param.x[0]*param.x[1]*param.x[2]*param.x[3];
4178  int face_size[4];
4179  for(int dir=0; dir<4; ++dir) face_size[dir] = (volume/param.x[dir])/2;
4180  pad = *std::max_element(face_size, face_size+4);
4181 #endif
4182 
4183  return pad;
4184 }
4185 
// Compute the gauge force for the Wilson-loop paths in `input_path_buf`
// (lengths `path_length`, weights `loop_coeff`) and accumulate it into the
// momentum field with step-size factor `eb3`; gauge/momentum fields are
// taken from host pointers or from the resident device fields per the
// use_resident_* / make_resident_* / return_result_mom flags.
// NOTE(review): lossy doxygen extraction -- original lines 4206, 4246,
// 4253, 4261-4263, 4272 and 4295-4296 are missing from this view (e.g. the
// host->device momentum copy and the `force` field allocation).
4186 int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int* path_length,
4187  double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam)
4188 {
4189 #ifdef GPU_GAUGE_FORCE
4190  profileGaugeForce.TPSTART(QUDA_PROFILE_TOTAL);
4191  profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);
4192 
4193  checkGaugeParam(qudaGaugeParam);
4194 
// host site-link wrapper is only needed when we are not reusing the
// resident device gauge field
4195  GaugeFieldParam gParam(siteLink, *qudaGaugeParam);
4196  gParam.site_offset = qudaGaugeParam->gauge_offset;
4197  gParam.site_size = qudaGaugeParam->site_size;
4198  cpuGaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new cpuGaugeField(gParam) : nullptr;
4199 
4200  cudaGaugeField* cudaSiteLink = nullptr;
4201 
4202  if (qudaGaugeParam->use_resident_gauge) {
4203  if (!gaugePrecise) errorQuda("No resident gauge field to use");
4204  cudaSiteLink = gaugePrecise;
4205  } else {
4207  gParam.reconstruct = qudaGaugeParam->reconstruct;
4208  gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
4209 
4210  cudaSiteLink = new cudaGaugeField(gParam);
4211  profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
4212 
4213  profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
4214  cudaSiteLink->loadCPUField(*cpuSiteLink);
4215  profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
4216 
4217  profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);
4218  }
4219 
// TIFR orders store the momentum as a full matrix; otherwise use the
// compressed anti-hermitian (reconstruct-10) representation
4220  GaugeFieldParam gParamMom(mom, *qudaGaugeParam, QUDA_ASQTAD_MOM_LINKS);
4221  if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER)
4222  gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
4223  else
4224  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
4225 
4226  gParamMom.site_offset = qudaGaugeParam->mom_offset;
4227  gParamMom.site_size = qudaGaugeParam->site_size;
4228  cpuGaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? new cpuGaugeField(gParamMom) : nullptr;
4229 
4230  cudaGaugeField* cudaMom = nullptr;
4231  if (qudaGaugeParam->use_resident_mom) {
4232  if (!momResident) errorQuda("No resident momentum field to use");
4233  cudaMom = momResident;
4234  if (qudaGaugeParam->overwrite_mom) cudaMom->zero();
4235  profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
4236  } else {
// NOTE(review): gParamMom.create is assigned twice (conditional on line 4237,
// then unconditionally QUDA_ZERO_FIELD_CREATE on line 4241); the second
// assignment wins, so the device momentum is always zero-initialized here
4237  gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
4238  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
4239  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
4240  gParamMom.setPrecision(qudaGaugeParam->cuda_prec, true);
4241  gParamMom.create = QUDA_ZERO_FIELD_CREATE;
4242  cudaMom = new cudaGaugeField(gParamMom);
4243  profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
4244  if (!qudaGaugeParam->overwrite_mom) {
4245  profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
4247  profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
4248  }
4249  }
4250 
4251  cudaGaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugeForce);
4252  // apply / remove phase as appropriate
4254 
4255  // actually do the computation
4256  profileGaugeForce.TPSTART(QUDA_PROFILE_COMPUTE);
4257  if (!forceMonitor()) {
4258  gaugeForce(*cudaMom, *cudaGauge, eb3, input_path_buf, path_length, loop_coeff, num_paths, max_length);
4259  } else {
4260  // if we are monitoring the force, separate the force computation from the momentum update
4264  gaugeForce(*force, *cudaGauge, 1.0, input_path_buf, path_length, loop_coeff, num_paths, max_length);
4265  updateMomentum(*cudaMom, eb3, *force, "gauge");
4266  delete force;
4267  }
4268  profileGaugeForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4269 
4270  if (qudaGaugeParam->return_result_mom) {
4271  profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
4273  profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
4274  }
4275 
// ownership hand-off: either promote the device fields to the resident
// globals (freeing any previous occupant) or free them
4276  profileGaugeForce.TPSTART(QUDA_PROFILE_FREE);
4277  if (qudaGaugeParam->make_resident_gauge) {
4278  if (gaugePrecise && gaugePrecise != cudaSiteLink) delete gaugePrecise;
4279  gaugePrecise = cudaSiteLink;
4280  } else {
4281  delete cudaSiteLink;
4282  }
4283 
4284  if (qudaGaugeParam->make_resident_mom) {
4285  if (momResident && momResident != cudaMom) delete momResident;
4286  momResident = cudaMom;
4287  } else {
4288  delete cudaMom;
4289  }
4290 
4291  if (cpuSiteLink) delete cpuSiteLink;
4292  if (cpuMom) delete cpuMom;
4293 
4294  if (qudaGaugeParam->make_resident_gauge) {
4297  } else {
4298  delete cudaGauge;
4299  }
4300  profileGaugeForce.TPSTOP(QUDA_PROFILE_FREE);
4301 
4302  profileGaugeForce.TPSTOP(QUDA_PROFILE_TOTAL);
4303 
4304 #else
4305  errorQuda("Gauge force has not been built");
4306 #endif // GPU_GAUGE_FORCE
4307  return 0;
4308 }
4309 
// NOTE(review): the function signature (original line 4310) is missing from
// this extraction. Judging by the body -- a host momentum wrapper `cpuMom`,
// the resident `momResident` device field, and the make_resident_mom /
// return_result_mom branches -- this is the entry point that downloads a
// host momentum field to the device (make_resident_mom) or uploads the
// resident momentum back to the host and frees it (return_result_mom).
// Confirm the exact name and parameters against the upstream source.
4311 {
4312  profileGaugeForce.TPSTART(QUDA_PROFILE_TOTAL);
4313  profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);
4314 
4315  checkGaugeParam(param);
4316 
4317  GaugeFieldParam gParamMom(mom, *param, QUDA_ASQTAD_MOM_LINKS);
4318  if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER)
4319  gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
4320  else
4321  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
4322  gParamMom.site_offset = param->mom_offset;
4323  gParamMom.site_size = param->site_size;
4324 
4325  cpuGaugeField cpuMom(gParamMom);
4326 
// free any previously resident momentum before allocating the new one
4328  if (momResident) delete momResident;
4329 
// NOTE(review): `create` is assigned twice (NULL on 4330, ZERO on 4334);
// the latter wins, so the new resident field is zero-initialized
4330  gParamMom.create = QUDA_NULL_FIELD_CREATE;
4331  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
4332  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
4333  gParamMom.setPrecision(param->cuda_prec, true);
4334  gParamMom.create = QUDA_ZERO_FIELD_CREATE;
4335  momResident = new cudaGaugeField(gParamMom);
4336  } else if (param->return_result_mom && !param->make_resident_mom) {
4337  if (!momResident) errorQuda("No resident momentum to return");
4338  } else {
4339  errorQuda("Unexpected combination make_resident_mom = %d return_result_mom = %d", param->make_resident_mom,
4341  }
4342 
4343  profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
4344 
4345  if (param->make_resident_mom) {
4346  // we are downloading the momentum from the host
4347  profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
4349  profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
4350  } else if (param->return_result_mom) {
4351  // we are uploading the momentum to the host
4352  profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
4354  profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
4355 
4356  profileGaugeForce.TPSTART(QUDA_PROFILE_FREE);
4357  delete momResident;
4358  momResident = nullptr;
4359  profileGaugeForce.TPSTOP(QUDA_PROFILE_FREE);
4360  }
4361 
4362  profileGaugeForce.TPSTOP(QUDA_PROFILE_TOTAL);
4363 }
4364 
// NOTE(review): the function signature (original line 4365) is missing from
// this extraction; the body computes the clover field from the resident
// precise gauge field (presumably createCloverQuda(QudaInvertParam*) --
// confirm upstream): build an extended gauge field, compute the field
// strength Fmunu from it, then fill cloverPrecise using the clover
// coefficient from the invert params.
4366 {
4367  profileClover.TPSTART(QUDA_PROFILE_TOTAL);
4368  if (!cloverPrecise) errorQuda("Clover field not allocated");
4369 
4371  // for clover we optimize to only send depth 1 halos in y/z/t (FIXME - make work for x, make robust in general)
4372  int R[4];
// note: this local R shadows the file-scope R; halo depth is zero in
// directions that are not partitioned (unless redundant_comms is set)
4373  for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d));
4374  cudaGaugeField *gauge = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profileClover, false, recon);
4375 
4376  profileClover.TPSTART(QUDA_PROFILE_INIT);
4377  // create the Fmunu field
4379  tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
4380  tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4381  tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
4382  cudaGaugeField Fmunu(tensorParam);
4383  profileClover.TPSTOP(QUDA_PROFILE_INIT);
4384  profileClover.TPSTART(QUDA_PROFILE_COMPUTE);
4385  computeFmunu(Fmunu, *gauge);
4386  computeClover(*cloverPrecise, Fmunu, invertParam->clover_coeff);
4387  profileClover.TPSTOP(QUDA_PROFILE_COMPUTE);
4388  profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
4389 
4390  // FIXME always preserve the extended gauge
4391  extendedGaugeResident = gauge;
4392 }
4393 
// Allocate a device gauge field of the given geometry (scalar or vector
// only) and, when a host pointer `gauge` is supplied, upload its contents.
// Returns the device field as an opaque pointer (to be released with
// destroyGaugeFieldQuda).
// NOTE(review): lossy extraction -- original lines 4395-4396 (the gParam
// declaration), 4403-4405 and 4409 (the host->device copy) are missing.
4394 void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
4395 {
4397  gParam.geometry = static_cast<QudaFieldGeometry>(geometry);
4398  if (geometry != QUDA_SCALAR_GEOMETRY && geometry != QUDA_VECTOR_GEOMETRY)
4399  errorQuda("Only scalar and vector geometries are supported\n");
4400 
4401  cpuGaugeField *cpuGauge = nullptr;
4402  if (gauge) cpuGauge = new cpuGaugeField(gParam);
4403 
4406  auto* cudaGauge = new cudaGaugeField(gParam);
4407 
4408  if (gauge) {
4410  delete cpuGauge;
4411  }
4412 
4413  return cudaGauge;
4414 }
4415 
4416 
// Copy a device gauge field (opaque pointer `inGauge`, as returned by
// createGaugeFieldQuda) back to the host buffer `gauge`.
// NOTE(review): lossy extraction -- the body lines performing the actual
// device->host copy (original 4420-4426) are missing; only the cast of the
// opaque handle survives in this view.
4417 void saveGaugeFieldQuda(void* gauge, void* inGauge, QudaGaugeParam* param){
4418 
4419  auto* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);
4420 
4423 
4426 
4427 }
4428 
4429 
4430 void destroyGaugeFieldQuda(void* gauge){
4431  auto* g = reinterpret_cast<cudaGaugeField*>(gauge);
4432  delete g;
4433 }
4434 
4435 
// Compute the staggered fermion force from the resident solution vectors
// and accumulate it into the momentum field with coefficient dt*delta.
// For each of the inv_param->num_offset vectors the odd parity is
// reconstructed via Dslash, then the quark-field outer product is summed
// into the force field with weight inv_param->residue[i].
// NOTE(review): lossy doxygen extraction -- many original lines are missing
// from this view (e.g. 4437 of the signature, 4442-4460 with the gParam /
// cpuMom / cudaMom / cudaForce declarations, 4481, 4486, 4490, 4493, 4498-
// 4500, 4510, 4519, 4531, 4563, 4565, 4570-4572, 4575); confirm upstream
// before editing.
4436 void computeStaggeredForceQuda(void* h_mom, double dt, double delta, void *, void **x,
4438 {
4439  profileStaggeredForce.TPSTART(QUDA_PROFILE_TOTAL);
4440  profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT);
4441 
4443 
4444  // create the host momentum field
4448 
4449  // create the device momentum field
4455 
4456  // create temporary field for quark-field outer product
4461  GaugeField *cudaForce_[2] = {&cudaForce};
4462 
// staggered quark fields: single spin, three colors; fifth dimension of
// extent 1 is used for the multi-rhs layout
4463  ColorSpinorParam qParam;
4465  qParam.nColor = 3;
4466  qParam.nSpin = 1;
4469  qParam.nDim = 5; // 5 since staggered mrhs
4470  qParam.setPrecision(gParam.Precision());
4471  qParam.pad = 0;
4472  for(int dir=0; dir<4; ++dir) qParam.x[dir] = gParam.x[dir];
4473  qParam.x[4] = 1;
4474  qParam.create = QUDA_NULL_FIELD_CREATE;
4477 
4478  profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT);
4479  profileStaggeredForce.TPSTART(QUDA_PROFILE_H2D);
4480 
4482  if (!momResident) errorQuda("Cannot use resident momentum field since none appears resident");
4483  cudaMom = momResident;
4484  } else {
4485  // download the initial momentum (FIXME make an option just to return?)
4487  }
4488 
4489  // resident gauge field is required
4491  errorQuda("Resident gauge field is required");
4492 
4494  errorQuda("Gauge field requires the staggered phase factors to be applied");
4495  }
4496 
4497  // check if staggered phase is the desired one
4499  errorQuda("Requested staggered phase %d, but found %d\n",
4501  }
4502 
4503  profileStaggeredForce.TPSTOP(QUDA_PROFILE_H2D);
4504  profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT);
4505 
4506  const int nvector = inv_param->num_offset;
4507  std::vector<ColorSpinorField*> X(nvector);
4508  for ( int i=0; i<nvector; i++) X[i] = ColorSpinorField::Create(qParam);
4509 
4511  if (solutionResident.size() < (unsigned int)nvector)
4512  errorQuda("solutionResident.size() %lu does not match number of shifts %d",
4513  solutionResident.size(), nvector);
4514  }
4515 
4516  // create the staggered operator
4517  DiracParam diracParam;
4518  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
4520  if (!pc_solve)
4521  errorQuda("Preconditioned solve type required not %d\n", inv_param->solve_type);
4522  setDiracParam(diracParam, inv_param, pc_solve);
4523  Dirac *dirac = Dirac::create(diracParam);
4524 
4525  profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT);
4526  profileStaggeredForce.TPSTART(QUDA_PROFILE_PREAMBLE);
4527 
// reconstruct the full solution: even parity comes from the resident
// solution (copy on the missing line 4531), odd parity via Dslash
4528  for (int i=0; i<nvector; i++) {
4529  ColorSpinorField &x = *(X[i]);
4530 
4532  else errorQuda("%s requires resident solution", __func__);
4533 
4534  // set the odd solution component
4535  dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY);
4536  }
4537 
4538  profileStaggeredForce.TPSTOP(QUDA_PROFILE_PREAMBLE);
4539  profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE);
4540 
4541 #if 0
4543  for (auto v : solutionResident) if (v) delete solutionResident[i];
4544  solutionResident.clear();
4545  }
4546 #endif
4547  delete dirac;
4548 
4549  profileStaggeredForce.TPSTOP(QUDA_PROFILE_FREE);
4550  profileStaggeredForce.TPSTART(QUDA_PROFILE_COMPUTE);
4551 
4552  // compute quark-field outer product
4553  for (int i=0; i<nvector; i++) {
4554  ColorSpinorField &x = *(X[i]);
4555  // second component is zero since we have no three hop term
4556  double coeff[2] = {inv_param->residue[i], 0.0};
4557 
4558  // Operate on even-parity sites
4559  computeStaggeredOprod(cudaForce_, x, coeff, 1);
4560  }
4561 
4562  // mom += delta * [U * force]TA
4564  updateMomentum(*cudaMom, dt * delta, cudaForce, "staggered");
4566 
4567  profileStaggeredForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4568  profileStaggeredForce.TPSTART(QUDA_PROFILE_D2H);
4569 
4571  // copy the momentum field back to the host
4573  }
4574 
4576  // make the momentum field resident
4577  momResident = cudaMom;
4578  } else {
4579  delete cudaMom;
4580  }
4581 
4582  profileStaggeredForce.TPSTOP(QUDA_PROFILE_D2H);
4583  profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE);
4584 
4585  for (int i=0; i<nvector; i++) delete X[i];
4586 
4587  profileStaggeredForce.TPSTOP(QUDA_PROFILE_FREE);
4588  profileStaggeredForce.TPSTOP(QUDA_PROFILE_TOTAL);
4589 }
4590 
// Compute the HISQ fermion force: accumulate quark-field outer products
// (staple, one-link and naik terms) from the `fermion` vectors, apply the
// level-2 staple and long-link force terms on the w-links, unitarize the
// intermediate force against the v-links, apply the fat7 staple term on the
// u-links, complete the force, and fold it into the momentum with step dt.
// NOTE(review): lossy doxygen extraction -- original lines 4602 (the final
// gParam parameter of the signature), 4615, 4640, 4643, 4648, 4654, 4657,
// 4659, 4666, 4672, 4683-4684, 4692, 4696-4697, 4756 and others are missing
// from this view; confirm upstream before editing.
4591 void computeHISQForceQuda(void* const milc_momentum,
4592  double dt,
4593  const double level2_coeff[6],
4594  const double fat7_coeff[6],
4595  const void* const w_link,
4596  const void* const v_link,
4597  const void* const u_link,
4598  void **fermion,
4599  int num_terms,
4600  int num_naik_terms,
4601  double **coeff,
4603 {
4604 #ifdef GPU_STAGGERED_OPROD
4605  using namespace quda;
4606  using namespace quda::fermion_force;
4607  profileHISQForce.TPSTART(QUDA_PROFILE_TOTAL);
4608  if (gParam->gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported input field order %d", gParam->gauge_order);
4609 
4610  checkGaugeParam(gParam);
4611 
4612  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
4613 
4614  // create the device outer-product field
4616  oParam.nFace = 0;
4617  oParam.create = QUDA_ZERO_FIELD_CREATE;
4618  oParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4619  cudaGaugeField *stapleOprod = new cudaGaugeField(oParam);
4620  cudaGaugeField *oneLinkOprod = new cudaGaugeField(oParam);
4621  cudaGaugeField *naikOprod = new cudaGaugeField(oParam);
4622 
4623  {
4624  // default settings for the unitarization
4625  const double unitarize_eps = 1e-14;
4626  const double hisq_force_filter = 5e-5;
4627  const double max_det_error = 1e-10;
4628  const bool allow_svd = true;
4629  const bool svd_only = false;
4630  const double svd_rel_err = 1e-8;
4631  const double svd_abs_err = 1e-8;
4632 
4633  setUnitarizeForceConstants(unitarize_eps, hisq_force_filter, max_det_error, allow_svd, svd_only, svd_rel_err, svd_abs_err);
4634  }
4635 
4636  double act_path_coeff[6] = {0,1,level2_coeff[2],level2_coeff[3],level2_coeff[4],level2_coeff[5]};
4637  // You have to look at the MILC routine to understand the following
4638  // Basically, I have already absorbed the one-link coefficient
4639 
4641  //param.nFace = 0;
4642  param.order = QUDA_MILC_GAUGE_ORDER;
4644  param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
4645  cpuGaugeField* cpuMom = (!gParam->use_resident_mom) ? new cpuGaugeField(param) : nullptr;
4646 
// host wrappers for the three input link fields (w/v/u)
4647  param.link_type = QUDA_GENERAL_LINKS;
4649  param.gauge = (void*)w_link;
4650  cpuGaugeField cpuWLink(param);
4651  param.gauge = (void*)v_link;
4652  cpuGaugeField cpuVLink(param);
4653  param.gauge = (void*)u_link;
4655 
4656  param.create = QUDA_ZERO_FIELD_CREATE;
4658  param.link_type = QUDA_ASQTAD_MOM_LINKS;
4660  GaugeFieldParam momParam(param);
4661 
4662  param.create = QUDA_ZERO_FIELD_CREATE;
4663  param.link_type = QUDA_GENERAL_LINKS;
4664  param.setPrecision(gParam->cpu_prec, true);
4665 
// extend the local volume by 2*R in each direction for halo exchange
4667  for (int dir=0; dir<4; ++dir) {
4668  param.x[dir] += 2*R[dir];
4669  param.r[dir] = R[dir];
4670  }
4671 
4673  param.create = QUDA_ZERO_FIELD_CREATE;
4674  param.setPrecision(gParam->cpu_prec);
4675  param.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
4676 
4677  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
4678 
4679  { // do outer-product computation
4680  ColorSpinorParam qParam;
4681  qParam.nColor = 3;
4682  qParam.nSpin = 1;
4685  qParam.nDim = 4;
4686  qParam.setPrecision(oParam.Precision());
4687  qParam.pad = 0;
4688  for (int dir=0; dir<4; ++dir) qParam.x[dir] = oParam.x[dir];
4689 
4690  // create the device quark field
4691  qParam.create = QUDA_NULL_FIELD_CREATE;
4693  cudaColorSpinorField cudaQuark(qParam);
4694 
4695  // create the host quark field
4698  qParam.v = fermion[0];
4699 
4700  { // regular terms
4701  GaugeField *oprod[2] = {stapleOprod, naikOprod};
4702 
4703  // loop over different quark fields
4704  for(int i=0; i<num_terms; ++i){
4705 
4706  // Wrap the MILC quark field
4707  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
4708  qParam.v = fermion[i];
4709  cpuColorSpinorField cpuQuark(qParam); // create host quark field
4710  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
4711 
4712  profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
4713  cudaQuark = cpuQuark;
4714  profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
4715 
4716  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
4717  computeStaggeredOprod(oprod, cudaQuark, coeff[i], 3);
4718  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4719  }
4720  }
4721 
4722  { // naik terms
// the one-link oprod starts as the staple oprod scaled by the one-link
// coefficient (already absorbed into level2_coeff[0], per note above)
4723  oneLinkOprod->copy(*stapleOprod);
4724  ax(level2_coeff[0], *oneLinkOprod);
4725  GaugeField *oprod[2] = {oneLinkOprod, naikOprod};
4726 
4727  // loop over different quark fields
4728  for(int i=0; i<num_naik_terms; ++i){
4729 
4730  // Wrap the MILC quark field
4731  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
4732  qParam.v = fermion[i + num_terms - num_naik_terms];
4733  cpuColorSpinorField cpuQuark(qParam); // create host quark field
4734  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
4735 
4736  profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
4737  cudaQuark = cpuQuark;
4738  profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
4739 
4740  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
4741  computeStaggeredOprod(oprod, cudaQuark, coeff[i + num_terms], 3);
4742  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4743  }
4744  }
4745  }
4746 
// move the outer products into extended fields, freeing the unextended ones
4747  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
4748  cudaGaugeField* cudaInForce = new cudaGaugeField(param);
4749  copyExtendedGauge(*cudaInForce, *stapleOprod, QUDA_CUDA_FIELD_LOCATION);
4750  delete stapleOprod;
4751 
4752  cudaGaugeField* cudaOutForce = new cudaGaugeField(param);
4753  copyExtendedGauge(*cudaOutForce, *oneLinkOprod, QUDA_CUDA_FIELD_LOCATION);
4754  delete oneLinkOprod;
4755 
4757  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
4758 
4759  cudaGauge->loadCPUField(cpuWLink, profileHISQForce);
4760 
4761  cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true);
4762  cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true);
4763  cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true);
4764 
4765  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
4766  hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaGauge, act_path_coeff);
4767  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4768 
4769  // Load naik outer product
4770  copyExtendedGauge(*cudaInForce, *naikOprod, QUDA_CUDA_FIELD_LOCATION);
4771  cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true);
4772  delete naikOprod;
4773 
4774  // Compute Naik three-link term
4775  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
4776  hisqLongLinkForce(*cudaOutForce, *cudaInForce, *cudaGauge, act_path_coeff[1]);
4777  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4778 
4779  cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true);
4780 
4781  // load v-link
4782  cudaGauge->loadCPUField(cpuVLink, profileHISQForce);
4783  cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true);
4784 
4785  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
4786  *num_failures_h = 0;
4787  unitarizeForce(*cudaInForce, *cudaOutForce, *cudaGauge, num_failures_d);
4788  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4789 
4790  if (*num_failures_h>0) errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h);
4791 
4792  qudaMemset((void **)(cudaOutForce->Gauge_p()), 0, cudaOutForce->Bytes());
4793 
4794  // read in u-link
4795  cudaGauge->loadCPUField(cpuULink, profileHISQForce);
4796  cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true);
4797 
4798  // Compute Fat7-staple term
4799  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
4800  hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaGauge, fat7_coeff);
4801  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4802 
4803  delete cudaInForce;
4804  cudaGaugeField* cudaMom = new cudaGaugeField(momParam);
4805 
4806  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
4807  hisqCompleteForce(*cudaOutForce, *cudaGauge);
4808  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4809 
4810  if (gParam->use_resident_mom) {
4811  if (!momResident) errorQuda("No resident momentum field to use");
4812  updateMomentum(*momResident, dt, *cudaOutForce, "hisq");
4813  } else {
4814  updateMomentum(*cudaMom, dt, *cudaOutForce, "hisq");
4815  }
4816 
// NOTE(review): the inner `if (gParam->return_result_mom)` on line 4819 is
// redundant -- the same condition already guards the outer block
4817  if (gParam->return_result_mom) {
4818  // Close the paths, make anti-hermitian, and store in compressed format
4819  if (gParam->return_result_mom) cudaMom->saveCPUField(*cpuMom, profileHISQForce);
4820  }
4821 
4822  profileHISQForce.TPSTART(QUDA_PROFILE_FREE);
4823 
4824  if (cpuMom) delete cpuMom;
4825 
// deleting a null momResident is safe (delete on nullptr is a no-op)
4826  if (!gParam->make_resident_mom) {
4827  delete momResident;
4828  momResident = nullptr;
4829  }
4830  if (cudaMom) delete cudaMom;
4831  delete cudaOutForce;
4832  delete cudaGauge;
4833  profileHISQForce.TPSTOP(QUDA_PROFILE_FREE);
4834 
4835  profileHISQForce.TPSTOP(QUDA_PROFILE_TOTAL);
4836 
4837 #else
4838  errorQuda("HISQ force has not been built");
4839 #endif
4840 }
4841 
// Compute the clover (Wilson-clover fermion) contribution to the HMC momentum
// force and accumulate it into the host momentum field h_mom.
//   h_x / h_p: host quark solution/derivative fields (nvector of each)
//   coeff:     per-vector force coefficients; kappa2, ck: action couplings
//   multiplicity: rational-approximation multiplicity for the trace term
// NOTE(review): the doxygen extraction dropped several interior lines of this
// function (end of the parameter list, the fParam/gaugeEx declarations and some
// if-openers), so the block below is not a complete transcription — restore the
// missing lines from the upstream source before compiling.
void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **h_p,
                            double *coeff, double kappa2, double ck,
                            int nvector, double multiplicity, void *gauge,

  using namespace quda;
  profileCloverForce.TPSTART(QUDA_PROFILE_TOTAL);
  profileCloverForce.TPSTART(QUDA_PROFILE_INIT);

  checkGaugeParam(gauge_param);
  if (!gaugePrecise) errorQuda("No resident gauge field");

  // create the host momentum field
  // NOTE(review): fParam declaration lost in extraction (presumably a
  // GaugeFieldParam built from h_mom and *gauge_param — confirm upstream)
  fParam.order = gauge_param->gauge_order;
  cpuGaugeField cpuMom(fParam);

  // create the device momentum field
  fParam.create = QUDA_ZERO_FIELD_CREATE;
  fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
  cudaGaugeField cudaMom(fParam);

  // create the device force field
  fParam.link_type = QUDA_GENERAL_LINKS;
  fParam.create = QUDA_ZERO_FIELD_CREATE;
  fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
  cudaGaugeField cudaForce(fParam);

  // quark-field parameters mirror the geometry of the force field
  ColorSpinorParam qParam;
  qParam.nColor = 3;
  qParam.nSpin = 4;
  qParam.nDim = 4;
  qParam.setPrecision(fParam.Precision());
  qParam.pad = 0;
  for(int dir=0; dir<4; ++dir) qParam.x[dir] = fParam.x[dir];

  // create the device quark field
  qParam.create = QUDA_NULL_FIELD_CREATE;

  std::vector<ColorSpinorField*> quarkX, quarkP;
  for (int i=0; i<nvector; i++) {
    quarkX.push_back(ColorSpinorField::Create(qParam));
    quarkP.push_back(ColorSpinorField::Create(qParam));
  }

  // single-parity temporary for the preconditioned operator
  qParam.x[0] /= 2;
  cudaColorSpinorField tmp(qParam);

  // create the host quark field
  qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // need expose this to interface

  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
  DiracParam diracParam;
  setDiracParam(diracParam, inv_param, pc_solve);
  diracParam.tmp1 = &tmp; // use as temporary for dirac->M
  Dirac *dirac = Dirac::create(diracParam);

  // NOTE(review): the enclosing `if (inv_param->use_resident_solution)`-style
  // opener appears to have been dropped by extraction (stray `}` below)
  if (solutionResident.size() < (unsigned int)nvector)
    errorQuda("solutionResident.size() %lu does not match number of shifts %d",
	      solutionResident.size(), nvector);
  }


  // create oprod and trace fields
  fParam.geometry = QUDA_TENSOR_GEOMETRY;
  cudaGaugeField oprod(fParam);

  profileCloverForce.TPSTOP(QUDA_PROFILE_INIT);
  profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);

  std::vector<double> force_coeff(nvector);
  // loop over different quark fields
  for(int i=0; i<nvector; i++){
    ColorSpinorField &x = *(quarkX[i]);
    ColorSpinorField &p = *(quarkP[i]);

    // for downloading x_e
    qParam.x[0] /= 2;

    // Wrap the even-parity MILC quark field
    profileCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE);
    profileCloverForce.TPSTART(QUDA_PROFILE_INIT);
    qParam.v = h_x[i];
    cpuColorSpinorField cpuQuarkX(qParam); // create host quark field
    profileCloverForce.TPSTOP(QUDA_PROFILE_INIT);

    profileCloverForce.TPSTART(QUDA_PROFILE_H2D);
    x.Even() = cpuQuarkX;
    profileCloverForce.TPSTOP(QUDA_PROFILE_H2D);

    profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);
    gamma5(x.Even(), x.Even());
  } else {
    // reuse the resident solver solution instead of downloading from the host
    x.Even() = *(solutionResident[i]);
  }

  // reconstruct full-lattice x and p from the even-parity solution
  dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY);
  dirac->M(p.Even(), x.Even());
  dirac->Dslash(p.Odd(), p.Even(), QUDA_ODD_PARITY);

  gamma5(x, x);
  gamma5(p, p);

  force_coeff[i] = 2.0*dt*coeff[i]*kappa2;
  }

  computeCloverForce(cudaForce, *gaugePrecise, quarkX, quarkP, force_coeff);

  // In double precision the clover derivative is faster with no reconstruct
  cudaGaugeField *u = &gaugeEx;
  if (gaugeEx.Reconstruct() == QUDA_RECONSTRUCT_12 && gaugeEx.Precision() == QUDA_DOUBLE_PRECISION) {
    GaugeFieldParam param(gaugeEx);
    u = new cudaGaugeField(param);
    u -> copy(gaugeEx);
  }

  computeCloverSigmaTrace(oprod, *cloverPrecise, 2.0*ck*multiplicity*dt);

  /* Now the U dA/dU terms */
  std::vector< std::vector<double> > ferm_epsilon(nvector);
  for (int shift = 0; shift < nvector; shift++) {
    // NOTE(review): BUG — reserve() does not change size(), so the operator[]
    // writes below are out-of-bounds (undefined behavior); this should be
    // resize(2) (or two push_backs)
    ferm_epsilon[shift].reserve(2);
    ferm_epsilon[shift][0] = 2.0*ck*coeff[shift]*dt;
    ferm_epsilon[shift][1] = -kappa2 * 2.0*ck*coeff[shift]*dt;
  }

  computeCloverSigmaOprod(oprod, quarkX, quarkP, ferm_epsilon);

  // exchange halos so the derivative can access neighboring oprod sites
  cudaGaugeField *oprodEx = createExtendedGauge(oprod, R, profileCloverForce);

  profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);

  cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_ODD_PARITY);
  cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_EVEN_PARITY);

  if (u != &gaugeEx) delete u;

  updateMomentum(cudaMom, -1.0, cudaForce, "clover");
  profileCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE);

  // copy the outer product field back to the host
  profileCloverForce.TPSTART(QUDA_PROFILE_D2H);
  profileCloverForce.TPSTOP(QUDA_PROFILE_D2H);

  profileCloverForce.TPSTART(QUDA_PROFILE_FREE);

  for (int i=0; i<nvector; i++) {
    delete quarkX[i];
    delete quarkP[i];
  }

#if 0
  for (auto v : solutionResident) if (v) delete v;
  solutionResident.clear();
  }
#endif
  delete dirac;
  profileCloverForce.TPSTOP(QUDA_PROFILE_FREE);

  profileCloverForce.TPSTOP(QUDA_PROFILE_TOTAL);
}
5023 
5024 
5025 
// Evolve the gauge field by one molecular-dynamics step: U <- exp(dt * mom) U.
//   conj_mom: treat the momentum as conjugated; exact: use exact exponential
// Honors use_resident_* / make_resident_* / return_result_gauge flags in param
// for field residency on the device.
// NOTE(review): extraction dropped interior lines (the trailing
// `QudaGaugeParam* param` in the signature, the gParam construction, the
// device cudaMom allocation/load) — restore from upstream before compiling.
void updateGaugeFieldQuda(void* gauge,
                          void* momentum,
                          double dt,
                          int conj_mom,
                          int exact,
{
  profileGaugeUpdate.TPSTART(QUDA_PROFILE_TOTAL);

  checkGaugeParam(param);

  profileGaugeUpdate.TPSTART(QUDA_PROFILE_INIT);

  // create the host fields
  // a host-side gauge wrapper is only needed if we must read or write h_gauge
  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;

  GaugeFieldParam gParamMom(momentum, *param);
  // TIFR momentum orderings store full (unreconstructed) matrices
  gParamMom.reconstruct = (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ?
  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
  gParamMom.site_offset = param->mom_offset;
  gParamMom.site_size = param->site_size;
  cpuGaugeField *cpuMom = !param->use_resident_mom ? new cpuGaugeField(gParamMom) : nullptr;

  // create the device fields
  gParam.pad = 0;

  cudaGaugeField *cudaInGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr;
  auto *cudaOutGauge = new cudaGaugeField(gParam);

  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_INIT);

  profileGaugeUpdate.TPSTART(QUDA_PROFILE_H2D);

  if (!param->use_resident_gauge) { // load fields onto the device
    cudaInGauge->loadCPUField(*cpuGauge);
  } else { // or use resident fields already present
    if (!gaugePrecise) errorQuda("No resident gauge field allocated");
    cudaInGauge = gaugePrecise;
    gaugePrecise = nullptr; // ownership transferred to cudaInGauge
  }

  if (!param->use_resident_mom) {
  } else {
    if (!momResident) errorQuda("No resident mom field allocated");
    cudaMom = momResident;
    momResident = nullptr; // ownership transferred to cudaMom
  }

  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_H2D);

  // perform the update
  profileGaugeUpdate.TPSTART(QUDA_PROFILE_COMPUTE);
  updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom,
		   (bool)conj_mom, (bool)exact);
  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_COMPUTE);

  if (param->return_result_gauge) {
    // copy the gauge field back to the host
    profileGaugeUpdate.TPSTART(QUDA_PROFILE_D2H);
    cudaOutGauge->saveCPUField(*cpuGauge);
    profileGaugeUpdate.TPSTOP(QUDA_PROFILE_D2H);
  }

  profileGaugeUpdate.TPSTART(QUDA_PROFILE_FREE);
  if (param->make_resident_gauge) {
    if (gaugePrecise != nullptr) delete gaugePrecise;
    gaugePrecise = cudaOutGauge; // keep the updated gauge on the device
  } else {
    delete cudaOutGauge;
  }

  if (param->make_resident_mom) {
    if (momResident != nullptr && momResident != cudaMom) delete momResident;
    momResident = cudaMom;
  } else {
    delete cudaMom;
  }

  delete cudaInGauge;
  if (cpuMom) delete cpuMom;
  if (cpuGauge) delete cpuGauge;

  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_FREE);
  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_TOTAL);
}
5125 
 // Project each link of the gauge field back onto SU(3) to the given tolerance,
 // aborting via errorQuda if any link fails to unitarize. Respects
 // use_resident_gauge / make_resident_gauge / return_result_gauge in param.
 // NOTE(review): extraction dropped lines here (gParam construction, the device
 // cudaGauge allocation, the H2D load and D2H save) — restore from upstream.
 void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) {
   profileProject.TPSTART(QUDA_PROFILE_TOTAL);

   profileProject.TPSTART(QUDA_PROFILE_INIT);
   checkGaugeParam(param);

   // create the gauge field
   // host wrapper only needed when reading from or writing back to gauge_h
   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
   cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;

   // create the device fields
   profileProject.TPSTOP(QUDA_PROFILE_INIT);

   if (param->use_resident_gauge) {
     if (!gaugePrecise) errorQuda("No resident gauge field to use");
     gaugePrecise = nullptr; // ownership taken from the resident slot
   } else {
     profileProject.TPSTART(QUDA_PROFILE_H2D);
     profileProject.TPSTOP(QUDA_PROFILE_H2D);
   }

   profileProject.TPSTART(QUDA_PROFILE_COMPUTE);
   *num_failures_h = 0; // failure counter is host-mapped, device writes num_failures_d

   // project onto SU(3)
   projectSU3(*cudaGauge, tol, num_failures_d);

   profileProject.TPSTOP(QUDA_PROFILE_COMPUTE);

   if(*num_failures_h>0)
     errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);

   profileProject.TPSTART(QUDA_PROFILE_D2H);
   profileProject.TPSTOP(QUDA_PROFILE_D2H);

   if (param->make_resident_gauge) {
     if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise;
   } else {
     delete cudaGauge;
   }

   profileProject.TPSTART(QUDA_PROFILE_FREE);
   if (cpuGauge) delete cpuGauge;
   profileProject.TPSTOP(QUDA_PROFILE_FREE);

   profileProject.TPSTOP(QUDA_PROFILE_TOTAL);
 }
5186 
 // Apply or remove the staggered phases on the gauge field (direction chosen by
 // param fields). Follows the same resident-field conventions as projectSU3Quda.
 // NOTE(review): extraction dropped lines (gParam construction, device
 // cudaGauge allocation, the actual applyStaggeredPhase/removeStaggeredPhase
 // call and the H2D/D2H transfers) — restore from upstream.
 void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) {
   profilePhase.TPSTART(QUDA_PROFILE_TOTAL);

   profilePhase.TPSTART(QUDA_PROFILE_INIT);
   checkGaugeParam(param);

   // create the gauge field
   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
   cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;

   // create the device fields
   profilePhase.TPSTOP(QUDA_PROFILE_INIT);

   if (param->use_resident_gauge) {
     if (!gaugePrecise) errorQuda("No resident gauge field to use");
   } else {
     profilePhase.TPSTART(QUDA_PROFILE_H2D);
     profilePhase.TPSTOP(QUDA_PROFILE_H2D);
   }

   profilePhase.TPSTART(QUDA_PROFILE_COMPUTE);
   *num_failures_h = 0;

   // apply / remove phase as appropriate

   profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE);

   profilePhase.TPSTART(QUDA_PROFILE_D2H);
   profilePhase.TPSTOP(QUDA_PROFILE_D2H);

   if (param->make_resident_gauge) {
     if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise;
   } else {
     delete cudaGauge;
   }

   profilePhase.TPSTART(QUDA_PROFILE_FREE);
   if (cpuGauge) delete cpuGauge;
   profilePhase.TPSTOP(QUDA_PROFILE_FREE);

   profilePhase.TPSTOP(QUDA_PROFILE_TOTAL);
 }
5240 
5241 // evaluate the momentum action
// Compute and return the momentum contribution to the HMC action,
// sum_{x,mu} Tr[mom^2], either from the host momentum buffer or the
// device-resident momentum field (param->use_resident_mom).
// NOTE(review): extraction dropped the cpuMom/cudaMom declarations and the
// host->device momentum load — restore from upstream before compiling.
double momActionQuda(void* momentum, QudaGaugeParam* param)
{
  profileMomAction.TPSTART(QUDA_PROFILE_TOTAL);

  profileMomAction.TPSTART(QUDA_PROFILE_INIT);
  checkGaugeParam(param);

  // create the momentum fields


  // create the device fields


  profileMomAction.TPSTOP(QUDA_PROFILE_INIT);

  profileMomAction.TPSTART(QUDA_PROFILE_H2D);
  if (!param->use_resident_mom) {
  } else {
    if (!momResident) errorQuda("No resident mom field allocated");
    cudaMom = momResident;
  }
  profileMomAction.TPSTOP(QUDA_PROFILE_H2D);

  // perform the update
  profileMomAction.TPSTART(QUDA_PROFILE_COMPUTE);
  double action = computeMomAction(*cudaMom);
  profileMomAction.TPSTOP(QUDA_PROFILE_COMPUTE);

  profileMomAction.TPSTART(QUDA_PROFILE_FREE);
  if (param->make_resident_mom) {
    // keep the (possibly newly uploaded) momentum on the device
    if (momResident != nullptr && momResident != cudaMom) delete momResident;
    momResident = cudaMom;
  } else {
    delete cudaMom;
    momResident = nullptr;
  }
  if (cpuMom) {
    delete cpuMom;
  }

  profileMomAction.TPSTOP(QUDA_PROFILE_FREE);
  profileMomAction.TPSTOP(QUDA_PROFILE_TOTAL);

  return action;
}
5298 
5299 /*
5300  The following functions are for the Fortran interface.
5301 */
5302 
// Fortran interface wrappers: trailing-underscore entry points that forward to
// the corresponding C API functions, dereferencing pointer-passed scalars
// (Fortran passes all arguments by reference).
void init_quda_(int *dev) { initQuda(*dev); }
void init_quda_device_(int *dev) { initQudaDevice(*dev); }
void end_quda_() { endQuda(); }
void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param) { loadGaugeQuda(h_gauge, param); }
void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
{ loadCloverQuda(h_clover, h_clovinv, inv_param); }
void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
		  QudaParity *parity) { dslashQuda(h_out, h_in, inv_param, *parity); }
void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
		  QudaParity *parity, int *inverse) { cloverQuda(h_out, h_in, inv_param, *parity, *inverse); }
void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
{ MatQuda(h_out, h_in, inv_param); }
void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
{ MatDagMatQuda(h_out, h_in, inv_param); }
5321 void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param) {
5322  fflush(stdout);
5323  // ensure that fifth dimension is set to 1
5324  if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) param->Ls = 1;
5325  invertQuda(hp_x, hp_b, param);
5326  fflush(stdout);
5327 }
5328 
5329 void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param)
5330 {
5331  // ensure that fifth dimension is set to 1
5332  if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) param->Ls = 1;
5333 
5334  if (!gaugePrecise) errorQuda("Resident gauge field not allocated");
5335 
5336  // get data into array of pointers
5337  int nSpin = (param->dslash_type == QUDA_STAGGERED_DSLASH || param->dslash_type == QUDA_ASQTAD_DSLASH) ? 1 : 4;
5338 
5339  // compute offset assuming TIFR padded ordering (FIXME)
5340  if (param->dirac_order != QUDA_TIFR_PADDED_DIRAC_ORDER)
5341  errorQuda("Fortran multi-shift solver presently only supports QUDA_TIFR_PADDED_DIRAC_ORDER and not %d", param->dirac_order);
5342 
5343  const int *X = gaugePrecise->X();
5344  size_t cb_offset = (X[0]/2) * X[1] * (X[2] + 4) * X[3] * gaugePrecise->Ncolor() * nSpin * 2 * param->cpu_prec;
5345  void *hp_x[QUDA_MAX_MULTI_SHIFT];
5346  for (int i=0; i<param->num_offset; i++) hp_x[i] = static_cast<char*>(h_x) + i*cb_offset;
5347 
5348  invertMultiShiftQuda(hp_x, hp_b, param);
5349 }
5350 
5351 void flush_chrono_quda_(int *index) { flushChronoQuda(*index); }
5352 
// Fortran interface: page-lock (pin) *bytes bytes of host memory at ptr so
// that subsequent host<->device transfers of this buffer can be asynchronous.
void register_pinned_quda_(void *ptr, size_t *bytes) {
  cudaHostRegister(ptr, *bytes, cudaHostRegisterDefault);
  checkCudaError();
}
5357 
// Fortran interface: release the page-lock previously established by
// register_pinned_quda_ on the host buffer at ptr.
void unregister_pinned_quda_(void *ptr) {
  cudaHostUnregister(ptr);
  checkCudaError();
}
5362 
5364  *param = newQudaGaugeParam();
5365 }
5368 }
5369 
5370 void update_gauge_field_quda_(void *gauge, void *momentum, double *dt,
5371  bool *conj_mom, bool *exact,
5372  QudaGaugeParam *param) {
5373  updateGaugeFieldQuda(gauge, momentum, *dt, (int)*conj_mom, (int)*exact, param);
5374 }
5375 
5376 static inline int opp(int dir) { return 7-dir; }
5377 
// Fill `paths` with the link-paths of the gauge-force loops attached to
// direction `dir`, encoded as sequences of direction indices (0-3 forward,
// 4-7 backward via opp()). num_loop_types selects which loop families are
// emitted: >=1 plaquettes (6 paths of length 3), >=2 rectangles (18 paths of
// length 5), >=3 "staples"/chairs (24 paths of length 5). The fill order must
// match the coefficient layout built in compute_gauge_force_quda_.
static void createGaugeForcePaths(int **paths, int dir, int num_loop_types){

  int index=0;
  // Plaquette paths
  if (num_loop_types >= 1)
    for(int i=0; i<4; ++i){
      if(i==dir) continue;
      paths[index][0] = i;        paths[index][1] = opp(dir);   paths[index++][2] = opp(i);
      paths[index][0] = opp(i);   paths[index][1] = opp(dir);   paths[index++][2] = i;
    }

  // Rectangle Paths
  if (num_loop_types >= 2)
    for(int i=0; i<4; ++i){
      if(i==dir) continue;
      paths[index][0] = paths[index][1] = i;       paths[index][2] = opp(dir);   paths[index][3] = paths[index][4] = opp(i);
      index++;
      paths[index][0] = paths[index][1] = opp(i);  paths[index][2] = opp(dir);   paths[index][3] = paths[index][4] = i;
      index++;
      paths[index][0] = dir; paths[index][1] = i;  paths[index][2] = paths[index][3] = opp(dir); paths[index][4] = opp(i);
      index++;
      paths[index][0] = dir; paths[index][1] = opp(i);  paths[index][2] = paths[index][3] = opp(dir); paths[index][4] = i;
      index++;
      paths[index][0] = i;  paths[index][1] = paths[index][2] = opp(dir); paths[index][3] = opp(i); paths[index][4] = dir;
      index++;
      paths[index][0] = opp(i);  paths[index][1] = paths[index][2] = opp(dir); paths[index][3] = i; paths[index][4] = dir;
      index++;
    }

  if (num_loop_types >= 3) {
    // Staple paths
    // chair-like loops built from two distinct transverse directions i, j
    for(int i=0; i<4; ++i){
      for(int j=0; j<4; ++j){
	if(i==dir || j==dir || i==j) continue;
	paths[index][0] = i; paths[index][1] = j; paths[index][2] = opp(dir); paths[index][3] = opp(i), paths[index][4] = opp(j);
	index++;
	paths[index][0] = i; paths[index][1] = opp(j); paths[index][2] = opp(dir); paths[index][3] = opp(i), paths[index][4] = j;
	index++;
	paths[index][0] = opp(i); paths[index][1] = j; paths[index][2] = opp(dir); paths[index][3] = i, paths[index][4] = opp(j);
	index++;
	paths[index][0] = opp(i); paths[index][1] = opp(j); paths[index][2] = opp(dir); paths[index][3] = i, paths[index][4] = j;
	index++;
      }
    }
  }

}
5425 
// Fortran interface to the gauge-force computation. Builds the per-direction
// path tables (via createGaugeForcePaths) and the matching coefficient/length
// arrays, then forwards to computeGaugeForceQuda().
//   num_loop_types: 1 = plaquette only, 2 = +rectangles, 3 = +staples
//   coeff: one coefficient per loop family (so up to 3 entries are read)
void compute_gauge_force_quda_(void *mom, void *gauge, int *num_loop_types, double *coeff, double *dt,
			       QudaGaugeParam *param) {

  // total number of paths per direction for the requested loop families
  int numPaths = 0;
  switch (*num_loop_types) {
  case 1:
    numPaths = 6;
    break;
  case 2:
    numPaths = 24;
    break;
  case 3:
    numPaths = 48;
    break;
  default:
    errorQuda("Invalid num_loop_types = %d\n", *num_loop_types);
  }

  auto *loop_coeff = static_cast<double*>(safe_malloc(numPaths*sizeof(double)));
  int *path_length = static_cast<int*>(safe_malloc(numPaths*sizeof(int)));

  // coefficient/length layout must mirror the fill order in createGaugeForcePaths:
  // [0,6) plaquettes (length 3), [6,24) rectangles, [24,48) staples (length 5)
  if (*num_loop_types >= 1) for(int i= 0; i< 6; ++i) {
    loop_coeff[i] = coeff[0];
    path_length[i] = 3;
  }
  if (*num_loop_types >= 2) for(int i= 6; i<24; ++i) {
    loop_coeff[i] = coeff[1];
    path_length[i] = 5;
  }
  if (*num_loop_types >= 3) for(int i=24; i<48; ++i) {
    loop_coeff[i] = coeff[2];
    path_length[i] = 5;
  }

  // one table of paths per direction
  int** input_path_buf[4];
  for(int dir=0; dir<4; ++dir){
    input_path_buf[dir] = static_cast<int**>(safe_malloc(numPaths*sizeof(int*)));
    for(int i=0; i<numPaths; ++i){
      input_path_buf[dir][i] = static_cast<int*>(safe_malloc(path_length[i]*sizeof(int)));
    }
    createGaugeForcePaths(input_path_buf[dir], dir, *num_loop_types);
  }

  int max_length = 6;

  computeGaugeForceQuda(mom, gauge, input_path_buf, path_length, loop_coeff, numPaths, max_length, *dt, param);

  // release the path tables
  for(auto & dir : input_path_buf){
    for(int i=0; i<numPaths; ++i) host_free(dir[i]);
    host_free(dir);
  }

  host_free(path_length);
  host_free(loop_coeff);
}
5481 
// Fortran interface to computeStaggeredForceQuda(). The quark field x is
// forwarded with a (void**) cast — presumably the Fortran caller passes an
// array of field pointers; TODO(review) confirm against the Fortran binding.
void compute_staggered_force_quda_(void* h_mom, double *dt, double *delta, void *gauge, void *x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param) {
  computeStaggeredForceQuda(h_mom, *dt, *delta, gauge, (void**)x, gauge_param, inv_param);
}
5485 
5486 // apply the staggered phases
5488  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("applying staggered phase\n");
5489  if (gaugePrecise) {
5491  } else {
5492  errorQuda("No persistent gauge field");
5493  }
5494 }
5495 
5496 // remove the staggered phases
5498  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("removing staggered phase\n");
5499  if (gaugePrecise) {
5501  } else {
5502  errorQuda("No persistent gauge field");
5503  }
5505 }
5506 
5507 // evaluate the kinetic term
// Fortran interface: store the momentum (kinetic) action in *kin.
void kinetic_quda_(double *kin, void* momentum, QudaGaugeParam* param) {
  *kin = momActionQuda(momentum, param);
}
5511 
5512 
5516 #ifdef MULTI_GPU
// Map a 4D grid coordinate to an MPI rank using BQCD's lexicographical
// convention: dimension 3 varies slowest, dimension 0 fastest.
// fdata points at the int[4] grid dimensions.
static int bqcd_rank_from_coords(const int *coords, void *fdata)
{
  const int *dims = static_cast<int *>(fdata);

  // Horner-style flattening, folding in one dimension per iteration
  int rank = coords[3];
  for (int i = 3; i > 0; i--) rank = rank * dims[i - 1] + coords[i - 1];
  return rank;
}
5527 #endif
5528 
// Fortran interface: establish the 4D communications grid. The grid array is
// passed both as the dimensions and as the user data for the BQCD rank mapper;
// a no-op in single-GPU builds.
void comm_set_gridsize_(int *grid)
{
#ifdef MULTI_GPU
  initCommsGridQuda(4, grid, bqcd_rank_from_coords, static_cast<void *>(grid));
#endif
}
5535 
5539 void set_kernel_pack_t_(int* pack)
5540 {
5541  bool pack_ = *pack ? true : false;
5542  setKernelPackT(pack_);
5543 }
5544 
// Overwrite the resident gauge field with Gaussian-distributed links
// (gaugeGauss with the given RNG seed and width sigma), then refresh the halo
// of the extended resident field if one exists.
// NOTE(review): extraction dropped at least two lines here (a param setting
// after the GaugeFieldParam copy, and the copy of gaugePrecise into
// extendedGaugeResident before the ghost exchange); as shown, the temporary
// field `u` is created but never used — confirm against upstream.
void gaussGaugeQuda(unsigned long long seed, double sigma)
{
  profileGauss.TPSTART(QUDA_PROFILE_TOTAL);

  if (!gaugePrecise) errorQuda("Cannot generate Gauss GaugeField as there is no resident gauge field");

  cudaGaugeField *data = gaugePrecise;

  GaugeFieldParam param(*data);
  param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
  cudaGaugeField u(param);

  profileGauss.TPSTART(QUDA_PROFILE_COMPUTE);
  quda::gaugeGauss(*data, seed, sigma);
  profileGauss.TPSTOP(QUDA_PROFILE_COMPUTE);

  if (extendedGaugeResident) {
    // keep the extended field's halo consistent with the new links
    extendedGaugeResident->exchangeExtendedGhost(R, profileGauss, redundant_comms);
  }

  profileGauss.TPSTOP(QUDA_PROFILE_TOTAL);
}
5569 
5570 
5571 /*
5572  * Computes the total, spatial and temporal plaquette averages of the loaded gauge configuration.
5573  */
// Fortran interface: forward to plaqQuda() (total/spatial/temporal plaquette).
void plaq_quda_(double plaq[3]) {
  plaqQuda(plaq);
}
5577 
// Compute the plaquette averages of the resident gauge field:
// plaq[0] = total, plaq[1] = spatial, plaq[2] = temporal.
// NOTE(review): extraction dropped the declaration of `data` (presumably the
// extended gauge field created from gaugePrecise) — restore from upstream.
void plaqQuda(double plaq[3])
{
  profilePlaq.TPSTART(QUDA_PROFILE_TOTAL);

  if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field");

  // cache the extended field for reuse by later calls
  extendedGaugeResident = data;

  profilePlaq.TPSTART(QUDA_PROFILE_COMPUTE);
  double3 plaq3 = quda::plaquette(*data);
  plaq[0] = plaq3.x;
  plaq[1] = plaq3.y;
  plaq[2] = plaq3.z;
  profilePlaq.TPSTOP(QUDA_PROFILE_COMPUTE);

  profilePlaq.TPSTOP(QUDA_PROFILE_TOTAL);
}
5596 
5597 /*
5598  * Performs a deep copy from the internal extendedGaugeResident field.
5599  */
// Deep-copy the internal extended resident gauge field into the caller-owned
// field. resident_gauge must actually point at a cudaGaugeField — the void*
// is reinterpreted directly; loc selects the location for the copy kernel.
// NOTE(review): extraction dropped the declaration of `data` (the extended
// field derived from gaugePrecise) — restore from upstream.
void copyExtendedResidentGaugeQuda(void* resident_gauge, QudaFieldLocation loc)
{
  //profilePlaq.TPSTART(QUDA_PROFILE_TOTAL);

  if (!gaugePrecise) errorQuda("Cannot perform deep copy of resident gauge field as there is no resident gauge field");

  extendedGaugeResident = data;

  auto* io_gauge = (cudaGaugeField*)resident_gauge;

  copyExtendedGauge(*io_gauge, *extendedGaugeResident, loc);

  //profilePlaq.TPSTOP(QUDA_PROFILE_TOTAL);
}
5615 
// Apply n_steps of Wuppertal (Gaussian) smearing with parameter alpha to the
// host spinor h_in and write the result to h_out. Uses gaugeSmeared when
// present, otherwise gaugePrecise.
// NOTE(review): extraction dropped several lines (the pushVerbosity call
// matching the popVerbosity below, and the gParam construction / copy used to
// build `precise` from gaugeSmeared) — restore from upstream.
void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int n_steps, double alpha)
{
  profileWuppertal.TPSTART(QUDA_PROFILE_TOTAL);

  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");


  cudaGaugeField *precise = nullptr;

  if (gaugeSmeared != nullptr) {
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Wuppertal smearing done with gaugeSmeared\n");
    precise = new cudaGaugeField(gParam);
    precise->exchangeGhost();
  } else {
    if (getVerbosity() >= QUDA_VERBOSE)
      printfQuda("Wuppertal smearing done with gaugePrecise\n");
    precise = gaugePrecise;
  }

  // wrap the host input and mirror it on the device
  ColorSpinorParam cpuParam(h_in, *inv_param, precise->X(), false, inv_param->input_location);
  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);

  ColorSpinorParam cudaParam(cpuParam, *inv_param);
  cudaColorSpinorField in(*in_h, cudaParam);

  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
    double cpu = blas::norm2(*in_h);
    double gpu = blas::norm2(in);
    printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
  }

  cudaParam.create = QUDA_NULL_FIELD_CREATE;
  cudaColorSpinorField out(in, cudaParam);
  int parity = 0;

  // Computes out(x) = 1/(1+6*alpha)*(in(x) + alpha*\sum_mu (U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)))
  double a = alpha / (1. + 6. * alpha);
  double b = 1. / (1. + 6. * alpha);

  for (unsigned int i = 0; i < n_steps; i++) {
    if (i) in = out; // previous output seeds the next iteration
    ApplyLaplace(out, in, *precise, 3, a, b, in, parity, false, nullptr, profileWuppertal);
    if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
      double norm = blas::norm2(out);
      printfQuda("Step %d, vector norm %e\n", i, norm);
    }
  }

  // copy the smeared field back to the host
  cpuParam.v = h_out;
  cpuParam.location = inv_param->output_location;
  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
  *out_h = out;

  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
    double cpu = blas::norm2(*out_h);
    double gpu = blas::norm2(out);
    printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
  }

  if (gaugeSmeared != nullptr)
    delete precise; // only owned when built from gaugeSmeared

  delete out_h;
  delete in_h;

  popVerbosity();

  profileWuppertal.TPSTOP(QUDA_PROFILE_TOTAL);
}
5690 
// Apply n_steps of APE smearing (parameter alpha) to the resident gauge field,
// storing the result in gaugeSmeared; optionally reports the topological
// charge every meas_interval steps.
// NOTE(review): extraction dropped the gParam and QudaGaugeObservableParam
// declarations and the gaugeObservablesQuda-style measurement calls —
// restore from upstream.
void performAPEnStep(unsigned int n_steps, double alpha, int meas_interval)
{
  profileAPE.TPSTART(QUDA_PROFILE_TOTAL);

  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");

  // (re)create the smeared copy from the resident field
  if (gaugeSmeared != nullptr) delete gaugeSmeared;
  gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileAPE);

  auto *cudaGaugeTemp = new cudaGaugeField(gParam);

  param.compute_qcharge = QUDA_BOOLEAN_TRUE;

  if (getVerbosity() >= QUDA_SUMMARIZE) {
    printfQuda("Q charge at step %03d = %+.16e\n", 0, param.qcharge);
  }

  for (unsigned int i = 0; i < n_steps; i++) {
    profileAPE.TPSTART(QUDA_PROFILE_COMPUTE);
    APEStep(*gaugeSmeared, *cudaGaugeTemp, alpha);
    profileAPE.TPSTOP(QUDA_PROFILE_COMPUTE);
    if ((i + 1) % meas_interval == 0 && getVerbosity() >= QUDA_VERBOSE) {
      printfQuda("Q charge at step %03d = %+.16e\n", i + 1, param.qcharge);
    }
  }

  delete cudaGaugeTemp;
  profileAPE.TPSTOP(QUDA_PROFILE_TOTAL);
}
5724 
// Apply n_steps of stout smearing (parameter rho) to the resident gauge field,
// storing the result in gaugeSmeared; optionally reports the topological
// charge every meas_interval steps.
// NOTE(review): extraction dropped the gParam / observable-param declarations
// and the measurement calls — restore from upstream.
void performSTOUTnStep(unsigned int n_steps, double rho, int meas_interval)
{
  profileSTOUT.TPSTART(QUDA_PROFILE_TOTAL);

  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");

  // (re)create the smeared copy from the resident field
  if (gaugeSmeared != nullptr) delete gaugeSmeared;
  gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileSTOUT);

  auto *cudaGaugeTemp = new cudaGaugeField(gParam);

  param.compute_qcharge = QUDA_BOOLEAN_TRUE;

  if (getVerbosity() >= QUDA_SUMMARIZE) {
    printfQuda("Q charge at step %03d = %+.16e\n", 0, param.qcharge);
  }

  for (unsigned int i = 0; i < n_steps; i++) {
    profileSTOUT.TPSTART(QUDA_PROFILE_COMPUTE);
    STOUTStep(*gaugeSmeared, *cudaGaugeTemp, rho);
    profileSTOUT.TPSTOP(QUDA_PROFILE_COMPUTE);
    if ((i + 1) % meas_interval == 0 && getVerbosity() >= QUDA_VERBOSE) {
      printfQuda("Q charge at step %03d = %+.16e\n", i + 1, param.qcharge);
    }
  }

  delete cudaGaugeTemp;
  profileSTOUT.TPSTOP(QUDA_PROFILE_TOTAL);
}
5758 
// Apply n_steps of over-improved stout smearing (parameters rho, epsilon) to
// the resident gauge field, storing the result in gaugeSmeared; optionally
// reports the topological charge every meas_interval steps.
// NOTE(review): extraction dropped the gParam / observable-param declarations
// and the measurement calls — restore from upstream.
void performOvrImpSTOUTnStep(unsigned int n_steps, double rho, double epsilon, int meas_interval)
{
  profileOvrImpSTOUT.TPSTART(QUDA_PROFILE_TOTAL);

  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");

  // (re)create the smeared copy from the resident field
  if (gaugeSmeared != nullptr) delete gaugeSmeared;
  gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileOvrImpSTOUT);

  auto *cudaGaugeTemp = new cudaGaugeField(gParam);

  param.compute_qcharge = QUDA_BOOLEAN_TRUE;

  if (getVerbosity() >= QUDA_SUMMARIZE) {
    printfQuda("Q charge at step %03d = %+.16e\n", 0, param.qcharge);
  }

  for (unsigned int i = 0; i < n_steps; i++) {
    profileOvrImpSTOUT.TPSTART(QUDA_PROFILE_COMPUTE);
    OvrImpSTOUTStep(*gaugeSmeared, *cudaGaugeTemp, rho, epsilon);
    profileOvrImpSTOUT.TPSTOP(QUDA_PROFILE_COMPUTE);
    if ((i + 1) % meas_interval == 0 && getVerbosity() >= QUDA_VERBOSE) {
      printfQuda("Q charge at step %03d = %+.16e\n", i + 1, param.qcharge);
    }
  }

  delete cudaGaugeTemp;
  profileOvrImpSTOUT.TPSTOP(QUDA_PROFILE_TOTAL);
}
5792 
// Integrate n_steps of Wilson/Symanzik gradient flow (type selected by
// wflow_type) with step size step_size, starting from the resident gauge
// field; prints plaquette, energy and topological charge every meas_interval
// steps. The final flowed field is left in whichever of gaugeSmeared/gaugeAux
// `out` points at (ping-pong buffering).
// NOTE(review): extraction dropped the gParam and observable-param
// declarations — restore from upstream.
void performWFlownStep(unsigned int n_steps, double step_size, int meas_interval, QudaWFlowType wflow_type)
{
  pushOutputPrefix("performWFlownStep: ");
  profileWFlow.TPSTART(QUDA_PROFILE_TOTAL);

  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");

  if (gaugeSmeared != nullptr) delete gaugeSmeared;
  gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileWFlow);

  // auxiliary extended field for the ping-pong update
  GaugeFieldParam gParamEx(*gaugeSmeared);
  auto *gaugeAux = GaugeField::Create(gParamEx);

  gParam.reconstruct = QUDA_RECONSTRUCT_NO; // temporary field is not on manifold so cannot use reconstruct
  auto *gaugeTemp = GaugeField::Create(gParam);

  GaugeField *in = gaugeSmeared;
  GaugeField *out = gaugeAux;

  param.compute_plaquette = QUDA_BOOLEAN_TRUE;
  param.compute_qcharge = QUDA_BOOLEAN_TRUE;

  if (getVerbosity() >= QUDA_SUMMARIZE) {
    // report the observables at flow time zero
    gaugeObservables(*in, param, profileWFlow);
    printfQuda("flow t, plaquette, E_tot, E_spatial, E_temporal, Q charge\n");
    printfQuda("%le %.16e %+.16e %+.16e %+.16e %+.16e\n", 0.0, param.plaquette[0], param.energy[0], param.energy[1],
               param.energy[2], param.qcharge);
  }

  for (unsigned int i = 0; i < n_steps; i++) {
    // Perform W1, W2, and Vt Wilson Flow steps as defined in
    // https://arxiv.org/abs/1006.4518v3
    profileWFlow.TPSTART(QUDA_PROFILE_COMPUTE);
    if (i > 0) std::swap(in, out); // output from prior step becomes input for next step

    WFlowStep(*out, *gaugeTemp, *in, step_size, wflow_type);
    profileWFlow.TPSTOP(QUDA_PROFILE_COMPUTE);

    if ((i + 1) % meas_interval == 0 && getVerbosity() >= QUDA_SUMMARIZE) {
      gaugeObservables(*out, param, profileWFlow);
      printfQuda("%le %.16e %+.16e %+.16e %+.16e %+.16e\n", step_size * (i + 1), param.plaquette[0], param.energy[0],
                 param.energy[1], param.energy[2], param.qcharge);
    }
  }

  delete gaugeTemp;
  delete gaugeAux;
  profileWFlow.TPSTOP(QUDA_PROFILE_TOTAL);
  popOutputPrefix();
}
5845 
// Fix the gauge of the host field `gauge` with the overrelaxation algorithm
// (Landau gauge_dir==4, Coulomb gauge_dir==3 — convention per the underlying
// gaugeFixingOVR; confirm in its header). Single-rank runs fix in place;
// multi-rank runs go via an extended (halo-carrying) copy. The fixed field is
// written back to the host; if make_resident_gauge it also replaces
// gaugePrecise. timeinfo (if non-null) receives H2D/compute/D2H seconds.
// Always returns 0.
// NOTE(review): extraction dropped the gParam adjustments before the device
// allocation; also cpuGauge is never deleted and, on the multi-GPU path,
// cudaInGaugeEx leaks — confirm/fix against upstream.
int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps,
                              const unsigned int verbose_interval, const double relax_boost, const double tolerance,
                              const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param,
                              double *timeinfo)
{
  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_TOTAL);

  checkGaugeParam(param);

  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_INIT);
  GaugeFieldParam gParam(gauge, *param);
  auto *cpuGauge = new cpuGaugeField(gParam);

  // gParam.pad = getFatLinkPadding(param->X);
  auto *cudaInGauge = new cudaGaugeField(gParam);

  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_INIT);
  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_H2D);

  cudaInGauge->loadCPUField(*cpuGauge);
  /* } else { // or use resident fields already present
     if (!gaugePrecise) errorQuda("No resident gauge field allocated");
     cudaInGauge = gaugePrecise;
     gaugePrecise = nullptr;
     } */

  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_H2D);

  if (comm_size() == 1) {
    // perform the update
    GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_COMPUTE);
    gaugeFixingOVR(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval,
                   stopWtheta);
    GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_COMPUTE);
  } else {
    // multi-rank: fix on an extended field so halos stay consistent
    cudaGaugeField *cudaInGaugeEx = createExtendedGauge(*cudaInGauge, R, GaugeFixOVRQuda);

    // perform the update
    GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_COMPUTE);
    gaugeFixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval,
                   stopWtheta);
    GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_COMPUTE);

    //HOW TO COPY BACK TO CPU: cudaInGaugeEx->cpuGauge
    copyExtendedGauge(*cudaInGauge, *cudaInGaugeEx, QUDA_CUDA_FIELD_LOCATION);
  }

  // copy the gauge field back to the host
  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_D2H);
  cudaInGauge->saveCPUField(*cpuGauge);
  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_D2H);

  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_TOTAL);

  if (param->make_resident_gauge) {
    if (gaugePrecise != nullptr) delete gaugePrecise;
    gaugePrecise = cudaInGauge;
  } else {
    delete cudaInGauge;
  }

  if(timeinfo){
    timeinfo[0] = GaugeFixOVRQuda.Last(QUDA_PROFILE_H2D);
    timeinfo[1] = GaugeFixOVRQuda.Last(QUDA_PROFILE_COMPUTE);
    timeinfo[2] = GaugeFixOVRQuda.Last(QUDA_PROFILE_D2H);
  }

  return 0;
}
5920 
// Fix the gauge of a host-resident gauge field using the steepest-descent
// method with FFT acceleration.  Per the interface documentation this path
// supports a single GPU only (note there is no comm_size() branch below,
// unlike the overrelaxation variant).  Returns 0; when timeinfo is non-null,
// the H2D / compute / D2H times (seconds) are written into timeinfo[0..2].
// NOTE(review): this Doxygen listing omits original lines 5935-5938
// (additional gParam setup), so the excerpt below is not byte-complete -
// confirm against the full source file.
5921 int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir, const unsigned int Nsteps, \
5922  const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, \
5923  const unsigned int stopWtheta, QudaGaugeParam* param , double* timeinfo)
5924 {
5925  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_TOTAL);
5926 
 // validate the caller-supplied gauge parameters before touching any fields
5927  checkGaugeParam(param);
5928 
5929  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_INIT);
5930 
 // wrap the caller's host pointer, then mirror it with a device-side field
5931  GaugeFieldParam gParam(gauge, *param);
5932  auto *cpuGauge = new cpuGaugeField(gParam);
5933 
5934  //gParam.pad = getFatLinkPadding(param->X);
5939  auto *cudaInGauge = new cudaGaugeField(gParam);
5940 
5941 
5942  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_INIT);
5943 
5944  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_H2D);
5945 
5946  //if (!param->use_resident_gauge) { // load fields onto the device
 // download the host gauge field onto the device
5947  cudaInGauge->loadCPUField(*cpuGauge);
5948  /*} else { // or use resident fields already present
5949  if (!gaugePrecise) errorQuda("No resident gauge field allocated");
5950  cudaInGauge = gaugePrecise;
5951  gaugePrecise = nullptr;
5952  } */
5953 
5954  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_H2D);
5955 
5956  // perform the update
5957  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_COMPUTE);
5958 
5959  gaugeFixingFFT(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
5960 
5961  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_COMPUTE);
5962 
5963  // copy the gauge field back to the host
5964  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_D2H);
5965  cudaInGauge->saveCPUField(*cpuGauge);
5966  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_D2H);
5967 
5968  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_TOTAL);
5969 
 // either hand the device field over as the resident gauge, or release it
5970  if (param->make_resident_gauge) {
5971  if (gaugePrecise != nullptr) delete gaugePrecise;
5972  gaugePrecise = cudaInGauge;
5973  } else {
5974  delete cudaInGauge;
5975  }
5976 
 // NOTE(review): cpuGauge (the host wrapper) is not deleted in this excerpt -
 // possible leak; confirm against the full source
 // report per-phase timings to the caller if requested
5977  if (timeinfo) {
5978  timeinfo[0] = GaugeFixFFTQuda.Last(QUDA_PROFILE_H2D);
5979  timeinfo[1] = GaugeFixFFTQuda.Last(QUDA_PROFILE_COMPUTE);
5980  timeinfo[2] = GaugeFixFFTQuda.Last(QUDA_PROFILE_D2H);
5981  }
5982 
5983  return 0;
5984 }
5985 
// Host-facing contraction entry point: wraps the two host spinor pointers
// hp_x / hp_y, copies them to device fields, runs the contraction kernel for
// the requested contraction type cType, and copies the result buffer back
// into h_result.  The device result buffer holds
// Volume * Nspin^2 complex numbers at the fields' precision (see data_bytes).
// NOTE(review): this Doxygen listing omits original line 6007 (presumably the
// gamma-basis assignment described by the comment just above it) - confirm
// against the full source file.
5986 void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const QudaContractType cType,
5987  QudaInvertParam *param, const int *X)
5988 {
5989  // DMH: Easiest way to construct ColorSpinorField? Do we require the user
5990  // to declare and fill and invert_param, or can it just be hacked?.
5991 
5992  profileContract.TPSTART(QUDA_PROFILE_TOTAL);
5993  profileContract.TPSTART(QUDA_PROFILE_INIT);
5994  // wrap CPU host side pointers
5995  ColorSpinorParam cpuParam((void *)hp_x, *param, X, false, param->input_location);
5996  ColorSpinorField *h_x = ColorSpinorField::Create(cpuParam);
5997 
 // reuse the same parameter struct for the second host field; only the data
 // pointer differs
5998  cpuParam.v = (void *)hp_y;
5999  ColorSpinorField *h_y = ColorSpinorField::Create(cpuParam);
6000 
6001  // Create device parameter
6002  ColorSpinorParam cudaParam(cpuParam);
6003  cudaParam.location = QUDA_CUDA_FIELD_LOCATION;
6004  cudaParam.create = QUDA_NULL_FIELD_CREATE;
6005  // Quda uses Degrand-Rossi gamma basis for contractions and will
6006  // automatically reorder data if necessary.
 // force a native field order on the device (third argument) while keeping
 // the host precision
6008  cudaParam.setPrecision(cpuParam.Precision(), cpuParam.Precision(), true);
6009 
6010  std::vector<ColorSpinorField *> x, y;
6011  x.push_back(ColorSpinorField::Create(cudaParam));
6012  y.push_back(ColorSpinorField::Create(cudaParam));
6013 
 // result buffer: Volume * Nspin * Nspin complex values (the factor 2) at
 // the field precision (bytes per real component)
6014  size_t data_bytes = x[0]->Volume() * x[0]->Nspin() * x[0]->Nspin() * 2 * x[0]->Precision();
6015  void *d_result = pool_device_malloc(data_bytes);
6016  profileContract.TPSTOP(QUDA_PROFILE_INIT);
6017 
 // copy the wrapped host fields into the device fields
6018  profileContract.TPSTART(QUDA_PROFILE_H2D);
6019  *x[0] = *h_x;
6020  *y[0] = *h_y;
6021  profileContract.TPSTOP(QUDA_PROFILE_H2D);
6022 
 // launch the contraction kernel (device-side overload of contractQuda)
6023  profileContract.TPSTART(QUDA_PROFILE_COMPUTE);
6024  contractQuda(*x[0], *y[0], d_result, cType);
6025  profileContract.TPSTOP(QUDA_PROFILE_COMPUTE);
6026 
 // return the result to the caller's host buffer
6027  profileContract.TPSTART(QUDA_PROFILE_D2H);
6028  qudaMemcpy(h_result, d_result, data_bytes, cudaMemcpyDeviceToHost);
6029  profileContract.TPSTOP(QUDA_PROFILE_D2H);
6030 
 // release the device buffer and all four temporary fields
6031  profileContract.TPSTART(QUDA_PROFILE_FREE);
6032  pool_device_free(d_result);
6033  delete x[0];
6034  delete y[0];
6035  delete h_y;
6036  delete h_x;
6037  profileContract.TPSTOP(QUDA_PROFILE_FREE);
6038 
6039  profileContract.TPSTOP(QUDA_PROFILE_TOTAL);
6040 }
6041 
// NOTE(review): the listing drops original line 6042 - per the index this is
// the signature "void gaugeObservablesQuda(QudaGaugeObservableParam *param)" -
// and original line 6049 (inside the !gaugeSmeared branch), so this excerpt
// is not compilable as shown; confirm against the full source file.
// Computes gauge-field observables (plaquette, energy, topological charge,
// etc., as requested in *param) on the smeared gauge field when one is
// loaded, otherwise on the extended resident gauge field.
6043 {
6044  profileGaugeObs.TPSTART(QUDA_PROFILE_TOTAL);
6045  checkGaugeObservableParam(param);
6046 
 // choose which device field to measure: prefer the smeared field if present
6047  cudaGaugeField *gauge = nullptr;
6048  if (!gaugeSmeared) {
6050  gauge = extendedGaugeResident;
6051  } else {
6052  gauge = gaugeSmeared;
6053  }
6054 
6055  gaugeObservables(*gauge, *param, profileGaugeObs);
6056  profileGaugeObs.TPSTOP(QUDA_PROFILE_TOTAL);
6057 }
void CloseMagma()
void OpenMagma()
Conjugate-Gradient Solver.
Definition: invert_quda.h:639
double * TrLog() const
Definition: clover_field.h:157
double Coeff() const
Definition: clover_field.h:202
static CloverField * Create(const CloverFieldParam &param)
void setRho(double rho)
Bakes in the rho factor into the clover field, (for real diagonal additive Hasenbusch),...
void * V(bool inverse=false)
Definition: clover_field.h:138
double Csw() const
Definition: clover_field.h:197
size_t Bytes() const
Definition: clover_field.h:167
const ColorSpinorField & Odd() const
QudaSiteSubset SiteSubset() const
static ColorSpinorField * Create(const ColorSpinorParam &param)
const ColorSpinorField & Even() const
QudaFieldLocation location
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
virtual void Dslash4(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
Apply the local MdagM operator: equivalent to applying zero Dirichlet boundary condition to MdagM on ...
Definition: dirac_quda.h:243
virtual void prefetch(QudaFieldLocation mem_space, qudaStream_t stream=0) const
If managed memory and prefetch is enabled, prefetch the gauge field and temporary spinors to the CPU ...
Definition: dirac.cpp:305
void setMass(double mass)
Definition: dirac_quda.h:282
void Dagger(QudaDagType dag) const
sets whether operator is daggered or not
Definition: dirac_quda.h:333
static Dirac * create(const DiracParam &param)
Creates a subclass from parameters.
Definition: dirac.cpp:151
void Mdag(ColorSpinorField &out, const ColorSpinorField &in) const
Apply Mdag (daggered operator of M.
Definition: dirac.cpp:92
double shift
Shift term added onto operator (M/M^dag M/M M^dag + shift)
Definition: dirac_quda.h:1967
Complex b_5[QUDA_MAX_DWF_LS]
Definition: dirac_quda.h:29
ColorSpinorField * tmp1
Definition: dirac_quda.h:52
cudaGaugeField * gauge
Definition: dirac_quda.h:41
int commDim[QUDA_MAX_DIM]
Definition: dirac_quda.h:55
QudaDiracType type
Definition: dirac_quda.h:24
cudaCloverField * clover
Definition: dirac_quda.h:45
Complex c_5[QUDA_MAX_DWF_LS]
Definition: dirac_quda.h:30
cudaGaugeField * longGauge
Definition: dirac_quda.h:43
double eofa_shift
Definition: dirac_quda.h:33
cudaGaugeField * fatGauge
Definition: dirac_quda.h:42
QudaPrecision halo_precision
Definition: dirac_quda.h:57
QudaMatPCType matpcType
Definition: dirac_quda.h:39
QudaDagType dagger
Definition: dirac_quda.h:40
static EigenSolver * create(QudaEigParam *eig_param, const DiracMatrix &mat, TimeProfile &profile)
Creates the eigensolver using the parameters given and the matrix.
virtual void prepare(ColorSpinorField *&src, ColorSpinorField *&sol, ColorSpinorField &x, ColorSpinorField &b, const QudaSolutionType) const
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const
Apply M for the dirac op. E.g. the Schur Complement operator.
virtual void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
Apply MdagM operator which may be optimized.
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
apply 'dslash' operator for the DiracOp. This may be e.g. AD
virtual void reconstruct(ColorSpinorField &x, const ColorSpinorField &b, const QudaSolutionType) const
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:294
QudaStaggeredPhase StaggeredPhase() const
Definition: gauge_field.h:295
void removeStaggeredPhase()
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:287
size_t Bytes() const
Definition: gauge_field.h:352
static GaugeField * Create(const GaugeFieldParam &param)
Create the gauge field, with meta data specified in the parameter struct.
void applyStaggeredPhase(QudaStaggeredPhase phase=QUDA_STAGGERED_PHASE_INVALID)
int Ncolor() const
Definition: gauge_field.h:285
virtual void * Gauge_p()
Definition: gauge_field.h:358
uint64_t checksum(bool mini=false) const
double Anisotropy() const
Definition: gauge_field.h:288
bool StaggeredPhaseApplied() const
Definition: gauge_field.h:296
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:286
QudaPrecision Precision() const
const int * X() const
static void freeGhostBuffer(void)
Free statically allocated ghost buffers.
This computes the optimum guess for the system Ax=b in the L2 residual norm. For use in the HMD force...
Definition: invert_quda.h:1303
Multi-Shift Conjugate Gradient Solver.
Definition: invert_quda.h:1258
static Solver * create(SolverParam &param, const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon, const DiracMatrix &matEig, TimeProfile &profile)
Solver factory.
Definition: solver.cpp:42
void Print()
Definition: timer.cpp:7
static void PrintGlobal()
Definition: timer.cpp:84
double Last(QudaProfileType idx)
Definition: timer.h:254
void copy(const CloverField &src, bool inverse=true)
Copy into this CloverField from the generic CloverField src.
void copy(const GaugeField &src)
void exchangeGhost(QudaLinkDirection link_direction=QUDA_LINK_BACKWARDS)
Exchange the ghost and store store in the padded region.
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
void comm_barrier(void)
int comm_size(void)
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data, bool user_set_comm_handle=false, void *user_comm=nullptr)
Initialize the communications, implemented in comm_single.cpp, comm_qmp.cpp, and comm_mpi....
int comm_dim_partitioned(int dim)
int(* QudaCommsMap)(const int *coords, void *fdata)
Definition: comm_quda.h:12
int comm_dim(int dim)
int commDimPartitioned(int dir)
void comm_finalize(void)
int comm_gpuid(void)
QudaWFlowType wflow_type
double kappa
QudaReconstructType link_recon_sloppy
double tol
QudaReconstructType link_recon
QudaVerbosity verbosity
QudaReconstructType link_recon_precondition
double tol_hq
double epsilon
QudaPrecision prec
constexpr quda::CommKey default_comm_key
void push_communicator(const quda::CommKey &split_key)
int V
Definition: host_utils.cpp:37
GaugeCovDev * dirac
Definition: covdev_test.cpp:42
QudaParity parity
Definition: covdev_test.cpp:40
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:34
QudaGaugeParam gauge_param
Definition: covdev_test.cpp:26
QudaInvertParam inv_param
Definition: covdev_test.cpp:27
@ QUDA_SOURCE_NORMALIZATION
Definition: enum_quda.h:234
@ QUDA_TWISTED_MASSPC_DIRAC
Definition: enum_quda.h:312
@ QUDA_GAUGE_LAPLACE_DIRAC
Definition: enum_quda.h:317
@ QUDA_GAUGE_COVDEV_DIRAC
Definition: enum_quda.h:319
@ QUDA_TWISTED_CLOVERPC_DIRAC
Definition: enum_quda.h:314
@ QUDA_MOBIUS_DOMAIN_WALLPC_EOFA_DIRAC
Definition: enum_quda.h:304
@ QUDA_CLOVER_HASENBUSCH_TWIST_DIRAC
Definition: enum_quda.h:295
@ QUDA_TWISTED_MASS_DIRAC
Definition: enum_quda.h:311
@ QUDA_STAGGERED_DIRAC
Definition: enum_quda.h:305
@ QUDA_CLOVER_HASENBUSCH_TWISTPC_DIRAC
Definition: enum_quda.h:296
@ QUDA_DOMAIN_WALL_4D_DIRAC
Definition: enum_quda.h:299
@ QUDA_ASQTAD_DIRAC
Definition: enum_quda.h:308
@ QUDA_STAGGEREDPC_DIRAC
Definition: enum_quda.h:306
@ QUDA_MOBIUS_DOMAIN_WALL_EOFA_DIRAC
Definition: enum_quda.h:303
@ QUDA_ASQTADPC_DIRAC
Definition: enum_quda.h:309
@ QUDA_GAUGE_LAPLACEPC_DIRAC
Definition: enum_quda.h:318
@ QUDA_MOBIUS_DOMAIN_WALLPC_DIRAC
Definition: enum_quda.h:302
@ QUDA_TWISTED_CLOVER_DIRAC
Definition: enum_quda.h:313
@ QUDA_DOMAIN_WALL_4DPC_DIRAC
Definition: enum_quda.h:300
@ QUDA_CLOVER_DIRAC
Definition: enum_quda.h:293
@ QUDA_MOBIUS_DOMAIN_WALL_DIRAC
Definition: enum_quda.h:301
@ QUDA_DOMAIN_WALLPC_DIRAC
Definition: enum_quda.h:298
@ QUDA_DOMAIN_WALL_DIRAC
Definition: enum_quda.h:297
@ QUDA_WILSONPC_DIRAC
Definition: enum_quda.h:292
@ QUDA_CLOVERPC_DIRAC
Definition: enum_quda.h:294
@ QUDA_WILSON_DIRAC
Definition: enum_quda.h:291
enum QudaWFlowType_s QudaWFlowType
enum QudaPrecision_s QudaPrecision
@ QUDA_STAGGERED_PHASE_NO
Definition: enum_quda.h:515
@ QUDA_COVDEV_DSLASH
Definition: enum_quda.h:102
@ QUDA_WILSON_DSLASH
Definition: enum_quda.h:90
@ QUDA_TWISTED_CLOVER_DSLASH
Definition: enum_quda.h:100
@ QUDA_STAGGERED_DSLASH
Definition: enum_quda.h:97
@ QUDA_MOBIUS_DWF_DSLASH
Definition: enum_quda.h:95
@ QUDA_CLOVER_WILSON_DSLASH
Definition: enum_quda.h:91
@ QUDA_TWISTED_MASS_DSLASH
Definition: enum_quda.h:99
@ QUDA_DOMAIN_WALL_DSLASH
Definition: enum_quda.h:93
@ QUDA_ASQTAD_DSLASH
Definition: enum_quda.h:98
@ QUDA_MOBIUS_DWF_EOFA_DSLASH
Definition: enum_quda.h:96
@ QUDA_LAPLACE_DSLASH
Definition: enum_quda.h:101
@ QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH
Definition: enum_quda.h:92
@ QUDA_DOMAIN_WALL_4D_DSLASH
Definition: enum_quda.h:94
@ QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:326
@ QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:325
@ QUDA_KAPPA_NORMALIZATION
Definition: enum_quda.h:226
@ QUDA_ASYMMETRIC_MASS_NORMALIZATION
Definition: enum_quda.h:228
@ QUDA_MASS_NORMALIZATION
Definition: enum_quda.h:227
@ QUDA_DAG_NO
Definition: enum_quda.h:223
@ QUDA_DAG_YES
Definition: enum_quda.h:223
@ QUDA_USE_INIT_GUESS_YES
Definition: enum_quda.h:430
@ QUDA_DEBUG_VERBOSE
Definition: enum_quda.h:268
@ QUDA_SUMMARIZE
Definition: enum_quda.h:266
@ QUDA_VERBOSE
Definition: enum_quda.h:267
@ QUDA_FULL_SITE_SUBSET
Definition: enum_quda.h:333
@ QUDA_PARITY_SITE_SUBSET
Definition: enum_quda.h:332
@ QUDA_BOOLEAN_TRUE
Definition: enum_quda.h:461
@ QUDA_DEGRAND_ROSSI_GAMMA_BASIS
Definition: enum_quda.h:368
@ QUDA_UKQCD_GAMMA_BASIS
Definition: enum_quda.h:369
@ QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:70
@ QUDA_RECONSTRUCT_12
Definition: enum_quda.h:71
@ QUDA_RECONSTRUCT_8
Definition: enum_quda.h:72
@ QUDA_RECONSTRUCT_10
Definition: enum_quda.h:75
@ QUDA_PERIODIC_T
Definition: enum_quda.h:57
enum QudaPCType_s QudaPCType
@ QUDA_EVEN_PARITY
Definition: enum_quda.h:284
@ QUDA_ODD_PARITY
Definition: enum_quda.h:284
@ QUDA_MEMORY_MAPPED
Definition: enum_quda.h:15
@ QUDA_MEMORY_PINNED
Definition: enum_quda.h:14
@ QUDA_MEMORY_DEVICE
Definition: enum_quda.h:13
@ QUDA_TIFR_PADDED_DIRAC_ORDER
Definition: enum_quda.h:250
@ QUDA_CPS_WILSON_DIRAC_ORDER
Definition: enum_quda.h:248
@ QUDA_HEAVY_QUARK_RESIDUAL
Definition: enum_quda.h:195
@ QUDA_SCALAR_GEOMETRY
Definition: enum_quda.h:500
@ QUDA_VECTOR_GEOMETRY
Definition: enum_quda.h:501
@ QUDA_TENSOR_GEOMETRY
Definition: enum_quda.h:502
enum QudaFieldGeometry_s QudaFieldGeometry
@ QUDA_TRANSFER_COARSE_KD
Definition: enum_quda.h:454
@ QUDA_TRANSFER_OPTIMIZED_KD
Definition: enum_quda.h:455
enum QudaFieldLocation_s QudaFieldLocation
@ QUDA_GHOST_EXCHANGE_EXTENDED
Definition: enum_quda.h:510
@ QUDA_GHOST_EXCHANGE_NO
Definition: enum_quda.h:508
@ QUDA_GHOST_EXCHANGE_PAD
Definition: enum_quda.h:509
@ QUDA_MATPC_ODD_ODD_ASYMMETRIC
Definition: enum_quda.h:219
@ QUDA_MATPC_EVEN_EVEN_ASYMMETRIC
Definition: enum_quda.h:218
@ QUDA_INC_EIGCG_INVERTER
Definition: enum_quda.h:117
@ QUDA_PCG_INVERTER
Definition: enum_quda.h:114
@ QUDA_INVALID_INVERTER
Definition: enum_quda.h:133
@ QUDA_EIGCG_INVERTER
Definition: enum_quda.h:116
@ QUDA_MG_INVERTER
Definition: enum_quda.h:122
@ QUDA_EVEN_ODD_SITE_ORDER
Definition: enum_quda.h:340
enum QudaReconstructType_s QudaReconstructType
@ QUDA_MATPC_SOLUTION
Definition: enum_quda.h:159
@ QUDA_MATDAG_MAT_SOLUTION
Definition: enum_quda.h:158
@ QUDA_MATPCDAG_MATPC_SOLUTION
Definition: enum_quda.h:161
@ QUDA_MAT_SOLUTION
Definition: enum_quda.h:157
@ QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:65
@ QUDA_SINGLE_PRECISION
Definition: enum_quda.h:64
@ QUDA_5D_PC
Definition: enum_quda.h:397
@ QUDA_4D_PC
Definition: enum_quda.h:397
@ QUDA_INVALID_SCHWARZ
Definition: enum_quda.h:189
@ QUDA_FLOAT2_GAUGE_ORDER
Definition: enum_quda.h:40
@ QUDA_BQCD_GAUGE_ORDER
Definition: enum_quda.h:49
@ QUDA_TIFR_GAUGE_ORDER
Definition: enum_quda.h:50
@ QUDA_TIFR_PADDED_GAUGE_ORDER
Definition: enum_quda.h:51
@ QUDA_MILC_GAUGE_ORDER
Definition: enum_quda.h:47
enum QudaContractType_s QudaContractType
@ QUDA_SPECTRUM_SI_EIG
Definition: enum_quda.h:152
@ QUDA_SPECTRUM_LI_EIG
Definition: enum_quda.h:151
@ QUDA_FLOAT2_FIELD_ORDER
Definition: enum_quda.h:348
@ QUDA_SPACE_COLOR_SPIN_FIELD_ORDER
Definition: enum_quda.h:352
@ QUDA_SPACE_SPIN_COLOR_FIELD_ORDER
Definition: enum_quda.h:351
enum QudaVerbosity_s QudaVerbosity
@ QUDA_ZERO_FIELD_CREATE
Definition: enum_quda.h:361
@ QUDA_COPY_FIELD_CREATE
Definition: enum_quda.h:362
@ QUDA_REFERENCE_FIELD_CREATE
Definition: enum_quda.h:363
@ QUDA_NULL_FIELD_CREATE
Definition: enum_quda.h:360
@ QUDA_TWIST_SINGLET
Definition: enum_quda.h:400
@ QUDA_TWIST_NONDEG_DOUBLET
Definition: enum_quda.h:401
@ QUDA_DIRECT_SOLVE
Definition: enum_quda.h:167
@ QUDA_NORMERR_SOLVE
Definition: enum_quda.h:171
@ QUDA_NORMERR_PC_SOLVE
Definition: enum_quda.h:172
@ QUDA_NORMOP_PC_SOLVE
Definition: enum_quda.h:170
@ QUDA_DIRECT_PC_SOLVE
Definition: enum_quda.h:169
enum QudaParity_s QudaParity
@ QUDA_SU3_LINKS
Definition: enum_quda.h:24
@ QUDA_ASQTAD_MOM_LINKS
Definition: enum_quda.h:33
@ QUDA_ASQTAD_LONG_LINKS
Definition: enum_quda.h:32
@ QUDA_GENERAL_LINKS
Definition: enum_quda.h:25
@ QUDA_WILSON_LINKS
Definition: enum_quda.h:30
@ QUDA_SMEARED_LINKS
Definition: enum_quda.h:29
@ QUDA_ASQTAD_FAT_LINKS
Definition: enum_quda.h:31
cudaGaugeField * cudaMom
cpuGaugeField * cpuMom
GaugeFieldParam gParam
cpuGaugeField * cpuGauge
cudaGaugeField * cudaForce
cudaGaugeField * cudaGauge
cudaGaugeField * cudaFatLink
cpuGaugeField * cpuFatLink
double kappa5
Definition: host_utils.cpp:51
void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param)
void initQudaMemory()
void computeHISQForceQuda(void *const milc_momentum, double dt, const double level2_coeff[6], const double fat7_coeff[6], const void *const w_link, const void *const v_link, const void *const u_link, void **fermion, int num_terms, int num_naik_terms, double **coeff, QudaGaugeParam *gParam)
double momActionQuda(void *momentum, QudaGaugeParam *param)
void compute_gauge_force_quda_(void *mom, void *gauge, int *num_loop_types, double *coeff, double *dt, QudaGaugeParam *param)
Compute the gauge force and update the momentum field.
void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, QudaGaugeParam *gauge_param)
Perform the solve like @invertQuda but for multiple rhs by splitting the comm grid into sub-partitions...
void gaussGaugeQuda(unsigned long long seed, double sigma)
Generate Gaussian distributed fields and store in the resident gauge field. We create a Gaussian-dist...
void initQuda(int dev)
void update_gauge_field_quda_(void *gauge, void *momentum, double *dt, bool *conj_mom, bool *exact, QudaGaugeParam *param)
quda::cudaGaugeField * checkGauge(QudaInvertParam *param)
std::vector< cudaColorSpinorField * > solutionResident
#define STR(x)
void * createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param)
cudaGaugeField * gaugeFatPrecise
void new_quda_gauge_param_(QudaGaugeParam *param)
void destroyGaugeFieldQuda(void *gauge)
cudaGaugeField * momResident
void set_kernel_pack_t_(int *pack)
Temporary function exposed for TIFR benchmarking
void new_quda_invert_param_(QudaInvertParam *param)
void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param)
void * newDeflationQuda(QudaEigParam *eig_param)
void end_quda_()
void apply_staggered_phase_quda_()
Apply the staggered phase factors to the resident gauge field.
void checkClover(QudaInvertParam *param)
std::vector< std::vector< ColorSpinorField * > > chronoResident(QUDA_MAX_CHRONO)
void free_sloppy_gauge_quda_()
void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
void momResidentQuda(void *mom, QudaGaugeParam *param)
cudaGaugeField * gaugeLongPrecise
cudaGaugeField * gaugeSloppy
void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
void init_quda_memory_()
void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam)
cudaGaugeField * gaugeLongPrecondition
cudaGaugeField * extendedGaugeResident
cudaCloverField * cloverPrecondition
void setMPICommHandleQuda(void *mycomm)
cudaGaugeField * gaugeRefinement
void plaqQuda(double plaq[3])
void performAPEnStep(unsigned int n_steps, double alpha, int meas_interval)
void invertMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv)
Really the same with @invertMultiSrcQuda but for clover-style fermions, by accepting pointers to dire...
void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param)
cudaGaugeField * gaugeFatExtended
#define QUDA_MAX_CHRONO
void destroyDeflationQuda(void *df)
void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
void freeSloppyCloverQuda()
cudaGaugeField * gaugeSmeared
void loadSloppyCloverQuda(const QudaPrecision prec[])
void init_quda_(int *dev)
void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param)
void freeSloppyGaugeQuda()
void freeGaugeQuda(void)
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
cudaCloverField * cloverSloppy
void * newMultigridQuda(QudaMultigridParam *mg_param)
void openMagma()
TimeProfile & getProfileBLAS()
Profiler for covariant derivative.
void invertMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param)
Really the same with @invertMultiSrcQuda but for staggered-style fermions, by accepting pointers to f...
cudaGaugeField * gaugeLongRefinement
void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv)
Really the same with @dslashMultiSrcQuda but for clover-style fermions, by accepting pointers to dire...
int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
void dslashMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, QudaGaugeParam *gauge_param)
Perform the solve like @dslashQuda but for multiple rhs by splitting the comm grid into sub-partitions...
cudaGaugeField * gaugeFatRefinement
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void destroyMultigridQuda(void *mg)
Free resources allocated by the multigrid solver.
void initQudaDevice(int dev)
void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
cudaGaugeField * gaugeLongExtended
cudaGaugeField * gaugeLongSloppy
int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with overrelaxation with support for single and multi GPU.
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void free_clover_quda_(void)
void comm_set_gridsize_(int *grid)
void freeCloverQuda(void)
void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *recon)
cudaGaugeField * gaugeEigensolver
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
void performWFlownStep(unsigned int n_steps, double step_size, int meas_interval, QudaWFlowType wflow_type)
void register_pinned_quda_(void *ptr, size_t *bytes)
Pin a pre-existing memory allocation.
void kinetic_quda_(double *kin, void *momentum, QudaGaugeParam *param)
Evaluate the kinetic (momentum) contribution to classical Hamiltonian for Hybrid Monte Carlo.
cudaGaugeField * gaugePrecise
cudaGaugeField * gaugeFatSloppy
void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity)
void dslashMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param)
Really the same with @dslashMultiSrcQuda but for staggered-style fermions, by accepting pointers to f...
char * gitversion
Definition: version.cpp:4
cudaCloverField * cloverPrecise
cudaGaugeField * gaugeFatPrecondition
void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, void **x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void flushChronoQuda(int i)
Flush the chronological history for the given index.
void endQuda(void)
void init_quda_device_(int *dev)
void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv, Interface op, Args... args)
void copyExtendedResidentGaugeQuda(void *resident_gauge, QudaFieldLocation loc)
void loadFatLongGaugeQuda(QudaInvertParam *inv_param, QudaGaugeParam *gauge_param, void *milc_fatlinks, void *milc_longlinks)
void free_gauge_quda_()
void gaugeObservablesQuda(QudaGaugeObservableParam *param)
Calculates a variety of gauge-field observables. If a smeared gauge field is presently loaded (in gau...
void remove_staggered_phase_quda_()
Remove the staggered phase factors to the resident gauge field.
cudaGaugeField * gaugePrecondition
void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
Dump the null-space vectors to disk.
void unregister_pinned_quda_(void *ptr)
Unpin a pre-existing memory allocation.
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
cudaCloverField * cloverEigensolver
void performSTOUTnStep(unsigned int n_steps, double rho, int meas_interval)
int getGaugePadding(GaugeFieldParam &param)
void createCloverQuda(QudaInvertParam *invertParam)
void updateR()
update the radius for halos.
void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **h_p, double *coeff, double kappa2, double ck, int nvector, double multiplicity, void *gauge, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void performOvrImpSTOUTnStep(unsigned int n_steps, double rho, double epsilon, int meas_interval)
void flush_chrono_quda_(int *index)
Flush the chronological history for the given index.
cudaCloverField * cloverRefinement
cudaGaugeField * gaugeLongEigensolver
void checkBLASParam(QudaBLASParam &param)
void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity, int *inverse)
void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
cudaGaugeField * gaugeExtended
void plaq_quda_(double plaq[3])
#define MAX(a, b)
void compute_staggered_force_quda_(void *h_mom, double *dt, double *delta, void *gauge, void *x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int n_steps, double alpha)
void closeMagma()
cudaGaugeField * gaugeFatEigensolver
void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
Updates the multigrid preconditioner for the new gauge / clover field.
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
#define pool_device_malloc(size)
Definition: malloc_quda.h:170
#define safe_malloc(size)
Definition: malloc_quda.h:106
#define pool_device_free(ptr)
Definition: malloc_quda.h:171
#define get_mapped_device_pointer(ptr)
Definition: malloc_quda.h:116
#define host_free(ptr)
Definition: malloc_quda.h:115
#define mapped_malloc(size)
Definition: malloc_quda.h:108
void destroy()
Destroy the BLAS context.
void init()
Create the BLAS context.
void destroy()
Destroy the BLAS context.
void set_native(bool native)
void destroy()
void init()
unsigned long long bytes
void ax(double a, ColorSpinorField &x)
void zero(ColorSpinorField &a)
double norm2(const ColorSpinorField &a)
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: blas_quda.h:24
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
void stop()
Stop profiling.
Definition: device.cpp:228
void start()
Start profiling.
Definition: device.cpp:226
void create_context()
Create the streams associated with parallel execution.
Definition: device.cpp:185
void init(int dev)
Create the device context. Called by initQuda when initializing the library.
Definition: device.cpp:25
void destroy()
Free any persistent context state. Called by endQuda when tearing down the library.
Definition: device.cpp:200
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
Set the constant parameters for the force unitarization.
void hisqCompleteForce(GaugeField &oprod, const GaugeField &link)
Multiply the computed the force matrix by the gauge field and perform traceless anti-hermitian projec...
void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, double coeff)
Compute the long-link contribution to the fermion force.
void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff[6])
Compute the fat-link contribution to the fermion force.
void unitarizeForce(GaugeField &newForce, const GaugeField &oldForce, const GaugeField &gauge, int *unitarization_failed)
Unitarize the fermion force.
void init()
Initialize the memory pool allocator.
Definition: malloc.cpp:632
void flush_pinned()
Free all outstanding pinned-memory allocations.
Definition: malloc.cpp:753
void flush_device()
Free all outstanding device-memory allocations.
Definition: malloc.cpp:761
void applyU(GaugeField &force, GaugeField &U)
void APEStep(GaugeField &dataDs, GaugeField &dataOr, double alpha)
Apply APE smearing to the gauge field.
bool canReuseResidentGauge(QudaInvertParam *inv_param)
void createDslashEvents()
void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
void setKernelPackT(bool pack)
void saveTuneCache(bool error=false)
Definition: tune.cpp:439
void gaugeObservables(GaugeField &u, QudaGaugeObservableParam &param, TimeProfile &profile)
Calculates a variety of gauge-field observables.
void arpack_solve(std::vector< ColorSpinorField * > &h_evecs, std::vector< Complex > &h_evals, const DiracMatrix &mat, QudaEigParam *eig_param, TimeProfile &profile)
The QUDA interface function. One passes two allocated arrays to hold the eigenmode data,...
void loadTuneCache()
Definition: tune.cpp:337
void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
double computeMomAction(const GaugeField &mom)
Compute and return global the momentum action 1/2 mom^2.
constexpr int product(const CommKey &input)
Definition: comm_key.h:28
void massRescale(cudaColorSpinorField &b, QudaInvertParam &param, bool for_multishift)
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
double3 plaquette(const GaugeField &U)
Compute the plaquette of the gauge field.
void join_field(std::vector< Field * > &v_base_field, const Field &collect_field, const CommKey &comm_key, QudaPCType pc_type=QUDA_4D_PC)
Definition: split_grid.h:121
bool forceMonitor()
Whether we are monitoring the force or not.
void computeCloverSigmaTrace(GaugeField &output, const CloverField &clover, double coeff)
Compute the matrix tensor field necessary for the force calculation from the clover trace action....
void split_field(Field &collect_field, std::vector< Field * > &v_base_field, const CommKey &comm_key, QudaPCType pc_type=QUDA_4D_PC)
Definition: split_grid.h:17
void longKSLink(GaugeField *lng, const GaugeField &u, const double *coeff)
Compute the long links for improved staggered (Kogut-Susskind) fermions.
void createDiracWithRefine(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam &param, const bool pc_solve)
void fatKSLink(GaugeField *fat, const GaugeField &u, const double *coeff)
Compute the fat links for improved staggered (Kogut-Susskind) fermions.
void destroyDslashEvents()
void printPeakMemUsage()
Definition: malloc.cpp:539
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
Definition: quda_matrix.h:605
void printAPIProfile()
Print out the timer profile for CUDA API calls.
Definition: quda_api.cpp:495
void OvrImpSTOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, double epsilon)
Apply Over Improved STOUT smearing to the gauge field.
std::complex< double > Complex
Definition: quda_internal.h:86
void gaugeFixingFFT(GaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha, const int autotune, const double tolerance, const int stopWtheta)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
void WFlowStep(GaugeField &out, GaugeField &temp, GaugeField &in, double epsilon, QudaWFlowType wflow_type)
Apply Wilson Flow steps W1, W2, Vt to the gauge field. This routine assumes that the input and output...
void flushForceMonitor()
Flush any outstanding force monitoring information.
void computeCloverSigmaOprod(GaugeField &oprod, std::vector< ColorSpinorField * > &x, std::vector< ColorSpinorField * > &p, std::vector< std::vector< double > > &coeff)
Compute the outer product from the solver solution fields arising from the diagonal term of the fermi...
void updateGaugeField(GaugeField &out, double dt, const GaugeField &in, const GaugeField &mom, bool conj_mom, bool exact)
void ApplyLaplace(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, double a, double b, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override, TimeProfile &profile)
Driver for applying the Laplace stencil.
@ QUDA_PROFILE_INIT
Definition: timer.h:106
@ QUDA_PROFILE_EPILOGUE
Definition: timer.h:110
@ QUDA_PROFILE_COMPUTE
Definition: timer.h:108
@ QUDA_PROFILE_TOTAL
Definition: timer.h:149
@ QUDA_PROFILE_FREE
Definition: timer.h:111
@ QUDA_PROFILE_PREAMBLE
Definition: timer.h:107
@ QUDA_PROFILE_CHRONO
Definition: timer.h:113
@ QUDA_PROFILE_H2D
Definition: timer.h:104
@ QUDA_PROFILE_D2H
Definition: timer.h:105
cudaGaugeField * createExtendedGauge(cudaGaugeField &in, const int *R, TimeProfile &profile, bool redundant_comms=false, QudaReconstructType recon=QUDA_RECONSTRUCT_INVALID)
void computeClover(CloverField &clover, const GaugeField &fmunu, double coeff)
Driver for computing the clover field from the field strength tensor.
void gaugeGauss(GaugeField &U, RNG &rngstate, double epsilon)
Generate Gaussian distributed su(N) or SU(N) fields. If U is a momentum field, then we generate rando...
void gaugeFixingOVR(GaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta)
Gauge fixing with overrelaxation with support for single and multi GPU.
void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff, QudaParity parity)
Compute the derivative of the clover matrix in the direction mu,nu and compute the resulting force gi...
void cloverInvert(CloverField &clover, bool computeTraceLog)
This function compute the Cholesky decomposition of each clover matrix and stores the clover inverse ...
void setDiracEigParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
void unitarizeLinks(GaugeField &outfield, const GaugeField &infield, int *fails)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:111
void reorder_location_set(QudaFieldLocation reorder_location_)
Set whether data is reordered on the CPU or GPU. This can be set at QUDA initialization using the environ...
void flushProfile()
Flush profile contents, setting all counts to zero.
Definition: tune.cpp:522
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
void computeStaggeredOprod(GaugeField *out[], ColorSpinorField &in, const double coeff[], int nFace)
Compute the outer-product field between the staggered quark field's one and (for HISQ and ASQTAD) thr...
void gamma5(ColorSpinorField &out, const ColorSpinorField &in)
Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
void printLaunchTimer()
Definition: tune.cpp:880
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
__host__ __device__ std::enable_if<!isFixed< T1 >::value &&!isFixed< T2 >::value, void >::type copy(T1 &a, const T2 &b)
Copy function which is trivial between floating point types. When converting to an integer type,...
Definition: convert.h:64
void computeFmunu(GaugeField &Fmunu, const GaugeField &gauge)
Compute the Fmunu tensor.
void assertAllMemFree()
Definition: malloc.cpp:549
constexpr bool dynamic_clover_inverse()
Helper function that returns whether we have enabled dynamic clover inversion or not.
Definition: clover_field.h:518
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname)
void STOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho)
Apply STOUT smearing to the gauge field.
void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam &param, const bool pc_solve)
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
void projectSU3(GaugeField &U, double tol, int *fails)
Project the input gauge field onto the SU(3) group. This is a destructive operation....
void gaugeForce(GaugeField &mom, const GaugeField &u, double coeff, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
Compute the gauge-force contribution to the momentum.
void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
void saveProfile(const std::string label="")
Save profile to disk.
Definition: tune.cpp:532
void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector< ColorSpinorField * > &x, std::vector< ColorSpinorField * > &p, std::vector< double > &coeff)
Compute the force contribution from the solver solution fields.
void contractQuda(const ColorSpinorField &x, const ColorSpinorField &y, void *result, QudaContractType cType)
::std::string string
Definition: gtest-port.h:891
ColorSpinorParam csParam
Definition: pack_test.cpp:25
QudaGaugeParam param
Definition: pack_test.cpp:18
Main header file for the QUDA library.
void printQudaMultigridParam(QudaMultigridParam *param)
Definition: check_params.h:689
void printQudaInvertParam(QudaInvertParam *param)
Definition: check_params.h:342
QudaGaugeParam newQudaGaugeParam(void)
void printQudaEigParam(QudaEigParam *param)
Definition: check_params.h:158
QudaGaugeObservableParam newQudaGaugeObservableParam(void)
QudaInvertParam newQudaInvertParam(void)
void printQudaGaugeParam(QudaGaugeParam *param)
Definition: check_params.h:40
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_api.h:204
#define qudaMemset(ptr, value, count)
Definition: quda_api.h:218
#define qudaDeviceSynchronize()
Definition: quda_api.h:250
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5.
#define QUDA_VERSION_SUBMINOR
Definition: quda_constants.h:3
#define QUDA_VERSION_MAJOR
Definition: quda_constants.h:1
#define QUDA_VERSION_MINOR
Definition: quda_constants.h:2
#define QUDA_MAX_MULTI_SHIFT
Maximum number of shifts supported by the multi-shift solver. This number may be changed if need be.
Fortran interface functions.
int dims[QUDA_MAX_DIM]
QudaEigSpectrumType spectrum
Definition: quda.h:466
QudaBoolean use_dagger
Definition: quda.h:449
QudaBoolean arpack_check
Definition: quda.h:492
double secs
Definition: quda.h:539
QudaBoolean use_norm_op
Definition: quda.h:450
QudaPrecision cuda_prec_ritz
Definition: quda.h:510
QudaBoolean compute_gamma5
Definition: quda.h:460
QudaFieldLocation location
Definition: quda.h:516
double gflops
Definition: quda.h:536
QudaInvertParam * invert_param
Definition: quda.h:413
QudaMemoryType mem_type_ritz
Definition: quda.h:513
int n_conv
Definition: quda.h:475
QudaReconstructType reconstruct_precondition
Definition: quda.h:58
size_t mom_offset
Definition: quda.h:90
QudaReconstructType reconstruct
Definition: quda.h:49
QudaPrecision cuda_prec_precondition
Definition: quda.h:57
int ga_pad
Definition: quda.h:65
QudaLinkType type
Definition: quda.h:41
int make_resident_mom
Definition: quda.h:85
int return_result_mom
Definition: quda.h:87
size_t gauge_offset
Definition: quda.h:89
int use_resident_gauge
Definition: quda.h:82
QudaPrecision cuda_prec_refinement_sloppy
Definition: quda.h:54
QudaFieldLocation location
Definition: quda.h:33
QudaPrecision cuda_prec_sloppy
Definition: quda.h:51
QudaReconstructType reconstruct_sloppy
Definition: quda.h:52
QudaGaugeFieldOrder gauge_order
Definition: quda.h:42
int make_resident_gauge
Definition: quda.h:84
QudaPrecision cuda_prec
Definition: quda.h:48
size_t site_size
Definition: quda.h:91
QudaReconstructType reconstruct_eigensolver
Definition: quda.h:61
QudaStaggeredPhase staggered_phase_type
Definition: quda.h:73
int X[4]
Definition: quda.h:35
QudaPrecision cpu_prec
Definition: quda.h:46
int use_resident_mom
Definition: quda.h:83
int overlap
Definition: quda.h:78
QudaReconstructType reconstruct_refinement_sloppy
Definition: quda.h:55
int staggered_phase_applied
Definition: quda.h:74
int return_result_gauge
Definition: quda.h:86
int overwrite_mom
Definition: quda.h:80
QudaPrecision cuda_prec_eigensolver
Definition: quda.h:60
int compute_clover
Definition: quda.h:266
QudaSolveType solve_type
Definition: quda.h:229
double gflops
Definition: quda.h:277
QudaPrecision cuda_prec_refinement_sloppy
Definition: quda.h:240
QudaSolutionType solution_type
Definition: quda.h:228
QudaCloverFieldOrder clover_order
Definition: quda.h:256
QudaMassNormalization mass_normalization
Definition: quda.h:232
QudaFieldLocation clover_location
Definition: quda.h:248
double mq2
Definition: quda.h:128
QudaPrecision clover_cuda_prec_refinement_sloppy
Definition: quda.h:252
int num_offset
Definition: quda.h:186
QudaPrecision cuda_prec_eigensolver
Definition: quda.h:242
double mass
Definition: quda.h:109
QudaPrecision clover_cuda_prec
Definition: quda.h:250
double m5
Definition: quda.h:112
int return_clover
Definition: quda.h:268
int compute_clover_inverse
Definition: quda.h:267
QudaTwistFlavorType twist_flavor
Definition: quda.h:134
QudaPrecision clover_cpu_prec
Definition: quda.h:249
QudaDslashType dslash_type
Definition: quda.h:106
int return_clover_inverse
Definition: quda.h:269
double mq1
Definition: quda.h:127
int compute_clover_trlog
Definition: quda.h:263
QudaPrecision clover_cuda_prec_precondition
Definition: quda.h:253
QudaPrecision cuda_prec
Definition: quda.h:238
double mu
Definition: quda.h:131
QudaVerbosity verbosity
Definition: quda.h:271
double_complex b_5[QUDA_MAX_DWF_LS]
Definition: quda.h:115
double clover_rho
Definition: quda.h:261
QudaDslashType dslash_type_precondition
Definition: quda.h:312
double clover_coeff
Definition: quda.h:260
double trlogA[2]
Definition: quda.h:264
double eofa_shift
Definition: quda.h:125
double secs
Definition: quda.h:278
QudaPrecision clover_cuda_prec_eigensolver
Definition: quda.h:254
QudaDagType dagger
Definition: quda.h:231
QudaInverterType inv_type
Definition: quda.h:107
double epsilon
Definition: quda.h:132
double clover_csw
Definition: quda.h:259
double residue[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:218
QudaMatPCType matpc_type
Definition: quda.h:230
double_complex c_5[QUDA_MAX_DWF_LS]
Definition: quda.h:116
QudaPrecision clover_cuda_prec_sloppy
Definition: quda.h:251
double mq3
Definition: quda.h:129
QudaPrecision cuda_prec_sloppy
Definition: quda.h:239
QudaFieldLocation input_location
Definition: quda.h:103
QudaFieldLocation output_location
Definition: quda.h:104
QudaPrecision cuda_prec_precondition
Definition: quda.h:241
int use_resident_solution
Definition: quda.h:378
QudaDiracFieldOrder dirac_order
Definition: quda.h:244
double kappa
Definition: quda.h:110
QudaBoolean thin_update_only
Definition: quda.h:733
QudaPrecision precision_null[QUDA_MAX_MG_LEVEL]
Definition: quda.h:568
int n_vec[QUDA_MAX_MG_LEVEL]
Definition: quda.h:565
QudaTransferType transfer_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:727
QudaFieldLocation setup_location[QUDA_MAX_MG_LEVEL]
Definition: quda.h:674
QudaSolveType smoother_solve_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:662
QudaBoolean setup_minimize_memory
Definition: quda.h:682
QudaSchwarzType smoother_schwarz_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:652
QudaInvertParam * invert_param
Definition: quda.h:551
QudaPrecision smoother_halo_precision[QUDA_MAX_MG_LEVEL]
Definition: quda.h:649
bool twisted
Overall clover coefficient.
Definition: clover_field.h:44
double coeff
C_sw clover coefficient.
Definition: clover_field.h:43
QudaCloverFieldOrder order
Definition: clover_field.h:48
QudaFieldLocation location
Definition: clover_field.h:51
QudaFieldCreate create
Definition: clover_field.h:49
void setPrecision(QudaPrecision precision, bool force_native=false)
Helper function for setting the precision and corresponding field order for QUDA internal fields.
Definition: clover_field.h:59
static constexpr int n_dim
Definition: comm_key.h:8
constexpr bool is_valid() const
Definition: comm_key.h:22
QudaReconstructType reconstruct
Definition: gauge_field.h:50
QudaGaugeFieldOrder order
Definition: gauge_field.h:51
QudaFieldGeometry geometry
Definition: gauge_field.h:62
void setPrecision(QudaPrecision precision, bool force_native=false)
Helper function for setting the precision and corresponding field order for QUDA internal fields.
Definition: gauge_field.h:173
QudaLinkType link_type
Definition: gauge_field.h:53
QudaFieldCreate create
Definition: gauge_field.h:60
QudaTboundary t_boundary
Definition: gauge_field.h:54
QudaMemoryType mem_type
Definition: lattice_field.h:74
QudaGhostExchange ghostExchange
Definition: lattice_field.h:77
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:68
QudaSiteSubset siteSubset
Definition: lattice_field.h:72
QudaPrecision Precision() const
Definition: lattice_field.h:59
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:178
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:184
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:58
void updateInvertParam(QudaInvertParam &param, int offset=-1)
Definition: invert_quda.h:428
TimeProfile & profile
Definition: deflation.h:190
Deflation * defl
Definition: deflation.h:189
deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
ColorSpinorField * RV
Definition: deflation.h:185
DiracMatrix * m
Definition: deflation.h:183
DeflationParam * deflParam
Definition: deflation.h:187
TimeProfile & profile
Definition: multigrid.h:517
multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile)
std::vector< ColorSpinorField * > B
Definition: multigrid.h:512
DEVICEHOST void swap(Real &a, Real &b)
Definition: svd_quda.h:134
void pushVerbosity(QudaVerbosity verbosity)
Push a new verbosity onto the stack.
Definition: util_quda.cpp:83
void popOutputPrefix()
Pop the output prefix restoring the prior one on the stack.
Definition: util_quda.cpp:121
#define printfQuda(...)
Definition: util_quda.h:114
#define checkCudaError()
Definition: util_quda.h:158
void popVerbosity()
Pop the verbosity restoring the prior one on the stack.
Definition: util_quda.cpp:94
void pushOutputPrefix(const char *prefix)
Push a new output prefix onto the stack.
Definition: util_quda.cpp:105
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define warningQuda(...)
Definition: util_quda.h:132
void setVerbosity(QudaVerbosity verbosity)
Definition: util_quda.cpp:25
void setOutputPrefix(const char *prefix)
Definition: util_quda.cpp:69
#define errorQuda(...)
Definition: util_quda.h:120
void setOutputFile(FILE *outfile)
Definition: util_quda.cpp:75