// interface_quda.cpp
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <sys/time.h>
#include <complex.h>

#include <quda.h>
#include <quda_fortran.h>
#include <quda_internal.h>
#include <comm_quda.h>
#include <tune_quda.h>
#include <blas_quda.h>
#include <gauge_field.h>
#include <dirac_quda.h>
#include <dslash_quda.h>
#include <invert_quda.h>
#include <eigensolve_quda.h>
#include <color_spinor_field.h>
#include <clover_field.h>
#include <llfat_quda.h>
#include <unitarization_links.h>
#include <algorithm>
#include <staggered_oprod.h>
#include <ks_improved_force.h>
#include <ks_force_quda.h>
#include <random_quda.h>
#include <mpi_comm_handle.h>

#include <multigrid.h>

#include <deflation.h>

#ifdef NUMA_NVML
#include <numa_affinity.h>
#endif

#ifdef QUDA_NVML
#include <nvml.h>
#endif

#include <cuda.h>

#ifdef GPU_GAUGE_FORCE
#include <gauge_force_quda.h>
#endif
#include <gauge_update_quda.h>
#define MAX(a,b) ((a)>(b)? (a):(b))
#define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))

#define spinorSiteSize 24 // real numbers per spinor

#define MAX_GPU_NUM_PER_NODE 16

// define newQudaGaugeParam() and newQudaInvertParam()
#define INIT_PARAM
#include "check_params.h"
#undef INIT_PARAM

// define (static) checkGaugeParam() and checkInvertParam()
#define CHECK_PARAM
#include "check_params.h"
#undef CHECK_PARAM

// define printQudaGaugeParam() and printQudaInvertParam()
#define PRINT_PARAM
#include "check_params.h"
#undef PRINT_PARAM

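/* The triple inclusion above is an X-macro-style trick: check_params.h expands
   the same list of parameter fields three different ways depending on which of
   INIT_PARAM / CHECK_PARAM / PRINT_PARAM is defined when it is included. A
   minimal sketch of the pattern (hypothetical field list, not the actual
   contents of check_params.h):

     #if defined(INIT_PARAM)
       #define P(x, val) ret.x = val;          // default-initialize the field
     #elif defined(CHECK_PARAM)
       #define P(x, val) if (param->x == val) errorQuda(#x " undefined");
     #else // PRINT_PARAM
       #define P(x, val) printfQuda(#x " = %d\n", (int)param->x);
     #endif
     P(cuda_prec, QUDA_INVALID_PRECISION)
     #undef P
*/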
#include <gauge_tools.h>
#include <contract_quda.h>

#include <momentum.h>


#include <cuda_profiler_api.h>

using namespace quda;

static int R[4] = {0, 0, 0, 0};
// setting this to false prevents redundant halo exchange but isn't yet compatible with HISQ / ASQTAD kernels
static bool redundant_comms = false;

#include <blas_cublas.h>

// for the MAGMA library:
#include <blas_magma.h>

static bool InitMagma = false;
void openMagma() {

  if (!InitMagma) {
    OpenMagma();
    InitMagma = true;
  } else {
    printfQuda("\nMAGMA library was already initialized.\n");
  }

}

void closeMagma() {

  if (InitMagma) {
    CloseMagma();
    InitMagma = false;
  } else {
    printfQuda("\nMAGMA library was not initialized.\n");
  }

}

// Device-resident gauge, clover and momentum fields cached between interface
// calls (these definitions are inferred from their use throughout this file)
cudaGaugeField *gaugePrecise = nullptr;
cudaGaugeField *gaugeSloppy = nullptr;
cudaGaugeField *gaugePrecondition = nullptr;
cudaGaugeField *gaugeRefinement = nullptr;
cudaGaugeField *gaugeExtended = nullptr;

cudaGaugeField *gaugeFatPrecise = nullptr;
cudaGaugeField *gaugeFatSloppy = nullptr;
cudaGaugeField *gaugeFatPrecondition = nullptr;
cudaGaugeField *gaugeFatRefinement = nullptr;
cudaGaugeField *gaugeFatExtended = nullptr;

cudaGaugeField *gaugeLongPrecise = nullptr;
cudaGaugeField *gaugeLongSloppy = nullptr;
cudaGaugeField *gaugeLongPrecondition = nullptr;
cudaGaugeField *gaugeLongRefinement = nullptr;
cudaGaugeField *gaugeLongExtended = nullptr;

cudaGaugeField *gaugeSmeared = nullptr;

cudaCloverField *cloverPrecise = nullptr;
cudaCloverField *cloverSloppy = nullptr;
cudaCloverField *cloverPrecondition = nullptr;
cudaCloverField *cloverRefinement = nullptr;

cudaGaugeField *momResident = nullptr;
cudaGaugeField *extendedGaugeResident = nullptr;

// unscaled shift values used by the multi-shift solver (inferred from massRescale below)
static double unscaled_shifts[QUDA_MAX_MULTI_SHIFT];
std::vector<cudaColorSpinorField*> solutionResident;

// vector of spinors used for forecasting solutions in HMC
#define QUDA_MAX_CHRONO 12
// each entry is one p
std::vector< std::vector<ColorSpinorField*> > chronoResident(QUDA_MAX_CHRONO);

// Mapped memory buffer used to hold unitarization failures
static int *num_failures_h = nullptr;
static int *num_failures_d = nullptr;

cudaDeviceProp deviceProp;
cudaStream_t *streams;

static bool initialized = false;
static TimeProfile profileInit("initQuda");
static TimeProfile profileGauge("loadGaugeQuda");
static TimeProfile profileClover("loadCloverQuda");
static TimeProfile profileDslash("dslashQuda");
static TimeProfile profileInvert("invertQuda");
static TimeProfile profileMulti("invertMultiShiftQuda");
static TimeProfile profileEigensolve("eigensolveQuda");
static TimeProfile profileFatLink("computeKSLinkQuda");
static TimeProfile profileGaugeForce("computeGaugeForceQuda");
static TimeProfile profileGaugeUpdate("updateGaugeFieldQuda");
static TimeProfile profileExtendedGauge("createExtendedGaugeField");
static TimeProfile profileCloverForce("computeCloverForceQuda");
static TimeProfile profileStaggeredForce("computeStaggeredForceQuda");
static TimeProfile profileHISQForce("computeHISQForceQuda");
static TimeProfile profilePlaq("plaqQuda");
static TimeProfile profileWuppertal("wuppertalQuda");
static TimeProfile profileGauss("gaussQuda");
static TimeProfile profileQCharge("qChargeQuda");
static TimeProfile profileAPE("APEQuda");
static TimeProfile profileSTOUT("STOUTQuda");
static TimeProfile profileOvrImpSTOUT("OvrImpSTOUTQuda");
static TimeProfile profileProject("projectSU3Quda");
static TimeProfile profilePhase("staggeredPhaseQuda");
static TimeProfile profileContract("contractQuda");
static TimeProfile profileCovDev("covDevQuda");
static TimeProfile profileMomAction("momActionQuda");
static TimeProfile profileEnd("endQuda");

static TimeProfile GaugeFixFFTQuda("GaugeFixFFTQuda");
static TimeProfile GaugeFixOVRQuda("GaugeFixOVRQuda");

// profiler for the total lifetime of the library, between initQuda and endQuda
static TimeProfile profileInit2End("initQuda-endQuda", false);
static bool enable_profiler = false;
static bool do_not_profile_quda = false;

static void profilerStart(const char *f) {

  static std::vector<int> target_list;
  static bool enable = false;
  static bool init = false;
  if (!init) {
    char *profile_target_env = getenv("QUDA_ENABLE_TARGET_PROFILE"); // selectively enable profiling for given solves

    if (profile_target_env) {
      std::stringstream target_stream(profile_target_env);

      int target;
      while (target_stream >> target) {
        target_list.push_back(target);
        if (target_stream.peek() == ',') target_stream.ignore();
      }

      if (target_list.size() > 0) {
        std::sort(target_list.begin(), target_list.end());
        target_list.erase( unique( target_list.begin(), target_list.end() ), target_list.end() );
        warningQuda("Targeted profiling enabled for %lu functions\n", target_list.size());
        enable = true;
      }
    }

    char* donotprofile_env = getenv("QUDA_DO_NOT_PROFILE"); // disable profiling of QUDA parts
    if (donotprofile_env && (!(strcmp(donotprofile_env, "0") == 0))) {
      do_not_profile_quda = true;
      printfQuda("Disabling profiling in QUDA\n");
    }
    init = true;
  }

  static int target_count = 0;
  static unsigned int i = 0;
  if (do_not_profile_quda) {
    cudaProfilerStop();
    printfQuda("Stopping profiling in QUDA\n");
  } else {
    if (enable) {
      if (i < target_list.size() && target_count++ == target_list[i]) {
        enable_profiler = true;
        printfQuda("Starting profiling for %s\n", f);
        cudaProfilerStart();
        i++; // advance to next target
      }
    }
  }
}

static void profilerStop(const char *f) {
  if (do_not_profile_quda) {
    cudaProfilerStart();
  } else {

    if (enable_profiler) {
      printfQuda("Stopping profiling for %s\n", f);
      cudaProfilerStop();
      enable_profiler = false;
    }
  }
}
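
/* Usage sketch: the two environment variables read above are the whole
   control interface (shell syntax, illustrative target indices):

     QUDA_ENABLE_TARGET_PROFILE=0,3 ./app   # cudaProfilerStart() fires only for
                                            # the 1st and 4th profiled call,
                                            # since counting starts at 0
     QUDA_DO_NOT_PROFILE=1 ./app            # suppress profiling of QUDA entirely
*/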


namespace quda {
  void printLaunchTimer();
}

void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
{
  setVerbosity(verbosity);
  setOutputPrefix(prefix);
  setOutputFile(outfile);
}
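
/* Usage sketch (hypothetical host code): route all QUDA output to a log file
   with a custom prefix.

     FILE *log = fopen("quda.log", "w");            // assumed log destination
     setVerbosityQuda(QUDA_VERBOSE, "QUDA: ", log);
*/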


typedef struct {
  int ndim;
  int dims[QUDA_MAX_DIM]; // per-dimension grid extents (member inferred from its use below)
} LexMapData;

// Default rank mapping: lexicographical ordering with the last coordinate varying fastest
static int lex_rank_from_coords(const int *coords, void *fdata)
{
  auto *md = static_cast<LexMapData *>(fdata);

  int rank = coords[0];
  for (int i = 1; i < md->ndim; i++) {
    rank = md->dims[i] * rank + coords[i];
  }
  return rank;
}
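
/* Worked example: on a 2x2x2x2 process grid (dims = {2,2,2,2}), the node at
   coords = {1,0,1,0} gets rank ((1*2 + 0)*2 + 1)*2 + 0 = 10, i.e. the last
   coordinate varies fastest. */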

#ifdef QMP_COMMS

// For QMP, the existing logical topology (if declared) supplies the rank mapping
static int qmp_rank_from_coords(const int *coords, void *fdata)
{
  return QMP_get_node_number_from(coords);
}
#endif

// Provision for user control over the MPI comm handle.
// Assumes an MPI implementation of QMP.

#if defined(QMP_COMMS) || defined(MPI_COMMS)
MPI_Comm MPI_COMM_HANDLE;
static int user_set_comm_handle = 0;
#endif

void setMPICommHandleQuda(void *mycomm)
{
#if defined(QMP_COMMS) || defined(MPI_COMMS)
  MPI_COMM_HANDLE = *((MPI_Comm *)mycomm);
  user_set_comm_handle = 1;
#endif
}
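
/* Usage sketch: hand QUDA a sub-communicator instead of MPI_COMM_WORLD; this
   must happen before initCommsGridQuda()/initQuda(). Here quda_comm, color and
   key are hypothetical host-application variables:

     MPI_Comm quda_comm;
     MPI_Comm_split(MPI_COMM_WORLD, color, key, &quda_comm);
     setMPICommHandleQuda((void *)&quda_comm);
*/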

#ifdef QMP_COMMS
static void initQMPComms(void)
{
  // The default comm handle is taken from QMP.
  // WARNING: Assumes an MPI implementation of QMP.
  if (!user_set_comm_handle) {
    void *mycomm;
    QMP_get_mpi_comm(QMP_comm_get_default(), &mycomm);
    setMPICommHandleQuda(mycomm);
  }
}
#elif defined(MPI_COMMS)
static void initMPIComms(void)
{
  // The default comm handle is MPI_COMM_WORLD
  if (!user_set_comm_handle) {
    static MPI_Comm mycomm;
    MPI_Comm_dup(MPI_COMM_WORLD, &mycomm);
    setMPICommHandleQuda((void *)&mycomm);
  }
}
#endif
static bool comms_initialized = false;

void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
{
  if (comms_initialized) return;

#if QMP_COMMS
  initQMPComms();
#elif defined(MPI_COMMS)
  initMPIComms();
#endif

  if (nDim != 4) {
    errorQuda("Number of communication grid dimensions must be 4");
  }

  LexMapData map_data;
  if (!func) {

#if QMP_COMMS
    if (QMP_logical_topology_is_declared()) {
      if (QMP_get_logical_number_of_dimensions() != 4) {
        errorQuda("QMP logical topology must have 4 dimensions");
      }
      for (int i=0; i<nDim; i++) {
        int qdim = QMP_get_logical_dimensions()[i];
        if (qdim != dims[i]) {
          errorQuda("QMP logical dims[%d]=%d does not match dims[%d]=%d argument", i, qdim, i, dims[i]);
        }
      }
      fdata = nullptr;
      func = qmp_rank_from_coords;
    } else {
      warningQuda("QMP logical topology is undeclared; using default lexicographical ordering");
#endif

      map_data.ndim = nDim;
      for (int i=0; i<nDim; i++) {
        map_data.dims[i] = dims[i];
      }
      fdata = (void *) &map_data;
      func = lex_rank_from_coords;

#if QMP_COMMS
    }
#endif

  }
  comm_init(nDim, dims, func, fdata);
  comms_initialized = true;
}
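
/* Usage sketch: a 4-rank job partitioned along the t direction only
   (hypothetical grid choice), using the default rank mapping:

     const int grid[4] = {1, 1, 1, 4};
     initCommsGridQuda(4, grid, nullptr, nullptr); // must precede initQuda()
     initQuda(-1); // negative ordinal: device chosen from comm_gpuid(), see below
*/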

static void init_default_comms()
{
#if defined(QMP_COMMS)
  if (QMP_logical_topology_is_declared()) {
    int ndim = QMP_get_logical_number_of_dimensions();
    const int *dims = QMP_get_logical_dimensions();
    initCommsGridQuda(ndim, dims, nullptr, nullptr);
  } else {
    errorQuda("initQuda() called without prior call to initCommsGridQuda(),"
              " and QMP logical topology has not been declared");
  }
#elif defined(MPI_COMMS)
  errorQuda("When using MPI for communications, initCommsGridQuda() must be called before initQuda()");
#else // single-GPU
  const int dims[4] = {1, 1, 1, 1};
  initCommsGridQuda(4, dims, nullptr, nullptr);
#endif
}


#define STR_(x) #x
#define STR(x) STR_(x)
// version string assembled from the version macros (definition inferred from its use below)
static const std::string quda_version = STR(QUDA_VERSION_MAJOR) "." STR(QUDA_VERSION_MINOR) "." STR(QUDA_VERSION_SUBMINOR);
#undef STR
#undef STR_

extern char* gitversion;

/*
 * Set the device that QUDA uses.
 */
void initQudaDevice(int dev) {

  //static bool initialized = false;
  if (initialized) return;
  initialized = true;

  if (getVerbosity() >= QUDA_SUMMARIZE) {
#ifdef GITVERSION
    printfQuda("QUDA %s (git %s)\n", quda_version.c_str(), gitversion);
#else
    printfQuda("QUDA %s\n", quda_version.c_str());
#endif
  }

  int driver_version;
  cudaDriverGetVersion(&driver_version);
  printfQuda("CUDA Driver version = %d\n", driver_version);

  int runtime_version;
  cudaRuntimeGetVersion(&runtime_version);
  printfQuda("CUDA Runtime version = %d\n", runtime_version);

#ifdef QUDA_NVML
  nvmlReturn_t result = nvmlInit();
  if (NVML_SUCCESS != result) errorQuda("NVML Init failed with error %d", result);
  const int length = 80;
  char graphics_version[length];
  result = nvmlSystemGetDriverVersion(graphics_version, length);
  if (NVML_SUCCESS != result) errorQuda("nvmlSystemGetDriverVersion failed with error %d", result);
  printfQuda("Graphics driver version = %s\n", graphics_version);
  result = nvmlShutdown();
  if (NVML_SUCCESS != result) errorQuda("NVML Shutdown failed with error %d", result);
#endif

#if defined(MULTI_GPU) && (CUDA_VERSION == 4000)
  // check that CUDA_NIC_INTEROP is set to 1 in the environment
  // not needed for CUDA >= 4.1
  char* cni_str = getenv("CUDA_NIC_INTEROP");
  if (cni_str == nullptr) {
    errorQuda("Environment variable CUDA_NIC_INTEROP is not set");
  }
  int cni_int = atoi(cni_str);
  if (cni_int != 1) {
    errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1");
  }
#endif

  int deviceCount;
  cudaGetDeviceCount(&deviceCount);
  if (deviceCount == 0) {
    errorQuda("No CUDA devices found");
  }

  for (int i=0; i<deviceCount; i++) {
    cudaGetDeviceProperties(&deviceProp, i);
    checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
    if (getVerbosity() >= QUDA_SUMMARIZE) {
      printfQuda("Found device %d: %s\n", i, deviceProp.name);
    }
  }

#ifdef MULTI_GPU
  if (dev < 0) {
    if (!comms_initialized) {
      errorQuda("initDeviceQuda() called with a negative device ordinal, but comms have not been initialized");
    }
    dev = comm_gpuid();
  }
#else
  if (dev < 0 || dev >= 16) errorQuda("Invalid device number %d", dev);
#endif

  cudaGetDeviceProperties(&deviceProp, dev);
  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
  if (deviceProp.major < 1) {
    errorQuda("Device %d does not support CUDA", dev);
  }


  // Check GPU and QUDA build compatibility
  // 4 cases:
  // a) QUDA and GPU match: great
  // b) QUDA built for higher compute capability: error
  // c) QUDA built for lower major compute capability: warn if QUDA_ALLOW_JIT, else error
  // d) QUDA built for same major compute capability but lower minor: warn

  const int my_major = __COMPUTE_CAPABILITY__ / 100;
  const int my_minor = (__COMPUTE_CAPABILITY__ - my_major * 100) / 10;
  // b) QUDA was compiled for a higher compute capability
  if (deviceProp.major * 100 + deviceProp.minor * 10 < __COMPUTE_CAPABILITY__)
    errorQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. ** \n --- Please set the correct QUDA_GPU_ARCH when running cmake.\n", deviceProp.major, deviceProp.minor, my_major, my_minor);


  // c) QUDA was compiled for a lower compute capability
  if (deviceProp.major < my_major) {
    char *allow_jit_env = getenv("QUDA_ALLOW_JIT");
    if (allow_jit_env && strcmp(allow_jit_env, "1") == 0) {
      if (getVerbosity() > QUDA_SILENT) warningQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- Jitting the PTX since QUDA_ALLOW_JIT=1 was set. Note that this will take some time.\n", deviceProp.major, deviceProp.minor, my_major, my_minor);
    } else {
      errorQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n --- Please set the correct QUDA_GPU_ARCH when running cmake.\n If you want the PTX to be jitted for your current GPU arch please set the environment variable QUDA_ALLOW_JIT=1.", deviceProp.major, deviceProp.minor, my_major, my_minor);
    }
  }
  // d) QUDA built for the same major compute capability but a lower minor
  if (deviceProp.major == my_major && deviceProp.minor > my_minor) {
    warningQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- This might result in lower performance. Please consider adjusting QUDA_GPU_ARCH when running cmake.\n", deviceProp.major, deviceProp.minor, my_major, my_minor);
  }

  if (getVerbosity() >= QUDA_SUMMARIZE) {
    printfQuda("Using device %d: %s\n", dev, deviceProp.name);
  }
#ifndef USE_QDPJIT
  cudaSetDevice(dev);
  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
#endif


#if ((CUDA_VERSION >= 6000) && defined NUMA_NVML)
  char *enable_numa_env = getenv("QUDA_ENABLE_NUMA");
  if (enable_numa_env && strcmp(enable_numa_env, "0") == 0) {
    if (getVerbosity() > QUDA_SILENT) printfQuda("Disabling numa_affinity\n");
  } else {
    setNumaAffinityNVML(dev);
  }
#endif


  cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
  //cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
  // cudaGetDeviceProperties(&deviceProp, dev);

  { // determine if we will do CPU or GPU data reordering (default is GPU)
    char *reorder_str = getenv("QUDA_REORDER_LOCATION");

    if (!reorder_str || (strcmp(reorder_str,"CPU") && strcmp(reorder_str,"cpu")) ) {
      warningQuda("Data reordering done on GPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
    } else {
      warningQuda("Data reordering done on CPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
    }
  }
}

/*
 * Any persistent memory allocations that QUDA uses are done here.
 */
void initQudaMemory()
{
  if (!comms_initialized) init_default_comms();

  streams = new cudaStream_t[Nstream];

  int greatestPriority;
  int leastPriority;
  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
  for (int i=0; i<Nstream-1; i++) {
    cudaStreamCreateWithPriority(&streams[i], cudaStreamDefault, greatestPriority);
  }
  cudaStreamCreateWithPriority(&streams[Nstream-1], cudaStreamDefault, leastPriority);

  checkCudaError();

  blas::init();
  cublas::init();

  // initialize the memory pool allocators
  pool::init();

  num_failures_h = static_cast<int*>(mapped_malloc(sizeof(int)));
  cudaHostGetDevicePointer(&num_failures_d, num_failures_h, 0);

  loadTuneCache();

  for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
}
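
/* num_failures_h / num_failures_d above follow the standard zero-copy pattern:
   the host buffer is page-locked and mapped, and num_failures_d is the
   device-side alias of the same physical memory, so unitarization kernels can
   record failures without an explicit copy back. A minimal sketch of the same
   idiom with plain CUDA calls (illustrative only, not QUDA's allocator):

     int *h, *d;
     cudaHostAlloc((void**)&h, sizeof(int), cudaHostAllocMapped);
     cudaHostGetDevicePointer(&d, h, 0);  // d aliases h
     *h = 0;
     // kernel<<<...>>>(d);               // atomicAdd(d, 1) on failure
     // cudaDeviceSynchronize();          // then read *h directly on the host
*/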

void updateR()
{
  for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
}

void initQuda(int dev)
{
  // initialize communications topology, if not already done explicitly via initCommsGridQuda()
  if (!comms_initialized) init_default_comms();

  // set the device that QUDA uses
  initQudaDevice(dev);

  // set up the persistent memory allocations that QUDA uses (blas, streams, etc.)
  initQudaMemory();
}

// helper for creating extended gauge fields
// (signature inferred from the three call sites in this file)
cudaGaugeField *createExtendedGauge(cudaGaugeField &in, const int *R, TimeProfile &profile,
                                    bool redundant_comms = false,
                                    QudaReconstructType recon = QUDA_RECONSTRUCT_INVALID)
{
  profile.TPSTART(QUDA_PROFILE_INIT);
  int y[4];
  for (int dir=0; dir<4; ++dir) y[dir] = in.X()[dir] + 2*R[dir];
  int pad = 0;

  GaugeFieldParam gParamEx(y, in.Precision(), recon != QUDA_RECONSTRUCT_INVALID ? recon : in.Reconstruct(), pad,
                           in.Geometry(), QUDA_GHOST_EXCHANGE_EXTENDED);
  gParamEx.create = QUDA_ZERO_FIELD_CREATE;
  gParamEx.order = in.Order();
  gParamEx.siteSubset = QUDA_FULL_SITE_SUBSET;
  gParamEx.t_boundary = in.TBoundary();
  gParamEx.nFace = 1;
  gParamEx.tadpole = in.Tadpole();
  gParamEx.anisotropy = in.Anisotropy();
  for (int d=0; d<4; d++) gParamEx.r[d] = R[d];

  auto *out = new cudaGaugeField(gParamEx);

  // copy input field into the extended device gauge field
  copyExtendedGauge(*out, in, QUDA_CUDA_FIELD_LOCATION);

  profile.TPSTOP(QUDA_PROFILE_INIT);

  // now fill up the halos
  out->exchangeExtendedGhost(R, profile, redundant_comms);

  return out;
}
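
/* Worked example: with R = {2,2,2,2} and a local volume of 16^4, the extended
   field has extents y = {20,20,20,20}; the extra two-site shell on each
   boundary holds the halo filled by exchangeExtendedGhost(), so kernels can
   reach up to R sites off-node without further communication. */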

// This is a flag used to signal when we have downloaded a new gauge
// field. Set by loadGaugeQuda and consumed by loadCloverQuda as one
// possible flag to indicate we need to recompute the clover field
static bool invalidate_clover = true;

void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
{
  if (!initialized) errorQuda("QUDA not initialized");

  checkGaugeParam(param);

  // Set the specific input parameters and create the cpu gauge field
  GaugeFieldParam gauge_param(h_gauge, *param);

  // if we are using half precision then we need to compute the fat
  // link maximum while still on the cpu
  // FIXME get a kernel for this
  if (param->type == QUDA_ASQTAD_FAT_LINKS)
    gauge_param.compute_fat_link_max = true;

  if (gauge_param.order <= 4) gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
  GaugeField *in = (param->location == QUDA_CPU_FIELD_LOCATION) ?
    static_cast<GaugeField*>(new cpuGaugeField(gauge_param)) :
    static_cast<GaugeField*>(new cudaGaugeField(gauge_param));

  if (in->Order() == QUDA_BQCD_GAUGE_ORDER) {
    static size_t checksum = SIZE_MAX;
    size_t in_checksum = in->checksum(true);
    if (in_checksum == checksum) {
      if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached gauge field %lu\n", checksum);
      delete in;
      invalidate_clover = false;
      return;
    }
    checksum = in_checksum;
    invalidate_clover = true;
  }

  // free any current gauge field before new allocations to reduce memory overhead
  switch (param->type) {
  case QUDA_WILSON_LINKS:
    if (gaugeRefinement != gaugeSloppy && gaugeRefinement) delete gaugeRefinement;
    if (gaugeSloppy != gaugePrecondition && gaugePrecondition) delete gaugePrecondition;
    if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy;
    if (gaugePrecise && !param->use_resident_gauge) delete gaugePrecise;
    break;
  case QUDA_ASQTAD_FAT_LINKS:
    if (gaugeFatRefinement != gaugeFatSloppy && gaugeFatRefinement) delete gaugeFatRefinement;
    if (gaugeFatSloppy != gaugeFatPrecondition && gaugeFatPrecondition) delete gaugeFatPrecondition;
    if (gaugeFatPrecise != gaugeFatSloppy && gaugeFatSloppy) delete gaugeFatSloppy;
    if (gaugeFatPrecise && !param->use_resident_gauge) delete gaugeFatPrecise;
    break;
  case QUDA_ASQTAD_LONG_LINKS:
    if (gaugeLongRefinement != gaugeLongSloppy && gaugeLongRefinement) delete gaugeLongRefinement;
    if (gaugeLongSloppy != gaugeLongPrecondition && gaugeLongPrecondition) delete gaugeLongPrecondition;
    if (gaugeLongPrecise != gaugeLongSloppy && gaugeLongSloppy) delete gaugeLongSloppy;
    if (gaugeLongPrecise) delete gaugeLongPrecise;
    break;
  case QUDA_SMEARED_LINKS:
    if (gaugeSmeared) delete gaugeSmeared;
    break;
  default:
    errorQuda("Invalid gauge type %d", param->type);
  }

  // if not preserving then copy the gauge field passed in
  cudaGaugeField *precise = nullptr;

  // switch the parameters for creating the mirror precise cuda gauge field
  gauge_param.create = QUDA_NULL_FIELD_CREATE;
  gauge_param.reconstruct = param->reconstruct;
  gauge_param.setPrecision(param->cuda_prec, true);
  gauge_param.pad = param->ga_pad;

  precise = new cudaGaugeField(gauge_param);

  if (param->use_resident_gauge) {
    if (gaugePrecise == nullptr) errorQuda("No resident gauge field");
    // copy rather than point at to ensure that the padded region is filled in
    precise->copy(*gaugePrecise);
    precise->exchangeGhost();
    delete gaugePrecise;
    gaugePrecise = nullptr;
  } else {
    precise->copy(*in);
  }

  // for gaugeSmeared we are interested only in the precise version
  if (param->type == QUDA_SMEARED_LINKS) {
    gaugeSmeared = createExtendedGauge(*precise, R, profileGauge);

    delete precise;
    delete in;

    return;
  }

  // creating sloppy fields isn't really compute, but it is work done on the gpu

  // switch the parameters for creating the mirror sloppy cuda gauge field
  gauge_param.reconstruct = param->reconstruct_sloppy;
  gauge_param.setPrecision(param->cuda_prec_sloppy, true);
  cudaGaugeField *sloppy = nullptr;
  if (param->cuda_prec != param->cuda_prec_sloppy ||
      param->reconstruct != param->reconstruct_sloppy) {
    sloppy = new cudaGaugeField(gauge_param);
    sloppy->copy(*precise);
  } else {
    sloppy = precise;
  }

  // switch the parameters for creating the mirror preconditioner cuda gauge field
  gauge_param.reconstruct = param->reconstruct_precondition;
  gauge_param.setPrecision(param->cuda_prec_precondition, true);
  cudaGaugeField *precondition = nullptr;
  if (param->cuda_prec_sloppy != param->cuda_prec_precondition ||
      param->reconstruct_sloppy != param->reconstruct_precondition) {
    precondition = new cudaGaugeField(gauge_param);
    precondition->copy(*sloppy);
  } else {
    precondition = sloppy;
  }

  // switch the parameters for creating the refinement cuda gauge field
  gauge_param.reconstruct = param->reconstruct_refinement_sloppy;
  gauge_param.setPrecision(param->cuda_prec_refinement_sloppy, true);
  cudaGaugeField *refinement = nullptr;
  if (param->cuda_prec_sloppy != param->cuda_prec_refinement_sloppy
      || param->reconstruct_sloppy != param->reconstruct_refinement_sloppy) {
    refinement = new cudaGaugeField(gauge_param);
    refinement->copy(*sloppy);
  } else {
    refinement = sloppy;
  }

  // create an extended preconditioning field
  cudaGaugeField* extended = nullptr;
  if (param->overlap) {
    int R[4]; // domain-overlap widths in different directions
    for (int i=0; i<4; ++i) R[i] = param->overlap*commDimPartitioned(i);
    extended = createExtendedGauge(*precondition, R, profileGauge);
  }

  switch (param->type) {
  case QUDA_WILSON_LINKS:
    gaugePrecise = precise;
    gaugeSloppy = sloppy;
    gaugePrecondition = precondition;
    gaugeRefinement = refinement;

    if (param->overlap) gaugeExtended = extended;
    break;
  case QUDA_ASQTAD_FAT_LINKS:
    gaugeFatPrecise = precise;
    gaugeFatSloppy = sloppy;
    gaugeFatPrecondition = precondition;
    gaugeFatRefinement = refinement;

    if (param->overlap) {
      if (gaugeFatExtended) errorQuda("Extended gauge fat field already allocated");
      gaugeFatExtended = extended;
    }
    break;
  case QUDA_ASQTAD_LONG_LINKS:
    gaugeLongPrecise = precise;
    gaugeLongSloppy = sloppy;
    gaugeLongPrecondition = precondition;
    gaugeLongRefinement = refinement;

    if (param->overlap) {
      if (gaugeLongExtended) errorQuda("Extended gauge long field already allocated");
      gaugeLongExtended = extended;
    }
    break;
  default:
    errorQuda("Invalid gauge type %d", param->type);
  }

  delete in;

  if (extendedGaugeResident) {
    // update the resident extended gauge field if needed
    const int *R_ = extendedGaugeResident->R();
    const int R[] = { R_[0], R_[1], R_[2], R_[3] };
    QudaReconstructType recon = extendedGaugeResident->Reconstruct();
    delete extendedGaugeResident;

    extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileGauge, false, recon);
  }
}
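
/* Usage sketch: the precise/sloppy/precondition/refinement cascade above is
   driven entirely by the host-side QudaGaugeParam, e.g. for a typical
   mixed-precision solve (illustrative precision choices):

     QudaGaugeParam gp = newQudaGaugeParam();
     gp.cuda_prec                   = QUDA_DOUBLE_PRECISION;
     gp.cuda_prec_sloppy            = QUDA_SINGLE_PRECISION;
     gp.cuda_prec_precondition      = QUDA_HALF_PRECISION;
     gp.cuda_prec_refinement_sloppy = QUDA_SINGLE_PRECISION;
     // equal precision + reconstruct makes the copies alias, as in the code above
     loadGaugeQuda(links, &gp);     // 'links' is the host gauge field
*/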

void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
{
  if (param->location != QUDA_CPU_FIELD_LOCATION)
    errorQuda("Non-cpu output location not yet supported");

  if (!initialized) errorQuda("QUDA not initialized");
  checkGaugeParam(param);

  // Set the specific cpu parameters and create the cpu gauge field
  GaugeFieldParam gauge_param(h_gauge, *param);
  cpuGaugeField cpuGauge(gauge_param);
  cudaGaugeField *cudaGauge = nullptr;
  switch (param->type) {
  case QUDA_WILSON_LINKS:
    cudaGauge = gaugePrecise;
    break;
  case QUDA_ASQTAD_FAT_LINKS:
    cudaGauge = gaugeFatPrecise;
    break;
  case QUDA_ASQTAD_LONG_LINKS:
    cudaGauge = gaugeLongPrecise;
    break;
  case QUDA_SMEARED_LINKS:
    gauge_param.create = QUDA_NULL_FIELD_CREATE;
    gauge_param.reconstruct = param->reconstruct;
    gauge_param.setPrecision(param->cuda_prec, true);
    gauge_param.pad = param->ga_pad;
    cudaGauge = new cudaGaugeField(gauge_param);
    copyExtendedGauge(*cudaGauge, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION);
    break;
  default:
    errorQuda("Invalid gauge type");
  }

  cudaGauge->saveCPUField(cpuGauge);

  if (param->type == QUDA_SMEARED_LINKS) { delete cudaGauge; }
}

void freeSloppyCloverQuda();

void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
{
  checkCloverParam(inv_param);
  bool device_calc = false; // calculate clover and inverse on the device?

  pushVerbosity(inv_param->verbosity);

  if (!initialized) errorQuda("QUDA not initialized");

  if ( (!h_clover && !h_clovinv) || inv_param->compute_clover ) {
    device_calc = true;
    if (inv_param->clover_coeff == 0.0) errorQuda("called with neither clover term nor inverse and clover coefficient not set");
    if (gaugePrecise->Anisotropy() != 1.0) errorQuda("cannot compute anisotropic clover field");
  }

  if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) errorQuda("Half precision not supported on CPU");
  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded before clover");
  if ((inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) && (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH)) {
    errorQuda("Wrong dslash_type %d in loadCloverQuda()", inv_param->dslash_type);
  }

  // determines whether the operator is preconditioned when calling invertQuda()
  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE ||
                   inv_param->solve_type == QUDA_NORMOP_PC_SOLVE ||
                   inv_param->solve_type == QUDA_NORMERR_PC_SOLVE );

  // determines whether the operator is preconditioned when calling MatQuda() or MatDagMatQuda()
  bool pc_solution = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
                      inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);

  bool asymmetric = (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
                     inv_param->matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC);

  // the uninverted clover term is required when applying the unpreconditioned operator,
  // but note that dslashQuda() is always preconditioned
  if (!h_clover && !pc_solve && !pc_solution) {
    //warningQuda("Uninverted clover term not loaded");
  }

  // the uninverted clover term is also required for "asymmetric" preconditioning
  if (!h_clover && pc_solve && pc_solution && asymmetric && !device_calc) {
    warningQuda("Uninverted clover term not loaded");
  }

  bool twisted = inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH ? true : false;
#ifdef DYNAMIC_CLOVER
  bool dynamic_clover = true;
#else
  bool dynamic_clover = false;
#endif

  CloverFieldParam clover_param;
  clover_param.nDim = 4;
  clover_param.csw = inv_param->clover_coeff;
  clover_param.twisted = twisted;
  clover_param.mu2 = twisted ? 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu : 0.0;
  clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;
  for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i];
  clover_param.pad = inv_param->cl_pad;
  clover_param.create = QUDA_NULL_FIELD_CREATE;
  clover_param.norm = nullptr;
  clover_param.invNorm = nullptr;
  clover_param.setPrecision(inv_param->clover_cuda_prec);
  clover_param.direct = h_clover || device_calc ? true : false;
  clover_param.inverse = (h_clovinv || pc_solve) && !dynamic_clover ? true : false;
  CloverField *in = nullptr;

  // FIXME do we need to make this more robust to changing other metadata (compare cloverPrecise against clover_param)
  bool clover_update = false;
  double csw_old = cloverPrecise ? cloverPrecise->Csw() : 0.0;
  if (!cloverPrecise || invalidate_clover || inv_param->clover_coeff != csw_old) clover_update = true;

  // compute or download the clover field only if the gauge field has been updated or the clover field doesn't exist
  if (clover_update) {
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating new clover field\n");
    if (cloverPrecise) delete cloverPrecise;

    cloverPrecise = new cudaCloverField(clover_param);

    if (!device_calc || inv_param->return_clover || inv_param->return_clover_inverse) {
      // create a param for the cpu clover field
      CloverFieldParam inParam(clover_param);
      inParam.setPrecision(inv_param->clover_cpu_prec);
      inParam.order = inv_param->clover_order;
      inParam.direct = h_clover ? true : false;
      inParam.inverse = h_clovinv ? true : false;
      inParam.clover = h_clover;
      inParam.cloverInv = h_clovinv;
      in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ?
        static_cast<CloverField*>(new cpuCloverField(inParam)) :
        static_cast<CloverField*>(new cudaCloverField(inParam));
    }

    if (!device_calc) {
      bool inverse = (h_clovinv && !inv_param->compute_clover_inverse && !dynamic_clover);
      cloverPrecise->copy(*in, inverse);
    } else {
      createCloverQuda(inv_param);
    }

    // the inverted clover term is required when applying the preconditioned operator
    if ((!h_clovinv || inv_param->compute_clover_inverse) && pc_solve) {
      if (!dynamic_clover) {
        cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog);
        if (inv_param->compute_clover_trlog) {
          inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
          inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
        }
      }
    }
  } else {
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached clover field\n");
  }

  clover_param.direct = true;
  clover_param.inverse = dynamic_clover ? false : true;

  cloverPrecise->setRho(inv_param->clover_rho);

  // precision cascade (array inferred from the prec[0..2] accesses in loadSloppyCloverQuda)
  QudaPrecision prec[] = {inv_param->clover_cuda_prec_sloppy, inv_param->clover_cuda_prec_precondition,
                          inv_param->clover_cuda_prec_refinement_sloppy};
  loadSloppyCloverQuda(prec);

  // if requested, copy back the clover / inverse field
  if (inv_param->return_clover || inv_param->return_clover_inverse) {
    if (!h_clover && !h_clovinv) errorQuda("Requested clover field return but no clover host pointers set");

    // copy the inverted clover term into host application order on the device
    clover_param.setPrecision(inv_param->clover_cpu_prec);
    clover_param.direct = (h_clover && inv_param->return_clover);
    clover_param.inverse = (h_clovinv && inv_param->return_clover_inverse);

    // this isn't really "epilogue" but this label suffices
    cudaCloverField *hack = nullptr;
    if (!dynamic_clover) {
      clover_param.order = inv_param->clover_order;
      hack = new cudaCloverField(clover_param);
      hack->copy(*cloverPrecise); // FIXME this can lead to redundant copies if we're not copying back direct + inverse
    } else {
      auto *hackOfTheHack = new cudaCloverField(clover_param); // hack of the hack
      hackOfTheHack->copy(*cloverPrecise, false);
      cloverInvert(*hackOfTheHack, inv_param->compute_clover_trlog);
      if (inv_param->compute_clover_trlog) {
        inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
        inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
      }
      clover_param.order = inv_param->clover_order;
      hack = new cudaCloverField(clover_param);
      hack->copy(*hackOfTheHack); // FIXME this can lead to redundant copies if we're not copying back direct + inverse
      delete hackOfTheHack;
    }

    // copy the field into the host application's clover field
    if (inv_param->return_clover) {
      qudaMemcpy((char*)(in->V(false)), (char*)(hack->V(false)), in->Bytes(), cudaMemcpyDeviceToHost);
    }
    if (inv_param->return_clover_inverse) {
      qudaMemcpy((char*)(in->V(true)), (char*)(hack->V(true)), in->Bytes(), cudaMemcpyDeviceToHost);
    }

    delete hack;
    checkCudaError();
  }

  if (in) delete in; // delete object referencing input field

  popVerbosity();
}

void freeSloppyCloverQuda();

void loadSloppyCloverQuda(QudaPrecision prec[])
{
  freeSloppyCloverQuda();

  if (cloverPrecise) {
    // create the mirror sloppy clover field
    CloverFieldParam clover_param(*cloverPrecise);

    clover_param.setPrecision(prec[0]);

    if (cloverPrecise->V(false) != cloverPrecise->V(true)) {
      clover_param.direct = true;
      clover_param.inverse = true;
    } else {
      clover_param.direct = false;
      clover_param.inverse = true;
    }

    if (clover_param.Precision() != cloverPrecise->Precision()) {
      cloverSloppy = new cudaCloverField(clover_param);
      cloverSloppy->copy(*cloverPrecise, clover_param.inverse);
    } else {
      cloverSloppy = cloverPrecise;
    }

    // switch the parameters for creating the mirror preconditioner clover field
    clover_param.setPrecision(prec[1]);

    // create the mirror preconditioner clover field
    if (clover_param.Precision() != cloverSloppy->Precision()) {
      cloverPrecondition = new cudaCloverField(clover_param);
      cloverPrecondition->copy(*cloverSloppy, clover_param.inverse);
    } else {
      cloverPrecondition = cloverSloppy;
    }

    // switch the parameters for creating the mirror refinement clover field
    clover_param.setPrecision(prec[2]);

    // create the mirror refinement clover field
    if (clover_param.Precision() != cloverSloppy->Precision()) {
      cloverRefinement = new cudaCloverField(clover_param);
      cloverRefinement->copy(*cloverSloppy, clover_param.inverse);
    } else {
      cloverRefinement = cloverSloppy;
    }
  }

}

// just free the sloppy fields used in mixed-precision solvers
void freeSloppyGaugeQuda(void)
{
  if (!initialized) errorQuda("QUDA not initialized");
  if (gaugeSloppy != gaugeRefinement && gaugeRefinement) delete gaugeRefinement;
  if (gaugeSloppy != gaugePrecondition && gaugePrecondition) delete gaugePrecondition;
  if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy;

  gaugeRefinement = nullptr;
  gaugePrecondition = nullptr;
  gaugeSloppy = nullptr;

  if (gaugeLongSloppy != gaugeLongRefinement && gaugeLongRefinement) delete gaugeLongRefinement;
  if (gaugeLongSloppy != gaugeLongPrecondition && gaugeLongPrecondition) delete gaugeLongPrecondition;
  if (gaugeLongPrecise != gaugeLongSloppy && gaugeLongSloppy) delete gaugeLongSloppy;

  gaugeLongRefinement = nullptr;
  gaugeLongPrecondition = nullptr;
  gaugeLongSloppy = nullptr;

  if (gaugeFatSloppy != gaugeFatRefinement && gaugeFatRefinement) delete gaugeFatRefinement;
  if (gaugeFatSloppy != gaugeFatPrecondition && gaugeFatPrecondition) delete gaugeFatPrecondition;
  if (gaugeFatPrecise != gaugeFatSloppy && gaugeFatSloppy) delete gaugeFatSloppy;

  gaugeFatRefinement = nullptr;
  gaugeFatPrecondition = nullptr;
  gaugeFatSloppy = nullptr;
}

void freeGaugeQuda(void)
{
  if (!initialized) errorQuda("QUDA not initialized");

  freeSloppyGaugeQuda();

  if (gaugePrecise) delete gaugePrecise;
  if (gaugeExtended) delete gaugeExtended;

  gaugePrecise = nullptr;
  gaugeExtended = nullptr;

  if (gaugeLongPrecise) delete gaugeLongPrecise;
  if (gaugeLongExtended) delete gaugeLongExtended;

  gaugeLongPrecise = nullptr;
  gaugeLongExtended = nullptr;

  if (gaugeFatPrecise) delete gaugeFatPrecise;

  gaugeFatPrecise = nullptr;
  gaugeFatExtended = nullptr;

  if (gaugeSmeared) delete gaugeSmeared;

  gaugeSmeared = nullptr;
  // Need to merge extendedGaugeResident and gaugeFatPrecise/gaugePrecise
  if (extendedGaugeResident) {
    delete extendedGaugeResident;
    extendedGaugeResident = nullptr;
  }
}

void loadSloppyGaugeQuda(QudaPrecision *prec, QudaReconstructType *recon)
{
  // first do SU3 links (if they exist)
  if (gaugePrecise) {
    GaugeFieldParam gauge_param(*gaugePrecise);
    // switch the parameters for creating the mirror sloppy cuda gauge field

    gauge_param.reconstruct = recon[0];
    gauge_param.setPrecision(prec[0], true);

    if (gaugeSloppy) errorQuda("gaugeSloppy already exists");

    if (gauge_param.Precision() != gaugePrecise->Precision() || gauge_param.reconstruct != gaugePrecise->Reconstruct()) {
      gaugeSloppy = new cudaGaugeField(gauge_param);
      gaugeSloppy->copy(*gaugePrecise);
    } else {
      gaugeSloppy = gaugePrecise;
    }

    // switch the parameters for creating the mirror preconditioner cuda gauge field
    gauge_param.reconstruct = recon[1];
    gauge_param.setPrecision(prec[1], true);

    if (gaugePrecondition) errorQuda("gaugePrecondition already exists");

    if (gauge_param.Precision() != gaugeSloppy->Precision() || gauge_param.reconstruct != gaugeSloppy->Reconstruct()) {
      gaugePrecondition = new cudaGaugeField(gauge_param);
      gaugePrecondition->copy(*gaugeSloppy);
    } else {
      gaugePrecondition = gaugeSloppy;
    }

    // switch the parameters for creating the mirror refinement cuda gauge field
    gauge_param.reconstruct = recon[2];
    gauge_param.setPrecision(prec[2], true);

    if (gaugeRefinement) errorQuda("gaugeRefinement already exists");

    if (gauge_param.Precision() != gaugeSloppy->Precision() || gauge_param.reconstruct != gaugeSloppy->Reconstruct()) {
      gaugeRefinement = new cudaGaugeField(gauge_param);
      gaugeRefinement->copy(*gaugeSloppy);
    } else {
      gaugeRefinement = gaugeSloppy;
    }
  }

  // fat links (if they exist)
  if (gaugeFatPrecise) {
    GaugeFieldParam gauge_param(*gaugeFatPrecise);

    gauge_param.setPrecision(prec[0], true);

    if (gaugeFatSloppy) errorQuda("gaugeFatSloppy already exists");

    if (gauge_param.Precision() != gaugeFatPrecise->Precision()
        || gauge_param.reconstruct != gaugeFatPrecise->Reconstruct()) {
      gaugeFatSloppy = new cudaGaugeField(gauge_param);
      gaugeFatSloppy->copy(*gaugeFatPrecise);
    } else {
      gaugeFatSloppy = gaugeFatPrecise;
    }

    // switch the parameters for creating the mirror preconditioner cuda gauge field
    gauge_param.setPrecision(prec[1], true);

    if (gaugeFatPrecondition) errorQuda("gaugeFatPrecondition already exists");

    if (gauge_param.Precision() != gaugeFatSloppy->Precision()
        || gauge_param.reconstruct != gaugeFatSloppy->Reconstruct()) {
      gaugeFatPrecondition = new cudaGaugeField(gauge_param);
      gaugeFatPrecondition->copy(*gaugeFatSloppy);
    } else {
      gaugeFatPrecondition = gaugeFatSloppy;
    }

    // switch the parameters for creating the mirror refinement cuda gauge field
    gauge_param.setPrecision(prec[2], true);

    if (gaugeFatRefinement) errorQuda("gaugeFatRefinement already exists");

    if (gauge_param.Precision() != gaugeFatSloppy->Precision()
        || gauge_param.reconstruct != gaugeFatSloppy->Reconstruct()) {
      gaugeFatRefinement = new cudaGaugeField(gauge_param);
      gaugeFatRefinement->copy(*gaugeFatSloppy);
    } else {
      gaugeFatRefinement = gaugeFatSloppy;
    }
  }

  // long links (if they exist)
  if (gaugeLongPrecise) {
    GaugeFieldParam gauge_param(*gaugeLongPrecise);

    gauge_param.reconstruct = recon[0];
    gauge_param.setPrecision(prec[0], true);

    if (gaugeLongSloppy) errorQuda("gaugeLongSloppy already exists");

    if (gauge_param.Precision() != gaugeLongPrecise->Precision()
        || gauge_param.reconstruct != gaugeLongPrecise->Reconstruct()) {
      gaugeLongSloppy = new cudaGaugeField(gauge_param);
      gaugeLongSloppy->copy(*gaugeLongPrecise);
    } else {
      gaugeLongSloppy = gaugeLongPrecise;
    }

    // switch the parameters for creating the mirror preconditioner cuda gauge field
    gauge_param.reconstruct = recon[1];
    gauge_param.setPrecision(prec[1], true);

    if (gaugeLongPrecondition) errorQuda("gaugeLongPrecondition already exists");

    if (gauge_param.Precision() != gaugeLongSloppy->Precision()
        || gauge_param.reconstruct != gaugeLongSloppy->Reconstruct()) {
      gaugeLongPrecondition = new cudaGaugeField(gauge_param);
      gaugeLongPrecondition->copy(*gaugeLongSloppy);
    } else {
      gaugeLongPrecondition = gaugeLongSloppy;
    }

    // switch the parameters for creating the mirror refinement cuda gauge field
    gauge_param.reconstruct = recon[2];
    gauge_param.setPrecision(prec[2], true);

    if (gaugeLongRefinement) errorQuda("gaugeLongRefinement already exists");

    if (gauge_param.Precision() != gaugeLongSloppy->Precision()
        || gauge_param.reconstruct != gaugeLongSloppy->Reconstruct()) {
      gaugeLongRefinement = new cudaGaugeField(gauge_param);
      gaugeLongRefinement->copy(*gaugeLongSloppy);
    } else {
      gaugeLongRefinement = gaugeLongSloppy;
    }
  }
}

void freeSloppyCloverQuda(void)
{
  if (!initialized) errorQuda("QUDA not initialized");
  if (cloverRefinement != cloverSloppy && cloverRefinement) delete cloverRefinement;
  if (cloverPrecondition != cloverSloppy && cloverPrecondition) delete cloverPrecondition;
  if (cloverSloppy != cloverPrecise && cloverSloppy) delete cloverSloppy;

  cloverRefinement = nullptr;
  cloverPrecondition = nullptr;
  cloverSloppy = nullptr;
}

void freeCloverQuda(void)
{
  if (!initialized) errorQuda("QUDA not initialized");
  freeSloppyCloverQuda();
  if (cloverPrecise) delete cloverPrecise;
  cloverPrecise = nullptr;
}

void flushChronoQuda(int i)
{
  if (i >= QUDA_MAX_CHRONO)
    errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO);

  auto &basis = chronoResident[i];

  for (auto v : basis) {
    if (v) delete v;
  }
  basis.clear();
}
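
/* chronoResident[i] holds the basis of past solutions used for chronological
   forecasting of initial guesses in HMC: each index i is one forecasting
   channel (typically one per operator/monomial being solved, an assumption
   about usage), and flushChronoQuda(i) must be called whenever that operator
   changes so stale basis vectors are not reused. */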

void endQuda(void)
{
  profileEnd.TPSTART(QUDA_PROFILE_TOTAL);

  if (!initialized) return;

  freeGaugeQuda();
  freeCloverQuda();

  for (int i=0; i<QUDA_MAX_CHRONO; i++) flushChronoQuda(i);

  for (auto v : solutionResident) if (v) delete v;
  solutionResident.clear();

  if (momResident) delete momResident;

  cublas::destroy();
  blas::end();

  host_free(num_failures_h);
  num_failures_h = nullptr;
  num_failures_d = nullptr;

  if (streams) {
    for (int i=0; i<Nstream; i++) cudaStreamDestroy(streams[i]);
    delete []streams;
    streams = nullptr;
  }

  saveTuneCache();
  saveProfile();

  // flush any outstanding force monitoring (if enabled)
  flushForceMonitor();

  initialized = false;

  comm_finalize();
  comms_initialized = false;

  // print out the profile information gathered over the lifetime of the library
  if (getVerbosity() >= QUDA_SUMMARIZE) {
    profileInit.Print();
    profileGauge.Print();
    profileClover.Print();
    profileDslash.Print();
    profileInvert.Print();
    profileMulti.Print();
    profileCovDev.Print();
    profilePlaq.Print();
    profileAPE.Print();
    profileSTOUT.Print();
    profilePhase.Print();
    profileEnd.Print();

    printLaunchTimer();
    printAPIProfile();

    printfQuda("\n");
    printfQuda("\n");
  }

  assertAllMemFree();

  char *device_reset_env = getenv("QUDA_DEVICE_RESET");
  if (device_reset_env && strcmp(device_reset_env,"1") == 0) {
    // end this CUDA context
    cudaDeviceReset();
  }

}
1558 
1559 
1560 namespace quda {
1561 
1562  void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
1563  {
1564  double kappa = inv_param->kappa;
1565  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
1566  kappa *= gaugePrecise->Anisotropy();
1567  }
1568 
1569  switch (inv_param->dslash_type) {
1570  case QUDA_WILSON_DSLASH:
1571  diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC;
1572  break;
1574  diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC;
1575  break;
1578  diracParam.Ls = inv_param->Ls;
1579  break;
1582  diracParam.Ls = inv_param->Ls;
1583  break;
1585  if (inv_param->Ls > QUDA_MAX_DWF_LS)
1586  errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS);
1588  diracParam.Ls = inv_param->Ls;
1589  if (sizeof(Complex) != sizeof(double _Complex)) {
1590  errorQuda("Irreconcilable difference between interface and internal complex number conventions");
1591  }
1592  memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls);
1593  memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls);
1594  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1595  printfQuda("Printing b_5 and c_5 values\n");
1596  for (int i = 0; i < diracParam.Ls; i++) {
1597  printfQuda("fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n", i, diracParam.b_5[i].real(),
1598  diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(), diracParam.c_5[i].imag());
1599  // printfQuda("fromQUDA inv_param: b5[%d] = %f %f c5[%d] = %f %f\n", i, inv_param->b_5[i], i,
1600  // inv_param->c_5[i] ); printfQuda("fromQUDA creal: b5[%d] = %f %f c5[%d] = %f %f \n", i,
1601  // creal(inv_param->b_5[i]), cimag(inv_param->b_5[i]), i, creal(inv_param->c_5[i]), cimag(inv_param->c_5[i]) );
1602  }
1603  }
1604  break;
1605  case QUDA_STAGGERED_DSLASH:
1606  diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
1607  break;
1608  case QUDA_ASQTAD_DSLASH:
1609  diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC;
1610  break;
1613  if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) {
1614  diracParam.Ls = 1;
1615  diracParam.epsilon = 0.0;
1616  } else {
1617  diracParam.Ls = 2;
1618  diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0;
1619  }
1620  break;
1623  if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) {
1624  diracParam.Ls = 1;
1625  diracParam.epsilon = 0.0;
1626  } else {
1627  diracParam.Ls = 2;
1628  diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0;
1629  }
1630  break;
1631  case QUDA_LAPLACE_DSLASH:
1633  diracParam.laplace3D = inv_param->laplace3D;
1634  break;
1635  case QUDA_COVDEV_DSLASH:
1636  diracParam.type = QUDA_GAUGE_COVDEV_DIRAC;
1637  break;
1638  default:
1639  errorQuda("Unsupported dslash_type %d", inv_param->dslash_type);
1640  }
1641 
1642  diracParam.matpcType = inv_param->matpc_type;
1643  diracParam.dagger = inv_param->dagger;
1644  diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatPrecise : gaugePrecise;
1645  diracParam.fatGauge = gaugeFatPrecise;
1646  diracParam.longGauge = gaugeLongPrecise;
1647  diracParam.clover = cloverPrecise;
1648  diracParam.kappa = kappa;
1649  diracParam.mass = inv_param->mass;
1650  diracParam.m5 = inv_param->m5;
1651  diracParam.mu = inv_param->mu;
1652 
1653  for (int i=0; i<4; i++) diracParam.commDim[i] = 1; // comms are always on
1654 
1655  if (diracParam.gauge->Precision() != inv_param->cuda_prec)
1656  errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
1657  inv_param->cuda_prec);
1658  }
1659 
1660 
1661  void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
1662  {
1663  setDiracParam(diracParam, inv_param, pc);
1664 
1665  diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatSloppy : gaugeSloppy;
1666  diracParam.fatGauge = gaugeFatSloppy;
1667  diracParam.longGauge = gaugeLongSloppy;
1668  diracParam.clover = cloverSloppy;
1669 
1670  for (int i=0; i<4; i++) {
1671  diracParam.commDim[i] = 1; // comms are always on
1672  }
1673 
1674  if (diracParam.gauge->Precision() != inv_param->cuda_prec_sloppy)
1675  errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
1676  inv_param->cuda_prec_sloppy);
1677  }
1678 
1679  void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
1680  {
1681  setDiracParam(diracParam, inv_param, pc);
1682 
1684  diracParam.fatGauge = gaugeFatRefinement;
1685  diracParam.longGauge = gaugeLongRefinement;
1686  diracParam.clover = cloverRefinement;
1687 
1688  for (int i=0; i<4; i++) {
1689  diracParam.commDim[i] = 1; // comms are always on
1690  }
1691 
1692  if (diracParam.gauge->Precision() != inv_param->cuda_prec_refinement_sloppy)
1693  errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
1694  inv_param->cuda_prec_refinement_sloppy);
1695  }
1696 
1697  // The preconditioner currently mimicks the sloppy operator with no comms
1698  void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
1699  {
1700  setDiracParam(diracParam, inv_param, pc);
1701 
1702  if (inv_param->overlap) {
1703  diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatExtended : gaugeExtended;
1704  diracParam.fatGauge = gaugeFatExtended;
1705  diracParam.longGauge = gaugeLongExtended;
1706  } else {
1708  diracParam.fatGauge = gaugeFatPrecondition;
1709  diracParam.longGauge = gaugeLongPrecondition;
1710  }
1711  diracParam.clover = cloverPrecondition;
1712 
1713  for (int i=0; i<4; i++) {
1714  diracParam.commDim[i] = comms ? 1 : 0;
1715  }
1716 
1717  // In the preconditioned staggered CG allow a different dslash type in the preconditioning
1718  if(inv_param->inv_type == QUDA_PCG_INVERTER && inv_param->dslash_type == QUDA_ASQTAD_DSLASH
1720  diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
1721  diracParam.gauge = gaugeFatPrecondition;
1722  }
1723 
1724  if (diracParam.gauge->Precision() != inv_param->cuda_prec_precondition)
1725  errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
1726  inv_param->cuda_prec_precondition);
1727  }
1728 
1729 
1730  void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
1731  {
1732  DiracParam diracParam;
1733  DiracParam diracSloppyParam;
1734  DiracParam diracPreParam;
1735 
1736  setDiracParam(diracParam, &param, pc_solve);
1737  setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
1738  bool comms_flag = (param.inv_type != QUDA_INC_EIGCG_INVERTER) ? false : true ;//inc eigCG needs 2 sloppy precisions.
1739  setDiracPreParam(diracPreParam, &param, pc_solve, comms_flag);
1740 
1741 
1742  d = Dirac::create(diracParam); // create the Dirac operator
1743  dSloppy = Dirac::create(diracSloppyParam);
1744  dPre = Dirac::create(diracPreParam);
1745  }
1746 
1747  void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam &param, const bool pc_solve)
1748  {
1749  DiracParam diracParam;
1750  DiracParam diracSloppyParam;
1751  DiracParam diracPreParam;
1752  DiracParam diracRefParam;
1753 
1754  setDiracParam(diracParam, &param, pc_solve);
1755  setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
1756  setDiracRefineParam(diracRefParam, &param, pc_solve);
1757  bool comms_flag = (param.inv_type != QUDA_INC_EIGCG_INVERTER) ? false : true ;//inc eigCG needs 2 sloppy precisions.
1758  setDiracPreParam(diracPreParam, &param, pc_solve, comms_flag);
1759 
1760 
1761  d = Dirac::create(diracParam); // create the Dirac operator
1762  dSloppy = Dirac::create(diracSloppyParam);
1763  dPre = Dirac::create(diracPreParam);
1764  dRef = Dirac::create(diracRefParam);
1765  }
1766 
1768 
1770 
1771  double kappa5 = (0.5/(5.0 + param.m5));
1772  double kappa = (param.dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
1774  param.dslash_type == QUDA_MOBIUS_DWF_DSLASH) ? kappa5 : param.kappa;
1775 
1776  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1777  printfQuda("Mass rescale: Kappa is: %g\n", kappa);
1778  printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);
1779  double nin = blas::norm2(b);
1780  printfQuda("Mass rescale: norm of source in = %g\n", nin);
1781  }
1782 
1783  // staggered dslash uses mass normalization internally
1785  switch (param.solution_type) {
1786  case QUDA_MAT_SOLUTION:
1787  case QUDA_MATPC_SOLUTION:
1788  if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(2.0*param.mass, b);
1789  break;
1792  if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(4.0*param.mass*param.mass, b);
1793  break;
1794  default:
1795  errorQuda("Not implemented");
1796  }
1797  return;
1798  }
1799 
1800  for(int i=0; i<param.num_offset; i++) {
1801  unscaled_shifts[i] = param.offset[i];
1802  }
1803 
1804  // multiply the source to compensate for normalization of the Dirac operator, if necessary
1805  switch (param.solution_type) {
1806  case QUDA_MAT_SOLUTION:
1807  if (param.mass_normalization == QUDA_MASS_NORMALIZATION ||
1808  param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1809  blas::ax(2.0*kappa, b);
1810  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 2.0*kappa;
1811  }
1812  break;
1813  case QUDA_MATDAG_MAT_SOLUTION:
1814  if (param.mass_normalization == QUDA_MASS_NORMALIZATION ||
1815  param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1816  blas::ax(4.0*kappa*kappa, b);
1817  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 4.0*kappa*kappa;
1818  }
1819  break;
1820  case QUDA_MATPC_SOLUTION:
1821  if (param.mass_normalization == QUDA_MASS_NORMALIZATION) {
1822  blas::ax(4.0*kappa*kappa, b);
1823  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 4.0*kappa*kappa;
1824  } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1825  blas::ax(2.0*kappa, b);
1826  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 2.0*kappa;
1827  }
1828  break;
1829  case QUDA_MATPCDAG_MATPC_SOLUTION:
1830  if (param.mass_normalization == QUDA_MASS_NORMALIZATION) {
1831  blas::ax(16.0*std::pow(kappa,4), b);
1832  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 16.0*std::pow(kappa,4);
1833  } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1834  blas::ax(4.0*kappa*kappa, b);
1835  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 4.0*kappa*kappa;
1836  }
1837  break;
1838  default:
1839  errorQuda("Solution type %d not supported", param.solution_type);
1840  }
1841 
1842  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n");
1843  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1844  printfQuda("Mass rescale: Kappa is: %g\n", kappa);
1845  printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);
1846  double nin = blas::norm2(b);
1847  printfQuda("Mass rescale: norm of source out = %g\n", nin);
1848  }
1849 
1850  }
1851 }
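// Worked example of the rescaling above (illustrative): for a Wilson-type solve with
// QUDA_MATPC_SOLUTION and QUDA_MASS_NORMALIZATION, kappa = 0.125 scales the source by
// 4*kappa*kappa = 0.0625, and every multi-shift offset is multiplied by the same
// factor; the original offsets survive in unscaled_shifts[] for later restoration.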
1852 
1853 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
1854 {
1855  profileDslash.TPSTART(QUDA_PROFILE_TOTAL);
1856  profileDslash.TPSTART(QUDA_PROFILE_INIT);
1857 
1858  const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
1859 
1860  if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH)
1861  || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH))
1862  errorQuda("Gauge field not allocated");
1863  if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
1864  errorQuda("Clover field not allocated");
1865 
1866  pushVerbosity(inv_param->verbosity);
1867  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
1868 
1869  ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), true, inv_param->input_location);
1870  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
1871  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1872 
1873  cpuParam.v = h_out;
1874  cpuParam.location = inv_param->output_location;
1875  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
1876 
1877  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1878  cudaColorSpinorField in(*in_h, cudaParam);
1879  cudaColorSpinorField out(in, cudaParam);
1880 
1881  bool pc = true;
1882  DiracParam diracParam;
1883  setDiracParam(diracParam, inv_param, pc);
1884 
1885  profileDslash.TPSTOP(QUDA_PROFILE_INIT);
1886 
1887  profileDslash.TPSTART(QUDA_PROFILE_H2D);
1888  in = *in_h;
1889  profileDslash.TPSTOP(QUDA_PROFILE_H2D);
1890 
1891  profileDslash.TPSTART(QUDA_PROFILE_COMPUTE);
1892 
1893  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1894  double cpu = blas::norm2(*in_h);
1895  double gpu = blas::norm2(in);
1896  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1897  }
1898 
1899  if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION &&
1900  (inv_param->dslash_type == QUDA_STAGGERED_DSLASH ||
1901  inv_param->dslash_type == QUDA_ASQTAD_DSLASH) )
1902  blas::ax(1.0/(2.0*inv_param->mass), in);
1903 
1904  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
1905  if (parity == QUDA_EVEN_PARITY) {
1906  parity = QUDA_ODD_PARITY;
1907  } else {
1908  parity = QUDA_EVEN_PARITY;
1909  }
1910  blas::ax(gauge.Anisotropy(), in);
1911  }
1912 
1913  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
1914  if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inv_param->dagger) {
1915  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1916  cudaColorSpinorField tmp1(in, cudaParam);
1917  ((DiracTwistedCloverPC*) dirac)->TwistCloverInv(tmp1, in, (parity+1)%2); // apply the clover-twist
1918  dirac->Dslash(out, tmp1, parity); // apply the operator
1919  } else {
1920  dirac->Dslash(out, in, parity); // apply the operator
1921  }
1922  profileDslash.TPSTOP(QUDA_PROFILE_COMPUTE);
1923 
1924  profileDslash.TPSTART(QUDA_PROFILE_D2H);
1925  *out_h = out;
1926  profileDslash.TPSTOP(QUDA_PROFILE_D2H);
1927 
1928  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1929  double cpu = blas::norm2(*out_h);
1930  double gpu = blas::norm2(out);
1931  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1932  }
1933 
1934  profileDslash.TPSTART(QUDA_PROFILE_FREE);
1935  delete dirac; // clean up
1936 
1937  delete out_h;
1938  delete in_h;
1939  profileDslash.TPSTOP(QUDA_PROFILE_FREE);
1940 
1941  popVerbosity();
1942  profileDslash.TPSTOP(QUDA_PROFILE_TOTAL);
1943 }
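// Host-side call sequence for dslashQuda() (hypothetical sketch; assumes initQuda()
// and loadGaugeQuda() have already been called with a compatible precision):
//
//   QudaInvertParam inv_param = newQudaInvertParam();
//   inv_param.dslash_type = QUDA_WILSON_DSLASH;
//   inv_param.kappa = 0.12;
//   inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
//   inv_param.input_location = QUDA_CPU_FIELD_LOCATION;
//   inv_param.output_location = QUDA_CPU_FIELD_LOCATION;
//   dslashQuda(h_out, h_in, &inv_param, QUDA_EVEN_PARITY); // single hopping-term application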
1944 
1945 void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
1946 {
1947  const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
1948 
1949  if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH)
1950  || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH))
1951  errorQuda("Gauge field not allocated");
1952 
1953  pushVerbosity(inv_param->verbosity);
1954  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
1955 
1956  ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), true, inv_param->input_location);
1957  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
1958 
1959  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1960  cudaColorSpinorField in(*in_h, cudaParam);
1961 
1962  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1963  double cpu = blas::norm2(*in_h);
1964  double gpu = blas::norm2(in);
1965  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1966  }
1967 
1968  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1969  cudaColorSpinorField out(in, cudaParam);
1970 
1971  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
1972  if (parity == QUDA_EVEN_PARITY) {
1973  parity = QUDA_ODD_PARITY;
1974  } else {
1975  parity = QUDA_EVEN_PARITY;
1976  }
1977  blas::ax(gauge.Anisotropy(), in);
1978  }
1979  bool pc = true;
1980 
1981  DiracParam diracParam;
1982  setDiracParam(diracParam, inv_param, pc);
1983 
1984  DiracDomainWall4DPC dirac(diracParam); // create the Dirac operator
1985  printfQuda("kappa for QUDA input : %e\n",inv_param->kappa);
1986  switch (test_type) {
1987  case 0:
1988  dirac.Dslash4(out, in, parity);
1989  break;
1990  case 1:
1991  dirac.Dslash5(out, in, parity);
1992  break;
1993  case 2:
1994  dirac.Dslash5inv(out, in, parity, inv_param->kappa);
1995  break;
1996  }
1997 
1998  cpuParam.v = h_out;
1999  cpuParam.location = inv_param->output_location;
2000  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
2001  *out_h = out;
2002 
2003  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
2004  double cpu = blas::norm2(*out_h);
2005  double gpu = blas::norm2(out);
2006  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
2007  }
2008 
2009  delete out_h;
2010  delete in_h;
2011 
2012  popVerbosity();
2013 }
2014 
2015 void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
2016 {
2017  const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
2018 
2019  if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH)
2020  || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH))
2021  errorQuda("Gauge field not allocated");
2022 
2023  pushVerbosity(inv_param->verbosity);
2024  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
2025 
2026  ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), true, inv_param->input_location);
2027  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
2028 
2029  ColorSpinorParam cudaParam(cpuParam, *inv_param);
2030  cudaColorSpinorField in(*in_h, cudaParam);
2031 
2032  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
2033  double cpu = blas::norm2(*in_h);
2034  double gpu = blas::norm2(in);
2035  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
2036  }
2037 
2038  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2039  cudaColorSpinorField out(in, cudaParam);
2040 
2041  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
2042  if (parity == QUDA_EVEN_PARITY) {
2043  parity = QUDA_ODD_PARITY;
2044  } else {
2045  parity = QUDA_EVEN_PARITY;
2046  }
2047  blas::ax(gauge.Anisotropy(), in);
2048  }
2049  bool pc = true;
2050 
2051  DiracParam diracParam;
2052  setDiracParam(diracParam, inv_param, pc);
2053 
2054  DiracMobiusPC dirac(diracParam); // create the Dirac operator
2055  switch (test_type) {
2056  case 0:
2057  dirac.Dslash4(out, in, parity);
2058  break;
2059  case 1:
2060  dirac.Dslash5(out, in, parity);
2061  break;
2062  case 2:
2063  dirac.Dslash4pre(out, in, parity);
2064  break;
2065  case 3:
2066  dirac.Dslash5inv(out, in, parity);
2067  break;
2068  }
2069 
2070  cpuParam.v = h_out;
2071  cpuParam.location = inv_param->output_location;
2072  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
2073  *out_h = out;
2074 
2075  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
2076  double cpu = blas::norm2(*out_h);
2077  double gpu = blas::norm2(out);
2078  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
2079  }
2080 
2081  delete out_h;
2082  delete in_h;
2083 
2084  popVerbosity();
2085 }
2086 
2087 
2088 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
2089 {
2090  pushVerbosity(inv_param->verbosity);
2091 
2092  const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
2093 
2094  if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH)
2095  || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH))
2096  errorQuda("Gauge field not allocated");
2097  if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
2098  errorQuda("Clover field not allocated");
2099  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
2100 
2101  bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
2102  inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
2103 
2104  ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), pc, inv_param->input_location);
2105  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
2106 
2107  ColorSpinorParam cudaParam(cpuParam, *inv_param);
2108  cudaColorSpinorField in(*in_h, cudaParam);
2109 
2110  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
2111  double cpu = blas::norm2(*in_h);
2112  double gpu = blas::norm2(in);
2113  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
2114  }
2115 
2116  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2117  cudaColorSpinorField out(in, cudaParam);
2118 
2119  DiracParam diracParam;
2120  setDiracParam(diracParam, inv_param, pc);
2121 
2122  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
2123  dirac->M(out, in); // apply the operator
2124  delete dirac; // clean up
2125 
2126  double kappa = inv_param->kappa;
2127  if (pc) {
2128  if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) {
2129  blas::ax(0.25/(kappa*kappa), out);
2130  } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
2131  blas::ax(0.5/kappa, out);
2132  }
2133  } else {
2134  if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION ||
2135  inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
2136  blas::ax(0.5/kappa, out);
2137  }
2138  }
2139 
2140  cpuParam.v = h_out;
2141  cpuParam.location = inv_param->output_location;
2142  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
2143  *out_h = out;
2144 
2145  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
2146  double cpu = blas::norm2(*out_h);
2147  double gpu = blas::norm2(out);
2148  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
2149  }
2150 
2151  delete out_h;
2152  delete in_h;
2153 
2154  popVerbosity();
2155 }
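// Note on the normalization factors above (illustrative): with the kappa-normalized
// Wilson operator M_kappa = 1 - kappa*D, the mass-normalized convention is
// M_mass = (1/(2*kappa)) * M_kappa, hence the factor 0.5/kappa for a full solution
// and 0.25/(kappa*kappa) for the even-odd preconditioned operator.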
2156 
2157 
2158 void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
2159 {
2160  pushVerbosity(inv_param->verbosity);
2161 
2162  const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
2163 
2164  if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH)
2165  || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH))
2166  errorQuda("Gauge field not allocated");
2167  if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
2168  errorQuda("Clover field not allocated");
2169  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
2170 
2171  bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
2172  inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
2173 
2174  ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), pc, inv_param->input_location);
2175  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
2176 
2177  ColorSpinorParam cudaParam(cpuParam, *inv_param);
2178  cudaColorSpinorField in(*in_h, cudaParam);
2179 
2180  if (getVerbosity() >= QUDA_DEBUG_VERBOSE){
2181  double cpu = blas::norm2(*in_h);
2182  double gpu = blas::norm2(in);
2183  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
2184  }
2185 
2186  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2187  cudaColorSpinorField out(in, cudaParam);
2188 
2189  // double kappa = inv_param->kappa;
2190  // if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= gaugePrecise->anisotropy;
2191 
2192  DiracParam diracParam;
2193  setDiracParam(diracParam, inv_param, pc);
2194 
2195  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
2196  dirac->MdagM(out, in); // apply the operator
2197  delete dirac; // clean up
2198 
2199  double kappa = inv_param->kappa;
2200  if (pc) {
2201  if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) {
2202  blas::ax(1.0/std::pow(2.0*kappa,4), out);
2203  } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
2204  blas::ax(0.25/(kappa*kappa), out);
2205  }
2206  } else {
2207  if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION ||
2208  inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
2209  blas::ax(0.25/(kappa*kappa), out);
2210  }
2211  }
2212 
2213  cpuParam.v = h_out;
2214  cpuParam.location = inv_param->output_location;
2215  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
2216  *out_h = out;
2217 
2218  if (getVerbosity() >= QUDA_DEBUG_VERBOSE){
2219  double cpu = blas::norm2(*out_h);
2220  double gpu = blas::norm2(out);
2221  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
2222  }
2223 
2224  delete out_h;
2225  delete in_h;
2226 
2227  popVerbosity();
2228 }
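// Cheap consistency check (hypothetical host code): MatDagMatQuda() applied to x,
// contracted with x, should match the squared norm of MatQuda()'s output, since
// <x, M^dag M x> = |M x|^2 under the same normalization settings.
//
//   MatQuda(h_tmp, h_in, &inv_param);       // y = M x
//   MatDagMatQuda(h_out, h_in, &inv_param); // z = M^dag M x
//   // host side: re(dot(h_in, h_out)) ~= norm2(h_tmp), up to rounding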
2229 
2230 namespace quda
2231 {
2232  bool canReuseResidentGauge(QudaInvertParam *param)
2233  {
2234  if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
2235  return (gaugePrecise != nullptr) and param->cuda_prec == gaugePrecise->Precision();
2236  } else {
2237  return (gaugeFatPrecise != nullptr) and param->cuda_prec == gaugeFatPrecise->Precision();
2238  }
2239  }
2240 } // namespace quda
2241 
2242 void checkClover(QudaInvertParam *param) {
2243 
2244  if (param->dslash_type != QUDA_CLOVER_WILSON_DSLASH && param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) {
2245  return;
2246  }
2247 
2248  if (param->cuda_prec != cloverPrecise->Precision()) {
2249  errorQuda("Solve precision %d doesn't match clover precision %d", param->cuda_prec, cloverPrecise->Precision());
2250  }
2251 
2252  if ( (!cloverSloppy || param->cuda_prec_sloppy != cloverSloppy->Precision()) ||
2253  (!cloverPrecondition || param->cuda_prec_precondition != cloverPrecondition->Precision()) ||
2254  (!cloverRefinement || param->cuda_prec_refinement_sloppy != cloverRefinement->Precision()) ) {
2255  freeSloppyCloverQuda();
2256  QudaPrecision prec[3] = {param->cuda_prec_sloppy, param->cuda_prec_precondition, param->cuda_prec_refinement_sloppy};
2257  loadSloppyCloverQuda(prec);
2258  }
2259 
2260  if (cloverPrecise == nullptr) errorQuda("Precise clover field doesn't exist");
2261  if (cloverSloppy == nullptr) errorQuda("Sloppy clover field doesn't exist");
2262  if (cloverPrecondition == nullptr) errorQuda("Precondition clover field doesn't exist");
2263  if (cloverRefinement == nullptr) errorQuda("Refinement clover field doesn't exist");
2264 }
2265 
2266 cudaGaugeField *checkGauge(QudaInvertParam *param)
2267 {
2268  quda::cudaGaugeField *cudaGauge = nullptr;
2269  if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
2270  if (gaugePrecise == nullptr) errorQuda("Precise gauge field doesn't exist");
2271 
2272  if (param->cuda_prec != gaugePrecise->Precision()) {
2273  errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugePrecise->Precision());
2274  }
2275 
2276  if (param->cuda_prec_sloppy != gaugeSloppy->Precision()
2277  || param->cuda_prec_precondition != gaugePrecondition->Precision()
2278  || param->cuda_prec_refinement_sloppy != gaugeRefinement->Precision()) {
2279  QudaPrecision precision[3]
2280  = {param->cuda_prec_sloppy, param->cuda_prec_precondition, param->cuda_prec_refinement_sloppy};
2281  QudaReconstructType recon[3]
2282  = {gaugeSloppy->Reconstruct(), gaugePrecondition->Reconstruct(), gaugeRefinement->Reconstruct()};
2283  freeSloppyGaugeQuda();
2284  loadSloppyGaugeQuda(precision, recon);
2285  }
2286 
2287  if (gaugeSloppy == nullptr) errorQuda("Sloppy gauge field doesn't exist");
2288  if (gaugePrecondition == nullptr) errorQuda("Precondition gauge field doesn't exist");
2289  if (gaugeRefinement == nullptr) errorQuda("Refinement gauge field doesn't exist");
2290  if (param->overlap) {
2291  if (gaugeExtended == nullptr) errorQuda("Extended gauge field doesn't exist");
2292  }
2293  cudaGauge = gaugePrecise;
2294  } else {
2295  if (gaugeFatPrecise == nullptr) errorQuda("Precise gauge fat field doesn't exist");
2296  if (gaugeLongPrecise == nullptr) errorQuda("Precise gauge long field doesn't exist");
2297 
2298  if (param->cuda_prec != gaugeFatPrecise->Precision()) {
2299  errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugeFatPrecise->Precision());
2300  }
2301 
2302  if (param->cuda_prec_sloppy != gaugeFatSloppy->Precision()
2303  || param->cuda_prec_precondition != gaugeFatPrecondition->Precision()
2304  || param->cuda_prec_refinement_sloppy != gaugeFatRefinement->Precision()
2305  || param->cuda_prec_sloppy != gaugeLongSloppy->Precision()
2306  || param->cuda_prec_precondition != gaugeLongPrecondition->Precision()
2307  || param->cuda_prec_refinement_sloppy != gaugeLongRefinement->Precision()) {
2308 
2309  QudaPrecision precision[3]
2310  = {param->cuda_prec_sloppy, param->cuda_prec_precondition, param->cuda_prec_refinement_sloppy};
2311  // recon is always no for fat links, so just use long reconstructs here
2312  QudaReconstructType recon[3]
2313  = {gaugeLongSloppy->Reconstruct(), gaugeLongPrecondition->Reconstruct(), gaugeLongRefinement->Reconstruct()};
2314  freeSloppyGaugeQuda();
2315  loadSloppyGaugeQuda(precision, recon);
2316  }
2317 
2318  if (gaugeFatSloppy == nullptr) errorQuda("Sloppy gauge fat field doesn't exist");
2319  if (gaugeFatPrecondition == nullptr) errorQuda("Precondition gauge fat field doesn't exist");
2320  if (gaugeFatRefinement == nullptr) errorQuda("Refinement gauge fat field doesn't exist");
2321  if (param->overlap) {
2322  if (gaugeFatExtended == nullptr) errorQuda("Extended gauge fat field doesn't exist");
2323  }
2324 
2325  if (gaugeLongSloppy == nullptr) errorQuda("Sloppy gauge long field doesn't exist");
2326  if (gaugeLongPrecondition == nullptr) errorQuda("Precondition gauge long field doesn't exist");
2327  if (gaugeLongRefinement == nullptr) errorQuda("Refinement gauge long field doesn't exist");
2328  if (param->overlap) {
2329  if (gaugeLongExtended == nullptr) errorQuda("Extended gauge long field doesn't exist");
2330  }
2331  cudaGauge = gaugeFatPrecise;
2332  }
2333 
2334  checkClover(param);
2335 
2336  return cudaGauge;
2337 }
2338 
2339 void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
2340 {
2341  pushVerbosity(inv_param->verbosity);
2342 
2343  if (!initialized) errorQuda("QUDA not initialized");
2344  if (gaugePrecise == nullptr) errorQuda("Gauge field not allocated");
2345  if (cloverPrecise == nullptr) errorQuda("Clover field not allocated");
2346 
2347  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
2348 
2349  if ((inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) && (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH))
2350  errorQuda("Cannot apply the clover term for a non Wilson-clover or Twisted-mass-clover dslash");
2351 
2352  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), true);
2353 
2354  ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?
2355  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2356  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2357 
2358  ColorSpinorParam cudaParam(cpuParam, *inv_param);
2359  cudaColorSpinorField in(*in_h, cudaParam);
2360 
2361  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
2362  double cpu = blas::norm2(*in_h);
2363  double gpu = blas::norm2(in);
2364  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
2365  }
2366 
2367  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2368  cudaColorSpinorField out(in, cudaParam);
2369 
2370  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
2371  if (parity == QUDA_EVEN_PARITY) {
2372  parity = QUDA_ODD_PARITY;
2373  } else {
2374  parity = QUDA_EVEN_PARITY;
2375  }
2376  blas::ax(gaugePrecise->Anisotropy(), in);
2377  }
2378  bool pc = true;
2379 
2380  DiracParam diracParam;
2381  setDiracParam(diracParam, inv_param, pc);
2382  //FIXME: Do we need this for twisted clover???
2383  DiracCloverPC dirac(diracParam); // create the Dirac operator
2384  if (!inverse) dirac.Clover(out, in, parity); // apply the clover operator
2385  else dirac.CloverInv(out, in, parity);
2386 
2387  cpuParam.v = h_out;
2388  cpuParam.location = inv_param->output_location;
2389  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
2390  *out_h = out;
2391 
2392  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
2393  double cpu = blas::norm2(*out_h);
2394  double gpu = blas::norm2(out);
2395  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
2396  }
2397 
2398  /*for (int i=0; i<in_h->Volume(); i++) {
2399  ((cpuColorSpinorField*)out_h)->PrintVector(i);
2400  }*/
2401 
2402  delete out_h;
2403  delete in_h;
2404 
2405  popVerbosity();
2406 }
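// Round-trip sketch for cloverQuda() (hypothetical host code; requires resident gauge
// and clover fields): applying the clover term and then its inverse on the same
// parity should reproduce the input up to the working precision.
//
//   cloverQuda(h_tmp, h_in, &inv_param, QUDA_EVEN_PARITY, 0);  // apply A_ee
//   cloverQuda(h_out, h_tmp, &inv_param, QUDA_EVEN_PARITY, 1); // apply A_ee^{-1}
//   // host side: h_out ~= h_in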
2407 
2408 void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param)
2409 {
2410  profileEigensolve.TPSTART(QUDA_PROFILE_TOTAL);
2411  profileEigensolve.TPSTART(QUDA_PROFILE_INIT);
2412 
2413  // Transfer the inv param structure contained in eig_param
2414  QudaInvertParam *inv_param = eig_param->invert_param;
2415 
2416  if (inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH || inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH
2417  || inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH)
2418  setKernelPackT(true);
2419 
2420  if (!initialized) errorQuda("QUDA not initialized");
2421 
2422  pushVerbosity(inv_param->verbosity);
2423  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
2424  printQudaInvertParam(inv_param);
2425  printQudaEigParam(eig_param);
2426  }
2427 
2428  checkInvertParam(inv_param);
2429  checkEigParam(eig_param);
2430  cudaGaugeField *cudaGauge = checkGauge(inv_param);
2431 
2432  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) || (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE)
2433  || (inv_param->solve_type == QUDA_NORMERR_PC_SOLVE);
2434 
2435  inv_param->secs = 0;
2436  inv_param->gflops = 0;
2437  inv_param->iter = 0;
2438 
2439  // Define problem matrix
2440  //------------------------------------------------------
2441  Dirac *d = nullptr;
2442  Dirac *dSloppy = nullptr;
2443  Dirac *dPre = nullptr;
2444 
2445  // create the dirac operator
2446  createDirac(d, dSloppy, dPre, *inv_param, pc_solve);
2447  Dirac &dirac = *d;
2448 
2449  // Create device-side ColorSpinorField vector space to pass to the
2450  // compute function.
2451  const int *X = cudaGauge->X();
2452  ColorSpinorParam cpuParam(host_evecs[0], *inv_param, X, inv_param->solution_type, inv_param->input_location);
2453 
2454  // create wrappers around application vector set
2455  std::vector<ColorSpinorField *> host_evecs_;
2456  for (int i = 0; i < eig_param->nConv; i++) {
2457  cpuParam.v = host_evecs[i];
2458  host_evecs_.push_back(ColorSpinorField::Create(cpuParam));
2459  }
2460 
2461  ColorSpinorParam cudaParam(cpuParam);
2462  cudaParam.location = QUDA_CUDA_FIELD_LOCATION;
2463  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
2464  cudaParam.setPrecision(eig_param->cuda_prec_ritz, eig_param->cuda_prec_ritz, true);
2465 
2466  std::vector<Complex> evals(eig_param->nConv, 0.0);
2467  std::vector<ColorSpinorField *> kSpace;
2468  for (int i = 0; i < eig_param->nConv; i++) { kSpace.push_back(ColorSpinorField::Create(cudaParam)); }
2469 
2470  // If you use polynomial acceleration on a non-symmetric matrix,
2471  // the solver will fail.
2472  if (eig_param->use_poly_acc && !eig_param->use_norm_op && !(inv_param->dslash_type == QUDA_LAPLACE_DSLASH)) {
2473  // Breaking up the boolean check a little bit. If it's a staggered dslash type and a PC type, we can use poly accel.
2474  if (!((inv_param->dslash_type == QUDA_STAGGERED_DSLASH || inv_param->dslash_type == QUDA_ASQTAD_DSLASH) && inv_param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
2475  errorQuda("Polynomial acceleration with non-symmetric matrices not supported");
2476  }
2477  }
2478 
2479  profileEigensolve.TPSTOP(QUDA_PROFILE_INIT);
2480 
2481  if (!eig_param->use_norm_op && !eig_param->use_dagger) {
2482  DiracM m(dirac);
2483  if (eig_param->arpack_check) {
2484  arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
2485  } else {
2486  EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
2487  (*eig_solve)(kSpace, evals);
2488  delete eig_solve;
2489  }
2490  } else if (!eig_param->use_norm_op && eig_param->use_dagger) {
2491  DiracMdag m(dirac);
2492  if (eig_param->arpack_check) {
2493  arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
2494  } else {
2495  EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
2496  (*eig_solve)(kSpace, evals);
2497  delete eig_solve;
2498  }
2499  } else if (eig_param->use_norm_op && !eig_param->use_dagger) {
2500  DiracMdagM m(dirac);
2501  if (eig_param->arpack_check) {
2502  arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
2503  } else {
2504  EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
2505  (*eig_solve)(kSpace, evals);
2506  delete eig_solve;
2507  }
2508  } else if (eig_param->use_norm_op && eig_param->use_dagger) {
2509  DiracMMdag m(dirac);
2510  if (eig_param->arpack_check) {
2511  arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
2512  } else {
2513  EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
2514  (*eig_solve)(kSpace, evals);
2515  delete eig_solve;
2516  }
2517  } else {
2518  errorQuda("Invalid use_norm_op and dagger combination");
2519  }
2520 
2521  // Copy eigen values back
2522  for (int i = 0; i < eig_param->nConv; i++) { host_evals[i] = real(evals[i]) + imag(evals[i]) * _Complex_I; }
2523 
2524  // Transfer Eigenpairs back to host if using GPU eigensolver
2525  if (!(eig_param->arpack_check)) {
2526  profileEigensolve.TPSTART(QUDA_PROFILE_D2H);
2527  for (int i = 0; i < eig_param->nConv; i++) *host_evecs_[i] = *kSpace[i];
2528  profileEigensolve.TPSTOP(QUDA_PROFILE_D2H);
2529  }
2530 
2531  profileEigensolve.TPSTART(QUDA_PROFILE_FREE);
2532  for (int i = 0; i < eig_param->nConv; i++) delete host_evecs_[i];
2533  delete d;
2534  delete dSloppy;
2535  delete dPre;
2536  for (int i = 0; i < eig_param->nConv; i++) delete kSpace[i];
2537  profileEigensolve.TPSTOP(QUDA_PROFILE_FREE);
2538 
2539  popVerbosity();
2540 
2541  // cache is written out even if a long benchmarking job gets interrupted
2542  saveTuneCache();
2543 
2544  profileEigensolve.TPSTOP(QUDA_PROFILE_TOTAL);
2545 }
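// Minimal driver sketch for eigensolveQuda() (hypothetical; most QudaEigParam fields
// and the field-loading calls are elided, and n denotes the requested eigenpair count):
//
//   QudaEigParam eig_param = newQudaEigParam();
//   eig_param.invert_param = &inv_param;
//   eig_param.nConv = n;
//   eig_param.use_norm_op = QUDA_BOOLEAN_YES;   // request M^dag M eigenpairs
//   std::vector<void*> evecs(n);                // host eigenvector buffers
//   std::vector<double _Complex> evals(n);
//   eigensolveQuda(evecs.data(), evals.data(), &eig_param);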
2546 
2547 multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile)
2548  : profile(profile) {
2549  profile.TPSTART(QUDA_PROFILE_INIT);
2550  QudaInvertParam *param = mg_param.invert_param;
2551 
2552  checkMultigridParam(&mg_param);
2553  cudaGaugeField *cudaGauge = checkGauge(param);
2554 
2555  // check MG params (needs to go somewhere else)
2556  if (mg_param.n_level > QUDA_MAX_MG_LEVEL)
2557  errorQuda("Requested MG levels %d greater than allowed maximum %d", mg_param.n_level, QUDA_MAX_MG_LEVEL);
2558  for (int i=0; i<mg_param.n_level; i++) {
2559  if (mg_param.smoother_solve_type[i] == QUDA_INVALID_SOLVE)
2560  errorQuda("Unsupported smoother solve type %d on level %d", mg_param.smoother_solve_type[i], i);
2561  }
2562  if (param->solve_type != QUDA_DIRECT_SOLVE)
2563  errorQuda("Outer MG solver can only use QUDA_DIRECT_SOLVE at present");
2564 
2566  mg_param.secs = 0;
2567  mg_param.gflops = 0;
2568 
2569  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
2570  (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
2571 
2572  bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2573  (param->solve_type == QUDA_NORMOP_PC_SOLVE);
2574 
2575  // create the dirac operators for the fine grid
2576 
2577  // this is the Dirac operator we use for inter-grid residual computation
2578  DiracParam diracParam;
2579  setDiracSloppyParam(diracParam, param, outer_pc_solve);
2580  d = Dirac::create(diracParam);
2581  m = new DiracM(*d);
2582 
2583  // this is the Dirac operator we use for smoothing
2584  DiracParam diracSmoothParam;
2585  bool fine_grid_pc_solve = (mg_param.smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE) ||
2586  (mg_param.smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE);
2587  setDiracSloppyParam(diracSmoothParam, param, fine_grid_pc_solve);
2588  diracSmoothParam.halo_precision = mg_param.smoother_halo_precision[0];
2589  dSmooth = Dirac::create(diracSmoothParam);
2590  mSmooth = new DiracM(*dSmooth);
2591 
2592  // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this)
2593  DiracParam diracSmoothSloppyParam;
2594  setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve,
2595  mg_param.smoother_schwarz_type[0] == QUDA_INVALID_SCHWARZ ? true : false);
2596  diracSmoothSloppyParam.halo_precision = mg_param.smoother_halo_precision[0];
2597 
2598  dSmoothSloppy = Dirac::create(diracSmoothSloppyParam);
2599  mSmoothSloppy = new DiracM(*dSmoothSloppy);
2600 
2601  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating vector of null-space fields of length %d\n", mg_param.n_vec[0]);
2602 
2603  ColorSpinorParam csParam(nullptr, *param, cudaGauge->X(), pc_solution, mg_param.setup_location[0]);
2604  csParam.create = QUDA_NULL_FIELD_CREATE;
2605  QudaPrecision Bprec = mg_param.precision_null[0];
2606  Bprec = (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION && Bprec < QUDA_SINGLE_PRECISION ? QUDA_SINGLE_PRECISION : Bprec);
2607  csParam.setPrecision(Bprec);
2610  B.resize(mg_param.n_vec[0]);
2611  for (int i = 0; i < mg_param.n_vec[0]; i++) { B[i] = ColorSpinorField::Create(csParam); }
2612 
2613  // fill out the MG parameters for the fine level
2614  mgParam = new MGParam(mg_param, B, m, mSmooth, mSmoothSloppy);
2615 
2616  mg = new MG(*mgParam, profile);
2617  mgParam->updateInvertParam(*param);
2618 
2619  // cache is written out even if a long benchmarking job gets interrupted
2620  saveTuneCache();
2621  profile.TPSTOP(QUDA_PROFILE_INIT);
2622 }
2623 
2624 void* newMultigridQuda(QudaMultigridParam *mg_param) {
2625  profilerStart(__func__);
2626 
2627  pushVerbosity(mg_param->invert_param->verbosity);
2628  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaMultigridParam(mg_param);
2629 
2630  auto *mg = new multigrid_solver(*mg_param, profileInvert);
2632 
2633  saveTuneCache();
2634 
2635  popVerbosity();
2636 
2637  profilerStop(__func__);
2638  return static_cast<void*>(mg);
2639 }
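// Typical multigrid lifecycle (hypothetical sketch; mg_param must describe a valid
// level hierarchy, and the outer solve must use QUDA_DIRECT_SOLVE):
//
//   void *mg_pre = newMultigridQuda(&mg_param);
//   inv_param.inv_type_precondition = QUDA_MG_INVERTER;
//   inv_param.preconditioner = mg_pre;
//   invertQuda(h_x, h_b, &inv_param);
//   updateMultigridQuda(mg_pre, &mg_param); // refresh after the gauge field changes
//   destroyMultigridQuda(mg_pre);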
2640 
2641 void destroyMultigridQuda(void *mg) {
2642  delete static_cast<multigrid_solver*>(mg);
2643 }
2644 
2645 void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
2646 {
2647  profilerStart(__func__);
2648 
2649  pushVerbosity(mg_param->invert_param->verbosity);
2650 
2653 
2654  auto *mg = static_cast<multigrid_solver*>(mg_);
2655  checkMultigridParam(mg_param);
2656 
2657  QudaInvertParam *param = mg_param->invert_param;
2658  // check the gauge fields have been created and set the precision as needed
2659  checkGauge(param);
2660 
2661  // for reporting level 1 is the fine level but internally use level 0 for indexing
2662  // sprintf(mg->prefix,"MG level 1 (%s): ", param.location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU" );
2663  // setOutputPrefix(prefix);
2664  setOutputPrefix("MG level 1 (GPU): "); //fix me
2665 
2666  if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("Updating operator on level 1 of %d levels\n", mg->mgParam->Nlevel);
2667 
2668  bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2669  (param->solve_type == QUDA_NORMOP_PC_SOLVE);
2670 
2671  // free the previous dirac operators
2672  if (mg->m) delete mg->m;
2673  if (mg->mSmooth) delete mg->mSmooth;
2674  if (mg->mSmoothSloppy) delete mg->mSmoothSloppy;
2675 
2676  if (mg->d) delete mg->d;
2677  if (mg->dSmooth) delete mg->dSmooth;
2678  if (mg->dSmoothSloppy && mg->dSmoothSloppy != mg->dSmooth) delete mg->dSmoothSloppy;
2679 
2680  // create new fine dirac operators
2681 
2682  // this is the Dirac operator we use for inter-grid residual computation
2683  DiracParam diracParam;
2684  setDiracSloppyParam(diracParam, param, outer_pc_solve);
2685  mg->d = Dirac::create(diracParam);
2686  mg->m = new DiracM(*(mg->d));
2687 
2688  // this is the Dirac operator we use for smoothing
2689  DiracParam diracSmoothParam;
2690  bool fine_grid_pc_solve = (mg_param->smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE) ||
2691  (mg_param->smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE);
2692  setDiracSloppyParam(diracSmoothParam, param, fine_grid_pc_solve);
2693  mg->dSmooth = Dirac::create(diracSmoothParam);
2694  mg->mSmooth = new DiracM(*(mg->dSmooth));
2695 
2696  // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this)
2697  DiracParam diracSmoothSloppyParam;
2698  setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve, true);
2699  mg->dSmoothSloppy = Dirac::create(diracSmoothSloppyParam);
2700  mg->mSmoothSloppy = new DiracM(*(mg->dSmoothSloppy));
2701 
2702  mg->mgParam->matResidual = mg->m;
2703  mg->mgParam->matSmooth = mg->mSmooth;
2704  mg->mgParam->matSmoothSloppy = mg->mSmoothSloppy;
2705 
2706  mg->mgParam->updateInvertParam(*param);
2707  if(mg->mgParam->mg_global.invert_param != param)
2708  mg->mgParam->mg_global.invert_param = param;
2709 
2710  bool refresh = true;
2711  mg->mg->reset(refresh);
2712 
2713  setOutputPrefix("");
2714 
2715  // cache is written out even if a long benchmarking job gets interrupted
2716  saveTuneCache();
2717 
2720 
2721  popVerbosity();
2722 
2723  profilerStop(__func__);
2724 }
2725 
2726 void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
2727 {
2728  profilerStart(__func__);
2729  pushVerbosity(mg_param->invert_param->verbosity);
2731 
2732  auto *mg = static_cast<multigrid_solver*>(mg_);
2733  checkMultigridParam(mg_param);
2734  checkGauge(mg_param->invert_param);
2735 
2736  mg->mg->dumpNullVectors();
2737 
2739  popVerbosity();
2740  profilerStop(__func__);
2741 }
2742 
2743 deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
2744  : d(nullptr), m(nullptr), RV(nullptr), deflParam(nullptr), defl(nullptr), profile(profile) {
2745 
2746  QudaInvertParam *param = eig_param.invert_param;
2747 
2748  if (param->inv_type != QUDA_EIGCG_INVERTER && param->inv_type != QUDA_INC_EIGCG_INVERTER) return;
2749 
2750  profile.TPSTART(QUDA_PROFILE_INIT);
2751 
2752  cudaGaugeField *cudaGauge = checkGauge(param);
2753  eig_param.secs = 0;
2754  eig_param.gflops = 0;
2755 
2756  DiracParam diracParam;
2757  if(eig_param.cuda_prec_ritz == param->cuda_prec)
2758  {
2759  setDiracParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE));
2760  } else {
2761  setDiracSloppyParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE));
2762  }
2763 
2764  const bool pc_solve = (param->solve_type == QUDA_NORMOP_PC_SOLVE);
2765 
2766  d = Dirac::create(diracParam);
2767  m = pc_solve ? static_cast<DiracMatrix*>( new DiracMdagM(*d) ) : static_cast<DiracMatrix*>( new DiracM(*d));
2768 
2769  ColorSpinorParam ritzParam(nullptr, *param, cudaGauge->X(), pc_solve, eig_param.location);
2770 
2771  ritzParam.create = QUDA_ZERO_FIELD_CREATE;
2772  ritzParam.is_composite = true;
2773  ritzParam.is_component = false;
2774  ritzParam.composite_dim = param->nev*param->deflation_grid;
2775  ritzParam.setPrecision(param->cuda_prec_ritz);
2776 
2777  if (ritzParam.location==QUDA_CUDA_FIELD_LOCATION) {
2778  ritzParam.setPrecision(param->cuda_prec_ritz, param->cuda_prec_ritz, true); // set native field order
2779  if (ritzParam.nSpin != 1) ritzParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
2780 
2781  // select memory location here; by default the Ritz vectors are allocated on the device,
2782  // but if device memory is insufficient the user may choose a mapped memory type instead
2783  ritzParam.mem_type = eig_param.mem_type_ritz;
2784  } else { //host location
2785  ritzParam.mem_type = QUDA_MEMORY_PINNED;
2786  }
2787 
2788  int ritzVolume = 1;
2789  for(int d = 0; d < ritzParam.nDim; d++) ritzVolume *= ritzParam.x[d];
2790 
2791  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
2792 
2793  size_t byte_estimate = (size_t)ritzParam.composite_dim*(size_t)ritzVolume*(ritzParam.nColor*ritzParam.nSpin*ritzParam.Precision());
2794  printfQuda("allocating bytes: %lu (lattice volume %d, prec %d)", byte_estimate, ritzVolume, ritzParam.Precision());
2795  if(ritzParam.mem_type == QUDA_MEMORY_DEVICE) printfQuda("Using device memory type.\n");
2796  else if (ritzParam.mem_type == QUDA_MEMORY_MAPPED)
2797  printfQuda("Using mapped memory type.\n");
2798  }
2799 
2800  RV = ColorSpinorField::Create(ritzParam);
2801 
2802  deflParam = new DeflationParam(eig_param, RV, *m);
2803 
2804  defl = new Deflation(*deflParam, profile);
2805 
2806  profile.TPSTOP(QUDA_PROFILE_INIT);
2807 }
2808 
2809 void* newDeflationQuda(QudaEigParam *eig_param) {
2810  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
2811 #ifdef MAGMA_LIB
2812  openMagma();
2813 #endif
2814  auto *defl = new deflated_solver(*eig_param, profileInvert);
2815 
2816  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
2817 
2818  saveProfile(__func__);
2819  flushProfile();
2820  return static_cast<void*>(defl);
2821 }
2822 
2823 void destroyDeflationQuda(void *df) {
2824 #ifdef MAGMA_LIB
2825  closeMagma();
2826 #endif
2827  delete static_cast<deflated_solver*>(df);
2828 }
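// Deflation workflow sketch (hypothetical; requires an eigCG-type inv_type, otherwise
// deflated_solver returns without building the Ritz-vector space):
//
//   QudaEigParam df_param = newQudaEigParam();
//   df_param.invert_param = &inv_param;  // with inv_param.inv_type = QUDA_INC_EIGCG_INVERTER
//   void *df = newDeflationQuda(&df_param);
//   inv_param.deflation_op = df;
//   invertQuda(h_x, h_b, &inv_param);
//   destroyDeflationQuda(df);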
2829 
2830 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
2831 {
2832  profilerStart(__func__);
2833 
2834  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
2835 
2836  if (!initialized) errorQuda("QUDA not initialized");
2837 
2838  pushVerbosity(param->verbosity);
2839  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(param);
2840 
2841  checkInvertParam(param, hp_x, hp_b);
2842 
2843  // check the gauge fields have been created
2844  cudaGaugeField *cudaGauge = checkGauge(param);
2845 
2846  // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
2847  // solve_type and solution_type, rather than in separate members of QudaInvertParam. We're stuck with it
2848  // for now, though, so here we factorize everything for convenience.
2849 
2850  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
2851  (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
2852  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2853  (param->solve_type == QUDA_NORMOP_PC_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2854  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||
2855  (param->solution_type == QUDA_MATPC_SOLUTION);
2856  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||
2857  (param->solve_type == QUDA_DIRECT_PC_SOLVE);
2858  bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||
2859  (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2860 
2861  param->secs = 0;
2862  param->gflops = 0;
2863  param->iter = 0;
2864 
2865  Dirac *d = nullptr;
2866  Dirac *dSloppy = nullptr;
2867  Dirac *dPre = nullptr;
2868 
2869  // create the dirac operator
2870  createDirac(d, dSloppy, dPre, *param, pc_solve);
2871 
2872  Dirac &dirac = *d;
2873  Dirac &diracSloppy = *dSloppy;
2874  Dirac &diracPre = *dPre;
2875 
2876  profileInvert.TPSTART(QUDA_PROFILE_H2D);
2877 
2878  ColorSpinorField *b = nullptr;
2879  ColorSpinorField *x = nullptr;
2880  ColorSpinorField *in = nullptr;
2881  ColorSpinorField *out = nullptr;
2882 
2883  const int *X = cudaGauge->X();
2884 
2885  // wrap CPU host side pointers
2886  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution, param->input_location);
2887  ColorSpinorField *h_b = ColorSpinorField::Create(cpuParam);
2888 
2889  cpuParam.v = hp_x;
2890  cpuParam.location = param->output_location;
2891  ColorSpinorField *h_x = ColorSpinorField::Create(cpuParam);
2892 
2893  // download source
2894  ColorSpinorParam cudaParam(cpuParam, *param);
2895  cudaParam.create = QUDA_COPY_FIELD_CREATE;
2896  b = new cudaColorSpinorField(*h_b, cudaParam);
2897 
2898  // now check if we need to invalidate the solutionResident vectors
2899  bool invalidate = false;
2900  if (param->use_resident_solution == 1) {
2901  for (auto v : solutionResident)
2902  if (b->Precision() != v->Precision() || b->SiteSubset() != v->SiteSubset()) { invalidate = true; break; }
2903 
2904  if (invalidate) {
2905  for (auto v : solutionResident) if (v) delete v;
2906  solutionResident.clear();
2907  }
2908 
2909  if (!solutionResident.size()) {
2910  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2911  solutionResident.push_back(new cudaColorSpinorField(cudaParam)); // solution
2912  }
2913  x = solutionResident[0];
2914  } else {
2915  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2916  x = new cudaColorSpinorField(cudaParam);
2917  }
2918 
2919  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
2920  // initial guess only supported for single-pass solvers
2921  if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) &&
2922  (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
2923  errorQuda("Initial guess not supported for two-pass solver");
2924  }
2925 
2926  *x = *h_x; // solution
2927  } else { // zero initial guess
2928  blas::zero(*x);
2929  }
2930 
2931  profileInvert.TPSTOP(QUDA_PROFILE_H2D);
2932  profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE);
2933 
2934  double nb = blas::norm2(*b);
2935  if (nb==0.0) errorQuda("Source has zero norm");
2936 
2937  if (getVerbosity() >= QUDA_VERBOSE) {
2938  double nh_b = blas::norm2(*h_b);
2939  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2940  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) {
2941  double nh_x = blas::norm2(*h_x);
2942  double nx = blas::norm2(*x);
2943  printfQuda("Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
2944  }
2945  }
2946 
2947  // rescale the source and solution vectors to help prevent the onset of underflow
2948  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
2949  blas::ax(1.0/sqrt(nb), *b);
2950  blas::ax(1.0/sqrt(nb), *x);
2951  }
2952 
2953  massRescale(*static_cast<cudaColorSpinorField*>(b), *param);
2954 
2955  dirac.prepare(in, out, *x, *b, param->solution_type);
2956 
2957  if (getVerbosity() >= QUDA_VERBOSE) {
2958  double nin = blas::norm2(*in);
2959  double nout = blas::norm2(*out);
2960  printfQuda("Prepared source = %g\n", nin);
2961  printfQuda("Prepared solution = %g\n", nout);
2962  }
2963 
2964  if (getVerbosity() >= QUDA_VERBOSE) {
2965  double nin = blas::norm2(*in);
2966  printfQuda("Prepared source post mass rescale = %g\n", nin);
2967  }
2968 
2969  // solution_type specifies *what* system is to be solved.
2970  // solve_type specifies *how* the system is to be solved.
2971  //
2972  // We have the following four cases (plus preconditioned variants):
2973  //
2974  // solution_type solve_type Effect
2975  // ------------- ---------- ------
2976  // MAT DIRECT Solve Ax=b
2977  // MATDAG_MAT DIRECT Solve A^dag y = b, followed by Ax=y
2978  // MAT NORMOP Solve (A^dag A) x = (A^dag b)
2979  // MATDAG_MAT NORMOP Solve (A^dag A) x = b
2980  // MAT NORMERR Solve (A A^dag) y = b, then x = A^dag y
2981  //
2982  // We generally require that the solution_type and solve_type
2983  // preconditioning match. As an exception, the unpreconditioned MAT
2984  // solution_type may be used with any solve_type, including
2985  // DIRECT_PC and NORMOP_PC. In these cases, preparation of the
2986  // preconditioned source and reconstruction of the full solution are
2987  // taken care of by Dirac::prepare() and Dirac::reconstruct(),
2988  // respectively.
2989 
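  // Concrete instance of the table above (illustrative): solution_type = QUDA_MAT_SOLUTION
  // with solve_type = QUDA_NORMOP_PC_SOLVE runs the solver on (A^dag A) x = A^dag b
  // restricted to a single parity; Dirac::prepare() forms the preconditioned source and
  // Dirac::reconstruct() rebuilds the full-lattice solution afterwards.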
2990  if (pc_solution && !pc_solve) {
2991  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
2992  }
2993 
2994  if (!mat_solution && !pc_solution && pc_solve) {
2995  errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
2996  }
2997 
2998  if (!mat_solution && norm_error_solve) {
2999  errorQuda("Normal-error solve requires Mat solution");
3000  }
3001 
3002  if (param->inv_type_precondition == QUDA_MG_INVERTER && (!direct_solve || !mat_solution)) {
3003  errorQuda("Multigrid preconditioning only supported for direct solves");
3004  }
3005 
3006  if (param->chrono_use_resident && norm_error_solve) {
3007  errorQuda("Chronological forecasting is currently only supported for the M^dagger M solver");
3008  }
3009 
3010  profileInvert.TPSTOP(QUDA_PROFILE_PREAMBLE);
3011 
3012  if (mat_solution && !direct_solve && !norm_error_solve) { // prepare source: b' = A^dag b
3013  cudaColorSpinorField tmp(*in);
3014  dirac.Mdag(*in, tmp);
3015  } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b
3016  DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3017  SolverParam solverParam(*param);
3018  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3019  (*solve)(*out, *in);
3020  blas::copy(*in, *out);
3021  solverParam.updateInvertParam(*param);
3022  delete solve;
3023  }
3024 
3025  if (direct_solve) {
3026  DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3027  SolverParam solverParam(*param);
3028  // chronological forecasting
3029  if (param->chrono_use_resident && chronoResident[param->chrono_index].size() > 0) {
3030  profileInvert.TPSTART(QUDA_PROFILE_CHRONO);
3031 
3032  auto &basis = chronoResident[param->chrono_index];
3033 
3034  ColorSpinorParam cs_param(*basis[0]);
3035  ColorSpinorField *tmp = ColorSpinorField::Create(cs_param);
3036  ColorSpinorField *tmp2 = (param->chrono_precision == out->Precision()) ? out : ColorSpinorField::Create(cs_param);
3037  std::vector<ColorSpinorField*> Ap;
3038  for (unsigned int k=0; k < basis.size(); k++) {
3039  Ap.emplace_back((ColorSpinorField::Create(cs_param)));
3040  }
3041 
3042  if (param->chrono_precision == param->cuda_prec) {
3043  for (unsigned int j=0; j<basis.size(); j++) m(*Ap[j], *basis[j], *tmp, *tmp2);
3044  } else if (param->chrono_precision == param->cuda_prec_sloppy) {
3045  for (unsigned int j=0; j<basis.size(); j++) mSloppy(*Ap[j], *basis[j], *tmp, *tmp2);
3046  } else {
3047  errorQuda("Unexpected precision %d for chrono vectors (doesn't match outer %d or sloppy precision %d)",
3048  param->chrono_precision, param->cuda_prec, param->cuda_prec_sloppy);
3049  }
3050 
3051  bool orthogonal = true;
3052  bool apply_mat = false;
3053  bool hermitian = false;
3054  MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert);
3055 
3056  blas::copy(*tmp, *in);
3057  mre(*out, *tmp, basis, Ap);
3058 
3059  for (auto ap: Ap) {
3060  if (ap) delete (ap);
3061  }
3062  delete tmp;
3063  if (tmp2 != out) delete tmp2;
3064 
3065  profileInvert.TPSTOP(QUDA_PROFILE_CHRONO);
3066 
3067 
3068  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3069  (*solve)(*out, *in);
3070  solverParam.updateInvertParam(*param);
3071  delete solve;
3072  } else if (!norm_error_solve) {
3073  DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3074  SolverParam solverParam(*param);
3075 
3076  // chronological forecasting
3077  if (param->chrono_use_resident && chronoResident[param->chrono_index].size() > 0) {
3078  profileInvert.TPSTART(QUDA_PROFILE_CHRONO);
3079 
3080  auto &basis = chronoResident[param->chrono_index];
3081 
3082  ColorSpinorParam cs_param(*basis[0]);
3083  std::vector<ColorSpinorField*> Ap;
3084  ColorSpinorField *tmp = ColorSpinorField::Create(cs_param);
3085  ColorSpinorField *tmp2 = (param->chrono_precision == out->Precision()) ? out : ColorSpinorField::Create(cs_param);
3086  for (unsigned int k=0; k < basis.size(); k++) {
3087  Ap.emplace_back((ColorSpinorField::Create(cs_param)));
3088  }
3089 
3090  if (param->chrono_precision == param->cuda_prec) {
3091  for (unsigned int j=0; j<basis.size(); j++) m(*Ap[j], *basis[j], *tmp, *tmp2);
3092  } else if (param->chrono_precision == param->cuda_prec_sloppy) {
3093  for (unsigned int j=0; j<basis.size(); j++) mSloppy(*Ap[j], *basis[j], *tmp, *tmp2);
3094  } else {
3095  errorQuda("Unexpected precision %d for chrono vectors (doesn't match outer %d or sloppy precision %d)",
3096  param->chrono_precision, param->cuda_prec, param->cuda_prec_sloppy);
3097  }
3098 
3099  bool orthogonal = true;
3100  bool apply_mat = false;
3101  bool hermitian = true;
3102  MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert);
3103 
3104  blas::copy(*tmp, *in);
3105  mre(*out, *tmp, basis, Ap);
3106 
3107  for (auto ap: Ap) {
3108  if (ap) delete(ap);
3109  }
3110  delete tmp;
3111  if (tmp2 != out) delete tmp2;
3112 
3112 
3113  profileInvert.TPSTOP(QUDA_PROFILE_CHRONO);
3114 
3115 
3116  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3117  (*solve)(*out, *in);
3118  solverParam.updateInvertParam(*param);
3119  delete solve;
3120  } else { // norm_error_solve
3121  DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3122  cudaColorSpinorField tmp(*out);
3123  SolverParam solverParam(*param);
3124  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3125  (*solve)(tmp, *in); // y = (M M^\dag) b
3126  dirac.Mdag(*out, tmp); // x = M^dag y
3127  solverParam.updateInvertParam(*param);
3128  delete solve;
3129  }
3130 
3131  if (getVerbosity() >= QUDA_VERBOSE){
3132  double nx = blas::norm2(*x);
3133  printfQuda("Solution = %g\n",nx);
3134  }
3135 
3136  profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE);
3137  if (param->chrono_make_resident) {
3138  if(param->chrono_max_dim < 1){
3139  errorQuda("Cannot chrono_make_resident with chrono_max_dim %i",param->chrono_max_dim);
3140  }
3141 
3142  const int i = param->chrono_index;
3143  if (i >= QUDA_MAX_CHRONO)
3144  errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO);
3145 
3146  auto &basis = chronoResident[i];
3147 
3148  if(param->chrono_max_dim < (int)basis.size()){
3149  errorQuda("Requested chrono_max_dim %i is smaller than already existing chroology %i",param->chrono_max_dim,(int)basis.size());
3150  }
3151 
3152  if(not param->chrono_replace_last){
3153  // if we have not filled the space yet just augment
3154  if ((int)basis.size() < param->chrono_max_dim) {
3155  ColorSpinorParam cs_param(*out);
3156  cs_param.setPrecision(param->chrono_precision);
3157  basis.emplace_back(ColorSpinorField::Create(cs_param));
3158  }
3159 
3160  // shuffle every entry down one and bring the last to the front
3161  ColorSpinorField *tmp = basis[basis.size()-1];
3162  for (unsigned int j=basis.size()-1; j>0; j--) basis[j] = basis[j-1];
3163  basis[0] = tmp;
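  // e.g. with chrono_max_dim = 3: [v0, v1, v2] -> [v2, v0, v1]; the storage of the
  // oldest entry (previously v2) is then overwritten below with the new solution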
3164  }
3165  *(basis[0]) = *out; // set first entry to new solution
3166  }
3167  dirac.reconstruct(*x, *b, param->solution_type);
3168 
3169  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
3170  // rescale the solution
3171  blas::ax(sqrt(nb), *x);
3172  }
3173  profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE);
3174 
3175  if (!param->make_resident_solution) {
3176  profileInvert.TPSTART(QUDA_PROFILE_D2H);
3177  *h_x = *x;
3178  profileInvert.TPSTOP(QUDA_PROFILE_D2H);
3179  }
3180 
3181  profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE);
3182 
3183  if (param->compute_action) {
3184  Complex action = blas::cDotProduct(*b, *x);
3185  param->action[0] = action.real();
3186  param->action[1] = action.imag();
3187  }
3188 
3189  if (getVerbosity() >= QUDA_VERBOSE){
3190  double nx = blas::norm2(*x);
3191  double nh_x = blas::norm2(*h_x);
3192  printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
3193  }
3194  profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE);
3195 
3196  profileInvert.TPSTART(QUDA_PROFILE_FREE);
3197 
3198  delete h_b;
3199  delete h_x;
3200  delete b;
3201 
3202  if (param->use_resident_solution && !param->make_resident_solution) {
3203  for (auto v: solutionResident) if (v) delete v;
3204  solutionResident.clear();
3205  } else if (!param->make_resident_solution) {
3206  delete x;
3207  }
3208 
3209  delete d;
3210  delete dSloppy;
3211  delete dPre;
3212 
3213  profileInvert.TPSTOP(QUDA_PROFILE_FREE);
3214 
3215  popVerbosity();
3216 
3217  // cache is written out even if a long benchmarking job gets interrupted
3218  saveTuneCache();
3219 
3220  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
3221 
3222  profilerStop(__func__);
3223 }
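// Minimal end-to-end use of invertQuda() (hypothetical sketch; initQuda(),
// loadGaugeQuda() and most QudaInvertParam fields are elided):
//
//   QudaInvertParam inv_param = newQudaInvertParam();
//   inv_param.dslash_type = QUDA_WILSON_DSLASH;
//   inv_param.kappa = 0.12;
//   inv_param.solution_type = QUDA_MAT_SOLUTION;
//   inv_param.solve_type = QUDA_NORMOP_PC_SOLVE;
//   inv_param.inv_type = QUDA_CG_INVERTER;
//   inv_param.tol = 1e-10;
//   invertQuda(h_x, h_b, &inv_param); // h_x, h_b point to host spinor fields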
3224 
3225 
3234 void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param)
3235 {
3236  // currently this code is just a copy of invertQuda and cannot work
3238 
3239  if (!initialized) errorQuda("QUDA not initialized");
3240 
3241  pushVerbosity(param->verbosity);
3242  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(param);
3243 
3244  checkInvertParam(param, _hp_x[0], _hp_b[0]);
3245 
3246  // check the gauge fields have been created
3247  cudaGaugeField *cudaGauge = checkGauge(param);
3248 
3249  // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
3250  // solve_type and solution_type, rather than in separate members of QudaInvertParam. We're stuck with it
3251  // for now, though, so here we factorize everything for convenience.
3252 
3253  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
3254  (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
3255  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
3256  (param->solve_type == QUDA_NORMOP_PC_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE);
3257  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||
3258  (param->solution_type == QUDA_MATPC_SOLUTION);
3259  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||
3260  (param->solve_type == QUDA_DIRECT_PC_SOLVE);
3261  bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||
3262  (param->solve_type == QUDA_NORMERR_PC_SOLVE);
3263 
3264  param->secs = 0;
3265  param->gflops = 0;
3266  param->iter = 0;
3267 
3268  Dirac *d = nullptr;
3269  Dirac *dSloppy = nullptr;
3270  Dirac *dPre = nullptr;
3271 
3272  // create the dirac operator
3273  createDirac(d, dSloppy, dPre, *param, pc_solve);
3274 
3275  Dirac &dirac = *d;
3276  Dirac &diracSloppy = *dSloppy;
3277  Dirac &diracPre = *dPre;
3278 
3280 
3281  // std::vector<ColorSpinorField*> b; // Cuda Solutions
3282  // b.resize(param->num_src);
3283  // std::vector<ColorSpinorField*> x; // Cuda Solutions
3284  // x.resize(param->num_src);
3285  ColorSpinorField* in; // = nullptr;
3286  //in.resize(param->num_src);
3287  ColorSpinorField* out; // = nullptr;
3288  //out.resize(param->num_src);
3289 
3290  // for(int i=0;i < param->num_src;i++){
3291  // in[i] = nullptr;
3292  // out[i] = nullptr;
3293  // }
3294 
3295  const int *X = cudaGauge->X();
3296 
3297 
3298  // Host pointers for x, take a copy of the input host pointers
3299  void** hp_x;
3300  hp_x = new void* [ param->num_src ];
3301 
3302  void** hp_b;
3303  hp_b = new void* [param->num_src];
3304 
3305  for(int i=0;i < param->num_src;i++){
3306  hp_x[i] = _hp_x[i];
3307  hp_b[i] = _hp_b[i];
3308  }
3309 
3310  // wrap CPU host side pointers
3311  ColorSpinorParam cpuParam(hp_b[0], *param, X, pc_solution, param->input_location);
3312  std::vector<ColorSpinorField*> h_b;
3313  h_b.resize(param->num_src);
3314  for(int i=0; i < param->num_src; i++) {
3315  cpuParam.v = hp_b[i]; //MW: seems weird in the loop
3316  h_b[i] = ColorSpinorField::Create(cpuParam);
3317  }
3318 
3319  // cpuParam.v = hp_x;
3320  cpuParam.location = param->output_location;
3321  std::vector<ColorSpinorField*> h_x;
3322  h_x.resize(param->num_src);
3323 //
3324  for(int i=0; i < param->num_src; i++) {
3325  cpuParam.v = hp_x[i]; //MW: seems weird in the loop
3326  h_x[i] = ColorSpinorField::Create(cpuParam);
3327  }
3328 
3329 
3330  // MW currently checked until here
3331 
3332  // download source
3333  printfQuda("Setup b\n");
3334  ColorSpinorParam cudaParam(cpuParam, *param);
3335  cudaParam.create = QUDA_NULL_FIELD_CREATE;
3336  cudaParam.is_composite = true;
3337  cudaParam.composite_dim = param->num_src;
3338 
3339  printfQuda("Create b \n");
3340  ColorSpinorField *b = ColorSpinorField::Create(cudaParam);
3341 
3342 
3343 
3344 
3345  for(int i=0; i < param->num_src; i++) {
3346  b->Component(i) = *h_b[i];
3347  }
3348  printfQuda("Done b \n");
3349 
3350  ColorSpinorField *x;
3351  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
3352  // initial guess only supported for single-pass solvers
3353  if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) &&
3354  (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
3355  errorQuda("Initial guess not supported for two-pass solver");
3356  }
3357  cudaParam.is_composite = true;
3358  cudaParam.is_component = false;
3359  cudaParam.composite_dim = param->num_src;
3360 
3361  x = ColorSpinorField::Create(cudaParam);
3362  for(int i=0; i < param->num_src; i++) {
3363  x->Component(i) = *h_x[i];
3364  }
3365 
3366  } else { // zero initial guess
3367  // Create the solution fields filled with zero
3368  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
3369  printfQuda("Create x \n");
3370  x = ColorSpinorField::Create(cudaParam);
3371  printfQuda("Done x \n");
3372  // solution
3373  }
3374 
3376 
3377  auto * nb = new double[param->num_src];
3378  for(int i=0; i < param->num_src; i++) {
3379  nb[i] = blas::norm2(b->Component(i));
3380  printfQuda("Source %i: CPU = %g, CUDA copy = %g\n", i, nb[i], nb[i]);
3381  if (nb[i]==0.0) errorQuda("Source has zero norm");
3382 
3383  if (getVerbosity() >= QUDA_VERBOSE) {
3384  double nh_b = blas::norm2(*h_b[i]);
3385  double nh_x = blas::norm2(*h_x[i]);
3386  double nx = blas::norm2(x->Component(i));
3387  printfQuda("Source %i: CPU = %g, CUDA copy = %g\n", i, nh_b, nb[i]);
3388  printfQuda("Solution %i: CPU = %g, CUDA copy = %g\n", i, nh_x, nx);
3389  }
3390  }
3391 
3392  // MW checked until here so far
3393 
3394  // rescale the source and solution vectors to help prevent the onset of underflow
3395  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
3396  for(int i=0; i < param->num_src; i++) {
3397  blas::ax(1.0/sqrt(nb[i]), b->Component(i));
3398  blas::ax(1.0/sqrt(nb[i]), x->Component(i));
3399  }
3400  }
3401 
3402  for(int i=0; i < param->num_src; i++) {
3403  massRescale(dynamic_cast<cudaColorSpinorField&>( b->Component(i) ), *param);
3404  }
3405 
3406  // MW: need to check what dirac.prepare does
3407  // for now let's just try looping over num_rhs already here???
3408  // for(int i=0; i < param->num_src; i++) {
3409  dirac.prepare(in, out, *x, *b, param->solution_type);
3410  for(int i=0; i < param->num_src; i++) {
3411  if (getVerbosity() >= QUDA_VERBOSE) {
3412  double nin = blas::norm2((in->Component(i)));
3413  double nout = blas::norm2((out->Component(i)));
3414  printfQuda("Prepared source %i = %g\n", i, nin);
3415  printfQuda("Prepared solution %i = %g\n", i, nout);
3416  }
3417 
3418  if (getVerbosity() >= QUDA_VERBOSE) {
3419  double nin = blas::norm2(in->Component(i));
3420  printfQuda("Prepared source %i post mass rescale = %g\n", i, nin);
3421  }
3422  }
3423 
3424  // solution_type specifies *what* system is to be solved.
3425  // solve_type specifies *how* the system is to be solved.
3426  //
3427  // We have the following four cases (plus preconditioned variants):
3428  //
3429  // solution_type solve_type Effect
3430  // ------------- ---------- ------
3431  // MAT DIRECT Solve Ax=b
3432  // MATDAG_MAT DIRECT Solve A^dag y = b, followed by Ax=y
3433  // MAT NORMOP Solve (A^dag A) x = (A^dag b)
3434  // MATDAG_MAT NORMOP Solve (A^dag A) x = b
3435  // MAT NORMERR Solve (A A^dag) y = b, then x = A^dag y
3436  //
3437  // We generally require that the solution_type and solve_type
3438  // preconditioning match. As an exception, the unpreconditioned MAT
3439  // solution_type may be used with any solve_type, including
3440  // DIRECT_PC and NORMOP_PC. In these cases, preparation of the
3441  // preconditioned source and reconstruction of the full solution are
3442  // taken care of by Dirac::prepare() and Dirac::reconstruct(),
3443  // respectively.
3444 
3445  if (pc_solution && !pc_solve) {
3446  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
3447  }
3448 
3449  if (!mat_solution && !pc_solution && pc_solve) {
3450  errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
3451  }
3452 
3453  if (!mat_solution && norm_error_solve) {
3454  errorQuda("Normal-error solve requires Mat solution");
3455  }
3456 
3457  if (param->inv_type_precondition == QUDA_MG_INVERTER && (pc_solve || pc_solution || !direct_solve || !mat_solution))
3458  errorQuda("Multigrid preconditioning only supported for direct non-red-black solve");
3459 
3460  if (mat_solution && !direct_solve && !norm_error_solve) { // prepare source: b' = A^dag b
3461  for(int i=0; i < param->num_src; i++) {
3462  cudaColorSpinorField tmp(in->Component(i));
3463  dirac.Mdag(in->Component(i), tmp);
3464  }
3465  } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b
3466  DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3467  SolverParam solverParam(*param);
3468  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3469  solve->blocksolve(*out,*in);
3470  for(int i=0; i < param->num_src; i++) {
3471  blas::copy(in->Component(i), out->Component(i));
3472  }
3473  solverParam.updateInvertParam(*param);
3474  delete solve;
3475  }
3476 
3477  if (direct_solve) {
3478  DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3479  SolverParam solverParam(*param);
3480  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3481  solve->blocksolve(*out,*in);
3482  solverParam.updateInvertParam(*param);
3483  delete solve;
3484  } else if (!norm_error_solve) {
3485  DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3486  SolverParam solverParam(*param);
3487  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3488  solve->blocksolve(*out,*in);
3489  solverParam.updateInvertParam(*param);
3490  delete solve;
3491  } else { // norm_error_solve
3492  DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3493  errorQuda("norm_error_solve not supported in multi source solve");
3494  //cudaColorSpinorField tmp(*out);
3495  // SolverParam solverParam(*param);
3496  //Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3497  //(*solve)(tmp, *in); // y = (M M^\dag) b
3498  //dirac.Mdag(*out, tmp); // x = M^dag y
3499  //solverParam.updateInvertParam(*param,i,i);
3500  // delete solve;
3501  }
3502 
3503  if (getVerbosity() >= QUDA_VERBOSE){
3504  for(int i=0; i < param->num_src; i++) {
3505  double nx = blas::norm2(x->Component(i));
3506  printfQuda("Solution %i = %g\n",i, nx);
3507  }
3508  }
3509 
3510 
3512  for(int i=0; i< param->num_src; i++){
3513  dirac.reconstruct(x->Component(i), b->Component(i), param->solution_type);
3514  }
3516 
3517  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
3518  for(int i=0; i< param->num_src; i++){
3519  // rescale the solution
3520  blas::ax(sqrt(nb[i]), x->Component(i));
3521  }
3522  }
3523 
3524  // MW -- not sure how to handle that here
3525  if (!param->make_resident_solution) {
3527  for(int i=0; i< param->num_src; i++){
3528  *h_x[i] = x->Component(i);
3529  }
3531  }
3532 
3533  if (getVerbosity() >= QUDA_VERBOSE){
3534  for(int i=0; i< param->num_src; i++){
3535  double nx = blas::norm2(x->Component(i));
3536  double nh_x = blas::norm2(*h_x[i]);
3537  printfQuda("Reconstructed solution %i: CUDA = %g, CPU copy = %g\n", i, nx, nh_x);
3538  }
3539  }
3540 
3541  // FIXME: need to make sure all deletes are correct again
3542  for(int i=0; i < param->num_src; i++){
3543  delete h_x[i];
3544  // delete x[i];
3545  delete h_b[i];
3546  // delete b[i];
3547  }
3548  delete [] hp_b;
3549  delete [] hp_x;
3550 // delete [] b;
3551 // if (!param->make_resident_solution) delete x; // FIXME make this cleaner
3552 
3553  delete d;
3554  delete dSloppy;
3555  delete dPre;
3556  delete x;
3557  delete b;
3558 
3559  popVerbosity();
3560 
3561  // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted
3562  saveTuneCache();
3563 
3565 }
3566 
3579 void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
3580 {
3581  profilerStart(__func__);
3582 
3585 
3586  if (!initialized) errorQuda("QUDA not initialized");
3587 
3588  checkInvertParam(param, _hp_x[0], _hp_b);
3589 
3590  // check the gauge fields have been created
3591  checkGauge(param);
3592 
3593  if (param->num_offset > QUDA_MAX_MULTI_SHIFT)
3594  errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",
3595  param->num_offset, QUDA_MAX_MULTI_SHIFT);
3596 
3597  pushVerbosity(param->verbosity);
3598 
3599  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
3600  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);
3601  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION);
3602  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE);
3603 
3604 
3605  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3606  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3607 
3608  if (param->solution_type != QUDA_MATPC_SOLUTION) {
3609  errorQuda("For Staggered-type fermions, multi-shift solver only supports MATPC solution type");
3610  }
3611 
3612  if (param->solve_type != QUDA_DIRECT_PC_SOLVE) {
3613  errorQuda("For Staggered-type fermions, multi-shift solver only supports DIRECT_PC solve types");
3614  }
3615 
3616  } else { // Wilson type
3617 
3618  if (mat_solution) {
3619  errorQuda("For Wilson-type fermions, multi-shift solver does not support MAT or MATPC solution types");
3620  }
3621  if (direct_solve) {
3622  errorQuda("For Wilson-type fermions, multi-shift solver does not support DIRECT or DIRECT_PC solve types");
3623  }
3624  if (pc_solution && !pc_solve) {
3625  errorQuda("For Wilson-type fermions, preconditioned (PC) solution_type requires a PC solve_type");
3626  }
3627  if (!pc_solution && pc_solve) {
3628  errorQuda("For Wilson-type fermions, in multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
3629  }
3630  }
3631 
3632  // Timing and FLOP counters
3633  param->secs = 0;
3634  param->gflops = 0;
3635  param->iter = 0;
3636 
3637  for (int i=0; i<param->num_offset-1; i++) {
3638  for (int j=i+1; j<param->num_offset; j++) {
3639  if (param->offset[i] > param->offset[j])
3640  errorQuda("Offsets must be ordered from smallest to largest");
3641  }
3642  }
3643 
3644  // Host pointers for x, take a copy of the input host pointers
3645  void** hp_x;
3646  hp_x = new void* [ param->num_offset ];
3647 
3648  void* hp_b = _hp_b;
3649  for(int i=0;i < param->num_offset;i++){
3650  hp_x[i] = _hp_x[i];
3651  }
3652 
3653  // Create the matrix.
3654  // The way this works is that createDirac will create 'd' and 'dSloppy'
3655  // which are global. We then grab these with references...
3656  //
3657  // Balint: Isn't there a nice construction pattern we could use here? This is
3658  // expedient but yucky.
3659  // DiracParam diracParam;
3660  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3661  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3662  param->mass = sqrt(param->offset[0]/4);
3663  }
3664 
3665  Dirac *d = nullptr;
3666  Dirac *dSloppy = nullptr;
3667  Dirac *dPre = nullptr;
3668  Dirac *dRefine = nullptr;
3669 
3670  // create the dirac operator
3671  createDirac(d, dSloppy, dPre, dRefine, *param, pc_solve);
3672  Dirac &dirac = *d;
3673  Dirac &diracSloppy = *dSloppy;
3674 
3675 
3676  cudaColorSpinorField *b = nullptr; // Cuda RHS
3677  std::vector<ColorSpinorField*> x; // Cuda Solutions
3678  x.resize(param->num_offset);
3679  std::vector<ColorSpinorField*> p;
3680  std::unique_ptr<double[]> r2_old(new double[param->num_offset]);
3681 
3682  // Grab the dimension array of the input gauge field.
3683  const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ?
3684  gaugeFatPrecise->X() : gaugePrecise->X();
3685 
3686  // This creates a ColorSpinorParam struct, from the host data
3687  // pointer, the definitions in param, the dimensions X, and whether
3688  // the solution is on a checkerboard or not. These can
3689  // then be used as 'instructions' to create the actual
3690  // ColorSpinorField
3691  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution, param->input_location);
3692  ColorSpinorField *h_b = ColorSpinorField::Create(cpuParam);
3693 
3694  std::vector<ColorSpinorField*> h_x;
3695  h_x.resize(param->num_offset);
3696 
3697  cpuParam.location = param->output_location;
3698  for(int i=0; i < param->num_offset; i++) {
3699  cpuParam.v = hp_x[i];
3700  h_x[i] = ColorSpinorField::Create(cpuParam);
3701  }
3702 
3704  profileMulti.TPSTART(QUDA_PROFILE_H2D);
3705  // Now I need a colorSpinorParam for the device
3706  ColorSpinorParam cudaParam(cpuParam, *param);
3707  // This setting will download a host vector
3708  cudaParam.create = QUDA_COPY_FIELD_CREATE;
3709  b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it
3710  profileMulti.TPSTOP(QUDA_PROFILE_H2D);
3711 
3713  // Create the solution fields filled with zero
3714  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
3715 
3716  // now check if we need to invalidate the solutionResident vectors
3717  bool invalidate = false;
3718  for (auto v : solutionResident)
3719  if (cudaParam.Precision() != v->Precision()) { invalidate = true; break; }
3720 
3721  if (invalidate) {
3722  for (auto v : solutionResident) delete v;
3723  solutionResident.clear();
3724  }
3725 
3726  // grow resident solutions to be big enough
3727  for (int i=solutionResident.size(); i < param->num_offset; i++) {
3728  solutionResident.push_back(new cudaColorSpinorField(cudaParam));
3729  }
3730  for (int i=0; i < param->num_offset; i++) x[i] = solutionResident[i];
3731 
3733 
3734 
3736 
3737  // Check source norms
3738  double nb = blas::norm2(*b);
3739  if (nb==0.0) errorQuda("Source has zero norm");
3740 
3741  if(getVerbosity() >= QUDA_VERBOSE ) {
3742  double nh_b = blas::norm2(*h_b);
3743  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
3744  }
3745 
3746  // rescale the source vector to help prevent the onset of underflow
3747  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
3748  blas::ax(1.0/sqrt(nb), *b);
3749  }
3750 
3751  massRescale(*b, *param);
3753 
3754  DiracMatrix *m, *mSloppy;
3755 
3756  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3757  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3758  m = new DiracM(dirac);
3759  mSloppy = new DiracM(diracSloppy);
3760  } else {
3761  m = new DiracMdagM(dirac);
3762  mSloppy = new DiracMdagM(diracSloppy);
3763  }
3764 
3765  SolverParam solverParam(*param);
3766  MultiShiftCG cg_m(*m, *mSloppy, solverParam, profileMulti);
3767  cg_m(x, *b, p, r2_old.get());
3768  solverParam.updateInvertParam(*param);
3769 
3770  delete m;
3771  delete mSloppy;
3772 
3773  if (param->compute_true_res) {
3774  // check each shift has the desired tolerance and use sequential CG to refine
3776  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
3777  cudaColorSpinorField r(*b, cudaParam);
3779  QudaInvertParam refineparam = *param;
3780  refineparam.cuda_prec_sloppy = param->cuda_prec_refinement_sloppy;
3781  Dirac &dirac = *d;
3782  Dirac &diracSloppy = *dRefine;
3783 
3784 #define REFINE_INCREASING_MASS
3785 #ifdef REFINE_INCREASING_MASS
3786  for(int i=0; i < param->num_offset; i++) {
3787 #else
3788  for(int i=param->num_offset-1; i >= 0; i--) {
3789 #endif
3790  double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
3791  param->true_res_hq_offset[i] : 0;
3792  double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
3793  param->tol_hq_offset[i] : 0;
3794 
3795  /*
3796  In the case where the shifted systems have zero tolerance
3797  specified, we refine these systems until either the limit of
3798  precision is reached (prec_tol) or until the tolerance reaches
3799  the iterated residual tolerance of the previous multi-shift
3800  solver (iter_res_offset[i]), whichever is greater.
3801  */
3802  const double prec_tol = std::pow(10.,(-2*(int)param->cuda_prec+4)); // implicit refinement limit of 1e-12
3803  const double iter_tol = (param->iter_res_offset[i] < prec_tol ? prec_tol : (param->iter_res_offset[i] *1.1));
3804  const double refine_tol = (param->tol_offset[i] == 0.0 ? iter_tol : param->tol_offset[i]);
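 // Worked example (illustrative, not in the original source): with
 // cuda_prec = QUDA_DOUBLE_PRECISION (enum value 8), prec_tol =
 // 10^(-2*8+4) = 1e-12; with QUDA_SINGLE_PRECISION (4), prec_tol =
 // 10^(-2*4+4) = 1e-4. So a shift requested with tol_offset[i] == 0.0
 // is refined down to max(prec_tol, 1.1 * iter_res_offset[i]).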
3805  // refine if either L2 or heavy quark residual tolerances have not been met, only if desired residual is > 0
3806  if (param->true_res_offset[i] > refine_tol || rsd_hq > tol_hq) {
3807  if (getVerbosity() >= QUDA_SUMMARIZE)
3808  printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
3809  i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);
3810 
3811  // for staggered the shift is just a change in mass term (FIXME: for twisted mass also)
3812  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3813  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3814  dirac.setMass(sqrt(param->offset[i]/4));
3815  diracSloppy.setMass(sqrt(param->offset[i]/4));
3816  }
3817 
3818  DiracMatrix *m, *mSloppy;
3819 
3820  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3821  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3822  m = new DiracM(dirac);
3823  mSloppy = new DiracM(diracSloppy);
3824  } else {
3825  m = new DiracMdagM(dirac);
3826  mSloppy = new DiracMdagM(diracSloppy);
3827  }
3828 
3829  // need to curry in the shift if we are not doing staggered
3830  if (param->dslash_type != QUDA_ASQTAD_DSLASH &&
3831  param->dslash_type != QUDA_STAGGERED_DSLASH) {
3832  m->shift = param->offset[i];
3833  mSloppy->shift = param->offset[i];
3834  }
3835 
3836  if (false) { // experimenting with Minimum residual extrapolation
3837  // only perform MRE using current and previously refined solutions
3838 #ifdef REFINE_INCREASING_MASS
3839  const int nRefine = i+1;
3840 #else
3841  const int nRefine = param->num_offset - i + 1;
3842 #endif
3843 
3844  std::vector<ColorSpinorField*> q;
3845  q.resize(nRefine);
3846  std::vector<ColorSpinorField*> z;
3847  z.resize(nRefine);
3848  cudaParam.create = QUDA_NULL_FIELD_CREATE;
3849  cudaColorSpinorField tmp(cudaParam);
3850 
3851  for(int j=0; j < nRefine; j++) {
3852  q[j] = new cudaColorSpinorField(cudaParam);
3853  z[j] = new cudaColorSpinorField(cudaParam);
3854  }
3855 
3856  *z[0] = *x[0]; // zero solution already solved
3857 #ifdef REFINE_INCREASING_MASS
3858  for (int j=1; j<nRefine; j++) *z[j] = *x[j];
3859 #else
3860  for (int j=1; j<nRefine; j++) *z[j] = *x[param->num_offset-j];
3861 #endif
3862 
3863  bool orthogonal = true;
3864  bool apply_mat = true;
3865  bool hermitian = true;
3866  MinResExt mre(*m, orthogonal, apply_mat, hermitian, profileMulti);
3867  blas::copy(tmp, *b);
3868  mre(*x[i], tmp, z, q);
3869 
3870  for(int j=0; j < nRefine; j++) {
3871  delete q[j];
3872  delete z[j];
3873  }
3874  }
3875 
3876  SolverParam solverParam(refineparam);
3877  solverParam.iter = 0;
3878  solverParam.use_init_guess = QUDA_USE_INIT_GUESS_YES;
3879  solverParam.tol = (param->tol_offset[i] > 0.0 ? param->tol_offset[i] : iter_tol); // set L2 tolerance
3880  solverParam.tol_hq = param->tol_hq_offset[i]; // set heavy quark tolerance
3881  solverParam.delta = param->reliable_delta_refinement;
3882 
3883  {
3884  CG cg(*m, *mSloppy, solverParam, profileMulti);
3885  if (i==0)
3886  cg(*x[i], *b, p[i], r2_old[i]);
3887  else
3888  cg(*x[i], *b);
3889  }
3890 
3891  solverParam.true_res_offset[i] = solverParam.true_res;
3892  solverParam.true_res_hq_offset[i] = solverParam.true_res_hq;
3893  solverParam.updateInvertParam(*param,i);
3894 
3895  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3896  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3897  dirac.setMass(sqrt(param->offset[0]/4)); // restore just in case
3898  diracSloppy.setMass(sqrt(param->offset[0]/4)); // restore just in case
3899  }
3900 
3901  delete m;
3902  delete mSloppy;
3903 
3904  }
3905  }
3906  }
3907 
3908  // restore shifts -- avoid side effects
3909  for(int i=0; i < param->num_offset; i++) {
3910  param->offset[i] = unscaled_shifts[i];
3911  }
3912 
3913  profileMulti.TPSTART(QUDA_PROFILE_D2H);
3914 
3915  if (param->compute_action) {
3916  Complex action(0);
3917  for (int i=0; i<param->num_offset; i++) action += param->residue[i] * blas::cDotProduct(*b, *x[i]);
3918  param->action[0] = action.real();
3919  param->action[1] = action.imag();
3920  }
3921 
3922  for(int i=0; i < param->num_offset; i++) {
3923  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) { // rescale the solution
3924  blas::ax(sqrt(nb), *x[i]);
3925  }
3926 
3927  if (getVerbosity() >= QUDA_VERBOSE){
3928  double nx = blas::norm2(*x[i]);
3929  printfQuda("Solution %d = %g\n", i, nx);
3930  }
3931 
3932  if (!param->make_resident_solution) *h_x[i] = *x[i];
3933  }
3934  profileMulti.TPSTOP(QUDA_PROFILE_D2H);
3935 
3937 
3938  if (!param->make_resident_solution) {
3939  for (auto v: solutionResident) if (v) delete v;
3940  solutionResident.clear();
3941  }
3942 
3944 
3946  for(int i=0; i < param->num_offset; i++){
3947  delete h_x[i];
3948  //if (!param->make_resident_solution) delete x[i];
3949  }
3950 
3951  delete h_b;
3952  delete b;
3953 
3954  delete [] hp_x;
3955 
3956  delete d;
3957  delete dSloppy;
3958  delete dPre;
3959  delete dRefine;
3960  for (auto& pp : p) delete pp;
3961 
3963 
3964  popVerbosity();
3965 
3966  // cache is written out even if a long benchmarking job gets interrupted
3967  saveTuneCache();
3968 
3970 
3971  profilerStop(__func__);
3972 }
3973 
3974 void computeKSLinkQuda(void* fatlink, void* longlink, void* ulink, void* inlink, double *path_coeff, QudaGaugeParam *param) {
3975 
3976 #ifdef GPU_FATLINK
3979 
3980  checkGaugeParam(param);
3981 
3982  if (ulink) {
3983  const double unitarize_eps = 1e-14;
3984  const double max_error = 1e-10;
3985  const int reunit_allow_svd = 1;
3986  const int reunit_svd_only = 0;
3987  const double svd_rel_error = 1e-6;
3988  const double svd_abs_error = 1e-6;
3989  quda::setUnitarizeLinksConstants(unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only,
3990  svd_rel_error, svd_abs_error);
3991  }
3992 
3993  GaugeFieldParam gParam(fatlink, *param, QUDA_GENERAL_LINKS);
3994  cpuGaugeField cpuFatLink(gParam); // create the host fatlink
3995  gParam.gauge = longlink;
3996  cpuGaugeField cpuLongLink(gParam); // create the host longlink
3997  gParam.gauge = ulink;
3998  cpuGaugeField cpuUnitarizedLink(gParam);
3999  gParam.link_type = param->type;
4000  gParam.gauge = inlink;
4001  cpuGaugeField cpuInLink(gParam); // create the host sitelink
4002 
4003  // create the device fields
4004  gParam.reconstruct = param->reconstruct;
4005  gParam.setPrecision(param->cuda_prec, true);
4006  gParam.create = QUDA_NULL_FIELD_CREATE;
4007  cudaGaugeField *cudaInLink = new cudaGaugeField(gParam);
4008 
4010 
4012  cudaInLink->loadCPUField(cpuInLink);
4014 
4015  cudaGaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink);
4016 
4018  delete cudaInLink;
4020 
4021  gParam.create = QUDA_ZERO_FIELD_CREATE;
4022  gParam.link_type = QUDA_GENERAL_LINKS;
4024  gParam.setPrecision(param->cuda_prec, true);
4026  cudaGaugeField *cudaFatLink = new cudaGaugeField(gParam);
4027  cudaGaugeField *cudaUnitarizedLink = ulink ? new cudaGaugeField(gParam) : nullptr;
4028  cudaGaugeField *cudaLongLink = longlink ? new cudaGaugeField(gParam) : nullptr;
4029 
4031  fatLongKSLink(cudaFatLink, cudaLongLink, *cudaInLinkEx, path_coeff);
4033 
4034  if (ulink) {
4036  *num_failures_h = 0;
4037  quda::unitarizeLinks(*cudaUnitarizedLink, *cudaFatLink, num_failures_d); // unitarize on the gpu
4038  if (*num_failures_h>0) errorQuda("Error in unitarization component of the hisq fattening: %d failures\n", *num_failures_h);
4040  }
4041 
4043  if (ulink) cudaUnitarizedLink->saveCPUField(cpuUnitarizedLink);
4044  if (fatlink) cudaFatLink->saveCPUField(cpuFatLink);
4045  if (longlink) cudaLongLink->saveCPUField(cpuLongLink);
4047 
4049  delete cudaFatLink;
4050  if (longlink) delete cudaLongLink;
4051  if (ulink) delete cudaUnitarizedLink;
4052  delete cudaInLinkEx;
4054 
4056 #else
4057  errorQuda("Fat-link has not been built");
4058 #endif // GPU_FATLINK
4059 }
4060 
4061 static int getGaugePadding(GaugeFieldParam& param) {
4062  int pad = 0;
4063 #ifdef MULTI_GPU
4064  int volume = param.x[0]*param.x[1]*param.x[2]*param.x[3];
4065  int face_size[4];
4066  for(int dir=0; dir<4; ++dir) face_size[dir] = (volume/param.x[dir])/2;
4067  pad = *std::max_element(face_size, face_size+4);
4068 #endif
4069 
4070  return pad;
4071 }
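 // Illustrative check of the rule above (not in the original source): for a
 // local volume of 16^3 x 32, the per-parity face sizes (volume/x[dir])/2 are
 // {4096, 4096, 4096, 2048}, so the returned pad is 4096.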
4072 
4073 int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int* path_length,
4074  double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam)
4075 {
4076 #ifdef GPU_GAUGE_FORCE
4079 
4080  checkGaugeParam(qudaGaugeParam);
4081 
4082  GaugeFieldParam gParam(siteLink, *qudaGaugeParam);
4083  gParam.site_offset = qudaGaugeParam->gauge_offset;
4084  gParam.site_size = qudaGaugeParam->site_size;
4085  cpuGaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new cpuGaugeField(gParam) : nullptr;
4086 
4087  cudaGaugeField* cudaSiteLink = nullptr;
4088 
4089  if (qudaGaugeParam->use_resident_gauge) {
4090  if (!gaugePrecise) errorQuda("No resident gauge field to use");
4091  cudaSiteLink = gaugePrecise;
4092  } else {
4093  gParam.create = QUDA_NULL_FIELD_CREATE;
4094  gParam.reconstruct = qudaGaugeParam->reconstruct;
4095  gParam.order = (qudaGaugeParam->reconstruct == QUDA_RECONSTRUCT_NO ||
4096  qudaGaugeParam->cuda_prec == QUDA_DOUBLE_PRECISION) ?
4097  QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;
4098 
4099  cudaSiteLink = new cudaGaugeField(gParam);
4101 
4103  cudaSiteLink->loadCPUField(*cpuSiteLink);
4105 
4107  }
4108 
4109  GaugeFieldParam gParamMom(mom, *qudaGaugeParam, QUDA_ASQTAD_MOM_LINKS);
4110  // FIXME - test program always uses MILC for mom but can use QDP for gauge
4111  if (gParamMom.order == QUDA_QDP_GAUGE_ORDER) gParamMom.order = QUDA_MILC_GAUGE_ORDER;
4112  if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
4113  else gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
4114 
4115  gParamMom.site_offset = qudaGaugeParam->mom_offset;
4116  gParamMom.site_size = qudaGaugeParam->site_size;
4117  cpuGaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? new cpuGaugeField(gParamMom) : nullptr;
4118 
4119  cudaGaugeField* cudaMom = nullptr;
4120  if (qudaGaugeParam->use_resident_mom) {
4121  if (!momResident) errorQuda("No resident momentum field to use");
4122  cudaMom = momResident;
4123  if (qudaGaugeParam->overwrite_mom) cudaMom->zero();
4125  } else {
4126  gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
4127  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
4128  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
4129  gParamMom.setPrecision(qudaGaugeParam->cuda_prec, true);
4130  gParamMom.create = QUDA_ZERO_FIELD_CREATE;
4131  cudaMom = new cudaGaugeField(gParamMom);
4133  if (!qudaGaugeParam->overwrite_mom) {
4135  cudaMom->loadCPUField(*cpuMom);
4137  }
4138  }
4139 
4140  cudaGaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugeForce);
4141 
4142  // actually do the computation
4144  if (!forceMonitor()) {
4145  gaugeForce(*cudaMom, *cudaGauge, eb3, input_path_buf, path_length, loop_coeff, num_paths, max_length);
4146  } else {
4147  // if we are monitoring the force, separate the force computation from the momentum update
4148  GaugeFieldParam gParam(*cudaMom);
4149  gParam.create = QUDA_ZERO_FIELD_CREATE;
4150  GaugeField *force = GaugeField::Create(gParam);
4151  gaugeForce(*force, *cudaGauge, 1.0, input_path_buf, path_length, loop_coeff, num_paths, max_length);
4152  updateMomentum(*cudaMom, eb3, *force, "gauge");
4153  delete force;
4154  }
4156 
4157  if (qudaGaugeParam->return_result_mom) {
4159  cudaMom->saveCPUField(*cpuMom);
4161  }
4162 
4164  if (qudaGaugeParam->make_resident_gauge) {
4165  if (gaugePrecise && gaugePrecise != cudaSiteLink) delete gaugePrecise;
4166  gaugePrecise = cudaSiteLink;
4167  } else {
4168  delete cudaSiteLink;
4169  }
4170 
4171  if (qudaGaugeParam->make_resident_mom) {
4172  if (momResident && momResident != cudaMom) delete momResident;
4173  momResident = cudaMom;
4174  } else {
4175  delete cudaMom;
4176  }
4177 
4178  if (cpuSiteLink) delete cpuSiteLink;
4179  if (cpuMom) delete cpuMom;
4180 
4181  if (qudaGaugeParam->make_resident_gauge) {
4182  if (extendedGaugeResident) delete extendedGaugeResident;
4183  extendedGaugeResident = cudaGauge;
4184  } else {
4185  delete cudaGauge;
4186  }
4188 
4190 
4191  checkCudaError();
4192 #else
4193  errorQuda("Gauge force has not been built");
4194 #endif // GPU_GAUGE_FORCE
4195  return 0;
4196 }
4197 
4198 void createCloverQuda(QudaInvertParam* invertParam)
4199 {
4201  if (!cloverPrecise) errorQuda("Clover field not allocated");
4202 
4203  QudaReconstructType recon = (gaugePrecise->Reconstruct() == QUDA_RECONSTRUCT_8) ? QUDA_RECONSTRUCT_12 : gaugePrecise->Reconstruct();
4204  // for clover we optimize to only send depth 1 halos in y/z/t (FIXME - make work for x, make robust in general)
4205  int R[4];
4206  for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d));
4207  cudaGaugeField *gauge = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profileClover, false, recon);
4208 
4210  // create the Fmunu field
4211  GaugeFieldParam tensorParam(gaugePrecise->X(), gauge->Precision(), QUDA_RECONSTRUCT_NO, 0, QUDA_TENSOR_GEOMETRY);
4212  tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
4213  tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4214  tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
4215  cudaGaugeField Fmunu(tensorParam);
4217 
4219  computeFmunu(Fmunu, *gauge);
4220  computeClover(*cloverPrecise, Fmunu, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION);
4222 
4224 
4225  // FIXME always preserve the extended gauge
4226  extendedGaugeResident = gauge;
4227 }
4228 
4229 void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
4230 {
4231  GaugeFieldParam gParam(gauge, *param, QUDA_GENERAL_LINKS);
4232  gParam.geometry = static_cast<QudaFieldGeometry>(geometry);
4233  if (geometry != QUDA_SCALAR_GEOMETRY && geometry != QUDA_VECTOR_GEOMETRY)
4234  errorQuda("Only scalar and vector geometries are supported\n");
4235 
4236  cpuGaugeField *cpuGauge = nullptr;
4237  if (gauge) cpuGauge = new cpuGaugeField(gParam);
4238 
4239  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4240  gParam.create = QUDA_ZERO_FIELD_CREATE;
4241  auto* cudaGauge = new cudaGaugeField(gParam);
4242 
4243  if (gauge) {
4244  cudaGauge->loadCPUField(*cpuGauge);
4245  delete cpuGauge;
4246  }
4247 
4248  return cudaGauge;
4249 }
4250 
4251 
4252 void saveGaugeFieldQuda(void* gauge, void* inGauge, QudaGaugeParam* param){
4253 
4254  auto* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);
4255 
4256  GaugeFieldParam gParam(gauge, *param, QUDA_GENERAL_LINKS);
4257  gParam.geometry = cudaGauge->Geometry();
4258 
4259  cpuGaugeField cpuGauge(gParam);
4260  cudaGauge->saveCPUField(cpuGauge);
4261 
4262 }
4263 
4264 
4265 void destroyGaugeFieldQuda(void* gauge){
4266  auto* g = reinterpret_cast<cudaGaugeField*>(gauge);
4267  delete g;
4268 }
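 // Typical lifecycle of the three calls above (illustrative sketch; the
 // host_gauge pointer and filled-in gauge_param are hypothetical and assumed
 // to come from the caller as usual):
 //
 //   void *dev_gauge = createGaugeFieldQuda(host_gauge, QUDA_VECTOR_GEOMETRY, &gauge_param);
 //   // ... operate on the device copy ...
 //   saveGaugeFieldQuda(host_gauge, dev_gauge, &gauge_param); // copy back to host
 //   destroyGaugeFieldQuda(dev_gauge); // release the device allocation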
4269 
4270 
4271 void computeStaggeredForceQuda(void* h_mom, double dt, double delta, void *h_force, void **x,
4272  QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
4273 {
4276 
4277  GaugeFieldParam gParam(h_mom, *gauge_param, QUDA_ASQTAD_MOM_LINKS);
4278 
4279  // create the host momentum field
4280  gParam.reconstruct = gauge_param->reconstruct;
4281  gParam.t_boundary = QUDA_PERIODIC_T;
4282  cpuGaugeField cpuMom(gParam);
4283 
4284  // create the host force field
4285  gParam.link_type = QUDA_GENERAL_LINKS;
4286  gParam.gauge = h_force;
4287  cpuGaugeField cpuForce(gParam);
4288 
4289  // create the device momentum field
4291  gParam.create = QUDA_ZERO_FIELD_CREATE; // FIXME
4292  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4294  cudaGaugeField *cudaMom = !gauge_param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr;
4295 
4296  // create temporary field for quark-field outer product
4298  gParam.link_type = QUDA_GENERAL_LINKS;
4299  gParam.create = QUDA_ZERO_FIELD_CREATE;
4300  cudaGaugeField cudaForce(gParam);
4301  GaugeField *cudaForce_[2] = {&cudaForce};
4302 
4303  ColorSpinorParam qParam;
4305  qParam.nColor = 3;
4306  qParam.nSpin = 1;
4309  qParam.nDim = 5; // 5 since staggered mrhs
4310  qParam.setPrecision(gParam.Precision());
4311  qParam.pad = 0;
4312  for(int dir=0; dir<4; ++dir) qParam.x[dir] = gParam.x[dir];
4313  qParam.x[4] = 1;
4314  qParam.create = QUDA_NULL_FIELD_CREATE;
4317 
4320 
4321  if (gauge_param->use_resident_mom) {
4322  if (!momResident) errorQuda("Cannot use resident momentum field since none appears resident");
4323  cudaMom = momResident;
4324  } else {
4325  // download the initial momentum (FIXME make an option just to return?)
4326  cudaMom->loadCPUField(cpuMom);
4327  }
4328 
4329  // resident gauge field is required
4330  if (!gauge_param->use_resident_gauge || !gaugePrecise)
4331  errorQuda("Resident gauge field is required");
4332 
4333  if (!gaugePrecise->StaggeredPhaseApplied()) {
4334  errorQuda("Gauge field requires the staggered phase factors to be applied");
4335  }
4336 
4337  // check if staggered phase is the desired one
4338  if (gauge_param->staggered_phase_type != gaugePrecise->StaggeredPhase()) {
4339  errorQuda("Requested staggered phase %d, but found %d\n",
4340  gauge_param->staggered_phase_type, gaugePrecise->StaggeredPhase());
4341  }
4342 
4345 
4346  const int nvector = inv_param->num_offset;
4347  std::vector<ColorSpinorField*> X(nvector);
4348  for ( int i=0; i<nvector; i++) X[i] = ColorSpinorField::Create(qParam);
4349 
4350  if (inv_param->use_resident_solution) {
4351  if (solutionResident.size() < (unsigned int)nvector)
4352  errorQuda("solutionResident.size() %lu does not match number of shifts %d",
4353  solutionResident.size(), nvector);
4354  }
4355 
4356  // create the staggered operator
4357  DiracParam diracParam;
4358  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
4359  (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE);
4360  if (!pc_solve)
4361  errorQuda("Preconditioned solve type required not %d\n", inv_param->solve_type);
4362  setDiracParam(diracParam, inv_param, pc_solve);
4363  Dirac *dirac = Dirac::create(diracParam);
4364 
4367 
4368  for (int i=0; i<nvector; i++) {
4369  ColorSpinorField &x = *(X[i]);
4370 
4371  if (inv_param->use_resident_solution) x.Even() = *(solutionResident[i]);
4372  else errorQuda("%s requires resident solution", __func__);
4373 
4374  // set the odd solution component
4375  dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY);
4376  }
4377 
4380 
4381 #if 0
4382  if (inv_param->use_resident_solution) {
4383  for (auto v : solutionResident) if (v) delete v;
4384  solutionResident.clear();
4385  }
4386 #endif
4387  delete dirac;
4388 
4391 
4392  // compute quark-field outer product
4393  for (int i=0; i<nvector; i++) {
4394  ColorSpinorField &x = *(X[i]);
4395  // second component is zero since we have no three hop term
4396  double coeff[2] = {inv_param->residue[i], 0.0};
4397 
4398  // Operate on even-parity sites
4399  computeStaggeredOprod(cudaForce_, x, coeff, 1);
4400  }
4401 
4402  // mom += delta * [U * force]TA
4403  applyU(cudaForce, *gaugePrecise);
4404  updateMomentum(*cudaMom, dt * delta, cudaForce, "staggered");
4406 
4409 
4410  if (gauge_param->return_result_mom) {
4411  // copy the momentum field back to the host
4412  cudaMom->saveCPUField(cpuMom);
4413  }
4414 
4415  if (gauge_param->make_resident_mom) {
4416  // make the momentum field resident
4417  momResident = cudaMom;
4418  } else {
4419  delete cudaMom;
4420  }
4421 
4424 
4425  for (int i=0; i<nvector; i++) delete X[i];
4426 
4429 
4430  checkCudaError();
4431 }
4432 
4433 void computeHISQForceQuda(void* const milc_momentum,
4434  double dt,
4435  const double level2_coeff[6],
4436  const double fat7_coeff[6],
4437  const void* const w_link,
4438  const void* const v_link,
4439  const void* const u_link,
4440  void **fermion,
4441  int num_terms,
4442  int num_naik_terms,
4443  double **coeff,
4444  QudaGaugeParam* gParam)
4445 {
4446 #ifdef GPU_STAGGERED_OPROD
4447  using namespace quda;
4448  using namespace quda::fermion_force;
4450  if (gParam->gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported input field order %d", gParam->gauge_order);
4451 
4452  checkGaugeParam(gParam);
4453 
4455 
4456  // create the device outer-product field
4457  GaugeFieldParam oParam(0, *gParam, QUDA_GENERAL_LINKS);
4458  oParam.nFace = 0;
4459  oParam.create = QUDA_ZERO_FIELD_CREATE;
4460  oParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4461  cudaGaugeField *stapleOprod = new cudaGaugeField(oParam);
4462  cudaGaugeField *oneLinkOprod = new cudaGaugeField(oParam);
4463  cudaGaugeField *naikOprod = new cudaGaugeField(oParam);
4464 
4465  {
4466  // default settings for the unitarization
4467  const double unitarize_eps = 1e-14;
4468  const double hisq_force_filter = 5e-5;
4469  const double max_det_error = 1e-10;
4470  const bool allow_svd = true;
4471  const bool svd_only = false;
4472  const double svd_rel_err = 1e-8;
4473  const double svd_abs_err = 1e-8;
4474 
4475  setUnitarizeForceConstants(unitarize_eps, hisq_force_filter, max_det_error, allow_svd, svd_only, svd_rel_err, svd_abs_err);
4476  }
4477 
4478  double act_path_coeff[6] = {0,1,level2_coeff[2],level2_coeff[3],level2_coeff[4],level2_coeff[5]};
4479  // You have to look at the MILC routine to understand the following
4480  // Basically, I have already absorbed the one-link coefficient
4481 
4482  GaugeFieldParam param(milc_momentum, *gParam, QUDA_ASQTAD_MOM_LINKS);
4483  //param.nFace = 0;
4484  param.order = QUDA_MILC_GAUGE_ORDER;
4487  cpuGaugeField* cpuMom = (!gParam->use_resident_mom) ? new cpuGaugeField(param) : nullptr;
4488 
4489  param.link_type = QUDA_GENERAL_LINKS;
4491  param.gauge = (void*)w_link;
4492  cpuGaugeField cpuWLink(param);
4493  param.gauge = (void*)v_link;
4494  cpuGaugeField cpuVLink(param);
4495  param.gauge = (void*)u_link;
4496  cpuGaugeField cpuULink(param);
4497 
4502  GaugeFieldParam momParam(param);
4503 
4505  param.link_type = QUDA_GENERAL_LINKS;
4506  param.setPrecision(gParam->cpu_prec, true);
4507 
4509  for (int dir=0; dir<4; ++dir) {
4510  param.x[dir] += 2*R[dir];
4511  param.r[dir] = R[dir];
4512  }
4513 
4516  param.setPrecision(gParam->cpu_prec);
4518 
4520 
4521  { // do outer-product computation
4522  ColorSpinorParam qParam;
4523  qParam.nColor = 3;
4524  qParam.nSpin = 1;
4527  qParam.nDim = 4;
4528  qParam.setPrecision(oParam.Precision());
4529  qParam.pad = 0;
4530  for (int dir=0; dir<4; ++dir) qParam.x[dir] = oParam.x[dir];
4531 
4532  // create the device quark field
4533  qParam.create = QUDA_NULL_FIELD_CREATE;
4535  cudaColorSpinorField cudaQuark(qParam);
4536 
4537  // create the host quark field
4540  qParam.v = fermion[0];
4541 
4542  { // regular terms
4543  GaugeField *oprod[2] = {stapleOprod, naikOprod};
4544 
4545  // loop over different quark fields
4546  for(int i=0; i<num_terms; ++i){
4547 
4548  // Wrap the MILC quark field
4550  qParam.v = fermion[i];
4551  cpuColorSpinorField cpuQuark(qParam); // create host quark field
4553 
4555  cudaQuark = cpuQuark;
4557 
4559  computeStaggeredOprod(oprod, cudaQuark, coeff[i], 3);
4561  }
4562  }
4563 
4564  { // naik terms
4565  oneLinkOprod->copy(*stapleOprod);
4566  ax(level2_coeff[0], *oneLinkOprod);
4567  GaugeField *oprod[2] = {oneLinkOprod, naikOprod};
4568 
4569  // loop over different quark fields
4570  for(int i=0; i<num_naik_terms; ++i){
4571 
4572  // Wrap the MILC quark field
4574  qParam.v = fermion[i + num_terms - num_naik_terms];
4575  cpuColorSpinorField cpuQuark(qParam); // create host quark field
4577 
4579  cudaQuark = cpuQuark;
4581 
4583  computeStaggeredOprod(oprod, cudaQuark, coeff[i + num_terms], 3);
4585  }
4586  }
4587  }
4588 
4590  cudaGaugeField* cudaInForce = new cudaGaugeField(param);
4591  copyExtendedGauge(*cudaInForce, *stapleOprod, QUDA_CUDA_FIELD_LOCATION);
4592  delete stapleOprod;
4593 
4594  cudaGaugeField* cudaOutForce = new cudaGaugeField(param);
4595  copyExtendedGauge(*cudaOutForce, *oneLinkOprod, QUDA_CUDA_FIELD_LOCATION);
4596  delete oneLinkOprod;
4597 
4598  cudaGaugeField* cudaGauge = new cudaGaugeField(param);
4600 
4601  cudaGauge->loadCPUField(cpuWLink, profileHISQForce);
4602 
4603  cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true);
4604  cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true);
4605  cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true);
4606 
4608  hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaGauge, act_path_coeff);
4610 
4611  // Load naik outer product
4612  copyExtendedGauge(*cudaInForce, *naikOprod, QUDA_CUDA_FIELD_LOCATION);
4613  cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true);
4614  delete naikOprod;
4615 
4616  // Compute Naik three-link term
4618  hisqLongLinkForce(*cudaOutForce, *cudaInForce, *cudaGauge, act_path_coeff[1]);
4620 
4621  cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true);
4622 
4623  // load v-link
4624  cudaGauge->loadCPUField(cpuVLink, profileHISQForce);
4625  cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true);
4626 
4628  *num_failures_h = 0;
4629  unitarizeForce(*cudaInForce, *cudaOutForce, *cudaGauge, num_failures_d);
4631 
4632  if (*num_failures_h>0) errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h);
4633 
4634  cudaMemset(cudaOutForce->Gauge_p(), 0, cudaOutForce->Bytes());
4635 
4636  // read in u-link
4637  cudaGauge->loadCPUField(cpuULink, profileHISQForce);
4638  cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true);
4639 
4640  // Compute Fat7-staple term
4642  hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaGauge, fat7_coeff);
4644 
4645  delete cudaInForce;
4646  cudaGaugeField* cudaMom = new cudaGaugeField(momParam);
4647 
4649  hisqCompleteForce(*cudaOutForce, *cudaGauge);
4651 
4652  if (gParam->use_resident_mom) {
4653  if (!momResident) errorQuda("No resident momentum field to use");
4654  updateMomentum(*momResident, dt, *cudaOutForce, "hisq");
4655  } else {
4656  updateMomentum(*cudaMom, dt, *cudaOutForce, "hisq");
4657  }
4658 
4659  if (gParam->return_result_mom) {
4660  // Close the paths, make anti-hermitian, and store in compressed format
4661  cudaMom->saveCPUField(*cpuMom, profileHISQForce);
4662  }
4663 
4665 
4666  if (cpuMom) delete cpuMom;
4667 
4668  if (!gParam->make_resident_mom) {
4669  delete momResident;
4670  momResident = nullptr;
4671  }
4672  if (cudaMom) delete cudaMom;
4673  delete cudaOutForce;
4674  delete cudaGauge;
4676 
4678 
4679 #else
4680  errorQuda("HISQ force has not been built");
4681 #endif
4682 }
4683 
4684 void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **h_p,
4685  double *coeff, double kappa2, double ck,
4686  int nvector, double multiplicity, void *gauge,
4687  QudaGaugeParam *gauge_param, QudaInvertParam *inv_param) {
4688 
4689  using namespace quda;
4692 
4693  checkGaugeParam(gauge_param);
4694  if (!gaugePrecise) errorQuda("No resident gauge field");
4695 
4696  GaugeFieldParam fParam(h_mom, *gauge_param, QUDA_ASQTAD_MOM_LINKS);
4697  // create the host momentum field
4699  fParam.order = gauge_param->gauge_order;
4700  cpuGaugeField cpuMom(fParam);
4701 
4702  // create the device momentum field
4703  fParam.create = QUDA_ZERO_FIELD_CREATE;
4704  fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4705  cudaGaugeField cudaMom(fParam);
4706 
4707  // create the device force field
4708  fParam.link_type = QUDA_GENERAL_LINKS;
4709  fParam.create = QUDA_ZERO_FIELD_CREATE;
4710  fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4712  cudaGaugeField cudaForce(fParam);
4713 
4714  ColorSpinorParam qParam;
4716  qParam.nColor = 3;
4717  qParam.nSpin = 4;
4720  qParam.nDim = 4;
4721  qParam.setPrecision(fParam.Precision());
4722  qParam.pad = 0;
4723  for(int dir=0; dir<4; ++dir) qParam.x[dir] = fParam.x[dir];
4724 
4725  // create the device quark field
4726  qParam.create = QUDA_NULL_FIELD_CREATE;
4729 
4730  std::vector<ColorSpinorField*> quarkX, quarkP;
4731  for (int i=0; i<nvector; i++) {
4732  quarkX.push_back(ColorSpinorField::Create(qParam));
4733  quarkP.push_back(ColorSpinorField::Create(qParam));
4734  }
4735 
4736  qParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
4737  qParam.x[0] /= 2;
4738  cudaColorSpinorField tmp(qParam);
4739 
4740  // create the host quark field
4743  qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // need to expose this to the interface
4744 
4745  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
4746  (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE);
4747  DiracParam diracParam;
4748  setDiracParam(diracParam, inv_param, pc_solve);
4749  diracParam.tmp1 = &tmp; // use as temporary for dirac->M
4750  Dirac *dirac = Dirac::create(diracParam);
4751 
4752  if (inv_param->use_resident_solution) {
4753  if (solutionResident.size() < (unsigned int)nvector)
4754  errorQuda("solutionResident.size() %lu does not match number of shifts %d",
4755  solutionResident.size(), nvector);
4756  }
4757 
4759 
4760  // create oprod and trace fields
4761  fParam.geometry = QUDA_TENSOR_GEOMETRY;
4762  cudaGaugeField oprod(fParam);
4763 
4766 
4767  std::vector<double> force_coeff(nvector);
4768  // loop over different quark fields
4769  for(int i=0; i<nvector; i++){
4770  ColorSpinorField &x = *(quarkX[i]);
4771  ColorSpinorField &p = *(quarkP[i]);
4772 
4773  if (!inv_param->use_resident_solution) {
4774  // for downloading x_e
4775  qParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
4776  qParam.x[0] /= 2;
4777 
4778  // Wrap the even-parity MILC quark field
4781  qParam.v = h_x[i];
4782  cpuColorSpinorField cpuQuarkX(qParam); // create host quark field
4784 
4786  x.Even() = cpuQuarkX;
4788 
4790  gamma5(x.Even(), x.Even());
4791  } else {
4792  x.Even() = *(solutionResident[i]);
4793  }
4794 
4795  dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY);
4796  dirac->M(p.Even(), x.Even());
4797  dirac->Dagger(QUDA_DAG_YES);
4798  dirac->Dslash(p.Odd(), p.Even(), QUDA_ODD_PARITY);
4799  dirac->Dagger(QUDA_DAG_NO);
4800 
4801  gamma5(x, x);
4802  gamma5(p, p);
4803 
4804  force_coeff[i] = 2.0*dt*coeff[i]*kappa2;
4805  }
4806 
4807  computeCloverForce(cudaForce, *gaugePrecise, quarkX, quarkP, force_coeff);
4808 
4809  // In double precision the clover derivative is faster with no reconstruct
4810  cudaGaugeField *u = &gaugeEx;
4811  if (gaugeEx.Reconstruct() == QUDA_RECONSTRUCT_12 && gaugeEx.Precision() == QUDA_DOUBLE_PRECISION) {
4812  GaugeFieldParam param(gaugeEx);
4814  u = new cudaGaugeField(param);
4815  u -> copy(gaugeEx);
4816  }
4817 
4818  computeCloverSigmaTrace(oprod, *cloverPrecise, 2.0*ck*multiplicity*dt);
4819 
4820  /* Now the U dA/dU terms */
4821  std::vector< std::vector<double> > ferm_epsilon(nvector);
4822  for (int shift = 0; shift < nvector; shift++) {
4823  ferm_epsilon[shift].resize(2); // resize (not reserve) so the two assignments below index valid elements
4824  ferm_epsilon[shift][0] = 2.0*ck*coeff[shift]*dt;
4825  ferm_epsilon[shift][1] = -kappa2 * 2.0*ck*coeff[shift]*dt;
4826  }
4827 
4828  computeCloverSigmaOprod(oprod, quarkX, quarkP, ferm_epsilon);
4829 
4831 
4832  cudaGaugeField *oprodEx = createExtendedGauge(oprod, R, profileCloverForce);
4833 
4834  cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_ODD_PARITY);
4835  cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_EVEN_PARITY);
4836 
4837  if (u != &gaugeEx) delete u;
4838 
4839  updateMomentum(cudaMom, -1.0, cudaForce, "clover");
4841 
4842  // copy the outer product field back to the host
4844  cudaMom.saveCPUField(cpuMom);
4846 
4848 
4849  for (int i=0; i<nvector; i++) {
4850  delete quarkX[i];
4851  delete quarkP[i];
4852  }
4853 
4854 #if 0
4855  if (inv_param->use_resident_solution) {
4856  for (auto v : solutionResident) if (v) delete v;
4857  solutionResident.clear();
4858  }
4859 #endif
4860  delete dirac;
4862 
4863  checkCudaError();
4865 }
4866 
4867 
4868 
4869 void updateGaugeFieldQuda(void* gauge,
4870  void* momentum,
4871  double dt,
4872  int conj_mom,
4873  int exact,
4874  QudaGaugeParam* param)
4875 {
4877 
4878  checkGaugeParam(param);
4879 
4881 
4882  // create the host fields
4883  GaugeFieldParam gParam(gauge, *param, QUDA_SU3_LINKS);
4884  gParam.site_offset = param->gauge_offset;
4885  gParam.site_size = param->site_size;
4886  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
4887  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;
4888 
4889  GaugeFieldParam gParamMom(momentum, *param);
4890  gParamMom.reconstruct = (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ?
4891  QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10;
4892  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
4893  gParamMom.site_offset = param->mom_offset;
4894  gParamMom.site_size = param->site_size;
4895  cpuGaugeField *cpuMom = !param->use_resident_mom ? new cpuGaugeField(gParamMom) : nullptr;
4896 
4897  // create the device fields
4898  gParam.create = QUDA_NULL_FIELD_CREATE;
4899  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4903  gParam.pad = 0;
4904  cudaGaugeField *cudaMom = !param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr;
4905 
4906  gParam.link_type = QUDA_SU3_LINKS;
4907  gParam.reconstruct = param->reconstruct;
4908  cudaGaugeField *cudaInGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr;
4909  auto *cudaOutGauge = new cudaGaugeField(gParam);
4910 
4912 
4914 
4915  if (!param->use_resident_gauge) { // load fields onto the device
4916  cudaInGauge->loadCPUField(*cpuGauge);
4917  } else { // or use resident fields already present
4918  if (!gaugePrecise) errorQuda("No resident gauge field allocated");
4919  cudaInGauge = gaugePrecise;
4920  gaugePrecise = nullptr;
4921  }
4922 
4923  if (!param->use_resident_mom) {
4924  cudaMom->loadCPUField(*cpuMom);
4925  } else {
4926  if (!momResident) errorQuda("No resident mom field allocated");
4927  cudaMom = momResident;
4928  momResident = nullptr;
4929  }
4930 
4932 
4933  // perform the update
4935  updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom,
4936  (bool)conj_mom, (bool)exact);
4938 
4939  if (param->return_result_gauge) {
4940  // copy the gauge field back to the host
4942  cudaOutGauge->saveCPUField(*cpuGauge);
4944  }
4945 
4947  if (param->make_resident_gauge) {
4948  if (gaugePrecise != nullptr) delete gaugePrecise;
4949  gaugePrecise = cudaOutGauge;
4950  } else {
4951  delete cudaOutGauge;
4952  }
4953 
4954  if (param->make_resident_mom) {
4955  if (momResident != nullptr && momResident != cudaMom) delete momResident;
4956  momResident = cudaMom;
4957  } else {
4958  delete cudaMom;
4959  }
4960 
4961  delete cudaInGauge;
4962  if (cpuMom) delete cpuMom;
4963  if (cpuGauge) delete cpuGauge;
4965 
4966  checkCudaError();
4967 
4969 }
4970 
4971  void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) {
4973 
4975  checkGaugeParam(param);
4976 
4977  // create the gauge field
4978  GaugeFieldParam gParam(gauge_h, *param, QUDA_GENERAL_LINKS);
4979  gParam.site_offset = param->gauge_offset;
4980  gParam.site_size = param->site_size;
4981  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
4982  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;
4983 
4984  // create the device fields
4985  gParam.create = QUDA_NULL_FIELD_CREATE;
4986  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4987  gParam.reconstruct = param->reconstruct;
4988  cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr;
4990 
4991  if (param->use_resident_gauge) {
4992  if (!gaugePrecise) errorQuda("No resident gauge field to use");
4993  cudaGauge = gaugePrecise;
4994  } else {
4996  cudaGauge->loadCPUField(*cpuGauge);
4998  }
4999 
5001  *num_failures_h = 0;
5002 
5003  // project onto SU(3)
5004  projectSU3(*cudaGauge, tol, num_failures_d);
5005 
5007 
5008  if(*num_failures_h>0)
5009  errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
5010 
5012  if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge);
5014 
5015  if (param->make_resident_gauge) {
5016  if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise;
5017  gaugePrecise = cudaGauge;
5018  } else {
5019  delete cudaGauge;
5020  }
5021 
5023  if (cpuGauge) delete cpuGauge;
5025 
5027  }
5028 
5029  void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) {
5031 
5033  checkGaugeParam(param);
5034 
5035  // create the gauge field
5036  GaugeFieldParam gParam(gauge_h, *param, QUDA_GENERAL_LINKS);
5037  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
5038  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;
5039 
5040  // create the device fields
5041  gParam.create = QUDA_NULL_FIELD_CREATE;
5042  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
5043  gParam.reconstruct = param->reconstruct;
5044  cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr;
5046 
5047  if (param->use_resident_gauge) {
5048  if (!gaugePrecise) errorQuda("No resident gauge field to use");
5049  cudaGauge = gaugePrecise;
5050  } else {
5051  profilePhase.TPSTART(QUDA_PROFILE_H2D);
5052  cudaGauge->loadCPUField(*cpuGauge);
5053  profilePhase.TPSTOP(QUDA_PROFILE_H2D);
5054  }
5055 
5057  *num_failures_h = 0;
5058 
5059  // apply / remove phase as appropriate
5060  if (!cudaGauge->StaggeredPhaseApplied()) cudaGauge->applyStaggeredPhase();
5061  else cudaGauge->removeStaggeredPhase();
5062 
5064 
5065  profilePhase.TPSTART(QUDA_PROFILE_D2H);
5066  if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge);
5067  profilePhase.TPSTOP(QUDA_PROFILE_D2H);
5068 
5069  if (param->make_resident_gauge) {
5070  if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise;
5071  gaugePrecise = cudaGauge;
5072  } else {
5073  delete cudaGauge;
5074  }
5075 
5077  if (cpuGauge) delete cpuGauge;
5079 
5081  }
5082 
5083 // evaluate the momentum action
5084 double momActionQuda(void* momentum, QudaGaugeParam* param)
5085 {
5087 
5089  checkGaugeParam(param);
5090 
5091  // create the momentum fields
5092  GaugeFieldParam gParam(momentum, *param, QUDA_ASQTAD_MOM_LINKS);
5093  gParam.reconstruct = (gParam.order == QUDA_TIFR_GAUGE_ORDER || gParam.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ?
5094  QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10;
5095 
5096  cpuGaugeField *cpuMom = !param->use_resident_mom ? new cpuGaugeField(gParam) : nullptr;
5097 
5098  // create the device fields
5099  gParam.create = QUDA_NULL_FIELD_CREATE;
5100  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
5102 
5103  cudaGaugeField *cudaMom = !param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr;
5104 
5106 
5108  if (!param->use_resident_mom) {
5109  cudaMom->loadCPUField(*cpuMom);
5110  } else {
5111  if (!momResident) errorQuda("No resident mom field allocated");
5112  cudaMom = momResident;
5113  }
5115 
5116  // perform the update
5118  double action = computeMomAction(*cudaMom);
5120 
5122  if (param->make_resident_mom) {
5123  if (momResident != nullptr && momResident != cudaMom) delete momResident;
5124  momResident = cudaMom;
5125  } else {
5126  delete cudaMom;
5127  momResident = nullptr;
5128  }
5129  if (cpuMom) {
5130  delete cpuMom;
5131  }
5132 
5134 
5135  checkCudaError();
5136 
5138  return action;
5139 }
5140 
5141 /*
5142  The following functions are for the Fortran interface.
5143 */
5144 
5145 void init_quda_(int *dev) { initQuda(*dev); }
5146 void init_quda_device_(int *dev) { initQudaDevice(*dev); }
5147 void init_quda_memory_() { initQudaMemory(); }
5148 void end_quda_() { endQuda(); }
5149 void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param) { loadGaugeQuda(h_gauge, param); }
5150 void free_gauge_quda_() { freeGaugeQuda(); }
5151 void free_sloppy_gauge_quda_() { freeSloppyGaugeQuda(); }
5152 void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
5153 { loadCloverQuda(h_clover, h_clovinv, inv_param); }
5154 void free_clover_quda_(void) { freeCloverQuda(); }
5155 void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
5156  QudaParity *parity) { dslashQuda(h_out, h_in, inv_param, *parity); }
5157 void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
5158  QudaParity *parity, int *inverse) { cloverQuda(h_out, h_in, inv_param, *parity, *inverse); }
5159 void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
5160 { MatQuda(h_out, h_in, inv_param); }
5161 void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
5162 { MatDagMatQuda(h_out, h_in, inv_param); }
5163 void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param) {
5164  fflush(stdout);
5165  // ensure that fifth dimension is set to 1
5166  if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) param->Ls = 1;
5167  invertQuda(hp_x, hp_b, param);
5168  fflush(stdout);
5169 }
5170 
5171 void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param) {
5172  // ensure that fifth dimension is set to 1
5173  if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) param->Ls = 1;
5174 
5175  if (!gaugePrecise) errorQuda("Resident gauge field not allocated");
5176 
5177  // get data into array of pointers
5178  int nSpin = (param->dslash_type == QUDA_STAGGERED_DSLASH || param->dslash_type == QUDA_ASQTAD_DSLASH) ? 1 : 4;
5179 
5180  // compute offset assuming TIFR padded ordering (FIXME)
5181  if (param->dirac_order != QUDA_TIFR_PADDED_DIRAC_ORDER)
5182  errorQuda("Fortran multi-shift solver presently only supports QUDA_TIFR_PADDED_DIRAC_ORDER");
5183 
5184  const int *X = gaugePrecise->X();
5185  size_t cb_offset = (X[0]/2) * X[1] * (X[2] + 4) * X[3] * gaugePrecise->Ncolor() * nSpin * 2 * param->cpu_prec;
5186  void *hp_x[QUDA_MAX_MULTI_SHIFT];
5187  for (int i=0; i<param->num_offset; i++) hp_x[i] = static_cast<char*>(h_x) + i*cb_offset;
5188 
5189  invertMultiShiftQuda(hp_x, hp_b, param);
5190 }
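 // Illustrative offset check (not in the original source): with the TIFR
 // padding of 4 sites in z used above, a local 16^3 x 32 Wilson field
 // (nSpin = 4, Ncolor = 3) in double precision (cpu_prec = 8 bytes) gives a
 // per-solution checkerboard stride of
 //   (16/2) * 16 * (16+4) * 32 * 3 * 4 * 2 * 8 = 15728640 bytes.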
5191 
5193 
5194 void register_pinned_quda_(void *ptr, size_t *bytes) {
5195  cudaHostRegister(ptr, *bytes, cudaHostRegisterDefault);
5196  checkCudaError();
5197 }
5198 
5199 void unregister_pinned_quda_(void *ptr) {
5200  cudaHostUnregister(ptr);
5201  checkCudaError();
5202 }
5203 
5204 void new_quda_gauge_param_(QudaGaugeParam *param) {
5205  *param = newQudaGaugeParam();
5206 }
5207 void new_quda_invert_param_(QudaInvertParam *param) {
5208  *param = newQudaInvertParam();
5209 }
5210 
5211 void update_gauge_field_quda_(void *gauge, void *momentum, double *dt,
5212  bool *conj_mom, bool *exact,
5213  QudaGaugeParam *param) {
5214  updateGaugeFieldQuda(gauge, momentum, *dt, (int)*conj_mom, (int)*exact, param);
5215 }
5216 
5217 static inline int opp(int dir) { return 7-dir; }
5218 
5219 static void createGaugeForcePaths(int **paths, int dir, int num_loop_types){
5220 
5221  int index=0;
5222  // Plaquette paths
5223  if (num_loop_types >= 1)
5224  for(int i=0; i<4; ++i){
5225  if(i==dir) continue;
5226  paths[index][0] = i; paths[index][1] = opp(dir); paths[index++][2] = opp(i);
5227  paths[index][0] = opp(i); paths[index][1] = opp(dir); paths[index++][2] = i;
5228  }
5229 
5230  // Rectangle Paths
5231  if (num_loop_types >= 2)
5232  for(int i=0; i<4; ++i){
5233  if(i==dir) continue;
5234  paths[index][0] = paths[index][1] = i; paths[index][2] = opp(dir); paths[index][3] = paths[index][4] = opp(i);
5235  index++;
5236  paths[index][0] = paths[index][1] = opp(i); paths[index][2] = opp(dir); paths[index][3] = paths[index][4] = i;
5237  index++;
5238  paths[index][0] = dir; paths[index][1] = i; paths[index][2] = paths[index][3] = opp(dir); paths[index][4] = opp(i);
5239  index++;
5240  paths[index][0] = dir; paths[index][1] = opp(i); paths[index][2] = paths[index][3] = opp(dir); paths[index][4] = i;
5241  index++;
5242  paths[index][0] = i; paths[index][1] = paths[index][2] = opp(dir); paths[index][3] = opp(i); paths[index][4] = dir;
5243  index++;
5244  paths[index][0] = opp(i); paths[index][1] = paths[index][2] = opp(dir); paths[index][3] = i; paths[index][4] = dir;
5245  index++;
5246  }
5247 
5248  if (num_loop_types >= 3) {
5249  // Staple paths
5250  for(int i=0; i<4; ++i){
5251  for(int j=0; j<4; ++j){
5252  if(i==dir || j==dir || i==j) continue;
5253  paths[index][0] = i; paths[index][1] = j; paths[index][2] = opp(dir); paths[index][3] = opp(i); paths[index][4] = opp(j);
5254  index++;
5255  paths[index][0] = i; paths[index][1] = opp(j); paths[index][2] = opp(dir); paths[index][3] = opp(i); paths[index][4] = j;
5256  index++;
5257  paths[index][0] = opp(i); paths[index][1] = j; paths[index][2] = opp(dir); paths[index][3] = i; paths[index][4] = opp(j);
5258  index++;
5259  paths[index][0] = opp(i); paths[index][1] = opp(j); paths[index][2] = opp(dir); paths[index][3] = i; paths[index][4] = j;
5260  index++;
5261  }
5262  }
5263  }
5264 
5265 }
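 // Illustrative output (not in the original source): with dir = 0 (x) and
 // i = 1 (y), the plaquette section above emits the two length-3 completions
 // {1, 7, 6} and {6, 7, 1}, i.e. +y,-x,-y and -y,-x,+y, where opp(d) = 7 - d
 // encodes the negative d direction.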
5266 
5267 void compute_gauge_force_quda_(void *mom, void *gauge, int *num_loop_types, double *coeff, double *dt,
5268  QudaGaugeParam *param) {
5269 
5270  int numPaths = 0;
5271  switch (*num_loop_types) {
5272  case 1:
5273  numPaths = 6;
5274  break;
5275  case 2:
5276  numPaths = 24;
5277  break;
5278  case 3:
5279  numPaths = 48;
5280  break;
5281  default:
5282  errorQuda("Invalid num_loop_types = %d\n", *num_loop_types);
5283  }
5284 
5285  auto *loop_coeff = static_cast<double*>(safe_malloc(numPaths*sizeof(double)));
5286  int *path_length = static_cast<int*>(safe_malloc(numPaths*sizeof(int)));
5287 
5288  if (*num_loop_types >= 1) for(int i= 0; i< 6; ++i) {
5289  loop_coeff[i] = coeff[0];
5290  path_length[i] = 3;
5291  }
5292  if (*num_loop_types >= 2) for(int i= 6; i<24; ++i) {
5293  loop_coeff[i] = coeff[1];
5294  path_length[i] = 5;
5295  }
5296  if (*num_loop_types >= 3) for(int i=24; i<48; ++i) {
5297  loop_coeff[i] = coeff[2];
5298  path_length[i] = 5;
5299  }
5300 
5301  int** input_path_buf[4];
5302  for(int dir=0; dir<4; ++dir){
5303  input_path_buf[dir] = static_cast<int**>(safe_malloc(numPaths*sizeof(int*)));
5304  for(int i=0; i<numPaths; ++i){
5305  input_path_buf[dir][i] = static_cast<int*>(safe_malloc(path_length[i]*sizeof(int)));
5306  }
5307  createGaugeForcePaths(input_path_buf[dir], dir, *num_loop_types);
5308  }
5309 
5310  int max_length = 6;
5311 
5312  computeGaugeForceQuda(mom, gauge, input_path_buf, path_length, loop_coeff, numPaths, max_length, *dt, param);
5313 
5314  for(auto & dir : input_path_buf){
5315  for(int i=0; i<numPaths; ++i) host_free(dir[i]);
5316  host_free(dir);
5317  }
5318 
5319  host_free(path_length);
5320  host_free(loop_coeff);
5321 }
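// Editorial note: a hedged calling sketch (not part of the QUDA source).
// With num_loop_types = 2 the wrapper builds 24 paths per direction: the 6
// three-link plaquette staples weighted by coeff[0] and the 18 five-link
// rectangle staples weighted by coeff[1]; coeff[2] is only read when
// num_loop_types = 3. Here c0 and c1 are hypothetical action coefficients.
//
//   int num_loop_types = 2;
//   double coeff[3] = {c0, c1, 0.0};
//   double dt = 0.01;
//   compute_gauge_force_quda_(mom, gauge, &num_loop_types, coeff, &dt, &param);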
5322 
5323 void compute_staggered_force_quda_(void* h_mom, double *dt, double *delta, void *gauge, void *x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param) {
5324  computeStaggeredForceQuda(h_mom, *dt, *delta, gauge, (void**)x, gauge_param, inv_param);
5325 }
5326 
5327 // apply the staggered phases
5328 void apply_staggered_phase_quda_() {
5329  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("applying staggered phase\n");
5330  if (gaugePrecise) {
5331  gaugePrecise->applyStaggeredPhase();
5332  } else {
5333  errorQuda("No persistent gauge field");
5334  }
5335 }
5336 
5337 // remove the staggered phases
5338 void remove_staggered_phase_quda_() {
5339  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("removing staggered phase\n");
5340  if (gaugePrecise) {
5341  gaugePrecise->removeStaggeredPhase();
5342  } else {
5343  errorQuda("No persistent gauge field");
5344  }
5345  qudaDeviceSynchronize();
5346 }
5347 
5348 // evaluate the kinetic term
5349 void kinetic_quda_(double *kin, void* momentum, QudaGaugeParam* param) {
5350  *kin = momActionQuda(momentum, param);
5351 }
5352 
5353 
5354 /**
5355  * BQCD wants a node mapping with x varying fastest.
5356  */
5357 #ifdef MULTI_GPU
5358 static int bqcd_rank_from_coords(const int *coords, void *fdata)
5359 {
5360  int *dims = static_cast<int *>(fdata);
5361 
5362  int rank = coords[3];
5363  for (int i = 2; i >= 0; i--) {
5364  rank = dims[i] * rank + coords[i];
5365  }
5366  return rank;
5367 }
5368 #endif
5369 
5370 void comm_set_gridsize_(int *grid)
5371 {
5372 #ifdef MULTI_GPU
5373  initCommsGridQuda(4, grid, bqcd_rank_from_coords, static_cast<void *>(grid));
5374 #endif
5375 }
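// Editorial note: worked example of the BQCD rank mapping above. The loop
// unrolls to
//   rank = ((coords[3]*dims[2] + coords[2])*dims[1] + coords[1])*dims[0] + coords[0]
// so the x coordinate varies fastest: with dims = {2,2,2,2}, coords
// {1,0,0,0} maps to rank 1, while {0,0,0,1} maps to rank 8.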
5376 
5377 /**
5378  * Temporary function exposed for TIFR benchmarking
5379  */
5380 void set_kernel_pack_t_(int* pack)
5381 {
5382  bool pack_ = *pack ? true : false;
5383  setKernelPackT(pack_);
5384 }
5385 
5386 void gaussGaugeQuda(unsigned long long seed, double sigma)
5387 {
5388  profileGauss.TPSTART(QUDA_PROFILE_TOTAL);
5389 
5390  if (!gaugePrecise) errorQuda("Cannot generate Gauss GaugeField as there is no resident gauge field");
5391 
5392  cudaGaugeField *data = gaugePrecise;
5393 
5394  GaugeFieldParam param(*data);
5397  cudaGaugeField u(param);
5398 
5399  profileGauss.TPSTART(QUDA_PROFILE_COMPUTE);
5400  quda::gaugeGauss(*data, seed, sigma);
5401  profileGauss.TPSTOP(QUDA_PROFILE_COMPUTE);
5402 
5403  if (extendedGaugeResident) {
5404  *extendedGaugeResident = *gaugePrecise;
5405  extendedGaugeResident->exchangeExtendedGhost(R, profileGauss, redundant_comms);
5406  }
5407 
5408  profileGauss.TPSTOP(QUDA_PROFILE_TOTAL);
5409 }
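// Editorial note: a hedged usage sketch (not part of the QUDA source). A gauge
// field must already be resident (e.g. via loadGaugeQuda) before Gaussian-
// distributed links can be generated in place; seed and sigma are illustrative.
//
//   loadGaugeQuda(h_gauge, &gauge_param);
//   gaussGaugeQuda(1234ULL /* seed */, 0.5 /* sigma */);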
5410 
5411 
5412 /*
5413  * Computes the total, spatial and temporal plaquette averages of the loaded gauge configuration.
5414  */
5415 void plaq_quda_(double plaq[3]) {
5416  plaqQuda(plaq);
5417 }
5418 
5419 void plaqQuda(double plaq[3])
5420 {
5421  profilePlaq.TPSTART(QUDA_PROFILE_TOTAL);
5422 
5423  if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field");
5424 
5425  cudaGaugeField *data = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq);
5426  extendedGaugeResident = data;
5427 
5428  profilePlaq.TPSTART(QUDA_PROFILE_COMPUTE);
5429  double3 plaq3 = quda::plaquette(*data);
5430  plaq[0] = plaq3.x;
5431  plaq[1] = plaq3.y;
5432  plaq[2] = plaq3.z;
5433  profilePlaq.TPSTOP(QUDA_PROFILE_COMPUTE);
5434 
5435  profilePlaq.TPSTOP(QUDA_PROFILE_TOTAL);
5436 }
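// Editorial note: a hedged usage sketch (not part of the QUDA source),
// unpacking the three averages in the order documented above (total,
// spatial, temporal):
//
//   double plaq[3];
//   plaqQuda(plaq);
//   printfQuda("plaquette: total %e spatial %e temporal %e\n", plaq[0], plaq[1], plaq[2]);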
5437 
5438 /*
5439  * Performs a deep copy from the internal extendedGaugeResident field.
5440  */
5441 void copyExtendedResidentGaugeQuda(void* resident_gauge, QudaFieldLocation loc)
5442 {
5443  //profilePlaq.TPSTART(QUDA_PROFILE_TOTAL);
5444 
5445  if (!gaugePrecise) errorQuda("Cannot perform deep copy of resident gauge field as there is no resident gauge field");
5446 
5447  cudaGaugeField *data = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq);
5448  extendedGaugeResident = data;
5449 
5450  auto* io_gauge = (cudaGaugeField*)resident_gauge;
5451 
5452  copyExtendedGauge(*io_gauge, *extendedGaugeResident, loc);
5453 
5454  //profilePlaq.TPSTOP(QUDA_PROFILE_TOTAL);
5455 }
5456 
5457 void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int nSteps, double alpha)
5458 {
5459  profileWuppertal.TPSTART(QUDA_PROFILE_TOTAL);
5460 
5461  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
5462 
5463  pushVerbosity(inv_param->verbosity);
5464  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
5465 
5466  cudaGaugeField *precise = nullptr;
5467 
5468  if (gaugeSmeared != nullptr) {
5469  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Wuppertal smearing done with gaugeSmeared\n");
5470  GaugeFieldParam gParam(*gaugePrecise);
5471  gParam.create = QUDA_NULL_FIELD_CREATE;
5472  precise = new cudaGaugeField(gParam);
5473  copyExtendedGauge(*precise, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION);
5474  precise->exchangeGhost();
5475  } else {
5476  if (getVerbosity() >= QUDA_VERBOSE)
5477  printfQuda("Wuppertal smearing done with gaugePrecise\n");
5478  precise = gaugePrecise;
5479  }
5480 
5481  ColorSpinorParam cpuParam(h_in, *inv_param, precise->X(), false, inv_param->input_location);
5482  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
5483 
5484  ColorSpinorParam cudaParam(cpuParam, *inv_param);
5485  cudaColorSpinorField in(*in_h, cudaParam);
5486 
5487  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
5488  double cpu = blas::norm2(*in_h);
5489  double gpu = blas::norm2(in);
5490  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
5491  }
5492 
5493  cudaParam.create = QUDA_NULL_FIELD_CREATE;
5494  cudaColorSpinorField out(in, cudaParam);
5495  int parity = 0;
5496 
5497  for (unsigned int i=0; i<nSteps; i++) {
5498  if(i) in = out;
5499  wuppertalStep(out, in, parity, *precise, alpha);
5500  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
5501  double norm = blas::norm2(out);
5502  printfQuda("Step %d, vector norm %e\n", i, norm);
5503  }
5504  }
5505 
5506  cpuParam.v = h_out;
5507  cpuParam.location = inv_param->output_location;
5508  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
5509  *out_h = out;
5510 
5511  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
5512  double cpu = blas::norm2(*out_h);
5513  double gpu = blas::norm2(out);
5514  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
5515  }
5516 
5517  if (gaugeSmeared != nullptr)
5518  delete precise;
5519 
5520  delete out_h;
5521  delete in_h;
5522 
5523  popVerbosity();
5524 
5525  profileWuppertal.TPSTOP(QUDA_PROFILE_TOTAL);
5526 }
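// Editorial note: a hedged usage sketch (not part of the QUDA source). h_in
// and h_out are host spinor buffers laid out according to inv_param; as the
// branch above shows, a resident gaugeSmeared field is used in preference to
// gaugePrecise. The step count and alpha are illustrative.
//
//   performWuppertalnStep(h_out, h_in, &inv_param, 50, 0.25);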
5527 
5528 void performAPEnStep(unsigned int nSteps, double alpha)
5529 {
5530  profileAPE.TPSTART(QUDA_PROFILE_TOTAL);
5531 
5532  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
5533 
5534  if (gaugeSmeared != nullptr) delete gaugeSmeared;
5535  gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileAPE);
5536 
5537  GaugeFieldParam gParam(*gaugeSmeared);
5538  auto *cudaGaugeTemp = new cudaGaugeField(gParam);
5539 
5540  double3 plaq = plaquette(*gaugeSmeared);
5541  if (getVerbosity() >= QUDA_SUMMARIZE) {
5542  printfQuda("Plaquette after 0 APE steps: %le %le %le\n", plaq.x, plaq.y, plaq.z);
5543  }
5544 
5545  for (unsigned int i=0; i<nSteps; i++) {
5546  cudaGaugeTemp->copy(*gaugeSmeared);
5547  cudaGaugeTemp->exchangeExtendedGhost(R,profileAPE,redundant_comms);
5548  APEStep(*gaugeSmeared, *cudaGaugeTemp, alpha);
5549  }
5550 
5551  delete cudaGaugeTemp;
5552 
5553  gaugeSmeared->exchangeExtendedGhost(R,redundant_comms);
5554 
5555  plaq = plaquette(*gaugeSmeared);
5556  if (getVerbosity() >= QUDA_SUMMARIZE) {
5557  printfQuda("Plaquette after %d APE steps: %le %le %le\n", nSteps, plaq.x, plaq.y, plaq.z);
5558  }
5559 
5560  profileAPE.TPSTOP(QUDA_PROFILE_TOTAL);
5561 }
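// Editorial note: a hedged usage sketch (not part of the QUDA source).
// Smearing replaces gaugeSmeared, which downstream observables such as
// qChargeQuda() then use in preference to the unsmeared field; nSteps and
// alpha are illustrative.
//
//   loadGaugeQuda(h_gauge, &gauge_param);
//   performAPEnStep(20, 0.6);
//   double Q = qChargeQuda();   // measured on the smeared field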
5562 
5563 void performSTOUTnStep(unsigned int nSteps, double rho)
5564 {
5565  profileSTOUT.TPSTART(QUDA_PROFILE_TOTAL);
5566 
5567  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
5568 
5569  if (gaugeSmeared != nullptr) delete gaugeSmeared;
5570  gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileSTOUT);
5571 
5572  GaugeFieldParam gParam(*gaugeSmeared);
5573  auto *cudaGaugeTemp = new cudaGaugeField(gParam);
5574 
5575  double3 plaq = plaquette(*gaugeSmeared);
5576  if (getVerbosity() >= QUDA_SUMMARIZE) {
5577  printfQuda("Plaquette after 0 STOUT steps: %le %le %le\n", plaq.x, plaq.y, plaq.z);
5578  }
5579 
5580  for (unsigned int i=0; i<nSteps; i++) {
5581  cudaGaugeTemp->copy(*gaugeSmeared);
5582  cudaGaugeTemp->exchangeExtendedGhost(R,profileSTOUT,redundant_comms);
5583  STOUTStep(*gaugeSmeared, *cudaGaugeTemp, rho);
5584  }
5585 
5586  delete cudaGaugeTemp;
5587 
5588  gaugeSmeared->exchangeExtendedGhost(R,redundant_comms);
5589 
5590  plaq = plaquette(*gaugeSmeared);
5591  if (getVerbosity() >= QUDA_SUMMARIZE) {
5592  printfQuda("Plaquette after %d STOUT steps: %le %le %le\n", nSteps, plaq.x, plaq.y, plaq.z);
5593  }
5594 
5595  profileSTOUT.TPSTOP(QUDA_PROFILE_TOTAL);
5596 }
5597 
5598 void performOvrImpSTOUTnStep(unsigned int nSteps, double rho, double epsilon)
5599 {
5600  profileOvrImpSTOUT.TPSTART(QUDA_PROFILE_TOTAL);
5601 
5602  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
5603 
5604  if (gaugeSmeared != nullptr) delete gaugeSmeared;
5605  gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileSTOUT);
5606 
5607  GaugeFieldParam gParam(*gaugeSmeared);
5608  auto *cudaGaugeTemp = new cudaGaugeField(gParam);
5609 
5610  double3 plaq = plaquette(*gaugeSmeared);
5611  if (getVerbosity() >= QUDA_SUMMARIZE) {
5612  printfQuda("Plaquette after 0 OvrImpSTOUT steps: %le %le %le\n", plaq.x, plaq.y, plaq.z);
5613  }
5614 
5615  for (unsigned int i=0; i<nSteps; i++) {
5616  cudaGaugeTemp->copy(*gaugeSmeared);
5617  cudaGaugeTemp->exchangeExtendedGhost(R,profileOvrImpSTOUT,redundant_comms);
5618  OvrImpSTOUTStep(*gaugeSmeared, *cudaGaugeTemp, rho, epsilon);
5619  }
5620 
5621  delete cudaGaugeTemp;
5622 
5623  gaugeSmeared->exchangeExtendedGhost(R,redundant_comms);
5624 
5625  plaq = plaquette(*gaugeSmeared);
5626  if (getVerbosity() >= QUDA_SUMMARIZE) {
5627  printfQuda("Plaquette after %d OvrImpSTOUT steps: %le %le %le\n", nSteps, plaq.x, plaq.y, plaq.z);
5628  }
5629 
5630  profileOvrImpSTOUT.TPSTOP(QUDA_PROFILE_TOTAL);
5631 }
5632 
5633 
5634 int computeGaugeFixingOVRQuda(void* gauge, const unsigned int gauge_dir, const unsigned int Nsteps, \
5635  const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, \
5636  const unsigned int stopWtheta, QudaGaugeParam* param , double* timeinfo)
5637 {
5638 
5639  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_TOTAL);
5640 
5641  checkGaugeParam(param);
5642 
5644  GaugeFieldParam gParam(gauge, *param);
5645  auto *cpuGauge = new cpuGaugeField(gParam);
5646 
5647  //gParam.pad = getFatLinkPadding(param->X);
5648  gParam.create = QUDA_NULL_FIELD_CREATE;
5649  gParam.link_type = param->type;
5650  gParam.reconstruct = param->reconstruct;
5651  gParam.order = (gParam.Precision() == QUDA_DOUBLE_PRECISION || gParam.reconstruct == QUDA_RECONSTRUCT_NO ) ?
5652  QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;
5653  auto *cudaInGauge = new cudaGaugeField(gParam);
5654 
5656 
5658 
5659 
5660  //if (!param->use_resident_gauge) { // load fields onto the device
5661  cudaInGauge->loadCPUField(*cpuGauge);
5662  /* } else { // or use resident fields already present
5663  if (!gaugePrecise) errorQuda("No resident gauge field allocated");
5664  cudaInGauge = gaugePrecise;
5665  gaugePrecise = nullptr;
5666  } */
5667 
5669 
5670  checkCudaError();
5671 
5672  if (comm_size() == 1) {
5673  // perform the update
5675  gaugefixingOVR(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, \
5676  reunit_interval, stopWtheta);
5678  } else {
5679  cudaGaugeField *cudaInGaugeEx = createExtendedGauge(*cudaInGauge, R, GaugeFixOVRQuda);
5680 
5681  // perform the update
5683  gaugefixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, \
5684  reunit_interval, stopWtheta);
5686 
5687  //HOW TO COPY BACK TO CPU: cudaInGaugeEx->cpuGauge
5688  copyExtendedGauge(*cudaInGauge, *cudaInGaugeEx, QUDA_CUDA_FIELD_LOCATION);
5689  }
5690 
5691  checkCudaError();
5692  // copy the gauge field back to the host
5694  cudaInGauge->saveCPUField(*cpuGauge);
5696 
5698 
5699  if (param->make_resident_gauge) {
5700  if (gaugePrecise != nullptr) delete gaugePrecise;
5701  gaugePrecise = cudaInGauge;
5702  } else {
5703  delete cudaInGauge;
5704  }
5705 
5706  if(timeinfo){
5707  timeinfo[0] = GaugeFixOVRQuda.Last(QUDA_PROFILE_H2D);
5708  timeinfo[1] = GaugeFixOVRQuda.Last(QUDA_PROFILE_COMPUTE);
5709  timeinfo[2] = GaugeFixOVRQuda.Last(QUDA_PROFILE_D2H);
5710  }
5711 
5712  checkCudaError();
5713  return 0;
5714 }
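// Editorial note: a hedged usage sketch (not part of the QUDA source).
// gauge_dir selects the gauge condition (3 for Coulomb, otherwise Landau,
// per the quda.h documentation), and timeinfo returns the H2D / compute /
// D2H times recorded by the profiler above; all numeric values here are
// illustrative.
//
//   double timeinfo[3];
//   computeGaugeFixingOVRQuda(h_gauge, 4, 1000, 100, 1.5, 1e-6, 10, 1, &param, timeinfo);
//   printfQuda("H2D %g s, compute %g s, D2H %g s\n", timeinfo[0], timeinfo[1], timeinfo[2]);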
5715 
5716 int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir, const unsigned int Nsteps, \
5717  const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, \
5718  const unsigned int stopWtheta, QudaGaugeParam* param , double* timeinfo)
5719 {
5720 
5721  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_TOTAL);
5722 
5723  checkGaugeParam(param);
5724 
5726 
5727  GaugeFieldParam gParam(gauge, *param);
5728  auto *cpuGauge = new cpuGaugeField(gParam);
5729 
5730  //gParam.pad = getFatLinkPadding(param->X);
5731  gParam.create = QUDA_NULL_FIELD_CREATE;
5732  gParam.link_type = param->type;
5733  gParam.reconstruct = param->reconstruct;
5734  gParam.order = (gParam.Precision() == QUDA_DOUBLE_PRECISION || gParam.reconstruct == QUDA_RECONSTRUCT_NO ) ?
5735  QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;
5736 
5737  auto *cudaInGauge = new cudaGaugeField(gParam);
5738 
5739 
5741 
5743 
5744  //if (!param->use_resident_gauge) { // load fields onto the device
5745  cudaInGauge->loadCPUField(*cpuGauge);
5746  /*} else { // or use resident fields already present
5747  if (!gaugePrecise) errorQuda("No resident gauge field allocated");
5748  cudaInGauge = gaugePrecise;
5749  gaugePrecise = nullptr;
5750  } */
5751 
5752 
5754 
5755  // perform the update
5757  checkCudaError();
5758 
5759  gaugefixingFFT(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
5760 
5762 
5763  checkCudaError();
5764  // copy the gauge field back to the host
5766  checkCudaError();
5767  cudaInGauge->saveCPUField(*cpuGauge);
5769  checkCudaError();
5770 
5772 
5773  if (param->make_resident_gauge) {
5774  if (gaugePrecise != nullptr) delete gaugePrecise;
5775  gaugePrecise = cudaInGauge;
5776  } else {
5777  delete cudaInGauge;
5778  }
5779 
5780  if(timeinfo){
5781  timeinfo[0] = GaugeFixFFTQuda.Last(QUDA_PROFILE_H2D);
5782  timeinfo[1] = GaugeFixFFTQuda.Last(QUDA_PROFILE_COMPUTE);
5783  timeinfo[2] = GaugeFixFFTQuda.Last(QUDA_PROFILE_D2H);
5784  }
5785 
5786  checkCudaError();
5787  return 0;
5788 }
5789 
5790 void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const QudaContractType cType,
5791  QudaInvertParam *param, const int *X)
5792 {
5793  // DMH: Easiest way to construct a ColorSpinorField? Do we require the user
5794  // to declare and fill an invert_param, or can it just be hacked?
5795 
5798  // wrap CPU host side pointers
5799  ColorSpinorParam cpuParam((void *)hp_x, *param, X, false, param->input_location);
5800  ColorSpinorField *h_x = ColorSpinorField::Create(cpuParam);
5801 
5802  cpuParam.v = (void *)hp_y;
5803  ColorSpinorField *h_y = ColorSpinorField::Create(cpuParam);
5804 
5805  // Create device parameter
5806  ColorSpinorParam cudaParam(cpuParam);
5807  cudaParam.location = QUDA_CUDA_FIELD_LOCATION;
5808  cudaParam.create = QUDA_NULL_FIELD_CREATE;
5809  // Quda uses Degrand-Rossi gamma basis for contractions and will
5810  // automatically reorder data if necessary.
5811  cudaParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
5812  cudaParam.setPrecision(cpuParam.Precision(), cpuParam.Precision(), true);
5813 
5814  std::vector<ColorSpinorField *> x, y;
5815  x.push_back(ColorSpinorField::Create(cudaParam));
5816  y.push_back(ColorSpinorField::Create(cudaParam));
5817 
5818  size_t data_bytes = x[0]->Volume() * x[0]->Nspin() * x[0]->Nspin() * 2 * x[0]->Precision();
5819  void *d_result = pool_device_malloc(data_bytes);
5821 
5823  *x[0] = *h_x;
5824  *y[0] = *h_y;
5826 
5828  contractQuda(*x[0], *y[0], d_result, cType);
5830 
5832  qudaMemcpy(h_result, d_result, data_bytes, cudaMemcpyDeviceToHost);
5834 
5836  pool_device_free(d_result);
5837  delete x[0];
5838  delete y[0];
5839  delete h_y;
5840  delete h_x;
5842 
5843  profileContract.TPSTOP(QUDA_PROFILE_TOTAL);
5844 }
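// Editorial note on buffer sizing (editorial, not part of the QUDA source):
// the result holds an Nspin x Nspin complex matrix per site, matching
// data_bytes above, so for 4 spins in double precision the host buffer needs
// Volume * 4 * 4 * 2 * sizeof(double) bytes. V below is a hypothetical local
// lattice volume.
//
//   size_t bytes = (size_t)V * 4 * 4 * 2 * sizeof(double);
//   void *h_result = malloc(bytes);
//   contractQuda(hp_x, hp_y, h_result, QUDA_CONTRACT_TYPE_OPEN, &inv_param, X);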
5845 
5846 double qChargeQuda()
5847 {
5848  profileQCharge.TPSTART(QUDA_PROFILE_TOTAL);
5849 
5850  cudaGaugeField *gauge = nullptr;
5851  if (!gaugeSmeared) {
5852  if (!extendedGaugeResident) extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileQCharge);
5853  gauge = extendedGaugeResident;
5854  } else {
5855  gauge = gaugeSmeared;
5856  }
5857  // Do we keep the smeared extended field in memory, or the unsmeared one?
5858 
5860  // create the Fmunu field
5861 
5862  GaugeFieldParam tensorParam(gaugePrecise->X(), gauge->Precision(), QUDA_RECONSTRUCT_NO, 0, QUDA_TENSOR_GEOMETRY);
5863  tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
5864  tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
5865  tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
5866  cudaGaugeField Fmunu(tensorParam);
5867 
5870 
5871  computeFmunu(Fmunu, *gauge);
5872  double charge = quda::computeQCharge(Fmunu);
5873 
5876 
5877  return charge;
5878 }
5879 
5880 double qChargeDensityQuda(void *h_qDensity)
5881 {
5882  profileQCharge.TPSTART(QUDA_PROFILE_TOTAL);
5883 
5884  cudaGaugeField *gauge = nullptr;
5885  if (!gaugeSmeared) {
5886  if (!extendedGaugeResident) extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileQCharge);
5887  gauge = extendedGaugeResident;
5888  } else {
5889  gauge = gaugeSmeared;
5890  }
5891  // Do we keep the smeared extended field in memory, or the unsmeared one?
5893  // create the Fmunu field
5894  GaugeFieldParam tensorParam(gaugePrecise->X(), gauge->Precision(), QUDA_RECONSTRUCT_NO, 0, QUDA_TENSOR_GEOMETRY);
5895  tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
5896  tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
5897  tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
5898  cudaGaugeField Fmunu(tensorParam);
5899 
5900  size_t size = Fmunu.Volume() * Fmunu.Precision();
5901  void *d_qDensity = device_malloc(size);
5903 
5905  computeFmunu(Fmunu, *gauge);
5906  double charge = quda::computeQChargeDensity(Fmunu, d_qDensity);
5908 
5910  qudaMemcpy(h_qDensity, d_qDensity, size, cudaMemcpyDeviceToHost);
5912 
5914  device_free(d_qDensity);
5916 
5918 
5919  return charge;
5920 }
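// Editorial note: since size = Volume() * Precision() above, h_qDensity
// receives one real number (the local topological charge density) per lattice
// site at the gauge precision. A hedged allocation sketch for double
// precision, with V a hypothetical local lattice volume:
//
//   auto *h_qDensity = (double *)malloc(V * sizeof(double));
//   double Q = qChargeDensityQuda(h_qDensity);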