quda-ref/v1.0.0/interface__quda_8cpp_source.html

 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
 #include <sys/time.h>
 #include <complex.h>

 #include <quda.h>
 #include <quda_fortran.h>
 #include <quda_internal.h>
 #include <comm_quda.h>
 #include <tune_quda.h>
 #include <blas_quda.h>
 #include <gauge_field.h>
 #include <dirac_quda.h>
 #include <dslash_quda.h>
 #include <invert_quda.h>
 #include <eigensolve_quda.h>
 #include <color_spinor_field.h>
 #include <clover_field.h>
 #include <llfat_quda.h>
 #include <unitarization_links.h>
 #include <algorithm>
 #include <staggered_oprod.h>
 #include <ks_improved_force.h>
 #include <ks_force_quda.h>
 #include <random_quda.h>
 #include <mpi_comm_handle.h>

 #include <multigrid.h>

 #include <deflation.h>

 #ifdef NUMA_NVML
 #include <numa_affinity.h>
 #endif

 #ifdef QUDA_NVML
 #include <nvml.h>
 #endif

 #include <cuda.h>

 #include <ks_force_quda.h>

 #ifdef GPU_GAUGE_FORCE
 #include <gauge_force_quda.h>
 #endif
 #include <gauge_update_quda.h>

 #define MAX(a,b) ((a)>(b)? (a):(b))
 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))

 #define spinorSiteSize 24 // real numbers per spinor

 #define MAX_GPU_NUM_PER_NODE 16

 // define newQudaGaugeParam() and newQudaInvertParam()
 #define INIT_PARAM
 #include "check_params.h"
 #undef INIT_PARAM

 // define (static) checkGaugeParam() and checkInvertParam()
 #define CHECK_PARAM
 #include "check_params.h"
 #undef CHECK_PARAM

 // define printQudaGaugeParam() and printQudaInvertParam()
 #define PRINT_PARAM
 #include "check_params.h"
 #undef PRINT_PARAM

 #include <gauge_tools.h>
 #include <contract_quda.h>

 #include <momentum.h>


 #include <cuda_profiler_api.h>

 using namespace quda;

 static int R[4] = {0, 0, 0, 0};
 // setting this to false prevents redundant halo exchange but isn't yet compatible with HISQ / ASQTAD kernels
 static bool redundant_comms = false;

 #include <blas_cublas.h>

 //for MAGMA lib:
 #include <blas_magma.h>

 static bool InitMagma = false;

 void openMagma() {

   if (!InitMagma) {
     OpenMagma();
     InitMagma = true;
   } else {
     printfQuda("\nMAGMA library was already initialized..\n");
   }

 }

 void closeMagma(){

   if (InitMagma) {
     CloseMagma();
     InitMagma = false;
   } else {
     printfQuda("\nMAGMA library was not initialized..\n");
   }

 }

 cudaGaugeField *gaugePrecise = nullptr;
 cudaGaugeField *gaugeSloppy = nullptr;
 cudaGaugeField *gaugePrecondition = nullptr;
 cudaGaugeField *gaugeRefinement = nullptr;
 cudaGaugeField *gaugeExtended = nullptr;

 cudaGaugeField *gaugeFatPrecise = nullptr;
 cudaGaugeField *gaugeFatSloppy = nullptr;
 cudaGaugeField *gaugeFatPrecondition = nullptr;
 cudaGaugeField *gaugeFatRefinement = nullptr;
 cudaGaugeField *gaugeFatExtended = nullptr;

 cudaGaugeField *gaugeLongExtended = nullptr;
 cudaGaugeField *gaugeLongPrecise = nullptr;
 cudaGaugeField *gaugeLongSloppy = nullptr;
 cudaGaugeField *gaugeLongPrecondition = nullptr;
 cudaGaugeField *gaugeLongRefinement = nullptr;

 cudaGaugeField *gaugeSmeared = nullptr;

 cudaCloverField *cloverPrecise = nullptr;
 cudaCloverField *cloverSloppy = nullptr;
 cudaCloverField *cloverPrecondition = nullptr;
 cudaCloverField *cloverRefinement = nullptr;

 cudaGaugeField *momResident = nullptr;
 cudaGaugeField *extendedGaugeResident = nullptr;

 std::vector<cudaColorSpinorField*> solutionResident;

 // vector of spinors used for forecasting solutions in HMC
 #define QUDA_MAX_CHRONO 12
 // each entry is one p
 std::vector< std::vector<ColorSpinorField*> > chronoResident(QUDA_MAX_CHRONO);

 // Mapped memory buffer used to hold unitarization failures
 static int *num_failures_h = nullptr;
 static int *num_failures_d = nullptr;

 cudaDeviceProp deviceProp;
 cudaStream_t *streams;

 static bool initialized = false;

 static TimeProfile profileInit("initQuda");

 static TimeProfile profileGauge("loadGaugeQuda");

 static TimeProfile profileClover("loadCloverQuda");

 static TimeProfile profileDslash("dslashQuda");

 static TimeProfile profileInvert("invertQuda");

 static TimeProfile profileMulti("invertMultiShiftQuda");

 static TimeProfile profileEigensolve("eigensolveQuda");

 static TimeProfile profileFatLink("computeKSLinkQuda");

 static TimeProfile profileGaugeForce("computeGaugeForceQuda");

 static TimeProfile profileGaugeUpdate("updateGaugeFieldQuda");

 static TimeProfile profileExtendedGauge("createExtendedGaugeField");

 static TimeProfile profileCloverForce("computeCloverForceQuda");

 static TimeProfile profileStaggeredForce("computeStaggeredForceQuda");

 static TimeProfile profileHISQForce("computeHISQForceQuda");

 static TimeProfile profilePlaq("plaqQuda");

 static TimeProfile profileWuppertal("wuppertalQuda");

 static TimeProfile profileGauss("gaussQuda");

 static TimeProfile profileQCharge("qChargeQuda");

 static TimeProfile profileAPE("APEQuda");

 static TimeProfile profileSTOUT("STOUTQuda");

 static TimeProfile profileOvrImpSTOUT("OvrImpSTOUTQuda");

 static TimeProfile profileProject("projectSU3Quda");

 static TimeProfile profilePhase("staggeredPhaseQuda");

 static TimeProfile profileContract("contractQuda");

 static TimeProfile profileCovDev("covDevQuda");

 static TimeProfile profileMomAction("momActionQuda");

 static TimeProfile profileEnd("endQuda");

 static TimeProfile GaugeFixFFTQuda("GaugeFixFFTQuda");
 static TimeProfile GaugeFixOVRQuda("GaugeFixOVRQuda");

 static TimeProfile profileInit2End("initQuda-endQuda",false);

 static bool enable_profiler = false;
 static bool do_not_profile_quda = false;

 static void profilerStart(const char *f) {


   static std::vector<int> target_list;
   static bool enable = false;
   static bool init = false;
   if (!init) {
     char *profile_target_env = getenv("QUDA_ENABLE_TARGET_PROFILE"); // selectively enable profiling for a given solve

     if ( profile_target_env ) {
       std::stringstream target_stream(profile_target_env);

       int target;
       while(target_stream >> target) {
        target_list.push_back(target);
        if (target_stream.peek() == ',') target_stream.ignore();
      }

      if (target_list.size() > 0) {
        std::sort(target_list.begin(), target_list.end());
        target_list.erase( unique( target_list.begin(), target_list.end() ), target_list.end() );
        warningQuda("Targeted profiling enabled for %lu functions\n", target_list.size());
        enable = true;
      }
    }


     char* donotprofile_env = getenv("QUDA_DO_NOT_PROFILE"); // disable profiling of QUDA parts
     if (donotprofile_env && (!(strcmp(donotprofile_env, "0") == 0)))  {
       do_not_profile_quda=true;
       printfQuda("Disabling profiling in QUDA\n");
     }
     init = true;
   }

   static int target_count = 0;
   static unsigned int i = 0;
   if (do_not_profile_quda){
     cudaProfilerStop();
     printfQuda("Stopping profiling in QUDA\n");
   } else {
     if (enable) {
       if (i < target_list.size() && target_count++ == target_list[i]) {
         enable_profiler = true;
         printfQuda("Starting profiling for %s\n", f);
         cudaProfilerStart();
       i++; // advance to next target
     }
   }
 }
 }

 static void profilerStop(const char *f) {
   if(do_not_profile_quda){
     cudaProfilerStart();
   } else {

     if (enable_profiler) {
       printfQuda("Stopping profiling for %s\n", f);
       cudaProfilerStop();
       enable_profiler = false;
     }
   }
 }


 namespace quda {
   void printLaunchTimer();
 }

 void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
 {
   setVerbosity(verbosity);
   setOutputPrefix(prefix);
   setOutputFile(outfile);
 }


 typedef struct {
   int ndim;
   int dims[QUDA_MAX_DIM];
 } LexMapData;

 static int lex_rank_from_coords(const int *coords, void *fdata)
 {
   auto *md = static_cast<LexMapData *>(fdata);

   int rank = coords[0];
   for (int i = 1; i < md->ndim; i++) {
     rank = md->dims[i] * rank + coords[i];
   }
   return rank;
 }

 #ifdef QMP_COMMS

 static int qmp_rank_from_coords(const int *coords, void *fdata)
 {
   return QMP_get_node_number_from(coords);
 }
 #endif

 // Provision for user control over MPI comm handle
 // Assumes an MPI implementation of QMP

 #if defined(QMP_COMMS) || defined(MPI_COMMS)
 MPI_Comm MPI_COMM_HANDLE;
 static int user_set_comm_handle = 0;
 #endif

 void setMPICommHandleQuda(void *mycomm)
 {
 #if defined(QMP_COMMS) || defined(MPI_COMMS)
   MPI_COMM_HANDLE = *((MPI_Comm *)mycomm);
   user_set_comm_handle = 1;
 #endif
 }

 #ifdef QMP_COMMS
 static void initQMPComms(void)
 {
   // Default comm handle is taken from QMP
   // WARNING: Assumes an MPI implementation of QMP
   if (!user_set_comm_handle) {
     void *mycomm;
     QMP_get_mpi_comm(QMP_comm_get_default(), &mycomm);
     setMPICommHandleQuda(mycomm);
   }
 }
 #elif defined(MPI_COMMS)
 static void initMPIComms(void)
 {
   // Default comm handle is MPI_COMM_WORLD
   if (!user_set_comm_handle) {
     static MPI_Comm mycomm;
     MPI_Comm_dup(MPI_COMM_WORLD, &mycomm);
     setMPICommHandleQuda((void *)&mycomm);
   }
 }
 #endif

 static bool comms_initialized = false;

 void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
 {
   if (comms_initialized) return;

 #if QMP_COMMS
   initQMPComms();
 #elif defined(MPI_COMMS)
   initMPIComms();
 #endif

   if (nDim != 4) {
     errorQuda("Number of communication grid dimensions must be 4");
   }

   LexMapData map_data;
   if (!func) {

 #if QMP_COMMS
     if (QMP_logical_topology_is_declared()) {
       if (QMP_get_logical_number_of_dimensions() != 4) {
         errorQuda("QMP logical topology must have 4 dimensions");
       }
       for (int i=0; i<nDim; i++) {
         int qdim = QMP_get_logical_dimensions()[i];
         if(qdim != dims[i]) {
           errorQuda("QMP logical dims[%d]=%d does not match dims[%d]=%d argument", i, qdim, i, dims[i]);
         }
       }
       fdata = nullptr;
       func = qmp_rank_from_coords;
     } else {
       warningQuda("QMP logical topology is undeclared; using default lexicographical ordering");
 #endif

       map_data.ndim = nDim;
       for (int i=0; i<nDim; i++) {
         map_data.dims[i] = dims[i];
       }
       fdata = (void *) &map_data;
       func = lex_rank_from_coords;

 #if QMP_COMMS
     }
 #endif

   }
   comm_init(nDim, dims, func, fdata);
   comms_initialized = true;
 }


 static void init_default_comms()
 {
 #if defined(QMP_COMMS)
   if (QMP_logical_topology_is_declared()) {
     int ndim = QMP_get_logical_number_of_dimensions();
     const int *dims = QMP_get_logical_dimensions();
     initCommsGridQuda(ndim, dims, nullptr, nullptr);
   } else {
     errorQuda("initQuda() called without prior call to initCommsGridQuda(),"
         " and QMP logical topology has not been declared");
   }
 #elif defined(MPI_COMMS)
   errorQuda("When using MPI for communications, initCommsGridQuda() must be called before initQuda()");
 #else // single-GPU
   const int dims[4] = {1, 1, 1, 1};
   initCommsGridQuda(4, dims, nullptr, nullptr);
 #endif
 }


 #define STR_(x) #x
 #define STR(x) STR_(x)
   static const std::string quda_version = STR(QUDA_VERSION_MAJOR) "." STR(QUDA_VERSION_MINOR) "." STR(QUDA_VERSION_SUBMINOR);
 #undef STR
 #undef STR_

 extern char* gitversion;

 /*
  * Set the device that QUDA uses.
  */
 void initQudaDevice(int dev) {

   //static bool initialized = false;
   if (initialized) return;
   initialized = true;

   profileInit2End.TPSTART(QUDA_PROFILE_TOTAL);
   profileInit.TPSTART(QUDA_PROFILE_TOTAL);
   profileInit.TPSTART(QUDA_PROFILE_INIT);

   if (getVerbosity() >= QUDA_SUMMARIZE) {
 #ifdef GITVERSION
     printfQuda("QUDA %s (git %s)\n",quda_version.c_str(),gitversion);
 #else
     printfQuda("QUDA %s\n",quda_version.c_str());
 #endif
   }

   int driver_version;
   cudaDriverGetVersion(&driver_version);
   printfQuda("CUDA Driver version = %d\n", driver_version);

   int runtime_version;
   cudaRuntimeGetVersion(&runtime_version);
   printfQuda("CUDA Runtime version = %d\n", runtime_version);

 #ifdef QUDA_NVML
   nvmlReturn_t result = nvmlInit();
   if (NVML_SUCCESS != result) errorQuda("NVML Init failed with error %d", result);
   const int length = 80;
   char graphics_version[length];
   result = nvmlSystemGetDriverVersion(graphics_version, length);
   if (NVML_SUCCESS != result) errorQuda("nvmlSystemGetDriverVersion failed with error %d", result);
   printfQuda("Graphic driver version = %s\n", graphics_version);
   result = nvmlShutdown();
   if (NVML_SUCCESS != result) errorQuda("NVML Shutdown failed with error %d", result);
 #endif

 #if defined(MULTI_GPU) && (CUDA_VERSION == 4000)
   //check if CUDA_NIC_INTEROP is set to 1 in the enviroment
   // not needed for CUDA >= 4.1
   char* cni_str = getenv("CUDA_NIC_INTEROP");
   if(cni_str == nullptr){
     errorQuda("Environment variable CUDA_NIC_INTEROP is not set");
   }
   int cni_int = atoi(cni_str);
   if (cni_int != 1){
     errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1");
   }
 #endif

   int deviceCount;
   cudaGetDeviceCount(&deviceCount);
   if (deviceCount == 0) {
     errorQuda("No CUDA devices found");
   }

   for(int i=0; i<deviceCount; i++) {
     cudaGetDeviceProperties(&deviceProp, i);
     checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
     if (getVerbosity() >= QUDA_SUMMARIZE) {
       printfQuda("Found device %d: %s\n", i, deviceProp.name);
     }
   }

 #ifdef MULTI_GPU
   if (dev < 0) {
     if (!comms_initialized) {
       errorQuda("initDeviceQuda() called with a negative device ordinal, but comms have not been initialized");
     }
     dev = comm_gpuid();
   }
 #else
   if (dev < 0 || dev >= 16) errorQuda("Invalid device number %d", dev);
 #endif

   cudaGetDeviceProperties(&deviceProp, dev);
   checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
   if (deviceProp.major < 1) {
     errorQuda("Device %d does not support CUDA", dev);
   }


 // Check GPU and QUDA build compatibiliy
 // 4 cases:
 // a) QUDA and GPU match: great
 // b) QUDA built for higher compute capability: error
 // c) QUDA built for lower major compute capability: warn if QUDA_ALLOW_JIT, else error
 // d) QUDA built for same major compute capability but lower minor: warn

   const int my_major = __COMPUTE_CAPABILITY__ / 100;
   const int my_minor = (__COMPUTE_CAPABILITY__  - my_major * 100) / 10;
 // b) UDA was compiled for a higher compute capability
   if (deviceProp.major * 100 + deviceProp.minor * 10 < __COMPUTE_CAPABILITY__)
     errorQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. ** \n --- Please set the correct QUDA_GPU_ARCH when running cmake.\n", deviceProp.major, deviceProp.minor, my_major, my_minor);


 // c) QUDA was compiled for a lower compute capability
   if (deviceProp.major < my_major) {
     char *allow_jit_env = getenv("QUDA_ALLOW_JIT");
     if (allow_jit_env && strcmp(allow_jit_env, "1") == 0) {
       if (getVerbosity() > QUDA_SILENT) warningQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- Jitting the PTX since QUDA_ALLOW_JIT=1 was set. Note that this will take some time.\n", deviceProp.major, deviceProp.minor, my_major, my_minor);
     } else {
       errorQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n --- Please set the correct QUDA_GPU_ARCH when running cmake.\n If you want the PTX to be jitted for your current GPU arch please set the enviroment variable QUDA_ALLOW_JIT=1.", deviceProp.major, deviceProp.minor, my_major, my_minor);
     }
   }
 // d) QUDA built for same major compute capability but lower minor
   if (deviceProp.major == my_major and deviceProp.minor > my_minor) {
     warningQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- This might result in a lower performance. Please consider adjusting QUDA_GPU_ARCH when running cmake.\n", deviceProp.major, deviceProp.minor, my_major, my_minor);
   }

   if (getVerbosity() >= QUDA_SUMMARIZE) {
     printfQuda("Using device %d: %s\n", dev, deviceProp.name);
   }
 #ifndef USE_QDPJIT
   cudaSetDevice(dev);
   checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
 #endif


 #if ((CUDA_VERSION >= 6000) && defined NUMA_NVML)
   char *enable_numa_env = getenv("QUDA_ENABLE_NUMA");
   if (enable_numa_env && strcmp(enable_numa_env, "0") == 0) {
     if (getVerbosity() > QUDA_SILENT) printfQuda("Disabling numa_affinity\n");
   }
   else{
     setNumaAffinityNVML(dev);
   }
 #endif


   cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
   //cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
   // cudaGetDeviceProperties(&deviceProp, dev);

   { // determine if we will do CPU or GPU data reordering (default is GPU)
     char *reorder_str = getenv("QUDA_REORDER_LOCATION");

     if (!reorder_str || (strcmp(reorder_str,"CPU") && strcmp(reorder_str,"cpu")) ) {
       warningQuda("Data reordering done on GPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
       reorder_location_set(QUDA_CUDA_FIELD_LOCATION);
     } else {
       warningQuda("Data reordering done on CPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
       reorder_location_set(QUDA_CPU_FIELD_LOCATION);
     }
   }

   profileInit.TPSTOP(QUDA_PROFILE_INIT);
   profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 /*
  * Any persistent memory allocations that QUDA uses are done here.
  */
 void initQudaMemory()
 {
   profileInit.TPSTART(QUDA_PROFILE_TOTAL);
   profileInit.TPSTART(QUDA_PROFILE_INIT);

   if (!comms_initialized) init_default_comms();

   streams = new cudaStream_t[Nstream];

   int greatestPriority;
   int leastPriority;
   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
   for (int i=0; i<Nstream-1; i++) {
     cudaStreamCreateWithPriority(&streams[i], cudaStreamDefault, greatestPriority);
   }
   cudaStreamCreateWithPriority(&streams[Nstream-1], cudaStreamDefault, leastPriority);

   checkCudaError();
   createDslashEvents();
   blas::init();
   cublas::init();

   // initalize the memory pool allocators
   pool::init();

   num_failures_h = static_cast<int*>(mapped_malloc(sizeof(int)));
   cudaHostGetDevicePointer(&num_failures_d, num_failures_h, 0);

   loadTuneCache();

   for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));

   profileInit.TPSTOP(QUDA_PROFILE_INIT);
   profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 void updateR()
 {
   for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
 }

 void initQuda(int dev)
 {
   // initialize communications topology, if not already done explicitly via initCommsGridQuda()
   if (!comms_initialized) init_default_comms();

   // set the device that QUDA uses
   initQudaDevice(dev);

   // set the persistant memory allocations that QUDA uses (Blas, streams, etc.)
   initQudaMemory();
 }

 // helper for creating extended gauge fields
 static cudaGaugeField* createExtendedGauge(cudaGaugeField &in, const int *R, TimeProfile &profile,
              bool redundant_comms=false, QudaReconstructType recon=QUDA_RECONSTRUCT_INVALID)
 {
   profile.TPSTART(QUDA_PROFILE_INIT);
   int y[4];
   for (int dir=0; dir<4; ++dir) y[dir] = in.X()[dir] + 2*R[dir];
   int pad = 0;

   GaugeFieldParam gParamEx(y, in.Precision(), recon != QUDA_RECONSTRUCT_INVALID ? recon : in.Reconstruct(), pad,
          in.Geometry(), QUDA_GHOST_EXCHANGE_EXTENDED);
   gParamEx.create = QUDA_ZERO_FIELD_CREATE;
   gParamEx.order = in.Order();
   gParamEx.siteSubset = QUDA_FULL_SITE_SUBSET;
   gParamEx.t_boundary = in.TBoundary();
   gParamEx.nFace = 1;
   gParamEx.tadpole = in.Tadpole();
   gParamEx.anisotropy = in.Anisotropy();
   for (int d=0; d<4; d++) gParamEx.r[d] = R[d];

   auto *out = new cudaGaugeField(gParamEx);

   // copy input field into the extended device gauge field
   copyExtendedGauge(*out, in, QUDA_CUDA_FIELD_LOCATION);

   profile.TPSTOP(QUDA_PROFILE_INIT);

   // now fill up the halos
   out->exchangeExtendedGhost(R,profile,redundant_comms);

   return out;
 }

 // This is a flag used to signal when we have downloaded new gauge
 // field.  Set by loadGaugeQuda and consumed by loadCloverQuda as one
 // possible flag to indicate we need to recompute the clover field
 static bool invalidate_clover = true;

 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
 {
   profileGauge.TPSTART(QUDA_PROFILE_TOTAL);

   if (!initialized) errorQuda("QUDA not initialized");
   if (getVerbosity() == QUDA_DEBUG_VERBOSE) printQudaGaugeParam(param);

   checkGaugeParam(param);

   profileGauge.TPSTART(QUDA_PROFILE_INIT);
   // Set the specific input parameters and create the cpu gauge field
   GaugeFieldParam gauge_param(h_gauge, *param);

   // if we are using half precision then we need to compute the fat
   // link maximum while still on the cpu
   // FIXME get a kernel for this
   if (param->type == QUDA_ASQTAD_FAT_LINKS)
     gauge_param.compute_fat_link_max = true;

   if (gauge_param.order <= 4) gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   GaugeField *in = (param->location == QUDA_CPU_FIELD_LOCATION) ?
     static_cast<GaugeField*>(new cpuGaugeField(gauge_param)) :
     static_cast<GaugeField*>(new cudaGaugeField(gauge_param));

   if (in->Order() == QUDA_BQCD_GAUGE_ORDER) {
     static size_t checksum = SIZE_MAX;
     size_t in_checksum = in->checksum(true);
     if (in_checksum == checksum) {
       if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached gauge field %lu\n", checksum);
       profileGauge.TPSTOP(QUDA_PROFILE_INIT);
       profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
       delete in;
       invalidate_clover = false;
       return;
     }
     checksum = in_checksum;
     invalidate_clover = true;
   }

   // free any current gauge field before new allocations to reduce memory overhead
   switch (param->type) {
     case QUDA_WILSON_LINKS:
       if (gaugeRefinement != gaugeSloppy && gaugeRefinement) delete gaugeRefinement;
       if (gaugeSloppy != gaugePrecondition && gaugePrecondition) delete gaugePrecondition;
       if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy;
       if (gaugePrecise && !param->use_resident_gauge) delete gaugePrecise;
       break;
     case QUDA_ASQTAD_FAT_LINKS:
       if (gaugeFatRefinement != gaugeFatSloppy && gaugeFatRefinement) delete gaugeFatRefinement;
       if (gaugeFatSloppy != gaugeFatPrecondition && gaugeFatPrecondition) delete gaugeFatPrecondition;
       if (gaugeFatPrecise != gaugeFatSloppy && gaugeFatSloppy) delete gaugeFatSloppy;
       if (gaugeFatPrecise && !param->use_resident_gauge) delete gaugeFatPrecise;
       break;
     case QUDA_ASQTAD_LONG_LINKS:
       if (gaugeLongRefinement != gaugeLongSloppy && gaugeLongRefinement) delete gaugeLongRefinement;
       if (gaugeLongSloppy != gaugeLongPrecondition && gaugeLongPrecondition) delete gaugeLongPrecondition;
       if (gaugeLongPrecise != gaugeLongSloppy && gaugeLongSloppy) delete gaugeLongSloppy;
       if (gaugeLongPrecise) delete gaugeLongPrecise;
       break;
     case QUDA_SMEARED_LINKS:
       if (gaugeSmeared) delete gaugeSmeared;
       break;
     default:
       errorQuda("Invalid gauge type %d", param->type);
   }

   // if not preserving then copy the gauge field passed in
   cudaGaugeField *precise = nullptr;

   // switch the parameters for creating the mirror precise cuda gauge field
   gauge_param.create = QUDA_NULL_FIELD_CREATE;
   gauge_param.reconstruct = param->reconstruct;
   gauge_param.setPrecision(param->cuda_prec, true);
   gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
   gauge_param.pad = param->ga_pad;

   precise = new cudaGaugeField(gauge_param);

   if (param->use_resident_gauge) {
     if(gaugePrecise == nullptr) errorQuda("No resident gauge field");
     // copy rather than point at to ensure that the padded region is filled in
     precise->copy(*gaugePrecise);
     precise->exchangeGhost();
     delete gaugePrecise;
     gaugePrecise = nullptr;
     profileGauge.TPSTOP(QUDA_PROFILE_INIT);
   } else {
     profileGauge.TPSTOP(QUDA_PROFILE_INIT);
     profileGauge.TPSTART(QUDA_PROFILE_H2D);
     precise->copy(*in);
     profileGauge.TPSTOP(QUDA_PROFILE_H2D);
   }

   // for gaugeSmeared we are interested only in the precise version
   if (param->type == QUDA_SMEARED_LINKS) {
     gaugeSmeared = createExtendedGauge(*precise, R, profileGauge);

     profileGauge.TPSTART(QUDA_PROFILE_FREE);
     delete precise;
     delete in;
     profileGauge.TPSTOP(QUDA_PROFILE_FREE);

     profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
     return;
   }

   // creating sloppy fields isn't really compute, but it is work done on the gpu
   profileGauge.TPSTART(QUDA_PROFILE_COMPUTE);

   // switch the parameters for creating the mirror sloppy cuda gauge field
   gauge_param.reconstruct = param->reconstruct_sloppy;
   gauge_param.setPrecision(param->cuda_prec_sloppy, true);
   cudaGaugeField *sloppy = nullptr;
   if (param->cuda_prec != param->cuda_prec_sloppy ||
       param->reconstruct != param->reconstruct_sloppy) {
     sloppy = new cudaGaugeField(gauge_param);
     sloppy->copy(*precise);
   } else {
     sloppy = precise;
   }

   // switch the parameters for creating the mirror preconditioner cuda gauge field
   gauge_param.reconstruct = param->reconstruct_precondition;
   gauge_param.setPrecision(param->cuda_prec_precondition, true);
   cudaGaugeField *precondition = nullptr;
   if (param->cuda_prec_sloppy != param->cuda_prec_precondition ||
       param->reconstruct_sloppy != param->reconstruct_precondition) {
     precondition = new cudaGaugeField(gauge_param);
     precondition->copy(*sloppy);
   } else {
     precondition = sloppy;
   }

   // switch the parameters for creating the refinement cuda gauge field
   gauge_param.reconstruct = param->reconstruct_refinement_sloppy;
   gauge_param.setPrecision(param->cuda_prec_refinement_sloppy, true);
   cudaGaugeField *refinement = nullptr;
   if (param->cuda_prec_sloppy != param->cuda_prec_refinement_sloppy
       || param->reconstruct_sloppy != param->reconstruct_refinement_sloppy) {
     refinement = new cudaGaugeField(gauge_param);
     refinement->copy(*sloppy);
   } else {
     refinement = sloppy;
   }

   profileGauge.TPSTOP(QUDA_PROFILE_COMPUTE);

   // create an extended preconditioning field
   cudaGaugeField* extended = nullptr;
   if (param->overlap){
     int R[4]; // domain-overlap widths in different directions
     for (int i=0; i<4; ++i) R[i] = param->overlap*commDimPartitioned(i);
     extended = createExtendedGauge(*precondition, R, profileGauge);
   }

   switch (param->type) {
     case QUDA_WILSON_LINKS:
       gaugePrecise = precise;
       gaugeSloppy = sloppy;
       gaugePrecondition = precondition;
       gaugeRefinement = refinement;

       if(param->overlap) gaugeExtended = extended;
       break;
     case QUDA_ASQTAD_FAT_LINKS:
       gaugeFatPrecise = precise;
       gaugeFatSloppy = sloppy;
       gaugeFatPrecondition = precondition;
       gaugeFatRefinement = refinement;

       if(param->overlap){
         if(gaugeFatExtended) errorQuda("Extended gauge fat field already allocated");
   gaugeFatExtended = extended;
       }
       break;
     case QUDA_ASQTAD_LONG_LINKS:
       gaugeLongPrecise = precise;
       gaugeLongSloppy = sloppy;
       gaugeLongPrecondition = precondition;
       gaugeLongRefinement = refinement;

       if(param->overlap){
         if(gaugeLongExtended) errorQuda("Extended gauge long field already allocated");
     gaugeLongExtended = extended;
       }
       break;
     default:
       errorQuda("Invalid gauge type %d", param->type);
   }

   profileGauge.TPSTART(QUDA_PROFILE_FREE);
   delete in;
   profileGauge.TPSTOP(QUDA_PROFILE_FREE);

   if (extendedGaugeResident) {
     // updated the resident gauge field if needed
     const int *R_ = extendedGaugeResident->R();
     const int R[] = { R_[0], R_[1], R_[2], R_[3] };
     QudaReconstructType recon = extendedGaugeResident->Reconstruct();
     delete extendedGaugeResident;

     extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileGauge, false, recon);
   }

   profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
 {
   profileGauge.TPSTART(QUDA_PROFILE_TOTAL);

   if (param->location != QUDA_CPU_FIELD_LOCATION)
     errorQuda("Non-cpu output location not yet supported");

   if (!initialized) errorQuda("QUDA not initialized");
   checkGaugeParam(param);

   // Set the specific cpu parameters and create the cpu gauge field
   GaugeFieldParam gauge_param(h_gauge, *param);
   cpuGaugeField cpuGauge(gauge_param);
   cudaGaugeField *cudaGauge = nullptr;
   switch (param->type) {
     case QUDA_WILSON_LINKS:
       cudaGauge = gaugePrecise;
       break;
     case QUDA_ASQTAD_FAT_LINKS:
       cudaGauge = gaugeFatPrecise;
       break;
     case QUDA_ASQTAD_LONG_LINKS:
       cudaGauge = gaugeLongPrecise;
       break;
     case QUDA_SMEARED_LINKS:
       gauge_param.create = QUDA_NULL_FIELD_CREATE;
       gauge_param.reconstruct = param->reconstruct;
       gauge_param.setPrecision(param->cuda_prec, true);
       gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
       gauge_param.pad = param->ga_pad;
       cudaGauge = new cudaGaugeField(gauge_param);
       copyExtendedGauge(*cudaGauge, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION);
       break;
     default:
       errorQuda("Invalid gauge type");
   }

   profileGauge.TPSTART(QUDA_PROFILE_D2H);
   cudaGauge->saveCPUField(cpuGauge);
   profileGauge.TPSTOP(QUDA_PROFILE_D2H);

   if (param->type == QUDA_SMEARED_LINKS) { delete cudaGauge; }

   profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 void loadSloppyCloverQuda(const QudaPrecision prec[]);
 void freeSloppyCloverQuda();

 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
 {
   profileClover.TPSTART(QUDA_PROFILE_TOTAL);
   profileClover.TPSTART(QUDA_PROFILE_INIT);

   checkCloverParam(inv_param);
   bool device_calc = false; // calculate clover and inverse on the device?

   pushVerbosity(inv_param->verbosity);
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);

   if (!initialized) errorQuda("QUDA not initialized");

   if ( (!h_clover && !h_clovinv) || inv_param->compute_clover ) {
     device_calc = true;
     if (inv_param->clover_coeff == 0.0) errorQuda("called with neither clover term nor inverse and clover coefficient not set");
     if (gaugePrecise->Anisotropy() != 1.0) errorQuda("cannot compute anisotropic clover field");
   }

   if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION)  errorQuda("Half precision not supported on CPU");
   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded before clover");
   if ((inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) && (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH)) {
     errorQuda("Wrong dslash_type %d in loadCloverQuda()", inv_param->dslash_type);
   }

   // determines whether operator is preconditioned when calling invertQuda()
   bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE ||
       inv_param->solve_type == QUDA_NORMOP_PC_SOLVE ||
       inv_param->solve_type == QUDA_NORMERR_PC_SOLVE );

   // determines whether operator is preconditioned when calling MatQuda() or MatDagMatQuda()
   bool pc_solution = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
       inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);

   bool asymmetric = (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
       inv_param->matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC);

   // uninverted clover term is required when applying unpreconditioned operator,
   // but note that dslashQuda() is always preconditioned
   if (!h_clover && !pc_solve && !pc_solution) {
     //warningQuda("Uninverted clover term not loaded");
   }

   // uninverted clover term is also required for "asymmetric" preconditioning
   if (!h_clover && pc_solve && pc_solution && asymmetric && !device_calc) {
     warningQuda("Uninverted clover term not loaded");
   }

   bool twisted = inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH ? true : false;
 #ifdef DYNAMIC_CLOVER
   bool dynamic_clover = true;
 #else
   bool dynamic_clover = false;
 #endif

   CloverFieldParam clover_param;
   clover_param.nDim = 4;
   clover_param.csw = inv_param->clover_coeff;
   clover_param.twisted = twisted;
   clover_param.mu2 = twisted ? 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu : 0.0;
   clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;
   for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i];
   clover_param.pad = inv_param->cl_pad;
   clover_param.create = QUDA_NULL_FIELD_CREATE;
   clover_param.norm = nullptr;
   clover_param.invNorm = nullptr;
   clover_param.setPrecision(inv_param->clover_cuda_prec);
   clover_param.direct = h_clover || device_calc ? true : false;
   clover_param.inverse = (h_clovinv || pc_solve) && !dynamic_clover ? true : false;
   CloverField *in = nullptr;
   profileClover.TPSTOP(QUDA_PROFILE_INIT);

   // FIXME do we need to make this more robust to changing other meta data (compare cloverPrecise against clover_param)
   bool clover_update = false;
   double csw_old = cloverPrecise ? cloverPrecise->Csw() : 0.0;
   if (!cloverPrecise || invalidate_clover || inv_param->clover_coeff != csw_old) clover_update = true;

   // compute or download clover field only if gauge field has been updated or clover field doesn't exist
   if (clover_update) {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating new clover field\n");
     freeSloppyCloverQuda();
     if (cloverPrecise) delete cloverPrecise;

     profileClover.TPSTART(QUDA_PROFILE_INIT);
     cloverPrecise = new cudaCloverField(clover_param);

     if (!device_calc || inv_param->return_clover || inv_param->return_clover_inverse) {
       // create a param for the cpu clover field
       CloverFieldParam inParam(clover_param);
       inParam.setPrecision(inv_param->clover_cpu_prec);
       inParam.order = inv_param->clover_order;
       inParam.direct = h_clover ? true : false;
       inParam.inverse = h_clovinv ? true : false;
       inParam.clover = h_clover;
       inParam.cloverInv = h_clovinv;
       inParam.create = QUDA_REFERENCE_FIELD_CREATE;
       in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ?
   static_cast<CloverField*>(new cpuCloverField(inParam)) :
   static_cast<CloverField*>(new cudaCloverField(inParam));
     }
     profileClover.TPSTOP(QUDA_PROFILE_INIT);

     if (!device_calc) {
       profileClover.TPSTART(QUDA_PROFILE_H2D);
       bool inverse = (h_clovinv && !inv_param->compute_clover_inverse && !dynamic_clover);
       cloverPrecise->copy(*in, inverse);
       profileClover.TPSTOP(QUDA_PROFILE_H2D);
     } else {
       profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
       createCloverQuda(inv_param);
       profileClover.TPSTART(QUDA_PROFILE_TOTAL);
     }

     // inverted clover term is required when applying preconditioned operator
     if ((!h_clovinv || inv_param->compute_clover_inverse) && pc_solve) {
       profileClover.TPSTART(QUDA_PROFILE_COMPUTE);
       if (!dynamic_clover) {
   cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog);
   if (inv_param->compute_clover_trlog) {
     inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
     inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
   }
       }
       profileClover.TPSTOP(QUDA_PROFILE_COMPUTE);
     }
   } else {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached clover field\n");
   }

   clover_param.direct = true;
   clover_param.inverse = dynamic_clover ? false : true;

   cloverPrecise->setRho(inv_param->clover_rho);

   QudaPrecision prec[] = {inv_param->clover_cuda_prec_sloppy, inv_param->clover_cuda_prec_precondition,
                           inv_param->clover_cuda_prec_refinement_sloppy};
   loadSloppyCloverQuda(prec);

   // if requested, copy back the clover / inverse field
   if (inv_param->return_clover || inv_param->return_clover_inverse) {
     if (!h_clover && !h_clovinv) errorQuda("Requested clover field return but no clover host pointers set");

     // copy the inverted clover term into host application order on the device
     clover_param.setPrecision(inv_param->clover_cpu_prec);
     clover_param.direct = (h_clover && inv_param->return_clover);
     clover_param.inverse = (h_clovinv && inv_param->return_clover_inverse);

     // this isn't really "epilogue" but this label suffices
     profileClover.TPSTART(QUDA_PROFILE_EPILOGUE);
     cudaCloverField *hack = nullptr;
     if (!dynamic_clover) {
       clover_param.order = inv_param->clover_order;
       hack = new cudaCloverField(clover_param);
       hack->copy(*cloverPrecise); // FIXME this can lead to an redundant copies if we're not copying back direct + inverse
     } else {
       auto *hackOfTheHack = new cudaCloverField(clover_param);  // Hack of the hack
       hackOfTheHack->copy(*cloverPrecise, false);
       cloverInvert(*hackOfTheHack, inv_param->compute_clover_trlog);
       if (inv_param->compute_clover_trlog) {
   inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
   inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
       }
       clover_param.order = inv_param->clover_order;
       hack = new cudaCloverField(clover_param);
       hack->copy(*hackOfTheHack); // FIXME this can lead to an redundant copies if we're not copying back direct + inverse
       delete hackOfTheHack;
     }
     profileClover.TPSTOP(QUDA_PROFILE_EPILOGUE);

     // copy the field into the host application's clover field
     profileClover.TPSTART(QUDA_PROFILE_D2H);
     if (inv_param->return_clover) {
       qudaMemcpy((char*)(in->V(false)), (char*)(hack->V(false)), in->Bytes(), cudaMemcpyDeviceToHost);
     }
     if (inv_param->return_clover_inverse) {
       qudaMemcpy((char*)(in->V(true)), (char*)(hack->V(true)), in->Bytes(), cudaMemcpyDeviceToHost);
     }

     profileClover.TPSTOP(QUDA_PROFILE_D2H);

     delete hack;
     checkCudaError();
   }

   profileClover.TPSTART(QUDA_PROFILE_FREE);
   if (in) delete in; // delete object referencing input field
   profileClover.TPSTOP(QUDA_PROFILE_FREE);

   popVerbosity();

   profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 void freeSloppyCloverQuda();

 void loadSloppyCloverQuda(const QudaPrecision *prec)
 {
   freeSloppyCloverQuda();

   if (cloverPrecise) {
     // create the mirror sloppy clover field
     CloverFieldParam clover_param(*cloverPrecise);

     clover_param.setPrecision(prec[0]);

     if (cloverPrecise->V(false) != cloverPrecise->V(true)) {
       clover_param.direct = true;
       clover_param.inverse = true;
     } else {
       clover_param.direct = false;
       clover_param.inverse = true;
     }

     if (clover_param.Precision() != cloverPrecise->Precision()) {
       cloverSloppy = new cudaCloverField(clover_param);
       cloverSloppy->copy(*cloverPrecise, clover_param.inverse);
     } else {
       cloverSloppy = cloverPrecise;
     }

     // switch the parameters for creating the mirror preconditioner clover field
     clover_param.setPrecision(prec[1]);

     // create the mirror preconditioner clover field
     if (clover_param.Precision() != cloverSloppy->Precision()) {
       cloverPrecondition = new cudaCloverField(clover_param);
       cloverPrecondition->copy(*cloverSloppy, clover_param.inverse);
     } else {
       cloverPrecondition = cloverSloppy;
     }

     // switch the parameters for creating the mirror preconditioner clover field
     clover_param.setPrecision(prec[2]);

     // create the mirror preconditioner clover field
     if (clover_param.Precision() != cloverSloppy->Precision()) {
       cloverRefinement = new cudaCloverField(clover_param);
       cloverRefinement->copy(*cloverSloppy, clover_param.inverse);
     } else {
       cloverRefinement = cloverSloppy;
     }
   }

 }

 // just free the sloppy fields used in mixed-precision solvers
 void freeSloppyGaugeQuda()
 {
   if (!initialized) errorQuda("QUDA not initialized");
   if (gaugeSloppy != gaugeRefinement && gaugeRefinement) delete gaugeRefinement;
   if (gaugeSloppy != gaugePrecondition && gaugePrecondition) delete gaugePrecondition;
   if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy;

   gaugeRefinement = nullptr;
   gaugePrecondition = nullptr;
   gaugeSloppy = nullptr;

   if (gaugeLongSloppy != gaugeLongRefinement && gaugeLongRefinement) delete gaugeLongRefinement;
   if (gaugeLongSloppy != gaugeLongPrecondition && gaugeLongPrecondition) delete gaugeLongPrecondition;
   if (gaugeLongPrecise != gaugeLongSloppy && gaugeLongSloppy) delete gaugeLongSloppy;

   gaugeLongRefinement = nullptr;
   gaugeLongPrecondition = nullptr;
   gaugeLongSloppy = nullptr;

   if (gaugeFatSloppy != gaugeFatRefinement && gaugeFatRefinement) delete gaugeFatRefinement;
   if (gaugeFatSloppy != gaugeFatPrecondition && gaugeFatPrecondition) delete gaugeFatPrecondition;
   if (gaugeFatPrecise != gaugeFatSloppy && gaugeFatSloppy) delete gaugeFatSloppy;

   gaugeFatRefinement = nullptr;
   gaugeFatPrecondition = nullptr;
   gaugeFatSloppy = nullptr;
 }

 void freeGaugeQuda(void)
 {
   if (!initialized) errorQuda("QUDA not initialized");

   freeSloppyGaugeQuda();

   if (gaugePrecise) delete gaugePrecise;
   if (gaugeExtended) delete gaugeExtended;

   gaugePrecise = nullptr;
   gaugeExtended = nullptr;

   if (gaugeLongPrecise) delete gaugeLongPrecise;
   if (gaugeLongExtended) delete gaugeLongExtended;

   gaugeLongPrecise = nullptr;
   gaugeLongExtended = nullptr;

   if (gaugeFatPrecise) delete gaugeFatPrecise;

   gaugeFatPrecise = nullptr;
   gaugeFatExtended = nullptr;

   if (gaugeSmeared) delete gaugeSmeared;

   gaugeSmeared = nullptr;
   // Need to merge extendedGaugeResident and gaugeFatPrecise/gaugePrecise
   if (extendedGaugeResident) {
     delete extendedGaugeResident;
     extendedGaugeResident = nullptr;
   }
 }

 void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *recon)
 {
   // first do SU3 links (if they exist)
   if (gaugePrecise) {
     GaugeFieldParam gauge_param(*gaugePrecise);
     // switch the parameters for creating the mirror sloppy cuda gauge field

     gauge_param.reconstruct = recon[0];
     gauge_param.setPrecision(prec[0], true);

     if (gaugeSloppy) errorQuda("gaugeSloppy already exists");

     if (gauge_param.Precision() != gaugePrecise->Precision() || gauge_param.reconstruct != gaugePrecise->Reconstruct()) {
       gaugeSloppy = new cudaGaugeField(gauge_param);
       gaugeSloppy->copy(*gaugePrecise);
     } else {
       gaugeSloppy = gaugePrecise;
     }

     // switch the parameters for creating the mirror preconditioner cuda gauge field
     gauge_param.reconstruct = recon[1];
     gauge_param.setPrecision(prec[1], true);

     if (gaugePrecondition) errorQuda("gaugePrecondition already exists");

     if (gauge_param.Precision() != gaugeSloppy->Precision() || gauge_param.reconstruct != gaugeSloppy->Reconstruct()) {
       gaugePrecondition = new cudaGaugeField(gauge_param);
       gaugePrecondition->copy(*gaugeSloppy);
     } else {
       gaugePrecondition = gaugeSloppy;
     }

     // switch the parameters for creating the mirror refinement cuda gauge field
     gauge_param.reconstruct = recon[2];
     gauge_param.setPrecision(prec[2], true);

     if (gaugeRefinement) errorQuda("gaugeRefinement already exists");

     if (gauge_param.Precision() != gaugeSloppy->Precision() || gauge_param.reconstruct != gaugeSloppy->Reconstruct()) {
       gaugeRefinement = new cudaGaugeField(gauge_param);
       gaugeRefinement->copy(*gaugeSloppy);
     } else {
       gaugeRefinement = gaugeSloppy;
     }
   }

   // fat links (if they exist)
   if (gaugeFatPrecise) {
     GaugeFieldParam gauge_param(*gaugeFatPrecise);

     gauge_param.setPrecision(prec[0], true);

     if (gaugeFatSloppy) errorQuda("gaugeFatSloppy already exists");

     if (gauge_param.Precision() != gaugeFatPrecise->Precision()
         || gauge_param.reconstruct != gaugeFatPrecise->Reconstruct()) {
       gaugeFatSloppy = new cudaGaugeField(gauge_param);
       gaugeFatSloppy->copy(*gaugeFatPrecise);
     } else {
       gaugeFatSloppy = gaugeFatPrecise;
     }

     // switch the parameters for creating the mirror preconditioner cuda gauge field
     gauge_param.setPrecision(prec[1], true);

     if (gaugeFatPrecondition) errorQuda("gaugeFatPrecondition already exists\n");

     if (gauge_param.Precision() != gaugeFatSloppy->Precision()
         || gauge_param.reconstruct != gaugeFatSloppy->Reconstruct()) {
       gaugeFatPrecondition = new cudaGaugeField(gauge_param);
       gaugeFatPrecondition->copy(*gaugeFatSloppy);
     } else {
       gaugeFatPrecondition = gaugeFatSloppy;
     }

     // switch the parameters for creating the mirror refinement cuda gauge field
     gauge_param.setPrecision(prec[2], true);

     if (gaugeFatRefinement) errorQuda("gaugeFatRefinement already exists\n");

     if (gauge_param.Precision() != gaugeFatSloppy->Precision()
         || gauge_param.reconstruct != gaugeFatSloppy->Reconstruct()) {
       gaugeFatRefinement = new cudaGaugeField(gauge_param);
       gaugeFatRefinement->copy(*gaugeFatSloppy);
     } else {
       gaugeFatRefinement = gaugeFatSloppy;
     }
   }

   // long links (if they exist)
   if (gaugeLongPrecise) {
     GaugeFieldParam gauge_param(*gaugeLongPrecise);

     gauge_param.reconstruct = recon[0];
     gauge_param.setPrecision(prec[0], true);

     if (gaugeLongSloppy) errorQuda("gaugeLongSloppy already exists");

     if (gauge_param.Precision() != gaugeLongPrecise->Precision()
         || gauge_param.reconstruct != gaugeLongPrecise->Reconstruct()) {
       gaugeLongSloppy = new cudaGaugeField(gauge_param);
       gaugeLongSloppy->copy(*gaugeLongPrecise);
     } else {
       gaugeLongSloppy = gaugeLongPrecise;
     }

     // switch the parameters for creating the mirror preconditioner cuda gauge field
     gauge_param.reconstruct = recon[1];
     gauge_param.setPrecision(prec[1], true);

     if (gaugeLongPrecondition) errorQuda("gaugeLongPrecondition already exists\n");

     if (gauge_param.Precision() != gaugeLongSloppy->Precision()
         || gauge_param.reconstruct != gaugeLongSloppy->Reconstruct()) {
       gaugeLongPrecondition = new cudaGaugeField(gauge_param);
       gaugeLongPrecondition->copy(*gaugeLongSloppy);
     } else {
       gaugeLongPrecondition = gaugeLongSloppy;
     }

     // switch the parameters for creating the mirror refinement cuda gauge field
     gauge_param.reconstruct = recon[2];
     gauge_param.setPrecision(prec[2], true);

     if (gaugeLongRefinement) errorQuda("gaugeLongRefinement already exists\n");

     if (gauge_param.Precision() != gaugeLongSloppy->Precision()
         || gauge_param.reconstruct != gaugeLongSloppy->Reconstruct()) {
       gaugeLongRefinement = new cudaGaugeField(gauge_param);
       gaugeLongRefinement->copy(*gaugeLongSloppy);
     } else {
       gaugeLongRefinement = gaugeLongSloppy;
     }
   }
 }

 void freeSloppyCloverQuda()
 {
   if (!initialized) errorQuda("QUDA not initialized");
   if (cloverRefinement != cloverSloppy && cloverRefinement) delete cloverRefinement;
   if (cloverPrecondition != cloverSloppy && cloverPrecondition) delete cloverPrecondition;
   if (cloverSloppy != cloverPrecise && cloverSloppy) delete cloverSloppy;

   cloverRefinement = nullptr;
   cloverPrecondition = nullptr;
   cloverSloppy = nullptr;
 }

 void freeCloverQuda(void)
 {
   if (!initialized) errorQuda("QUDA not initialized");
   freeSloppyCloverQuda();
   if (cloverPrecise) delete cloverPrecise;
   cloverPrecise = nullptr;
 }

 void flushChronoQuda(int i)
 {
   if (i >= QUDA_MAX_CHRONO)
     errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO);

   auto &basis = chronoResident[i];

   for (auto v : basis) {
     if (v)  delete v;
   }
   basis.clear();
 }

 void endQuda(void)
 {
   profileEnd.TPSTART(QUDA_PROFILE_TOTAL);

   if (!initialized) return;

   freeGaugeQuda();
   freeCloverQuda();

   for (int i=0; i<QUDA_MAX_CHRONO; i++) flushChronoQuda(i);

   for (auto v : solutionResident) if (v) delete v;
   solutionResident.clear();

   if(momResident) delete momResident;

   LatticeField::freeGhostBuffer();
   cpuColorSpinorField::freeGhostBuffer();

   cublas::destroy();
   blas::end();

   pool::flush_pinned();
   pool::flush_device();

   host_free(num_failures_h);
   num_failures_h = nullptr;
   num_failures_d = nullptr;

   if (streams) {
     for (int i=0; i<Nstream; i++) cudaStreamDestroy(streams[i]);
     delete []streams;
     streams = nullptr;
   }
   destroyDslashEvents();

   saveTuneCache();
   saveProfile();

   // flush any outstanding force monitoring (if enabled)
   flushForceMonitor();

   initialized = false;

   comm_finalize();
   comms_initialized = false;

   profileEnd.TPSTOP(QUDA_PROFILE_TOTAL);
   profileInit2End.TPSTOP(QUDA_PROFILE_TOTAL);

   // print out the profile information of the lifetime of the library
   if (getVerbosity() >= QUDA_SUMMARIZE) {
     profileInit.Print();
     profileGauge.Print();
     profileClover.Print();
     profileDslash.Print();
     profileInvert.Print();
     profileMulti.Print();
     profileEigensolve.Print();
     profileFatLink.Print();
     profileGaugeForce.Print();
     profileGaugeUpdate.Print();
     profileExtendedGauge.Print();
     profileCloverForce.Print();
     profileStaggeredForce.Print();
     profileHISQForce.Print();
     profileContract.Print();
     profileCovDev.Print();
     profilePlaq.Print();
     profileQCharge.Print();
     profileAPE.Print();
     profileSTOUT.Print();
     profileProject.Print();
     profilePhase.Print();
     profileMomAction.Print();
     profileEnd.Print();

     profileInit2End.Print();
     TimeProfile::PrintGlobal();

     printLaunchTimer();
     printAPIProfile();

     printfQuda("\n");
     printPeakMemUsage();
     printfQuda("\n");
   }

   assertAllMemFree();

   char *device_reset_env = getenv("QUDA_DEVICE_RESET");
   if (device_reset_env && strcmp(device_reset_env,"1") == 0) {
     // end this CUDA context
     cudaDeviceReset();
   }

 }


 namespace quda {

   void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
   {
     double kappa = inv_param->kappa;
     if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
       kappa *= gaugePrecise->Anisotropy();
     }

     switch (inv_param->dslash_type) {
     case QUDA_WILSON_DSLASH:
       diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC;
       break;
     case QUDA_CLOVER_WILSON_DSLASH:
       diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC;
       break;
     case QUDA_DOMAIN_WALL_DSLASH:
       diracParam.type = pc ? QUDA_DOMAIN_WALLPC_DIRAC : QUDA_DOMAIN_WALL_DIRAC;
       diracParam.Ls = inv_param->Ls;
       break;
     case QUDA_DOMAIN_WALL_4D_DSLASH:
       diracParam.type = pc ? QUDA_DOMAIN_WALL_4DPC_DIRAC : QUDA_DOMAIN_WALL_4D_DIRAC;
       diracParam.Ls = inv_param->Ls;
       break;
     case QUDA_MOBIUS_DWF_DSLASH:
       if (inv_param->Ls > QUDA_MAX_DWF_LS)
   errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS);
       diracParam.type = pc ? QUDA_MOBIUS_DOMAIN_WALLPC_DIRAC : QUDA_MOBIUS_DOMAIN_WALL_DIRAC;
       diracParam.Ls = inv_param->Ls;
       if (sizeof(Complex) != sizeof(double _Complex)) {
         errorQuda("Irreconcilable difference between interface and internal complex number conventions");
       }
       memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls);
       memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls);
       if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
         printfQuda("Printing b_5 and c_5 values\n");
         for (int i = 0; i < diracParam.Ls; i++) {
           printfQuda("fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n", i, diracParam.b_5[i].real(),
               diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(), diracParam.c_5[i].imag());
           // printfQuda("fromQUDA inv_param: b5[%d] = %f %f c5[%d] = %f %f\n", i, inv_param->b_5[i], i,
           // inv_param->c_5[i] ); printfQuda("fromQUDA creal: b5[%d] = %f %f c5[%d] = %f %f \n", i,
           // creal(inv_param->b_5[i]), cimag(inv_param->b_5[i]), i, creal(inv_param->c_5[i]), cimag(inv_param->c_5[i]) );
         }
       }
       break;
     case QUDA_STAGGERED_DSLASH:
       diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
       break;
     case QUDA_ASQTAD_DSLASH:
       diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC;
       break;
     case QUDA_TWISTED_MASS_DSLASH:
       diracParam.type = pc ? QUDA_TWISTED_MASSPC_DIRAC : QUDA_TWISTED_MASS_DIRAC;
       if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) {
   diracParam.Ls = 1;
   diracParam.epsilon = 0.0;
       } else {
   diracParam.Ls = 2;
   diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0;
       }
       break;
     case QUDA_TWISTED_CLOVER_DSLASH:
       diracParam.type = pc ? QUDA_TWISTED_CLOVERPC_DIRAC : QUDA_TWISTED_CLOVER_DIRAC;
       if (inv_param->twist_flavor == QUDA_TWIST_SINGLET)  {
   diracParam.Ls = 1;
   diracParam.epsilon = 0.0;
       } else {
   diracParam.Ls = 2;
   diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0;
       }
       break;
     case QUDA_LAPLACE_DSLASH:
       diracParam.type = pc ? QUDA_GAUGE_LAPLACEPC_DIRAC : QUDA_GAUGE_LAPLACE_DIRAC;
       diracParam.laplace3D = inv_param->laplace3D;
       break;
     case QUDA_COVDEV_DSLASH:
       diracParam.type = QUDA_GAUGE_COVDEV_DIRAC;
       break;
     default:
       errorQuda("Unsupported dslash_type %d", inv_param->dslash_type);
     }

     diracParam.matpcType = inv_param->matpc_type;
     diracParam.dagger = inv_param->dagger;
     diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatPrecise : gaugePrecise;
     diracParam.fatGauge = gaugeFatPrecise;
     diracParam.longGauge = gaugeLongPrecise;
     diracParam.clover = cloverPrecise;
     diracParam.kappa = kappa;
     diracParam.mass = inv_param->mass;
     diracParam.m5 = inv_param->m5;
     diracParam.mu = inv_param->mu;

     for (int i=0; i<4; i++) diracParam.commDim[i] = 1;   // comms are always on

     if (diracParam.gauge->Precision() != inv_param->cuda_prec)
       errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
                 inv_param->cuda_prec);
   }


   void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
   {
     setDiracParam(diracParam, inv_param, pc);

     diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatSloppy : gaugeSloppy;
     diracParam.fatGauge = gaugeFatSloppy;
     diracParam.longGauge = gaugeLongSloppy;
     diracParam.clover = cloverSloppy;

     for (int i=0; i<4; i++) {
       diracParam.commDim[i] = 1;   // comms are always on
     }

     if (diracParam.gauge->Precision() != inv_param->cuda_prec_sloppy)
       errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
                 inv_param->cuda_prec_sloppy);
   }

   void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
   {
     setDiracParam(diracParam, inv_param, pc);

     diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatRefinement : gaugeRefinement;
     diracParam.fatGauge = gaugeFatRefinement;
     diracParam.longGauge = gaugeLongRefinement;
     diracParam.clover = cloverRefinement;

     for (int i=0; i<4; i++) {
       diracParam.commDim[i] = 1;   // comms are always on
     }

     if (diracParam.gauge->Precision() != inv_param->cuda_prec_refinement_sloppy)
       errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
                 inv_param->cuda_prec_refinement_sloppy);
   }

   // The preconditioner currently mimicks the sloppy operator with no comms
   void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
   {
     setDiracParam(diracParam, inv_param, pc);

     if (inv_param->overlap) {
       diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatExtended : gaugeExtended;
       diracParam.fatGauge = gaugeFatExtended;
       diracParam.longGauge = gaugeLongExtended;
     } else {
       diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatPrecondition : gaugePrecondition;
       diracParam.fatGauge = gaugeFatPrecondition;
       diracParam.longGauge = gaugeLongPrecondition;
     }
     diracParam.clover = cloverPrecondition;

     for (int i=0; i<4; i++) {
       diracParam.commDim[i] = comms ? 1 : 0;
     }

     // In the preconditioned staggered CG allow a different dslash type in the preconditioning
     if(inv_param->inv_type == QUDA_PCG_INVERTER && inv_param->dslash_type == QUDA_ASQTAD_DSLASH
        && inv_param->dslash_type_precondition == QUDA_STAGGERED_DSLASH) {
        diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
        diracParam.gauge = gaugeFatPrecondition;
     }

     if (diracParam.gauge->Precision() != inv_param->cuda_prec_precondition)
       errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
                 inv_param->cuda_prec_precondition);
   }


   void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
   {
     DiracParam diracParam;
     DiracParam diracSloppyParam;
     DiracParam diracPreParam;

     setDiracParam(diracParam, &param, pc_solve);
     setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
     bool comms_flag = (param.inv_type != QUDA_INC_EIGCG_INVERTER) ?  false : true ;//inc eigCG needs 2 sloppy precisions.
     setDiracPreParam(diracPreParam, &param, pc_solve, comms_flag);


     d = Dirac::create(diracParam); // create the Dirac operator
     dSloppy = Dirac::create(diracSloppyParam);
     dPre = Dirac::create(diracPreParam);
   }

   void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam &param, const bool pc_solve)
   {
     DiracParam diracParam;
     DiracParam diracSloppyParam;
     DiracParam diracPreParam;
     DiracParam diracRefParam;

     setDiracParam(diracParam, &param, pc_solve);
     setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
     setDiracRefineParam(diracRefParam, &param, pc_solve);
     bool comms_flag = (param.inv_type != QUDA_INC_EIGCG_INVERTER) ?  false : true ;//inc eigCG needs 2 sloppy precisions.
     setDiracPreParam(diracPreParam, &param, pc_solve, comms_flag);


     d = Dirac::create(diracParam); // create the Dirac operator
     dSloppy = Dirac::create(diracSloppyParam);
     dPre = Dirac::create(diracPreParam);
     dRef = Dirac::create(diracRefParam);
   }

   static double unscaled_shifts[QUDA_MAX_MULTI_SHIFT];

   void massRescale(cudaColorSpinorField &b, QudaInvertParam &param) {

     double kappa5 = (0.5/(5.0 + param.m5));
     double kappa = (param.dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
         param.dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
         param.dslash_type == QUDA_MOBIUS_DWF_DSLASH) ? kappa5 : param.kappa;

     if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
       printfQuda("Mass rescale: Kappa is: %g\n", kappa);
       printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);
       double nin = blas::norm2(b);
       printfQuda("Mass rescale: norm of source in = %g\n", nin);
     }

     // staggered dslash uses mass normalization internally
     if (param.dslash_type == QUDA_ASQTAD_DSLASH || param.dslash_type == QUDA_STAGGERED_DSLASH) {
       switch (param.solution_type) {
         case QUDA_MAT_SOLUTION:
         case QUDA_MATPC_SOLUTION:
           if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(2.0*param.mass, b);
           break;
         case QUDA_MATDAG_MAT_SOLUTION:
         case QUDA_MATPCDAG_MATPC_SOLUTION:
           if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(4.0*param.mass*param.mass, b);
           break;
         default:
           errorQuda("Not implemented");
       }
       return;
     }

     for(int i=0; i<param.num_offset; i++) {
       unscaled_shifts[i] = param.offset[i];
     }

     // multiply the source to compensate for normalization of the Dirac operator, if necessary
     switch (param.solution_type) {
       case QUDA_MAT_SOLUTION:
         if (param.mass_normalization == QUDA_MASS_NORMALIZATION ||
             param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
     blas::ax(2.0*kappa, b);
     for(int i=0; i<param.num_offset; i++)  param.offset[i] *= 2.0*kappa;
         }
         break;
       case QUDA_MATDAG_MAT_SOLUTION:
         if (param.mass_normalization == QUDA_MASS_NORMALIZATION ||
             param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
     blas::ax(4.0*kappa*kappa, b);
     for(int i=0; i<param.num_offset; i++)  param.offset[i] *= 4.0*kappa*kappa;
         }
         break;
       case QUDA_MATPC_SOLUTION:
         if (param.mass_normalization == QUDA_MASS_NORMALIZATION) {
     blas::ax(4.0*kappa*kappa, b);
     for(int i=0; i<param.num_offset; i++)  param.offset[i] *= 4.0*kappa*kappa;
         } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
     blas::ax(2.0*kappa, b);
     for(int i=0; i<param.num_offset; i++)  param.offset[i] *= 2.0*kappa;
         }
         break;
       case QUDA_MATPCDAG_MATPC_SOLUTION:
         if (param.mass_normalization == QUDA_MASS_NORMALIZATION) {
     blas::ax(16.0*std::pow(kappa,4), b);
     for(int i=0; i<param.num_offset; i++)  param.offset[i] *= 16.0*std::pow(kappa,4);
         } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
     blas::ax(4.0*kappa*kappa, b);
     for(int i=0; i<param.num_offset; i++)  param.offset[i] *= 4.0*kappa*kappa;
         }
         break;
       default:
         errorQuda("Solution type %d not supported", param.solution_type);
     }

     if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n");
     if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
       printfQuda("Mass rescale: Kappa is: %g\n", kappa);
       printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);
       double nin = blas::norm2(b);
       printfQuda("Mass rescale: norm of source out = %g\n", nin);
     }

   }
 }

 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
 {
   profileDslash.TPSTART(QUDA_PROFILE_TOTAL);
   profileDslash.TPSTART(QUDA_PROFILE_INIT);

   const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;

   if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH)
       || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH))
     errorQuda("Gauge field not allocated");
   if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
     errorQuda("Clover field not allocated");

   pushVerbosity(inv_param->verbosity);
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);

   ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), true, inv_param->input_location);
   ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
   ColorSpinorParam cudaParam(cpuParam, *inv_param);

   cpuParam.v = h_out;
   cpuParam.location = inv_param->output_location;
   ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);

   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   cudaColorSpinorField in(*in_h, cudaParam);
   cudaColorSpinorField out(in, cudaParam);

   bool pc = true;
   DiracParam diracParam;
   setDiracParam(diracParam, inv_param, pc);

   profileDslash.TPSTOP(QUDA_PROFILE_INIT);

   profileDslash.TPSTART(QUDA_PROFILE_H2D);
   in = *in_h;
   profileDslash.TPSTOP(QUDA_PROFILE_H2D);

   profileDslash.TPSTART(QUDA_PROFILE_COMPUTE);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
     double cpu = blas::norm2(*in_h);
     double gpu = blas::norm2(in);
     printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
   }

   if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION &&
       (inv_param->dslash_type == QUDA_STAGGERED_DSLASH ||
        inv_param->dslash_type == QUDA_ASQTAD_DSLASH) )
     blas::ax(1.0/(2.0*inv_param->mass), in);

   if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
     if (parity == QUDA_EVEN_PARITY) {
       parity = QUDA_ODD_PARITY;
     } else {
       parity = QUDA_EVEN_PARITY;
     }
     blas::ax(gauge.Anisotropy(), in);
   }

   Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
   if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inv_param->dagger) {
     cudaParam.create = QUDA_NULL_FIELD_CREATE;
     cudaColorSpinorField tmp1(in, cudaParam);
     ((DiracTwistedCloverPC*) dirac)->TwistCloverInv(tmp1, in, (parity+1)%2); // apply the clover-twist
     dirac->Dslash(out, tmp1, parity); // apply the operator
   } else {
     dirac->Dslash(out, in, parity); // apply the operator
   }
   profileDslash.TPSTOP(QUDA_PROFILE_COMPUTE);

   profileDslash.TPSTART(QUDA_PROFILE_D2H);
   *out_h = out;
   profileDslash.TPSTOP(QUDA_PROFILE_D2H);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
     double cpu = blas::norm2(*out_h);
     double gpu = blas::norm2(out);
     printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
   }

   profileDslash.TPSTART(QUDA_PROFILE_FREE);
   delete dirac; // clean up

   delete out_h;
   delete in_h;
   profileDslash.TPSTOP(QUDA_PROFILE_FREE);

   popVerbosity();
   profileDslash.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
 {
   const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;

   if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH)
       || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH))
     errorQuda("Gauge field not allocated");

   pushVerbosity(inv_param->verbosity);
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);

   ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), true, inv_param->input_location);
   ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);

   ColorSpinorParam cudaParam(cpuParam, *inv_param);
   cudaColorSpinorField in(*in_h, cudaParam);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
     double cpu = blas::norm2(*in_h);
     double gpu = blas::norm2(in);
     printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
   }

   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   cudaColorSpinorField out(in, cudaParam);

   if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
     if (parity == QUDA_EVEN_PARITY) {
       parity = QUDA_ODD_PARITY;
     } else {
       parity = QUDA_EVEN_PARITY;
     }
     blas::ax(gauge.Anisotropy(), in);
   }
   bool pc = true;

   DiracParam diracParam;
   setDiracParam(diracParam, inv_param, pc);

   DiracDomainWall4DPC dirac(diracParam); // create the Dirac operator
   printfQuda("kappa for QUDA input : %e\n",inv_param->kappa);
   switch (test_type) {
     case 0:
       dirac.Dslash4(out, in, parity);
       break;
     case 1:
       dirac.Dslash5(out, in, parity);
       break;
     case 2:
       dirac.Dslash5inv(out, in, parity, inv_param->kappa);
       break;
   }

   cpuParam.v = h_out;
   cpuParam.location = inv_param->output_location;
   ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
   *out_h = out;

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
     double cpu = blas::norm2(*out_h);
     double gpu = blas::norm2(out);
     printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
   }

   delete out_h;
   delete in_h;

   popVerbosity();
 }

 void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
 {
   const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;

   if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH)
       || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH))
     errorQuda("Gauge field not allocated");

   pushVerbosity(inv_param->verbosity);
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);

   ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), true, inv_param->input_location);
   ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);

   ColorSpinorParam cudaParam(cpuParam, *inv_param);
   cudaColorSpinorField in(*in_h, cudaParam);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
     double cpu = blas::norm2(*in_h);
     double gpu = blas::norm2(in);
     printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
   }

   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   cudaColorSpinorField out(in, cudaParam);

   if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
     if (parity == QUDA_EVEN_PARITY) {
       parity = QUDA_ODD_PARITY;
     } else {
       parity = QUDA_EVEN_PARITY;
     }
     blas::ax(gauge.Anisotropy(), in);
   }
   bool pc = true;

   DiracParam diracParam;
   setDiracParam(diracParam, inv_param, pc);

   DiracMobiusPC dirac(diracParam); // create the Dirac operator
   switch (test_type) {
     case 0:
       dirac.Dslash4(out, in, parity);
       break;
     case 1:
       dirac.Dslash5(out, in, parity);
       break;
     case 2:
       dirac.Dslash4pre(out, in, parity);
       break;
     case 3:
       dirac.Dslash5inv(out, in, parity);
       break;
   }

   cpuParam.v = h_out;
   cpuParam.location = inv_param->output_location;
   ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
   *out_h = out;

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
     double cpu = blas::norm2(*out_h);
     double gpu = blas::norm2(out);
     printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
   }

   delete out_h;
   delete in_h;

   popVerbosity();
 }


 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
 {
   pushVerbosity(inv_param->verbosity);

   const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;

   if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH)
       || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH))
     errorQuda("Gauge field not allocated");
   if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
     errorQuda("Clover field not allocated");
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);

   bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
       inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);

   ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), pc, inv_param->input_location);
   ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);

   ColorSpinorParam cudaParam(cpuParam, *inv_param);
   cudaColorSpinorField in(*in_h, cudaParam);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
     double cpu = blas::norm2(*in_h);
     double gpu = blas::norm2(in);
     printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
   }

   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   cudaColorSpinorField out(in, cudaParam);

   DiracParam diracParam;
   setDiracParam(diracParam, inv_param, pc);

   Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
   dirac->M(out, in); // apply the operator
   delete dirac; // clean up

   double kappa = inv_param->kappa;
   if (pc) {
     if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) {
       blas::ax(0.25/(kappa*kappa), out);
     } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
       blas::ax(0.5/kappa, out);
     }
   } else {
     if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION ||
         inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
       blas::ax(0.5/kappa, out);
     }
   }

   cpuParam.v = h_out;
   cpuParam.location = inv_param->output_location;
   ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
   *out_h = out;

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
     double cpu = blas::norm2(*out_h);
     double gpu = blas::norm2(out);
     printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
   }

   delete out_h;
   delete in_h;

   popVerbosity();
 }


 void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
 {
   pushVerbosity(inv_param->verbosity);

   const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;

   if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH)
       || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH))
     errorQuda("Gauge field not allocated");
   if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
     errorQuda("Clover field not allocated");
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);

   bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
       inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);

   ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), pc, inv_param->input_location);
   ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);

   ColorSpinorParam cudaParam(cpuParam, *inv_param);
   cudaColorSpinorField in(*in_h, cudaParam);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE){
     double cpu = blas::norm2(*in_h);
     double gpu = blas::norm2(in);
     printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
   }

   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   cudaColorSpinorField out(in, cudaParam);

   //  double kappa = inv_param->kappa;
   //  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= gaugePrecise->anisotropy;

   DiracParam diracParam;
   setDiracParam(diracParam, inv_param, pc);

   Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
   dirac->MdagM(out, in); // apply the operator
   delete dirac; // clean up

   double kappa = inv_param->kappa;
   if (pc) {
     if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) {
       blas::ax(1.0/std::pow(2.0*kappa,4), out);
     } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
       blas::ax(0.25/(kappa*kappa), out);
     }
   } else {
     if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION ||
         inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
       blas::ax(0.25/(kappa*kappa), out);
     }
   }

   cpuParam.v = h_out;
   cpuParam.location = inv_param->output_location;
   ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
   *out_h = out;

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE){
     double cpu = blas::norm2(*out_h);
     double gpu = blas::norm2(out);
     printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
   }

   delete out_h;
   delete in_h;

   popVerbosity();
 }

 namespace quda
 {
   bool canReuseResidentGauge(QudaInvertParam *param)
   {
     if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
       return (gaugePrecise != nullptr) and param->cuda_prec == gaugePrecise->Precision();
     } else {
       return (gaugeFatPrecise != nullptr) and param->cuda_prec == gaugeFatPrecise->Precision();
     }
   }
 } // namespace quda

 void checkClover(QudaInvertParam *param) {

   if (param->dslash_type != QUDA_CLOVER_WILSON_DSLASH && param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) {
     return;
   }

   if (param->cuda_prec != cloverPrecise->Precision()) {
     errorQuda("Solve precision %d doesn't match clover precision %d", param->cuda_prec, cloverPrecise->Precision());
   }

   if ( (!cloverSloppy || param->cuda_prec_sloppy != cloverSloppy->Precision()) ||
        (!cloverPrecondition || param->cuda_prec_precondition != cloverPrecondition->Precision()) ||
        (!cloverRefinement || param->cuda_prec_refinement_sloppy != cloverRefinement->Precision()) ) {
     freeSloppyCloverQuda();
     QudaPrecision prec[] = {param->cuda_prec_sloppy, param->cuda_prec_precondition, param->cuda_prec_refinement_sloppy};
     loadSloppyCloverQuda(prec);
   }

   if (cloverPrecise == nullptr) errorQuda("Precise clover field doesn't exist");
   if (cloverSloppy == nullptr) errorQuda("Sloppy clover field doesn't exist");
   if (cloverPrecondition == nullptr) errorQuda("Precondition clover field doesn't exist");
   if (cloverRefinement == nullptr) errorQuda("Refinement clover field doesn't exist");
 }

 quda::cudaGaugeField *checkGauge(QudaInvertParam *param)
 {
   quda::cudaGaugeField *cudaGauge = nullptr;
   if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
     if (gaugePrecise == nullptr) errorQuda("Precise gauge field doesn't exist");

     if (param->cuda_prec != gaugePrecise->Precision()) {
       errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugePrecise->Precision());
     }

     if (param->cuda_prec_sloppy != gaugeSloppy->Precision()
         || param->cuda_prec_precondition != gaugePrecondition->Precision()
         || param->cuda_prec_refinement_sloppy != gaugeRefinement->Precision()) {
       QudaPrecision precision[3]
           = {param->cuda_prec_sloppy, param->cuda_prec_precondition, param->cuda_prec_refinement_sloppy};
       QudaReconstructType recon[3]
           = {gaugeSloppy->Reconstruct(), gaugePrecondition->Reconstruct(), gaugeRefinement->Reconstruct()};
       freeSloppyGaugeQuda();
       loadSloppyGaugeQuda(precision, recon);
     }

     if (gaugeSloppy == nullptr) errorQuda("Sloppy gauge field doesn't exist");
     if (gaugePrecondition == nullptr) errorQuda("Precondition gauge field doesn't exist");
     if (gaugeRefinement == nullptr) errorQuda("Refinement gauge field doesn't exist");
     if (param->overlap) {
       if (gaugeExtended == nullptr) errorQuda("Extended gauge field doesn't exist");
     }
     cudaGauge = gaugePrecise;
   } else {
     if (gaugeFatPrecise == nullptr) errorQuda("Precise gauge fat field doesn't exist");
     if (gaugeLongPrecise == nullptr) errorQuda("Precise gauge long field doesn't exist");

     if (param->cuda_prec != gaugeFatPrecise->Precision()) {
       errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugeFatPrecise->Precision());
     }

     if (param->cuda_prec_sloppy != gaugeFatSloppy->Precision()
         || param->cuda_prec_precondition != gaugeFatPrecondition->Precision()
         || param->cuda_prec_refinement_sloppy != gaugeFatRefinement->Precision()
         || param->cuda_prec_sloppy != gaugeLongSloppy->Precision()
         || param->cuda_prec_precondition != gaugeLongPrecondition->Precision()
         || param->cuda_prec_refinement_sloppy != gaugeLongRefinement->Precision()) {

       QudaPrecision precision[3]
         = {param->cuda_prec_sloppy, param->cuda_prec_precondition, param->cuda_prec_refinement_sloppy};
       // recon is always no for fat links, so just use long reconstructs here
       QudaReconstructType recon[3]
         = {gaugeLongSloppy->Reconstruct(), gaugeLongPrecondition->Reconstruct(), gaugeLongRefinement->Reconstruct()};
       freeSloppyGaugeQuda();
       loadSloppyGaugeQuda(precision, recon);
     }

     if (gaugeFatSloppy == nullptr) errorQuda("Sloppy gauge fat field doesn't exist");
     if (gaugeFatPrecondition == nullptr) errorQuda("Precondition gauge fat field doesn't exist");
     if (gaugeFatRefinement == nullptr) errorQuda("Refinement gauge fat field doesn't exist");
     if (param->overlap) {
       if (gaugeFatExtended == nullptr) errorQuda("Extended gauge fat field doesn't exist");
     }

     if (gaugeLongSloppy == nullptr) errorQuda("Sloppy gauge long field doesn't exist");
     if (gaugeLongPrecondition == nullptr) errorQuda("Precondition gauge long field doesn't exist");
     if (gaugeLongRefinement == nullptr) errorQuda("Refinement gauge long field doesn't exist");
     if (param->overlap) {
       if (gaugeLongExtended == nullptr) errorQuda("Extended gauge long field doesn't exist");
     }
     cudaGauge = gaugeFatPrecise;
   }

   checkClover(param);

   return cudaGauge;
 }

 void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
 {
   pushVerbosity(inv_param->verbosity);

   if (!initialized) errorQuda("QUDA not initialized");
   if (gaugePrecise == nullptr) errorQuda("Gauge field not allocated");
   if (cloverPrecise == nullptr) errorQuda("Clover field not allocated");

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);

   if ((inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) && (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH))
     errorQuda("Cannot apply the clover term for a non Wilson-clover or Twisted-mass-clover dslash");

   ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), true);

   ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?
     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

   ColorSpinorParam cudaParam(cpuParam, *inv_param);
   cudaColorSpinorField in(*in_h, cudaParam);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
     double cpu = blas::norm2(*in_h);
     double gpu = blas::norm2(in);
     printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
   }

   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   cudaColorSpinorField out(in, cudaParam);

   if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
     if (parity == QUDA_EVEN_PARITY) {
       parity = QUDA_ODD_PARITY;
     } else {
       parity = QUDA_EVEN_PARITY;
     }
     blas::ax(gaugePrecise->Anisotropy(), in);
   }
   bool pc = true;

   DiracParam diracParam;
   setDiracParam(diracParam, inv_param, pc);
   //FIXME: Do we need this for twisted clover???
   DiracCloverPC dirac(diracParam); // create the Dirac operator
   if (!inverse) dirac.Clover(out, in, parity); // apply the clover operator
   else dirac.CloverInv(out, in, parity);

   cpuParam.v = h_out;
   cpuParam.location = inv_param->output_location;
   ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
   *out_h = out;

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
     double cpu = blas::norm2(*out_h);
     double gpu = blas::norm2(out);
     printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
   }

   /*for (int i=0; i<in_h->Volume(); i++) {
     ((cpuColorSpinorField*)out_h)->PrintVector(i);
     }*/

   delete out_h;
   delete in_h;

   popVerbosity();
 }

 void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param)
 {
   profileEigensolve.TPSTART(QUDA_PROFILE_TOTAL);
   profileEigensolve.TPSTART(QUDA_PROFILE_INIT);

   // Transfer the inv param structure contained in eig_param
   QudaInvertParam *inv_param = eig_param->invert_param;

   if (inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH || inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH
       || inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH)
     setKernelPackT(true);

   if (!initialized) errorQuda("QUDA not initialized");

   pushVerbosity(inv_param->verbosity);
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
     printQudaInvertParam(inv_param);
     printQudaEigParam(eig_param);
   }

   checkInvertParam(inv_param);
   checkEigParam(eig_param);
   cudaGaugeField *cudaGauge = checkGauge(inv_param);

   bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) || (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE)
     || (inv_param->solve_type == QUDA_NORMERR_PC_SOLVE);

   inv_param->secs = 0;
   inv_param->gflops = 0;
   inv_param->iter = 0;

   // Define problem matrix
   //------------------------------------------------------
   Dirac *d = nullptr;
   Dirac *dSloppy = nullptr;
   Dirac *dPre = nullptr;

   // create the dirac operator
   createDirac(d, dSloppy, dPre, *inv_param, pc_solve);
   Dirac &dirac = *d;

   // Create device side ColorSpinorField vector space and to pass to the
   // compute function.
   const int *X = cudaGauge->X();
   ColorSpinorParam cpuParam(host_evecs[0], *inv_param, X, inv_param->solution_type, inv_param->input_location);

   // create wrappers around application vector set
   std::vector<ColorSpinorField *> host_evecs_;
   for (int i = 0; i < eig_param->nConv; i++) {
     cpuParam.v = host_evecs[i];
     host_evecs_.push_back(ColorSpinorField::Create(cpuParam));
   }

   ColorSpinorParam cudaParam(cpuParam);
   cudaParam.location = QUDA_CUDA_FIELD_LOCATION;
   cudaParam.create = QUDA_ZERO_FIELD_CREATE;
   cudaParam.setPrecision(eig_param->cuda_prec_ritz, eig_param->cuda_prec_ritz, true);

   std::vector<Complex> evals(eig_param->nConv, 0.0);
   std::vector<ColorSpinorField *> kSpace;
   for (int i = 0; i < eig_param->nConv; i++) { kSpace.push_back(ColorSpinorField::Create(cudaParam)); }

   // If you use polynomial acceleration on a non-symmetric matrix,
   // the solver will fail.
   if (eig_param->use_poly_acc && !eig_param->use_norm_op && !(inv_param->dslash_type == QUDA_LAPLACE_DSLASH)) {
     // Breaking up the boolean check a little bit. If it's a staggered dslash type and a PC type, we can use poly accel.
     if (!((inv_param->dslash_type == QUDA_STAGGERED_DSLASH || inv_param->dslash_type == QUDA_ASQTAD_DSLASH) && inv_param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
       errorQuda("Polynomial acceleration with non-symmetric matrices not supported");
     }
   }

   profileEigensolve.TPSTOP(QUDA_PROFILE_INIT);

   if (!eig_param->use_norm_op && !eig_param->use_dagger) {
     DiracM m(dirac);
     if (eig_param->arpack_check) {
       arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
     } else {
       EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
       (*eig_solve)(kSpace, evals);
       delete eig_solve;
     }
   } else if (!eig_param->use_norm_op && eig_param->use_dagger) {
     DiracMdag m(dirac);
     if (eig_param->arpack_check) {
       arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
     } else {
       EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
       (*eig_solve)(kSpace, evals);
       delete eig_solve;
     }
   } else if (eig_param->use_norm_op && !eig_param->use_dagger) {
     DiracMdagM m(dirac);
     if (eig_param->arpack_check) {
       arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
     } else {
       EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
       (*eig_solve)(kSpace, evals);
       delete eig_solve;
     }
   } else if (eig_param->use_norm_op && eig_param->use_dagger) {
     DiracMMdag m(dirac);
     if (eig_param->arpack_check) {
       arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
     } else {
       EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
       (*eig_solve)(kSpace, evals);
       delete eig_solve;
     }
   } else {
     errorQuda("Invalid use_norm_op and dagger combination");
   }

   // Copy eigen values back
   for (int i = 0; i < eig_param->nConv; i++) { host_evals[i] = real(evals[i]) + imag(evals[i]) * _Complex_I; }

   // Transfer Eigenpairs back to host if using GPU eigensolver
   if (!(eig_param->arpack_check)) {
     profileEigensolve.TPSTART(QUDA_PROFILE_D2H);
     for (int i = 0; i < eig_param->nConv; i++) *host_evecs_[i] = *kSpace[i];
     profileEigensolve.TPSTOP(QUDA_PROFILE_D2H);
   }

   profileEigensolve.TPSTART(QUDA_PROFILE_FREE);
   for (int i = 0; i < eig_param->nConv; i++) delete host_evecs_[i];
   delete d;
   delete dSloppy;
   delete dPre;
   for (int i = 0; i < eig_param->nConv; i++) delete kSpace[i];
   profileEigensolve.TPSTOP(QUDA_PROFILE_FREE);

   popVerbosity();

   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();

   profileEigensolve.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile)
   : profile(profile) {
   profile.TPSTART(QUDA_PROFILE_INIT);
   QudaInvertParam *param = mg_param.invert_param;

   checkMultigridParam(&mg_param);
   cudaGaugeField *cudaGauge = checkGauge(param);

   // check MG params (needs to go somewhere else)
   if (mg_param.n_level > QUDA_MAX_MG_LEVEL)
     errorQuda("Requested MG levels %d greater than allowed maximum %d", mg_param.n_level, QUDA_MAX_MG_LEVEL);
   for (int i=0; i<mg_param.n_level; i++) {
     if (mg_param.smoother_solve_type[i] != QUDA_DIRECT_SOLVE && mg_param.smoother_solve_type[i] != QUDA_DIRECT_PC_SOLVE)
       errorQuda("Unsupported smoother solve type %d on level %d", mg_param.smoother_solve_type[i], i);
   }
   if (param->solve_type != QUDA_DIRECT_SOLVE)
     errorQuda("Outer MG solver can only use QUDA_DIRECT_SOLVE at present");

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaMultigridParam(&mg_param);
   mg_param.secs = 0;
   mg_param.gflops = 0;

   bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
     (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);

   bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
     (param->solve_type == QUDA_NORMOP_PC_SOLVE);

   // create the dirac operators for the fine grid

   // this is the Dirac operator we use for inter-grid residual computation
   DiracParam diracParam;
   setDiracSloppyParam(diracParam, param, outer_pc_solve);
   d = Dirac::create(diracParam);
   m = new DiracM(*d);

   // this is the Dirac operator we use for smoothing
   DiracParam diracSmoothParam;
   bool fine_grid_pc_solve = (mg_param.smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE) ||
     (mg_param.smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE);
   setDiracSloppyParam(diracSmoothParam, param, fine_grid_pc_solve);
   diracSmoothParam.halo_precision = mg_param.smoother_halo_precision[0];
   dSmooth = Dirac::create(diracSmoothParam);
   mSmooth = new DiracM(*dSmooth);

   // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this)
   DiracParam diracSmoothSloppyParam;
   setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve,
        mg_param.smoother_schwarz_type[0] == QUDA_INVALID_SCHWARZ ? true : false);
   diracSmoothSloppyParam.halo_precision = mg_param.smoother_halo_precision[0];

   dSmoothSloppy = Dirac::create(diracSmoothSloppyParam);
   mSmoothSloppy = new DiracM(*dSmoothSloppy);

   if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating vector of nullptr space fields of length %d\n", mg_param.n_vec[0]);

   ColorSpinorParam csParam(nullptr, *param, cudaGauge->X(), pc_solution, mg_param.setup_location[0]);
   csParam.create = QUDA_NULL_FIELD_CREATE;
   QudaPrecision Bprec = mg_param.precision_null[0];
   Bprec = (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION && Bprec < QUDA_SINGLE_PRECISION ? QUDA_SINGLE_PRECISION : Bprec);
   csParam.setPrecision(Bprec);
   csParam.fieldOrder = mg_param.setup_location[0] == QUDA_CUDA_FIELD_LOCATION ? QUDA_FLOAT2_FIELD_ORDER : QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
   csParam.mem_type = mg_param.setup_minimize_memory == QUDA_BOOLEAN_TRUE ? QUDA_MEMORY_MAPPED : QUDA_MEMORY_DEVICE;
   B.resize(mg_param.n_vec[0]);
   for (int i = 0; i < mg_param.n_vec[0]; i++) { B[i] = ColorSpinorField::Create(csParam); }

   // fill out the MG parameters for the fine level
   mgParam = new MGParam(mg_param, B, m, mSmooth, mSmoothSloppy);

   mg = new MG(*mgParam, profile);
   mgParam->updateInvertParam(*param);

   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();
   profile.TPSTOP(QUDA_PROFILE_INIT);
 }

 void* newMultigridQuda(QudaMultigridParam *mg_param) {
   profilerStart(__func__);

   pushVerbosity(mg_param->invert_param->verbosity);

   profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
   auto *mg = new multigrid_solver(*mg_param, profileInvert);
   profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);

   saveTuneCache();

   popVerbosity();

   profilerStop(__func__);
   return static_cast<void*>(mg);
 }

 void destroyMultigridQuda(void *mg) {
   delete static_cast<multigrid_solver*>(mg);
 }

 void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
 {
   profilerStart(__func__);

   pushVerbosity(mg_param->invert_param->verbosity);

   profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
   profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE);

   auto *mg = static_cast<multigrid_solver*>(mg_);
   checkMultigridParam(mg_param);

   QudaInvertParam *param = mg_param->invert_param;
   // check the gauge fields have been created and set the precision as needed
   checkGauge(param);

   // for reporting level 1 is the fine level but internally use level 0 for indexing
   // sprintf(mg->prefix,"MG level 1 (%s): ", param.location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU" );
   // setOutputPrefix(prefix);
   setOutputPrefix("MG level 1 (GPU): "); //fix me

   if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("Updating operator on level 1 of %d levels\n", mg->mgParam->Nlevel);

   bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
     (param->solve_type == QUDA_NORMOP_PC_SOLVE);

   // free the previous dirac operators
   if (mg->m) delete mg->m;
   if (mg->mSmooth) delete mg->mSmooth;
   if (mg->mSmoothSloppy) delete mg->mSmoothSloppy;

   if (mg->d) delete mg->d;
   if (mg->dSmooth) delete mg->dSmooth;
   if (mg->dSmoothSloppy && mg->dSmoothSloppy != mg->dSmooth) delete mg->dSmoothSloppy;

   // create new fine dirac operators

   // this is the Dirac operator we use for inter-grid residual computation
   DiracParam diracParam;
   setDiracSloppyParam(diracParam, param, outer_pc_solve);
   mg->d = Dirac::create(diracParam);
   mg->m = new DiracM(*(mg->d));

   // this is the Dirac operator we use for smoothing
   DiracParam diracSmoothParam;
   bool fine_grid_pc_solve = (mg_param->smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE) ||
     (mg_param->smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE);
   setDiracSloppyParam(diracSmoothParam, param, fine_grid_pc_solve);
   mg->dSmooth = Dirac::create(diracSmoothParam);
   mg->mSmooth = new DiracM(*(mg->dSmooth));

   // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this)
   DiracParam diracSmoothSloppyParam;
   setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve, true);
   mg->dSmoothSloppy = Dirac::create(diracSmoothSloppyParam);;
   mg->mSmoothSloppy = new DiracM(*(mg->dSmoothSloppy));

   mg->mgParam->matResidual = mg->m;
   mg->mgParam->matSmooth = mg->mSmooth;
   mg->mgParam->matSmoothSloppy = mg->mSmoothSloppy;

   mg->mgParam->updateInvertParam(*param);
   if(mg->mgParam->mg_global.invert_param != param)
     mg->mgParam->mg_global.invert_param = param;

   bool refresh = true;
   mg->mg->reset(refresh);

   setOutputPrefix("");

   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();

   profileInvert.TPSTOP(QUDA_PROFILE_PREAMBLE);
   profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);

   popVerbosity();

   profilerStop(__func__);
 }

 void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
 {
   profilerStart(__func__);
   pushVerbosity(mg_param->invert_param->verbosity);
   profileInvert.TPSTART(QUDA_PROFILE_TOTAL);

   auto *mg = static_cast<multigrid_solver*>(mg_);
   checkMultigridParam(mg_param);
   checkGauge(mg_param->invert_param);

   mg->mg->dumpNullVectors();

   profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
   popVerbosity();
   profilerStop(__func__);
 }

 deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
   : d(nullptr), m(nullptr), RV(nullptr), deflParam(nullptr), defl(nullptr),  profile(profile) {

   QudaInvertParam *param = eig_param.invert_param;

   if (param->inv_type != QUDA_EIGCG_INVERTER && param->inv_type != QUDA_INC_EIGCG_INVERTER) return;

   profile.TPSTART(QUDA_PROFILE_INIT);

   cudaGaugeField *cudaGauge = checkGauge(param);
   eig_param.secs   = 0;
   eig_param.gflops = 0;

   DiracParam diracParam;
   if(eig_param.cuda_prec_ritz == param->cuda_prec)
   {
     setDiracParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE));
   } else {
     setDiracSloppyParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE));
   }

   const bool pc_solve = (param->solve_type == QUDA_NORMOP_PC_SOLVE);

   d = Dirac::create(diracParam);
   m = pc_solve ? static_cast<DiracMatrix*>( new DiracMdagM(*d) ) : static_cast<DiracMatrix*>( new DiracM(*d));

   ColorSpinorParam ritzParam(nullptr, *param, cudaGauge->X(), pc_solve, eig_param.location);

   ritzParam.create        = QUDA_ZERO_FIELD_CREATE;
   ritzParam.is_composite  = true;
   ritzParam.is_component  = false;
   ritzParam.composite_dim = param->nev*param->deflation_grid;
   ritzParam.setPrecision(param->cuda_prec_ritz);

   if (ritzParam.location==QUDA_CUDA_FIELD_LOCATION) {
     ritzParam.setPrecision(param->cuda_prec_ritz, param->cuda_prec_ritz, true); // set native field order
     if (ritzParam.nSpin != 1) ritzParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;

     //select memory location here, by default ritz vectors will be allocated on the device
     //but if not sufficient device memory, then the user may choose mapped type of memory
     ritzParam.mem_type = eig_param.mem_type_ritz;
   } else { //host location
     ritzParam.mem_type = QUDA_MEMORY_PINNED;
   }

   int ritzVolume = 1;
   for(int d = 0; d < ritzParam.nDim; d++) ritzVolume *= ritzParam.x[d];

   if (getVerbosity() == QUDA_DEBUG_VERBOSE) {

     size_t byte_estimate = (size_t)ritzParam.composite_dim*(size_t)ritzVolume*(ritzParam.nColor*ritzParam.nSpin*ritzParam.Precision());
     printfQuda("allocating bytes: %lu (lattice volume %d, prec %d)", byte_estimate, ritzVolume, ritzParam.Precision());
     if(ritzParam.mem_type == QUDA_MEMORY_DEVICE) printfQuda("Using device memory type.\n");
     else if (ritzParam.mem_type == QUDA_MEMORY_MAPPED)
       printfQuda("Using mapped memory type.\n");
   }

   RV = ColorSpinorField::Create(ritzParam);

   deflParam = new DeflationParam(eig_param, RV, *m);

   defl = new Deflation(*deflParam, profile);

   profile.TPSTOP(QUDA_PROFILE_INIT);
 }

 void* newDeflationQuda(QudaEigParam *eig_param) {
   profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
 #ifdef MAGMA_LIB
   openMagma();
 #endif
   auto *defl = new deflated_solver(*eig_param, profileInvert);

   profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);

   saveProfile(__func__);
   flushProfile();
   return static_cast<void*>(defl);
 }

 void destroyDeflationQuda(void *df) {
 #ifdef MAGMA_LIB
   closeMagma();
 #endif
   delete static_cast<deflated_solver*>(df);
 }

 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
 {
   profilerStart(__func__);

   profileInvert.TPSTART(QUDA_PROFILE_TOTAL);

   if (!initialized) errorQuda("QUDA not initialized");

   pushVerbosity(param->verbosity);
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(param);

   checkInvertParam(param, hp_x, hp_b);

   // check the gauge fields have been created
   cudaGaugeField *cudaGauge = checkGauge(param);

   // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
   // solve_type and solution_type, rather than in separate members of QudaInvertParam.  We're stuck with it
   // for now, though, so here we factorize everything for convenience.

   bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
     (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
   bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
     (param->solve_type == QUDA_NORMOP_PC_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE);
   bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||
     (param->solution_type ==  QUDA_MATPC_SOLUTION);
   bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||
     (param->solve_type == QUDA_DIRECT_PC_SOLVE);
   bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||
     (param->solve_type == QUDA_NORMERR_PC_SOLVE);

   param->secs = 0;
   param->gflops = 0;
   param->iter = 0;

   Dirac *d = nullptr;
   Dirac *dSloppy = nullptr;
   Dirac *dPre = nullptr;

   // create the dirac operator
   createDirac(d, dSloppy, dPre, *param, pc_solve);

   Dirac &dirac = *d;
   Dirac &diracSloppy = *dSloppy;
   Dirac &diracPre = *dPre;

   profileInvert.TPSTART(QUDA_PROFILE_H2D);

   ColorSpinorField *b = nullptr;
   ColorSpinorField *x = nullptr;
   ColorSpinorField *in = nullptr;
   ColorSpinorField *out = nullptr;

   const int *X = cudaGauge->X();

   // wrap CPU host side pointers
   ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution, param->input_location);
   ColorSpinorField *h_b = ColorSpinorField::Create(cpuParam);

   cpuParam.v = hp_x;
   cpuParam.location = param->output_location;
   ColorSpinorField *h_x = ColorSpinorField::Create(cpuParam);

   // download source
   ColorSpinorParam cudaParam(cpuParam, *param);
   cudaParam.create = QUDA_COPY_FIELD_CREATE;
   b = new cudaColorSpinorField(*h_b, cudaParam);

   // now check if we need to invalidate the solutionResident vectors
   bool invalidate = false;
   if (param->use_resident_solution == 1) {
     for (auto v : solutionResident)
       if (b->Precision() != v->Precision() || b->SiteSubset() != v->SiteSubset()) { invalidate = true; break; }

     if (invalidate) {
       for (auto v : solutionResident) if (v) delete v;
       solutionResident.clear();
     }

     if (!solutionResident.size()) {
       cudaParam.create = QUDA_NULL_FIELD_CREATE;
       solutionResident.push_back(new cudaColorSpinorField(cudaParam)); // solution
     }
     x = solutionResident[0];
   } else {
     cudaParam.create = QUDA_NULL_FIELD_CREATE;
     x = new cudaColorSpinorField(cudaParam);
   }

   if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
     // initial guess only supported for single-pass solvers
     if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) &&
         (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
       errorQuda("Initial guess not supported for two-pass solver");
     }

     *x = *h_x; // solution
   } else { // zero initial guess
     blas::zero(*x);
   }

   profileInvert.TPSTOP(QUDA_PROFILE_H2D);
   profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE);

   double nb = blas::norm2(*b);
   if (nb==0.0) errorQuda("Source has zero norm");

   if (getVerbosity() >= QUDA_VERBOSE) {
     double nh_b = blas::norm2(*h_b);
     printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
     if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) {
       double nh_x = blas::norm2(*h_x);
       double nx = blas::norm2(*x);
       printfQuda("Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
     }
   }

   // rescale the source and solution vectors to help prevent the onset of underflow
   if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
     blas::ax(1.0/sqrt(nb), *b);
     blas::ax(1.0/sqrt(nb), *x);
   }

   massRescale(*static_cast<cudaColorSpinorField*>(b), *param);

   dirac.prepare(in, out, *x, *b, param->solution_type);

   if (getVerbosity() >= QUDA_VERBOSE) {
     double nin = blas::norm2(*in);
     double nout = blas::norm2(*out);
     printfQuda("Prepared source = %g\n", nin);
     printfQuda("Prepared solution = %g\n", nout);
   }

   if (getVerbosity() >= QUDA_VERBOSE) {
     double nin = blas::norm2(*in);
     printfQuda("Prepared source post mass rescale = %g\n", nin);
   }

   // solution_type specifies *what* system is to be solved.
   // solve_type specifies *how* the system is to be solved.
   //
   // We have the following four cases (plus preconditioned variants):
   //
   // solution_type    solve_type    Effect
   // -------------    ----------    ------
   // MAT              DIRECT        Solve Ax=b
   // MATDAG_MAT       DIRECT        Solve A^dag y = b, followed by Ax=y
   // MAT              NORMOP        Solve (A^dag A) x = (A^dag b)
   // MATDAG_MAT       NORMOP        Solve (A^dag A) x = b
   // MAT              NORMERR       Solve (A A^dag) y = b, then x = A^dag y
   //
   // We generally require that the solution_type and solve_type
   // preconditioning match.  As an exception, the unpreconditioned MAT
   // solution_type may be used with any solve_type, including
   // DIRECT_PC and NORMOP_PC.  In these cases, preparation of the
   // preconditioned source and reconstruction of the full solution are
   // taken care of by Dirac::prepare() and Dirac::reconstruct(),
   // respectively.

   if (pc_solution && !pc_solve) {
     errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
   }

   if (!mat_solution && !pc_solution && pc_solve) {
     errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
   }

   if (!mat_solution && norm_error_solve) {
     errorQuda("Normal-error solve requires Mat solution");
   }

   if (param->inv_type_precondition == QUDA_MG_INVERTER && (!direct_solve || !mat_solution)) {
     errorQuda("Multigrid preconditioning only supported for direct solves");
   }

   if (param->chrono_use_resident && ( norm_error_solve) ){
     errorQuda("Chronological forcasting only presently supported for M^dagger M solver");
   }

   profileInvert.TPSTOP(QUDA_PROFILE_PREAMBLE);

   if (mat_solution && !direct_solve && !norm_error_solve) { // prepare source: b' = A^dag b
     cudaColorSpinorField tmp(*in);
     dirac.Mdag(*in, tmp);
   } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b
     DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
     SolverParam solverParam(*param);
     Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
     (*solve)(*out, *in);
     blas::copy(*in, *out);
     solverParam.updateInvertParam(*param);
     delete solve;
   }

   if (direct_solve) {
     DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
     SolverParam solverParam(*param);
     // chronological forecasting
     if (param->chrono_use_resident && chronoResident[param->chrono_index].size() > 0) {
       profileInvert.TPSTART(QUDA_PROFILE_CHRONO);

       auto &basis = chronoResident[param->chrono_index];

       ColorSpinorParam cs_param(*basis[0]);
       ColorSpinorField *tmp = ColorSpinorField::Create(cs_param);
       ColorSpinorField *tmp2 = (param->chrono_precision == out->Precision()) ? out : ColorSpinorField::Create(cs_param);
       std::vector<ColorSpinorField*> Ap;
       for (unsigned int k=0; k < basis.size(); k++) {
         Ap.emplace_back((ColorSpinorField::Create(cs_param)));
       }

       if (param->chrono_precision == param->cuda_prec) {
         for (unsigned int j=0; j<basis.size(); j++) m(*Ap[j], *basis[j], *tmp, *tmp2);
       } else if (param->chrono_precision == param->cuda_prec_sloppy) {
         for (unsigned int j=0; j<basis.size(); j++) mSloppy(*Ap[j], *basis[j], *tmp, *tmp2);
       } else {
         errorQuda("Unexpected precision %d for chrono vectors (doesn't match outer %d or sloppy precision %d)",
                   param->chrono_precision, param->cuda_prec, param->cuda_prec_sloppy);
       }

       bool orthogonal = true;
       bool apply_mat = false;
       bool hermitian = false;
       MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert);

       blas::copy(*tmp, *in);
       mre(*out, *tmp, basis, Ap);

       for (auto ap: Ap) {
         if (ap) delete (ap);
       }
       delete tmp;
       if (tmp2 != out) delete tmp2;

       profileInvert.TPSTOP(QUDA_PROFILE_CHRONO);
     }

     Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
     (*solve)(*out, *in);
     solverParam.updateInvertParam(*param);
     delete solve;
   } else if (!norm_error_solve) {
     DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
     SolverParam solverParam(*param);

     // chronological forecasting
     if (param->chrono_use_resident && chronoResident[param->chrono_index].size() > 0) {
       profileInvert.TPSTART(QUDA_PROFILE_CHRONO);

       auto &basis = chronoResident[param->chrono_index];

       ColorSpinorParam cs_param(*basis[0]);
       std::vector<ColorSpinorField*> Ap;
       ColorSpinorField *tmp = ColorSpinorField::Create(cs_param);
       ColorSpinorField *tmp2 = (param->chrono_precision == out->Precision()) ? out : ColorSpinorField::Create(cs_param);
       for (unsigned int k=0; k < basis.size(); k++) {
         Ap.emplace_back((ColorSpinorField::Create(cs_param)));
       }

       if (param->chrono_precision == param->cuda_prec) {
         for (unsigned int j=0; j<basis.size(); j++) m(*Ap[j], *basis[j], *tmp, *tmp2);
       } else if (param->chrono_precision == param->cuda_prec_sloppy) {
         for (unsigned int j=0; j<basis.size(); j++) mSloppy(*Ap[j], *basis[j], *tmp, *tmp2);
       } else {
         errorQuda("Unexpected precision %d for chrono vectors (doesn't match outer %d or sloppy precision %d)",
                   param->chrono_precision, param->cuda_prec, param->cuda_prec_sloppy);
       }

       bool orthogonal = true;
       bool apply_mat = false;
       bool hermitian = true;
       MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert);

       blas::copy(*tmp, *in);
       mre(*out, *tmp, basis, Ap);

       for (auto ap: Ap) {
         if (ap) delete(ap);
       }
       delete tmp;
       if (tmp2 != out) delete tmp2;

       profileInvert.TPSTOP(QUDA_PROFILE_CHRONO);
     }

     Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
     (*solve)(*out, *in);
     solverParam.updateInvertParam(*param);
     delete solve;
   } else { // norm_error_solve
     DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
     cudaColorSpinorField tmp(*out);
     SolverParam solverParam(*param);
     Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
     (*solve)(tmp, *in); // y = (M M^\dag) b
     dirac.Mdag(*out, tmp);  // x = M^dag y
     solverParam.updateInvertParam(*param);
     delete solve;
   }

   if (getVerbosity() >= QUDA_VERBOSE){
     double nx = blas::norm2(*x);
     printfQuda("Solution = %g\n",nx);
   }

   profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE);
   if (param->chrono_make_resident) {
     if(param->chrono_max_dim < 1){
       errorQuda("Cannot chrono_make_resident with chrono_max_dim %i",param->chrono_max_dim);
     }

     const int i = param->chrono_index;
     if (i >= QUDA_MAX_CHRONO)
       errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO);

     auto &basis = chronoResident[i];

     if(param->chrono_max_dim < (int)basis.size()){
       errorQuda("Requested chrono_max_dim %i is smaller than already existing chroology %i",param->chrono_max_dim,(int)basis.size());
     }

     if(not param->chrono_replace_last){
       // if we have not filled the space yet just augment
       if ((int)basis.size() < param->chrono_max_dim) {
         ColorSpinorParam cs_param(*out);
         cs_param.setPrecision(param->chrono_precision);
         basis.emplace_back(ColorSpinorField::Create(cs_param));
       }

       // shuffle every entry down one and bring the last to the front
       ColorSpinorField *tmp = basis[basis.size()-1];
       for (unsigned int j=basis.size()-1; j>0; j--) basis[j] = basis[j-1];
         basis[0] = tmp;
     }
     *(basis[0]) = *out; // set first entry to new solution
   }
   dirac.reconstruct(*x, *b, param->solution_type);

   if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
     // rescale the solution
     blas::ax(sqrt(nb), *x);
   }
   profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE);

   if (!param->make_resident_solution) {
     profileInvert.TPSTART(QUDA_PROFILE_D2H);
     *h_x = *x;
     profileInvert.TPSTOP(QUDA_PROFILE_D2H);
   }

   profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE);

   if (param->compute_action) {
     Complex action = blas::cDotProduct(*b, *x);
     param->action[0] = action.real();
     param->action[1] = action.imag();
   }

   if (getVerbosity() >= QUDA_VERBOSE){
     double nx = blas::norm2(*x);
     double nh_x = blas::norm2(*h_x);
     printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
   }
   profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE);

   profileInvert.TPSTART(QUDA_PROFILE_FREE);

   delete h_b;
   delete h_x;
   delete b;

   if (param->use_resident_solution && !param->make_resident_solution) {
     for (auto v: solutionResident) if (v) delete v;
     solutionResident.clear();
   } else if (!param->make_resident_solution) {
     delete x;
   }

   delete d;
   delete dSloppy;
   delete dPre;

   profileInvert.TPSTOP(QUDA_PROFILE_FREE);

   popVerbosity();

   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();

   profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);

   profilerStop(__func__);
 }


 void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param)
 {
   // currently that code is just a copy of invertQuda and cannot work
   profileInvert.TPSTART(QUDA_PROFILE_TOTAL);

   if (!initialized) errorQuda("QUDA not initialized");

   pushVerbosity(param->verbosity);
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(param);

   checkInvertParam(param, _hp_x[0], _hp_b[0]);

   // check the gauge fields have been created
   cudaGaugeField *cudaGauge = checkGauge(param);

   // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
   // solve_type and solution_type, rather than in separate members of QudaInvertParam.  We're stuck with it
   // for now, though, so here we factorize everything for convenience.

   bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
     (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
   bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
     (param->solve_type == QUDA_NORMOP_PC_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE);
   bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||
     (param->solution_type ==  QUDA_MATPC_SOLUTION);
   bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||
     (param->solve_type == QUDA_DIRECT_PC_SOLVE);
   bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||
     (param->solve_type == QUDA_NORMERR_PC_SOLVE);

   param->secs = 0;
   param->gflops = 0;
   param->iter = 0;

   Dirac *d = nullptr;
   Dirac *dSloppy = nullptr;
   Dirac *dPre = nullptr;

   // create the dirac operator
   createDirac(d, dSloppy, dPre, *param, pc_solve);

   Dirac &dirac = *d;
   Dirac &diracSloppy = *dSloppy;
   Dirac &diracPre = *dPre;

   profileInvert.TPSTART(QUDA_PROFILE_H2D);

   // std::vector<ColorSpinorField*> b;  // Cuda Solutions
   // b.resize(param->num_src);
   // std::vector<ColorSpinorField*> x;  // Cuda Solutions
   // x.resize(param->num_src);
   ColorSpinorField* in;  // = nullptr;
   //in.resize(param->num_src);
   ColorSpinorField* out;  // = nullptr;
   //out.resize(param->num_src);

   // for(int i=0;i < param->num_src;i++){
   //   in[i] = nullptr;
   //   out[i] = nullptr;
   // }

   const int *X = cudaGauge->X();


   // Host pointers for x, take a copy of the input host pointers
   void** hp_x;
   hp_x = new void* [ param->num_src ];

   void** hp_b;
   hp_b = new void* [param->num_src];

   for(int i=0;i < param->num_src;i++){
     hp_x[i] = _hp_x[i];
     hp_b[i] = _hp_b[i];
   }

   // wrap CPU host side pointers
   ColorSpinorParam cpuParam(hp_b[0], *param, X, pc_solution, param->input_location);
   std::vector<ColorSpinorField*> h_b;
   h_b.resize(param->num_src);
   for(int i=0; i < param->num_src; i++) {
     cpuParam.v = hp_b[i]; //MW seems wird in the loop
     h_b[i] = ColorSpinorField::Create(cpuParam);
   }

  // cpuParam.v = hp_x;
   cpuParam.location = param->output_location;
   std::vector<ColorSpinorField*> h_x;
   h_x.resize(param->num_src);
 //
   for(int i=0; i < param->num_src; i++) {
     cpuParam.v = hp_x[i]; //MW seems wird in the loop
     h_x[i] = ColorSpinorField::Create(cpuParam);
   }


   // MW currently checked until here

   // download source
   printfQuda("Setup b\n");
   ColorSpinorParam cudaParam(cpuParam, *param);
   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   cudaParam.is_composite = true;
   cudaParam.composite_dim = param->num_src;

   printfQuda("Create b \n");
   ColorSpinorField *b = ColorSpinorField::Create(cudaParam);


   for(int i=0; i < param->num_src; i++) {
     b->Component(i) = *h_b[i];
   }
   printfQuda("Done b \n");

     ColorSpinorField *x;
   if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
     // initial guess only supported for single-pass solvers
     if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) &&
         (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
       errorQuda("Initial guess not supported for two-pass solver");
     }
     cudaParam.is_composite = true;
     cudaParam.is_component = false;
     cudaParam.composite_dim = param->num_src;

     x = ColorSpinorField::Create(cudaParam);
     for(int i=0; i < param->num_src; i++) {
       x->Component(i) = *h_x[i];
     }

   } else { // zero initial guess
     // Create the solution fields filled with zero
     cudaParam.create = QUDA_ZERO_FIELD_CREATE;
       printfQuda("Create x \n");
     x = ColorSpinorField::Create(cudaParam);
       printfQuda("Done x \n");
  // solution
   }

   profileInvert.TPSTOP(QUDA_PROFILE_H2D);

   auto * nb = new double[param->num_src];
   for(int i=0; i < param->num_src; i++) {
     nb[i] = blas::norm2(b->Component(i));
     printfQuda("Source %i: CPU = %g, CUDA copy = %g\n", i, nb[i], nb[i]);
     if (nb[i]==0.0) errorQuda("Source has zero norm");

     if (getVerbosity() >= QUDA_VERBOSE) {
       double nh_b = blas::norm2(*h_b[i]);
       double nh_x = blas::norm2(*h_x[i]);
       double nx = blas::norm2(x->Component(i));
       printfQuda("Source %i: CPU = %g, CUDA copy = %g\n", i, nh_b, nb[i]);
       printfQuda("Solution %i: CPU = %g, CUDA copy = %g\n", i, nh_x, nx);
     }
   }

   // MW checked until here do far

   // rescale the source and solution vectors to help prevent the onset of underflow
   if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
     for(int i=0; i < param->num_src; i++) {
       blas::ax(1.0/sqrt(nb[i]), b->Component(i));
       blas::ax(1.0/sqrt(nb[i]), x->Component(i));
     }
   }

   for(int i=0; i < param->num_src; i++) {
     massRescale(dynamic_cast<cudaColorSpinorField&>( b->Component(i) ), *param);
   }

   // MW: need to check what dirac.prepare does
   // for now let's just try looping of num_rhs already here???
   // for(int i=0; i < param->num_src; i++) {
     dirac.prepare(in, out, *x, *b, param->solution_type);
 for(int i=0; i < param->num_src; i++) {
     if (getVerbosity() >= QUDA_VERBOSE) {
       double nin = blas::norm2((in->Component(i)));
       double nout = blas::norm2((out->Component(i)));
       printfQuda("Prepared source %i = %g\n", i, nin);
       printfQuda("Prepared solution %i = %g\n", i, nout);
     }

     if (getVerbosity() >= QUDA_VERBOSE) {
       double nin = blas::norm2(in->Component(i));
       printfQuda("Prepared source %i post mass rescale = %g\n", i, nin);
     }
   }

     // solution_type specifies *what* system is to be solved.
     // solve_type specifies *how* the system is to be solved.
     //
     // We have the following four cases (plus preconditioned variants):
     //
     // solution_type    solve_type    Effect
     // -------------    ----------    ------
     // MAT              DIRECT        Solve Ax=b
     // MATDAG_MAT       DIRECT        Solve A^dag y = b, followed by Ax=y
     // MAT              NORMOP        Solve (A^dag A) x = (A^dag b)
     // MATDAG_MAT       NORMOP        Solve (A^dag A) x = b
     // MAT              NORMERR       Solve (A A^dag) y = b, then x = A^dag y
     //
     // We generally require that the solution_type and solve_type
     // preconditioning match.  As an exception, the unpreconditioned MAT
     // solution_type may be used with any solve_type, including
     // DIRECT_PC and NORMOP_PC.  In these cases, preparation of the
     // preconditioned source and reconstruction of the full solution are
     // taken care of by Dirac::prepare() and Dirac::reconstruct(),
     // respectively.

     if (pc_solution && !pc_solve) {
       errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
     }

     if (!mat_solution && !pc_solution && pc_solve) {
       errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
     }

     if (!mat_solution && norm_error_solve) {
       errorQuda("Normal-error solve requires Mat solution");
     }

     if (param->inv_type_precondition == QUDA_MG_INVERTER && (pc_solve || pc_solution || !direct_solve || !mat_solution))
       errorQuda("Multigrid preconditioning only supported for direct non-red-black solve");

     if (mat_solution && !direct_solve && !norm_error_solve) { // prepare source: b' = A^dag b
       for(int i=0; i < param->num_src; i++) {
         cudaColorSpinorField tmp((in->Component(i)));
         dirac.Mdag(in->Component(i), tmp);
       }
     } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b
       DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
       SolverParam solverParam(*param);
       Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
       solve->blocksolve(*out,*in);
       for(int i=0; i < param->num_src; i++) {
         blas::copy(in->Component(i), out->Component(i));
       }
       solverParam.updateInvertParam(*param);
       delete solve;
     }

     if (direct_solve) {
       DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
       SolverParam solverParam(*param);
       Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
       solve->blocksolve(*out,*in);
       solverParam.updateInvertParam(*param);
       delete solve;
     } else if (!norm_error_solve) {
       DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
       SolverParam solverParam(*param);
       Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
       solve->blocksolve(*out,*in);
       solverParam.updateInvertParam(*param);
       delete solve;
     } else { // norm_error_solve
       DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
       errorQuda("norm_error_solve not supported in multi source solve");
       //cudaColorSpinorField tmp(*out);
       // SolverParam solverParam(*param);
       //Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
       //(*solve)(tmp, *in); // y = (M M^\dag) b
       //dirac.Mdag(*out, tmp);  // x = M^dag y
       //solverParam.updateInvertParam(*param,i,i);
       // delete solve;
     }

     if (getVerbosity() >= QUDA_VERBOSE){
       for(int i=0; i < param->num_src; i++) {
         double nx = blas::norm2(x->Component(i));
         printfQuda("Solution %i = %g\n",i, nx);
       }
     }


   profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE);
   for(int i=0; i< param->num_src; i++){
     dirac.reconstruct(x->Component(i), b->Component(i), param->solution_type);
   }
   profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE);

   if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
     for(int i=0; i< param->num_src; i++){
       // rescale the solution
       blas::ax(sqrt(nb[i]), x->Component(i));
     }
   }

   // MW -- not sure how to handle that here
   if (!param->make_resident_solution) {
     profileInvert.TPSTART(QUDA_PROFILE_D2H);
     for(int i=0; i< param->num_src; i++){
       *h_x[i] = x->Component(i);
     }
     profileInvert.TPSTOP(QUDA_PROFILE_D2H);
   }

   if (getVerbosity() >= QUDA_VERBOSE){
     for(int i=0; i< param->num_src; i++){
       double nx = blas::norm2(x->Component(i));
       double nh_x = blas::norm2(*h_x[i]);
       printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
     }
   }

   //FIX need to make sure all deletes are correct again
   for(int i=0; i < param->num_src; i++){
     delete h_x[i];
     // delete x[i];
     delete h_b[i];
     // delete b[i];
   }
    delete [] hp_b;
    delete [] hp_x;
 //   delete [] b;
 //  if (!param->make_resident_solution) delete x; // FIXME make this cleaner

   delete d;
   delete dSloppy;
   delete dPre;
   delete x;
   delete b;

   popVerbosity();

   // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();

   profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
 {
   profilerStart(__func__);

   profileMulti.TPSTART(QUDA_PROFILE_TOTAL);
   profileMulti.TPSTART(QUDA_PROFILE_INIT);

   if (!initialized) errorQuda("QUDA not initialized");

   checkInvertParam(param, _hp_x[0], _hp_b);

   // check the gauge fields have been created
   checkGauge(param);

   if (param->num_offset > QUDA_MAX_MULTI_SHIFT)
     errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",
         param->num_offset, QUDA_MAX_MULTI_SHIFT);

   pushVerbosity(param->verbosity);

   bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
   bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);
   bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type ==  QUDA_MATPC_SOLUTION);
   bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE);


   if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
       param->dslash_type == QUDA_STAGGERED_DSLASH) {

     if (param->solution_type != QUDA_MATPC_SOLUTION) {
       errorQuda("For Staggered-type fermions, multi-shift solver only suports MATPC solution type");
     }

     if (param->solve_type != QUDA_DIRECT_PC_SOLVE) {
       errorQuda("For Staggered-type fermions, multi-shift solver only supports DIRECT_PC solve types");
     }

   } else { // Wilson type

     if (mat_solution) {
       errorQuda("For Wilson-type fermions, multi-shift solver does not support MAT or MATPC solution types");
     }
     if (direct_solve) {
       errorQuda("For Wilson-type fermions, multi-shift solver does not support DIRECT or DIRECT_PC solve types");
     }
     if (pc_solution & !pc_solve) {
       errorQuda("For Wilson-type fermions, preconditioned (PC) solution_type requires a PC solve_type");
     }
     if (!pc_solution & pc_solve) {
       errorQuda("For Wilson-type fermions, in multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
     }
   }

   // Timing and FLOP counters
   param->secs = 0;
   param->gflops = 0;
   param->iter = 0;

   for (int i=0; i<param->num_offset-1; i++) {
     for (int j=i+1; j<param->num_offset; j++) {
       if (param->offset[i] > param->offset[j])
         errorQuda("Offsets must be ordered from smallest to largest");
     }
   }

   // Host pointers for x, take a copy of the input host pointers
   void** hp_x;
   hp_x = new void* [ param->num_offset ];

   void* hp_b = _hp_b;
   for(int i=0;i < param->num_offset;i++){
     hp_x[i] = _hp_x[i];
   }

   // Create the matrix.
   // The way this works is that createDirac will create 'd' and 'dSloppy'
   // which are global. We then grab these with references...
   //
   // Balint: Isn't there a nice construction pattern we could use here? This is
   // expedient but yucky.
   //  DiracParam diracParam;
   if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
       param->dslash_type == QUDA_STAGGERED_DSLASH){
     param->mass = sqrt(param->offset[0]/4);
   }

   Dirac *d = nullptr;
   Dirac *dSloppy = nullptr;
   Dirac *dPre = nullptr;
   Dirac *dRefine = nullptr;

   // create the dirac operator
   createDirac(d, dSloppy, dPre, dRefine, *param, pc_solve);
   Dirac &dirac = *d;
   Dirac &diracSloppy = *dSloppy;


   cudaColorSpinorField *b = nullptr;   // Cuda RHS
   std::vector<ColorSpinorField*> x;  // Cuda Solutions
   x.resize(param->num_offset);
   std::vector<ColorSpinorField*> p;
   std::unique_ptr<double[]> r2_old(new double[param->num_offset]);

   // Grab the dimension array of the input gauge field.
   const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ?
     gaugeFatPrecise->X() : gaugePrecise->X();

   // This creates a ColorSpinorParam struct, from the host data
   // pointer, the definitions in param, the dimensions X, and whether
   // the solution is on a checkerboard instruction or not. These can
   // then be used as 'instructions' to create the actual
   // ColorSpinorField
   ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution, param->input_location);
   ColorSpinorField *h_b = ColorSpinorField::Create(cpuParam);

   std::vector<ColorSpinorField*> h_x;
   h_x.resize(param->num_offset);

   cpuParam.location = param->output_location;
   for(int i=0; i < param->num_offset; i++) {
     cpuParam.v = hp_x[i];
     h_x[i] = ColorSpinorField::Create(cpuParam);
   }

   profileMulti.TPSTOP(QUDA_PROFILE_INIT);
   profileMulti.TPSTART(QUDA_PROFILE_H2D);
   // Now I need a colorSpinorParam for the device
   ColorSpinorParam cudaParam(cpuParam, *param);
   // This setting will download a host vector
   cudaParam.create = QUDA_COPY_FIELD_CREATE;
   b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it
   profileMulti.TPSTOP(QUDA_PROFILE_H2D);

   profileMulti.TPSTART(QUDA_PROFILE_INIT);
   // Create the solution fields filled with zero
   cudaParam.create = QUDA_ZERO_FIELD_CREATE;

   // now check if we need to invalidate the solutionResident vectors
   bool invalidate = false;
   for (auto v : solutionResident)
     if (cudaParam.Precision() != v->Precision()) { invalidate = true; break; }

   if (invalidate) {
     for (auto v : solutionResident) delete v;
     solutionResident.clear();
   }

   // grow resident solutions to be big enough
   for (int i=solutionResident.size(); i < param->num_offset; i++) {
     solutionResident.push_back(new cudaColorSpinorField(cudaParam));
   }
   for (int i=0; i < param->num_offset; i++) x[i] = solutionResident[i];

   profileMulti.TPSTOP(QUDA_PROFILE_INIT);


   profileMulti.TPSTART(QUDA_PROFILE_PREAMBLE);

   // Check source norms
   double nb = blas::norm2(*b);
   if (nb==0.0) errorQuda("Source has zero norm");

   if(getVerbosity() >= QUDA_VERBOSE ) {
     double nh_b = blas::norm2(*h_b);
     printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
   }

   // rescale the source vector to help prevent the onset of underflow
   if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
     blas::ax(1.0/sqrt(nb), *b);
   }

   massRescale(*b, *param);
   profileMulti.TPSTOP(QUDA_PROFILE_PREAMBLE);

   DiracMatrix *m, *mSloppy;

   if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
       param->dslash_type == QUDA_STAGGERED_DSLASH) {
     m = new DiracM(dirac);
     mSloppy = new DiracM(diracSloppy);
   } else {
     m = new DiracMdagM(dirac);
     mSloppy = new DiracMdagM(diracSloppy);
   }

   SolverParam solverParam(*param);
   MultiShiftCG cg_m(*m, *mSloppy, solverParam, profileMulti);
   cg_m(x, *b, p, r2_old.get());
   solverParam.updateInvertParam(*param);

   delete m;
   delete mSloppy;

   if (param->compute_true_res) {
     // check each shift has the desired tolerance and use sequential CG to refine
     profileMulti.TPSTART(QUDA_PROFILE_INIT);
     cudaParam.create = QUDA_ZERO_FIELD_CREATE;
     cudaColorSpinorField r(*b, cudaParam);
     profileMulti.TPSTOP(QUDA_PROFILE_INIT);
     QudaInvertParam refineparam = *param;
     refineparam.cuda_prec_sloppy = param->cuda_prec_refinement_sloppy;
     Dirac &dirac = *d;
     Dirac &diracSloppy = *dRefine;

 #define REFINE_INCREASING_MASS
 #ifdef REFINE_INCREASING_MASS
     for(int i=0; i < param->num_offset; i++) {
 #else
     for(int i=param->num_offset-1; i >= 0; i--) {
 #endif
       double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
   param->true_res_hq_offset[i] : 0;
       double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
   param->tol_hq_offset[i] : 0;

       /*
   In the case where the shifted systems have zero tolerance
   specified, we refine these systems until either the limit of
   precision is reached (prec_tol) or until the tolerance reaches
   the iterated residual tolerance of the previous multi-shift
   solver (iter_res_offset[i]), which ever is greater.
       */
       const double prec_tol = std::pow(10.,(-2*(int)param->cuda_prec+4)); // implicit refinment limit of 1e-12
       const double iter_tol = (param->iter_res_offset[i] < prec_tol ? prec_tol : (param->iter_res_offset[i] *1.1));
       const double refine_tol = (param->tol_offset[i] == 0.0 ? iter_tol : param->tol_offset[i]);
       // refine if either L2 or heavy quark residual tolerances have not been met, only if desired residual is > 0
       if (param->true_res_offset[i] > refine_tol || rsd_hq > tol_hq) {
   if (getVerbosity() >= QUDA_SUMMARIZE)
     printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
          i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);

         // for staggered the shift is just a change in mass term (FIXME: for twisted mass also)
         if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
             param->dslash_type == QUDA_STAGGERED_DSLASH) {
           dirac.setMass(sqrt(param->offset[i]/4));
           diracSloppy.setMass(sqrt(param->offset[i]/4));
         }

         DiracMatrix *m, *mSloppy;

         if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
             param->dslash_type == QUDA_STAGGERED_DSLASH) {
           m = new DiracM(dirac);
           mSloppy = new DiracM(diracSloppy);
         } else {
           m = new DiracMdagM(dirac);
           mSloppy = new DiracMdagM(diracSloppy);
         }

   // need to curry in the shift if we are not doing staggered
   if (param->dslash_type != QUDA_ASQTAD_DSLASH &&
       param->dslash_type != QUDA_STAGGERED_DSLASH) {
     m->shift = param->offset[i];
     mSloppy->shift = param->offset[i];
   }

   if (false) { // experimenting with Minimum residual extrapolation
     // only perform MRE using current and previously refined solutions
 #ifdef REFINE_INCREASING_MASS
     const int nRefine = i+1;
 #else
     const int nRefine = param->num_offset - i + 1;
 #endif

     std::vector<ColorSpinorField*> q;
     q.resize(nRefine);
     std::vector<ColorSpinorField*> z;
     z.resize(nRefine);
     cudaParam.create = QUDA_NULL_FIELD_CREATE;
     cudaColorSpinorField tmp(cudaParam);

     for(int j=0; j < nRefine; j++) {
       q[j] = new cudaColorSpinorField(cudaParam);
       z[j] = new cudaColorSpinorField(cudaParam);
     }

     *z[0] = *x[0]; // zero solution already solved
 #ifdef REFINE_INCREASING_MASS
     for (int j=1; j<nRefine; j++) *z[j] = *x[j];
 #else
     for (int j=1; j<nRefine; j++) *z[j] = *x[param->num_offset-j];
 #endif

     bool orthogonal = true;
     bool apply_mat = true;
           bool hermitian = true;
     MinResExt mre(*m, orthogonal, apply_mat, hermitian, profileMulti);
     blas::copy(tmp, *b);
     mre(*x[i], tmp, z, q);

     for(int j=0; j < nRefine; j++) {
       delete q[j];
       delete z[j];
     }
   }

   SolverParam solverParam(refineparam);
   solverParam.iter = 0;
   solverParam.use_init_guess = QUDA_USE_INIT_GUESS_YES;
   solverParam.tol = (param->tol_offset[i] > 0.0 ?  param->tol_offset[i] : iter_tol); // set L2 tolerance
   solverParam.tol_hq = param->tol_hq_offset[i]; // set heavy quark tolerance
         solverParam.delta = param->reliable_delta_refinement;

         {
           CG cg(*m, *mSloppy, solverParam, profileMulti);
           if (i==0)
             cg(*x[i], *b, p[i], r2_old[i]);
           else
             cg(*x[i], *b);
         }

         solverParam.true_res_offset[i] = solverParam.true_res;
         solverParam.true_res_hq_offset[i] = solverParam.true_res_hq;
         solverParam.updateInvertParam(*param,i);

         if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
             param->dslash_type == QUDA_STAGGERED_DSLASH) {
           dirac.setMass(sqrt(param->offset[0]/4)); // restore just in case
           diracSloppy.setMass(sqrt(param->offset[0]/4)); // restore just in case
         }

         delete m;
         delete mSloppy;

       }
     }
   }

   // restore shifts -- avoid side effects
   for(int i=0; i < param->num_offset; i++) {
     param->offset[i] = unscaled_shifts[i];
   }

   profileMulti.TPSTART(QUDA_PROFILE_D2H);

   if (param->compute_action) {
     Complex action(0);
     for (int i=0; i<param->num_offset; i++) action += param->residue[i] * blas::cDotProduct(*b, *x[i]);
     param->action[0] = action.real();
     param->action[1] = action.imag();
   }

   for(int i=0; i < param->num_offset; i++) {
     if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) { // rescale the solution
       blas::ax(sqrt(nb), *x[i]);
     }

     if (getVerbosity() >= QUDA_VERBOSE){
       double nx = blas::norm2(*x[i]);
       printfQuda("Solution %d = %g\n", i, nx);
     }

     if (!param->make_resident_solution) *h_x[i] = *x[i];
   }
   profileMulti.TPSTOP(QUDA_PROFILE_D2H);

   profileMulti.TPSTART(QUDA_PROFILE_EPILOGUE);

   if (!param->make_resident_solution) {
     for (auto v: solutionResident) if (v) delete v;
     solutionResident.clear();
   }

   profileMulti.TPSTOP(QUDA_PROFILE_EPILOGUE);

   profileMulti.TPSTART(QUDA_PROFILE_FREE);
   for(int i=0; i < param->num_offset; i++){
     delete h_x[i];
     //if (!param->make_resident_solution) delete x[i];
   }

   delete h_b;
   delete b;

   delete [] hp_x;

   delete d;
   delete dSloppy;
   delete dPre;
   delete dRefine;
   for (auto& pp : p) delete pp;

   profileMulti.TPSTOP(QUDA_PROFILE_FREE);

   popVerbosity();

   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();

   profileMulti.TPSTOP(QUDA_PROFILE_TOTAL);

   profilerStop(__func__);
 }

 void computeKSLinkQuda(void* fatlink, void* longlink, void* ulink, void* inlink, double *path_coeff, QudaGaugeParam *param) {

 #ifdef GPU_FATLINK
   profileFatLink.TPSTART(QUDA_PROFILE_TOTAL);
   profileFatLink.TPSTART(QUDA_PROFILE_INIT);

   checkGaugeParam(param);

   if (ulink) {
     const double unitarize_eps = 1e-14;
     const double max_error = 1e-10;
     const int reunit_allow_svd = 1;
     const int reunit_svd_only  = 0;
     const double svd_rel_error = 1e-6;
     const double svd_abs_error = 1e-6;
     quda::setUnitarizeLinksConstants(unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only,
              svd_rel_error, svd_abs_error);
   }

   GaugeFieldParam gParam(fatlink, *param, QUDA_GENERAL_LINKS);
   cpuGaugeField cpuFatLink(gParam);   // create the host fatlink
   gParam.gauge = longlink;
   cpuGaugeField cpuLongLink(gParam);  // create the host longlink
   gParam.gauge = ulink;
   cpuGaugeField cpuUnitarizedLink(gParam);
   gParam.link_type = param->type;
   gParam.gauge     = inlink;
   cpuGaugeField cpuInLink(gParam);    // create the host sitelink

   // create the device fields
   gParam.reconstruct = param->reconstruct;
   gParam.setPrecision(param->cuda_prec, true);
   gParam.create      = QUDA_NULL_FIELD_CREATE;
   cudaGaugeField *cudaInLink = new cudaGaugeField(gParam);

   profileFatLink.TPSTOP(QUDA_PROFILE_INIT);

   profileFatLink.TPSTART(QUDA_PROFILE_H2D);
   cudaInLink->loadCPUField(cpuInLink);
   profileFatLink.TPSTOP(QUDA_PROFILE_H2D);

   cudaGaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink);

   profileFatLink.TPSTART(QUDA_PROFILE_FREE);
   delete cudaInLink;
   profileFatLink.TPSTOP(QUDA_PROFILE_FREE);

   gParam.create = QUDA_ZERO_FIELD_CREATE;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.setPrecision(param->cuda_prec, true);
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   cudaGaugeField *cudaFatLink = new cudaGaugeField(gParam);
   cudaGaugeField *cudaUnitarizedLink = ulink ? new cudaGaugeField(gParam) : nullptr;
   cudaGaugeField *cudaLongLink = longlink ? new cudaGaugeField(gParam) : nullptr;

   profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
   fatLongKSLink(cudaFatLink, cudaLongLink, *cudaInLinkEx, path_coeff);
   profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);

   if (ulink) {
     profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
     *num_failures_h = 0;
     quda::unitarizeLinks(*cudaUnitarizedLink, *cudaFatLink, num_failures_d); // unitarize on the gpu
     if (*num_failures_h>0) errorQuda("Error in unitarization component of the hisq fattening: %d failures\n", *num_failures_h);
     profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
   }

   profileFatLink.TPSTART(QUDA_PROFILE_D2H);
   if (ulink) cudaUnitarizedLink->saveCPUField(cpuUnitarizedLink);
   if (fatlink) cudaFatLink->saveCPUField(cpuFatLink);
   if (longlink) cudaLongLink->saveCPUField(cpuLongLink);
   profileFatLink.TPSTOP(QUDA_PROFILE_D2H);

   profileFatLink.TPSTART(QUDA_PROFILE_FREE);
   delete cudaFatLink;
   if (longlink) delete cudaLongLink;
   if (ulink) delete cudaUnitarizedLink;
   delete cudaInLinkEx;
   profileFatLink.TPSTOP(QUDA_PROFILE_FREE);

   profileFatLink.TPSTOP(QUDA_PROFILE_TOTAL);
 #else
   errorQuda("Fat-link has not been built");
 #endif // GPU_FATLINK
 }

 int getGaugePadding(GaugeFieldParam& param){
   int pad = 0;
 #ifdef MULTI_GPU
   int volume = param.x[0]*param.x[1]*param.x[2]*param.x[3];
   int face_size[4];
   for(int dir=0; dir<4; ++dir) face_size[dir] = (volume/param.x[dir])/2;
   pad = *std::max_element(face_size, face_size+4);
 #endif

   return pad;
 }

 int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int* path_length,
         double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam)
 {
 #ifdef GPU_GAUGE_FORCE
   profileGaugeForce.TPSTART(QUDA_PROFILE_TOTAL);
   profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);

   checkGaugeParam(qudaGaugeParam);

   GaugeFieldParam gParam(siteLink, *qudaGaugeParam);
   gParam.site_offset = qudaGaugeParam->gauge_offset;
   gParam.site_size = qudaGaugeParam->site_size;
   cpuGaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new cpuGaugeField(gParam) : nullptr;

   cudaGaugeField* cudaSiteLink = nullptr;

   if (qudaGaugeParam->use_resident_gauge) {
     if (!gaugePrecise) errorQuda("No resident gauge field to use");
     cudaSiteLink = gaugePrecise;
   } else {
     gParam.create = QUDA_NULL_FIELD_CREATE;
     gParam.reconstruct = qudaGaugeParam->reconstruct;
     gParam.order = (qudaGaugeParam->reconstruct == QUDA_RECONSTRUCT_NO ||
         qudaGaugeParam->cuda_prec == QUDA_DOUBLE_PRECISION) ?
       QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;

     cudaSiteLink = new cudaGaugeField(gParam);
     profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);

     profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
     cudaSiteLink->loadCPUField(*cpuSiteLink);
     profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);

     profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);
   }

   GaugeFieldParam gParamMom(mom, *qudaGaugeParam, QUDA_ASQTAD_MOM_LINKS);
   // FIXME - test program always uses MILC for mom but can use QDP for gauge
   if (gParamMom.order == QUDA_QDP_GAUGE_ORDER) gParamMom.order = QUDA_MILC_GAUGE_ORDER;
   if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
   else gParamMom.reconstruct = QUDA_RECONSTRUCT_10;

   gParamMom.site_offset = qudaGaugeParam->mom_offset;
   gParamMom.site_size = qudaGaugeParam->site_size;
   cpuGaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? new cpuGaugeField(gParamMom) : nullptr;

   cudaGaugeField* cudaMom = nullptr;
   if (qudaGaugeParam->use_resident_mom) {
     if (!momResident) errorQuda("No resident momentum field to use");
     cudaMom = momResident;
     if (qudaGaugeParam->overwrite_mom) cudaMom->zero();
     profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
   } else {
     gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
     gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
     gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
     gParamMom.setPrecision(qudaGaugeParam->cuda_prec, true);
     gParamMom.create = QUDA_ZERO_FIELD_CREATE;
     cudaMom = new cudaGaugeField(gParamMom);
     profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
     if (!qudaGaugeParam->overwrite_mom) {
       profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
       cudaMom->loadCPUField(*cpuMom);
       profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
     }
   }

   cudaGaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugeForce);

   // actually do the computation
   profileGaugeForce.TPSTART(QUDA_PROFILE_COMPUTE);
   if (!forceMonitor()) {
     gaugeForce(*cudaMom, *cudaGauge, eb3, input_path_buf,  path_length, loop_coeff, num_paths, max_length);
   } else {
     // if we are monitoring the force, separate the force computation from the momentum update
     GaugeFieldParam gParam(*cudaMom);
     gParam.create = QUDA_ZERO_FIELD_CREATE;
     GaugeField *force = GaugeField::Create(gParam);
     gaugeForce(*force, *cudaGauge, 1.0, input_path_buf,  path_length, loop_coeff, num_paths, max_length);
     updateMomentum(*cudaMom, eb3, *force, "gauge");
     delete force;
   }
   profileGaugeForce.TPSTOP(QUDA_PROFILE_COMPUTE);

   if (qudaGaugeParam->return_result_mom) {
     profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
     cudaMom->saveCPUField(*cpuMom);
     profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
   }

   profileGaugeForce.TPSTART(QUDA_PROFILE_FREE);
   if (qudaGaugeParam->make_resident_gauge) {
     if (gaugePrecise && gaugePrecise != cudaSiteLink) delete gaugePrecise;
     gaugePrecise = cudaSiteLink;
   } else {
     delete cudaSiteLink;
   }

   if (qudaGaugeParam->make_resident_mom) {
     if (momResident && momResident != cudaMom) delete momResident;
     momResident = cudaMom;
   } else {
     delete cudaMom;
   }

   if (cpuSiteLink) delete cpuSiteLink;
   if (cpuMom) delete cpuMom;

   if (qudaGaugeParam->make_resident_gauge) {
     if (extendedGaugeResident) delete extendedGaugeResident;
     extendedGaugeResident = cudaGauge;
   } else {
     delete cudaGauge;
   }
   profileGaugeForce.TPSTOP(QUDA_PROFILE_FREE);

   profileGaugeForce.TPSTOP(QUDA_PROFILE_TOTAL);

   checkCudaError();
 #else
   errorQuda("Gauge force has not been built");
 #endif // GPU_GAUGE_FORCE
   return 0;
 }

 void createCloverQuda(QudaInvertParam* invertParam)
 {
   profileClover.TPSTART(QUDA_PROFILE_TOTAL);
   if (!cloverPrecise) errorQuda("Clover field not allocated");

   QudaReconstructType recon = (gaugePrecise->Reconstruct() == QUDA_RECONSTRUCT_8) ? QUDA_RECONSTRUCT_12 : gaugePrecise->Reconstruct();
   // for clover we optimize to only send depth 1 halos in y/z/t (FIXME - make work for x, make robust in general)
   int R[4];
   for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d));
   cudaGaugeField *gauge = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profileClover, false, recon);

   profileClover.TPSTART(QUDA_PROFILE_INIT);
   // create the Fmunu field
   GaugeFieldParam tensorParam(gaugePrecise->X(), gauge->Precision(), QUDA_RECONSTRUCT_NO, 0, QUDA_TENSOR_GEOMETRY);
   tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
   tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   cudaGaugeField Fmunu(tensorParam);
   profileClover.TPSTOP(QUDA_PROFILE_INIT);

   profileClover.TPSTART(QUDA_PROFILE_COMPUTE);
   computeFmunu(Fmunu, *gauge);
   computeClover(*cloverPrecise, Fmunu, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION);
   profileClover.TPSTOP(QUDA_PROFILE_COMPUTE);

   profileClover.TPSTOP(QUDA_PROFILE_TOTAL);

   // FIXME always preserve the extended gauge
   extendedGaugeResident = gauge;
 }

 void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
 {
   GaugeFieldParam gParam(gauge, *param, QUDA_GENERAL_LINKS);
   gParam.geometry = static_cast<QudaFieldGeometry>(geometry);
   if (geometry != QUDA_SCALAR_GEOMETRY && geometry != QUDA_VECTOR_GEOMETRY)
     errorQuda("Only scalar and vector geometries are supported\n");

   cpuGaugeField *cpuGauge = nullptr;
   if (gauge) cpuGauge = new cpuGaugeField(gParam);

   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   gParam.create = QUDA_ZERO_FIELD_CREATE;
   auto* cudaGauge = new cudaGaugeField(gParam);

   if (gauge) {
     cudaGauge->loadCPUField(*cpuGauge);
     delete cpuGauge;
   }

   return cudaGauge;
 }


 void saveGaugeFieldQuda(void* gauge, void* inGauge, QudaGaugeParam* param){

   auto* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);

   GaugeFieldParam gParam(gauge, *param, QUDA_GENERAL_LINKS);
   gParam.geometry = cudaGauge->Geometry();

   cpuGaugeField cpuGauge(gParam);
   cudaGauge->saveCPUField(cpuGauge);

 }


 void destroyGaugeFieldQuda(void* gauge){
   auto* g = reinterpret_cast<cudaGaugeField*>(gauge);
   delete g;
 }


 void computeStaggeredForceQuda(void* h_mom, double dt, double delta, void *h_force, void **x,
              QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
 {
   profileStaggeredForce.TPSTART(QUDA_PROFILE_TOTAL);
   profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT);

   GaugeFieldParam gParam(h_mom, *gauge_param, QUDA_ASQTAD_MOM_LINKS);

   // create the host momentum field
   gParam.reconstruct = gauge_param->reconstruct;
   gParam.t_boundary = QUDA_PERIODIC_T;
   cpuGaugeField cpuMom(gParam);

   // create the host momentum field
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.gauge = h_force;
   cpuGaugeField cpuForce(gParam);

   // create the device momentum field
   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
   gParam.create = QUDA_ZERO_FIELD_CREATE; // FIXME
   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
   cudaGaugeField *cudaMom = !gauge_param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr;

   // create temporary field for quark-field outer product
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.create = QUDA_ZERO_FIELD_CREATE;
   cudaGaugeField cudaForce(gParam);
   GaugeField *cudaForce_[2] = {&cudaForce};

   ColorSpinorParam qParam;
   qParam.location = QUDA_CUDA_FIELD_LOCATION;
   qParam.nColor = 3;
   qParam.nSpin = 1;
   qParam.siteSubset = QUDA_FULL_SITE_SUBSET;
   qParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
   qParam.nDim = 5; // 5 since staggered mrhs
   qParam.setPrecision(gParam.Precision());
   qParam.pad = 0;
   for(int dir=0; dir<4; ++dir) qParam.x[dir] = gParam.x[dir];
   qParam.x[4] = 1;
   qParam.create = QUDA_NULL_FIELD_CREATE;
   qParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
   qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;

   profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT);
   profileStaggeredForce.TPSTART(QUDA_PROFILE_H2D);

   if (gauge_param->use_resident_mom) {
     if (!momResident) errorQuda("Cannot use resident momentum field since none appears resident");
     cudaMom = momResident;
   } else {
     // download the initial momentum (FIXME make an option just to return?)
     cudaMom->loadCPUField(cpuMom);
   }

   // resident gauge field is required
   if (!gauge_param->use_resident_gauge || !gaugePrecise)
     errorQuda("Resident gauge field is required");

   if (!gaugePrecise->StaggeredPhaseApplied()) {
     errorQuda("Gauge field requires the staggered phase factors to be applied");
   }

   // check if staggered phase is the desired one
   if (gauge_param->staggered_phase_type != gaugePrecise->StaggeredPhase()) {
     errorQuda("Requested staggered phase %d, but found %d\n",
               gauge_param->staggered_phase_type, gaugePrecise->StaggeredPhase());
   }

   profileStaggeredForce.TPSTOP(QUDA_PROFILE_H2D);
   profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT);

   const int nvector = inv_param->num_offset;
   std::vector<ColorSpinorField*> X(nvector);
   for ( int i=0; i<nvector; i++) X[i] = ColorSpinorField::Create(qParam);

   if (inv_param->use_resident_solution) {
     if (solutionResident.size() < (unsigned int)nvector)
       errorQuda("solutionResident.size() %lu does not match number of shifts %d",
     solutionResident.size(), nvector);
   }

   // create the staggered operator
   DiracParam diracParam;
   bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
     (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE);
   if (!pc_solve)
     errorQuda("Preconditioned solve type required not %d\n", inv_param->solve_type);
   setDiracParam(diracParam, inv_param, pc_solve);
   Dirac *dirac = Dirac::create(diracParam);

   profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT);
   profileStaggeredForce.TPSTART(QUDA_PROFILE_PREAMBLE);

   for (int i=0; i<nvector; i++) {
     ColorSpinorField &x = *(X[i]);

     if (inv_param->use_resident_solution) x.Even() = *(solutionResident[i]);
     else errorQuda("%s requires resident solution", __func__);

     // set the odd solution component
     dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY);
   }

   profileStaggeredForce.TPSTOP(QUDA_PROFILE_PREAMBLE);
   profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE);

 #if 0
   if (inv_param->use_resident_solution) {
     for (auto v : solutionResident) if (v) delete solutionResident[i];
     solutionResident.clear();
   }
 #endif
   delete dirac;

   profileStaggeredForce.TPSTOP(QUDA_PROFILE_FREE);
   profileStaggeredForce.TPSTART(QUDA_PROFILE_COMPUTE);

   // compute quark-field outer product
   for (int i=0; i<nvector; i++) {
     ColorSpinorField &x = *(X[i]);
     // second component is zero since we have no three hop term
     double coeff[2] = {inv_param->residue[i], 0.0};

     // Operate on even-parity sites
     computeStaggeredOprod(cudaForce_, x, coeff, 1);
   }

   // mom += delta * [U * force]TA
   applyU(cudaForce, *gaugePrecise);
   updateMomentum(*cudaMom, dt * delta, cudaForce, "staggered");
   qudaDeviceSynchronize();

   profileStaggeredForce.TPSTOP(QUDA_PROFILE_COMPUTE);
   profileStaggeredForce.TPSTART(QUDA_PROFILE_D2H);

   if (gauge_param->return_result_mom) {
     // copy the momentum field back to the host
     cudaMom->saveCPUField(cpuMom);
   }

   if (gauge_param->make_resident_mom) {
     // make the momentum field resident
     momResident = cudaMom;
   } else {
     delete cudaMom;
   }

   profileStaggeredForce.TPSTOP(QUDA_PROFILE_D2H);
   profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE);

   for (int i=0; i<nvector; i++) delete X[i];

   profileStaggeredForce.TPSTOP(QUDA_PROFILE_FREE);
   profileStaggeredForce.TPSTOP(QUDA_PROFILE_TOTAL);

   checkCudaError();
 }

 void computeHISQForceQuda(void* const milc_momentum,
                           double dt,
                           const double level2_coeff[6],
                           const double fat7_coeff[6],
                           const void* const w_link,
                           const void* const v_link,
                           const void* const u_link,
                           void **fermion,
                           int num_terms,
                           int num_naik_terms,
                           double **coeff,
                           QudaGaugeParam* gParam)
 {
 #ifdef  GPU_STAGGERED_OPROD
   using namespace quda;
   using namespace quda::fermion_force;
   profileHISQForce.TPSTART(QUDA_PROFILE_TOTAL);
   if (gParam->gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported input field order %d", gParam->gauge_order);

   checkGaugeParam(gParam);

   profileHISQForce.TPSTART(QUDA_PROFILE_INIT);

   // create the device outer-product field
   GaugeFieldParam oParam(0, *gParam, QUDA_GENERAL_LINKS);
   oParam.nFace = 0;
   oParam.create = QUDA_ZERO_FIELD_CREATE;
   oParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   cudaGaugeField *stapleOprod = new cudaGaugeField(oParam);
   cudaGaugeField *oneLinkOprod = new cudaGaugeField(oParam);
   cudaGaugeField *naikOprod = new cudaGaugeField(oParam);

   {
     // default settings for the unitarization
     const double unitarize_eps = 1e-14;
     const double hisq_force_filter = 5e-5;
     const double max_det_error = 1e-10;
     const bool   allow_svd = true;
     const bool   svd_only = false;
     const double svd_rel_err = 1e-8;
     const double svd_abs_err = 1e-8;

     setUnitarizeForceConstants(unitarize_eps, hisq_force_filter, max_det_error, allow_svd, svd_only, svd_rel_err, svd_abs_err);
   }

   double act_path_coeff[6] = {0,1,level2_coeff[2],level2_coeff[3],level2_coeff[4],level2_coeff[5]};
   // You have to look at the MILC routine to understand the following
   // Basically, I have already absorbed the one-link coefficient

   GaugeFieldParam param(milc_momentum, *gParam, QUDA_ASQTAD_MOM_LINKS);
   //param.nFace = 0;
   param.order  = QUDA_MILC_GAUGE_ORDER;
   param.reconstruct = QUDA_RECONSTRUCT_10;
   param.ghostExchange =  QUDA_GHOST_EXCHANGE_NO;
   cpuGaugeField* cpuMom = (!gParam->use_resident_mom) ? new cpuGaugeField(param) : nullptr;

   param.link_type = QUDA_GENERAL_LINKS;
   param.reconstruct = QUDA_RECONSTRUCT_NO;
   param.gauge = (void*)w_link;
   cpuGaugeField cpuWLink(param);
   param.gauge = (void*)v_link;
   cpuGaugeField cpuVLink(param);
   param.gauge = (void*)u_link;
   cpuGaugeField cpuULink(param);

   param.create = QUDA_ZERO_FIELD_CREATE;
   param.order  = QUDA_FLOAT2_GAUGE_ORDER;
   param.link_type = QUDA_ASQTAD_MOM_LINKS;
   param.reconstruct = QUDA_RECONSTRUCT_10;
   GaugeFieldParam momParam(param);

   param.create = QUDA_ZERO_FIELD_CREATE;
   param.link_type = QUDA_GENERAL_LINKS;
   param.setPrecision(gParam->cpu_prec, true);

   int R[4] = { 2*comm_dim_partitioned(0), 2*comm_dim_partitioned(1), 2*comm_dim_partitioned(2), 2*comm_dim_partitioned(3) };
   for (int dir=0; dir<4; ++dir) {
     param.x[dir] += 2*R[dir];
     param.r[dir] = R[dir];
   }

   param.reconstruct = QUDA_RECONSTRUCT_NO;
   param.create = QUDA_ZERO_FIELD_CREATE;
   param.setPrecision(gParam->cpu_prec);
   param.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;

   profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);

   { // do outer-product computation
     ColorSpinorParam qParam;
     qParam.nColor = 3;
     qParam.nSpin = 1;
     qParam.siteSubset = QUDA_FULL_SITE_SUBSET;
     qParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
     qParam.nDim = 4;
     qParam.setPrecision(oParam.Precision());
     qParam.pad = 0;
     for (int dir=0; dir<4; ++dir) qParam.x[dir] = oParam.x[dir];

     // create the device quark field
     qParam.create = QUDA_NULL_FIELD_CREATE;
     qParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
     cudaColorSpinorField cudaQuark(qParam);

     // create the host quark field
     qParam.create = QUDA_REFERENCE_FIELD_CREATE;
     qParam.fieldOrder = QUDA_SPACE_COLOR_SPIN_FIELD_ORDER;
     qParam.v = fermion[0];

     { // regular terms
       GaugeField *oprod[2] = {stapleOprod, naikOprod};

       // loop over different quark fields
       for(int i=0; i<num_terms; ++i){

         // Wrap the MILC quark field
         profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
         qParam.v = fermion[i];
         cpuColorSpinorField cpuQuark(qParam); // create host quark field
         profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);

         profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
         cudaQuark = cpuQuark;
         profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);

         profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
         computeStaggeredOprod(oprod, cudaQuark, coeff[i], 3);
         profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
       }
     }

     { // naik terms
       oneLinkOprod->copy(*stapleOprod);
       ax(level2_coeff[0], *oneLinkOprod);
       GaugeField *oprod[2] = {oneLinkOprod, naikOprod};

       // loop over different quark fields
       for(int i=0; i<num_naik_terms; ++i){

         // Wrap the MILC quark field
         profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
         qParam.v = fermion[i + num_terms - num_naik_terms];
         cpuColorSpinorField cpuQuark(qParam); // create host quark field
         profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);

         profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
         cudaQuark = cpuQuark;
         profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);

         profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
         computeStaggeredOprod(oprod, cudaQuark, coeff[i + num_terms], 3);
         profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
       }
     }
   }

   profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
   cudaGaugeField* cudaInForce = new cudaGaugeField(param);
   copyExtendedGauge(*cudaInForce, *stapleOprod, QUDA_CUDA_FIELD_LOCATION);
   delete stapleOprod;

   cudaGaugeField* cudaOutForce = new cudaGaugeField(param);
   copyExtendedGauge(*cudaOutForce, *oneLinkOprod, QUDA_CUDA_FIELD_LOCATION);
   delete oneLinkOprod;

   cudaGaugeField* cudaGauge = new cudaGaugeField(param);
   profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);

   cudaGauge->loadCPUField(cpuWLink, profileHISQForce);

   cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true);
   cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true);
   cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true);

   profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
   hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaGauge, act_path_coeff);
   profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);

   // Load naik outer product
   copyExtendedGauge(*cudaInForce, *naikOprod, QUDA_CUDA_FIELD_LOCATION);
   cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true);
   delete naikOprod;

   // Compute Naik three-link term
   profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
   hisqLongLinkForce(*cudaOutForce, *cudaInForce, *cudaGauge, act_path_coeff[1]);
   profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);

   cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true);

   // load v-link
   cudaGauge->loadCPUField(cpuVLink, profileHISQForce);
   cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true);

   profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
   *num_failures_h = 0;
   unitarizeForce(*cudaInForce, *cudaOutForce, *cudaGauge, num_failures_d);
   profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);

   if (*num_failures_h>0) errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h);

   cudaMemset((void**)(cudaOutForce->Gauge_p()), 0, cudaOutForce->Bytes());

   // read in u-link
   cudaGauge->loadCPUField(cpuULink, profileHISQForce);
   cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true);

   // Compute Fat7-staple term
   profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
   hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaGauge, fat7_coeff);
   profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);

   delete cudaInForce;
   cudaGaugeField* cudaMom = new cudaGaugeField(momParam);

   profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
   hisqCompleteForce(*cudaOutForce, *cudaGauge);
   profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);

   if (gParam->use_resident_mom) {
     if (!momResident) errorQuda("No resident momentum field to use");
     updateMomentum(*momResident, dt, *cudaOutForce, "hisq");
   } else {
     updateMomentum(*cudaMom, dt, *cudaOutForce, "hisq");
   }

   if (gParam->return_result_mom) {
     // Close the paths, make anti-hermitian, and store in compressed format
     if (gParam->return_result_mom) cudaMom->saveCPUField(*cpuMom, profileHISQForce);
   }

   profileHISQForce.TPSTART(QUDA_PROFILE_FREE);

   if (cpuMom) delete cpuMom;

   if (!gParam->make_resident_mom) {
     delete momResident;
     momResident = nullptr;
   }
   if (cudaMom) delete cudaMom;
   delete cudaOutForce;
   delete cudaGauge;
   profileHISQForce.TPSTOP(QUDA_PROFILE_FREE);

   profileHISQForce.TPSTOP(QUDA_PROFILE_TOTAL);

 #else
   errorQuda("HISQ force has not been built");
 #endif
 }

 void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **h_p,
           double *coeff, double kappa2, double ck,
           int nvector, double multiplicity, void *gauge,
           QudaGaugeParam *gauge_param, QudaInvertParam *inv_param) {

   using namespace quda;
   profileCloverForce.TPSTART(QUDA_PROFILE_TOTAL);
   profileCloverForce.TPSTART(QUDA_PROFILE_INIT);

   checkGaugeParam(gauge_param);
   if (!gaugePrecise) errorQuda("No resident gauge field");

   GaugeFieldParam fParam(h_mom, *gauge_param, QUDA_ASQTAD_MOM_LINKS);
   // create the host momentum field
   fParam.reconstruct = QUDA_RECONSTRUCT_10;
   fParam.order = gauge_param->gauge_order;
   cpuGaugeField cpuMom(fParam);

   // create the device momentum field
   fParam.create = QUDA_ZERO_FIELD_CREATE;
   fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   cudaGaugeField cudaMom(fParam);

   // create the device force field
   fParam.link_type = QUDA_GENERAL_LINKS;
   fParam.create = QUDA_ZERO_FIELD_CREATE;
   fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   fParam.reconstruct = QUDA_RECONSTRUCT_NO;
   cudaGaugeField cudaForce(fParam);

   ColorSpinorParam qParam;
   qParam.location = QUDA_CUDA_FIELD_LOCATION;
   qParam.nColor = 3;
   qParam.nSpin = 4;
   qParam.siteSubset = QUDA_FULL_SITE_SUBSET;
   qParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
   qParam.nDim = 4;
   qParam.setPrecision(fParam.Precision());
   qParam.pad = 0;
   for(int dir=0; dir<4; ++dir) qParam.x[dir] = fParam.x[dir];

   // create the device quark field
   qParam.create = QUDA_NULL_FIELD_CREATE;
   qParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
   qParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;

   std::vector<ColorSpinorField*> quarkX, quarkP;
   for (int i=0; i<nvector; i++) {
     quarkX.push_back(ColorSpinorField::Create(qParam));
     quarkP.push_back(ColorSpinorField::Create(qParam));
   }

   qParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
   qParam.x[0] /= 2;
   cudaColorSpinorField tmp(qParam);

   // create the host quark field
   qParam.create = QUDA_REFERENCE_FIELD_CREATE;
   qParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
   qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // need expose this to interface

   bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
     (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE);
   DiracParam diracParam;
   setDiracParam(diracParam, inv_param, pc_solve);
   diracParam.tmp1 = &tmp; // use as temporary for dirac->M
   Dirac *dirac = Dirac::create(diracParam);

   if (inv_param->use_resident_solution) {
     if (solutionResident.size() < (unsigned int)nvector)
       errorQuda("solutionResident.size() %lu does not match number of shifts %d",
     solutionResident.size(), nvector);
   }

   cudaGaugeField &gaugeEx = *extendedGaugeResident;

   // create oprod and trace fields
   fParam.geometry = QUDA_TENSOR_GEOMETRY;
   cudaGaugeField oprod(fParam);

   profileCloverForce.TPSTOP(QUDA_PROFILE_INIT);
   profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);

   std::vector<double> force_coeff(nvector);
   // loop over different quark fields
   for(int i=0; i<nvector; i++){
     ColorSpinorField &x = *(quarkX[i]);
     ColorSpinorField &p = *(quarkP[i]);

     if (!inv_param->use_resident_solution) {
       // for downloading x_e
       qParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
       qParam.x[0] /= 2;

       // Wrap the even-parity MILC quark field
       profileCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE);
       profileCloverForce.TPSTART(QUDA_PROFILE_INIT);
       qParam.v = h_x[i];
       cpuColorSpinorField cpuQuarkX(qParam); // create host quark field
       profileCloverForce.TPSTOP(QUDA_PROFILE_INIT);

       profileCloverForce.TPSTART(QUDA_PROFILE_H2D);
       x.Even() = cpuQuarkX;
       profileCloverForce.TPSTOP(QUDA_PROFILE_H2D);

       profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);
       gamma5(x.Even(), x.Even());
     } else {
       x.Even() = *(solutionResident[i]);
     }

     dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY);
     dirac->M(p.Even(), x.Even());
     dirac->Dagger(QUDA_DAG_YES);
     dirac->Dslash(p.Odd(), p.Even(), QUDA_ODD_PARITY);
     dirac->Dagger(QUDA_DAG_NO);

     gamma5(x, x);
     gamma5(p, p);

     force_coeff[i] = 2.0*dt*coeff[i]*kappa2;
   }

   computeCloverForce(cudaForce, *gaugePrecise, quarkX, quarkP, force_coeff);

   // In double precision the clover derivative is faster with no reconstruct
   cudaGaugeField *u = &gaugeEx;
   if (gaugeEx.Reconstruct() == QUDA_RECONSTRUCT_12 && gaugeEx.Precision() == QUDA_DOUBLE_PRECISION) {
     GaugeFieldParam param(gaugeEx);
     param.reconstruct = QUDA_RECONSTRUCT_NO;
     u = new cudaGaugeField(param);
     u -> copy(gaugeEx);
   }

   computeCloverSigmaTrace(oprod, *cloverPrecise, 2.0*ck*multiplicity*dt);

   /* Now the U dA/dU terms */
   std::vector< std::vector<double> > ferm_epsilon(nvector);
   for (int shift = 0; shift < nvector; shift++) {
     ferm_epsilon[shift].reserve(2);
     ferm_epsilon[shift][0] = 2.0*ck*coeff[shift]*dt;
     ferm_epsilon[shift][1] = -kappa2 * 2.0*ck*coeff[shift]*dt;
   }

   computeCloverSigmaOprod(oprod, quarkX, quarkP, ferm_epsilon);

   cudaGaugeField *oprodEx = createExtendedGauge(oprod, R, profileCloverForce);

   profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);

   cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_ODD_PARITY);
   cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_EVEN_PARITY);

   if (u != &gaugeEx) delete u;

   updateMomentum(cudaMom, -1.0, cudaForce, "clover");
   profileCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE);

   // copy the outer product field back to the host
   profileCloverForce.TPSTART(QUDA_PROFILE_D2H);
   cudaMom.saveCPUField(cpuMom);
   profileCloverForce.TPSTOP(QUDA_PROFILE_D2H);

   profileCloverForce.TPSTART(QUDA_PROFILE_FREE);

   for (int i=0; i<nvector; i++) {
     delete quarkX[i];
     delete quarkP[i];
   }

 #if 0
   if (inv_param->use_resident_solution) {
     for (auto v : solutionResident) if (v) delete v;
     solutionResident.clear();
   }
 #endif
   delete dirac;
   profileCloverForce.TPSTOP(QUDA_PROFILE_FREE);

   checkCudaError();
   profileCloverForce.TPSTOP(QUDA_PROFILE_TOTAL);
 }


 void updateGaugeFieldQuda(void* gauge,
         void* momentum,
         double dt,
         int conj_mom,
         int exact,
         QudaGaugeParam* param)
 {
   profileGaugeUpdate.TPSTART(QUDA_PROFILE_TOTAL);

   checkGaugeParam(param);

   profileGaugeUpdate.TPSTART(QUDA_PROFILE_INIT);

   // create the host fields
   GaugeFieldParam gParam(gauge, *param, QUDA_SU3_LINKS);
   gParam.site_offset = param->gauge_offset;
   gParam.site_size = param->site_size;
   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
   cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;

   GaugeFieldParam gParamMom(momentum, *param);
   gParamMom.reconstruct = (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ?
    QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10;
   gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
   gParamMom.site_offset = param->mom_offset;
   gParamMom.site_size = param->site_size;
   cpuGaugeField *cpuMom = !param->use_resident_mom ? new cpuGaugeField(gParamMom) : nullptr;

   // create the device fields
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   gParam.pad = 0;
   cudaGaugeField *cudaMom = !param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr;

   gParam.link_type = QUDA_SU3_LINKS;
   gParam.reconstruct = param->reconstruct;
   cudaGaugeField *cudaInGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr;
   auto *cudaOutGauge = new cudaGaugeField(gParam);

   profileGaugeUpdate.TPSTOP(QUDA_PROFILE_INIT);

   profileGaugeUpdate.TPSTART(QUDA_PROFILE_H2D);

   if (!param->use_resident_gauge) {   // load fields onto the device
     cudaInGauge->loadCPUField(*cpuGauge);
   } else { // or use resident fields already present
     if (!gaugePrecise) errorQuda("No resident gauge field allocated");
     cudaInGauge = gaugePrecise;
     gaugePrecise = nullptr;
   }

   if (!param->use_resident_mom) {
     cudaMom->loadCPUField(*cpuMom);
   } else {
     if (!momResident) errorQuda("No resident mom field allocated");
     cudaMom = momResident;
     momResident = nullptr;
   }

   profileGaugeUpdate.TPSTOP(QUDA_PROFILE_H2D);

   // perform the update
   profileGaugeUpdate.TPSTART(QUDA_PROFILE_COMPUTE);
   updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom,
       (bool)conj_mom, (bool)exact);
   profileGaugeUpdate.TPSTOP(QUDA_PROFILE_COMPUTE);

   if (param->return_result_gauge) {
     // copy the gauge field back to the host
     profileGaugeUpdate.TPSTART(QUDA_PROFILE_D2H);
     cudaOutGauge->saveCPUField(*cpuGauge);
     profileGaugeUpdate.TPSTOP(QUDA_PROFILE_D2H);
   }

   profileGaugeUpdate.TPSTART(QUDA_PROFILE_FREE);
   if (param->make_resident_gauge) {
     if (gaugePrecise != nullptr) delete gaugePrecise;
     gaugePrecise = cudaOutGauge;
   } else {
     delete cudaOutGauge;
   }

   if (param->make_resident_mom) {
     if (momResident != nullptr && momResident != cudaMom) delete momResident;
     momResident = cudaMom;
   } else {
     delete cudaMom;
   }

   delete cudaInGauge;
   if (cpuMom) delete cpuMom;
   if (cpuGauge) delete cpuGauge;
   profileGaugeUpdate.TPSTOP(QUDA_PROFILE_FREE);

   checkCudaError();

   profileGaugeUpdate.TPSTOP(QUDA_PROFILE_TOTAL);
 }

  void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) {
    profileProject.TPSTART(QUDA_PROFILE_TOTAL);

    profileProject.TPSTART(QUDA_PROFILE_INIT);
    checkGaugeParam(param);

    // create the gauge field
    GaugeFieldParam gParam(gauge_h, *param, QUDA_GENERAL_LINKS);
    gParam.site_offset = param->gauge_offset;
    gParam.site_size = param->site_size;
    bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
    cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;

    // create the device fields
    gParam.create = QUDA_NULL_FIELD_CREATE;
    gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
    gParam.reconstruct = param->reconstruct;
    cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr;
    profileProject.TPSTOP(QUDA_PROFILE_INIT);

    if (param->use_resident_gauge) {
      if (!gaugePrecise) errorQuda("No resident gauge field to use");
      cudaGauge = gaugePrecise;
    } else {
      profileProject.TPSTART(QUDA_PROFILE_H2D);
      cudaGauge->loadCPUField(*cpuGauge);
      profileProject.TPSTOP(QUDA_PROFILE_H2D);
    }

    profileProject.TPSTART(QUDA_PROFILE_COMPUTE);
    *num_failures_h = 0;

    // project onto SU(3)
    projectSU3(*cudaGauge, tol, num_failures_d);

    profileProject.TPSTOP(QUDA_PROFILE_COMPUTE);

    if(*num_failures_h>0)
      errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);

    profileProject.TPSTART(QUDA_PROFILE_D2H);
    if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge);
    profileProject.TPSTOP(QUDA_PROFILE_D2H);

    if (param->make_resident_gauge) {
      if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise;
      gaugePrecise = cudaGauge;
    } else {
      delete cudaGauge;
    }

    profileProject.TPSTART(QUDA_PROFILE_FREE);
    if (cpuGauge) delete cpuGauge;
    profileProject.TPSTOP(QUDA_PROFILE_FREE);

    profileProject.TPSTOP(QUDA_PROFILE_TOTAL);
  }

  void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) {
    profilePhase.TPSTART(QUDA_PROFILE_TOTAL);

    profilePhase.TPSTART(QUDA_PROFILE_INIT);
    checkGaugeParam(param);

    // create the gauge field
    GaugeFieldParam gParam(gauge_h, *param, QUDA_GENERAL_LINKS);
    bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
    cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;

    // create the device fields
    gParam.create = QUDA_NULL_FIELD_CREATE;
    gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
    gParam.reconstruct = param->reconstruct;
    cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr;
    profilePhase.TPSTOP(QUDA_PROFILE_INIT);

    if (param->use_resident_gauge) {
      if (!gaugePrecise) errorQuda("No resident gauge field to use");
      cudaGauge = gaugePrecise;
    } else {
      profilePhase.TPSTART(QUDA_PROFILE_H2D);
      cudaGauge->loadCPUField(*cpuGauge);
      profilePhase.TPSTOP(QUDA_PROFILE_H2D);
    }

    profilePhase.TPSTART(QUDA_PROFILE_COMPUTE);
    *num_failures_h = 0;

    // apply / remove phase as appropriate
    if (!cudaGauge->StaggeredPhaseApplied()) cudaGauge->applyStaggeredPhase();
    else cudaGauge->removeStaggeredPhase();

    profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE);

    profilePhase.TPSTART(QUDA_PROFILE_D2H);
    if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge);
    profilePhase.TPSTOP(QUDA_PROFILE_D2H);

    if (param->make_resident_gauge) {
      if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise;
      gaugePrecise = cudaGauge;
    } else {
      delete cudaGauge;
    }

    profilePhase.TPSTART(QUDA_PROFILE_FREE);
    if (cpuGauge) delete cpuGauge;
    profilePhase.TPSTOP(QUDA_PROFILE_FREE);

    profilePhase.TPSTOP(QUDA_PROFILE_TOTAL);
  }

 // evaluate the momentum action
 double momActionQuda(void* momentum, QudaGaugeParam* param)
 {
   profileMomAction.TPSTART(QUDA_PROFILE_TOTAL);

   profileMomAction.TPSTART(QUDA_PROFILE_INIT);
   checkGaugeParam(param);

   // create the momentum fields
   GaugeFieldParam gParam(momentum, *param, QUDA_ASQTAD_MOM_LINKS);
   gParam.reconstruct = (gParam.order == QUDA_TIFR_GAUGE_ORDER || gParam.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ?
     QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10;

   cpuGaugeField *cpuMom = !param->use_resident_mom ? new cpuGaugeField(gParam) : nullptr;

   // create the device fields
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   gParam.reconstruct = QUDA_RECONSTRUCT_10;

   cudaGaugeField *cudaMom = !param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr;

   profileMomAction.TPSTOP(QUDA_PROFILE_INIT);

   profileMomAction.TPSTART(QUDA_PROFILE_H2D);
   if (!param->use_resident_mom) {
     cudaMom->loadCPUField(*cpuMom);
   } else {
     if (!momResident) errorQuda("No resident mom field allocated");
     cudaMom = momResident;
   }
   profileMomAction.TPSTOP(QUDA_PROFILE_H2D);

   // perform the update
   profileMomAction.TPSTART(QUDA_PROFILE_COMPUTE);
   double action = computeMomAction(*cudaMom);
   profileMomAction.TPSTOP(QUDA_PROFILE_COMPUTE);

   profileMomAction.TPSTART(QUDA_PROFILE_FREE);
   if (param->make_resident_mom) {
     if (momResident != nullptr && momResident != cudaMom) delete momResident;
     momResident = cudaMom;
   } else {
     delete cudaMom;
     momResident = nullptr;
   }
   if (cpuMom) {
     delete cpuMom;
   }

   profileMomAction.TPSTOP(QUDA_PROFILE_FREE);

   checkCudaError();

   profileMomAction.TPSTOP(QUDA_PROFILE_TOTAL);
   return action;
 }

 /*
   The following functions are for the Fortran interface.
 */

 void init_quda_(int *dev) { initQuda(*dev); }
 void init_quda_device_(int *dev) { initQudaDevice(*dev); }
 void init_quda_memory_() { initQudaMemory(); }
 void end_quda_() { endQuda(); }
 void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param) { loadGaugeQuda(h_gauge, param); }
 void free_gauge_quda_() { freeGaugeQuda(); }
 void free_sloppy_gauge_quda_() { freeSloppyGaugeQuda(); }
 void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
 { loadCloverQuda(h_clover, h_clovinv, inv_param); }
 void free_clover_quda_(void) { freeCloverQuda(); }
 void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
     QudaParity *parity) { dslashQuda(h_out, h_in, inv_param, *parity); }
 void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
     QudaParity *parity, int *inverse) { cloverQuda(h_out, h_in, inv_param, *parity, *inverse); }
 void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
 { MatQuda(h_out, h_in, inv_param); }
 void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
 { MatDagMatQuda(h_out, h_in, inv_param); }
 void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param) {
   fflush(stdout);
   // ensure that fifth dimension is set to 1
   if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) param->Ls = 1;
   invertQuda(hp_x, hp_b, param);
   fflush(stdout);
 }

 void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param) {
   // ensure that fifth dimension is set to 1
   if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) param->Ls = 1;

   if (!gaugePrecise) errorQuda("Resident gauge field not allocated");

   // get data into array of pointers
   int nSpin = (param->dslash_type == QUDA_STAGGERED_DSLASH || param->dslash_type == QUDA_ASQTAD_DSLASH) ? 1 : 4;

   // compute offset assuming TIFR padded ordering (FIXME)
   if (param->dirac_order != QUDA_TIFR_PADDED_DIRAC_ORDER)
     errorQuda("Fortran multi-shift solver presently only supports QUDA_TIFR_PADDED_DIRAC_ORDER");

   const int *X = gaugePrecise->X();
   size_t cb_offset = (X[0]/2) * X[1] * (X[2] + 4) * X[3] * gaugePrecise->Ncolor() * nSpin * 2 * param->cpu_prec;
   void *hp_x[QUDA_MAX_MULTI_SHIFT];
   for (int i=0; i<param->num_offset; i++) hp_x[i] = static_cast<char*>(h_x) + i*cb_offset;

   invertMultiShiftQuda(hp_x, hp_b, param);
 }

 void flush_chrono_quda_(int *index) { flushChronoQuda(*index); }

 void register_pinned_quda_(void *ptr, size_t *bytes) {
   cudaHostRegister(ptr, *bytes, cudaHostRegisterDefault);
   checkCudaError();
 }

 void unregister_pinned_quda_(void *ptr) {
   cudaHostUnregister(ptr);
   checkCudaError();
 }

 void new_quda_gauge_param_(QudaGaugeParam *param) {
   *param = newQudaGaugeParam();
 }
 void new_quda_invert_param_(QudaInvertParam *param) {
   *param = newQudaInvertParam();
 }

 void update_gauge_field_quda_(void *gauge, void *momentum, double *dt,
     bool *conj_mom, bool *exact,
     QudaGaugeParam *param) {
   updateGaugeFieldQuda(gauge, momentum, *dt, (int)*conj_mom, (int)*exact, param);
 }

 static inline int opp(int dir) { return 7-dir; }

 static void createGaugeForcePaths(int **paths, int dir, int num_loop_types){

   int index=0;
   // Plaquette paths
   if (num_loop_types >= 1)
     for(int i=0; i<4; ++i){
       if(i==dir) continue;
       paths[index][0] = i;        paths[index][1] = opp(dir);   paths[index++][2] = opp(i);
       paths[index][0] = opp(i);   paths[index][1] = opp(dir);   paths[index++][2] = i;
     }

   // Rectangle Paths
   if (num_loop_types >= 2)
     for(int i=0; i<4; ++i){
       if(i==dir) continue;
       paths[index][0] = paths[index][1] = i;       paths[index][2] = opp(dir); paths[index][3] = paths[index][4] = opp(i);
       index++;
       paths[index][0] = paths[index][1] = opp(i);  paths[index][2] = opp(dir); paths[index][3] = paths[index][4] = i;
       index++;
       paths[index][0] = dir; paths[index][1] = i; paths[index][2] = paths[index][3] = opp(dir); paths[index][4] = opp(i);
       index++;
       paths[index][0] = dir; paths[index][1] = opp(i); paths[index][2] = paths[index][3] = opp(dir); paths[index][4] = i;
       index++;
       paths[index][0] = i;  paths[index][1] = paths[index][2] = opp(dir); paths[index][3] = opp(i); paths[index][4] = dir;
       index++;
       paths[index][0] = opp(i);  paths[index][1] = paths[index][2] = opp(dir); paths[index][3] = i; paths[index][4] = dir;
       index++;
     }

   if (num_loop_types >= 3) {
     // Staple paths
     for(int i=0; i<4; ++i){
       for(int j=0; j<4; ++j){
   if(i==dir || j==dir || i==j) continue;
   paths[index][0] = i; paths[index][1] = j; paths[index][2] = opp(dir); paths[index][3] = opp(i), paths[index][4] = opp(j);
   index++;
   paths[index][0] = i; paths[index][1] = opp(j); paths[index][2] = opp(dir); paths[index][3] = opp(i), paths[index][4] = j;
   index++;
   paths[index][0] = opp(i); paths[index][1] = j; paths[index][2] = opp(dir); paths[index][3] = i, paths[index][4] = opp(j);
   index++;
   paths[index][0] = opp(i); paths[index][1] = opp(j); paths[index][2] = opp(dir); paths[index][3] = i, paths[index][4] = j;
   index++;
      }
     }
   }

 }

 void compute_gauge_force_quda_(void *mom, void *gauge, int *num_loop_types, double *coeff, double *dt,
              QudaGaugeParam *param) {

   int numPaths = 0;
   switch (*num_loop_types) {
   case 1:
     numPaths = 6;
     break;
   case 2:
     numPaths = 24;
     break;
   case 3:
     numPaths = 48;
     break;
   default:
     errorQuda("Invalid num_loop_types = %d\n", *num_loop_types);
   }

   auto *loop_coeff = static_cast<double*>(safe_malloc(numPaths*sizeof(double)));
   int *path_length = static_cast<int*>(safe_malloc(numPaths*sizeof(int)));

   if (*num_loop_types >= 1) for(int i= 0; i< 6; ++i) {
       loop_coeff[i] = coeff[0];
       path_length[i] = 3;
     }
   if (*num_loop_types >= 2) for(int i= 6; i<24; ++i) {
       loop_coeff[i] = coeff[1];
       path_length[i] = 5;
     }
   if (*num_loop_types >= 3) for(int i=24; i<48; ++i) {
       loop_coeff[i] = coeff[2];
       path_length[i] = 5;
     }

   int** input_path_buf[4];
   for(int dir=0; dir<4; ++dir){
     input_path_buf[dir] = static_cast<int**>(safe_malloc(numPaths*sizeof(int*)));
     for(int i=0; i<numPaths; ++i){
       input_path_buf[dir][i] = static_cast<int*>(safe_malloc(path_length[i]*sizeof(int)));
     }
     createGaugeForcePaths(input_path_buf[dir], dir, *num_loop_types);
   }

   int max_length = 6;

   computeGaugeForceQuda(mom, gauge, input_path_buf, path_length, loop_coeff, numPaths, max_length, *dt, param);

   for(auto & dir : input_path_buf){
     for(int i=0; i<numPaths; ++i) host_free(dir[i]);
     host_free(dir);
   }

   host_free(path_length);
   host_free(loop_coeff);
 }

 void compute_staggered_force_quda_(void* h_mom, double *dt, double *delta, void *gauge, void *x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param) {
   computeStaggeredForceQuda(h_mom, *dt, *delta, gauge, (void**)x, gauge_param, inv_param);
 }

 // apply the staggered phases
 void apply_staggered_phase_quda_() {
   if (getVerbosity() >= QUDA_VERBOSE) printfQuda("applying staggered phase\n");
   if (gaugePrecise) {
     gaugePrecise->applyStaggeredPhase();
   } else {
     errorQuda("No persistent gauge field");
   }
 }

 // remove the staggered phases
 void remove_staggered_phase_quda_() {
   if (getVerbosity() >= QUDA_VERBOSE) printfQuda("removing staggered phase\n");
   if (gaugePrecise) {
     gaugePrecise->removeStaggeredPhase();
   } else {
     errorQuda("No persistent gauge field");
   }
   qudaDeviceSynchronize();
 }

 // evaluate the kinetic term
 void kinetic_quda_(double *kin, void* momentum, QudaGaugeParam* param) {
   *kin = momActionQuda(momentum, param);
 }


 #ifdef MULTI_GPU
 static int bqcd_rank_from_coords(const int *coords, void *fdata)
 {
   int *dims = static_cast<int *>(fdata);

   int rank = coords[3];
   for (int i = 2; i >= 0; i--) {
     rank = dims[i] * rank + coords[i];
   }
   return rank;
 }
 #endif

 void comm_set_gridsize_(int *grid)
 {
 #ifdef MULTI_GPU
   initCommsGridQuda(4, grid, bqcd_rank_from_coords, static_cast<void *>(grid));
 #endif
 }

 void set_kernel_pack_t_(int* pack)
 {
   bool pack_ = *pack ? true : false;
   setKernelPackT(pack_);
 }

 void gaussGaugeQuda(unsigned long long seed, double sigma)
 {
   profileGauss.TPSTART(QUDA_PROFILE_TOTAL);

   if (!gaugePrecise) errorQuda("Cannot generate Gauss GaugeField as there is no resident gauge field");

   cudaGaugeField *data = gaugePrecise;

   GaugeFieldParam param(*data);
   param.reconstruct = QUDA_RECONSTRUCT_12;
   param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   cudaGaugeField u(param);

   profileGauss.TPSTART(QUDA_PROFILE_COMPUTE);
   quda::gaugeGauss(*data, seed, sigma);
   profileGauss.TPSTOP(QUDA_PROFILE_COMPUTE);

   if (extendedGaugeResident) {
     *extendedGaugeResident = *gaugePrecise;
     extendedGaugeResident->exchangeExtendedGhost(R, profileGauss, redundant_comms);
   }

   profileGauss.TPSTOP(QUDA_PROFILE_TOTAL);
 }


 /*
  * Computes the total, spatial and temporal plaquette averages of the loaded gauge configuration.
  */
 void plaq_quda_(double plaq[3]) {
   plaqQuda(plaq);
 }

 void plaqQuda(double plaq[3])
 {
   profilePlaq.TPSTART(QUDA_PROFILE_TOTAL);

   if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field");

   cudaGaugeField *data = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq);
   extendedGaugeResident = data;

   profilePlaq.TPSTART(QUDA_PROFILE_COMPUTE);
   double3 plaq3 = quda::plaquette(*data);
   plaq[0] = plaq3.x;
   plaq[1] = plaq3.y;
   plaq[2] = plaq3.z;
   profilePlaq.TPSTOP(QUDA_PROFILE_COMPUTE);

   profilePlaq.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 /*
  * Performs a deep copy from the internal extendedGaugeResident field.
  */
 void copyExtendedResidentGaugeQuda(void* resident_gauge, QudaFieldLocation loc)
 {
   //profilePlaq.TPSTART(QUDA_PROFILE_TOTAL);

   if (!gaugePrecise) errorQuda("Cannot perform deep copy of resident gauge field as there is no resident gauge field");

   cudaGaugeField *data = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq);
   extendedGaugeResident = data;

   auto* io_gauge = (cudaGaugeField*)resident_gauge;

   copyExtendedGauge(*io_gauge, *extendedGaugeResident, loc);

   //profilePlaq.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int nSteps, double alpha)
 {
   profileWuppertal.TPSTART(QUDA_PROFILE_TOTAL);

   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");

   pushVerbosity(inv_param->verbosity);
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);

   cudaGaugeField *precise = nullptr;

   if (gaugeSmeared != nullptr) {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Wuppertal smearing done with gaugeSmeared\n");
     GaugeFieldParam gParam(*gaugePrecise);
     gParam.create = QUDA_NULL_FIELD_CREATE;
     precise = new cudaGaugeField(gParam);
     copyExtendedGauge(*precise, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION);
     precise->exchangeGhost();
   } else {
     if (getVerbosity() >= QUDA_VERBOSE)
       printfQuda("Wuppertal smearing done with gaugePrecise\n");
     precise = gaugePrecise;
   }

   ColorSpinorParam cpuParam(h_in, *inv_param, precise->X(), false, inv_param->input_location);
   ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);

   ColorSpinorParam cudaParam(cpuParam, *inv_param);
   cudaColorSpinorField in(*in_h, cudaParam);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
     double cpu = blas::norm2(*in_h);
     double gpu = blas::norm2(in);
     printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
   }

   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   cudaColorSpinorField out(in, cudaParam);
   int parity = 0;

   for (unsigned int i=0; i<nSteps; i++) {
     if(i) in = out;
     wuppertalStep(out, in, parity, *precise, alpha);
     if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
       double norm = blas::norm2(out);
       printfQuda("Step %d, vector norm %e\n", i, norm);
     }
   }

   cpuParam.v = h_out;
   cpuParam.location = inv_param->output_location;
   ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
   *out_h = out;

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
     double cpu = blas::norm2(*out_h);
     double gpu = blas::norm2(out);
     printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
   }

   if (gaugeSmeared != nullptr)
     delete precise;

   delete out_h;
   delete in_h;

   popVerbosity();

   profileWuppertal.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 void performAPEnStep(unsigned int nSteps, double alpha)
 {
   profileAPE.TPSTART(QUDA_PROFILE_TOTAL);

   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");

   if (gaugeSmeared != nullptr) delete gaugeSmeared;
   gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileAPE);

   GaugeFieldParam gParam(*gaugeSmeared);
   auto *cudaGaugeTemp = new cudaGaugeField(gParam);

   double3 plaq = plaquette(*gaugeSmeared);
   if (getVerbosity() >= QUDA_SUMMARIZE) {
     printfQuda("Plaquette after 0 APE steps: %le %le %le\n", plaq.x, plaq.y, plaq.z);
   }

   for (unsigned int i=0; i<nSteps; i++) {
     cudaGaugeTemp->copy(*gaugeSmeared);
     cudaGaugeTemp->exchangeExtendedGhost(R,profileAPE,redundant_comms);
     APEStep(*gaugeSmeared, *cudaGaugeTemp, alpha);
   }

   delete cudaGaugeTemp;

   gaugeSmeared->exchangeExtendedGhost(R,profileAPE,redundant_comms);

   plaq = plaquette(*gaugeSmeared);
   if (getVerbosity() >= QUDA_SUMMARIZE) {
     printfQuda("Plaquette after %d APE steps: %le %le %le\n", nSteps, plaq.x, plaq.y, plaq.z);
   }

   profileAPE.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 void performSTOUTnStep(unsigned int nSteps, double rho)
 {
   profileSTOUT.TPSTART(QUDA_PROFILE_TOTAL);

   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");

   if (gaugeSmeared != nullptr) delete gaugeSmeared;
   gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileSTOUT);

   GaugeFieldParam gParam(*gaugeSmeared);
   auto *cudaGaugeTemp = new cudaGaugeField(gParam);

   double3 plaq = plaquette(*gaugeSmeared);
   if (getVerbosity() >= QUDA_SUMMARIZE) {
     printfQuda("Plaquette after 0 STOUT steps: %le %le %le\n", plaq.x, plaq.y, plaq.z);
   }

   for (unsigned int i=0; i<nSteps; i++) {
     cudaGaugeTemp->copy(*gaugeSmeared);
     cudaGaugeTemp->exchangeExtendedGhost(R,profileSTOUT,redundant_comms);
     STOUTStep(*gaugeSmeared, *cudaGaugeTemp, rho);
   }

   delete cudaGaugeTemp;

   gaugeSmeared->exchangeExtendedGhost(R,redundant_comms);

   plaq = plaquette(*gaugeSmeared);
   if (getVerbosity() >= QUDA_SUMMARIZE) {
     printfQuda("Plaquette after %d STOUT steps: %le %le %le\n", nSteps, plaq.x, plaq.y, plaq.z);
   }

   profileSTOUT.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 void performOvrImpSTOUTnStep(unsigned int nSteps, double rho, double epsilon)
 {
   profileOvrImpSTOUT.TPSTART(QUDA_PROFILE_TOTAL);

   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");

   if (gaugeSmeared != nullptr) delete gaugeSmeared;
   gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileSTOUT);

   GaugeFieldParam gParam(*gaugeSmeared);
   auto *cudaGaugeTemp = new cudaGaugeField(gParam);

   double3 plaq = plaquette(*gaugeSmeared);
   if (getVerbosity() >= QUDA_SUMMARIZE) {
     printfQuda("Plaquette after 0 OvrImpSTOUT steps: %le %le %le\n", plaq.x, plaq.y, plaq.z);
   }

   for (unsigned int i=0; i<nSteps; i++) {
     cudaGaugeTemp->copy(*gaugeSmeared);
     cudaGaugeTemp->exchangeExtendedGhost(R,profileOvrImpSTOUT,redundant_comms);
     OvrImpSTOUTStep(*gaugeSmeared, *cudaGaugeTemp, rho, epsilon);
   }

   delete cudaGaugeTemp;

   gaugeSmeared->exchangeExtendedGhost(R,profileOvrImpSTOUT,redundant_comms);

   plaq = plaquette(*gaugeSmeared);
   if (getVerbosity() >= QUDA_SUMMARIZE) {
     printfQuda("Plaquette after %d OvrImpSTOUT steps: %le %le %le\n", nSteps, plaq.x, plaq.y, plaq.z);
   }

   profileOvrImpSTOUT.TPSTOP(QUDA_PROFILE_TOTAL);
 }


 int computeGaugeFixingOVRQuda(void* gauge, const unsigned int gauge_dir,  const unsigned int Nsteps, \
   const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, \
   const unsigned int  stopWtheta, QudaGaugeParam* param , double* timeinfo)
 {

   GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_TOTAL);

   checkGaugeParam(param);

   GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_INIT);
   GaugeFieldParam gParam(gauge, *param);
   auto *cpuGauge = new cpuGaugeField(gParam);

   //gParam.pad = getFatLinkPadding(param->X);
   gParam.create      = QUDA_NULL_FIELD_CREATE;
   gParam.link_type   = param->type;
   gParam.reconstruct = param->reconstruct;
   gParam.order       = (gParam.Precision() == QUDA_DOUBLE_PRECISION || gParam.reconstruct == QUDA_RECONSTRUCT_NO ) ?
     QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;
   auto *cudaInGauge = new cudaGaugeField(gParam);

   GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_INIT);

   GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_H2D);


   cudaInGauge->loadCPUField(*cpuGauge);
  /* } else { // or use resident fields already present
     if (!gaugePrecise) errorQuda("No resident gauge field allocated");
     cudaInGauge = gaugePrecise;
     gaugePrecise = nullptr;
   } */

   GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_H2D);

   checkCudaError();

   if (comm_size() == 1) {
     // perform the update
     GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_COMPUTE);
     gaugefixingOVR(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, \
       reunit_interval, stopWtheta);
     GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_COMPUTE);
   } else {
     cudaGaugeField *cudaInGaugeEx = createExtendedGauge(*cudaInGauge, R, GaugeFixOVRQuda);

     // perform the update
     GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_COMPUTE);
     gaugefixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, \
       reunit_interval, stopWtheta);
     GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_COMPUTE);

     //HOW TO COPY BACK TO CPU: cudaInGaugeEx->cpuGauge
     copyExtendedGauge(*cudaInGauge, *cudaInGaugeEx, QUDA_CUDA_FIELD_LOCATION);
   }

   checkCudaError();
   // copy the gauge field back to the host
   GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_D2H);
   cudaInGauge->saveCPUField(*cpuGauge);
   GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_D2H);

   GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_TOTAL);

   if (param->make_resident_gauge) {
     if (gaugePrecise != nullptr) delete gaugePrecise;
     gaugePrecise = cudaInGauge;
   } else {
     delete cudaInGauge;
   }

   if(timeinfo){
     timeinfo[0] = GaugeFixOVRQuda.Last(QUDA_PROFILE_H2D);
     timeinfo[1] = GaugeFixOVRQuda.Last(QUDA_PROFILE_COMPUTE);
     timeinfo[2] = GaugeFixOVRQuda.Last(QUDA_PROFILE_D2H);
   }

   checkCudaError();
   return 0;
 }

 int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir,  const unsigned int Nsteps, \
   const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, \
   const unsigned int  stopWtheta, QudaGaugeParam* param , double* timeinfo)
 {

   GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_TOTAL);

   checkGaugeParam(param);

   GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_INIT);

   GaugeFieldParam gParam(gauge, *param);
   auto *cpuGauge = new cpuGaugeField(gParam);

   //gParam.pad = getFatLinkPadding(param->X);
   gParam.create      = QUDA_NULL_FIELD_CREATE;
   gParam.link_type   = param->type;
   gParam.reconstruct = param->reconstruct;
   gParam.order       = (gParam.Precision() == QUDA_DOUBLE_PRECISION || gParam.reconstruct == QUDA_RECONSTRUCT_NO ) ?
     QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;

   auto *cudaInGauge = new cudaGaugeField(gParam);


   GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_INIT);

   GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_H2D);

   //if (!param->use_resident_gauge) {   // load fields onto the device
   cudaInGauge->loadCPUField(*cpuGauge);
   /*} else { // or use resident fields already present
     if (!gaugePrecise) errorQuda("No resident gauge field allocated");
     cudaInGauge = gaugePrecise;
     gaugePrecise = nullptr;
   } */


   GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_H2D);

   // perform the update
   GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_COMPUTE);
   checkCudaError();

   gaugefixingFFT(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);

   GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_COMPUTE);

   checkCudaError();
   // copy the gauge field back to the host
   GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_D2H);
   checkCudaError();
   cudaInGauge->saveCPUField(*cpuGauge);
   GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_D2H);
   checkCudaError();

   GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_TOTAL);

   if (param->make_resident_gauge) {
     if (gaugePrecise != nullptr) delete gaugePrecise;
     gaugePrecise = cudaInGauge;
   } else {
     delete cudaInGauge;
   }

   if(timeinfo){
     timeinfo[0] = GaugeFixFFTQuda.Last(QUDA_PROFILE_H2D);
     timeinfo[1] = GaugeFixFFTQuda.Last(QUDA_PROFILE_COMPUTE);
     timeinfo[2] = GaugeFixFFTQuda.Last(QUDA_PROFILE_D2H);
   }

   checkCudaError();
   return 0;
 }

 void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const QudaContractType cType,
                   QudaInvertParam *param, const int *X)
 {
   // DMH: Easiest way to construct ColorSpinorField? Do we require the user
   //     to declare and fill and invert_param, or can it just be hacked?.

   profileContract.TPSTART(QUDA_PROFILE_TOTAL);
   profileContract.TPSTART(QUDA_PROFILE_INIT);
   // wrap CPU host side pointers
   ColorSpinorParam cpuParam((void *)hp_x, *param, X, false, param->input_location);
   ColorSpinorField *h_x = ColorSpinorField::Create(cpuParam);

   cpuParam.v = (void *)hp_y;
   ColorSpinorField *h_y = ColorSpinorField::Create(cpuParam);

   // Create device parameter
   ColorSpinorParam cudaParam(cpuParam);
   cudaParam.location = QUDA_CUDA_FIELD_LOCATION;
   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   // Quda uses Degrand-Rossi gamma basis for contractions and will
   // automatically reorder data if necessary.
   cudaParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
   cudaParam.setPrecision(cpuParam.Precision(), cpuParam.Precision(), true);

   std::vector<ColorSpinorField *> x, y;
   x.push_back(ColorSpinorField::Create(cudaParam));
   y.push_back(ColorSpinorField::Create(cudaParam));

   size_t data_bytes = x[0]->Volume() * x[0]->Nspin() * x[0]->Nspin() * 2 * x[0]->Precision();
   void *d_result = pool_device_malloc(data_bytes);
   profileContract.TPSTOP(QUDA_PROFILE_INIT);

   profileContract.TPSTART(QUDA_PROFILE_H2D);
   *x[0] = *h_x;
   *y[0] = *h_y;
   profileContract.TPSTOP(QUDA_PROFILE_H2D);

   profileContract.TPSTART(QUDA_PROFILE_COMPUTE);
   contractQuda(*x[0], *y[0], d_result, cType);
   profileContract.TPSTOP(QUDA_PROFILE_COMPUTE);

   profileContract.TPSTART(QUDA_PROFILE_D2H);
   qudaMemcpy(h_result, d_result, data_bytes, cudaMemcpyDeviceToHost);
   profileContract.TPSTOP(QUDA_PROFILE_D2H);

   profileContract.TPSTART(QUDA_PROFILE_FREE);
   pool_device_free(d_result);
   delete x[0];
   delete y[0];
   delete h_y;
   delete h_x;
   profileContract.TPSTOP(QUDA_PROFILE_FREE);

   profileContract.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 double qChargeQuda()
 {
   profileQCharge.TPSTART(QUDA_PROFILE_TOTAL);

   cudaGaugeField *gauge = nullptr;
   if (!gaugeSmeared) {
     if (!extendedGaugeResident) extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileQCharge);
     gauge = extendedGaugeResident;
   } else {
     gauge = gaugeSmeared;
   }
   // Do we keep the smeared extended field on memory, or the unsmeared one?

   profileQCharge.TPSTART(QUDA_PROFILE_INIT);
   // create the Fmunu field

   GaugeFieldParam tensorParam(gaugePrecise->X(), gauge->Precision(), QUDA_RECONSTRUCT_NO, 0, QUDA_TENSOR_GEOMETRY);
   tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
   tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   cudaGaugeField Fmunu(tensorParam);

   profileQCharge.TPSTOP(QUDA_PROFILE_INIT);
   profileQCharge.TPSTART(QUDA_PROFILE_COMPUTE);

   computeFmunu(Fmunu, *gauge);
   double charge = quda::computeQCharge(Fmunu);

   profileQCharge.TPSTOP(QUDA_PROFILE_COMPUTE);
   profileQCharge.TPSTOP(QUDA_PROFILE_TOTAL);

   return charge;
 }

 double qChargeDensityQuda(void *h_qDensity)
 {
   profileQCharge.TPSTART(QUDA_PROFILE_TOTAL);

   cudaGaugeField *gauge = nullptr;
   if (!gaugeSmeared) {
     if (!extendedGaugeResident) extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileQCharge);
     gauge = extendedGaugeResident;
   } else {
     gauge = gaugeSmeared;
   }
   // Do we keep the smeared extended field on memory, or the unsmeared one?
   profileQCharge.TPSTART(QUDA_PROFILE_INIT);
   // create the Fmunu field
   GaugeFieldParam tensorParam(gaugePrecise->X(), gauge->Precision(), QUDA_RECONSTRUCT_NO, 0, QUDA_TENSOR_GEOMETRY);
   tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
   tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   cudaGaugeField Fmunu(tensorParam);

   size_t size = Fmunu.Volume() * Fmunu.Precision();
   void *d_qDensity = device_malloc(size);
   profileQCharge.TPSTOP(QUDA_PROFILE_INIT);

   profileQCharge.TPSTART(QUDA_PROFILE_COMPUTE);
   computeFmunu(Fmunu, *gauge);
   double charge = quda::computeQChargeDensity(Fmunu, d_qDensity);
   profileQCharge.TPSTOP(QUDA_PROFILE_COMPUTE);

   profileQCharge.TPSTART(QUDA_PROFILE_D2H);
   qudaMemcpy(h_qDensity, d_qDensity, size, cudaMemcpyDeviceToHost);
   profileQCharge.TPSTOP(QUDA_PROFILE_D2H);

   profileQCharge.TPSTART(QUDA_PROFILE_FREE);
   device_free(d_qDensity);
   profileQCharge.TPSTOP(QUDA_PROFILE_FREE);

   profileQCharge.TPSTOP(QUDA_PROFILE_TOTAL);

   return charge;
 }
new_quda_invert_param_
void new_quda_invert_param_(QudaInvertParam *param)
Definition: interface_quda.cpp:5207

tmp2
cudaColorSpinorField * tmp2
Definition: dslash_ctest.cpp:40

QudaInvertParam_s::action
double action[2]
Definition: quda.h:202

quda::CloverFieldParam::order
QudaCloverFieldOrder order
Definition: clover_field.h:21

QudaInvertParam_s::laplace3D
int laplace3D
Definition: quda.h:119

qudaGaugeParam
static QudaGaugeParam qudaGaugeParam
Definition: gauge_force_test.cpp:16

quda::computeFmunu
void computeFmunu(GaugeField &Fmunu, const GaugeField &gauge)
Compute the Fmunu tensor.
Definition: gauge_field_strength_tensor.cu:99

quda::CloverField::setRho
void setRho(double rho)
Bakes in the rho factor into the clover field, (for real diagonal additive Hasenbusch), e.g., A + rho.
Definition: clover_field.cpp:73

reunit_allow_svd
static bool reunit_allow_svd
Definition: hisq_stencil_test.cpp:55

invert_quda.h

QudaInvertParam_s::secs
double secs
Definition: quda.h:251

quda::GaugeFieldParam::t_boundary
QudaTboundary t_boundary
Definition: gauge_field.h:20

QudaInvertParam_s::iter_res_offset
double iter_res_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:188

copyExtendedResidentGaugeQuda
void copyExtendedResidentGaugeQuda(void *resident_gauge, QudaFieldLocation loc)
Definition: interface_quda.cpp:5441

quda::blas::ax
void ax(double a, ColorSpinorField &x)
Definition: blas_quda.cu:508

QudaInvertParam_s::dirac_order
QudaDiracFieldOrder dirac_order
Definition: quda.h:219

QUDA_VERBOSE
Definition: enum_quda.h:265

QudaInvertParam_s::mass_normalization
QudaMassNormalization mass_normalization
Definition: quda.h:208

qudaMemcpy
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:33

QudaInvertParam_s::tol_hq_offset
double tol_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:182

QudaGaugeParam_s::reconstruct_sloppy
QudaReconstructType reconstruct_sloppy
Definition: quda.h:53

QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:67

comm_finalize
void comm_finalize(void)
Definition: comm_common.cpp:606

quda::fatLongKSLink
void fatLongKSLink(cudaGaugeField *fat, cudaGaugeField *lng, const cudaGaugeField &gauge, const double *coeff)
Compute the fat and long links for an improved staggered (Kogut-Susskind) fermions.
Definition: llfat_quda.cu:532

quda::cloverDerivative
void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff, QudaParity parity)
Compute the derivative of the clover matrix in the direction mu,nu and compute the resulting force gi...
Definition: clover_deriv_quda.cu:174

QUDA_RECONSTRUCT_10
Definition: enum_quda.h:72

quda::LatticeFieldParam::ghostExchange
QudaGhostExchange ghostExchange
Definition: lattice_field.h:76

freeCloverQuda
void freeCloverQuda(void)
Definition: interface_quda.cpp:1440

quda::MG::dumpNullVectors
void dumpNullVectors() const
Dump the null-space vectors to disk. Will recurse dumping all levels.
Definition: multigrid.cpp:1045

do_not_profile_quda
static bool do_not_profile_quda
Definition: interface_quda.cpp:250

quda::DiracTwistedCloverPC
Definition: dirac_quda.h:650

computeKSLinkQuda
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
Definition: interface_quda.cpp:3974

quda::setDiracSloppyParam
void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
Definition: interface_quda.cpp:1661

profileStaggeredForce
static TimeProfile profileStaggeredForce("computeStaggeredForceQuda")
Profiler for computeHISQForceQuda.

quda::ColorSpinorParam::setPrecision
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
Definition: color_spinor_field.h:231

createGaugeFieldQuda
void * createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param)
Definition: interface_quda.cpp:4229

tmp1
cudaColorSpinorField * tmp1
Definition: dslash_ctest.cpp:40

QudaEigParam_s::use_dagger
QudaBoolean use_dagger
Definition: quda.h:401

QUDA_MAT_SOLUTION
Definition: enum_quda.h:151

QUDA_GAUGE_LAPLACEPC_DIRAC
Definition: enum_quda.h:318

commDimPartitioned
int commDimPartitioned(int dir)
Definition: comm_common.cpp:815

quda::CloverField::TrLog
double * TrLog() const
Definition: clover_field.h:88

loadCloverQuda
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:985

QUDA_DOMAIN_WALL_DIRAC
Definition: enum_quda.h:301

setVerbosityQuda
void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
Definition: interface_quda.cpp:323

QudaInvertParam_s::clover_location
QudaFieldLocation clover_location
Definition: quda.h:223

QudaInvertParam_s::solve_type
QudaSolveType solve_type
Definition: quda.h:205

quda::computeQCharge
double computeQCharge(const GaugeField &Fmunu)
Compute the topological charge.
Definition: gauge_qcharge.cu:97

QudaPrecision
enum QudaPrecision_s QudaPrecision

quda::cublas::destroy
void destroy()
Destroy the CUBLAS context.
Definition: blas_cublas.cu:38

tol_hq
double tol_hq
Definition: test_util.cpp:1657

freeGaugeQuda
void freeGaugeQuda(void)
Definition: interface_quda.cpp:1259

quda::DiracMobius::Dslash5
void Dslash5(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
Definition: dirac_mobius.cpp:75

kinetic_quda_
void kinetic_quda_(double *kin, void *momentum, QudaGaugeParam *param)
Evaluate the kinetic (momentum) contribution to classical Hamiltonian for Hybrid Monte Carlo...
Definition: interface_quda.cpp:5349

QUDA_GAUGE_COVDEV_DIRAC
Definition: enum_quda.h:319

profileFatLink
static TimeProfile profileFatLink("computeKSLinkQuda")
Profiler for computeGaugeForceQuda.

quda::SolverParam::delta
double delta
Definition: invert_quda.h:70

QudaGaugeParam_s::ga_pad
int ga_pad
Definition: quda.h:63

QudaInvertParam_s::c_5
double_complex c_5[QUDA_MAX_DWF_LS]
Definition: quda.h:112

QUDA_WILSONPC_DIRAC
Definition: enum_quda.h:298

load_gauge_quda_
void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param)
Definition: interface_quda.cpp:5149

QudaGaugeParam_s::make_resident_mom
int make_resident_mom
Definition: quda.h:83

quda::ColorSpinorParam::is_component
bool is_component
Definition: color_spinor_field.h:105

quda::ColorSpinorField
Definition: color_spinor_field.h:311

quda::cudaGaugeField::saveCPUField
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
Definition: cuda_gauge_field.cpp:749

computeStaggeredForceQuda
void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *h_force, void **x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:4271

momActionQuda
double momActionQuda(void *momentum, QudaGaugeParam *param)
Definition: interface_quda.cpp:5084

QudaGaugeParam_s::gauge_offset
size_t gauge_offset
Definition: quda.h:87

QudaInvertParam_s::mu
double mu
Definition: quda.h:114

quda::CloverField::V
void * V(bool inverse=false)
Definition: clover_field.h:74

momentum.h

quda::norm
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:1092

quda::MultiShiftCG
Multi-Shift Conjugate Gradient Solver.
Definition: invert_quda.h:1121

profileGaugeUpdate
static TimeProfile profileGaugeUpdate("updateGaugeFieldQuda")
Profiler for createExtendedGaugeField.

quda::DiracMatrix::shift
double shift
Shift term added onto operator (M/M^dag M/M M^dag + shift)
Definition: dirac_quda.h:1138

quda::fermion_force::hisqLongLinkForce
void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, double coeff)
Compute the long-link contribution to the fermion force.

quda::fermion_force::setUnitarizeForceConstants
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
Set the constant parameters for the force unitarization.

QUDA_MAX_MULTI_SHIFT
#define QUDA_MAX_MULTI_SHIFT
Maximum number of shifts supported by the multi-shift solver. This number may be changed if need be...
Definition: quda_constants.h:31

QUDA_MASS_NORMALIZATION
Definition: enum_quda.h:225

gaugeExtended
cudaGaugeField * gaugeExtended
Definition: interface_quda.cpp:121

QudaInvertParam_s::inv_type_precondition
QudaInverterType inv_type_precondition
Definition: quda.h:270

quda::createDirac
void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
Definition: interface_quda.cpp:1730

quda::ColorSpinorParam::nColor
int nColor
Definition: color_spinor_field.h:85

printQudaGaugeParam
void printQudaGaugeParam(QudaGaugeParam *param)
Definition: check_params.h:40

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21

kappa
double kappa
Definition: test_util.cpp:1647

QUDA_ASQTAD_DSLASH
Definition: enum_quda.h:93

QudaGaugeParam_s::type
QudaLinkType type
Definition: quda.h:42

QudaInvertParam_s::kappa
double kappa
Definition: quda.h:106

mpi_comm_handle.h

quda::blas::end
void end(void)
Definition: blas_quda.cu:489

gauge_tools.h

quda::computeCloverForce
void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< double > &coeff)
Compute the force contribution from the solver solution fields.
Definition: clover_outer_product.cu:465

QudaInvertParam_s::cuda_prec_ritz
QudaPrecision cuda_prec_ritz
Definition: quda.h:324

profileQCharge
static TimeProfile profileQCharge("qChargeQuda")
Profiler for APEQuda.

quda::QUDA_PROFILE_FREE
Definition: timer.h:111

QUDA_ASQTAD_MOM_LINKS
Definition: enum_quda.h:32

check_params.h

quda::DiracM
Definition: dirac_quda.h:1141

errorQuda
#define errorQuda(...)
Definition: util_quda.h:121

loadSloppyCloverQuda
void loadSloppyCloverQuda(const QudaPrecision prec[])

quda::blas::norm2
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:721

color_spinor_field.h

quda::setUnitarizeLinksConstants
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
Definition: unitarize_links_quda.cu:72

quda::blas::init
void init()
Definition: blas_quda.cu:483

QudaInvertParam_s::dslash_type
QudaDslashType dslash_type
Definition: quda.h:102

QUDA_BQCD_GAUGE_ORDER
Definition: enum_quda.h:46

QudaGaugeParam_s::reconstruct_precondition
QudaReconstructType reconstruct_precondition
Definition: quda.h:59

quda::CloverFieldParam::create
QudaFieldCreate create
Definition: clover_field.h:22

QudaInvertParam_s::inv_type
QudaInverterType inv_type
Definition: quda.h:103

quda_fortran.h
Fortran interface functions.

QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:326

quda::CloverField::Bytes
size_t Bytes() const
Definition: clover_field.h:98

QudaInvertParam_s::cuda_prec
QudaPrecision cuda_prec
Definition: quda.h:214

QudaInvertParam_s::return_clover_inverse
int return_clover_inverse
Definition: quda.h:242

host_free
#define host_free(ptr)
Definition: malloc_quda.h:71

QudaMultigridParam_s::smoother_solve_type
QudaSolveType smoother_solve_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:589

QUDA_SMEARED_LINKS
Definition: enum_quda.h:28

quda::fermion_force::hisqCompleteForce
void hisqCompleteForce(GaugeField &oprod, const GaugeField &link)
Multiply the computed the force matrix by the gauge field and perform traceless anti-hermitian projec...

QUDA_ODD_PARITY
Definition: enum_quda.h:288

performSTOUTnStep
void performSTOUTnStep(unsigned int nSteps, double rho)
Definition: interface_quda.cpp:5563

QUDA_SPACE_SPIN_COLOR_FIELD_ORDER
Definition: enum_quda.h:350

quda::SolverParam::true_res_hq
double true_res_hq
Definition: invert_quda.h:133

QUDA_HALF_PRECISION
Definition: enum_quda.h:60

QUDA_QDP_GAUGE_ORDER
Definition: enum_quda.h:41

quda::GaugeFieldParam::site_offset
size_t site_offset
Definition: gauge_field.h:45

quda::sqrt
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120

QUDA_TWISTED_CLOVERPC_DIRAC
Definition: enum_quda.h:314

quda::CloverField
Definition: clover_field.h:45

quda::blas::cDotProduct
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
Definition: reduce_quda.cu:764

quda::STOUTStep
void STOUTStep(GaugeField &dataDs, const GaugeField &dataOr, double rho)
Apply STOUT smearing to the gauge field.
Definition: gauge_stout.cu:129

QUDA_WILSON_LINKS
Definition: enum_quda.h:29

epsilon
double epsilon
Definition: test_util.cpp:1649

cudaMom
cudaGaugeField * cudaMom
Definition: hisq_paths_force_test.cpp:27

quda::updateMomentum
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname)
Definition: momentum.cu:328

quda::DiracMobiusPC
Definition: dirac_quda.h:491

setOutputPrefix
void setOutputPrefix(const char *prefix)
Definition: util_quda.cpp:69

loadGaugeQuda
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
Definition: interface_quda.cpp:729

QUDA_INVALID_SCHWARZ
Definition: enum_quda.h:183

QudaInvertParam_s::cpu_prec
QudaPrecision cpu_prec
Definition: quda.h:213

plaq_quda_
void plaq_quda_(double plaq[3])
Definition: interface_quda.cpp:5415

profileMulti
static TimeProfile profileMulti("invertMultiShiftQuda")
Profiler for eigensolveQuda.

QUDA_SUMMARIZE
Definition: enum_quda.h:264

QUDA_FLOAT2_GAUGE_ORDER
Definition: enum_quda.h:39

profileOvrImpSTOUT
static TimeProfile profileOvrImpSTOUT("OvrImpSTOUTQuda")
Profiler for projectSU3Quda.

quda::GaugeField::checksum
uint64_t checksum(bool mini=false) const
Definition: gauge_field.cpp:355

enable_profiler
static bool enable_profiler
Definition: interface_quda.cpp:249

quda::MGParam
Definition: multigrid.h:26

tmp
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44

gaugeLongPrecise
cudaGaugeField * gaugeLongPrecise
Definition: interface_quda.cpp:130

QUDA_FULL_SITE_SUBSET
Definition: enum_quda.h:333

rank
static int rank
Definition: comm_mpi.cpp:44

quda::ColorSpinorField::Even
const ColorSpinorField & Even() const
Definition: color_spinor_field.cpp:608

quda::saveTuneCache
void saveTuneCache(bool error=false)
Definition: tune.cpp:426

profileGaugeForce
static TimeProfile profileGaugeForce("computeGaugeForceQuda")
Profiler for updateGaugeFieldQuda.

quda::ColorSpinorField::Odd
const ColorSpinorField & Odd() const
Definition: color_spinor_field.cpp:616

QUDA_WILSON_DIRAC
Definition: enum_quda.h:297

quda::ColorSpinorField::Create
static ColorSpinorField * Create(const ColorSpinorParam &param)
Definition: color_spinor_field.cpp:752

QUDA_COVDEV_DSLASH
Definition: enum_quda.h:97

QUDA_NORMERR_SOLVE
Definition: enum_quda.h:165

profileAPE
static TimeProfile profileAPE("APEQuda")
Profiler for STOUTQuda.

QudaGaugeParam_s::staggered_phase_type
QudaStaggeredPhase staggered_phase_type
Definition: quda.h:71

QUDA_DIRECT_SOLVE
Definition: enum_quda.h:161

quda::Nstream
const int Nstream
Definition: quda_internal.h:83

QudaMultigridParam_s::smoother_schwarz_type
QudaSchwarzType smoother_schwarz_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:579

QUDA_NULL_FIELD_CREATE
Definition: enum_quda.h:359

gaugeFatPrecondition
cudaGaugeField * gaugeFatPrecondition
Definition: interface_quda.cpp:125

quda::fermion_force::unitarizeForce
void unitarizeForce(cudaGaugeField &newForce, const cudaGaugeField &oldForce, const cudaGaugeField &gauge, int *unitarization_failed)
Unitarize the fermion force.

QUDA_VERSION_MINOR
#define QUDA_VERSION_MINOR
Definition: quda_constants.h:2

QudaInvertParam_s::trlogA
double trlogA[2]
Definition: quda.h:237

QUDA_EIGCG_INVERTER
Definition: enum_quda.h:111

quda::assertAllMemFree
void assertAllMemFree()
Definition: malloc.cpp:384

quda::CloverFieldParam::clover
void * clover
Definition: clover_field.h:12

quda::copy
__host__ __device__ void copy(T1 &a, const T2 &b)
Definition: register_traits.h:152

invertMultiSrcQuda
void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param)
Definition: interface_quda.cpp:3234

R
static int R[4]
Definition: interface_quda.cpp:84

setMPICommHandleQuda
void setMPICommHandleQuda(void *mycomm)
Definition: interface_quda.cpp:368

QudaInvertParam_s::dagger
QudaDagType dagger
Definition: quda.h:207

quda::blas::copy
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: copy_quda.cu:355

quda::multigrid_solver::dSmoothSloppy
Dirac * dSmoothSloppy
Definition: multigrid.h:469

quda::cudaColorSpinorField
Definition: color_spinor_field.h:575

cpuMom
cpuGaugeField * cpuMom
Definition: hisq_paths_force_test.cpp:28

profilePlaq
static TimeProfile profilePlaq("plaqQuda")
Profiler for wuppertalQuda.

QUDA_TWIST_SINGLET
Definition: enum_quda.h:399

free_clover_quda_
void free_clover_quda_(void)
Definition: interface_quda.cpp:5154

gauge_param
QudaGaugeParam gauge_param
Definition: dslash_ctest.cpp:36

initCommsGridQuda
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
Definition: interface_quda.cpp:401

quda::GaugeField::Geometry
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:258

createExtendedGauge
static cudaGaugeField * createExtendedGauge(cudaGaugeField &in, const int *R, TimeProfile &profile, bool redundant_comms=false, QudaReconstructType recon=QUDA_RECONSTRUCT_INVALID)
Definition: interface_quda.cpp:692

computeGaugeFixingOVRQuda
int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with overrelaxation with support for single and multi GPU.
Definition: interface_quda.cpp:5634

QudaInvertParam_s::cuda_prec_refinement_sloppy
QudaPrecision cuda_prec_refinement_sloppy
Definition: quda.h:216

quda::massRescale
void massRescale(cudaColorSpinorField &b, QudaInvertParam &param)
Definition: interface_quda.cpp:1769

QUDA_TIFR_PADDED_GAUGE_ORDER
Definition: enum_quda.h:48

QUDA_MOBIUS_DOMAIN_WALL_DIRAC
Definition: enum_quda.h:305

quda::ColorSpinorField::Component
ColorSpinorField & Component(const int idx) const
Definition: color_spinor_field.cpp:653

QudaInvertParam_s::chrono_replace_last
int chrono_replace_last
Definition: quda.h:356

QudaMultigridParam_s::smoother_halo_precision
QudaPrecision smoother_halo_precision[QUDA_MAX_MG_LEVEL]
Definition: quda.h:576

QudaInvertParam_s::clover_cuda_prec_refinement_sloppy
QudaPrecision clover_cuda_prec_refinement_sloppy
Definition: quda.h:227

destroyDeflationQuda
void destroyDeflationQuda(void *df)
Definition: interface_quda.cpp:2823

QudaGaugeParam_s::gauge_order
QudaGaugeFieldOrder gauge_order
Definition: quda.h:43

computeCloverForceQuda
void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **h_p, double *coeff, double kappa2, double ck, int nvector, double multiplicity, void *gauge, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:4684

quda::computeQChargeDensity
double computeQChargeDensity(const GaugeField &Fmunu, void *result)
Compute the topological charge density per lattice site.
Definition: gauge_qcharge.cu:116

QudaGaugeParam_s::mom_offset
size_t mom_offset
Definition: quda.h:88

quda::CloverFieldParam::inverse
bool inverse
Definition: clover_field.h:11

profileInit2End
static TimeProfile profileInit2End("initQuda-endQuda", false)

quda::DiracParam::c_5
Complex c_5[QUDA_MAX_DWF_LS]
Definition: dirac_quda.h:28

quda::forceMonitor
bool forceMonitor()
Whether we are monitoring the force or not.
Definition: momentum.cu:13

quda::SolverParam::iter
int iter
Definition: invert_quda.h:139

gaugeFatRefinement
cudaGaugeField * gaugeFatRefinement
Definition: interface_quda.cpp:126

QUDA_TWIST_NONDEG_DOUBLET
Definition: enum_quda.h:400

QUDA_MG_INVERTER
Definition: enum_quda.h:117

QudaMultigridParam_s::secs
double secs
Definition: quda.h:645

comm_gpuid
int comm_gpuid(void)
Definition: comm_common.cpp:146

QudaInvertParam_s::return_clover
int return_clover
Definition: quda.h:241

LexMapData::ndim
int ndim
Definition: interface_quda.cpp:332

length
int length[]
Definition: gauge_force_test.cpp:34

quda::gaugeForce
void gaugeForce(GaugeField &mom, const GaugeField &u, double coeff, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
Compute the gauge-force contribution to the momentum.
Definition: gauge_force.cu:340

gaugeLongRefinement
cudaGaugeField * gaugeLongRefinement
Definition: interface_quda.cpp:133

dumpMultigridQuda
void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
Dump the null-space vectors to disk.
Definition: interface_quda.cpp:2726

quda::MG::reset
void reset(bool refresh=false)
This method resets the solver, e.g., when a parameter has changed such as the mass.
Definition: multigrid.cpp:117

saveGaugeQuda
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
Definition: interface_quda.cpp:936

QudaInvertParam_s::make_resident_solution
int make_resident_solution
Definition: quda.h:347

QUDA_UKQCD_GAMMA_BASIS
Definition: enum_quda.h:368

computeHISQForceQuda
void computeHISQForceQuda(void *const milc_momentum, double dt, const double level2_coeff[6], const double fat7_coeff[6], const void *const w_link, const void *const v_link, const void *const u_link, void **fermion, int num_terms, int num_naik_terms, double **coeff, QudaGaugeParam *gParam)
Definition: interface_quda.cpp:4433

quda::CloverFieldParam::invNorm
void * invNorm
Definition: clover_field.h:15

QUDA_DAG_YES
Definition: enum_quda.h:219

quda::gaugefixingOVR
void gaugefixingOVR(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta)
Gauge fixing with overrelaxation with support for single and multi GPU.
Definition: gauge_fix_ovr.cu:1606

invert_multishift_quda_
void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param)
Definition: interface_quda.cpp:5171

QudaMultigridParam_s::setup_minimize_memory
QudaBoolean setup_minimize_memory
Definition: quda.h:609

QudaGaugeParam_s::overwrite_mom
int overwrite_mom
Definition: quda.h:78

profileMomAction
static TimeProfile profileMomAction("momActionQuda")
Profiler for endQuda.

gauge_force_quda.h

qChargeDensityQuda
double qChargeDensityQuda(void *h_qDensity)
Calculates the topological charge from gaugeSmeared, if it exist, or from gaugePrecise if no smeared ...
Definition: interface_quda.cpp:5880

quda::Dirac::MdagM
virtual void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const =0

InitMagma
static bool InitMagma
Definition: interface_quda.cpp:93

quda::CloverFieldParam::mu2
double mu2
Definition: clover_field.h:18

QudaInvertParam_s::compute_clover_trlog
int compute_clover_trlog
Definition: quda.h:236

quda
Definition: blas_cublas.h:5

quda::LatticeFieldParam::siteSubset
QudaSiteSubset siteSubset
Definition: lattice_field.h:71

eigensolveQuda
void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param)
Definition: interface_quda.cpp:2408

QudaInvertParam_s::chrono_index
int chrono_index
Definition: quda.h:365

gitversion
char * gitversion
Definition: version.cpp:4

QudaInvertParam_s::clover_cuda_prec_sloppy
QudaPrecision clover_cuda_prec_sloppy
Definition: quda.h:226

QudaInvertParam_s::compute_action
int compute_action
Definition: quda.h:197

gaugeLongExtended
cudaGaugeField * gaugeLongExtended
Definition: interface_quda.cpp:129

QudaInvertParam_s::chrono_precision
QudaPrecision chrono_precision
Definition: quda.h:368

quda::cloverInvert
void cloverInvert(CloverField &clover, bool computeTraceLog)
This function compute the Cholesky decomposition of each clover matrix and stores the clover inverse ...
Definition: clover_invert.cu:106

QudaInvertParam_s::input_location
QudaFieldLocation input_location
Definition: quda.h:99

destroyGaugeFieldQuda
void destroyGaugeFieldQuda(void *gauge)
Definition: interface_quda.cpp:4265

QudaEigParam_s::use_poly_acc
QudaBoolean use_poly_acc
Definition: quda.h:387

quda::computeCloverSigmaTrace
void computeCloverSigmaTrace(GaugeField &output, const CloverField &clover, double coeff)
Compute the matrix tensor field necessary for the force calculation from the clover trace action...
Definition: clover_trace_quda.cu:242

dslashQuda_4dpc
void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
Definition: interface_quda.cpp:1945

QudaInvertParam_s::true_res_hq_offset
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:191

QUDA_MEMORY_PINNED
Definition: enum_quda.h:13

blas_magma.h

quda::DiracParam::gauge
cudaGaugeField * gauge
Definition: dirac_quda.h:31

dirac_quda.h

quda::deflated_solver
Definition: deflation.h:180

QudaGaugeParam_s::site_size
size_t site_size
Definition: quda.h:89

CloseMagma
void CloseMagma()
Definition: blas_magma.cu:323

computeGaugeFixingFFTQuda
int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
Definition: interface_quda.cpp:5716

cloverPrecondition
cudaCloverField * cloverPrecondition
Definition: interface_quda.cpp:139

quda::CloverField::Csw
double Csw() const
Definition: clover_field.h:118

QudaInvertParam_s::use_init_guess
QudaUseInitGuess use_init_guess
Definition: quda.h:231

quda::deflated_solver::deflParam
DeflationParam * deflParam
Definition: deflation.h:187

QUDA_REFERENCE_FIELD_CREATE
Definition: enum_quda.h:362

QudaMultigridParam_s::n_vec
int n_vec[QUDA_MAX_MG_LEVEL]
Definition: quda.h:492

init_quda_
void init_quda_(int *dev)
Definition: interface_quda.cpp:5145

quda::pool::flush_pinned
void flush_pinned()
Free all outstanding pinned-memory allocations.
Definition: malloc.cpp:566

getGaugePadding
int getGaugePadding(GaugeFieldParam &param)
Definition: interface_quda.cpp:4061

quda::APEStep
void APEStep(GaugeField &dataDs, const GaugeField &dataOr, double alpha)
Apply APE smearing to the gauge field.
Definition: gauge_ape.cu:128

param
QudaGaugeParam param
Definition: pack_test.cpp:17

QudaInvertParam_s::b_5
double_complex b_5[QUDA_MAX_DWF_LS]
Definition: quda.h:111

comm_quda.h

openMagma
void openMagma()
Definition: interface_quda.cpp:95

quda::computeMomAction
double computeMomAction(const GaugeField &mom)
Compute and return global the momentum action 1/2 mom^2.
Definition: momentum.cu:178

quda::setDiracParam
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
Definition: interface_quda.cpp:1562

comm_init
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
Initialize the communications, implemented in comm_single.cpp, comm_qmp.cpp, and comm_mpi.cpp.
Definition: comm_mpi.cpp:58

ndim
static int ndim
Definition: layout_hyper.c:53

QudaInvertParam_s::solution_type
QudaSolutionType solution_type
Definition: quda.h:204

QudaEigParam_s::nConv
int nConv
Definition: quda.h:420

QudaMultigridParam_s::setup_location
QudaFieldLocation setup_location[QUDA_MAX_MG_LEVEL]
Definition: quda.h:601

QUDA_MATPC_SOLUTION
Definition: enum_quda.h:153

QudaEigParam_s::mem_type_ritz
QudaMemoryType mem_type_ritz
Definition: quda.h:450

QudaInvertParam_s::solver_normalization
QudaSolverNormalization solver_normalization
Definition: quda.h:209

quda::QUDA_PROFILE_D2H
Definition: timer.h:105

svd_rel_error
static double svd_rel_error
Definition: hisq_stencil_test.cpp:57

quda::GaugeField::Ncolor
int Ncolor() const
Definition: gauge_field.h:249

quda::LatticeFieldParam::x
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:67

quda::multigrid_solver::dSmooth
Dirac * dSmooth
Definition: multigrid.h:468

QudaInvertParam_s::clover_cuda_prec
QudaPrecision clover_cuda_prec
Definition: quda.h:225

QUDA_TWISTED_MASSPC_DIRAC
Definition: enum_quda.h:312

staggered_oprod.h

quda::LatticeField::R
const int * R() const
Definition: lattice_field.h:536

solutionResident
std::vector< cudaColorSpinorField * > solutionResident
Definition: interface_quda.cpp:145

QUDA_MATPC_EVEN_EVEN_ASYMMETRIC
Definition: enum_quda.h:212

QUDA_SILENT
Definition: enum_quda.h:263

quda::GaugeFieldParam
Definition: gauge_field.h:10

invertMultiShiftQuda
void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
Definition: interface_quda.cpp:3579

quda::ColorSpinorParam::is_composite
bool is_composite
for deflation solvers:
Definition: color_spinor_field.h:103

QUDA_TWISTED_MASS_DIRAC
Definition: enum_quda.h:311

quda::DiracMobius::Dslash4
void Dslash4(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
Definition: dirac_mobius.cpp:48

quda::cudaGaugeField::loadCPUField
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
Definition: cuda_gauge_field.cpp:737

QudaInvertParam_s::chrono_use_resident
int chrono_use_resident
Definition: quda.h:359

quda::DiracParam
Definition: dirac_quda.h:19

QUDA_GHOST_EXCHANGE_EXTENDED
Definition: enum_quda.h:484

multigrid.h

quda::TimeProfile::Last
double Last(QudaProfileType idx)
Definition: timer.h:251

quda::ColorSpinorParam::composite_dim
int composite_dim
Definition: color_spinor_field.h:104

initQuda
void initQuda(int dev)
Definition: interface_quda.cpp:679

quda::DiracParam::mu
double mu
Definition: dirac_quda.h:37

comm_size
int comm_size(void)
Definition: comm_mpi.cpp:88

QudaEigParam_s::invert_param
QudaInvertParam * invert_param
Definition: quda.h:381

quda::QUDA_PROFILE_CHRONO
Definition: timer.h:113

quda::cudaCloverField
Definition: clover_field.h:168

quda::DiracParam::b_5
Complex b_5[QUDA_MAX_DWF_LS]
Definition: dirac_quda.h:27

gaugeFatSloppy
cudaGaugeField * gaugeFatSloppy
Definition: interface_quda.cpp:124

deviceProp
cudaDeviceProp deviceProp
Definition: interface_quda.cpp:156

QUDA_COPY_FIELD_CREATE
Definition: enum_quda.h:361

qudaDeviceSynchronize
#define qudaDeviceSynchronize()
Definition: quda_cuda_api.h:145

quda::ax
void ax(const double &a, GaugeField &u)
Scale the gauge field by the scalar a.
Definition: gauge_field.cpp:349

QudaEigParam_s::secs
double secs
Definition: quda.h:468

quda::EigenSolver
Definition: eigensolve_quda.h:11

QUDA_EVEN_ODD_SITE_ORDER
Definition: enum_quda.h:340

tol
double tol
Definition: test_util.cpp:1656

quda::pool::init
void init()
Initialize the memory pool allocator.
Definition: malloc.cpp:457

quda::DiracParam::mass
double mass
Definition: dirac_quda.h:24

QudaInvertParam_s::output_location
QudaFieldLocation output_location
Definition: quda.h:100

QUDA_USE_INIT_GUESS_YES
Definition: enum_quda.h:429

quda::unitarizeLinks
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
Definition: unitarize_links_quda.cu:500

quda::fermion_force::hisqStaplesForce
void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff[6])
Compute the fat-link contribution to the fermion force.

QudaInvertParam_s::clover_cuda_prec_precondition
QudaPrecision clover_cuda_prec_precondition
Definition: quda.h:228

quda::ColorSpinorParam::location
QudaFieldLocation location
Definition: color_spinor_field.h:83

QUDA_RECONSTRUCT_12
Definition: enum_quda.h:68

inv_param
QudaInvertParam inv_param
Definition: covdev_test.cpp:37

setNumaAffinityNVML
int setNumaAffinityNVML(int deviceid)
Definition: numa_affinity.cpp:15

quda::CloverFieldParam
Definition: clover_field.h:9

quda::canReuseResidentGauge
bool canReuseResidentGauge(QudaInvertParam *inv_param)
Definition: interface_quda.cpp:2232

quda::EigenSolver::create
static EigenSolver * create(QudaEigParam *eig_param, const DiracMatrix &mat, TimeProfile &profile)
Creates the eigensolver using the parameters given and the matrix.
Definition: eigensolve_quda.cpp:109

quda::DiracMdag
Definition: dirac_quda.h:1258

apply_staggered_phase_quda_
void apply_staggered_phase_quda_()
Apply the staggered phase factors to the resident gauge field.
Definition: interface_quda.cpp:5328

LexMapData
Definition: interface_quda.cpp:331

QUDA_CLOVERPC_DIRAC
Definition: enum_quda.h:300

quda::SolverParam::true_res_hq_offset
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:187

freeSloppyGaugeQuda
void freeSloppyGaugeQuda()
Definition: interface_quda.cpp:1231

quda::SolverParam::updateInvertParam
void updateInvertParam(QudaInvertParam &param, int offset=-1)
Definition: invert_quda.h:428

QudaInvertParam_s::m5
double m5
Definition: quda.h:108

QudaGaugeParam_s
Definition: quda.h:32

quda::DiracParam::Ls
int Ls
Definition: dirac_quda.h:26

quda::GaugeField::Bytes
size_t Bytes() const
Definition: gauge_field.h:311

QudaMultigridParam_s
Definition: quda.h:476

quda::GaugeField::Create
static GaugeField * Create(const GaugeFieldParam &param)
Create the gauge field, with meta data specified in the parameter struct.
Definition: gauge_field.cpp:359

cpuFatLink
cpuGaugeField * cpuFatLink
Definition: hisq_unitarize_force_test.cpp:17

quda::ColorSpinorParam::fieldOrder
QudaFieldOrder fieldOrder
Definition: color_spinor_field.h:93

quda::arpack_solve
void arpack_solve(std::vector< ColorSpinorField *> &h_evecs, std::vector< Complex > &h_evals, const DiracMatrix &mat, QudaEigParam *eig_param, TimeProfile &profile)
The QUDA interface function. One passes two allocated arrays to hold the the eigenmode data...
Definition: quda_arpack_interface.cpp:507

quda::DiracDomainWall4DPC::Dslash5inv
void Dslash5inv(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity, const double &kappa5) const
Definition: dirac_domain_wall_4d.cpp:131

quda::GaugeField::StaggeredPhaseApplied
bool StaggeredPhaseApplied() const
Definition: gauge_field.h:260

gaussGaugeQuda
void gaussGaugeQuda(unsigned long long seed, double sigma)
Generate Gaussian distributed fields and store in the resident gauge field. We create a Gaussian-dist...
Definition: interface_quda.cpp:5386

quda::cudaGaugeField::exchangeExtendedGhost
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
Definition: cuda_gauge_field.cpp:510

updateMultigridQuda
void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
Updates the multigrid preconditioner for the new gauge / clover field.
Definition: interface_quda.cpp:2645

gaugeRefinement
cudaGaugeField * gaugeRefinement
Definition: interface_quda.cpp:120

quda::flushProfile
void flushProfile()
Flush profile contents, setting all counts to zero.
Definition: tune.cpp:504

QudaInvertParam_s::cuda_prec_sloppy
QudaPrecision cuda_prec_sloppy
Definition: quda.h:215

quda::deflated_solver::m
DiracMatrix * m
Definition: deflation.h:183

qChargeQuda
double qChargeQuda()
Definition: interface_quda.cpp:5846

initialized
static bool initialized
Profiler for initQuda.
Definition: interface_quda.cpp:159

quda::DiracClover::Clover
void Clover(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
Definition: dirac_clover.cpp:48

cloverSloppy
cudaCloverField * cloverSloppy
Definition: interface_quda.cpp:138

quda::multigrid_solver::multigrid_solver
multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile)
Definition: interface_quda.cpp:2547

QudaInvertParam_s::verbosity
QudaVerbosity verbosity
Definition: quda.h:244

quda::QUDA_PROFILE_EPILOGUE
Definition: timer.h:110

quda::DiracParam::laplace3D
int laplace3D
Definition: dirac_quda.h:34

reunit_svd_only
static bool reunit_svd_only
Definition: hisq_stencil_test.cpp:56

csParam
ColorSpinorParam csParam
Definition: pack_test.cpp:24

quda::Dirac::prepare
virtual void prepare(ColorSpinorField *&src, ColorSpinorField *&sol, ColorSpinorField &x, ColorSpinorField &b, const QudaSolutionType) const =0

free_gauge_quda_
void free_gauge_quda_()
Definition: interface_quda.cpp:5150

QudaInvertParam_s::tol_offset
double tol_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:179

QUDA_MILC_GAUGE_ORDER
Definition: enum_quda.h:44

cloverRefinement
cudaCloverField * cloverRefinement
Definition: interface_quda.cpp:140

quda::DiracParam::commDim
int commDim[QUDA_MAX_DIM]
Definition: dirac_quda.h:44

QudaInvertParam_s::true_res_offset
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:185

MatQuda
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:2088

profileSTOUT
static TimeProfile profileSTOUT("STOUTQuda")
Profiler for OvrImpSTOUTQuda.

quda::QUDA_PROFILE_H2D
Definition: timer.h:104

load_clover_quda_
void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:5152

projectSU3Quda
void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
Definition: interface_quda.cpp:4971

quda::DiracParam::dagger
QudaDagType dagger
Definition: dirac_quda.h:30

in
cpuColorSpinorField * in
Definition: staggered_invert_test.cpp:98

newQudaInvertParam
QudaInvertParam newQudaInvertParam(void)

cudaFatLink
cudaGaugeField * cudaFatLink
Definition: hisq_unitarize_force_test.cpp:16

quda::quda_version
static const std::string quda_version
Definition: tune.cpp:114

quda::multigrid_solver::profile
TimeProfile & profile
Definition: multigrid.h:481

quda::CloverFieldParam::setPrecision
void setPrecision(QudaPrecision precision)
Definition: clover_field.h:23

quda::pool::flush_device
void flush_device()
Free all outstanding device-memory allocations.
Definition: malloc.cpp:578

QudaInvertParam_s::gflops
double gflops
Definition: quda.h:250

QUDA_BOOLEAN_TRUE
Definition: enum_quda.h:453

quda::Dirac::Dagger
void Dagger(QudaDagType dag) const
Definition: dirac_quda.h:182

quda::Solver::create
static Solver * create(SolverParam &param, DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matPrecon, TimeProfile &profile)
Definition: solver.cpp:33

cpuULink
cpuGaugeField * cpuULink
Definition: unitarize_link_test.cpp:53

QudaEigParam_s::gflops
double gflops
Definition: quda.h:465

quda::cpuGaugeField
Definition: gauge_field.h:580

quda::TimeProfile::Print
void Print()
Definition: timer.cpp:7

QudaGaugeParam_s::cuda_prec_precondition
QudaPrecision cuda_prec_precondition
Definition: quda.h:58

free_sloppy_gauge_quda_
void free_sloppy_gauge_quda_()
Definition: interface_quda.cpp:5151

QUDA_DOMAIN_WALL_4DPC_DIRAC
Definition: enum_quda.h:304

quda::ColorSpinorField::SiteSubset
QudaSiteSubset SiteSubset() const
Definition: color_spinor_field.h:481

QudaInvertParam_s::clover_order
QudaCloverFieldOrder clover_order
Definition: quda.h:230

quda::createDslashEvents
void createDslashEvents()
Definition: dslash_quda.cu:95

updateGaugeFieldQuda
void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
Definition: interface_quda.cpp:4869

quda::GaugeField::Anisotropy
double Anisotropy() const
Definition: gauge_field.h:252

quda::ColorSpinorParam::gammaBasis
QudaGammaBasis gammaBasis
Definition: color_spinor_field.h:94

pool_device_malloc
#define pool_device_malloc(size)
Definition: malloc_quda.h:125

quda::size
constexpr int size
Definition: dslash_domain_wall_4d.cuh:8

remove_staggered_phase_quda_
void remove_staggered_phase_quda_()
Remove the staggered phase factors to the resident gauge field.
Definition: interface_quda.cpp:5338

QudaInvertParam_s::nev
int nev
Definition: quda.h:328

lex_rank_from_coords
static int lex_rank_from_coords(const int *coords, void *fdata)
Definition: interface_quda.cpp:339

QUDA_PERIODIC_T
Definition: enum_quda.h:54

quda::QUDA_PROFILE_COMPUTE
Definition: timer.h:108

QUDA_CLOVER_WILSON_DSLASH
Definition: enum_quda.h:88

freeSloppyCloverQuda
void freeSloppyCloverQuda()
Definition: interface_quda.cpp:1428

quda::GaugeFieldParam::order
QudaGaugeFieldOrder order
Definition: gauge_field.h:17

quda::multigrid_solver::m
DiracM * m
Definition: multigrid.h:471

warningQuda
#define warningQuda(...)
Definition: util_quda.h:133

quda::Solver::blocksolve
virtual void blocksolve(ColorSpinorField &out, ColorSpinorField &in)
Definition: solver.cpp:198

quda::flushForceMonitor
void flushForceMonitor()
Flush any outstanding force monitoring information.
Definition: momentum.cu:29

quda::cudaGaugeField
Definition: gauge_field.h:404

performAPEnStep
void performAPEnStep(unsigned int nSteps, double alpha)
Definition: interface_quda.cpp:5528

quda::pow
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:111

profileClover
static TimeProfile profileClover("loadCloverQuda")
Profiler for dslashQuda.

performWuppertalnStep
void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int nSteps, double alpha)
Definition: interface_quda.cpp:5457

QUDA_VERSION_SUBMINOR
#define QUDA_VERSION_SUBMINOR
Definition: quda_constants.h:3

QUDA_PARITY_SITE_SUBSET
Definition: enum_quda.h:332

QudaEigParam_s::use_norm_op
QudaBoolean use_norm_op
Definition: quda.h:402

quda::DiracParam::type
QudaDiracType type
Definition: dirac_quda.h:22

QUDA_MAX_CHRONO
#define QUDA_MAX_CHRONO
Definition: interface_quda.cpp:148

QUDA_FLOAT2_FIELD_ORDER
Definition: enum_quda.h:348

quda::cudaGaugeField::zero
void zero()
Definition: cuda_gauge_field.cpp:843

QUDA_PCG_INVERTER
Definition: enum_quda.h:109

OpenMagma
void OpenMagma()
Definition: blas_magma.cu:307

unregister_pinned_quda_
void unregister_pinned_quda_(void *ptr)
Pinned a pre-existing memory allocation.
Definition: interface_quda.cpp:5199

QudaMultigridParam_s::precision_null
QudaPrecision precision_null[QUDA_MAX_MG_LEVEL]
Definition: quda.h:495

quda::SolverParam::tol_hq
double tol_hq
Definition: invert_quda.h:121

quda::multigrid_solver::mg
MG * mg
Definition: multigrid.h:480

QudaGaugeParam_s::cuda_prec_sloppy
QudaPrecision cuda_prec_sloppy
Definition: quda.h:52

quda::cudaGaugeField::exchangeGhost
void exchangeGhost(QudaLinkDirection link_direction=QUDA_LINK_BACKWARDS)
Exchange the ghost and store store in the padded region.
Definition: cuda_gauge_field.cpp:216

invalidate_clover
static bool invalidate_clover
Definition: interface_quda.cpp:727

quda::DiracMobius::Dslash4pre
void Dslash4pre(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
Definition: dirac_mobius.cpp:60

QudaInvertParam_s::chrono_make_resident
int chrono_make_resident
Definition: quda.h:353

QUDA_GHOST_EXCHANGE_NO
Definition: enum_quda.h:482

QudaEigParam_s::arpack_check
QudaBoolean arpack_check
Definition: quda.h:429

gauge_update_quda.h

quda::LatticeField::Volume
int Volume() const
Definition: lattice_field.h:504

svd_abs_error
static double svd_abs_error
Definition: hisq_stencil_test.cpp:58

quda::QUDA_PROFILE_PREAMBLE
Definition: timer.h:107

quda::DiracParam::matpcType
QudaMatPCType matpcType
Definition: dirac_quda.h:29

quda::SolverParam::true_res_offset
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:181

comm_set_gridsize_
void comm_set_gridsize_(int *grid)
Definition: interface_quda.cpp:5370

X
int X[4]
Definition: covdev_test.cpp:70

quda::Complex
std::complex< double > Complex
Definition: quda_internal.h:46

QudaInvertParam_s::offset
double offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:176

quda::projectSU3
void projectSU3(cudaGaugeField &U, double tol, int *fails)
Project the input gauge field onto the SU(3) group. This is a destructive operation. The number of link failures is reported so appropriate action can be taken.
Definition: unitarize_links_quda.cu:590

gaugeFatPrecise
cudaGaugeField * gaugeFatPrecise
Definition: interface_quda.cpp:123

QudaCommsMap
int(* QudaCommsMap)(const int *coords, void *fdata)
Definition: comm_quda.h:12

initQudaMemory
void initQudaMemory()
Definition: interface_quda.cpp:638

profileDslash
static TimeProfile profileDslash("dslashQuda")
Profiler for invertQuda.

quda::saveProfile
void saveProfile(const std::string label="")
Save profile to disk.
Definition: tune.cpp:514

saveGaugeFieldQuda
void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param)
Definition: interface_quda.cpp:4252

init_quda_memory_
void init_quda_memory_()
Definition: interface_quda.cpp:5147

profileGauge
static TimeProfile profileGauge("loadGaugeQuda")
Profile for loadCloverQuda.

QUDA_LAPLACE_DSLASH
Definition: enum_quda.h:96

cloverPrecise
cudaCloverField * cloverPrecise
Definition: interface_quda.cpp:137

QudaParity
enum QudaParity_s QudaParity

register_pinned_quda_
void register_pinned_quda_(void *ptr, size_t *bytes)
Pinned a pre-existing memory allocation.
Definition: interface_quda.cpp:5194

QudaGaugeParam_s::reconstruct
QudaReconstructType reconstruct
Definition: quda.h:50

QudaGaugeParam_s::cuda_prec
QudaPrecision cuda_prec
Definition: quda.h:49

quda::DiracCloverPC
Definition: dirac_quda.h:305

quda::applyU
void applyU(GaugeField &force, GaugeField &U)
Definition: momentum.cu:446

comms_initialized
static bool comms_initialized
Definition: interface_quda.cpp:399

QUDA_KAPPA_NORMALIZATION
Definition: enum_quda.h:224

quda::OvrImpSTOUTStep
void OvrImpSTOUTStep(GaugeField &dataDs, const GaugeField &dataOr, double rho, double epsilon)
Apply Over Improved STOUT smearing to the gauge field.
Definition: gauge_stout.cu:269

num_failures_h
static int * num_failures_h
Definition: interface_quda.cpp:153

quda::DiracDomainWall4D::Dslash4
void Dslash4(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
Definition: dirac_domain_wall_4d.cpp:21

QUDA_ASQTAD_LONG_LINKS
Definition: enum_quda.h:31

quda::cublas::init
void init()
Create the CUBLAS context.
Definition: blas_cublas.cu:31

quda::DiracDomainWall4DPC
Definition: dirac_quda.h:423

QudaInvertParam_s::mass
double mass
Definition: quda.h:105

dslashQuda_mdwf
void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
Definition: interface_quda.cpp:2015

QUDA_HEAVY_QUARK_RESIDUAL
Definition: enum_quda.h:189

QudaEigParam_s::location
QudaFieldLocation location
Definition: quda.h:453

quda::deflated_solver::defl
Deflation * defl
Definition: deflation.h:189

quda::cpuColorSpinorField::freeGhostBuffer
static void freeGhostBuffer(void)
Definition: cpu_color_spinor_field.cpp:285

profileGauss
static TimeProfile profileGauss("gaussQuda")
Profiler for plaqQuda.

clover_field.h

quda::GaugeFieldParam::nFace
int nFace
Definition: gauge_field.h:14

QUDA_MEMORY_DEVICE
Definition: enum_quda.h:12

MatDagMatQuda
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:2158

quda::deflated_solver::d
Dirac * d
Definition: deflation.h:182

QudaEigParam_s
Definition: quda.h:376

profileProject
static TimeProfile profileProject("projectSU3Quda")
Profiler for staggeredPhaseQuda.

QUDA_VECTOR_GEOMETRY
Definition: enum_quda.h:475

quda::DiracParam::halo_precision
QudaPrecision halo_precision
Definition: dirac_quda.h:46

invert_quda_
void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
Definition: interface_quda.cpp:5163

safe_malloc
#define safe_malloc(size)
Definition: malloc_quda.h:66

quda::blas::zero
void zero(ColorSpinorField &a)
Definition: blas_quda.cu:472

quda::DiracParam::kappa
double kappa
Definition: dirac_quda.h:23

quda::cudaCloverField::copy
void copy(const CloverField &src, bool inverse=true)
Copy into this CloverField from the generic CloverField src.
Definition: clover_field.cpp:281

QudaGaugeParam_s::cuda_prec_refinement_sloppy
QudaPrecision cuda_prec_refinement_sloppy
Definition: quda.h:55

QUDA_TWISTED_CLOVER_DSLASH
Definition: enum_quda.h:95

LexMapData::dims
int dims[QUDA_MAX_DIM]
Definition: interface_quda.cpp:333

quda::LatticeFieldParam::nDim
int nDim
Definition: lattice_field.h:64

init_default_comms
static void init_default_comms()
Definition: interface_quda.cpp:452

tune_quda.h

quda::Dirac::setMass
void setMass(double mass)
Definition: dirac_quda.h:169

pushVerbosity
void pushVerbosity(QudaVerbosity verbosity)
Push a new verbosity onto the stack.
Definition: util_quda.cpp:83

num_failures_d
static int * num_failures_d
Definition: interface_quda.cpp:154

QUDA_ASQTADPC_DIRAC
Definition: enum_quda.h:310

init_quda_device_
void init_quda_device_(int *dev)
Definition: interface_quda.cpp:5146

gaugeLongSloppy
cudaGaugeField * gaugeLongSloppy
Definition: interface_quda.cpp:131

QudaInvertParam_s::compute_clover_inverse
int compute_clover_inverse
Definition: quda.h:240

dims
static int dims[4]
Definition: face_gauge.cpp:41

QUDA_TIFR_GAUGE_ORDER
Definition: enum_quda.h:47

QUDA_SPACE_COLOR_SPIN_FIELD_ORDER
Definition: enum_quda.h:351

checkCudaErrorNoSync
#define checkCudaErrorNoSync()
Definition: util_quda.h:145

QUDA_WILSON_DSLASH
Definition: enum_quda.h:87

quda::MG
Definition: multigrid.h:177

update_gauge_field_quda_
void update_gauge_field_quda_(void *gauge, void *momentum, double *dt, bool *conj_mom, bool *exact, QudaGaugeParam *param)
Definition: interface_quda.cpp:5211

quda::Dirac::Mdag
void Mdag(ColorSpinorField &out, const ColorSpinorField &in) const
Definition: dirac.cpp:90

blas_quda.h

QUDA_RECONSTRUCT_8
Definition: enum_quda.h:69

deflation.h

quda::QUDA_PROFILE_INIT
Definition: timer.h:106

quda::gaugefixingFFT
void gaugefixingFFT(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha, const int autotune, const double tolerance, const int stopWtheta)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
Definition: gauge_fix_fft.cu:1083

index
static int index(int ndim, const int *dims, const int *x)
Definition: comm_common.cpp:32

quda::CloverFieldParam::cloverInv
void * cloverInv
Definition: clover_field.h:14

quda::SolverParam::true_res
double true_res
Definition: invert_quda.h:130

quda::Dirac::M
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const =0

invertQuda
void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
Definition: interface_quda.cpp:2830

printQudaInvertParam
void printQudaInvertParam(QudaInvertParam *param)
Definition: check_params.h:277

QUDA_TWISTED_CLOVER_DIRAC
Definition: enum_quda.h:313

quda::Solver
Definition: invert_quda.h:460

STR
#define STR(x)
Definition: interface_quda.cpp:473

QUDA_DEBUG_VERBOSE
Definition: enum_quda.h:266

clover_quda_
void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity, int *inverse)
Definition: interface_quda.cpp:5157

quda::ColorSpinorParam
Definition: color_spinor_field.h:80

loadSloppyGaugeQuda
void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *recon)
Definition: interface_quda.cpp:1292

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:62

unitarization_links.h

quda::multigrid_solver::mSmoothSloppy
DiracM * mSmoothSloppy
Definition: multigrid.h:473

quda::wuppertalStep
void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity, const GaugeField &U, double A, double B)
Definition: color_spinor_wuppertal.cu:186

profilerStop
static void profilerStop(const char *f)
Definition: interface_quda.cpp:305

QudaFieldLocation
enum QudaFieldLocation_s QudaFieldLocation

QUDA_NORMERR_PC_SOLVE
Definition: enum_quda.h:166

QudaGaugeParam_s::location
QudaFieldLocation location
Definition: quda.h:34

quda::GaugeFieldParam::setPrecision
void setPrecision(QudaPrecision precision, bool force_native=false)
Helper function for setting the precision and corresponding field order for QUDA internal fields...
Definition: gauge_field.h:131

quda::GaugeFieldParam::site_size
size_t site_size
Definition: gauge_field.h:48

QudaInvertParam_s::clover_rho
double clover_rho
Definition: quda.h:234

quda::inverse
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
Definition: quda_matrix.h:611

dirac
GaugeCovDev * dirac
Definition: covdev_test.cpp:73

profileInvert
static TimeProfile profileInvert("invertQuda")
Profiler for invertMultiShiftQuda.

out
cpuColorSpinorField * out
Definition: staggered_invert_test.cpp:99

profilePhase
static TimeProfile profilePhase("staggeredPhaseQuda")
Profiler for contractions.

QudaInvertParam_s::cuda_prec_precondition
QudaPrecision cuda_prec_precondition
Definition: quda.h:217

GaugeFixOVRQuda
static TimeProfile GaugeFixOVRQuda("GaugeFixOVRQuda")
Profiler for toal time spend between init and end.

gaugePrecondition
cudaGaugeField * gaugePrecondition
Definition: interface_quda.cpp:119

QUDA_TWISTED_MASS_DSLASH
Definition: enum_quda.h:94

quda::CloverFieldParam::twisted
bool twisted
Clover coefficient.
Definition: clover_field.h:17

QUDA_MATPC_ODD_ODD_ASYMMETRIC
Definition: enum_quda.h:213

QudaInvertParam_s::deflation_grid
int deflation_grid
Definition: quda.h:336

QudaInvertParam_s::iter
int iter
Definition: quda.h:249

profileCovDev
static TimeProfile profileCovDev("covDevQuda")
Profiler for momentum action.

quda::DeflationParam
Definition: deflation.h:13

QUDA_STAGGERED_DSLASH
Definition: enum_quda.h:92

QudaInvertParam_s::num_src
int num_src
Definition: quda.h:171

gParam
GaugeFieldParam gParam
Definition: hisq_paths_force_test.cpp:64

quda::CloverFieldParam::csw
double csw
Definition: clover_field.h:16

quda::multigrid_solver::mSmooth
DiracM * mSmooth
Definition: multigrid.h:472

printQudaEigParam
void printQudaEigParam(QudaEigParam *param)
Definition: check_params.h:143

quda::pack
__device__ __host__ void pack(Arg &arg, int ghost_idx, int s, int parity)
Definition: dslash_pack.cuh:83

blas_cublas.h

QUDA_SOURCE_NORMALIZATION
Definition: enum_quda.h:232

quda::DiracMdagM
Definition: dirac_quda.h:1181

quda::CG
Conjugate-Gradient Solver.
Definition: invert_quda.h:570

quda::deflated_solver::deflated_solver
deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
Definition: interface_quda.cpp:2743

quda::contractQuda
void contractQuda(const ColorSpinorField &x, const ColorSpinorField &y, void *result, QudaContractType cType)
Definition: contract.cu:107

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:61

quda::MinResExt
This computes the optimum guess for the system Ax=b in the L2 residual norm. For use in the HMD force...
Definition: invert_quda.h:1171

quda::cudaGaugeField::Gauge_p
void * Gauge_p()
Definition: gauge_field.h:546

quda::ColorSpinorParam::nSpin
int nSpin
Definition: color_spinor_field.h:86

QUDA_GENERAL_LINKS
Definition: enum_quda.h:24

profileEigensolve
static TimeProfile profileEigensolve("eigensolveQuda")
Profiler for computeFatLinkQuda.

QudaReconstructType
enum QudaReconstructType_s QudaReconstructType

quda.h
Main header file for the QUDA library.

unitarize_eps
static double unitarize_eps
Definition: hisq_stencil_test.cpp:54

QUDA_STAGGEREDPC_DIRAC
Definition: enum_quda.h:308

profileInit
static TimeProfile profileInit("initQuda")
Profile for loadGaugeQuda / saveGaugeQuda.

quda::DiracMMdag
Definition: dirac_quda.h:1221

quda::DiracParam::clover
cudaCloverField * clover
Definition: dirac_quda.h:35

redundant_comms
static bool redundant_comms
Definition: interface_quda.cpp:86

QUDA_MAX_MG_LEVEL
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
Definition: quda_constants.h:56

quda::GaugeFieldParam::link_type
QudaLinkType link_type
Definition: gauge_field.h:19

quda::multigrid_solver::B
std::vector< ColorSpinorField * > B
Definition: multigrid.h:475

set_kernel_pack_t_
void set_kernel_pack_t_(int *pack)
fTemporary function exposed for TIFR benchmarking
Definition: interface_quda.cpp:5380

quda::CloverFieldParam::direct
bool direct
Definition: clover_field.h:10

quda::printPeakMemUsage
void printPeakMemUsage()
Definition: malloc.cpp:375

QUDA_MAX_DWF_LS
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
Definition: quda_constants.h:49

quda::setDiracRefineParam
void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
Definition: interface_quda.cpp:1679

quda::DiracMatrix
Definition: dirac_quda.h:1100

quda::LatticeFieldParam::Precision
QudaPrecision Precision() const
Definition: lattice_field.h:58

QUDA_DEGRAND_ROSSI_GAMMA_BASIS
Definition: enum_quda.h:367

quda::loadTuneCache
void loadTuneCache()
Definition: tune.cpp:322

QUDA_FLOAT4_GAUGE_ORDER
Definition: enum_quda.h:40

QUDA_TIFR_PADDED_DIRAC_ORDER
Definition: enum_quda.h:248

newDeflationQuda
void * newDeflationQuda(QudaEigParam *eig_param)
Definition: interface_quda.cpp:2809

dslash_quda.h

quda::fermion_force
Definition: ks_improved_force.h:8

staggeredPhaseQuda
void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
Definition: interface_quda.cpp:5029

printQudaMultigridParam
void printQudaMultigridParam(QudaMultigridParam *param)
Definition: check_params.h:598

quda::cpuCloverField
Definition: clover_field.h:244

mat_dag_mat_quda_
void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:5161

QudaGaugeParam_s::use_resident_gauge
int use_resident_gauge
Definition: quda.h:80

QudaInvertParam_s::Ls
int Ls
Definition: quda.h:109

quda::QUDA_PROFILE_TOTAL
Definition: timer.h:144

printfQuda
#define printfQuda(...)
Definition: util_quda.h:115

quda::LatticeFieldParam::mem_type
QudaMemoryType mem_type
Definition: lattice_field.h:73

new_quda_gauge_param_
void new_quda_gauge_param_(QudaGaugeParam *param)
Definition: interface_quda.cpp:5204

QUDA_DAG_NO
Definition: enum_quda.h:218

quda::DiracParam::fatGauge
cudaGaugeField * fatGauge
Definition: dirac_quda.h:32

QudaInvertParam_s::twist_flavor
QudaTwistFlavorType twist_flavor
Definition: quda.h:117

profilerStart
static void profilerStart(const char *f)
Definition: interface_quda.cpp:252

cloverQuda
void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
Definition: interface_quda.cpp:2339

QudaGaugeParam_s::return_result_gauge
int return_result_gauge
Definition: quda.h:84

gaugeSmeared
cudaGaugeField * gaugeSmeared
Definition: interface_quda.cpp:135

GaugeFixFFTQuda
static TimeProfile GaugeFixFFTQuda("GaugeFixFFTQuda")

quda::GaugeField::Tadpole
double Tadpole() const
Definition: gauge_field.h:253

QudaInvertParam_s::residue
double residue[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:194

QUDA_INC_EIGCG_INVERTER
Definition: enum_quda.h:112

quda::ColorSpinorParam::siteOrder
QudaSiteOrder siteOrder
Definition: color_spinor_field.h:91

quda::GaugeFieldParam::gauge
void * gauge
Definition: gauge_field.h:24

cudaGauge
cudaGaugeField * cudaGauge
Definition: hisq_paths_force_test.cpp:21

profileExtendedGauge
static TimeProfile profileExtendedGauge("createExtendedGaugeField")
Profiler for computeCloverForceQuda.

QudaInvertParam_s::chrono_max_dim
int chrono_max_dim
Definition: quda.h:362

QudaInvertParam_s::overlap
int overlap
Definition: quda.h:173

QUDA_ASYMMETRIC_MASS_NORMALIZATION
Definition: enum_quda.h:226

checkGauge
quda::cudaGaugeField * checkGauge(QudaInvertParam *param)
Definition: interface_quda.cpp:2266

device_malloc
#define device_malloc(size)
Definition: malloc_quda.h:64

streams
cudaStream_t * streams
Definition: interface_quda.cpp:157

QudaGaugeParam_s::use_resident_mom
int use_resident_mom
Definition: quda.h:81

quda::ColorSpinorParam::v
void * v
Definition: color_spinor_field.h:99

QUDA_DOMAIN_WALLPC_DIRAC
Definition: enum_quda.h:302

quda::GaugeFieldParam::reconstruct
QudaReconstructType reconstruct
Definition: gauge_field.h:16

quda::multigrid_solver::d
Dirac * d
Definition: multigrid.h:467

quda::setKernelPackT
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:24

closeMagma
void closeMagma()
Definition: interface_quda.cpp:106

quda::TimeProfile
Definition: timer.h:171

quda::GaugeFieldParam::create
QudaFieldCreate create
Definition: gauge_field.h:26

quda::printAPIProfile
void printAPIProfile()
Print out the timer profile for CUDA API calls.
Definition: quda_cuda_api.cpp:336

QUDA_DOMAIN_WALL_4D_DIRAC
Definition: enum_quda.h:303

quda::printLaunchTimer
void printLaunchTimer()
Definition: tune.cpp:843

quda::gamma5
void gamma5(ColorSpinorField &out, const ColorSpinorField &in)
Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
Definition: dslash_quda.cu:461

QudaContractType
enum QudaContractType_s QudaContractType

QudaInvertParam_s::cl_pad
int cl_pad
Definition: quda.h:247

profileEnd
static TimeProfile profileEnd("endQuda")
Profiler for GaugeFixing.

QUDA_DIRECT_PC_SOLVE
Definition: enum_quda.h:163

longlink
void * longlink
Definition: staggered_invertmsrc_test.cpp:30

QudaInvertParam_s::compute_true_res
int compute_true_res
Definition: quda.h:125

quda::GaugeField::Reconstruct
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:250

QudaInvertParam_s::residual_type
QudaResidualType residual_type
Definition: quda.h:320

QudaFieldGeometry
enum QudaFieldGeometry_s QudaFieldGeometry

quda::SolverParam::use_init_guess
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:64

QUDA_EVEN_PARITY
Definition: enum_quda.h:287

QudaInvertParam_s::num_offset
int num_offset
Definition: quda.h:169

flushChronoQuda
void flushChronoQuda(int i)
Flush the chronological history for the given index.
Definition: interface_quda.cpp:1448

quda::DiracParam::longGauge
cudaGaugeField * longGauge
Definition: dirac_quda.h:33

QUDA_MATDAG_MAT_SOLUTION
Definition: enum_quda.h:152

dslash_quda_
void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity)
Definition: interface_quda.cpp:5155

popVerbosity
void popVerbosity()
Pop the verbosity restoring the prior one on the stack.
Definition: util_quda.cpp:94

QudaVerbosity
enum QudaVerbosity_s QudaVerbosity

quda::updateGaugeField
void updateGaugeField(GaugeField &out, double dt, const GaugeField &in, const GaugeField &mom, bool conj_mom, bool exact)
Definition: gauge_update_quda.cu:227

fatlink
void * fatlink
Definition: staggered_invertmsrc_test.cpp:29

pool_device_free
#define pool_device_free(ptr)
Definition: malloc_quda.h:126

QUDA_CLOVER_DIRAC
Definition: enum_quda.h:299

createCloverQuda
void createCloverQuda(QudaInvertParam *invertParam)
Definition: interface_quda.cpp:4198

QudaGaugeParam_s::return_result_mom
int return_result_mom
Definition: quda.h:85

test_type
int test_type
Definition: test_util.cpp:1636

QUDA_CPS_WILSON_DIRAC_ORDER
Definition: enum_quda.h:246

end_quda_
void end_quda_()
Definition: interface_quda.cpp:5148

QudaInvertParam_s::compute_clover
int compute_clover
Definition: quda.h:239

computeGaugeForceQuda
int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam)
Definition: interface_quda.cpp:4073

quda::GaugeField::Order
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:251

quda::computeStaggeredOprod
void computeStaggeredOprod(GaugeField *out[], ColorSpinorField &in, const double coeff[], int nFace)
Compute the outer-product field between the staggered quark field&#39;s one and (for HISQ and ASQTAD) thr...
Definition: staggered_oprod.cu:447

ks_force_quda.h

QudaInvertParam_s::epsilon
double epsilon
Definition: quda.h:115

contract_quda.h

numa_affinity.h

QUDA_MAX_DIM
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
Definition: quda_constants.h:17

cpuGauge
cpuGaugeField * cpuGauge
Definition: hisq_paths_force_test.cpp:22

checkCudaError
#define checkCudaError()
Definition: util_quda.h:161

quda::GaugeFieldParam::compute_fat_link_max
bool compute_fat_link_max
Definition: gauge_field.h:33

quda::GaugeFieldParam::geometry
QudaFieldGeometry geometry
Definition: gauge_field.h:28

QUDA_SCALAR_GEOMETRY
Definition: enum_quda.h:474

QUDA_ZERO_FIELD_CREATE
Definition: enum_quda.h:360

quda::TimeProfile::PrintGlobal
static void PrintGlobal()
Definition: timer.cpp:81

setOutputFile
void setOutputFile(FILE *outfile)
Definition: util_quda.cpp:75

gaugePrecise
cudaGaugeField * gaugePrecise
Definition: interface_quda.cpp:117

random_quda.h

quda::GaugeField::TBoundary
QudaTboundary TBoundary() const
Definition: gauge_field.h:254

quda::GaugeField::StaggeredPhase
QudaStaggeredPhase StaggeredPhase() const
Definition: gauge_field.h:259

mapped_malloc
#define mapped_malloc(size)
Definition: malloc_quda.h:68

QudaInvertParam_s::use_resident_solution
int use_resident_solution
Definition: quda.h:350

quda::Dirac::create
static Dirac * create(const DiracParam &param)
Definition: dirac.cpp:159

QUDA_DOMAIN_WALL_DSLASH
Definition: enum_quda.h:89

quda::DiracDomainWall4D::Dslash5
void Dslash5(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
Definition: dirac_domain_wall_4d.cpp:31

quda::unscaled_shifts
static double unscaled_shifts[QUDA_MAX_MULTI_SHIFT]
Definition: interface_quda.cpp:1767

QUDA_MOBIUS_DOMAIN_WALLPC_DIRAC
Definition: enum_quda.h:306

dslashQuda
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
Definition: interface_quda.cpp:1853

compute_gauge_force_quda_
void compute_gauge_force_quda_(void *mom, void *gauge, int *num_loop_types, double *coeff, double *dt, QudaGaugeParam *param)
Compute the gauge force and update the mometum field.
Definition: interface_quda.cpp:5267

updateR
void updateR()
update the radius for halos.
Definition: interface_quda.cpp:674

profileWuppertal
static TimeProfile profileWuppertal("wuppertalQuda")
Profiler for gaussQuda.

cudaForce
cudaGaugeField * cudaForce
Definition: hisq_paths_force_test.cpp:24

flush_chrono_quda_
void flush_chrono_quda_(int *index)
Flush the chronological history for the given index.
Definition: interface_quda.cpp:5192

quda::multigrid_solver
Definition: multigrid.h:466

quda::ColorSpinorParam::create
QudaFieldCreate create
Definition: color_spinor_field.h:95

profileContract
static TimeProfile profileContract("contractQuda")
Profiler for covariant derivative.

quda::SolverParam
Definition: invert_quda.h:17

quda::DiracParam::m5
double m5
Definition: dirac_quda.h:25

QudaGaugeParam_s::make_resident_gauge
int make_resident_gauge
Definition: quda.h:82

gaugeFatExtended
cudaGaugeField * gaugeFatExtended
Definition: interface_quda.cpp:127

quda::cudaGaugeField::copy
void copy(const GaugeField &src)
Definition: cuda_gauge_field.cpp:630

QUDA_NORMOP_PC_SOLVE
Definition: enum_quda.h:164

quda::DiracMobiusPC::Dslash5inv
void Dslash5inv(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
Definition: dirac_mobius.cpp:205

QUDA_GHOST_EXCHANGE_PAD
Definition: enum_quda.h:483

quda::DiracCloverPC::CloverInv
void CloverInv(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
Definition: dirac_clover.cpp:119

QUDA_MATPCDAG_MATPC_SOLUTION
Definition: enum_quda.h:155

QUDA_GAUGE_LAPLACE_DIRAC
Definition: enum_quda.h:317

quda::Dirac::reconstruct
virtual void reconstruct(ColorSpinorField &x, const ColorSpinorField &b, const QudaSolutionType) const =0

quda::computeCloverSigmaOprod
void computeCloverSigmaOprod(GaugeField &oprod, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< std::vector< double > > &coeff)
Compute the outer product from the solver solution fields arising from the diagonal term of the fermi...
Definition: clover_sigma_outer_product.cu:98

quda::LatticeField::Precision
QudaPrecision Precision() const
Definition: lattice_field.h:546

chronoResident
std::vector< std::vector< ColorSpinorField * > > chronoResident(QUDA_MAX_CHRONO)

QUDA_ASQTAD_DIRAC
Definition: enum_quda.h:309

quda::Dirac::Dslash
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const =0

quda::CloverFieldParam::norm
void * norm
Definition: clover_field.h:13

extendedGaugeResident
cudaGaugeField * extendedGaugeResident
Definition: interface_quda.cpp:143

quda::computeClover
void computeClover(CloverField &clover, const GaugeField &gauge, double coeff, QudaFieldLocation location)
Definition: clover_quda.cu:204

QUDA_SU3_LINKS
Definition: enum_quda.h:23

QudaInvertParam_s::dslash_type_precondition
QudaDslashType dslash_type_precondition
Definition: quda.h:284

QudaInvertParam_s::clover_cpu_prec
QudaPrecision clover_cpu_prec
Definition: quda.h:224

QudaEigParam_s::cuda_prec_ritz
QudaPrecision cuda_prec_ritz
Definition: quda.h:447

parity
QudaParity parity
Definition: covdev_test.cpp:54

QudaInvertParam_s
Definition: quda.h:97

QUDA_TENSOR_GEOMETRY
Definition: enum_quda.h:476

quda::LatticeFieldParam::r
int r[QUDA_MAX_DIM]
Definition: lattice_field.h:79

opp
static int opp(int dir)
Definition: interface_quda.cpp:5217

profileHISQForce
static TimeProfile profileHISQForce("computeHISQForceQuda")
Profiler for plaqQuda.

QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:325

quda::reorder_location_set
void reorder_location_set(QudaFieldLocation reorder_location_)
Set whether data is reorderd on the CPU or GPU. This can set at QUDA initialization using the environ...
Definition: lattice_field.cpp:726

QudaMultigridParam_s::n_level
int n_level
Definition: quda.h:483

quda::LatticeField::freeGhostBuffer
static void freeGhostBuffer(void)
Free statically allocated ghost buffers.
Definition: lattice_field.cpp:271

destroyMultigridQuda
void destroyMultigridQuda(void *mg)
Free resources allocated by the multigrid solver.
Definition: interface_quda.cpp:2641

quda::destroyDslashEvents
void destroyDslashEvents()
Definition: dslash_quda.cu:144

quda::gaugeGauss
void gaugeGauss(GaugeField &U, RNG &rngstate, double epsilon)
Generate Gaussian distributed su(N) or SU(N) fields. If U is a momentum field, then we generate rando...
Definition: gauge_random.cu:145

prec
QudaPrecision prec
Definition: test_util.cpp:1608

llfat_quda.h

quda::deflated_solver::RV
ColorSpinorField * RV
Definition: deflation.h:185

performOvrImpSTOUTnStep
void performOvrImpSTOUTnStep(unsigned int nSteps, double rho, double epsilon)
Definition: interface_quda.cpp:5598

QUDA_ASQTAD_FAT_LINKS
Definition: enum_quda.h:30

quda::SolverParam::tol
double tol
Definition: invert_quda.h:115

gauge_field.h

setVerbosity
void setVerbosity(QudaVerbosity verbosity)
Definition: util_quda.cpp:25

quda::copyExtendedGauge
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
Definition: copy_gauge_extended.cu:343

QUDA_VERSION_MAJOR
#define QUDA_VERSION_MAJOR
Definition: quda_constants.h:1

QudaInvertParam_s::matpc_type
QudaMatPCType matpc_type
Definition: quda.h:206

QUDA_DOMAIN_WALL_4D_DSLASH
Definition: enum_quda.h:90

momResident
cudaGaugeField * momResident
Definition: interface_quda.cpp:142

quda::DiracParam::tmp1
ColorSpinorField * tmp1
Definition: dirac_quda.h:41

quda::GaugeField::applyStaggeredPhase
void applyStaggeredPhase(QudaStaggeredPhase phase=QUDA_STAGGERED_PHASE_INVALID)
Definition: gauge_field.cpp:139

QudaMultigridParam_s::gflops
double gflops
Definition: quda.h:642

quda::Deflation
Definition: deflation.h:78

cpuForce
cpuGaugeField * cpuForce
Definition: hisq_paths_force_test.cpp:25

profileCloverForce
static TimeProfile profileCloverForce("computeCloverForceQuda")
Profiler for computeStaggeredForceQuda.

kappa5
double kappa5
Definition: dslash_ctest.cpp:31

quda::plaquette
double3 plaquette(const GaugeField &U)
Compute the plaquette of the gauge field.
Definition: gauge_plaq.cu:65

createGaugeForcePaths
static void createGaugeForcePaths(int **paths, int dir, int num_loop_types)
Definition: interface_quda.cpp:5219

newMultigridQuda
void * newMultigridQuda(QudaMultigridParam *mg_param)
Definition: interface_quda.cpp:2624

plaqQuda
void plaqQuda(double plaq[3])
Definition: interface_quda.cpp:5419

QudaGaugeParam_s::reconstruct_refinement_sloppy
QudaReconstructType reconstruct_refinement_sloppy
Definition: quda.h:56

quda::blas::bytes
unsigned long long bytes
Definition: blas_quda.cu:23

quda::cpuColorSpinorField
Definition: color_spinor_field.h:865

QudaGaugeParam_s::cpu_prec
QudaPrecision cpu_prec
Definition: quda.h:47

gaugeSloppy
cudaGaugeField * gaugeSloppy
Definition: interface_quda.cpp:118

comm_dim_partitioned
int comm_dim_partitioned(int dim)
Definition: comm_common.cpp:635

QUDA_STAGGERED_DIRAC
Definition: enum_quda.h:307

quda::multigrid_solver::mgParam
MGParam * mgParam
Definition: multigrid.h:478

ks_improved_force.h

initQudaDevice
void initQudaDevice(int dev)
Definition: interface_quda.cpp:483

endQuda
void endQuda(void)
Definition: interface_quda.cpp:1461

quda::Dirac
Definition: dirac_quda.h:106

QudaGaugeParam_s::overlap
int overlap
Definition: quda.h:76

eigensolve_quda.h

newQudaGaugeParam
QudaGaugeParam newQudaGaugeParam(void)

QUDA_RECONSTRUCT_INVALID
Definition: enum_quda.h:73

quda::LatticeField::X
const int * X() const
Definition: lattice_field.h:499

QudaMultigridParam_s::invert_param
QudaInvertParam * invert_param
Definition: quda.h:478

checkClover
void checkClover(QudaInvertParam *param)
Definition: interface_quda.cpp:2242

quda_internal.h

device_free
#define device_free(ptr)
Definition: malloc_quda.h:69

QudaInvertParam_s::reliable_delta_refinement
double reliable_delta_refinement
Definition: quda.h:130

quda::LatticeFieldParam::pad
int pad
Definition: lattice_field.h:69

mat_quda_
void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:5159

QudaInvertParam_s::clover_coeff
double clover_coeff
Definition: quda.h:233

quda::GaugeField
Definition: gauge_field.h:164

QUDA_MOBIUS_DWF_DSLASH
Definition: enum_quda.h:91

compute_staggered_force_quda_
void compute_staggered_force_quda_(void *h_mom, double *dt, double *delta, void *gauge, void *x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:5323

quda::setDiracPreParam
void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
Definition: interface_quda.cpp:1698

verbosity
QudaVerbosity verbosity
Definition: test_util.cpp:1614

gaugeLongPrecondition
cudaGaugeField * gaugeLongPrecondition
Definition: interface_quda.cpp:132

quda::GaugeField::removeStaggeredPhase
void removeStaggeredPhase()
Definition: gauge_field.cpp:154

quda::DiracParam::epsilon
double epsilon
Definition: dirac_quda.h:39

QUDA_MEMORY_MAPPED
Definition: enum_quda.h:14