QUDA  v1.1.0
A library for QCD on GPUs
interface_quda.cpp
Go to the documentation of this file.
1 #include <cmath>
2 #include <cstdio>
3 #include <cstdlib>
4 #include <cstring>
5 #include <iostream>
6 #include <sys/time.h>
7 #include <complex.h>
8 
9 #include <quda.h>
10 #include <quda_fortran.h>
11 #include <quda_internal.h>
12 #include <device.h>
13 #include <comm_quda.h>
14 #include <tune_quda.h>
15 #include <blas_quda.h>
16 #include <gauge_field.h>
17 #include <dirac_quda.h>
18 #include <dslash_quda.h>
19 #include <invert_quda.h>
20 #include <eigensolve_quda.h>
21 #include <color_spinor_field.h>
22 #include <clover_field.h>
23 #include <llfat_quda.h>
24 #include <unitarization_links.h>
25 #include <algorithm>
26 #include <staggered_oprod.h>
27 #include <ks_improved_force.h>
28 #include <ks_force_quda.h>
29 #include <random_quda.h>
30 #include <mpi_comm_handle.h>
31 
32 #include <multigrid.h>
33 #include <deflation.h>
34 
35 #include <split_grid.h>
36 
37 #include <ks_force_quda.h>
38 
39 #ifdef GPU_GAUGE_FORCE
40 #include <gauge_force_quda.h>
41 #endif
42 #include <gauge_update_quda.h>
43 
44 #define MAX(a,b) ((a)>(b)? (a):(b))
45 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
46 
47 // define newQudaGaugeParam() and newQudaInvertParam()
48 #define INIT_PARAM
49 #include "check_params.h"
50 #undef INIT_PARAM
51 
52 // define (static) checkGaugeParam() and checkInvertParam()
53 #define CHECK_PARAM
54 #include "check_params.h"
55 #undef CHECK_PARAM
57 
58 // define printQudaGaugeParam() and printQudaInvertParam()
59 #define PRINT_PARAM
60 #include "check_params.h"
61 #undef PRINT_PARAM
62 
63 #include <gauge_tools.h>
64 #include <contract_quda.h>
65 #include <momentum.h>
66 
67 using namespace quda;
68 
69 static int R[4] = {0, 0, 0, 0};
70 // setting this to false prevents redundant halo exchange but isn't yet compatible with HISQ / ASQTAD kernels
71 static bool redundant_comms = false;
72 
73 #include <blas_lapack.h>
74 
75 //for MAGMA lib:
76 #include <blas_magma.h>
77 
78 static bool InitMagma = false;
79 
80 void openMagma() {
81 
82  if (!InitMagma) {
83  OpenMagma();
84  InitMagma = true;
85  } else {
86  printfQuda("\nMAGMA library was already initialized..\n");
87  }
88 
89 }
90 
91 void closeMagma(){
92 
93  if (InitMagma) {
94  CloseMagma();
95  InitMagma = false;
96  } else {
97  printfQuda("\nMAGMA library was not initialized..\n");
98  }
99 
100 }
101 
108 
115 
122 
124 
130 
133 
134 std::vector<cudaColorSpinorField*> solutionResident;
135 
136 // vector of spinors used for forecasting solutions in HMC
137 #define QUDA_MAX_CHRONO 12
138 // each entry is one p
139 std::vector< std::vector<ColorSpinorField*> > chronoResident(QUDA_MAX_CHRONO);
140 
141 // Mapped memory buffer used to hold unitarization failures
142 static int *num_failures_h = nullptr;
143 static int *num_failures_d = nullptr;
144 
145 static bool initialized = false;
146 
148 static TimeProfile profileInit("initQuda");
149 
151 static TimeProfile profileGauge("loadGaugeQuda");
152 
154 static TimeProfile profileClover("loadCloverQuda");
155 
157 static TimeProfile profileDslash("dslashQuda");
158 
160 static TimeProfile profileInvert("invertQuda");
161 
163 static TimeProfile profileInvertMultiSrc("invertMultiSrcQuda");
164 
166 static TimeProfile profileMulti("invertMultiShiftQuda");
167 
169 static TimeProfile profileEigensolve("eigensolveQuda");
170 
172 static TimeProfile profileFatLink("computeKSLinkQuda");
173 
175 static TimeProfile profileGaugeForce("computeGaugeForceQuda");
176 
178 static TimeProfile profileGaugeUpdate("updateGaugeFieldQuda");
179 
181 static TimeProfile profileExtendedGauge("createExtendedGaugeField");
182 
184 static TimeProfile profileCloverForce("computeCloverForceQuda");
185 
187 static TimeProfile profileStaggeredForce("computeStaggeredForceQuda");
188 
190 static TimeProfile profileHISQForce("computeHISQForceQuda");
191 
193 static TimeProfile profilePlaq("plaqQuda");
194 
196 static TimeProfile profileWuppertal("wuppertalQuda");
197 
199 static TimeProfile profileGauss("gaussQuda");
200 
202 static TimeProfile profileGaugeObs("gaugeObservablesQuda");
203 
205 static TimeProfile profileAPE("APEQuda");
206 
208 static TimeProfile profileSTOUT("STOUTQuda");
209 
211 static TimeProfile profileOvrImpSTOUT("OvrImpSTOUTQuda");
212 
214 static TimeProfile profileWFlow("wFlowQuda");
215 
217 static TimeProfile profileProject("projectSU3Quda");
218 
220 static TimeProfile profilePhase("staggeredPhaseQuda");
221 
223 static TimeProfile profileContract("contractQuda");
224 
226 static TimeProfile profileBLAS("blasQuda");
227 TimeProfile &getProfileBLAS() { return profileBLAS; }
228 
230 static TimeProfile profileCovDev("covDevQuda");
231 
233 static TimeProfile profileMomAction("momActionQuda");
234 
236 static TimeProfile profileEnd("endQuda");
237 
239 static TimeProfile GaugeFixFFTQuda("GaugeFixFFTQuda");
240 static TimeProfile GaugeFixOVRQuda("GaugeFixOVRQuda");
241 
243 static TimeProfile profileInit2End("initQuda-endQuda",false);
244 
245 static bool enable_profiler = false;
246 static bool do_not_profile_quda = false;
247 
// Selectively enable external profiling for the solve identified by a global
// call counter.  Targets are read once from QUDA_ENABLE_TARGET_PROFILE (a
// comma-separated list of solve indices); QUDA_DO_NOT_PROFILE disables
// profiling of QUDA internals entirely.
// NOTE(review): this is a Doxygen source dump — original lines 284 and 291
// are missing here (presumably the actual profiler start/stop API calls);
// the visible code is incomplete.
248 static void profilerStart(const char *f)
249 {
250  static std::vector<int> target_list;
251  static bool enable = false;
252  static bool init = false;
253  if (!init) {
254  char *profile_target_env = getenv("QUDA_ENABLE_TARGET_PROFILE"); // selectively enable profiling for a given solve
255 
256  if ( profile_target_env ) {
257  std::stringstream target_stream(profile_target_env);
258 
259  int target;
260  while(target_stream >> target) {
261  target_list.push_back(target);
262  if (target_stream.peek() == ',') target_stream.ignore();
263  }
264 
// sort and deduplicate so the monotonically increasing call counter can be
// matched against target_list with a single forward index i
265  if (target_list.size() > 0) {
266  std::sort(target_list.begin(), target_list.end());
267  target_list.erase( unique( target_list.begin(), target_list.end() ), target_list.end() );
268  warningQuda("Targeted profiling enabled for %lu functions\n", target_list.size());
269  enable = true;
270  }
271  }
272 
// any value other than "0" (or unset) disables QUDA-internal profiling
273  char* donotprofile_env = getenv("QUDA_DO_NOT_PROFILE"); // disable profiling of QUDA parts
274  if (donotprofile_env && (!(strcmp(donotprofile_env, "0") == 0))) {
275  do_not_profile_quda=true;
276  printfQuda("Disabling profiling in QUDA\n");
277  }
278  init = true;
279  }
280 
// target_count counts every profiled entry point ever reached; i walks the
// sorted target list
281  static int target_count = 0;
282  static unsigned int i = 0;
283  if (do_not_profile_quda){
285  printfQuda("Stopping profiling in QUDA\n");
286  } else {
287  if (enable) {
288  if (i < target_list.size() && target_count++ == target_list[i]) {
289  enable_profiler = true;
290  printfQuda("Starting profiling for %s\n", f);
292  i++; // advance to next target
293  }
294  }
295 }
296 }
297 
// Companion to profilerStart(): clears the enable_profiler flag at the end of
// a targeted solve.  NOTE(review): original lines 300 and 305 are missing in
// this dump (presumably the profiler stop API calls) — code is incomplete.
298 static void profilerStop(const char *f) {
299  if (do_not_profile_quda) {
301  } else {
302 
303  if (enable_profiler) {
304  printfQuda("Stopping profiling for %s\n", f);
306  enable_profiler = false;
307  }
308  }
309 }
310 
311 
312 namespace quda {
313  void printLaunchTimer();
314 }
315 
// Configure QUDA's output: message prefix and destination stream.
// NOTE(review): original line 318 is missing from this dump — presumably
// setVerbosity(verbosity), since the verbosity argument is otherwise unused
// here; confirm against the upstream source.
316 void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
317 {
319  setOutputPrefix(prefix);
320  setOutputFile(outfile);
321 }
322 
323 
// User data handed to lex_rank_from_coords(): the dimensionality and extents
// of the communications grid used for lexicographical rank assignment.
324 typedef struct {
325  int ndim;
326  int dims[QUDA_MAX_DIM];
327 } LexMapData;
328 
332 static int lex_rank_from_coords(const int *coords, void *fdata)
333 {
334  auto *md = static_cast<LexMapData *>(fdata);
335 
336  int rank = coords[0];
337  for (int i = 1; i < md->ndim; i++) {
338  rank = md->dims[i] * rank + coords[i];
339  }
340  return rank;
341 }
342 
343 #ifdef QMP_COMMS
// Rank-from-coordinates callback that defers to QMP's declared logical
// topology (fdata is unused).  Only compiled in QMP builds.
347 static int qmp_rank_from_coords(const int *coords, void *fdata)
348 {
349  return QMP_get_node_number_from(coords);
350 }
351 #endif
352 
353 // Provision for user control over MPI comm handle
354 // Assumes an MPI implementation of QMP
355 
356 #if defined(QMP_COMMS) || defined(MPI_COMMS)
357 MPI_Comm MPI_COMM_HANDLE_USER;
358 static bool user_set_comm_handle = false;
359 #endif
360 
// Record a user-supplied MPI communicator for QUDA to use instead of
// MPI_COMM_WORLD.  mycomm must point at a valid MPI_Comm; in builds without
// QMP/MPI communications this call is a no-op.
void setMPICommHandleQuda(void *mycomm)
{
#if defined(QMP_COMMS) || defined(MPI_COMMS)
  MPI_COMM_HANDLE_USER = *static_cast<MPI_Comm *>(mycomm);
  user_set_comm_handle = true;
#endif
}
368 
369 static bool comms_initialized = false;
370 
// Declare the 4-d communications grid and initialize comms.  If no mapping
// callback is supplied, use QMP's declared topology when available, otherwise
// fall back to lexicographical ordering via lex_rank_from_coords().
// NOTE(review): the `if`/`else` opened inside the first `#if QMP_COMMS`
// region is deliberately closed inside the second — the preprocessor
// structure is load-bearing; do not reformat mechanically.
371 void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
372 {
373  if (comms_initialized) return;
374 
375  if (nDim != 4) {
376  errorQuda("Number of communication grid dimensions must be 4");
377  }
378 
// map_data must outlive the comm_init() call below, hence function scope
379  LexMapData map_data;
380  if (!func) {
381 
382 #if QMP_COMMS
383  if (QMP_logical_topology_is_declared()) {
384  if (QMP_get_logical_number_of_dimensions() != 4) {
385  errorQuda("QMP logical topology must have 4 dimensions");
386  }
387  for (int i=0; i<nDim; i++) {
388  int qdim = QMP_get_logical_dimensions()[i];
389  if(qdim != dims[i]) {
390  errorQuda("QMP logical dims[%d]=%d does not match dims[%d]=%d argument", i, qdim, i, dims[i]);
391  }
392  }
393  fdata = nullptr;
394  func = qmp_rank_from_coords;
395  } else {
396  warningQuda("QMP logical topology is undeclared; using default lexicographical ordering");
397 #endif
398 
// default: lexicographical rank assignment over the user-supplied dims
399  map_data.ndim = nDim;
400  for (int i=0; i<nDim; i++) {
401  map_data.dims[i] = dims[i];
402  }
403  fdata = (void *) &map_data;
404  func = lex_rank_from_coords;
405 
406 #if QMP_COMMS
407  }
408 #endif
409 
410  }
411 
412 #if defined(QMP_COMMS) || defined(MPI_COMMS)
413  comm_init(nDim, dims, func, fdata, user_set_comm_handle, (void *)&MPI_COMM_HANDLE_USER);
414 #else
415  comm_init(nDim, dims, func, fdata);
416 #endif
417 
418  comms_initialized = true;
419 }
420 
421 
// Initialize the comms grid when the user never called initCommsGridQuda():
// QMP builds can recover the topology from QMP; pure-MPI builds cannot and
// error out; single-process builds use a trivial 1x1x1x1 grid.
422 static void init_default_comms()
423 {
424 #if defined(QMP_COMMS)
425  if (QMP_logical_topology_is_declared()) {
426  int ndim = QMP_get_logical_number_of_dimensions();
427  const int *dims = QMP_get_logical_dimensions();
428  initCommsGridQuda(ndim, dims, nullptr, nullptr);
429  } else {
430  errorQuda("initQuda() called without prior call to initCommsGridQuda(),"
431  " and QMP logical topology has not been declared");
432  }
433 #elif defined(MPI_COMMS)
434  errorQuda("When using MPI for communications, initCommsGridQuda() must be called before initQuda()");
435 #else // single-GPU
436  const int dims[4] = {1, 1, 1, 1};
437  initCommsGridQuda(4, dims, nullptr, nullptr);
438 #endif
439 }
440 
441 
442 #define STR_(x) #x
443 #define STR(x) STR_(x)
444  static const std::string quda_version = STR(QUDA_VERSION_MAJOR) "." STR(QUDA_VERSION_MINOR) "." STR(QUDA_VERSION_SUBMINOR);
445 #undef STR
446 #undef STR_
447 
448 extern char* gitversion;
449 
450 /*
451  * Set the device that QUDA uses.
452  */
// Select and initialize the device QUDA will use.  Idempotent: only the first
// call has any effect.  A negative ordinal in multi-GPU builds defers the
// choice to the comms layer (comm_gpuid()), which requires comms to already
// be initialized.  NOTE(review): original lines 489 and 492 are missing from
// this dump (presumably the calls that record the chosen reorder location).
453 void initQudaDevice(int dev)
454 {
455  //static bool initialized = false;
456  if (initialized) return;
457  initialized = true;
458 
459  profileInit2End.TPSTART(QUDA_PROFILE_TOTAL);
460  profileInit.TPSTART(QUDA_PROFILE_TOTAL);
461  profileInit.TPSTART(QUDA_PROFILE_INIT);
462 
463  if (getVerbosity() >= QUDA_SUMMARIZE) {
464 #ifdef GITVERSION
465  printfQuda("QUDA %s (git %s)\n",quda_version.c_str(),gitversion);
466 #else
467  printfQuda("QUDA %s\n",quda_version.c_str());
468 #endif
469  }
470 
471 #ifdef MULTI_GPU
472  if (dev < 0) {
473  if (!comms_initialized) {
474  errorQuda("initDeviceQuda() called with a negative device ordinal, but comms have not been initialized");
475  }
476  dev = comm_gpuid();
477  }
478 #else
479  if (dev < 0 || dev >= 16) errorQuda("Invalid device number %d", dev);
480 #endif
481 
482  device::init(dev);
483 
484  { // determine if we will do CPU or GPU data reordering (default is GPU)
485  char *reorder_str = getenv("QUDA_REORDER_LOCATION");
486 
487  if (!reorder_str || (strcmp(reorder_str,"CPU") && strcmp(reorder_str,"cpu")) ) {
488  warningQuda("Data reordering done on GPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
490  } else {
491  warningQuda("Data reordering done on CPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
493  }
494  }
495 
496  profileInit.TPSTOP(QUDA_PROFILE_INIT);
497  profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
498 }
499 
500 /*
501  * Any persistent memory allocations that QUDA uses are done here.
502  */
// NOTE(review): the function signature (original line 503, presumably
// `void initQudaMemory()`) is missing from this dump — the body below
// belongs to that function; confirm against the upstream source.
504 {
505  profileInit.TPSTART(QUDA_PROFILE_TOTAL);
506  profileInit.TPSTART(QUDA_PROFILE_INIT);
507 
508  if (!comms_initialized) init_default_comms();
511 
512  loadTuneCache();
513 
514  // initalize the memory pool allocators
515  pool::init();
516 
518 
520  blas::init();
521 
// num_failures_h/_d alias the same mapped allocation: the host writes are
// visible through the device pointer, used to count unitarization failures
522  num_failures_h = static_cast<int *>(mapped_malloc(sizeof(int)));
523  num_failures_d = static_cast<int *>(get_mapped_device_pointer(num_failures_h));
524 
// halo-extension widths: 2 in each direction that is comm-partitioned
// (or in all directions when redundant_comms is set)
525  for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
526 
527  profileInit.TPSTOP(QUDA_PROFILE_INIT);
528  profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
529 }
529 }
530 
531 void updateR()
532 {
533  for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
534 }
535 
536 void initQuda(int dev)
537 {
538  // initialize communications topology, if not already done explicitly via initCommsGridQuda()
539  if (!comms_initialized) init_default_comms();
540 
541  // set the device that QUDA uses
542  initQudaDevice(dev);
543 
544  // set the persistant memory allocations that QUDA uses (Blas, streams, etc.)
545  initQudaMemory();
546 }
547 
548 // This is a flag used to signal when we have downloaded new gauge
549 // field. Set by loadGaugeQuda and consumed by loadCloverQuda as one
550 // possible flag to indicate we need to recompute the clover field
551 static bool invalidate_clover = true;
552 
// Download a host gauge field into QUDA's resident device fields, creating
// the precise / sloppy / preconditioner / refinement / eigensolver mirrors
// (aliasing rather than copying when precisions coincide) and, if requested,
// extended overlap fields.
// NOTE(review): this Doxygen dump omits many original lines (558, 567,
// 590–621, 623, 625, 628–629, 632–633, 636, 638, 652–653, 692, 695, 703,
// 706, 708–709, 717, 720–721, 729, 732, 734–735, 737–738, 765, 777, 799)
// — several if/ternary conditions and case labels are missing below; the
// visible code is incomplete and must be checked against upstream.
553 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
554 {
555  profileGauge.TPSTART(QUDA_PROFILE_TOTAL);
556 
557  if (!initialized) errorQuda("QUDA not initialized");
559 
560  checkGaugeParam(param);
561 
562  profileGauge.TPSTART(QUDA_PROFILE_INIT);
563  // Set the specific input parameters and create the cpu gauge field
564  GaugeFieldParam gauge_param(h_gauge, *param);
565 
566  if (gauge_param.order <= 4) gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
// NOTE(review): the head of this ternary (original line 567, assigning `in`)
// is missing from the dump
568  static_cast<GaugeField*>(new cpuGaugeField(gauge_param)) :
569  static_cast<GaugeField*>(new cudaGaugeField(gauge_param));
570 
// BQCD-order fields are checksummed so an unchanged field short-circuits the
// (expensive) reload and leaves any cached clover field valid
571  if (in->Order() == QUDA_BQCD_GAUGE_ORDER) {
572  static size_t checksum = SIZE_MAX;
573  size_t in_checksum = in->checksum(true);
574  if (in_checksum == checksum) {
575  if (getVerbosity() >= QUDA_VERBOSE)
576  printfQuda("Gauge field unchanged - using cached gauge field %lu\n", checksum);
577  profileGauge.TPSTOP(QUDA_PROFILE_INIT);
578  profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
579  delete in;
580  invalidate_clover = false;
581  return;
582  }
583  checksum = in_checksum;
584  invalidate_clover = true;
585  }
586 
587  // free any current gauge field before new allocations to reduce memory overhead
588  switch (param->type) {
589  case QUDA_WILSON_LINKS:
591  delete gaugeRefinement;
592 
595  delete gaugePrecondition;
596 
598  && gaugeEigensolver)
599  delete gaugeEigensolver;
600 
602 
604 
605  break;
// NOTE(review): the QUDA_ASQTAD_FAT_LINKS case label and its aliasing guards
// are among the lines missing from the dump here
608  delete gaugeFatRefinement;
609 
612  delete gaugeFatPrecondition;
613 
616  delete gaugeFatEigensolver;
617 
619 
621 
622  break;
624 
626  delete gaugeLongRefinement;
627 
630  delete gaugeLongPrecondition;
631 
634  delete gaugeLongEigensolver;
635 
637 
639 
640  break;
641  case QUDA_SMEARED_LINKS:
642  if (gaugeSmeared) delete gaugeSmeared;
643  break;
644  default:
645  errorQuda("Invalid gauge type %d", param->type);
646  }
647 
648  // if not preserving then copy the gauge field passed in
649  cudaGaugeField *precise = nullptr;
650 
651  // switch the parameters for creating the mirror precise cuda gauge field
654  gauge_param.setPrecision(param->cuda_prec, true);
655  gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
656  gauge_param.pad = param->ga_pad;
657 
658  precise = new cudaGaugeField(gauge_param);
659 
660  if (param->use_resident_gauge) {
661  if(gaugePrecise == nullptr) errorQuda("No resident gauge field");
662  // copy rather than point at to ensure that the padded region is filled in
663  precise->copy(*gaugePrecise);
664  precise->exchangeGhost();
665  delete gaugePrecise;
666  gaugePrecise = nullptr;
667  profileGauge.TPSTOP(QUDA_PROFILE_INIT);
668  } else {
669  profileGauge.TPSTOP(QUDA_PROFILE_INIT);
670  profileGauge.TPSTART(QUDA_PROFILE_H2D);
671  precise->copy(*in);
672  profileGauge.TPSTOP(QUDA_PROFILE_H2D);
673  }
674 
675  // for gaugeSmeared we are interested only in the precise version
676  if (param->type == QUDA_SMEARED_LINKS) {
677  gaugeSmeared = createExtendedGauge(*precise, R, profileGauge);
678 
679  profileGauge.TPSTART(QUDA_PROFILE_FREE);
680  delete precise;
681  delete in;
682  profileGauge.TPSTOP(QUDA_PROFILE_FREE);
683 
684  profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
685  return;
686  }
687 
688  // creating sloppy fields isn't really compute, but it is work done on the gpu
689  profileGauge.TPSTART(QUDA_PROFILE_COMPUTE);
690 
691  // switch the parameters for creating the mirror sloppy cuda gauge field
693  gauge_param.setPrecision(param->cuda_prec_sloppy, true);
694  cudaGaugeField *sloppy = nullptr;
// NOTE(review): the precision-equality condition guarding this aliasing
// (original line 695) is missing from the dump
696  sloppy = precise;
697  } else {
698  sloppy = new cudaGaugeField(gauge_param);
699  sloppy->copy(*precise);
700  }
701 
702  // switch the parameters for creating the mirror preconditioner cuda gauge field
704  gauge_param.setPrecision(param->cuda_prec_precondition, true);
705  cudaGaugeField *precondition = nullptr;
707  precondition = precise;
710  precondition = sloppy;
711  } else {
712  precondition = new cudaGaugeField(gauge_param);
713  precondition->copy(*precise);
714  }
715 
716  // switch the parameters for creating the refinement cuda gauge field
718  gauge_param.setPrecision(param->cuda_prec_refinement_sloppy, true);
719  cudaGaugeField *refinement = nullptr;
722  refinement = sloppy;
723  } else {
724  refinement = new cudaGaugeField(gauge_param);
725  refinement->copy(*sloppy);
726  }
727 
728  // switch the parameters for creating the eigensolver cuda gauge field
730  gauge_param.setPrecision(param->cuda_prec_eigensolver, true);
731  cudaGaugeField *eigensolver = nullptr;
733  eigensolver = precise;
736  eigensolver = precondition;
739  eigensolver = sloppy;
740  } else {
741  eigensolver = new cudaGaugeField(gauge_param);
742  eigensolver->copy(*precise);
743  }
744 
745  profileGauge.TPSTOP(QUDA_PROFILE_COMPUTE);
746 
747  // create an extended preconditioning field
748  cudaGaugeField* extended = nullptr;
749  if (param->overlap){
750  int R[4]; // domain-overlap widths in different directions
751  for (int i=0; i<4; ++i) R[i] = param->overlap*commDimPartitioned(i);
752  extended = createExtendedGauge(*precondition, R, profileGauge);
753  }
754 
// publish the new fields into the global resident pointers for this link type
755  switch (param->type) {
756  case QUDA_WILSON_LINKS:
757  gaugePrecise = precise;
758  gaugeSloppy = sloppy;
759  gaugePrecondition = precondition;
760  gaugeRefinement = refinement;
761  gaugeEigensolver = eigensolver;
762 
763  if(param->overlap) gaugeExtended = extended;
764  break;
766  gaugeFatPrecise = precise;
767  gaugeFatSloppy = sloppy;
768  gaugeFatPrecondition = precondition;
769  gaugeFatRefinement = refinement;
770  gaugeFatEigensolver = eigensolver;
771 
772  if(param->overlap){
773  if(gaugeFatExtended) errorQuda("Extended gauge fat field already allocated");
774  gaugeFatExtended = extended;
775  }
776  break;
778  gaugeLongPrecise = precise;
779  gaugeLongSloppy = sloppy;
780  gaugeLongPrecondition = precondition;
781  gaugeLongRefinement = refinement;
782  gaugeLongEigensolver = eigensolver;
783 
784  if(param->overlap){
785  if(gaugeLongExtended) errorQuda("Extended gauge long field already allocated");
786  gaugeLongExtended = extended;
787  }
788  break;
789  default:
790  errorQuda("Invalid gauge type %d", param->type);
791  }
792 
793  profileGauge.TPSTART(QUDA_PROFILE_FREE);
794  delete in;
795  profileGauge.TPSTOP(QUDA_PROFILE_FREE);
796 
797  if (extendedGaugeResident) {
798  // updated the resident gauge field if needed
800  delete extendedGaugeResident;
801  // Use the static R (which is defined at the very beginning of lib/interface_quda.cpp) here
802  extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileGauge, false, recon);
803  }
804 
805  profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
806 }
807 
// Copy the resident device gauge field back into host memory (CPU location
// only).  NOTE(review): this dump omits original lines 819, 822–824, 826–827,
// 831–832 and 838 — the cpu-field construction, the non-smeared case labels,
// the cudaGauge assignments and the actual D2H copy call are missing; the
// visible code is incomplete.
808 void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
809 {
810  profileGauge.TPSTART(QUDA_PROFILE_TOTAL);
811 
812  if (param->location != QUDA_CPU_FIELD_LOCATION) errorQuda("Non-cpu output location not yet supported");
813 
814  if (!initialized) errorQuda("QUDA not initialized");
815  checkGaugeParam(param);
816 
817  // Set the specific cpu parameters and create the cpu gauge field
818  GaugeFieldParam gauge_param(h_gauge, *param);
820  cudaGaugeField *cudaGauge = nullptr;
821  switch (param->type) {
825  case QUDA_SMEARED_LINKS:
828  gauge_param.setPrecision(param->cuda_prec, true);
829  gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
830  gauge_param.pad = param->ga_pad;
833  break;
834  default: errorQuda("Invalid gauge type");
835  }
836 
837  profileGauge.TPSTART(QUDA_PROFILE_D2H);
839  profileGauge.TPSTOP(QUDA_PROFILE_D2H);
840 
// only the smeared path creates a temporary device copy that must be freed
841  if (param->type == QUDA_SMEARED_LINKS) { delete cudaGauge; }
842 
843  profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
844 }
845 
847 void freeSloppyCloverQuda();
848 
// Download (or compute on-device) the clover term and/or its inverse, cache
// it in cloverPrecise, optionally invert it, and optionally copy the result
// back to the host.  A previously cached field is reused unless the gauge
// field or the clover parameters changed.
// NOTE(review): this Doxygen dump omits many original lines (857–858,
// 870–871, 877–878, 882, 885, 905, 909, 936, 946, 951–952, 965, 973–974,
// 988, 990–992, 995, 1015, 1032) — several if-conditions, the ternary head
// constructing `in`, the computeClover calls, and the loadSloppyClover call
// are missing; the visible code is incomplete.
849 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
850 {
851  profileClover.TPSTART(QUDA_PROFILE_TOTAL);
852  profileClover.TPSTART(QUDA_PROFILE_INIT);
853 
854  checkCloverParam(inv_param);
855  bool device_calc = false; // calculate clover and inverse on the device?
856 
859 
860  if (!initialized) errorQuda("QUDA not initialized");
861 
// no host pointers (or an explicit request) means we must build the clover
// term on the device from the resident gauge field
862  if ( (!h_clover && !h_clovinv) || inv_param->compute_clover ) {
863  device_calc = true;
864  if (inv_param->clover_coeff == 0.0 && inv_param->clover_csw == 0.0) errorQuda("called with neither clover term nor inverse and clover coefficient nor Csw not set");
865  if (gaugePrecise->Anisotropy() != 1.0) errorQuda("cannot compute anisotropic clover field");
866  }
867 
868  if (inv_param->clover_cpu_prec < QUDA_SINGLE_PRECISION) errorQuda("Fixed-point precision not supported on CPU");
869  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded before clover");
872  errorQuda("Wrong dslash_type %d in loadCloverQuda()", inv_param->dslash_type);
873  }
874 
875  // determines whether operator is preconditioned when calling invertQuda()
876  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE ||
879 
880  // determines whether operator is preconditioned when calling MatQuda() or MatDagMatQuda()
881  bool pc_solution = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
883 
884  bool asymmetric = (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
886 
887  // uninverted clover term is required when applying unpreconditioned operator,
888  // but note that dslashQuda() is always preconditioned
889  if (!h_clover && !pc_solve && !pc_solution) {
890  //warningQuda("Uninverted clover term not loaded");
891  }
892 
893  // uninverted clover term is also required for "asymmetric" preconditioning
894  if (!h_clover && pc_solve && pc_solution && asymmetric && !device_calc) {
895  warningQuda("Uninverted clover term not loaded");
896  }
897 
898  bool twisted = inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH ? true : false;
899 
900  CloverFieldParam clover_param;
901  clover_param.nDim = 4;
902  // If clover_coeff is not set manually, then it is the product Csw * kappa.
903  // If the user has set the clover_coeff manually, that value takes precedent.
904  clover_param.csw = inv_param->clover_csw;
906  // We must also adjust inv_param->clover_coeff here. If a user has set kappa and
907  // Csw, we must populate inv_param->clover_coeff for them as the computeClover
908  // routines uses that value
910  clover_param.twisted = twisted;
911  clover_param.mu2 = twisted ? 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu : 0.0;
912  clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;
913  for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i];
914  clover_param.pad = inv_param->cl_pad;
915  clover_param.create = QUDA_NULL_FIELD_CREATE;
916  clover_param.norm = nullptr;
917  clover_param.invNorm = nullptr;
918  clover_param.setPrecision(inv_param->clover_cuda_prec, true);
919  clover_param.direct = h_clover || device_calc ? true : false;
920  clover_param.inverse = (h_clovinv || pc_solve) && !dynamic_clover_inverse() ? true : false;
921  CloverField *in = nullptr;
922  profileClover.TPSTOP(QUDA_PROFILE_INIT);
923 
924  // FIXME do we need to make this more robust to changing other meta data (compare cloverPrecise against clover_param)
925  bool clover_update = false;
926  // If either of the clover params have changed, trigger a recompute
927  double csw_old = cloverPrecise ? cloverPrecise->Csw() : 0.0;
928  double coeff_old = cloverPrecise ? cloverPrecise->Coeff() : 0.0;
929  if (!cloverPrecise || invalidate_clover ||
930  inv_param->clover_coeff != coeff_old ||
931  inv_param->clover_csw != csw_old) clover_update = true;
932 
933  // compute or download clover field only if gauge field has been updated or clover field doesn't exist
934  if (clover_update) {
935  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating new clover field\n");
937  if (cloverPrecise) delete cloverPrecise;
938 
939  profileClover.TPSTART(QUDA_PROFILE_INIT);
940  cloverPrecise = new cudaCloverField(clover_param);
941 
942  if (!device_calc || inv_param->return_clover || inv_param->return_clover_inverse) {
943  // create a param for the cpu clover field
944  CloverFieldParam inParam(clover_param);
945  inParam.order = inv_param->clover_order;
947  inParam.direct = h_clover ? true : false;
948  inParam.inverse = h_clovinv ? true : false;
949  inParam.clover = h_clover;
950  inParam.cloverInv = h_clovinv;
// NOTE(review): the head of this ternary (original lines 951-952, assigning
// `in` based on the field location) is missing from the dump
953  static_cast<CloverField*>(new cpuCloverField(inParam)) :
954  static_cast<CloverField*>(new cudaCloverField(inParam));
955  }
956  profileClover.TPSTOP(QUDA_PROFILE_INIT);
957 
958  if (!device_calc) {
959  profileClover.TPSTART(QUDA_PROFILE_H2D);
960  bool inverse = (h_clovinv && !inv_param->compute_clover_inverse && !dynamic_clover_inverse());
961  cloverPrecise->copy(*in, inverse);
962  profileClover.TPSTOP(QUDA_PROFILE_H2D);
963  } else {
964  profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
966  profileClover.TPSTART(QUDA_PROFILE_TOTAL);
967  }
968 
969  // inverted clover term is required when applying preconditioned operator
970  if ((!h_clovinv || inv_param->compute_clover_inverse) && pc_solve) {
971  profileClover.TPSTART(QUDA_PROFILE_COMPUTE);
972  if (!dynamic_clover_inverse()) {
975  inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
976  inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
977  }
978  }
979  profileClover.TPSTOP(QUDA_PROFILE_COMPUTE);
980  }
981  } else {
982  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached clover field\n");
983  }
984 
985  clover_param.direct = true;
986  clover_param.inverse = dynamic_clover_inverse() ? false : true;
987 
989 
993 
994  // if requested, copy back the clover / inverse field
996  if (!h_clover && !h_clovinv) errorQuda("Requested clover field return but no clover host pointers set");
997 
998  // copy the inverted clover term into host application order on the device
999  clover_param.direct = (h_clover && inv_param->return_clover);
1000  clover_param.inverse = (h_clovinv && inv_param->return_clover_inverse);
1001 
1002  // this isn't really "epilogue" but this label suffices
1003  profileClover.TPSTART(QUDA_PROFILE_EPILOGUE);
1004  cudaCloverField *hack = nullptr;
1005  if (!dynamic_clover_inverse()) {
1006  clover_param.order = inv_param->clover_order;
1007  clover_param.setPrecision(inv_param->clover_cpu_prec);
1008  hack = new cudaCloverField(clover_param);
1009  hack->copy(*cloverPrecise); // FIXME this can lead to an redundant copies if we're not copying back direct + inverse
1010  } else {
// dynamic inverse: materialize the inverse explicitly before copying back
1011  clover_param.setPrecision(inv_param->clover_cuda_prec, true);
1012  auto *hackOfTheHack = new cudaCloverField(clover_param); // Hack of the hack
1013  hackOfTheHack->copy(*cloverPrecise, false);
1014  cloverInvert(*hackOfTheHack, inv_param->compute_clover_trlog);
1016  inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
1017  inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
1018  }
1019  clover_param.order = inv_param->clover_order;
1020  clover_param.setPrecision(inv_param->clover_cpu_prec);
1021  hack = new cudaCloverField(clover_param);
1022  hack->copy(*hackOfTheHack); // FIXME this can lead to an redundant copies if we're not copying back direct + inverse
1023  delete hackOfTheHack;
1024  }
1025  profileClover.TPSTOP(QUDA_PROFILE_EPILOGUE);
1026 
1027  // copy the field into the host application's clover field
1028  profileClover.TPSTART(QUDA_PROFILE_D2H);
1029  if (inv_param->return_clover) {
1030  qudaMemcpy((char*)(in->V(false)), (char*)(hack->V(false)), in->Bytes(), cudaMemcpyDeviceToHost);
1031  }
1033  qudaMemcpy((char*)(in->V(true)), (char*)(hack->V(true)), in->Bytes(), cudaMemcpyDeviceToHost);
1034  }
1035 
1036  profileClover.TPSTOP(QUDA_PROFILE_D2H);
1037 
1038  delete hack;
1039  }
1040 
1041  profileClover.TPSTART(QUDA_PROFILE_FREE);
1042  if (in) delete in; // delete object referencing input field
1043  profileClover.TPSTOP(QUDA_PROFILE_FREE);
1044 
1045  popVerbosity();
1046 
1047  profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
1048 }
1049 
1050 void freeSloppyCloverQuda();
1051 
// Create the sloppy / preconditioner / refinement / eigensolver mirrors of
// cloverPrecise at the requested precisions, aliasing an existing field when
// the precision matches rather than allocating a copy.
// NOTE(review): the function signature (original line 1052, presumably
// `void loadSloppyCloverQuda(const QudaPrecision prec[])`) and the aliasing
// assignments on lines 1073, 1081, 1083, 1097, 1104, 1106, 1108 are missing
// from this dump — the visible code is incomplete.
1053 {
1055 
1056  if (cloverPrecise) {
1057  // create the mirror sloppy clover field
1058  CloverFieldParam clover_param(*cloverPrecise);
1059  clover_param.setPrecision(prec[0], true);
1060 
// distinct direct/inverse storage implies both components must be mirrored
1061  if (cloverPrecise->V(false) != cloverPrecise->V(true)) {
1062  clover_param.direct = true;
1063  clover_param.inverse = true;
1064  } else {
1065  clover_param.direct = false;
1066  clover_param.inverse = true;
1067  }
1068 
1069  if (clover_param.Precision() != cloverPrecise->Precision()) {
1070  cloverSloppy = new cudaCloverField(clover_param);
1071  cloverSloppy->copy(*cloverPrecise, clover_param.inverse);
1072  } else {
1074  }
1075 
1076  // switch the parameters for creating the mirror preconditioner clover field
1077  clover_param.setPrecision(prec[1], true);
1078 
1079  // create the mirror preconditioner clover field
1080  if (clover_param.Precision() == cloverPrecise->Precision()) {
1082  } else if (clover_param.Precision() == cloverSloppy->Precision()) {
1084  } else {
1085  cloverPrecondition = new cudaCloverField(clover_param);
1086  cloverPrecondition->copy(*cloverPrecise, clover_param.inverse);
1087  }
1088 
1089  // switch the parameters for creating the mirror refinement clover field
1090  clover_param.setPrecision(prec[2], true);
1091 
1092  // create the mirror refinement clover field
1093  if (clover_param.Precision() != cloverSloppy->Precision()) {
1094  cloverRefinement = new cudaCloverField(clover_param);
1095  cloverRefinement->copy(*cloverSloppy, clover_param.inverse);
1096  } else {
1098  }
1099  // switch the parameters for creating the mirror eigensolver clover field
1100  clover_param.setPrecision(prec[3]);
1101 
1102  // create the mirror eigensolver clover field
1103  if (clover_param.Precision() == cloverPrecise->Precision()) {
1105  } else if (clover_param.Precision() == cloverSloppy->Precision()) {
1107  } else if (clover_param.Precision() == cloverPrecondition->Precision()) {
1109  } else {
1110  cloverEigensolver = new cudaCloverField(clover_param);
1111  cloverEigensolver->copy(*cloverPrecise, clover_param.inverse);
1112  }
1113  }
1114 
1115 }
1116 
1117 // just free the sloppy fields used in mixed-precision solvers
// Free only the reduced-precision gauge mirrors (sloppy, precondition,
// refinement, eigensolver) for the Wilson, long and fat link families,
// respecting aliasing: a pointer is deleted only when it does not alias a
// field that is kept (or freed separately), then all four slots are nulled.
// NOTE(review): the function signature (original line 1118, presumably
// `void freeSloppyGaugeQuda()`) and several guarded delete statements
// (lines 1125, 1128, 1133, 1147, 1150-1151, 1155, 1159, 1170, 1173-1174,
// 1178, 1182) are missing from this dump — the visible code is incomplete.
1119 {
1120  if (!initialized) errorQuda("QUDA not initialized");
1121 
1122  // Wilson gauges
1123  //---------------------------------------------------------------------------
1124  // Delete gaugeRefinement if it does not alias gaugeSloppy.
1126 
1127  // Delete gaugePrecondition if it does not alias gaugePrecise, gaugeSloppy, or gaugeEigensolver.
1129  && gaugePrecondition)
1130  delete gaugePrecondition;
1131 
1132  // Delete gaugeEigensolver if it does not alias gaugePrecise or gaugeSloppy.
1134 
1135  // Delete gaugeSloppy if it does not alias gaugePrecise.
1136  if (gaugeSloppy != gaugePrecise && gaugeSloppy) delete gaugeSloppy;
1137 
1138  gaugeEigensolver = nullptr;
1139  gaugeRefinement = nullptr;
1140  gaugePrecondition = nullptr;
1141  gaugeSloppy = nullptr;
1142  //---------------------------------------------------------------------------
1143 
1144  // Long gauges
1145  //---------------------------------------------------------------------------
1146  // Delete gaugeLongRefinement if it does not alias gaugeLongSloppy.
1148 
1149  // Delete gaugeLongPrecondition if it does not alias gaugeLongPrecise, gaugeLongSloppy, or gaugeLongEigensolver.
1152  delete gaugeLongPrecondition;
1153 
1154  // Delete gaugeLongEigensolver if it does not alias gaugeLongPrecise or gaugeLongSloppy.
1156  delete gaugeLongEigensolver;
1157 
1158  // Delete gaugeLongSloppy if it does not alias gaugeLongPrecise.
1160 
1161  gaugeLongEigensolver = nullptr;
1162  gaugeLongRefinement = nullptr;
1163  gaugeLongPrecondition = nullptr;
1164  gaugeLongSloppy = nullptr;
1165  //---------------------------------------------------------------------------
1166 
1167  // Fat gauges
1168  //---------------------------------------------------------------------------
1169  // Delete gaugeFatRefinement if it does not alias gaugeFatSloppy.
1171 
1172  // Delete gaugeFatPrecondition if it does not alias gaugeFatPrecise, gaugeFatSloppy, or gaugeFatEigensolver.
1175  delete gaugeFatPrecondition;
1176 
1177  // Delete gaugeFatEigensolver if it does not alias gaugeFatPrecise or gaugeFatSloppy.
1179  delete gaugeFatEigensolver;
1180 
1181  // Delete gaugeFatSloppy if it does not alias gaugeFatPrecise.
1183 
1184  gaugeFatEigensolver = nullptr;
1185  gaugeFatRefinement = nullptr;
1186  gaugeFatPrecondition = nullptr;
1187  gaugeFatSloppy = nullptr;
1188 }
1189 
// Free every resident gauge field (precise, extended, long, fat, smeared,
// and the cached extended field). Aborts if QUDA was never initialized.
// NOTE(review): doxygen extract — line 1194 (presumably the call that frees
// the sloppy-precision copies first, `freeSloppyGaugeQuda()`) and line 1203
// (presumably the guarded delete of gaugeLongExtended) were dropped; the
// corresponding pointers are nulled below, so a delete likely existed —
// verify against the upstream file.
 1190 void freeGaugeQuda(void)
 1191 {
 1192  if (!initialized) errorQuda("QUDA not initialized");
 1193 
 1195 
 1196  if (gaugePrecise) delete gaugePrecise;
 1197  if (gaugeExtended) delete gaugeExtended;
 1198 
 1199  gaugePrecise = nullptr;
 1200  gaugeExtended = nullptr;
 1201 
 1202  if (gaugeLongPrecise) delete gaugeLongPrecise;
 1204 
 1205  gaugeLongPrecise = nullptr;
 1206  gaugeLongExtended = nullptr;
 1207 
 1208  if (gaugeFatPrecise) delete gaugeFatPrecise;
 1209 
 1210  gaugeFatPrecise = nullptr;
 1211  gaugeFatExtended = nullptr;
 1212 
 1213  if (gaugeSmeared) delete gaugeSmeared;
 1214 
 1215  gaugeSmeared = nullptr;
 1216  // Need to merge extendedGaugeResident and gaugeFatPrecise/gaugePrecise
 1217  if (extendedGaugeResident) {
 1218  delete extendedGaugeResident;
 1219  extendedGaugeResident = nullptr;
 1220  }
 1221 }
1222 
// NOTE(review): doxygen extract — the signature at line 1223 (presumably
// `void loadSloppyGaugeQuda(QudaPrecision *prec, QudaReconstructType *recon)`)
// and the actual field-creation / aliasing statements inside the branches
// were dropped as hyperlinks. What remains shows the intent: for each of the
// SU(3), fat, and long resident gauge fields, build sloppy, preconditioner,
// refinement, and eigensolver copies at prec[0..3] / recon[0..3], aliasing an
// existing field whenever the requested precision already matches one.
 1224 {
 1225  // first do SU3 links (if they exist)
 1226  if (gaugePrecise) {
 1228  // switch the parameters for creating the mirror sloppy cuda gauge field
 1229 
 1230  gauge_param.reconstruct = recon[0];
 1231  gauge_param.setPrecision(prec[0], true);
 1232 
 1233  if (gaugeSloppy) errorQuda("gaugeSloppy already exists");
 1234 
 1237  } else {
 1240  }
 1241 
 1242  // switch the parameters for creating the mirror preconditioner cuda gauge field
 1243  gauge_param.reconstruct = recon[1];
 1244  gauge_param.setPrecision(prec[1], true);
 1245 
 1246  if (gaugePrecondition) errorQuda("gaugePrecondition already exists");
 1247 
 1250  } else if (gauge_param.Precision() == gaugeSloppy->Precision()
 1253  } else {
 1256  }
 1257 
 1258  // switch the parameters for creating the mirror refinement cuda gauge field
 1259  gauge_param.reconstruct = recon[2];
 1260  gauge_param.setPrecision(prec[2], true);
 1261 
 1262  if (gaugeRefinement) errorQuda("gaugeRefinement already exists");
 1263 
 1266  } else {
 1269  }
 1270 
 1271  // switch the parameters for creating the mirror eigensolver cuda gauge field
 1272  gauge_param.reconstruct = recon[3];
 1273  gauge_param.setPrecision(prec[3], true);
 1274 
 1275  if (gaugeEigensolver) errorQuda("gaugeEigensolver already exists");
 1276 
 1279  } else if (gauge_param.Precision() == gaugeSloppy->Precision()
 1282  } else if (gauge_param.Precision() == gaugePrecondition->Precision()
 1285  } else {
 1288  }
 1289  }
 1290 
 1291  // fat links (if they exist)
 1292  if (gaugeFatPrecise) {
 1294  // switch the parameters for creating the mirror sloppy cuda gauge field
 1295 
// Fat links never use reconstruction, hence no recon[] assignment here.
 1296  gauge_param.setPrecision(prec[0], true);
 1297 
 1298  if (gaugeFatSloppy) errorQuda("gaugeFatSloppy already exists");
 1299 
 1300  if (gauge_param.Precision() == gaugeFatPrecise->Precision()
 1303  } else {
 1306  }
 1307 
 1308  // switch the parameters for creating the mirror preconditioner cuda gauge field
 1309  gauge_param.setPrecision(prec[1], true);
 1310 
 1311  if (gaugeFatPrecondition) errorQuda("gaugeFatPrecondition already exists\n");
 1312 
 1313  if (gauge_param.Precision() == gaugeFatPrecise->Precision()
 1316  } else if (gauge_param.Precision() == gaugeFatSloppy->Precision()
 1319  } else {
 1322  }
 1323 
 1324  // switch the parameters for creating the mirror refinement cuda gauge field
 1325  gauge_param.setPrecision(prec[2], true);
 1326 
 1327  if (gaugeFatRefinement) errorQuda("gaugeFatRefinement already exists\n");
 1328 
 1329  if (gauge_param.Precision() == gaugeFatSloppy->Precision()
 1332  } else {
 1335  }
 1336 
 1337  // switch the parameters for creating the mirror eigensolver cuda gauge field
 1338  gauge_param.setPrecision(prec[3], true);
 1339 
 1340  if (gaugeFatEigensolver) errorQuda("gaugeFatEigensolver already exists");
 1341 
 1342  if (gauge_param.Precision() == gaugeFatPrecise->Precision()
 1345  } else if (gauge_param.Precision() == gaugeFatSloppy->Precision()
 1348  } else if (gauge_param.Precision() == gaugeFatPrecondition->Precision()
 1351  } else {
 1354  }
 1355  }
 1356 
 1357  // long links (if they exist)
 1358  if (gaugeLongPrecise) {
 1360  // switch the parameters for creating the mirror sloppy cuda gauge field
 1361 
 1362  gauge_param.reconstruct = recon[0];
 1363  gauge_param.setPrecision(prec[0], true);
 1364 
 1365  if (gaugeLongSloppy) errorQuda("gaugeLongSloppy already exists");
 1366 
 1367  if (gauge_param.Precision() == gaugeLongPrecise->Precision()
 1370  } else {
 1373  }
 1374 
 1375  // switch the parameters for creating the mirror preconditioner cuda gauge field
 1376  gauge_param.reconstruct = recon[1];
 1377  gauge_param.setPrecision(prec[1], true);
 1378 
 1379  if (gaugeLongPrecondition) errorQuda("gaugeLongPrecondition already exists\n");
 1380 
 1381  if (gauge_param.Precision() == gaugeLongPrecise->Precision()
 1384  } else if (gauge_param.Precision() == gaugeLongSloppy->Precision()
 1387  } else {
 1390  }
 1391 
 1392  // switch the parameters for creating the mirror refinement cuda gauge field
 1393  gauge_param.reconstruct = recon[2];
 1394  gauge_param.setPrecision(prec[2], true);
 1395 
 1396  if (gaugeLongRefinement) errorQuda("gaugeLongRefinement already exists\n");
 1397 
 1398  if (gauge_param.Precision() == gaugeLongSloppy->Precision()
 1401  } else {
 1404  }
 1405 
 1406  // switch the parameters for creating the mirror eigensolver cuda gauge field
 1407  gauge_param.reconstruct = recon[3];
 1408  gauge_param.setPrecision(prec[3], true);
 1409 
// NOTE(review): copy/paste defect — this guards gaugeLongEigensolver but the
// error message says "gaugePrecondition already exists"; it should read
// "gaugeLongEigensolver already exists". Fix when restoring the file.
 1410  if (gaugeLongEigensolver) errorQuda("gaugePrecondition already exists");
 1411 
 1412  if (gauge_param.Precision() == gaugeLongPrecise->Precision()
 1415  } else if (gauge_param.Precision() == gaugeLongSloppy->Precision()
 1418  } else if (gauge_param.Precision() == gaugeLongPrecondition->Precision()
 1421  } else {
 1424  }
 1425  }
 1426 }
1427 
// NOTE(review): doxygen extract — the signature at line 1428 (presumably
// `void freeSloppyCloverQuda(void)`) and the guarded delete statements
// (lines 1433, 1436-1437, 1441, 1445) were dropped. Intent from what
// remains: delete each sloppy-precision clover field unless it aliases
// another resident clover field, then null all pointers.
 1429 {
 1430  if (!initialized) errorQuda("QUDA not initialized");
 1431 
// NOTE(review): the comment below says "gaugeSloppy" but this function
// manages clover fields — presumably it should read "cloverSloppy".
 1432  // Delete cloverRefinement if it does not alias gaugeSloppy.
 1434 
 1435  // Delete cloverPrecondition if it does not alias cloverPrecise, cloverSloppy, or cloverEigensolver.
 1438  delete cloverPrecondition;
 1439 
 1440  // Delete cloverEigensolver if it does not alias cloverPrecise or cloverSloppy.
 1442  delete cloverEigensolver;
 1443 
 1444  // Delete cloverSloppy if it does not alias cloverPrecise.
 1446 
 1447  cloverEigensolver = nullptr;
 1448  cloverRefinement = nullptr;
 1449  cloverPrecondition = nullptr;
 1450  cloverSloppy = nullptr;
 1451 }
1452 
// Free the precise resident clover field.
// NOTE(review): line 1456 was dropped from this extract — presumably a call
// that frees the sloppy clover copies first (e.g. `freeSloppyCloverQuda();`);
// confirm against the upstream source.
 1453 void freeCloverQuda(void)
 1454 {
 1455  if (!initialized) errorQuda("QUDA not initialized");
 1457  if (cloverPrecise) delete cloverPrecise;
 1458  cloverPrecise = nullptr;
 1459 }
1460 
1461 void flushChronoQuda(int i)
1462 {
1463  if (i >= QUDA_MAX_CHRONO)
1464  errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO);
1465 
1466  auto &basis = chronoResident[i];
1467 
1468  for (auto v : basis) {
1469  if (v) delete v;
1470  }
1471  basis.clear();
1472 }
1473 
// Tear down the library: free all resident fields, chrono bases, solution
// and momentum caches, shut down blas and comms, persist the tuning cache
// and profile data, and print per-phase profiles at QUDA_SUMMARIZE verbosity.
// Safe to call when init never happened (early return on !initialized).
// NOTE(review): doxygen extract — lines 1490-1494, 1497-1499, 1504, 1510,
// 1552, and 1558 were dropped (e.g. momResident is deleted below but the
// line nulling it is missing); restore from the upstream source.
 1474 void endQuda(void)
 1475 {
 1476  profileEnd.TPSTART(QUDA_PROFILE_TOTAL);
 1477 
 1478  if (!initialized) return;
 1479 
 1480  freeGaugeQuda();
 1481  freeCloverQuda();
 1482 
 1483  for (int i = 0; i < QUDA_MAX_CHRONO; i++) flushChronoQuda(i);
 1484 
 1485  for (auto v : solutionResident) if (v) delete v;
 1486  solutionResident.clear();
 1487 
 1488  if(momResident) delete momResident;
 1489 
 1492 
 1495  blas::destroy();
 1496 
 1499 
 1500  host_free(num_failures_h);
 1501  num_failures_h = nullptr;
 1502  num_failures_d = nullptr;
 1503 
 1505 
 1506  saveTuneCache();
 1507  saveProfile();
 1508 
 1509  // flush any outstanding force monitoring (if enabled)
 1511 
 1512  initialized = false;
 1513 
 1514  comm_finalize();
 1515  comms_initialized = false;
 1516 
 1517  profileEnd.TPSTOP(QUDA_PROFILE_TOTAL);
 1518  profileInit2End.TPSTOP(QUDA_PROFILE_TOTAL);
 1519 
 1520  // print out the profile information of the lifetime of the library
 1521  if (getVerbosity() >= QUDA_SUMMARIZE) {
 1522  profileInit.Print();
 1523  profileGauge.Print();
 1524  profileClover.Print();
 1525  profileDslash.Print();
 1526  profileInvert.Print();
 1527  profileInvertMultiSrc.Print();
 1528  profileMulti.Print();
 1529  profileEigensolve.Print();
 1530  profileFatLink.Print();
 1531  profileGaugeForce.Print();
 1532  profileGaugeUpdate.Print();
 1533  profileExtendedGauge.Print();
 1534  profileCloverForce.Print();
 1535  profileStaggeredForce.Print();
 1536  profileHISQForce.Print();
 1537  profileContract.Print();
 1538  profileBLAS.Print();
 1539  profileCovDev.Print();
 1540  profilePlaq.Print();
 1541  profileGaugeObs.Print();
 1542  profileAPE.Print();
 1543  profileSTOUT.Print();
 1544  profileOvrImpSTOUT.Print();
 1545  profileWFlow.Print();
 1546  profileProject.Print();
 1547  profilePhase.Print();
 1548  profileMomAction.Print();
 1549  profileEnd.Print();
 1550 
 1551  profileInit2End.Print();
 1553 
 1554  printLaunchTimer();
 1555  printAPIProfile();
 1556 
 1557  printfQuda("\n");
 1559  printfQuda("\n");
 1560  }
 1561 
 1562  assertAllMemFree();
 1563 
 1564  device::destroy();
 1565 }
1566 
1567 
1568 namespace quda {
1569 
  // Translate the user-facing QudaInvertParam into the internal DiracParam
  // used to construct the precise Dirac operator: select the operator type
  // from dslash_type (PC variant when pc is true), copy masses/kappa/5d
  // parameters, bind the resident precise gauge/clover fields, enable comms
  // in all four dimensions, and verify the gauge precision matches
  // inv_param->cuda_prec.
  // NOTE(review): doxygen extract — several `case QUDA_..._DSLASH:` labels
  // and assignment lines were dropped as hyperlinks (e.g. lines 1581,
  // 1584-1588, 1595, 1599, 1612, 1639-1641, 1672), so some branches below
  // appear label-less; restore from the upstream source.
 1570  void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
 1571  {
 1572  double kappa = inv_param->kappa;
 1575  }
 1576 
 1577  switch (inv_param->dslash_type) {
 1578  case QUDA_WILSON_DSLASH:
 1579  diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC;
 1580  break;
 1582  diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC;
 1583  break;
 1586  break;
 1589  diracParam.Ls = inv_param->Ls;
 1590  break;
 1593  diracParam.Ls = inv_param->Ls;
 1594  break;
 1596  if (inv_param->Ls > QUDA_MAX_DWF_LS) {
 1597  errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS);
 1598  }
 1600  diracParam.Ls = inv_param->Ls;
// Interface passes b_5/c_5 as C99 double _Complex; the memcpy below is only
// valid if the internal Complex type has an identical layout.
 1601  if (sizeof(Complex) != sizeof(double _Complex)) {
 1602  errorQuda("Irreconcilable difference between interface and internal complex number conventions");
 1603  }
 1604  memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls);
 1605  memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls);
 1606  diracParam.eofa_shift = inv_param->eofa_shift;
 1607  diracParam.eofa_pm = inv_param->eofa_pm;
 1608  diracParam.mq1 = inv_param->mq1;
 1609  diracParam.mq2 = inv_param->mq2;
 1610  diracParam.mq3 = inv_param->mq3;
 1611  break;
 1613  if (inv_param->Ls > QUDA_MAX_DWF_LS)
 1614  errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS);
 1616  diracParam.Ls = inv_param->Ls;
 1617  if (sizeof(Complex) != sizeof(double _Complex)) {
 1618  errorQuda("Irreconcilable difference between interface and internal complex number conventions");
 1619  }
 1620  memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls);
 1621  memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls);
 1622  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
 1623  printfQuda("Printing b_5 and c_5 values\n");
 1624  for (int i = 0; i < diracParam.Ls; i++) {
 1625  printfQuda("fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n", i, diracParam.b_5[i].real(),
 1626  diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(), diracParam.c_5[i].imag());
 1627  // printfQuda("fromQUDA inv_param: b5[%d] = %f %f c5[%d] = %f %f\n", i, inv_param->b_5[i], i,
 1628  // inv_param->c_5[i] ); printfQuda("fromQUDA creal: b5[%d] = %f %f c5[%d] = %f %f \n", i,
 1629  // creal(inv_param->b_5[i]), cimag(inv_param->b_5[i]), i, creal(inv_param->c_5[i]), cimag(inv_param->c_5[i]) );
 1630  }
 1631  }
 1632  break;
 1633  case QUDA_STAGGERED_DSLASH:
 1634  diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
 1635  break;
 1636  case QUDA_ASQTAD_DSLASH:
 1637  diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC;
 1638  break;
 1642  diracParam.Ls = 1;
 1643  diracParam.epsilon = 0.0;
 1644  } else {
 1645  diracParam.Ls = 2;
 1647  }
 1648  break;
 1652  diracParam.Ls = 1;
 1653  diracParam.epsilon = 0.0;
 1654  } else {
 1655  diracParam.Ls = 2;
 1657  }
 1658  break;
 1659  case QUDA_LAPLACE_DSLASH:
 1661  diracParam.laplace3D = inv_param->laplace3D;
 1662  break;
 1663  case QUDA_COVDEV_DSLASH:
 1664  diracParam.type = QUDA_GAUGE_COVDEV_DIRAC;
 1665  break;
 1666  default:
 1667  errorQuda("Unsupported dslash_type %d", inv_param->dslash_type);
 1668  }
 1669 
 1670  diracParam.matpcType = inv_param->matpc_type;
 1671  diracParam.dagger = inv_param->dagger;
 1673  diracParam.fatGauge = gaugeFatPrecise;
 1674  diracParam.longGauge = gaugeLongPrecise;
 1675  diracParam.clover = cloverPrecise;
 1676  diracParam.kappa = kappa;
 1677  diracParam.mass = inv_param->mass;
 1678  diracParam.m5 = inv_param->m5;
 1679  diracParam.mu = inv_param->mu;
 1680 
 1681  for (int i=0; i<4; i++) diracParam.commDim[i] = 1; // comms are always on
 1682 
 1683  if (diracParam.gauge->Precision() != inv_param->cuda_prec)
 1684  errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
 1685  inv_param->cuda_prec);
 1686  }
1687 
1688 
  // As setDiracParam, but rebind the field pointers to the sloppy-precision
  // copies and check against inv_param->cuda_prec_sloppy.
  // NOTE(review): line 1693 (presumably `diracParam.gauge = gaugeSloppy;`)
  // and line 1704 (the second errorQuda argument line) were dropped from
  // this extract.
 1689  void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
 1690  {
 1691  setDiracParam(diracParam, inv_param, pc);
 1692 
 1694  diracParam.fatGauge = gaugeFatSloppy;
 1695  diracParam.longGauge = gaugeLongSloppy;
 1696  diracParam.clover = cloverSloppy;
 1697 
 1698  for (int i=0; i<4; i++) {
 1699  diracParam.commDim[i] = 1; // comms are always on
 1700  }
 1701 
 1702  if (diracParam.gauge->Precision() != inv_param->cuda_prec_sloppy)
 1703  errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
 1705  }
  // As setDiracParam, but rebind the field pointers to the refinement
  // (second-sloppy) copies and check against cuda_prec_refinement_sloppy.
  // NOTE(review): line 1711 (presumably `diracParam.gauge = gaugeRefinement;`)
  // and line 1722 (the second errorQuda argument line) were dropped from
  // this extract.
 1707  void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
 1708  {
 1709  setDiracParam(diracParam, inv_param, pc);
 1710 
 1712  diracParam.fatGauge = gaugeFatRefinement;
 1713  diracParam.longGauge = gaugeLongRefinement;
 1714  diracParam.clover = cloverRefinement;
 1715 
 1716  for (int i=0; i<4; i++) {
 1717  diracParam.commDim[i] = 1; // comms are always on
 1718  }
 1719 
 1720  if (diracParam.gauge->Precision() != inv_param->cuda_prec_refinement_sloppy)
 1721  errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
 1723  }
1724 
 1725  // The preconditioner currently mimicks the sloppy operator with no comms
  // As setDiracParam, but rebind to the preconditioner-precision fields
  // (or the extended fields when overlapping domains are requested) and set
  // commDim from the `comms` flag. Checks against cuda_prec_precondition.
  // NOTE(review): lines 1731, 1735, 1746-1747, and 1754 were dropped from
  // this extract (presumably the gauge-pointer assignments and the
  // staggered-preconditioner condition header).
 1726  void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
 1727  {
 1728  setDiracParam(diracParam, inv_param, pc);
 1729 
 1730  if (inv_param->overlap) {
 1732  diracParam.fatGauge = gaugeFatExtended;
 1733  diracParam.longGauge = gaugeLongExtended;
 1734  } else {
 1736  diracParam.fatGauge = gaugeFatPrecondition;
 1737  diracParam.longGauge = gaugeLongPrecondition;
 1738  }
 1739  diracParam.clover = cloverPrecondition;
 1740 
 1741  for (int i=0; i<4; i++) {
 1742  diracParam.commDim[i] = comms ? 1 : 0;
 1743  }
 1744 
 1745  // In the preconditioned staggered CG allow a different dslash type in the preconditioning
 1748  diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
 1749  diracParam.gauge = gaugeFatPrecondition;
 1750  }
 1751 
 1752  if (diracParam.gauge->Precision() != inv_param->cuda_prec_precondition)
 1753  errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
 1755  }
1756 
 1757  // The deflation preconditioner currently mimicks the sloppy operator with no comms
  // As setDiracParam, but rebind to the eigensolver-precision fields (or
  // the extended fields when overlapping domains are requested) and set
  // commDim from the `comms` flag. Checks against cuda_prec_eigensolver.
  // NOTE(review): lines 1763, 1767, 1776-1777, and 1784 were dropped from
  // this extract (presumably the gauge-pointer assignments and the
  // staggered condition header).
 1758  void setDiracEigParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
 1759  {
 1760  setDiracParam(diracParam, inv_param, pc);
 1761 
 1762  if (inv_param->overlap) {
 1764  diracParam.fatGauge = gaugeFatExtended;
 1765  diracParam.longGauge = gaugeLongExtended;
 1766  } else {
 1768  diracParam.fatGauge = gaugeFatEigensolver;
 1769  diracParam.longGauge = gaugeLongEigensolver;
 1770  }
 1771  diracParam.clover = cloverEigensolver;
 1772 
 1773  for (int i = 0; i < 4; i++) { diracParam.commDim[i] = comms ? 1 : 0; }
 1774 
 1775  // In the deflated staggered CG allow a different dslash type
 1778  diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
 1779  diracParam.gauge = gaugeFatEigensolver;
 1780  }
 1781 
 1782  if (diracParam.gauge->Precision() != inv_param->cuda_prec_eigensolver)
 1783  errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(),
 1785  }
1786 
1787  void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
1788  {
1789  DiracParam diracParam;
1790  DiracParam diracSloppyParam;
1791  DiracParam diracPreParam;
1792 
1793  setDiracParam(diracParam, &param, pc_solve);
1794  setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
1795  // eigCG and deflation need 2 sloppy precisions and do not use Schwarz
1796  bool comms_flag = (param.schwarz_type != QUDA_INVALID_SCHWARZ) ? false : true;
1797  setDiracPreParam(diracPreParam, &param, pc_solve, comms_flag);
1798 
1799  d = Dirac::create(diracParam); // create the Dirac operator
1800  dSloppy = Dirac::create(diracSloppyParam);
1801  dPre = Dirac::create(diracPreParam);
1802  }
1803 
1804  void createDiracWithRefine(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam &param,
1805  const bool pc_solve)
1806  {
1807  DiracParam diracParam;
1808  DiracParam diracSloppyParam;
1809  DiracParam diracPreParam;
1810  DiracParam diracRefParam;
1811 
1812  setDiracParam(diracParam, &param, pc_solve);
1813  setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
1814  setDiracRefineParam(diracRefParam, &param, pc_solve);
1815  // eigCG and deflation need 2 sloppy precisions and do not use Schwarz
1816  bool comms_flag = (param.inv_type == QUDA_INC_EIGCG_INVERTER || param.eig_param) ? true : false;
1817  setDiracPreParam(diracPreParam, &param, pc_solve, comms_flag);
1818 
1819  d = Dirac::create(diracParam); // create the Dirac operator
1820  dSloppy = Dirac::create(diracSloppyParam);
1821  dPre = Dirac::create(diracPreParam);
1822  dRef = Dirac::create(diracRefParam);
1823  }
1824 
1825  void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam &param,
1826  const bool pc_solve)
1827  {
1828  DiracParam diracParam;
1829  DiracParam diracSloppyParam;
1830  DiracParam diracPreParam;
1831  DiracParam diracEigParam;
1832 
1833  setDiracParam(diracParam, &param, pc_solve);
1834  setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
1835  // eigCG and deflation need 2 sloppy precisions and do not use Schwarz
1836  bool comms_flag = (param.inv_type == QUDA_INC_EIGCG_INVERTER || param.eig_param) ? true : false;
1837  setDiracPreParam(diracPreParam, &param, pc_solve, comms_flag);
1838  setDiracEigParam(diracEigParam, &param, pc_solve, comms_flag);
1839 
1840  d = Dirac::create(diracParam); // create the Dirac operator
1841  dSloppy = Dirac::create(diracSloppyParam);
1842  dPre = Dirac::create(diracPreParam);
1843  dEig = Dirac::create(diracEigParam);
1844  }
1845 
  // Rescale the source vector b (and, for multishift, the resident shifts in
  // param.offset) so that the solver sees the operator in the convention
  // selected by param.mass_normalization. Domain-wall variants use
  // kappa5 = 1/(2(5 + m5)) in place of param.kappa; staggered operators
  // only need rescaling under kappa normalization and return early.
  // NOTE(review): doxygen extract — the case labels at lines 1869-1870,
  // 1890, and 1909 were dropped (by position these are the MATDAG_MAT /
  // MATPCDAG_MATPC solution-type labels); restore from the upstream source.
 1846  void massRescale(cudaColorSpinorField &b, QudaInvertParam &param, bool for_multishift)
 1847  {
 1848 
 1849  double kappa5 = (0.5/(5.0 + param.m5));
 1850  double kappa = (param.dslash_type == QUDA_DOMAIN_WALL_DSLASH || param.dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH
 1851  || param.dslash_type == QUDA_MOBIUS_DWF_DSLASH || param.dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH) ?
 1852  kappa5 :
 1853  param.kappa;
 1854 
 1855  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
 1856  printfQuda("Mass rescale: Kappa is: %g\n", kappa);
 1857  printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);
 1858  double nin = blas::norm2(b);
 1859  printfQuda("Mass rescale: norm of source in = %g\n", nin);
 1860  }
 1861 
 1862  // staggered dslash uses mass normalization internally
 1863  if (param.dslash_type == QUDA_ASQTAD_DSLASH || param.dslash_type == QUDA_STAGGERED_DSLASH) {
 1864  switch (param.solution_type) {
 1865  case QUDA_MAT_SOLUTION:
 1866  case QUDA_MATPC_SOLUTION:
 1867  if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(2.0*param.mass, b);
 1868  break;
 1871  if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(4.0*param.mass*param.mass, b);
 1872  break;
 1873  default:
 1874  errorQuda("Not implemented");
 1875  }
 1876  return;
 1877  }
 1878 
 1879  // multiply the source to compensate for normalization of the Dirac operator, if necessary
 1880  // you are responsible for restoring what's in param.offset
 1881  switch (param.solution_type) {
 1882  case QUDA_MAT_SOLUTION:
 1883  if (param.mass_normalization == QUDA_MASS_NORMALIZATION ||
 1884  param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
 1885  blas::ax(2.0*kappa, b);
 1886  if (for_multishift)
 1887  for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 2.0 * kappa;
 1888  }
 1889  break;
 1891  if (param.mass_normalization == QUDA_MASS_NORMALIZATION ||
 1892  param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
 1893  blas::ax(4.0*kappa*kappa, b);
 1894  if (for_multishift)
 1895  for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa;
 1896  }
 1897  break;
 1898  case QUDA_MATPC_SOLUTION:
 1899  if (param.mass_normalization == QUDA_MASS_NORMALIZATION) {
 1900  blas::ax(4.0*kappa*kappa, b);
 1901  if (for_multishift)
 1902  for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa;
 1903  } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
 1904  blas::ax(2.0*kappa, b);
 1905  if (for_multishift)
 1906  for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 2.0 * kappa;
 1907  }
 1908  break;
 1910  if (param.mass_normalization == QUDA_MASS_NORMALIZATION) {
 1911  blas::ax(16.0*std::pow(kappa,4), b);
 1912  if (for_multishift)
 1913  for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 16.0 * std::pow(kappa, 4);
 1914  } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
 1915  blas::ax(4.0*kappa*kappa, b);
 1916  if (for_multishift)
 1917  for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa;
 1918  }
 1919  break;
 1920  default:
 1921  errorQuda("Solution type %d not supported", param.solution_type);
 1922  }
 1923 
 1924  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n");
 1925  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
 1926  printfQuda("Mass rescale: Kappa is: %g\n", kappa);
 1927  printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);
 1928  double nin = blas::norm2(b);
 1929  printfQuda("Mass rescale: norm of source out = %g\n", nin);
 1930  }
 1931  }
1932 }
1933 
// Apply the single-parity dslash to a host spinor: download h_in, apply the
// operator for the requested parity on the device, and upload the result to
// h_out. Each phase (init/H2D/compute/D2H/free) is timed via profileDslash.
// NOTE(review): doxygen extract — the guard-condition headers (lines
// 1941-1944), a pushVerbosity call implied by the trailing popVerbosity
// (around 1947-1948), the staggered/twisted branch headers (1980-1990),
// and the twisted-clover / 4-d-dslash branch headers (1995, 2000-2001)
// were dropped; restore from the upstream source.
 1934 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
 1935 {
 1936  profileDslash.TPSTART(QUDA_PROFILE_TOTAL);
 1937  profileDslash.TPSTART(QUDA_PROFILE_INIT);
 1938 
// ASQTAD reads the fat links; every other operator reads the standard field.
 1939  const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
 1940 
 1943  errorQuda("Gauge field not allocated");
 1945  errorQuda("Clover field not allocated");
 1946 
 1949 
 1950  ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), true, inv_param->input_location);
 1951  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
 1952  ColorSpinorParam cudaParam(cpuParam, *inv_param);
 1953 
 1954  cpuParam.v = h_out;
 1955  cpuParam.location = inv_param->output_location;
 1956  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
 1957 
 1958  cudaParam.create = QUDA_NULL_FIELD_CREATE;
 1959  cudaColorSpinorField in(*in_h, cudaParam);
 1960  cudaColorSpinorField out(in, cudaParam);
 1961 
 1962  bool pc = true;
 1963  DiracParam diracParam;
 1964  setDiracParam(diracParam, inv_param, pc);
 1965 
 1966  profileDslash.TPSTOP(QUDA_PROFILE_INIT);
 1967 
 1968  profileDslash.TPSTART(QUDA_PROFILE_H2D);
 1969  in = *in_h;
 1970  profileDslash.TPSTOP(QUDA_PROFILE_H2D);
 1971 
 1972  profileDslash.TPSTART(QUDA_PROFILE_COMPUTE);
 1973 
 1974  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
 1975  double cpu = blas::norm2(*in_h);
 1976  double gpu = blas::norm2(in);
 1977  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
 1978  }
 1979 
 1983  blas::ax(1.0/(2.0*inv_param->mass), in);
 1984 
 1986  if (parity == QUDA_EVEN_PARITY) {
 1988  } else {
 1990  }
 1991  blas::ax(gauge.Anisotropy(), in);
 1992  }
 1993 
 1994  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
 1996  cudaParam.create = QUDA_NULL_FIELD_CREATE;
 1997  cudaColorSpinorField tmp1(in, cudaParam);
 1998  ((DiracTwistedCloverPC*) dirac)->TwistCloverInv(tmp1, in, (parity+1)%2); // apply the clover-twist
 1999  dirac->Dslash(out, tmp1, parity); // apply the operator
 2002  dirac->Dslash4(out, in, parity);
 2003  } else {
 2004  dirac->Dslash(out, in, parity); // apply the operator
 2005  }
 2006  profileDslash.TPSTOP(QUDA_PROFILE_COMPUTE);
 2007 
 2008  profileDslash.TPSTART(QUDA_PROFILE_D2H);
 2009  *out_h = out;
 2010  profileDslash.TPSTOP(QUDA_PROFILE_D2H);
 2011 
 2012  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
 2013  double cpu = blas::norm2(*out_h);
 2014  double gpu = blas::norm2(out);
 2015  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
 2016  }
 2017 
 2018  profileDslash.TPSTART(QUDA_PROFILE_FREE);
 2019  delete dirac; // clean up
 2020 
 2021  delete out_h;
 2022  delete in_h;
 2023  profileDslash.TPSTOP(QUDA_PROFILE_FREE);
 2024 
 2025  popVerbosity();
 2026  profileDslash.TPSTOP(QUDA_PROFILE_TOTAL);
 2027 }
2028 
// Apply the full Dirac matrix M to a host spinor: download h_in, apply M on
// the device (preconditioned form when solution_type requests it), undo the
// kappa normalization where required, and upload the result to h_out.
// NOTE(review): doxygen extract — a pushVerbosity call (around line 2031),
// the gauge/clover guard headers (2035-2040), the second half of the `pc`
// condition (2043), and the mass-normalization branch headers inside the
// kappa-rescale section (2069, 2071, 2075-2076) were dropped; restore from
// the upstream source.
 2029 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
 2030 {
 2032 
 2033  const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
 2034 
 2037  errorQuda("Gauge field not allocated");
 2039  errorQuda("Clover field not allocated");
 2041 
 2042  bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
 2044 
 2045  ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), pc, inv_param->input_location);
 2046  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
 2047 
 2048  ColorSpinorParam cudaParam(cpuParam, *inv_param);
 2049  cudaColorSpinorField in(*in_h, cudaParam);
 2050 
 2051  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
 2052  double cpu = blas::norm2(*in_h);
 2053  double gpu = blas::norm2(in);
 2054  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
 2055  }
 2056 
 2057  cudaParam.create = QUDA_NULL_FIELD_CREATE;
 2058  cudaColorSpinorField out(in, cudaParam);
 2059 
 2060  DiracParam diracParam;
 2061  setDiracParam(diracParam, inv_param, pc);
 2062 
 2063  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
 2064  dirac->M(out, in); // apply the operator
 2065  delete dirac; // clean up
 2066 
// Undo the kappa normalization: 1/(4 kappa^2) for the preconditioned
// operator, 1/(2 kappa) otherwise (branch headers dropped, see NOTE above).
 2067  double kappa = inv_param->kappa;
 2068  if (pc) {
 2070  blas::ax(0.25/(kappa*kappa), out);
 2072  blas::ax(0.5/kappa, out);
 2073  }
 2074  } else {
 2077  blas::ax(0.5/kappa, out);
 2078  }
 2079  }
 2080 
 2081  cpuParam.v = h_out;
 2082  cpuParam.location = inv_param->output_location;
 2083  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
 2084  *out_h = out;
 2085 
 2086  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
 2087  double cpu = blas::norm2(*out_h);
 2088  double gpu = blas::norm2(out);
 2089  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
 2090  }
 2091 
 2092  delete out_h;
 2093  delete in_h;
 2094 
 2095  popVerbosity();
 2096 }
2097 
2098 
// Apply M^dag M to a host spinor: download h_in, apply the normal operator
// on the device, undo the kappa normalization (powers of 2*kappa, squared
// relative to MatQuda since the operator is applied twice), and upload to
// h_out.
// NOTE(review): doxygen extract — a pushVerbosity call (around line 2101),
// the gauge/clover guard headers (2105-2110), the second half of the `pc`
// condition (2113), and the mass-normalization branch headers in the
// rescale section (2142, 2144, 2148-2149) were dropped; restore from the
// upstream source.
 2099 void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
 2100 {
 2102 
 2103  const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
 2104 
 2107  errorQuda("Gauge field not allocated");
 2109  errorQuda("Clover field not allocated");
 2111 
 2112  bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
 2114 
 2115  ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), pc, inv_param->input_location);
 2116  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
 2117 
 2118  ColorSpinorParam cudaParam(cpuParam, *inv_param);
 2119  cudaColorSpinorField in(*in_h, cudaParam);
 2120 
 2121  if (getVerbosity() >= QUDA_DEBUG_VERBOSE){
 2122  double cpu = blas::norm2(*in_h);
 2123  double gpu = blas::norm2(in);
 2124  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
 2125  }
 2126 
 2127  cudaParam.create = QUDA_NULL_FIELD_CREATE;
 2128  cudaColorSpinorField out(in, cudaParam);
 2129 
 2130  // double kappa = inv_param->kappa;
 2131  // if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= gaugePrecise->anisotropy;
 2132 
 2133  DiracParam diracParam;
 2134  setDiracParam(diracParam, inv_param, pc);
 2135 
 2136  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
 2137  dirac->MdagM(out, in); // apply the operator
 2138  delete dirac; // clean up
 2139 
 2140  double kappa = inv_param->kappa;
 2141  if (pc) {
 2143  blas::ax(1.0/std::pow(2.0*kappa,4), out);
 2145  blas::ax(0.25/(kappa*kappa), out);
 2146  }
 2147  } else {
 2150  blas::ax(0.25/(kappa*kappa), out);
 2151  }
 2152  }
 2153 
 2154  cpuParam.v = h_out;
 2155  cpuParam.location = inv_param->output_location;
 2156  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
 2157  *out_h = out;
 2158 
 2159  if (getVerbosity() >= QUDA_DEBUG_VERBOSE){
 2160  double cpu = blas::norm2(*out_h);
 2161  double gpu = blas::norm2(out);
 2162  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
 2163  }
 2164 
 2165  delete out_h;
 2166  delete in_h;
 2167 
 2168  popVerbosity();
 2169 }
2170 
 2171 namespace quda
 2172 {
// NOTE(review): the signature at line 2173 was dropped from this extract —
// by the body it is a predicate taking a QudaInvertParam* (presumably
// `bool canReuseResidentGauge(QudaInvertParam *param)`).
// Returns true when the relevant resident gauge field (fat links for
// ASQTAD, standard links otherwise) exists and matches the solve precision.
 2174  {
 2175  if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
 2176  return (gaugePrecise != nullptr) and param->cuda_prec == gaugePrecise->Precision();
 2177  } else {
 2178  return (gaugeFatPrecise != nullptr) and param->cuda_prec == gaugeFatPrecise->Precision();
 2179  }
 2180  }
 2181 } // namespace quda
2182 
 2184 
// NOTE(review): the signature at line 2183 was dropped from this extract
// (presumably `void checkClover(QudaInvertParam *param) {`), as were lines
// 2193-2200 (presumably the precision comparison and loadSloppyCloverQuda
// call implied by the closing brace at 2201). Purpose from what remains:
// for clover-type operators, verify the resident clover fields exist and
// match the requested solve precisions, recreating sloppy copies if needed.
 2185  if (param->dslash_type != QUDA_CLOVER_WILSON_DSLASH && param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) {
 2186  return;
 2187  }
 2188 
 2189  if (param->cuda_prec != cloverPrecise->Precision()) {
 2190  errorQuda("Solve precision %d doesn't match clover precision %d", param->cuda_prec, cloverPrecise->Precision());
 2191  }
 2192 
 2201  }
 2202 
 2203  if (cloverPrecise == nullptr) errorQuda("Precise clover field doesn't exist");
 2204  if (cloverSloppy == nullptr) errorQuda("Sloppy clover field doesn't exist");
 2205  if (cloverPrecondition == nullptr) errorQuda("Precondition clover field doesn't exist");
 2206  if (cloverRefinement == nullptr) errorQuda("Refinement clover field doesn't exist");
 2207  if (cloverEigensolver == nullptr) errorQuda("Eigensolver clover field doesn't exist");
 2208 }
2209 
// NOTE(review): the function signature on the elided line above was dropped by
// the extraction — this body validates the resident gauge fields for the
// requested solve and returns a cudaGaugeField* (likely checkGauge(); verify).
{
  quda::cudaGaugeField *cudaGauge = nullptr;
  if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
    // Wilson-type path: validate the thin gauge field hierarchy.
    if (gaugePrecise == nullptr) errorQuda("Precise gauge field doesn't exist");

    if (param->cuda_prec != gaugePrecise->Precision()) {
      errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugePrecise->Precision());
    }

    // NOTE(review): lines elided here by the extraction — presumably the
    // computation of `precision`/`recon` used in the call below; verify.
    loadSloppyGaugeQuda(precision, recon);
  }

  if (gaugeSloppy == nullptr) errorQuda("Sloppy gauge field doesn't exist");
  if (gaugePrecondition == nullptr) errorQuda("Precondition gauge field doesn't exist");
  if (gaugeRefinement == nullptr) errorQuda("Refinement gauge field doesn't exist");
  // NOTE(review): this checks the eigensolver field but the message says
  // "Refinement" — looks like a copy/paste slip; should read "Eigensolver
  // gauge field doesn't exist" (message left unchanged here).
  if (gaugeEigensolver == nullptr) errorQuda("Refinement gauge field doesn't exist");
  if (param->overlap) {
    if (gaugeExtended == nullptr) errorQuda("Extended gauge field doesn't exist");
  }
  } else {
    // ASQTAD path: validate both the fat-link and long-link field hierarchies.
    if (gaugeFatPrecise == nullptr) errorQuda("Precise gauge fat field doesn't exist");
    if (gaugeLongPrecise == nullptr) errorQuda("Precise gauge long field doesn't exist");

    // NOTE(review): the enclosing precision-mismatch condition for this error
    // was elided by the extraction.
    errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugeFatPrecise->Precision());
    }

    // NOTE(review): lines elided here (sloppy-field setup for the staggered
    // path) — verify against upstream.
    // recon is always no for fat links, so just use long reconstructs here
    loadSloppyGaugeQuda(precision, recon);
  }

  if (gaugeFatSloppy == nullptr) errorQuda("Sloppy gauge fat field doesn't exist");
  if (gaugeFatPrecondition == nullptr) errorQuda("Precondition gauge fat field doesn't exist");
  if (gaugeFatRefinement == nullptr) errorQuda("Refinement gauge fat field doesn't exist");
  if (gaugeFatEigensolver == nullptr) errorQuda("Eigensolver gauge fat field doesn't exist");
  if (param->overlap) {
    if (gaugeFatExtended == nullptr) errorQuda("Extended gauge fat field doesn't exist");
  }

  if (gaugeLongSloppy == nullptr) errorQuda("Sloppy gauge long field doesn't exist");
  if (gaugeLongPrecondition == nullptr) errorQuda("Precondition gauge long field doesn't exist");
  if (gaugeLongRefinement == nullptr) errorQuda("Refinement gauge long field doesn't exist");
  if (gaugeLongEigensolver == nullptr) errorQuda("Eigensolver gauge long field doesn't exist");
  if (param->overlap) {
    if (gaugeLongExtended == nullptr) errorQuda("Extended gauge long field doesn't exist");
  }
  }

  // The clover field must also be consistent with the requested solve.
  checkClover(param);

  return cudaGauge;
}
2288 
// Apply the clover term (or its inverse, when `inverse` is non-zero) of the
// clover-improved Wilson / twisted-clover operator to the host field h_in,
// writing the result to h_out for the given checkerboard parity.
// NOTE(review): several interior lines of this function were elided by the
// extraction (marked below) — verify against upstream before relying on the
// exact control flow.
void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
{
  // [elided line: presumably pushVerbosity(inv_param->verbosity) — matches the
  //  popVerbosity() at the end of this function]

  if (!initialized) errorQuda("QUDA not initialized");
  if (gaugePrecise == nullptr) errorQuda("Gauge field not allocated");
  if (cloverPrecise == nullptr) errorQuda("Clover field not allocated");

  // [elided lines: the guard condition for the error below was dropped]
  errorQuda("Cannot apply the clover term for a non Wilson-clover or Twisted-mass-clover dslash");

  // Wrap the host input pointer as a single-parity CPU spinor field.
  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), true);

  // [elided line: the declaration head of in_h (location-dependent ternary)]
  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

  // Download the source to the device.
  ColorSpinorParam cudaParam(cpuParam, *inv_param);
  cudaColorSpinorField in(*in_h, cudaParam);

  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
    double cpu = blas::norm2(*in_h);
    double gpu = blas::norm2(in);
    printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
  }

  cudaParam.create = QUDA_NULL_FIELD_CREATE;
  cudaColorSpinorField out(in, cudaParam);

  // [elided lines: the bodies of this parity-dependent block were dropped]
  if (parity == QUDA_EVEN_PARITY) {
  } else {
  }
  }
  bool pc = true;

  DiracParam diracParam;
  setDiracParam(diracParam, inv_param, pc);
  //FIXME: Do we need this for twisted clover???
  DiracCloverPC dirac(diracParam); // create the Dirac operator
  if (!inverse) dirac.Clover(out, in, parity); // apply the clover operator
  else dirac.CloverInv(out, in, parity);

  // Upload the result into a wrapper around the host output pointer.
  cpuParam.v = h_out;
  cpuParam.location = inv_param->output_location;
  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
  *out_h = out;

  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
    double cpu = blas::norm2(*out_h);
    double gpu = blas::norm2(out);
    printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
  }

  /*for (int i=0; i<in_h->Volume(); i++) {
    ((cpuColorSpinorField*)out_h)->PrintVector(i);
  }*/

  delete out_h;
  delete in_h;

  popVerbosity();
}
2357 
// Compute eigenpairs of the Dirac operator selected by eig_param (M, g5M,
// Mdag, MdagM, or MMdag depending on use_norm_op/use_dagger/compute_gamma5),
// returning converged eigenvectors in host_evecs and eigenvalues in
// host_evals. Dispatches either to the native GPU eigensolver or to ARPACK
// when eig_param->arpack_check is set.
// NOTE(review): a few interior lines were elided by the extraction (marked
// below) — in particular the declaration of `pc_solve` and of `cudaGauge`.
void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param)
{
  profileEigensolve.TPSTART(QUDA_PROFILE_TOTAL);
  profileEigensolve.TPSTART(QUDA_PROFILE_INIT);

  // Transfer the inv param structure contained in eig_param
  QudaInvertParam *inv_param = eig_param->invert_param;

  // [elided lines: presumably a dslash-type condition guarding this call]
  setKernelPackT(true);

  if (!initialized) errorQuda("QUDA not initialized");

  // [elided line: presumably pushVerbosity(...) — matches popVerbosity() below]
  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
    // [elided line: presumably printQudaInvertParam(inv_param)]
    printQudaEigParam(eig_param);
  }

  checkInvertParam(inv_param);
  checkEigParam(eig_param);

  // [elided lines: presumably `cudaGaugeField *cudaGauge = checkGauge(...)`
  //  and the `pc_solve` definition used below — verify against upstream]

  // Reset cumulative timing/flop/iteration counters for this solve.
  inv_param->secs = 0;
  inv_param->gflops = 0;
  inv_param->iter = 0;

  // Define problem matrix
  //------------------------------------------------------
  Dirac *d = nullptr;
  Dirac *dSloppy = nullptr;
  Dirac *dPre = nullptr;

  // Create the dirac operator with a sloppy and a precon.
  createDirac(d, dSloppy, dPre, *inv_param, pc_solve);
  Dirac &dirac = *d;

  // Create device side ColorSpinorField vector space and to pass to the
  // compute function.
  const int *X = cudaGauge->X();
  ColorSpinorParam cpuParam(host_evecs[0], *inv_param, X, inv_param->solution_type, inv_param->input_location);

  // create wrappers around application vector set
  std::vector<ColorSpinorField *> host_evecs_;
  for (int i = 0; i < eig_param->n_conv; i++) {
    cpuParam.v = host_evecs[i];
    host_evecs_.push_back(ColorSpinorField::Create(cpuParam));
  }

  ColorSpinorParam cudaParam(cpuParam);
  cudaParam.location = QUDA_CUDA_FIELD_LOCATION;
  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
  // Ensure device vectors are in UKQCD basis for Wilson type fermions
  if (cudaParam.nSpin != 1) cudaParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;

  std::vector<Complex> evals(eig_param->n_conv, 0.0);
  std::vector<ColorSpinorField *> kSpace;
  for (int i = 0; i < eig_param->n_conv; i++) { kSpace.push_back(ColorSpinorField::Create(cudaParam)); }

  // If you attempt to compute part of the imaginary spectrum of a symmetric matrix,
  // the solver will fail.
  // [elided lines: the tail of this compound condition was dropped by the
  //  extraction]
  if ((eig_param->spectrum == QUDA_SPECTRUM_LI_EIG || eig_param->spectrum == QUDA_SPECTRUM_SI_EIG)
      && ((eig_param->use_norm_op || (inv_param->dslash_type == QUDA_LAPLACE_DSLASH))
    errorQuda("Cannot compute imaginary spectra with a hermitian operator");
  }

  // Gamma5 pre-multiplication is only supported for the M type operator
  if (eig_param->compute_gamma5) {
    if (eig_param->use_norm_op || eig_param->use_dagger) {
      errorQuda("gamma5 premultiplication is only supported for M type operators: dag = %s, normop = %s",
                eig_param->use_dagger ? "true" : "false", eig_param->use_norm_op ? "true" : "false");
    }
  }

  profileEigensolve.TPSTOP(QUDA_PROFILE_INIT);

  // Dispatch on the requested operator variant; each branch either runs the
  // ARPACK cross-check path or the native QUDA eigensolver.
  if (!eig_param->use_norm_op && !eig_param->use_dagger && eig_param->compute_gamma5) {
    DiracG5M m(dirac);
    if (eig_param->arpack_check) {
      arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
    } else {
      EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
      (*eig_solve)(kSpace, evals);
      delete eig_solve;
    }
  } else if (!eig_param->use_norm_op && !eig_param->use_dagger && !eig_param->compute_gamma5) {
    DiracM m(dirac);
    if (eig_param->arpack_check) {
      arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
    } else {
      EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
      (*eig_solve)(kSpace, evals);
      delete eig_solve;
    }
  } else if (!eig_param->use_norm_op && eig_param->use_dagger) {
    DiracMdag m(dirac);
    if (eig_param->arpack_check) {
      arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
    } else {
      EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
      (*eig_solve)(kSpace, evals);
      delete eig_solve;
    }
  } else if (eig_param->use_norm_op && !eig_param->use_dagger) {
    DiracMdagM m(dirac);
    if (eig_param->arpack_check) {
      arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
    } else {
      EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
      (*eig_solve)(kSpace, evals);
      delete eig_solve;
    }
  } else if (eig_param->use_norm_op && eig_param->use_dagger) {
    DiracMMdag m(dirac);
    if (eig_param->arpack_check) {
      arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
    } else {
      EigenSolver *eig_solve = EigenSolver::create(eig_param, m, profileEigensolve);
      (*eig_solve)(kSpace, evals);
      delete eig_solve;
    }
  } else {
    errorQuda("Invalid use_norm_op and dagger combination");
  }

  // Copy eigen values back
  for (int i = 0; i < eig_param->n_conv; i++) { memcpy(host_evals + i, &evals[i], sizeof(Complex)); }

  // Transfer Eigenpairs back to host if using GPU eigensolver. The copy
  // will automatically rotate from device UKQCD gamma basis to the
  // host side gamma basis.
  if (!(eig_param->arpack_check)) {
    profileEigensolve.TPSTART(QUDA_PROFILE_D2H);
    for (int i = 0; i < eig_param->n_conv; i++) *host_evecs_[i] = *kSpace[i];
    profileEigensolve.TPSTOP(QUDA_PROFILE_D2H);
  }

  // Free host wrappers, Dirac operators and device Krylov space.
  profileEigensolve.TPSTART(QUDA_PROFILE_FREE);
  for (int i = 0; i < eig_param->n_conv; i++) delete host_evecs_[i];
  delete d;
  delete dSloppy;
  delete dPre;
  for (int i = 0; i < eig_param->n_conv; i++) delete kSpace[i];
  profileEigensolve.TPSTOP(QUDA_PROFILE_FREE);

  popVerbosity();

  // cache is written out even if a long benchmarking job gets interrupted
  saveTuneCache();

  profileEigensolve.TPSTOP(QUDA_PROFILE_TOTAL);
}
2517 
// NOTE(review): the constructor signature (multigrid_solver taking a
// QudaMultigridParam& and a TimeProfile&) was elided by the extraction —
// verify against upstream. This constructor validates the MG parameters,
// builds the three fine-grid Dirac operators (residual, smoother, sloppy
// smoother), allocates the near-null vector set B, and constructs the MG
// preconditioner hierarchy.
  : profile(profile) {
  profile.TPSTART(QUDA_PROFILE_INIT);
  QudaInvertParam *param = mg_param.invert_param;
  // set whether we are going to use native or generic blas
  blas_lapack::set_native(param->native_blas_lapack);

  checkMultigridParam(&mg_param);
  // [elided lines: presumably the checkGauge(...) call producing `cudaGauge`
  //  used further below]

  // check MG params (needs to go somewhere else)
  if (mg_param.n_level > QUDA_MAX_MG_LEVEL)
    errorQuda("Requested MG levels %d greater than allowed maximum %d", mg_param.n_level, QUDA_MAX_MG_LEVEL);
  for (int i=0; i<mg_param.n_level; i++) {
    // [elided line: the per-level smoother-solve-type condition guarding this
    //  error was dropped by the extraction]
    errorQuda("Unsupported smoother solve type %d on level %d", mg_param.smoother_solve_type[i], i);
  }
  if (param->solve_type != QUDA_DIRECT_SOLVE)
    errorQuda("Outer MG solver can only use QUDA_DIRECT_SOLVE at present");

  // Reset cumulative counters for this setup.
  mg_param.secs = 0;
  mg_param.gflops = 0;

  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
    (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);

  bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
    (param->solve_type == QUDA_NORMOP_PC_SOLVE);

  // create the dirac operators for the fine grid

  // this is the Dirac operator we use for inter-grid residual computation
  DiracParam diracParam;
  setDiracSloppyParam(diracParam, param, outer_pc_solve);
  d = Dirac::create(diracParam);
  m = new DiracM(*d);

  // this is the Dirac operator we use for smoothing
  DiracParam diracSmoothParam;
  bool fine_grid_pc_solve = (mg_param.smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE) ||
    (mg_param.smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE);
  setDiracSloppyParam(diracSmoothParam, param, fine_grid_pc_solve);
  diracSmoothParam.halo_precision = mg_param.smoother_halo_precision[0];
  dSmooth = Dirac::create(diracSmoothParam);
  mSmooth = new DiracM(*dSmooth);

  // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this)
  DiracParam diracSmoothSloppyParam;
  setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve,
                   mg_param.smoother_schwarz_type[0] == QUDA_INVALID_SCHWARZ ? true : false);
  diracSmoothSloppyParam.halo_precision = mg_param.smoother_halo_precision[0];

  dSmoothSloppy = Dirac::create(diracSmoothSloppyParam);
  // [elided line: presumably `mSmoothSloppy = new DiracM(*dSmoothSloppy);` —
  //  mSmoothSloppy is consumed by the MGParam constructed below; verify]

  if (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION)
    errorQuda("MG setup location %d disabled", mg_param.setup_location[0]);
  ColorSpinorParam csParam(nullptr, *param, cudaGauge->X(), pc_solution, mg_param.setup_location[0]);
  // [elided line: presumably csParam.create = QUDA_NULL_FIELD_CREATE]
  // Null-space vectors must be at least single precision on the host.
  QudaPrecision Bprec = mg_param.precision_null[0];
  Bprec = (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION && Bprec < QUDA_SINGLE_PRECISION ? QUDA_SINGLE_PRECISION : Bprec);
  csParam.setPrecision(Bprec, Bprec, true);
  B.resize(mg_param.n_vec[0]);

  if (mg_param.transfer_type[0] == QUDA_TRANSFER_COARSE_KD || mg_param.transfer_type[0] == QUDA_TRANSFER_OPTIMIZED_KD) {
    // Create the ColorSpinorField as a "container" for metadata.

    // These never get accessed, `nullptr` on its own leads to an error in texture binding
    csParam.v = (void *)std::numeric_limits<uint64_t>::max();
    csParam.norm = (void *)std::numeric_limits<uint64_t>::max();
  }

  for (int i = 0; i < mg_param.n_vec[0]; i++) { B[i] = ColorSpinorField::Create(csParam); }

  // fill out the MG parameters for the fine level
  mgParam = new MGParam(mg_param, B, m, mSmooth, mSmoothSloppy);

  mg = new MG(*mgParam, profile);

  // cache is written out even if a long benchmarking job gets interrupted
  saveTuneCache();
  profile.TPSTOP(QUDA_PROFILE_INIT);
}
2606 
// NOTE(review): the function signature on the elided line above was dropped by
// the extraction — this is the public entry point that allocates a
// multigrid_solver and returns it as an opaque handle (newMultigridQuda;
// verify). The handle is later consumed by updateMultigridQuda /
// dumpMultigridQuda and released by destroyMultigridQuda.
  profilerStart(__func__);

  pushVerbosity(mg_param->invert_param->verbosity);

  // The whole MG setup is timed under the invert profile.
  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
  auto *mg = new multigrid_solver(*mg_param, profileInvert);
  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);

  saveTuneCache();

  popVerbosity();

  profilerStop(__func__);
  return static_cast<void*>(mg);
}
2623 
2624 void destroyMultigridQuda(void *mg) {
2625  delete static_cast<multigrid_solver*>(mg);
2626 }
2627 
// Refresh an existing multigrid preconditioner after the gauge (and possibly
// clover) fields have changed. A "thin" update only repoints the fine-grid
// Dirac operators at the new resident fields; a full update rebuilds the
// fine-grid operators and resets the MG hierarchy.
// NOTE(review): one continuation line of an updateFields(...) call was elided
// by the extraction (marked below).
void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
{
  profilerStart(__func__);

  pushVerbosity(mg_param->invert_param->verbosity);

  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
  profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE);

  auto *mg = static_cast<multigrid_solver*>(mg_);
  checkMultigridParam(mg_param);

  QudaInvertParam *param = mg_param->invert_param;
  // check the gauge fields have been created and set the precision as needed
  checkGauge(param);

  // for reporting level 1 is the fine level but internally use level 0 for indexing
  // sprintf(mg->prefix,"MG level 1 (%s): ", param.location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU" );
  // setOutputPrefix(prefix);
  setOutputPrefix("MG level 1 (GPU): "); //fix me

  // Check if we're doing a thin update only
  if (mg_param->thin_update_only) {
    // FIXME: add support for updating kappa, mu as appropriate

    // FIXME: assumes gauge parameters haven't changed.
    // These routines will set gauge = gaugeFat for DiracImprovedStaggered
    mg->d->updateFields(gaugeSloppy, gaugeFatSloppy, gaugeLongSloppy, cloverSloppy);
    mg->d->setMass(param->mass);

    mg->dSmooth->updateFields(gaugeSloppy, gaugeFatSloppy, gaugeLongSloppy, cloverSloppy);
    mg->dSmooth->setMass(param->mass);

    if (mg->dSmoothSloppy != mg->dSmooth) {
      if (param->overlap) {
        mg->dSmoothSloppy->updateFields(gaugeExtended, gaugeFatExtended, gaugeLongExtended, cloverPrecondition);
      } else {
        // [elided line: the final argument of this call (presumably
        //  cloverPrecondition);) was dropped by the extraction]
        mg->dSmoothSloppy->updateFields(gaugePrecondition, gaugeFatPrecondition, gaugeLongPrecondition,
      }
      mg->dSmoothSloppy->setMass(param->mass);
    }
    // The above changes are propagated internally by use of references, pointers, etc, so
    // no further updates are needed.

  } else {

    bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);

    // free the previous dirac operators
    if (mg->m) delete mg->m;
    if (mg->mSmooth) delete mg->mSmooth;
    if (mg->mSmoothSloppy) delete mg->mSmoothSloppy;

    if (mg->d) delete mg->d;
    if (mg->dSmooth) delete mg->dSmooth;
    if (mg->dSmoothSloppy && mg->dSmoothSloppy != mg->dSmooth) delete mg->dSmoothSloppy;

    // create new fine dirac operators

    // this is the Dirac operator we use for inter-grid residual computation
    DiracParam diracParam;
    setDiracSloppyParam(diracParam, param, outer_pc_solve);
    mg->d = Dirac::create(diracParam);
    mg->m = new DiracM(*(mg->d));

    // this is the Dirac operator we use for smoothing
    DiracParam diracSmoothParam;
    bool fine_grid_pc_solve = (mg_param->smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE)
      || (mg_param->smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE);
    setDiracSloppyParam(diracSmoothParam, param, fine_grid_pc_solve);
    mg->dSmooth = Dirac::create(diracSmoothParam);
    mg->mSmooth = new DiracM(*(mg->dSmooth));

    // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this)
    DiracParam diracSmoothSloppyParam;
    setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve, true);
    mg->dSmoothSloppy = Dirac::create(diracSmoothSloppyParam);
    ; // (stray empty statement in the original — harmless)
    mg->mSmoothSloppy = new DiracM(*(mg->dSmoothSloppy));

    // Repoint the MG parameter struct at the freshly built operators.
    mg->mgParam->matResidual = mg->m;
    mg->mgParam->matSmooth = mg->mSmooth;
    mg->mgParam->matSmoothSloppy = mg->mSmoothSloppy;

    mg->mgParam->updateInvertParam(*param);
    if (mg->mgParam->mg_global.invert_param != param) mg->mgParam->mg_global.invert_param = param;

    // Rebuild the hierarchy (refresh=true re-generates coarse operators).
    bool refresh = true;
    mg->mg->reset(refresh);
  }

  setOutputPrefix("");

  // cache is written out even if a long benchmarking job gets interrupted
  saveTuneCache();

  profileInvert.TPSTOP(QUDA_PROFILE_PREAMBLE);
  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);

  popVerbosity();

  profilerStop(__func__);
}
2732 
2733 void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
2734 {
2735  profilerStart(__func__);
2736  pushVerbosity(mg_param->invert_param->verbosity);
2737  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
2738 
2739  auto *mg = static_cast<multigrid_solver*>(mg_);
2740  checkMultigridParam(mg_param);
2741  checkGauge(mg_param->invert_param);
2742 
2743  mg->mg->dumpNullVectors();
2744 
2745  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
2746  popVerbosity();
2747  profilerStop(__func__);
2748 }
2749 
// NOTE(review): the constructor signature (deflated_solver taking a
// QudaEigParam& and a TimeProfile&) was elided by the extraction — verify
// against upstream. Builds the deflation space for (incremental) eigCG:
// Dirac operator, composite Ritz-vector field RV, and the Deflation object.
  : d(nullptr), m(nullptr), RV(nullptr), deflParam(nullptr), defl(nullptr), profile(profile) {

  QudaInvertParam *param = eig_param.invert_param;

  // Deflation is only meaningful for the (incremental) eigCG inverters.
  if (param->inv_type != QUDA_EIGCG_INVERTER && param->inv_type != QUDA_INC_EIGCG_INVERTER) return;

  profile.TPSTART(QUDA_PROFILE_INIT);

  // [elided line: presumably the checkGauge(...) call producing `cudaGauge`
  //  used below — verify]
  eig_param.secs = 0;
  eig_param.gflops = 0;

  // Build the operator at Ritz precision: full-precision Dirac if it matches
  // the solve precision, otherwise the sloppy variant.
  DiracParam diracParam;
  if(eig_param.cuda_prec_ritz == param->cuda_prec)
  {
    setDiracParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE));
  } else {
    setDiracSloppyParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE));
  }

  const bool pc_solve = (param->solve_type == QUDA_NORMOP_PC_SOLVE);

  d = Dirac::create(diracParam);
  // Normal-op solves deflate MdagM; direct solves deflate M itself.
  m = pc_solve ? static_cast<DiracMatrix*>( new DiracMdagM(*d) ) : static_cast<DiracMatrix*>( new DiracM(*d));

  // Composite field holding all Ritz vectors (n_ev per deflation-grid step).
  ColorSpinorParam ritzParam(nullptr, *param, cudaGauge->X(), pc_solve, eig_param.location);

  ritzParam.create = QUDA_ZERO_FIELD_CREATE;
  ritzParam.is_composite = true;
  ritzParam.is_component = false;
  ritzParam.composite_dim = param->n_ev * param->deflation_grid;
  ritzParam.setPrecision(param->cuda_prec_ritz);

  if (ritzParam.location==QUDA_CUDA_FIELD_LOCATION) {
    ritzParam.setPrecision(param->cuda_prec_ritz, param->cuda_prec_ritz, true); // set native field order
    if (ritzParam.nSpin != 1) ritzParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;

    //select memory location here, by default ritz vectors will be allocated on the device
    //but if not sufficient device memory, then the user may choose mapped type of memory
    ritzParam.mem_type = eig_param.mem_type_ritz;
  } else { //host location
    ritzParam.mem_type = QUDA_MEMORY_PINNED;
  }

  int ritzVolume = 1;
  for(int d = 0; d < ritzParam.nDim; d++) ritzVolume *= ritzParam.x[d];

  if (getVerbosity() == QUDA_DEBUG_VERBOSE) {

    // Rough footprint of the Ritz-vector allocation, for debugging OOMs.
    size_t byte_estimate = (size_t)ritzParam.composite_dim*(size_t)ritzVolume*(ritzParam.nColor*ritzParam.nSpin*ritzParam.Precision());
    printfQuda("allocating bytes: %lu (lattice volume %d, prec %d)", byte_estimate, ritzVolume, ritzParam.Precision());
    if(ritzParam.mem_type == QUDA_MEMORY_DEVICE) printfQuda("Using device memory type.\n");
    else if (ritzParam.mem_type == QUDA_MEMORY_MAPPED)
      printfQuda("Using mapped memory type.\n");
  }

  RV = ColorSpinorField::Create(ritzParam);

  deflParam = new DeflationParam(eig_param, RV, *m);

  defl = new Deflation(*deflParam, profile);

  profile.TPSTOP(QUDA_PROFILE_INIT);
}
2815 
// Allocate a deflated_solver (eigCG deflation space) and return it as an
// opaque handle for later use/destruction via destroyDeflationQuda().
// NOTE(review): one line between the allocation and the TPSTOP below was
// dropped by the extraction (likely blank) — verify against upstream.
void* newDeflationQuda(QudaEigParam *eig_param) {
  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
#ifdef MAGMA_LIB
  // MAGMA backend must be opened before the solver constructor runs.
  openMagma();
#endif
  auto *defl = new deflated_solver(*eig_param, profileInvert);

  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);

  saveProfile(__func__);
  flushProfile();
  return static_cast<void*>(defl);
}
2829 
2830 void destroyDeflationQuda(void *df) {
2831 #ifdef MAGMA_LIB
2832  closeMagma();
2833 #endif
2834  delete static_cast<deflated_solver*>(df);
2835 }
2836 
2837 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
2838 {
2839  profilerStart(__func__);
2840 
2841  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH || param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH
2842  || param->dslash_type == QUDA_MOBIUS_DWF_DSLASH || param->dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH)
2843  setKernelPackT(true);
2844 
2845  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
2846 
2847  if (!initialized) errorQuda("QUDA not initialized");
2848 
2849  pushVerbosity(param->verbosity);
2851 
2852  checkInvertParam(param, hp_x, hp_b);
2853 
2854  // check the gauge fields have been created
2856 
2857  // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
2858  // solve_type and solution_type, rather than in separate members of QudaInvertParam. We're stuck with it
2859  // for now, though, so here we factorize everything for convenience.
2860 
2861  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
2862  (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
2863  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2864  (param->solve_type == QUDA_NORMOP_PC_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2865  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||
2866  (param->solution_type == QUDA_MATPC_SOLUTION);
2867  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||
2868  (param->solve_type == QUDA_DIRECT_PC_SOLVE);
2869  bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||
2870  (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2871 
2872  param->secs = 0;
2873  param->gflops = 0;
2874  param->iter = 0;
2875 
2876  Dirac *d = nullptr;
2877  Dirac *dSloppy = nullptr;
2878  Dirac *dPre = nullptr;
2879  Dirac *dEig = nullptr;
2880 
2881  // Create the dirac operator and operators for sloppy, precondition,
2882  // and an eigensolver
2883  createDiracWithEig(d, dSloppy, dPre, dEig, *param, pc_solve);
2884 
2885  Dirac &dirac = *d;
2886  Dirac &diracSloppy = *dSloppy;
2887  Dirac &diracPre = *dPre;
2888  Dirac &diracEig = *dEig;
2889 
2890  profileInvert.TPSTART(QUDA_PROFILE_H2D);
2891 
2892  ColorSpinorField *b = nullptr;
2893  ColorSpinorField *x = nullptr;
2894  ColorSpinorField *in = nullptr;
2895  ColorSpinorField *out = nullptr;
2896 
2897  const int *X = cudaGauge->X();
2898 
2899  // wrap CPU host side pointers
2900  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution, param->input_location);
2901  ColorSpinorField *h_b = ColorSpinorField::Create(cpuParam);
2902 
2903  cpuParam.v = hp_x;
2904  cpuParam.location = param->output_location;
2905  ColorSpinorField *h_x = ColorSpinorField::Create(cpuParam);
2906 
2907  // download source
2908  ColorSpinorParam cudaParam(cpuParam, *param);
2909  cudaParam.create = QUDA_COPY_FIELD_CREATE;
2910  b = new cudaColorSpinorField(*h_b, cudaParam);
2911 
2912  // now check if we need to invalidate the solutionResident vectors
2913  bool invalidate = false;
2914  if (param->use_resident_solution == 1) {
2915  for (auto v : solutionResident)
2916  if (b->Precision() != v->Precision() || b->SiteSubset() != v->SiteSubset()) { invalidate = true; break; }
2917 
2918  if (invalidate) {
2919  for (auto v : solutionResident) if (v) delete v;
2920  solutionResident.clear();
2921  }
2922 
2923  if (!solutionResident.size()) {
2924  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2925  solutionResident.push_back(new cudaColorSpinorField(cudaParam)); // solution
2926  }
2927  x = solutionResident[0];
2928  } else {
2929  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2930  x = new cudaColorSpinorField(cudaParam);
2931  }
2932 
2933  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
2934  // initial guess only supported for single-pass solvers
2935  if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) &&
2936  (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
2937  errorQuda("Initial guess not supported for two-pass solver");
2938  }
2939 
2940  *x = *h_x; // solution
2941  } else { // zero initial guess
2942  blas::zero(*x);
2943  }
2944 
2945  // if we're doing a managed memory MG solve and prefetching is
2946  // enabled, prefetch all the Dirac matrices. There's probably
2947  // a better place to put this...
2948  if (param->inv_type_precondition == QUDA_MG_INVERTER) {
2950  diracSloppy.prefetch(QUDA_CUDA_FIELD_LOCATION);
2952  }
2953 
2954  profileInvert.TPSTOP(QUDA_PROFILE_H2D);
2955  profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE);
2956 
2957  double nb = blas::norm2(*b);
2958  if (nb==0.0) errorQuda("Source has zero norm");
2959 
2960  if (getVerbosity() >= QUDA_VERBOSE) {
2961  double nh_b = blas::norm2(*h_b);
2962  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2963  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) {
2964  double nh_x = blas::norm2(*h_x);
2965  double nx = blas::norm2(*x);
2966  printfQuda("Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
2967  }
2968  }
2969 
2970  // rescale the source and solution vectors to help prevent the onset of underflow
2971  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
2972  blas::ax(1.0/sqrt(nb), *b);
2973  blas::ax(1.0/sqrt(nb), *x);
2974  }
2975 
2976  massRescale(*static_cast<cudaColorSpinorField *>(b), *param, false);
2977 
2978  dirac.prepare(in, out, *x, *b, param->solution_type);
2979 
2980  if (getVerbosity() >= QUDA_VERBOSE) {
2981  double nin = blas::norm2(*in);
2982  double nout = blas::norm2(*out);
2983  printfQuda("Prepared source = %g\n", nin);
2984  printfQuda("Prepared solution = %g\n", nout);
2985  }
2986 
2987  if (getVerbosity() >= QUDA_VERBOSE) {
2988  double nin = blas::norm2(*in);
2989  printfQuda("Prepared source post mass rescale = %g\n", nin);
2990  }
2991 
2992  // solution_type specifies *what* system is to be solved.
2993  // solve_type specifies *how* the system is to be solved.
2994  //
2995  // We have the following four cases (plus preconditioned variants):
2996  //
2997  // solution_type solve_type Effect
2998  // ------------- ---------- ------
2999  // MAT DIRECT Solve Ax=b
3000  // MATDAG_MAT DIRECT Solve A^dag y = b, followed by Ax=y
3001  // MAT NORMOP Solve (A^dag A) x = (A^dag b)
3002  // MATDAG_MAT NORMOP Solve (A^dag A) x = b
3003  // MAT NORMERR Solve (A A^dag) y = b, then x = A^dag y
3004  //
3005  // We generally require that the solution_type and solve_type
3006  // preconditioning match. As an exception, the unpreconditioned MAT
3007  // solution_type may be used with any solve_type, including
3008  // DIRECT_PC and NORMOP_PC. In these cases, preparation of the
3009  // preconditioned source and reconstruction of the full solution are
3010  // taken care of by Dirac::prepare() and Dirac::reconstruct(),
3011  // respectively.
3012 
3013  if (pc_solution && !pc_solve) {
3014  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
3015  }
3016 
3017  if (!mat_solution && !pc_solution && pc_solve) {
3018  errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
3019  }
3020 
3021  if (!mat_solution && norm_error_solve) {
3022  errorQuda("Normal-error solve requires Mat solution");
3023  }
3024 
3025  if (param->inv_type_precondition == QUDA_MG_INVERTER && (!direct_solve || !mat_solution)) {
3026  errorQuda("Multigrid preconditioning only supported for direct solves");
3027  }
3028 
3029  if (param->chrono_use_resident && ( norm_error_solve) ){
3030  errorQuda("Chronological forcasting only presently supported for M^dagger M solver");
3031  }
3032 
3033  profileInvert.TPSTOP(QUDA_PROFILE_PREAMBLE);
3034 
3035  if (mat_solution && !direct_solve && !norm_error_solve) { // prepare source: b' = A^dag b
3037  dirac.Mdag(*in, tmp);
3038  } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b
3039  DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre), mEig(diracEig);
3040  SolverParam solverParam(*param);
3041  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, mEig, profileInvert);
3042  (*solve)(*out, *in);
3043  blas::copy(*in, *out);
3044  delete solve;
3045  solverParam.updateInvertParam(*param);
3046  }
3047 
3048  if (direct_solve) {
3049  DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre), mEig(diracEig);
3050  SolverParam solverParam(*param);
3051  // chronological forecasting
3052  if (param->chrono_use_resident && chronoResident[param->chrono_index].size() > 0) {
3053  profileInvert.TPSTART(QUDA_PROFILE_CHRONO);
3054 
3055  auto &basis = chronoResident[param->chrono_index];
3056 
3057  ColorSpinorParam cs_param(*basis[0]);
3059  ColorSpinorField *tmp2 = (param->chrono_precision == out->Precision()) ? out : ColorSpinorField::Create(cs_param);
3060  std::vector<ColorSpinorField*> Ap;
3061  for (unsigned int k=0; k < basis.size(); k++) {
3062  Ap.emplace_back((ColorSpinorField::Create(cs_param)));
3063  }
3064 
3065  if (param->chrono_precision == param->cuda_prec) {
3066  for (unsigned int j=0; j<basis.size(); j++) m(*Ap[j], *basis[j], *tmp, *tmp2);
3067  } else if (param->chrono_precision == param->cuda_prec_sloppy) {
3068  for (unsigned int j=0; j<basis.size(); j++) mSloppy(*Ap[j], *basis[j], *tmp, *tmp2);
3069  } else {
3070  errorQuda("Unexpected precision %d for chrono vectors (doesn't match outer %d or sloppy precision %d)",
3071  param->chrono_precision, param->cuda_prec, param->cuda_prec_sloppy);
3072  }
3073 
3074  bool orthogonal = true;
3075  bool apply_mat = false;
3076  bool hermitian = false;
3077  MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert);
3078 
3079  blas::copy(*tmp, *in);
3080  mre(*out, *tmp, basis, Ap);
3081 
3082  for (auto ap: Ap) {
3083  if (ap) delete (ap);
3084  }
3085  delete tmp;
3086  if (tmp2 != out) delete tmp2;
3087 
3088  profileInvert.TPSTOP(QUDA_PROFILE_CHRONO);
3089  }
3090 
3091  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, mEig, profileInvert);
3092  (*solve)(*out, *in);
3093  delete solve;
3094  solverParam.updateInvertParam(*param);
3095  } else if (!norm_error_solve) {
3096  DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre), mEig(diracEig);
3097  SolverParam solverParam(*param);
3098 
3099  // chronological forecasting
3100  if (param->chrono_use_resident && chronoResident[param->chrono_index].size() > 0) {
3101  profileInvert.TPSTART(QUDA_PROFILE_CHRONO);
3102 
3103  auto &basis = chronoResident[param->chrono_index];
3104 
3105  ColorSpinorParam cs_param(*basis[0]);
3106  std::vector<ColorSpinorField*> Ap;
3108  ColorSpinorField *tmp2 = (param->chrono_precision == out->Precision()) ? out : ColorSpinorField::Create(cs_param);
3109  for (unsigned int k=0; k < basis.size(); k++) {
3110  Ap.emplace_back((ColorSpinorField::Create(cs_param)));
3111  }
3112 
3113  if (param->chrono_precision == param->cuda_prec) {
3114  for (unsigned int j=0; j<basis.size(); j++) m(*Ap[j], *basis[j], *tmp, *tmp2);
3115  } else if (param->chrono_precision == param->cuda_prec_sloppy) {
3116  for (unsigned int j=0; j<basis.size(); j++) mSloppy(*Ap[j], *basis[j], *tmp, *tmp2);
3117  } else {
3118  errorQuda("Unexpected precision %d for chrono vectors (doesn't match outer %d or sloppy precision %d)",
3119  param->chrono_precision, param->cuda_prec, param->cuda_prec_sloppy);
3120  }
3121 
3122  bool orthogonal = true;
3123  bool apply_mat = false;
3124  bool hermitian = true;
3125  MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert);
3126 
3127  blas::copy(*tmp, *in);
3128  mre(*out, *tmp, basis, Ap);
3129 
3130  for (auto ap: Ap) {
3131  if (ap) delete(ap);
3132  }
3133  delete tmp;
3134  if (tmp2 != out) delete tmp2;
3135 
3136  profileInvert.TPSTOP(QUDA_PROFILE_CHRONO);
3137  }
3138 
3139  // if using a Schwarz preconditioner with a normal operator then we must use the DiracMdagMLocal operator
3140  if (param->inv_type_precondition != QUDA_INVALID_INVERTER && param->schwarz_type != QUDA_INVALID_SCHWARZ) {
3141  DiracMdagMLocal mPreLocal(diracPre);
3142  Solver *solve = Solver::create(solverParam, m, mSloppy, mPreLocal, mEig, profileInvert);
3143  (*solve)(*out, *in);
3144  delete solve;
3145  solverParam.updateInvertParam(*param);
3146  } else {
3147  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, mEig, profileInvert);
3148  (*solve)(*out, *in);
3149  delete solve;
3150  solverParam.updateInvertParam(*param);
3151  }
3152  } else { // norm_error_solve
3153  DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre), mEig(diracEig);
3154  cudaColorSpinorField tmp(*out);
3155  SolverParam solverParam(*param);
3156  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, mEig, profileInvert);
3157  (*solve)(tmp, *in); // y = (M M^\dag) b
3158  dirac.Mdag(*out, tmp); // x = M^dag y
3159  delete solve;
3160  solverParam.updateInvertParam(*param);
3161  }
3162 
3163  if (getVerbosity() >= QUDA_VERBOSE){
3164  double nx = blas::norm2(*x);
3165  printfQuda("Solution = %g\n",nx);
3166  }
3167 
3168  profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE);
3169  if (param->chrono_make_resident) {
3170  if(param->chrono_max_dim < 1){
3171  errorQuda("Cannot chrono_make_resident with chrono_max_dim %i",param->chrono_max_dim);
3172  }
3173 
3174  const int i = param->chrono_index;
3175  if (i >= QUDA_MAX_CHRONO)
3176  errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO);
3177 
3178  auto &basis = chronoResident[i];
3179 
3180  if(param->chrono_max_dim < (int)basis.size()){
3181  errorQuda("Requested chrono_max_dim %i is smaller than already existing chroology %i",param->chrono_max_dim,(int)basis.size());
3182  }
3183 
3184  if(not param->chrono_replace_last){
3185  // if we have not filled the space yet just augment
3186  if ((int)basis.size() < param->chrono_max_dim) {
3187  ColorSpinorParam cs_param(*out);
3188  cs_param.setPrecision(param->chrono_precision);
3189  basis.emplace_back(ColorSpinorField::Create(cs_param));
3190  }
3191 
3192  // shuffle every entry down one and bring the last to the front
3193  ColorSpinorField *tmp = basis[basis.size()-1];
3194  for (unsigned int j=basis.size()-1; j>0; j--) basis[j] = basis[j-1];
3195  basis[0] = tmp;
3196  }
3197  *(basis[0]) = *out; // set first entry to new solution
3198  }
3199  dirac.reconstruct(*x, *b, param->solution_type);
3200 
3201  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
3202  // rescale the solution
3203  blas::ax(sqrt(nb), *x);
3204  }
3205  profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE);
3206 
3207  if (!param->make_resident_solution) {
3208  profileInvert.TPSTART(QUDA_PROFILE_D2H);
3209  *h_x = *x;
3210  profileInvert.TPSTOP(QUDA_PROFILE_D2H);
3211  }
3212 
3213  profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE);
3214 
3215  if (param->compute_action) {
3216  Complex action = blas::cDotProduct(*b, *x);
3217  param->action[0] = action.real();
3218  param->action[1] = action.imag();
3219  }
3220 
3221  if (getVerbosity() >= QUDA_VERBOSE){
3222  double nx = blas::norm2(*x);
3223  double nh_x = blas::norm2(*h_x);
3224  printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
3225  }
3226  profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE);
3227 
3228  profileInvert.TPSTART(QUDA_PROFILE_FREE);
3229 
3230  delete h_b;
3231  delete h_x;
3232  delete b;
3233 
3234  if (param->use_resident_solution && !param->make_resident_solution) {
3235  for (auto v: solutionResident) if (v) delete v;
3236  solutionResident.clear();
3237  } else if (!param->make_resident_solution) {
3238  delete x;
3239  }
3240 
3241  delete d;
3242  delete dSloppy;
3243  delete dPre;
3244  delete dEig;
3245 
3246  profileInvert.TPSTOP(QUDA_PROFILE_FREE);
3247 
3248  popVerbosity();
3249 
3250  // cache is written out even if a long benchmarking job gets interrupted
3251  saveTuneCache();
3252 
3253  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
3254 
3255  profilerStop(__func__);
3256 }
3257 
// NOTE(review): this doxygen listing elides the signature line (orig. 3258) and
// several conditional lines below (orig. 3261-3263, 3280-3283, 3285-3295,
// 3298-3305).  Based on the call sites in this file
// (loadFatLongGaugeQuda(param, gauge_param, fatlinks, longlinks)) the full
// signature is presumably
//   void loadFatLongGaugeQuda(QudaInvertParam *inv_param, QudaGaugeParam *gauge_param,
//                             void *milc_fatlinks, void *milc_longlinks)
// -- verify against the upstream interface_quda.cpp before relying on this.
3259  void *milc_longlinks)
3260 {
3264 
3265  // Specific gauge parameters for MILC
// In multi-GPU builds, size the pad from the largest checkerboarded
// (hence the /2) face volume of the local lattice; otherwise it stays 0.
3266  int pad_size = 0;
3267 #ifdef MULTI_GPU
3268  int x_face_size = gauge_param->X[1] * gauge_param->X[2] * gauge_param->X[3] / 2;
3269  int y_face_size = gauge_param->X[0] * gauge_param->X[2] * gauge_param->X[3] / 2;
3270  int z_face_size = gauge_param->X[0] * gauge_param->X[1] * gauge_param->X[3] / 2;
3271  int t_face_size = gauge_param->X[0] * gauge_param->X[1] * gauge_param->X[2] / 2;
3272  pad_size = MAX(x_face_size, y_face_size);
3273  pad_size = MAX(pad_size, z_face_size);
3274  pad_size = MAX(pad_size, t_face_size);
3275 #endif
3276 
// The long-link pad is 3x the fat-link pad -- presumably to accommodate the
// three-hop long links of the improved staggered action (TODO confirm).
3277  int fat_pad = pad_size;
3278  int link_pad = 3 * pad_size;
3279 
// NOTE(review): the conditional that selects between QUDA_SU3_LINKS and the
// fat/long link types is elided here (orig. 3280-3295); only fragments remain.
3281  QUDA_SU3_LINKS :
3283 
3284  gauge_param->ga_pad = fat_pad;
3289  } else {
3293  }
3295 
// Load the fat links (this is the plain gauge field in the SU(3)-links case).
3296  loadGaugeQuda(milc_fatlinks, gauge_param);
3297 
// Elided guard (orig. 3298-3305) apparently restricts the long-link load to
// the improved (ASQTAD/HISQ-type) case -- verify against upstream.
3300  gauge_param->ga_pad = link_pad;
3306  loadGaugeQuda(milc_longlinks, gauge_param);
3307  }
3308 }
3309 
// Generic driver behind the multi-source interface functions (the
// invert/dslash x Wilson/staggered/clover variants below).  When
// param->split_grid is trivial (product == 1) it simply applies `op` to each
// of the param->num_src (solution, source) pointer pairs.  Otherwise it:
//   1. wraps the host gauge (or MILC fat/long) and fermion pointers in cpu
//      fields,
//   2. collects ("splits") the gauge, clover and source fields onto each
//      communicator sub-partition,
//   3. pushes the split communicator, reloads gauge/clover under the split
//      topology, and runs `op` once per source per sub-partition,
//   4. joins the solutions back to the full grid and restores the original
//      communicator, gauge and clover fields.
// NOTE(review): this listing elides original source lines 3317-3322 and 3560;
// the gaps are flagged inline below -- verify against upstream.
3310 template <class Interface, class... Args>
3311 void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // color spinor field pointers, and inv_param
3312  void *h_gauge, void *milc_fatlinks, void *milc_longlinks,
3313  QudaGaugeParam *gauge_param, // gauge field pointers
3314  void *h_clover, void *h_clovinv, // clover field pointers
3315  Interface op, Args... args)
3316 {
// NOTE(review): orig. lines 3317-3322 are elided here (likely a doc comment
// or local declarations) -- verify against upstream.
3323  profilerStart(__func__);
3324 
3325  CommKey split_key = {param->split_grid[0], param->split_grid[1], param->split_grid[2], param->split_grid[3]};
3326  int num_sub_partition = quda::product(split_key);
3327 
3328  if (!split_key.is_valid()) {
3329  errorQuda("split_key = [%d,%d,%d,%d] is not valid.\n", split_key[0], split_key[1], split_key[2], split_key[3]);
3330  }
3331 
3332  if (num_sub_partition == 1) { // In this case we don't split the grid.
3333 
3334  for (int n = 0; n < param->num_src; n++) { op(_hp_x[n], _hp_b[n], param, args...); }
3335 
3336  } else {
3337 
3338  profileInvertMultiSrc.TPSTART(QUDA_PROFILE_TOTAL);
3339  profileInvertMultiSrc.TPSTART(QUDA_PROFILE_INIT);
3340 
3341  if (gauge_param == nullptr) { errorQuda("gauge_param == nullptr.\n"); }
3342 
3343  // Doing the sub-partition arithmetic
3344  if (param->num_src_per_sub_partition * num_sub_partition != param->num_src) {
3345  errorQuda("We need to have split_grid[0](=%d) * split_grid[1](=%d) * split_grid[2](=%d) * split_grid[3](=%d) * "
3346  "num_src_per_sub_partition(=%d) == num_src(=%d).",
3347  split_key[0], split_key[1], split_key[2], split_key[3], param->num_src_per_sub_partition, param->num_src);
3348  }
3349 
3350  // Determine if the color spinor field is using a 5d e/o preconditioning
3351  QudaPCType pc_type = QUDA_4D_PC;
3352  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH) { pc_type = QUDA_5D_PC; }
3353 
3354  // Doesn't work for MG yet.
3355  if (param->inv_type_precondition == QUDA_MG_INVERTER) { errorQuda("Split Grid does NOT work with MG yet."); }
3356 
3357  checkInvertParam(param, _hp_x[0], _hp_b[0]);
3358 
// h_gauge vs. milc_fatlinks decides the code path: exactly one of them must
// be non-null (Wilson-type single gauge field vs. staggered fat/long links).
3359  bool is_staggered;
3360  if (h_gauge) {
3361  is_staggered = false;
3362  } else if (milc_fatlinks) {
3363  is_staggered = true;
3364  } else {
3365  errorQuda("Both h_gauge and milc_fatlinks are null.");
3366  is_staggered = true; // to suppress compiler warning/error.
3367  }
3368 
3369  // Gauge fields/params
3370  GaugeFieldParam *gf_param = nullptr;
3371  GaugeField *in = nullptr;
3372  // Staggered gauge fields/params
3373  GaugeFieldParam *milc_fatlink_param = nullptr;
3374  GaugeFieldParam *milc_longlink_param = nullptr;
3375  GaugeField *milc_fatlink_field = nullptr;
3376  GaugeField *milc_longlink_field = nullptr;
3377 
3378  // set up the gauge field params.
3379  if (!is_staggered) { // not staggered
3380  gf_param = new GaugeFieldParam(h_gauge, *gauge_param);
3381  if (gf_param->order <= 4) { gf_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; }
3382  in = GaugeField::Create(*gf_param);
3383  } else { // staggered
3384  milc_fatlink_param = new GaugeFieldParam(milc_fatlinks, *gauge_param);
3385  if (milc_fatlink_param->order <= 4) { milc_fatlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; }
3386  milc_fatlink_field = GaugeField::Create(*milc_fatlink_param);
3387  milc_longlink_param = new GaugeFieldParam(milc_longlinks, *gauge_param);
3388  if (milc_longlink_param->order <= 4) { milc_longlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; }
3389  milc_longlink_field = GaugeField::Create(*milc_longlink_param);
3390  }
3391 
3392  // Create the temp host side helper fields, which are just wrappers of the input pointers.
3393  bool pc_solution
3394  = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
3395 
3396  const int *X = gauge_param->X;
3397  quda::CommKey field_dim = {X[0], X[1], X[2], X[3]};
3398  ColorSpinorParam cpuParam(_hp_b[0], *param, X, pc_solution, param->input_location);
3399  std::vector<ColorSpinorField *> _h_b(param->num_src);
3400  for (int i = 0; i < param->num_src; i++) {
3401  cpuParam.v = _hp_b[i];
3402  _h_b[i] = ColorSpinorField::Create(cpuParam);
3403  }
3404 
3405  cpuParam.location = param->output_location;
3406  std::vector<ColorSpinorField *> _h_x(param->num_src);
3407  for (int i = 0; i < param->num_src; i++) {
3408  cpuParam.v = _hp_x[i];
3409  _h_x[i] = ColorSpinorField::Create(cpuParam);
3410  }
3411 
3412  // Make the gauge param dimensions larger
// Each sub-partition sees a lattice enlarged by split_key[d] in direction d;
// the division is undone again after the solves (see the /= loop below).
3413  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
3414  printfQuda("Spliting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d).\n", comm_dim(0),
3415  comm_dim(1), comm_dim(2), comm_dim(3), split_key[0], split_key[1], split_key[2], split_key[3]);
3416  }
3417  for (int d = 0; d < CommKey::n_dim; d++) {
3418  if (comm_dim(d) % split_key[d] != 0) {
3419  errorQuda("Split not possible: %2d %% %2d != 0.", comm_dim(d), split_key[d]);
3420  }
3421  if (!is_staggered) {
3422  gf_param->x[d] *= split_key[d];
3423  gf_param->pad *= split_key[d];
3424  } else {
3425  milc_fatlink_param->x[d] *= split_key[d];
3426  milc_fatlink_param->pad *= split_key[d];
3427  milc_longlink_param->x[d] *= split_key[d];
3428  milc_longlink_param->pad *= split_key[d];
3429  }
3430  gauge_param->X[d] *= split_key[d];
3431  gauge_param->ga_pad *= split_key[d];
3432  }
3433 
3434  // Deal with clover field. For Multi source computations, clover field construction is done
3435  // exclusively on the GPU.
// NOTE(review): these two checks run for every dslash type, not only the
// clover ones guarded just below -- a staggered multi-source call with zero
// clover parameters would appear to hit the first errorQuda, and the second
// dereferences gaugePrecise unconditionally; verify intent against upstream.
3436  if (param->clover_coeff == 0.0 && param->clover_csw == 0.0) errorQuda("called with neither clover term nor inverse and clover coefficient nor Csw not set");
3437  if (gaugePrecise->Anisotropy() != 1.0) errorQuda("cannot compute anisotropic clover field");
3438 
3439  quda::CloverField *input_clover = nullptr;
3440  quda::CloverField *collected_clover = nullptr;
3441  if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH || param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH
3442  || param->dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) {
3443  if (h_clover || h_clovinv) {
3444  CloverFieldParam clover_param;
3445  clover_param.nDim = 4;
3446  // If clover_coeff is not set manually, then it is the product Csw * kappa.
3447  // If the user has set the clover_coeff manually, that value takes precedent.
3448  clover_param.csw = param->clover_csw;
3449  clover_param.coeff = param->clover_coeff == 0.0 ? param->kappa * param->clover_csw : param->clover_coeff;
3450  // We must also adjust param->clover_coeff here. If a user has set kappa and
3451  // Csw, we must populate param->clover_coeff for them as the computeClover
3452  // routines uses that value
3453  param->clover_coeff = (param->clover_coeff == 0.0 ? param->kappa * param->clover_csw : param->clover_coeff);
3454  clover_param.twisted = param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH;
3455  clover_param.mu2 = clover_param.twisted ? 4.0 * param->kappa * param->kappa * param->mu * param->mu : 0.0;
3456  clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;
3457  for (int d = 0; d < 4; d++) { clover_param.x[d] = field_dim[d]; }
3458  clover_param.pad = param->cl_pad;
3459  clover_param.create = QUDA_REFERENCE_FIELD_CREATE;
3460  clover_param.norm = nullptr;
3461  clover_param.invNorm = nullptr;
3462  clover_param.setPrecision(param->clover_cpu_prec);
3463  clover_param.direct = h_clover ? true : false;
3464  clover_param.inverse = h_clovinv ? true : false;
3465  clover_param.clover = h_clover;
3466  clover_param.cloverInv = h_clovinv;
3467  clover_param.order = param->clover_order;
3468  clover_param.location = param->clover_location;
3469 
3470  input_clover = CloverField::Create(clover_param);
3471 
// Re-use clover_param with enlarged dimensions for the collected field.
3472  for (int d = 0; d < CommKey::n_dim; d++) { clover_param.x[d] *= split_key[d]; }
3473  clover_param.create = QUDA_NULL_FIELD_CREATE;
3474  collected_clover = CloverField::Create(clover_param);
3475 
3476  std::vector<quda::CloverField *> v_c(1);
3477  v_c[0] = input_clover;
3478  quda::split_field(*collected_clover, v_c, split_key); // Clover uses 4d even-odd preconditioning.
3479  }
3480  }
3481 
// Collect the gauge field(s) across sub-partitions into an enlarged cpu field.
3482  quda::GaugeField *collected_gauge = nullptr;
3483  quda::GaugeField *collected_milc_fatlink_field = nullptr;
3484  quda::GaugeField *collected_milc_longlink_field = nullptr;
3485 
3486  if (!is_staggered) {
3487  gf_param->create = QUDA_NULL_FIELD_CREATE;
3488  collected_gauge = new quda::cpuGaugeField(*gf_param);
3489  std::vector<quda::GaugeField *> v_g(1);
3490  v_g[0] = in;
3491  quda::split_field(*collected_gauge, v_g, split_key);
3492  } else {
3493  milc_fatlink_param->create = QUDA_NULL_FIELD_CREATE;
3494  milc_longlink_param->create = QUDA_NULL_FIELD_CREATE;
3495  collected_milc_fatlink_field = new quda::cpuGaugeField(*milc_fatlink_param);
3496  collected_milc_longlink_field = new quda::cpuGaugeField(*milc_longlink_param);
3497  std::vector<quda::GaugeField *> v_g(1);
3498  v_g[0] = milc_fatlink_field;
3499  quda::split_field(*collected_milc_fatlink_field, v_g, split_key);
3500  v_g[0] = milc_longlink_field;
3501  quda::split_field(*collected_milc_longlink_field, v_g, split_key);
3502  }
3503 
3504  profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_INIT);
3505  profileInvertMultiSrc.TPSTART(QUDA_PROFILE_PREAMBLE);
3506 
3507  comm_barrier();
3508 
3509  // Split input fermion field
// Source n of a sub-partition gathers num_sub_partition host sources
// (_h_b[n*num_sub_partition .. (n+1)*num_sub_partition)) into one field.
3510  quda::ColorSpinorParam cpu_cs_param_split(*_h_x[0]);
3511  for (int d = 0; d < CommKey::n_dim; d++) { cpu_cs_param_split.x[d] *= split_key[d]; }
3512  std::vector<quda::ColorSpinorField *> _collect_b(param->num_src_per_sub_partition, nullptr);
3513  std::vector<quda::ColorSpinorField *> _collect_x(param->num_src_per_sub_partition, nullptr);
3514  for (int n = 0; n < param->num_src_per_sub_partition; n++) {
3515  _collect_b[n] = new quda::cpuColorSpinorField(cpu_cs_param_split);
3516  _collect_x[n] = new quda::cpuColorSpinorField(cpu_cs_param_split);
3517  auto first = _h_b.begin() + n * num_sub_partition;
3518  auto last = _h_b.begin() + (n + 1) * num_sub_partition;
3519  std::vector<ColorSpinorField *> _v_b(first, last);
3520  split_field(*_collect_b[n], _v_b, split_key, pc_type);
3521  }
3522  comm_barrier();
3523 
3524  push_communicator(split_key);
3525  updateR();
3526  comm_barrier();
3527 
3528  profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_PREAMBLE);
3529  profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_TOTAL);
3530 
3531  // Load gauge field after pushing the split communicator so the comm buffers, etc are setup according to
3532  // the split topology.
3533  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loading gauge field...\n"); }
3534  if (!is_staggered) {
3535  loadGaugeQuda(collected_gauge->Gauge_p(), gauge_param);
3536  } else {
3537  // freeGaugeQuda();
3538  loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->Gauge_p(),
3539  collected_milc_longlink_field->Gauge_p());
3540  }
3541  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loaded gauge field...\n"); }
3542 
3543  if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH || param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH
3544  || param->dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) {
3545  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loading clover field...\n"); }
3546  if (collected_clover) {
3547  loadCloverQuda(collected_clover->V(false), collected_clover->V(true), param);
3548  } else {
3549  loadCloverQuda(nullptr, nullptr, param);
3550  }
3551  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loaded clover field...\n"); }
3552  }
3553 
// Run the wrapped interface call once per collected source.
3554  for (int n = 0; n < param->num_src_per_sub_partition; n++) {
3555  op(_collect_x[n]->V(), _collect_b[n]->V(), param, args...);
3556  }
3557 
3558  profileInvertMultiSrc.TPSTART(QUDA_PROFILE_TOTAL);
3559  profileInvertMultiSrc.TPSTART(QUDA_PROFILE_EPILOGUE);
// NOTE(review): orig. line 3560 is elided here; given the matching
// push_communicator(split_key)/updateR() pair above, this is presumably where
// the parent communicator is popped/pushed back before updateR() -- verify
// against upstream.
3561  updateR();
3562  comm_barrier();
3563 
// Undo the dimension/pad scaling applied before the split.
3564  for (int d = 0; d < CommKey::n_dim; d++) {
3565  gauge_param->X[d] /= split_key[d];
3566  gauge_param->ga_pad /= split_key[d];
3567  }
3568 
// Scatter each collected solution back into its num_sub_partition host fields.
3569  for (int n = 0; n < param->num_src_per_sub_partition; n++) {
3570  auto first = _h_x.begin() + n * num_sub_partition;
3571  auto last = _h_x.begin() + (n + 1) * num_sub_partition;
3572  std::vector<ColorSpinorField *> _v_x(first, last);
3573  join_field(_v_x, *_collect_x[n], split_key, pc_type);
3574  }
3575 
3576  for (auto p : _collect_b) { delete p; }
3577  for (auto p : _collect_x) { delete p; }
3578 
3579  for (auto p : _h_x) { delete p; }
3580  for (auto p : _h_b) { delete p; }
3581 
3582  if (!is_staggered) {
3583  delete in;
3584  delete collected_gauge;
3585  } else {
3586  delete milc_fatlink_field;
3587  delete milc_longlink_field;
3588  delete collected_milc_fatlink_field;
3589  delete collected_milc_longlink_field;
3590  }
3591 
3592  if (input_clover) { delete input_clover; }
3593  if (collected_clover) { delete collected_clover; }
3594 
3595  profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_EPILOGUE);
3596  profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_TOTAL);
3597 
3598  // Restore the gauge field
3599  if (!is_staggered) {
3600  loadGaugeQuda(h_gauge, gauge_param);
3601  } else {
3602  freeGaugeQuda();
3603  loadFatLongGaugeQuda(param, gauge_param, milc_fatlinks, milc_longlinks);
3604  }
3605 
// NOTE(review): unlike the load path above, this restore check omits
// QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH, so the clover field is not restored
// for that dslash type -- verify whether that is intentional.
3606  if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH || param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
3607  loadCloverQuda(h_clover, h_clovinv, param);
3608  }
3609  }
3610 
3611  profilerStop(__func__);
3612 }
3613 
3614 void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, QudaGaugeParam *gauge_param)
3615 {
3616  auto op = [](void *_x, void *_b, QudaInvertParam *param) { invertQuda(_x, _b, param); };
3617  callMultiSrcQuda(_hp_x, _hp_b, param, h_gauge, nullptr, nullptr, gauge_param, nullptr, nullptr, op);
3618 }
3619 
3620 void invertMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *milc_fatlinks,
3621  void *milc_longlinks, QudaGaugeParam *gauge_param)
3622 {
3623  auto op = [](void *_x, void *_b, QudaInvertParam *param) { invertQuda(_x, _b, param); };
3624  callMultiSrcQuda(_hp_x, _hp_b, param, nullptr, milc_fatlinks, milc_longlinks, gauge_param, nullptr, nullptr, op);
3625 }
3626 
3627 void invertMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge,
3628  QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv)
3629 {
3630  auto op = [](void *_x, void *_b, QudaInvertParam *param) { invertQuda(_x, _b, param); };
3631  callMultiSrcQuda(_hp_x, _hp_b, param, h_gauge, nullptr, nullptr, gauge_param, h_clover, h_clovinv, op);
3632 }
3633 
3634 void dslashMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge,
3636 {
3637  auto op = [](void *_x, void *_b, QudaInvertParam *param, QudaParity parity) { dslashQuda(_x, _b, param, parity); };
3638  callMultiSrcQuda(_hp_x, _hp_b, param, h_gauge, nullptr, nullptr, gauge_param, nullptr, nullptr, op, parity);
3639 }
3640 
3642  void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param)
3643 {
3644  auto op = [](void *_x, void *_b, QudaInvertParam *param, QudaParity parity) { dslashQuda(_x, _b, param, parity); };
3645  callMultiSrcQuda(_hp_x, _hp_b, param, nullptr, milc_fatlinks, milc_longlinks, gauge_param, nullptr, nullptr, op,
3646  parity);
3647 }
3648 
3649 void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge,
3650  QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv)
3651 {
3652  auto op = [](void *_x, void *_b, QudaInvertParam *param, QudaParity parity) { dslashQuda(_x, _b, param, parity); };
3653  callMultiSrcQuda(_hp_x, _hp_b, param, h_gauge, nullptr, nullptr, gauge_param, h_clover, h_clovinv, op, parity);
3654 }
3655 
3668 void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
3669 {
3670  profilerStart(__func__);
3671 
3672  profileMulti.TPSTART(QUDA_PROFILE_TOTAL);
3673  profileMulti.TPSTART(QUDA_PROFILE_INIT);
3674 
3675  if (!initialized) errorQuda("QUDA not initialized");
3676 
3677  checkInvertParam(param, _hp_x[0], _hp_b);
3678 
3679  // check the gauge fields have been created
3680  checkGauge(param);
3681 
3682  if (param->num_offset > QUDA_MAX_MULTI_SHIFT)
3683  errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d", param->num_offset,
3685 
3686  pushVerbosity(param->verbosity);
3687 
3688  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
3689  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);
3690  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION);
3691  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE);
3692 
3693  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3694  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3695 
3696  if (param->solution_type != QUDA_MATPC_SOLUTION) {
3697  errorQuda("For Staggered-type fermions, multi-shift solver only suports MATPC solution type");
3698  }
3699 
3700  if (param->solve_type != QUDA_DIRECT_PC_SOLVE) {
3701  errorQuda("For Staggered-type fermions, multi-shift solver only supports DIRECT_PC solve types");
3702  }
3703 
3704  } else { // Wilson type
3705 
3706  if (mat_solution) {
3707  errorQuda("For Wilson-type fermions, multi-shift solver does not support MAT or MATPC solution types");
3708  }
3709  if (direct_solve) {
3710  errorQuda("For Wilson-type fermions, multi-shift solver does not support DIRECT or DIRECT_PC solve types");
3711  }
3712  if (pc_solution & !pc_solve) {
3713  errorQuda("For Wilson-type fermions, preconditioned (PC) solution_type requires a PC solve_type");
3714  }
3715  if (!pc_solution & pc_solve) {
3716  errorQuda("For Wilson-type fermions, in multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
3717  }
3718  }
3719 
3720  // Timing and FLOP counters
3721  param->secs = 0;
3722  param->gflops = 0;
3723  param->iter = 0;
3724 
3725  for (int i=0; i<param->num_offset-1; i++) {
3726  for (int j=i+1; j<param->num_offset; j++) {
3727  if (param->offset[i] > param->offset[j])
3728  errorQuda("Offsets must be ordered from smallest to largest");
3729  }
3730  }
3731 
3732  // Host pointers for x, take a copy of the input host pointers
3733  void** hp_x;
3734  hp_x = new void* [ param->num_offset ];
3735 
3736  void* hp_b = _hp_b;
3737  for(int i=0;i < param->num_offset;i++){
3738  hp_x[i] = _hp_x[i];
3739  }
3740 
3741  // Create the matrix.
3742  // The way this works is that createDirac will create 'd' and 'dSloppy'
3743  // which are global. We then grab these with references...
3744  //
3745  // Balint: Isn't there a nice construction pattern we could use here? This is
3746  // expedient but yucky.
3747  // DiracParam diracParam;
3748  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3749  param->dslash_type == QUDA_STAGGERED_DSLASH){
3750  param->mass = sqrt(param->offset[0]/4);
3751  }
3752 
3753  Dirac *d = nullptr;
3754  Dirac *dSloppy = nullptr;
3755  Dirac *dPre = nullptr;
3756  Dirac *dRefine = nullptr;
3757 
3758  // Create the dirac operator and a sloppy, precon, and refine.
3759  createDiracWithRefine(d, dSloppy, dPre, dRefine, *param, pc_solve);
3760  Dirac &dirac = *d;
3761  Dirac &diracSloppy = *dSloppy;
3762 
3763 
3764  cudaColorSpinorField *b = nullptr; // Cuda RHS
3765  std::vector<ColorSpinorField*> x; // Cuda Solutions
3766  x.resize(param->num_offset);
3767  std::vector<ColorSpinorField*> p;
3768  std::unique_ptr<double[]> r2_old(new double[param->num_offset]);
3769 
3770  // Grab the dimension array of the input gauge field.
3771  const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ?
3772  gaugeFatPrecise->X() : gaugePrecise->X();
3773 
3774  // This creates a ColorSpinorParam struct, from the host data
3775  // pointer, the definitions in param, the dimensions X, and whether
3776  // the solution is on a checkerboard instruction or not. These can
3777  // then be used as 'instructions' to create the actual
3778  // ColorSpinorField
3779  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution, param->input_location);
3780  ColorSpinorField *h_b = ColorSpinorField::Create(cpuParam);
3781 
3782  std::vector<ColorSpinorField*> h_x;
3783  h_x.resize(param->num_offset);
3784 
3785  cpuParam.location = param->output_location;
3786  for(int i=0; i < param->num_offset; i++) {
3787  cpuParam.v = hp_x[i];
3788  h_x[i] = ColorSpinorField::Create(cpuParam);
3789  }
3790 
3791  profileMulti.TPSTOP(QUDA_PROFILE_INIT);
3792  profileMulti.TPSTART(QUDA_PROFILE_H2D);
3793  // Now I need a colorSpinorParam for the device
3794  ColorSpinorParam cudaParam(cpuParam, *param);
3795  // This setting will download a host vector
3796  cudaParam.create = QUDA_COPY_FIELD_CREATE;
3797  cudaParam.location = QUDA_CUDA_FIELD_LOCATION;
3798  b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it
3799  profileMulti.TPSTOP(QUDA_PROFILE_H2D);
3800 
3801  profileMulti.TPSTART(QUDA_PROFILE_INIT);
3802  // Create the solution fields filled with zero
3803  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
3804 
3805  // now check if we need to invalidate the solutionResident vectors
3806  bool invalidate = false;
3807  for (auto v : solutionResident) {
3808  if (cudaParam.Precision() != v->Precision()) {
3809  invalidate = true;
3810  break;
3811  }
3812  }
3813 
3814  if (invalidate) {
3815  for (auto v : solutionResident) delete v;
3816  solutionResident.clear();
3817  }
3818 
3819  // grow resident solutions to be big enough
3820  for (int i=solutionResident.size(); i < param->num_offset; i++) {
3821  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Adding vector %d to solutionsResident\n", i);
3822  solutionResident.push_back(new cudaColorSpinorField(cudaParam));
3823  }
3824  for (int i=0; i < param->num_offset; i++) x[i] = solutionResident[i];
3825  profileMulti.TPSTOP(QUDA_PROFILE_INIT);
3826 
3827  profileMulti.TPSTART(QUDA_PROFILE_PREAMBLE);
3828 
3829  // Check source norms
3830  double nb = blas::norm2(*b);
3831  if (nb==0.0) errorQuda("Source has zero norm");
3832 
3833  if(getVerbosity() >= QUDA_VERBOSE ) {
3834  double nh_b = blas::norm2(*h_b);
3835  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
3836  }
3837 
3838  // rescale the source vector to help prevent the onset of underflow
3839  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
3840  blas::ax(1.0/sqrt(nb), *b);
3841  }
3842 
3843  // backup shifts
3844  double unscaled_shifts[QUDA_MAX_MULTI_SHIFT];
3845  for (int i = 0; i < param->num_offset; i++) { unscaled_shifts[i] = param->offset[i]; }
3846 
3847  // rescale
3848  massRescale(*b, *param, true);
3849  profileMulti.TPSTOP(QUDA_PROFILE_PREAMBLE);
3850 
3851  DiracMatrix *m, *mSloppy;
3852 
3853  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3854  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3855  m = new DiracM(dirac);
3856  mSloppy = new DiracM(diracSloppy);
3857  } else {
3858  m = new DiracMdagM(dirac);
3859  mSloppy = new DiracMdagM(diracSloppy);
3860  }
3861 
3862  SolverParam solverParam(*param);
3863  {
3864  MultiShiftCG cg_m(*m, *mSloppy, solverParam, profileMulti);
3865  cg_m(x, *b, p, r2_old.get());
3866  }
3867  solverParam.updateInvertParam(*param);
3868 
3869  delete m;
3870  delete mSloppy;
3871 
3872  if (param->compute_true_res) {
3873  // check each shift has the desired tolerance and use sequential CG to refine
3874  profileMulti.TPSTART(QUDA_PROFILE_INIT);
3875  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
3876  cudaColorSpinorField r(*b, cudaParam);
3877  profileMulti.TPSTOP(QUDA_PROFILE_INIT);
3878  QudaInvertParam refineparam = *param;
3880  Dirac &dirac = *d;
3881  Dirac &diracSloppy = *dRefine;
3882 
3883 #define REFINE_INCREASING_MASS
3884 #ifdef REFINE_INCREASING_MASS
3885  for(int i=0; i < param->num_offset; i++) {
3886 #else
3887  for(int i=param->num_offset-1; i >= 0; i--) {
3888 #endif
3889  double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
3890  param->true_res_hq_offset[i] : 0;
3891  double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
3892  param->tol_hq_offset[i] : 0;
3893 
3894  /*
3895  In the case where the shifted systems have zero tolerance
3896  specified, we refine these systems until either the limit of
3897  precision is reached (prec_tol) or until the tolerance reaches
3898  the iterated residual tolerance of the previous multi-shift
3899  solver (iter_res_offset[i]), which ever is greater.
3900  */
3901  const double prec_tol = std::pow(10.,(-2*(int)param->cuda_prec+4)); // implicit refinment limit of 1e-12
3902  const double iter_tol = (param->iter_res_offset[i] < prec_tol ? prec_tol : (param->iter_res_offset[i] *1.1));
3903  const double refine_tol = (param->tol_offset[i] == 0.0 ? iter_tol : param->tol_offset[i]);
3904  // refine if either L2 or heavy quark residual tolerances have not been met, only if desired residual is > 0
3905  if (param->true_res_offset[i] > refine_tol || rsd_hq > tol_hq) {
3906  if (getVerbosity() >= QUDA_SUMMARIZE)
3907  printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
3908  i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);
3909 
3910  // for staggered the shift is just a change in mass term (FIXME: for twisted mass also)
3911  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3912  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3913  dirac.setMass(sqrt(param->offset[i]/4));
3914  diracSloppy.setMass(sqrt(param->offset[i]/4));
3915  }
3916 
3917  DiracMatrix *m, *mSloppy;
3918 
3919  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3920  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3921  m = new DiracM(dirac);
3922  mSloppy = new DiracM(diracSloppy);
3923  } else {
3924  m = new DiracMdagM(dirac);
3925  mSloppy = new DiracMdagM(diracSloppy);
3926  }
3927 
3928  // need to curry in the shift if we are not doing staggered
3929  if (param->dslash_type != QUDA_ASQTAD_DSLASH && param->dslash_type != QUDA_STAGGERED_DSLASH) {
3930  m->shift = param->offset[i];
3931  mSloppy->shift = param->offset[i];
3932  }
3933 
3934  if (false) { // experimenting with Minimum residual extrapolation
3935  // only perform MRE using current and previously refined solutions
3936 #ifdef REFINE_INCREASING_MASS
3937  const int nRefine = i+1;
3938 #else
3939  const int nRefine = param->num_offset - i + 1;
3940 #endif
3941 
3942  std::vector<ColorSpinorField *> q;
3943  q.resize(nRefine);
3944  std::vector<ColorSpinorField *> z;
3945  z.resize(nRefine);
3946  cudaParam.create = QUDA_NULL_FIELD_CREATE;
3947  cudaColorSpinorField tmp(cudaParam);
3948 
3949  for (int j = 0; j < nRefine; j++) {
3950  q[j] = new cudaColorSpinorField(cudaParam);
3951  z[j] = new cudaColorSpinorField(cudaParam);
3952  }
3953 
3954  *z[0] = *x[0]; // zero solution already solved
3955 #ifdef REFINE_INCREASING_MASS
3956  for (int j=1; j<nRefine; j++) *z[j] = *x[j];
3957 #else
3958  for (int j=1; j<nRefine; j++) *z[j] = *x[param->num_offset-j];
3959 #endif
3960 
3961  bool orthogonal = true;
3962  bool apply_mat = true;
3963  bool hermitian = true;
3964  MinResExt mre(*m, orthogonal, apply_mat, hermitian, profileMulti);
3965  blas::copy(tmp, *b);
3966  mre(*x[i], tmp, z, q);
3967 
3968  for(int j=0; j < nRefine; j++) {
3969  delete q[j];
3970  delete z[j];
3971  }
3972  }
3973 
3974  SolverParam solverParam(refineparam);
3975  solverParam.iter = 0;
3977  solverParam.tol = (param->tol_offset[i] > 0.0 ? param->tol_offset[i] : iter_tol); // set L2 tolerance
3978  solverParam.tol_hq = param->tol_hq_offset[i]; // set heavy quark tolerance
3979  solverParam.delta = param->reliable_delta_refinement;
3980 
3981  {
3982  CG cg(*m, *mSloppy, *mSloppy, *mSloppy, solverParam, profileMulti);
3983  if (i==0)
3984  cg(*x[i], *b, p[i], r2_old[i]);
3985  else
3986  cg(*x[i], *b);
3987  }
3988 
3989  solverParam.true_res_offset[i] = solverParam.true_res;
3990  solverParam.true_res_hq_offset[i] = solverParam.true_res_hq;
3991  solverParam.updateInvertParam(*param,i);
3992 
3993  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3994  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3995  dirac.setMass(sqrt(param->offset[0]/4)); // restore just in case
3996  diracSloppy.setMass(sqrt(param->offset[0]/4)); // restore just in case
3997  }
3998 
3999  delete m;
4000  delete mSloppy;
4001  }
4002  }
4003  }
4004 
4005  // restore shifts
4006  for(int i=0; i < param->num_offset; i++) {
4007  param->offset[i] = unscaled_shifts[i];
4008  }
4009 
4010  profileMulti.TPSTART(QUDA_PROFILE_D2H);
4011 
4012  if (param->compute_action) {
4013  Complex action(0);
4014  for (int i=0; i<param->num_offset; i++) action += param->residue[i] * blas::cDotProduct(*b, *x[i]);
4015  param->action[0] = action.real();
4016  param->action[1] = action.imag();
4017  }
4018 
4019  for(int i=0; i < param->num_offset; i++) {
4020  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) { // rescale the solution
4021  blas::ax(sqrt(nb), *x[i]);
4022  }
4023 
4024  if (getVerbosity() >= QUDA_VERBOSE){
4025  double nx = blas::norm2(*x[i]);
4026  printfQuda("Solution %d = %g\n", i, nx);
4027  }
4028 
4029  if (!param->make_resident_solution) *h_x[i] = *x[i];
4030  }
4031  profileMulti.TPSTOP(QUDA_PROFILE_D2H);
4032 
4033  profileMulti.TPSTART(QUDA_PROFILE_EPILOGUE);
4034 
4035  if (!param->make_resident_solution) {
4036  for (auto v: solutionResident) if (v) delete v;
4037  solutionResident.clear();
4038  }
4039 
4040  profileMulti.TPSTOP(QUDA_PROFILE_EPILOGUE);
4041 
4042  profileMulti.TPSTART(QUDA_PROFILE_FREE);
4043  for(int i=0; i < param->num_offset; i++){
4044  delete h_x[i];
4045  //if (!param->make_resident_solution) delete x[i];
4046  }
4047 
4048  delete h_b;
4049  delete b;
4050 
4051  delete [] hp_x;
4052 
4053  delete d;
4054  delete dSloppy;
4055  delete dPre;
4056  delete dRefine;
4057  for (auto& pp : p) delete pp;
4058 
4059  profileMulti.TPSTOP(QUDA_PROFILE_FREE);
4060 
4061  popVerbosity();
4062 
4063  // cache is written out even if a long benchmarking job gets interrupted
4064  saveTuneCache();
4065 
4066  profileMulti.TPSTOP(QUDA_PROFILE_TOTAL);
4067 
4068  profilerStop(__func__);
4069 }
4070 
// Compute HISQ/ASQTAD link products from the input gauge field `inlink` using
// the coefficients in `path_coeff`: the fat links (saved to host `fatlink`
// when non-null), the long links (computed only when `longlink` is non-null),
// and the unitarized links (computed only when `ulink` is non-null).
// NOTE(review): this chunk is a lossy doxygen extraction -- original lines
// 4079, 4085, 4090-4092, 4103-4107 and 4126 are missing from this view,
// including the declarations of `gParam` and `cudaFatLink` used below;
// confirm against the upstream source before editing.
4071 void computeKSLinkQuda(void* fatlink, void* longlink, void* ulink, void* inlink, double *path_coeff, QudaGaugeParam *param)
4072 {
4073 #ifdef GPU_FATLINK
4074  profileFatLink.TPSTART(QUDA_PROFILE_TOTAL);
4075  profileFatLink.TPSTART(QUDA_PROFILE_INIT);
4076 
4077  checkGaugeParam(param);
4078 
4080  cpuGaugeField cpuFatLink(gParam); // create the host fatlink
4081  gParam.gauge = longlink;
4082  cpuGaugeField cpuLongLink(gParam); // create the host longlink
4083  gParam.gauge = ulink;
4084  cpuGaugeField cpuUnitarizedLink(gParam);
4086  gParam.gauge = inlink;
4087  cpuGaugeField cpuInLink(gParam); // create the host sitelink
4088 
4089  // create the device fields
4093  cudaGaugeField *cudaInLink = new cudaGaugeField(gParam);
4094  profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
4095 
// download the input links, then build an extended (halo-padded) copy for
// the stencil computations; the unextended copy is no longer needed
4096  cudaInLink->loadCPUField(cpuInLink, profileFatLink);
4097  cudaGaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink);
4098 
4099  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
4100  delete cudaInLink;
4101  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
4102 
4108 
// long links are optional -- only computed and saved when requested
4109  if (longlink) {
4110  profileFatLink.TPSTART(QUDA_PROFILE_INIT);
4111  cudaGaugeField *cudaLongLink = new cudaGaugeField(gParam);
4112  profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
4113 
4114  profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
4115  longKSLink(cudaLongLink, *cudaInLinkEx, path_coeff);
4116  profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
4117 
4118  cudaLongLink->saveCPUField(cpuLongLink, profileFatLink);
4119 
4120  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
4121  delete cudaLongLink;
4122  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
4123  }
4124 
4125  profileFatLink.TPSTART(QUDA_PROFILE_INIT);
4127  profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
4128 
// NOTE(review): `cudaFatLink` is presumably allocated on the missing line
// 4126 above -- it is always computed since `ulink` depends on it below
4129  profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
4130  fatKSLink(cudaFatLink, *cudaInLinkEx, path_coeff);
4131  profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
4132 
4133  if (fatlink) cudaFatLink->saveCPUField(cpuFatLink, profileFatLink);
4134 
4135  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
4136  delete cudaInLinkEx;
4137  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
4138 
// unitarized links: project the fat links back onto (near-)U(3) on the GPU,
// with SVD fallback enabled; failure count is checked via the mapped
// num_failures_h / num_failures_d pair
4139  if (ulink) {
4140  const double unitarize_eps = 1e-14;
4141  const double max_error = 1e-10;
4142  const int reunit_allow_svd = 1;
4143  const int reunit_svd_only = 0;
4144  const double svd_rel_error = 1e-6;
4145  const double svd_abs_error = 1e-6;
4146  quda::setUnitarizeLinksConstants(unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only,
4147  svd_rel_error, svd_abs_error);
4148 
4149  cudaGaugeField *cudaUnitarizedLink = new cudaGaugeField(gParam);
4150 
4151  profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
4152  *num_failures_h = 0;
4153  quda::unitarizeLinks(*cudaUnitarizedLink, *cudaFatLink, num_failures_d); // unitarize on the gpu
4154  if (*num_failures_h > 0) errorQuda("Error in unitarization component of the hisq fattening: %d failures", *num_failures_h);
4155  profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
4156 
4157  cudaUnitarizedLink->saveCPUField(cpuUnitarizedLink, profileFatLink);
4158 
4159  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
4160  delete cudaUnitarizedLink;
4161  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
4162  }
4163 
4164  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
4165  delete cudaFatLink;
4166  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
4167 
4168  profileFatLink.TPSTOP(QUDA_PROFILE_TOTAL);
4169 #else
4170  errorQuda("Fat-link has not been built");
4171 #endif // GPU_FATLINK
4172 }
4173 
// NOTE(review): the enclosing function signature (original line 4174) is
// missing from this extraction; judging by the body it computes a gauge-field
// pad length from a parameter struct `param` -- TODO confirm upstream.
// Under MULTI_GPU the pad is half the largest 3d face volume of the local
// lattice (checkerboarded, hence the /2); otherwise no padding is needed.
4175  int pad = 0;
4176 #ifdef MULTI_GPU
4177  int volume = param.x[0]*param.x[1]*param.x[2]*param.x[3];
4178  int face_size[4];
4179  for(int dir=0; dir<4; ++dir) face_size[dir] = (volume/param.x[dir])/2;
4180  pad = *std::max_element(face_size, face_size+4);
4181 #endif
4182 
4183  return pad;
4184 }
4185 
// Compute the gauge force for the Wilson-loop paths in `input_path_buf`
// (lengths `path_length`, weights `loop_coeff`) and accumulate it into the
// momentum field with step-size factor `eb3`; gauge/momentum fields are
// taken from host pointers or from the resident device fields per the
// use_resident_* / make_resident_* / return_result_mom flags.
// NOTE(review): lossy doxygen extraction -- original lines 4206, 4246,
// 4253, 4261-4263, 4272 and 4295-4296 are missing from this view (e.g. the
// host->device momentum copy and the `force` field allocation).
4186 int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int* path_length,
4187  double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam)
4188 {
4189 #ifdef GPU_GAUGE_FORCE
4190  profileGaugeForce.TPSTART(QUDA_PROFILE_TOTAL);
4191  profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);
4192 
4193  checkGaugeParam(qudaGaugeParam);
4194 
// host site-link wrapper is only needed when we are not reusing the
// resident device gauge field
4195  GaugeFieldParam gParam(siteLink, *qudaGaugeParam);
4196  gParam.site_offset = qudaGaugeParam->gauge_offset;
4197  gParam.site_size = qudaGaugeParam->site_size;
4198  cpuGaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new cpuGaugeField(gParam) : nullptr;
4199 
4200  cudaGaugeField* cudaSiteLink = nullptr;
4201 
4202  if (qudaGaugeParam->use_resident_gauge) {
4203  if (!gaugePrecise) errorQuda("No resident gauge field to use");
4204  cudaSiteLink = gaugePrecise;
4205  } else {
4207  gParam.reconstruct = qudaGaugeParam->reconstruct;
4208  gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
4209 
4210  cudaSiteLink = new cudaGaugeField(gParam);
4211  profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
4212 
4213  profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
4214  cudaSiteLink->loadCPUField(*cpuSiteLink);
4215  profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
4216 
4217  profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);
4218  }
4219 
// TIFR orders store the momentum as a full matrix; otherwise use the
// compressed anti-hermitian (reconstruct-10) representation
4220  GaugeFieldParam gParamMom(mom, *qudaGaugeParam, QUDA_ASQTAD_MOM_LINKS);
4221  if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER)
4222  gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
4223  else
4224  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
4225 
4226  gParamMom.site_offset = qudaGaugeParam->mom_offset;
4227  gParamMom.site_size = qudaGaugeParam->site_size;
4228  cpuGaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? new cpuGaugeField(gParamMom) : nullptr;
4229 
4230  cudaGaugeField* cudaMom = nullptr;
4231  if (qudaGaugeParam->use_resident_mom) {
4232  if (!momResident) errorQuda("No resident momentum field to use");
4233  cudaMom = momResident;
4234  if (qudaGaugeParam->overwrite_mom) cudaMom->zero();
4235  profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
4236  } else {
// NOTE(review): gParamMom.create is assigned twice (conditional on line 4237,
// then unconditionally QUDA_ZERO_FIELD_CREATE on line 4241); the second
// assignment wins, so the device momentum is always zero-initialized here
4237  gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
4238  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
4239  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
4240  gParamMom.setPrecision(qudaGaugeParam->cuda_prec, true);
4241  gParamMom.create = QUDA_ZERO_FIELD_CREATE;
4242  cudaMom = new cudaGaugeField(gParamMom);
4243  profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
4244  if (!qudaGaugeParam->overwrite_mom) {
4245  profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
4247  profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
4248  }
4249  }
4250 
4251  cudaGaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugeForce);
4252  // apply / remove phase as appropriate
4254 
4255  // actually do the computation
4256  profileGaugeForce.TPSTART(QUDA_PROFILE_COMPUTE);
4257  if (!forceMonitor()) {
4258  gaugeForce(*cudaMom, *cudaGauge, eb3, input_path_buf, path_length, loop_coeff, num_paths, max_length);
4259  } else {
4260  // if we are monitoring the force, separate the force computation from the momentum update
4264  gaugeForce(*force, *cudaGauge, 1.0, input_path_buf, path_length, loop_coeff, num_paths, max_length);
4265  updateMomentum(*cudaMom, eb3, *force, "gauge");
4266  delete force;
4267  }
4268  profileGaugeForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4269 
4270  if (qudaGaugeParam->return_result_mom) {
4271  profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
4273  profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
4274  }
4275 
// ownership hand-off: either promote the device fields to the resident
// globals (freeing any previous occupant) or free them
4276  profileGaugeForce.TPSTART(QUDA_PROFILE_FREE);
4277  if (qudaGaugeParam->make_resident_gauge) {
4278  if (gaugePrecise && gaugePrecise != cudaSiteLink) delete gaugePrecise;
4279  gaugePrecise = cudaSiteLink;
4280  } else {
4281  delete cudaSiteLink;
4282  }
4283 
4284  if (qudaGaugeParam->make_resident_mom) {
4285  if (momResident && momResident != cudaMom) delete momResident;
4286  momResident = cudaMom;
4287  } else {
4288  delete cudaMom;
4289  }
4290 
4291  if (cpuSiteLink) delete cpuSiteLink;
4292  if (cpuMom) delete cpuMom;
4293 
4294  if (qudaGaugeParam->make_resident_gauge) {
4297  } else {
4298  delete cudaGauge;
4299  }
4300  profileGaugeForce.TPSTOP(QUDA_PROFILE_FREE);
4301 
4302  profileGaugeForce.TPSTOP(QUDA_PROFILE_TOTAL);
4303 
4304 #else
4305  errorQuda("Gauge force has not been built");
4306 #endif // GPU_GAUGE_FORCE
4307  return 0;
4308 }
4309 
// NOTE(review): the function signature (original line 4310) is missing from
// this extraction. Judging by the body -- a host momentum wrapper `cpuMom`,
// the resident `momResident` device field, and the make_resident_mom /
// return_result_mom branches -- this is the entry point that downloads a
// host momentum field to the device (make_resident_mom) or uploads the
// resident momentum back to the host and frees it (return_result_mom).
// Confirm the exact name and parameters against the upstream source.
4311 {
4312  profileGaugeForce.TPSTART(QUDA_PROFILE_TOTAL);
4313  profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);
4314 
4315  checkGaugeParam(param);
4316 
4317  GaugeFieldParam gParamMom(mom, *param, QUDA_ASQTAD_MOM_LINKS);
4318  if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER)
4319  gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
4320  else
4321  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
4322  gParamMom.site_offset = param->mom_offset;
4323  gParamMom.site_size = param->site_size;
4324 
4325  cpuGaugeField cpuMom(gParamMom);
4326 
// free any previously resident momentum before allocating the new one
4328  if (momResident) delete momResident;
4329 
// NOTE(review): `create` is assigned twice (NULL on 4330, ZERO on 4334);
// the latter wins, so the new resident field is zero-initialized
4330  gParamMom.create = QUDA_NULL_FIELD_CREATE;
4331  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
4332  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
4333  gParamMom.setPrecision(param->cuda_prec, true);
4334  gParamMom.create = QUDA_ZERO_FIELD_CREATE;
4335  momResident = new cudaGaugeField(gParamMom);
4336  } else if (param->return_result_mom && !param->make_resident_mom) {
4337  if (!momResident) errorQuda("No resident momentum to return");
4338  } else {
4339  errorQuda("Unexpected combination make_resident_mom = %d return_result_mom = %d", param->make_resident_mom,
4341  }
4342 
4343  profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
4344 
4345  if (param->make_resident_mom) {
4346  // we are downloading the momentum from the host
4347  profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
4349  profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
4350  } else if (param->return_result_mom) {
4351  // we are uploading the momentum to the host
4352  profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
4354  profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
4355 
4356  profileGaugeForce.TPSTART(QUDA_PROFILE_FREE);
4357  delete momResident;
4358  momResident = nullptr;
4359  profileGaugeForce.TPSTOP(QUDA_PROFILE_FREE);
4360  }
4361 
4362  profileGaugeForce.TPSTOP(QUDA_PROFILE_TOTAL);
4363 }
4364 
// NOTE(review): the function signature (original line 4365) is missing from
// this extraction; the body computes the clover field from the resident
// precise gauge field (presumably createCloverQuda(QudaInvertParam*) --
// confirm upstream): build an extended gauge field, compute the field
// strength Fmunu from it, then fill cloverPrecise using the clover
// coefficient from the invert params.
4366 {
4367  profileClover.TPSTART(QUDA_PROFILE_TOTAL);
4368  if (!cloverPrecise) errorQuda("Clover field not allocated");
4369 
4371  // for clover we optimize to only send depth 1 halos in y/z/t (FIXME - make work for x, make robust in general)
4372  int R[4];
// note: this local R shadows the file-scope R; halo depth is zero in
// directions that are not partitioned (unless redundant_comms is set)
4373  for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d));
4374  cudaGaugeField *gauge = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profileClover, false, recon);
4375 
4376  profileClover.TPSTART(QUDA_PROFILE_INIT);
4377  // create the Fmunu field
4379  tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
4380  tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4381  tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
4382  cudaGaugeField Fmunu(tensorParam);
4383  profileClover.TPSTOP(QUDA_PROFILE_INIT);
4384  profileClover.TPSTART(QUDA_PROFILE_COMPUTE);
4385  computeFmunu(Fmunu, *gauge);
4386  computeClover(*cloverPrecise, Fmunu, invertParam->clover_coeff);
4387  profileClover.TPSTOP(QUDA_PROFILE_COMPUTE);
4388  profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
4389 
4390  // FIXME always preserve the extended gauge
4391  extendedGaugeResident = gauge;
4392 }
4393 
// Allocate a device gauge field of the given geometry (scalar or vector
// only) and, when a host pointer `gauge` is supplied, upload its contents.
// Returns the device field as an opaque pointer (to be released with
// destroyGaugeFieldQuda).
// NOTE(review): lossy extraction -- original lines 4395-4396 (the gParam
// declaration), 4403-4405 and 4409 (the host->device copy) are missing.
4394 void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
4395 {
4397  gParam.geometry = static_cast<QudaFieldGeometry>(geometry);
4398  if (geometry != QUDA_SCALAR_GEOMETRY && geometry != QUDA_VECTOR_GEOMETRY)
4399  errorQuda("Only scalar and vector geometries are supported\n");
4400 
4401  cpuGaugeField *cpuGauge = nullptr;
4402  if (gauge) cpuGauge = new cpuGaugeField(gParam);
4403 
4406  auto* cudaGauge = new cudaGaugeField(gParam);
4407 
4408  if (gauge) {
4410  delete cpuGauge;
4411  }
4412 
4413  return cudaGauge;
4414 }
4415 
4416 
// Copy a device gauge field (opaque pointer `inGauge`, as returned by
// createGaugeFieldQuda) back to the host buffer `gauge`.
// NOTE(review): lossy extraction -- the body lines performing the actual
// device->host copy (original 4420-4426) are missing; only the cast of the
// opaque handle survives in this view.
4417 void saveGaugeFieldQuda(void* gauge, void* inGauge, QudaGaugeParam* param){
4418 
4419  auto* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);
4420 
4423 
4426 
4427 }
4428 
4429 
4430 void destroyGaugeFieldQuda(void* gauge){
4431  auto* g = reinterpret_cast<cudaGaugeField*>(gauge);
4432  delete g;
4433 }
4434 
4435 
// Compute the staggered fermion force from the resident solution vectors
// and accumulate it into the momentum field with coefficient dt*delta.
// For each of the inv_param->num_offset vectors the odd parity is
// reconstructed via Dslash, then the quark-field outer product is summed
// into the force field with weight inv_param->residue[i].
// NOTE(review): lossy doxygen extraction -- many original lines are missing
// from this view (e.g. 4437 of the signature, 4442-4460 with the gParam /
// cpuMom / cudaMom / cudaForce declarations, 4481, 4486, 4490, 4493, 4498-
// 4500, 4510, 4519, 4531, 4563, 4565, 4570-4572, 4575); confirm upstream
// before editing.
4436 void computeStaggeredForceQuda(void* h_mom, double dt, double delta, void *, void **x,
4438 {
4439  profileStaggeredForce.TPSTART(QUDA_PROFILE_TOTAL);
4440  profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT);
4441 
4443 
4444  // create the host momentum field
4448 
4449  // create the device momentum field
4455 
4456  // create temporary field for quark-field outer product
4461  GaugeField *cudaForce_[2] = {&cudaForce};
4462 
// staggered quark fields: single spin, three colors; fifth dimension of
// extent 1 is used for the multi-rhs layout
4463  ColorSpinorParam qParam;
4465  qParam.nColor = 3;
4466  qParam.nSpin = 1;
4469  qParam.nDim = 5; // 5 since staggered mrhs
4470  qParam.setPrecision(gParam.Precision());
4471  qParam.pad = 0;
4472  for(int dir=0; dir<4; ++dir) qParam.x[dir] = gParam.x[dir];
4473  qParam.x[4] = 1;
4474  qParam.create = QUDA_NULL_FIELD_CREATE;
4477 
4478  profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT);
4479  profileStaggeredForce.TPSTART(QUDA_PROFILE_H2D);
4480 
4482  if (!momResident) errorQuda("Cannot use resident momentum field since none appears resident");
4483  cudaMom = momResident;
4484  } else {
4485  // download the initial momentum (FIXME make an option just to return?)
4487  }
4488 
4489  // resident gauge field is required
4491  errorQuda("Resident gauge field is required");
4492 
4494  errorQuda("Gauge field requires the staggered phase factors to be applied");
4495  }
4496 
4497  // check if staggered phase is the desired one
4499  errorQuda("Requested staggered phase %d, but found %d\n",
4501  }
4502 
4503  profileStaggeredForce.TPSTOP(QUDA_PROFILE_H2D);
4504  profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT);
4505 
4506  const int nvector = inv_param->num_offset;
4507  std::vector<ColorSpinorField*> X(nvector);
4508  for ( int i=0; i<nvector; i++) X[i] = ColorSpinorField::Create(qParam);
4509 
4511  if (solutionResident.size() < (unsigned int)nvector)
4512  errorQuda("solutionResident.size() %lu does not match number of shifts %d",
4513  solutionResident.size(), nvector);
4514  }
4515 
4516  // create the staggered operator
4517  DiracParam diracParam;
4518  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
4520  if (!pc_solve)
4521  errorQuda("Preconditioned solve type required not %d\n", inv_param->solve_type);
4522  setDiracParam(diracParam, inv_param, pc_solve);
4523  Dirac *dirac = Dirac::create(diracParam);
4524 
4525  profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT);
4526  profileStaggeredForce.TPSTART(QUDA_PROFILE_PREAMBLE);
4527 
// reconstruct the full solution: even parity comes from the resident
// solution (copy on the missing line 4531), odd parity via Dslash
4528  for (int i=0; i<nvector; i++) {
4529  ColorSpinorField &x = *(X[i]);
4530 
4532  else errorQuda("%s requires resident solution", __func__);
4533 
4534  // set the odd solution component
4535  dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY);
4536  }
4537 
4538  profileStaggeredForce.TPSTOP(QUDA_PROFILE_PREAMBLE);
4539  profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE);
4540 
4541 #if 0
4543  for (auto v : solutionResident) if (v) delete solutionResident[i];
4544  solutionResident.clear();
4545  }
4546 #endif
4547  delete dirac;
4548 
4549  profileStaggeredForce.TPSTOP(QUDA_PROFILE_FREE);
4550  profileStaggeredForce.TPSTART(QUDA_PROFILE_COMPUTE);
4551 
4552  // compute quark-field outer product
4553  for (int i=0; i<nvector; i++) {
4554  ColorSpinorField &x = *(X[i]);
4555  // second component is zero since we have no three hop term
4556  double coeff[2] = {inv_param->residue[i], 0.0};
4557 
4558  // Operate on even-parity sites
4559  computeStaggeredOprod(cudaForce_, x, coeff, 1);
4560  }
4561 
4562  // mom += delta * [U * force]TA
4564  updateMomentum(*cudaMom, dt * delta, cudaForce, "staggered");
4566 
4567  profileStaggeredForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4568  profileStaggeredForce.TPSTART(QUDA_PROFILE_D2H);
4569 
4571  // copy the momentum field back to the host
4573  }
4574 
4576  // make the momentum field resident
4577  momResident = cudaMom;
4578  } else {
4579  delete cudaMom;
4580  }
4581 
4582  profileStaggeredForce.TPSTOP(QUDA_PROFILE_D2H);
4583  profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE);
4584 
4585  for (int i=0; i<nvector; i++) delete X[i];
4586 
4587  profileStaggeredForce.TPSTOP(QUDA_PROFILE_FREE);
4588  profileStaggeredForce.TPSTOP(QUDA_PROFILE_TOTAL);
4589 }
4590 
// Compute the HISQ fermion force: accumulate quark-field outer products
// (staple, one-link and naik terms) from the `fermion` vectors, apply the
// level-2 staple and long-link force terms on the w-links, unitarize the
// intermediate force against the v-links, apply the fat7 staple term on the
// u-links, complete the force, and fold it into the momentum with step dt.
// NOTE(review): lossy doxygen extraction -- original lines 4602 (the final
// gParam parameter of the signature), 4615, 4640, 4643, 4648, 4654, 4657,
// 4659, 4666, 4672, 4683-4684, 4692, 4696-4697, 4756 and others are missing
// from this view; confirm upstream before editing.
4591 void computeHISQForceQuda(void* const milc_momentum,
4592  double dt,
4593  const double level2_coeff[6],
4594  const double fat7_coeff[6],
4595  const void* const w_link,
4596  const void* const v_link,
4597  const void* const u_link,
4598  void **fermion,
4599  int num_terms,
4600  int num_naik_terms,
4601  double **coeff,
4603 {
4604 #ifdef GPU_STAGGERED_OPROD
4605  using namespace quda;
4606  using namespace quda::fermion_force;
4607  profileHISQForce.TPSTART(QUDA_PROFILE_TOTAL);
4608  if (gParam->gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported input field order %d", gParam->gauge_order);
4609 
4610  checkGaugeParam(gParam);
4611 
4612  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
4613 
4614  // create the device outer-product field
4616  oParam.nFace = 0;
4617  oParam.create = QUDA_ZERO_FIELD_CREATE;
4618  oParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4619  cudaGaugeField *stapleOprod = new cudaGaugeField(oParam);
4620  cudaGaugeField *oneLinkOprod = new cudaGaugeField(oParam);
4621  cudaGaugeField *naikOprod = new cudaGaugeField(oParam);
4622 
4623  {
4624  // default settings for the unitarization
4625  const double unitarize_eps = 1e-14;
4626  const double hisq_force_filter = 5e-5;
4627  const double max_det_error = 1e-10;
4628  const bool allow_svd = true;
4629  const bool svd_only = false;
4630  const double svd_rel_err = 1e-8;
4631  const double svd_abs_err = 1e-8;
4632 
4633  setUnitarizeForceConstants(unitarize_eps, hisq_force_filter, max_det_error, allow_svd, svd_only, svd_rel_err, svd_abs_err);
4634  }
4635 
4636  double act_path_coeff[6] = {0,1,level2_coeff[2],level2_coeff[3],level2_coeff[4],level2_coeff[5]};
4637  // You have to look at the MILC routine to understand the following
4638  // Basically, I have already absorbed the one-link coefficient
4639 
4641  //param.nFace = 0;
4642  param.order = QUDA_MILC_GAUGE_ORDER;
4644  param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
4645  cpuGaugeField* cpuMom = (!gParam->use_resident_mom) ? new cpuGaugeField(param) : nullptr;
4646 
// host wrappers for the three input link fields (w/v/u)
4647  param.link_type = QUDA_GENERAL_LINKS;
4649  param.gauge = (void*)w_link;
4650  cpuGaugeField cpuWLink(param);
4651  param.gauge = (void*)v_link;
4652  cpuGaugeField cpuVLink(param);
4653  param.gauge = (void*)u_link;
4655 
4656  param.create = QUDA_ZERO_FIELD_CREATE;
4658  param.link_type = QUDA_ASQTAD_MOM_LINKS;
4660  GaugeFieldParam momParam(param);
4661 
4662  param.create = QUDA_ZERO_FIELD_CREATE;
4663  param.link_type = QUDA_GENERAL_LINKS;
4664  param.setPrecision(gParam->cpu_prec, true);
4665 
// extend the local volume by 2*R in each direction for halo exchange
4667  for (int dir=0; dir<4; ++dir) {
4668  param.x[dir] += 2*R[dir];
4669  param.r[dir] = R[dir];
4670  }
4671 
4673  param.create = QUDA_ZERO_FIELD_CREATE;
4674  param.setPrecision(gParam->cpu_prec);
4675  param.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
4676 
4677  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
4678 
4679  { // do outer-product computation
4680  ColorSpinorParam qParam;
4681  qParam.nColor = 3;
4682  qParam.nSpin = 1;
4685  qParam.nDim = 4;
4686  qParam.setPrecision(oParam.Precision());
4687  qParam.pad = 0;
4688  for (int dir=0; dir<4; ++dir) qParam.x[dir] = oParam.x[dir];
4689 
4690  // create the device quark field
4691  qParam.create = QUDA_NULL_FIELD_CREATE;
4693  cudaColorSpinorField cudaQuark(qParam);
4694 
4695  // create the host quark field
4698  qParam.v = fermion[0];
4699 
4700  { // regular terms
4701  GaugeField *oprod[2] = {stapleOprod, naikOprod};
4702 
4703  // loop over different quark fields
4704  for(int i=0; i<num_terms; ++i){
4705 
4706  // Wrap the MILC quark field
4707  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
4708  qParam.v = fermion[i];
4709  cpuColorSpinorField cpuQuark(qParam); // create host quark field
4710  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
4711 
4712  profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
4713  cudaQuark = cpuQuark;
4714  profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
4715 
4716  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
4717  computeStaggeredOprod(oprod, cudaQuark, coeff[i], 3);
4718  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4719  }
4720  }
4721 
4722  { // naik terms
// the one-link oprod starts as the staple oprod scaled by the one-link
// coefficient (already absorbed into level2_coeff[0], per note above)
4723  oneLinkOprod->copy(*stapleOprod);
4724  ax(level2_coeff[0], *oneLinkOprod);
4725  GaugeField *oprod[2] = {oneLinkOprod, naikOprod};
4726 
4727  // loop over different quark fields
4728  for(int i=0; i<num_naik_terms; ++i){
4729 
4730  // Wrap the MILC quark field
4731  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
4732  qParam.v = fermion[i + num_terms - num_naik_terms];
4733  cpuColorSpinorField cpuQuark(qParam); // create host quark field
4734  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
4735 
4736  profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
4737  cudaQuark = cpuQuark;
4738  profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
4739 
4740  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
4741  computeStaggeredOprod(oprod, cudaQuark, coeff[i + num_terms], 3);
4742  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4743  }
4744  }
4745  }
4746 
// move the outer products into extended fields, freeing the unextended ones
4747  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
4748  cudaGaugeField* cudaInForce = new cudaGaugeField(param);
4749  copyExtendedGauge(*cudaInForce, *stapleOprod, QUDA_CUDA_FIELD_LOCATION);
4750  delete stapleOprod;
4751 
4752  cudaGaugeField* cudaOutForce = new cudaGaugeField(param);
4753  copyExtendedGauge(*cudaOutForce, *oneLinkOprod, QUDA_CUDA_FIELD_LOCATION);
4754  delete oneLinkOprod;
4755 
4757  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
4758 
4759  cudaGauge->loadCPUField(cpuWLink, profileHISQForce);
4760 
4761  cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true);
4762  cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true);
4763  cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true);
4764 
4765  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
4766  hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaGauge, act_path_coeff);
4767  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4768 
4769  // Load naik outer product
4770  copyExtendedGauge(*cudaInForce, *naikOprod, QUDA_CUDA_FIELD_LOCATION);
4771  cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true);
4772  delete naikOprod;
4773 
4774  // Compute Naik three-link term
4775  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
4776  hisqLongLinkForce(*cudaOutForce, *cudaInForce, *cudaGauge, act_path_coeff[1]);
4777  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4778 
4779  cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true);
4780 
4781  // load v-link
4782  cudaGauge->loadCPUField(cpuVLink, profileHISQForce);
4783  cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true);
4784 
4785  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
4786  *num_failures_h = 0;
4787  unitarizeForce(*cudaInForce, *cudaOutForce, *cudaGauge, num_failures_d);
4788  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4789 
4790  if (*num_failures_h>0) errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h);
4791 
4792  qudaMemset((void **)(cudaOutForce->Gauge_p()), 0, cudaOutForce->Bytes());
4793 
4794  // read in u-link
4795  cudaGauge->loadCPUField(cpuULink, profileHISQForce);
4796  cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true);
4797 
4798  // Compute Fat7-staple term
4799  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
4800  hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaGauge, fat7_coeff);
4801  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4802 
4803  delete cudaInForce;
4804  cudaGaugeField* cudaMom = new cudaGaugeField(momParam);
4805 
4806  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
4807  hisqCompleteForce(*cudaOutForce, *cudaGauge);
4808  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
4809 
4810  if (gParam->use_resident_mom) {
4811  if (!momResident) errorQuda("No resident momentum field to use");
4812  updateMomentum(*momResident, dt, *cudaOutForce, "hisq");
4813  } else {
4814  updateMomentum(*cudaMom, dt, *cudaOutForce, "hisq");
4815  }
4816 
// NOTE(review): the inner `if (gParam->return_result_mom)` on line 4819 is
// redundant -- the same condition already guards the outer block
4817  if (gParam->return_result_mom) {
4818  // Close the paths, make anti-hermitian, and store in compressed format
4819  if (gParam->return_result_mom) cudaMom->saveCPUField(*cpuMom, profileHISQForce);
4820  }
4821 
4822  profileHISQForce.TPSTART(QUDA_PROFILE_FREE);
4823 
4824  if (cpuMom) delete cpuMom;
4825 
// deleting a null momResident is safe (delete on nullptr is a no-op)
4826  if (!gParam->make_resident_mom) {
4827  delete momResident;
4828  momResident = nullptr;
4829  }
4830  if (cudaMom) delete cudaMom;
4831  delete cudaOutForce;
4832  delete cudaGauge;
4833  profileHISQForce.TPSTOP(QUDA_PROFILE_FREE);
4834 
4835  profileHISQForce.TPSTOP(QUDA_PROFILE_TOTAL);
4836 
4837 #else
4838  errorQuda("HISQ force has not been built");
4839 #endif
4840 }
4841 
// Compute the clover (Wilson-clover fermion) contribution to the HMC momentum
// force and accumulate it into the host momentum field h_mom.
//   h_x / h_p: host quark solution/derivative fields (nvector of each)
//   coeff:     per-vector force coefficients; kappa2, ck: action couplings
//   multiplicity: rational-approximation multiplicity for the trace term
// NOTE(review): the doxygen extraction dropped several interior lines of this
// function (end of the parameter list, the fParam/gaugeEx declarations and some
// if-openers), so the block below is not a complete transcription — restore the
// missing lines from the upstream source before compiling.
void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **h_p,
                            double *coeff, double kappa2, double ck,
                            int nvector, double multiplicity, void *gauge,

  using namespace quda;
  profileCloverForce.TPSTART(QUDA_PROFILE_TOTAL);
  profileCloverForce.TPSTART(QUDA_PROFILE_INIT);

  checkGaugeParam(gauge_param);
  if (!gaugePrecise) errorQuda("No resident gauge field");

  // create the host momentum field
  // NOTE(review): fParam declaration lost in extraction (presumably a
  // GaugeFieldParam built from h_mom and *gauge_param — confirm upstream)
  fParam.order = gauge_param->gauge_order;
  cpuGaugeField cpuMom(fParam);

  // create the device momentum field
  fParam.create = QUDA_ZERO_FIELD_CREATE;
  fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
  cudaGaugeField cudaMom(fParam);

  // create the device force field
  fParam.link_type = QUDA_GENERAL_LINKS;
  fParam.create = QUDA_ZERO_FIELD_CREATE;
  fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
  cudaGaugeField cudaForce(fParam);

  // quark-field parameters mirror the geometry of the force field
  ColorSpinorParam qParam;
  qParam.nColor = 3;
  qParam.nSpin = 4;
  qParam.nDim = 4;
  qParam.setPrecision(fParam.Precision());
  qParam.pad = 0;
  for(int dir=0; dir<4; ++dir) qParam.x[dir] = fParam.x[dir];

  // create the device quark field
  qParam.create = QUDA_NULL_FIELD_CREATE;

  std::vector<ColorSpinorField*> quarkX, quarkP;
  for (int i=0; i<nvector; i++) {
    quarkX.push_back(ColorSpinorField::Create(qParam));
    quarkP.push_back(ColorSpinorField::Create(qParam));
  }

  // single-parity temporary for the preconditioned operator
  qParam.x[0] /= 2;
  cudaColorSpinorField tmp(qParam);

  // create the host quark field
  qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // need expose this to interface

  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
  DiracParam diracParam;
  setDiracParam(diracParam, inv_param, pc_solve);
  diracParam.tmp1 = &tmp; // use as temporary for dirac->M
  Dirac *dirac = Dirac::create(diracParam);

  // NOTE(review): the enclosing `if (inv_param->use_resident_solution)`-style
  // opener appears to have been dropped by extraction (stray `}` below)
  if (solutionResident.size() < (unsigned int)nvector)
    errorQuda("solutionResident.size() %lu does not match number of shifts %d",
	      solutionResident.size(), nvector);
  }


  // create oprod and trace fields
  fParam.geometry = QUDA_TENSOR_GEOMETRY;
  cudaGaugeField oprod(fParam);

  profileCloverForce.TPSTOP(QUDA_PROFILE_INIT);
  profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);

  std::vector<double> force_coeff(nvector);
  // loop over different quark fields
  for(int i=0; i<nvector; i++){
    ColorSpinorField &x = *(quarkX[i]);
    ColorSpinorField &p = *(quarkP[i]);

    // for downloading x_e
    qParam.x[0] /= 2;

    // Wrap the even-parity MILC quark field
    profileCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE);
    profileCloverForce.TPSTART(QUDA_PROFILE_INIT);
    qParam.v = h_x[i];
    cpuColorSpinorField cpuQuarkX(qParam); // create host quark field
    profileCloverForce.TPSTOP(QUDA_PROFILE_INIT);

    profileCloverForce.TPSTART(QUDA_PROFILE_H2D);
    x.Even() = cpuQuarkX;
    profileCloverForce.TPSTOP(QUDA_PROFILE_H2D);

    profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);
    gamma5(x.Even(), x.Even());
  } else {
    // reuse the resident solver solution instead of downloading from the host
    x.Even() = *(solutionResident[i]);
  }

  // reconstruct full-lattice x and p from the even-parity solution
  dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY);
  dirac->M(p.Even(), x.Even());
  dirac->Dslash(p.Odd(), p.Even(), QUDA_ODD_PARITY);

  gamma5(x, x);
  gamma5(p, p);

  force_coeff[i] = 2.0*dt*coeff[i]*kappa2;
  }

  computeCloverForce(cudaForce, *gaugePrecise, quarkX, quarkP, force_coeff);

  // In double precision the clover derivative is faster with no reconstruct
  cudaGaugeField *u = &gaugeEx;
  if (gaugeEx.Reconstruct() == QUDA_RECONSTRUCT_12 && gaugeEx.Precision() == QUDA_DOUBLE_PRECISION) {
    GaugeFieldParam param(gaugeEx);
    u = new cudaGaugeField(param);
    u -> copy(gaugeEx);
  }

  computeCloverSigmaTrace(oprod, *cloverPrecise, 2.0*ck*multiplicity*dt);

  /* Now the U dA/dU terms */
  std::vector< std::vector<double> > ferm_epsilon(nvector);
  for (int shift = 0; shift < nvector; shift++) {
    // NOTE(review): BUG — reserve() does not change size(), so the operator[]
    // writes below are out-of-bounds (undefined behavior); this should be
    // resize(2) (or two push_backs)
    ferm_epsilon[shift].reserve(2);
    ferm_epsilon[shift][0] = 2.0*ck*coeff[shift]*dt;
    ferm_epsilon[shift][1] = -kappa2 * 2.0*ck*coeff[shift]*dt;
  }

  computeCloverSigmaOprod(oprod, quarkX, quarkP, ferm_epsilon);

  // exchange halos so the derivative can access neighboring oprod sites
  cudaGaugeField *oprodEx = createExtendedGauge(oprod, R, profileCloverForce);

  profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);

  cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_ODD_PARITY);
  cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_EVEN_PARITY);

  if (u != &gaugeEx) delete u;

  updateMomentum(cudaMom, -1.0, cudaForce, "clover");
  profileCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE);

  // copy the outer product field back to the host
  profileCloverForce.TPSTART(QUDA_PROFILE_D2H);
  profileCloverForce.TPSTOP(QUDA_PROFILE_D2H);

  profileCloverForce.TPSTART(QUDA_PROFILE_FREE);

  for (int i=0; i<nvector; i++) {
    delete quarkX[i];
    delete quarkP[i];
  }

#if 0
  for (auto v : solutionResident) if (v) delete v;
  solutionResident.clear();
  }
#endif
  delete dirac;
  profileCloverForce.TPSTOP(QUDA_PROFILE_FREE);

  profileCloverForce.TPSTOP(QUDA_PROFILE_TOTAL);
}
5023 
5024 
5025 
// Evolve the gauge field by one molecular-dynamics step: U <- exp(dt * mom) U.
//   conj_mom: treat the momentum as conjugated; exact: use exact exponential
// Honors use_resident_* / make_resident_* / return_result_gauge flags in param
// for field residency on the device.
// NOTE(review): extraction dropped interior lines (the trailing
// `QudaGaugeParam* param` in the signature, the gParam construction, the
// device cudaMom allocation/load) — restore from upstream before compiling.
void updateGaugeFieldQuda(void* gauge,
                          void* momentum,
                          double dt,
                          int conj_mom,
                          int exact,
{
  profileGaugeUpdate.TPSTART(QUDA_PROFILE_TOTAL);

  checkGaugeParam(param);

  profileGaugeUpdate.TPSTART(QUDA_PROFILE_INIT);

  // create the host fields
  // a host-side gauge wrapper is only needed if we must read or write h_gauge
  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;

  GaugeFieldParam gParamMom(momentum, *param);
  // TIFR momentum orderings store full (unreconstructed) matrices
  gParamMom.reconstruct = (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ?
  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
  gParamMom.site_offset = param->mom_offset;
  gParamMom.site_size = param->site_size;
  cpuGaugeField *cpuMom = !param->use_resident_mom ? new cpuGaugeField(gParamMom) : nullptr;

  // create the device fields
  gParam.pad = 0;

  cudaGaugeField *cudaInGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr;
  auto *cudaOutGauge = new cudaGaugeField(gParam);

  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_INIT);

  profileGaugeUpdate.TPSTART(QUDA_PROFILE_H2D);

  if (!param->use_resident_gauge) { // load fields onto the device
    cudaInGauge->loadCPUField(*cpuGauge);
  } else { // or use resident fields already present
    if (!gaugePrecise) errorQuda("No resident gauge field allocated");
    cudaInGauge = gaugePrecise;
    gaugePrecise = nullptr; // ownership transferred to cudaInGauge
  }

  if (!param->use_resident_mom) {
  } else {
    if (!momResident) errorQuda("No resident mom field allocated");
    cudaMom = momResident;
    momResident = nullptr; // ownership transferred to cudaMom
  }

  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_H2D);

  // perform the update
  profileGaugeUpdate.TPSTART(QUDA_PROFILE_COMPUTE);
  updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom,
		   (bool)conj_mom, (bool)exact);
  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_COMPUTE);

  if (param->return_result_gauge) {
    // copy the gauge field back to the host
    profileGaugeUpdate.TPSTART(QUDA_PROFILE_D2H);
    cudaOutGauge->saveCPUField(*cpuGauge);
    profileGaugeUpdate.TPSTOP(QUDA_PROFILE_D2H);
  }

  profileGaugeUpdate.TPSTART(QUDA_PROFILE_FREE);
  if (param->make_resident_gauge) {
    if (gaugePrecise != nullptr) delete gaugePrecise;
    gaugePrecise = cudaOutGauge; // keep the updated gauge on the device
  } else {
    delete cudaOutGauge;
  }

  if (param->make_resident_mom) {
    if (momResident != nullptr && momResident != cudaMom) delete momResident;
    momResident = cudaMom;
  } else {
    delete cudaMom;
  }

  delete cudaInGauge;
  if (cpuMom) delete cpuMom;
  if (cpuGauge) delete cpuGauge;

  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_FREE);
  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_TOTAL);
}
5125 
 // Project each link of the gauge field back onto SU(3) to the given tolerance,
 // aborting via errorQuda if any link fails to unitarize. Respects
 // use_resident_gauge / make_resident_gauge / return_result_gauge in param.
 // NOTE(review): extraction dropped lines here (gParam construction, the device
 // cudaGauge allocation, the H2D load and D2H save) — restore from upstream.
 void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) {
   profileProject.TPSTART(QUDA_PROFILE_TOTAL);

   profileProject.TPSTART(QUDA_PROFILE_INIT);
   checkGaugeParam(param);

   // create the gauge field
   // host wrapper only needed when reading from or writing back to gauge_h
   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
   cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;

   // create the device fields
   profileProject.TPSTOP(QUDA_PROFILE_INIT);

   if (param->use_resident_gauge) {
     if (!gaugePrecise) errorQuda("No resident gauge field to use");
     gaugePrecise = nullptr; // ownership taken from the resident slot
   } else {
     profileProject.TPSTART(QUDA_PROFILE_H2D);
     profileProject.TPSTOP(QUDA_PROFILE_H2D);
   }

   profileProject.TPSTART(QUDA_PROFILE_COMPUTE);
   *num_failures_h = 0; // failure counter is host-mapped, device writes num_failures_d

   // project onto SU(3)
   projectSU3(*cudaGauge, tol, num_failures_d);

   profileProject.TPSTOP(QUDA_PROFILE_COMPUTE);

   if(*num_failures_h>0)
     errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);

   profileProject.TPSTART(QUDA_PROFILE_D2H);
   profileProject.TPSTOP(QUDA_PROFILE_D2H);

   if (param->make_resident_gauge) {
     if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise;
   } else {
     delete cudaGauge;
   }

   profileProject.TPSTART(QUDA_PROFILE_FREE);
   if (cpuGauge) delete cpuGauge;
   profileProject.TPSTOP(QUDA_PROFILE_FREE);

   profileProject.TPSTOP(QUDA_PROFILE_TOTAL);
 }
5186 
 // Apply or remove the staggered phases on the gauge field (direction chosen by
 // param fields). Follows the same resident-field conventions as projectSU3Quda.
 // NOTE(review): extraction dropped lines (gParam construction, device
 // cudaGauge allocation, the actual applyStaggeredPhase/removeStaggeredPhase
 // call and the H2D/D2H transfers) — restore from upstream.
 void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) {
   profilePhase.TPSTART(QUDA_PROFILE_TOTAL);

   profilePhase.TPSTART(QUDA_PROFILE_INIT);
   checkGaugeParam(param);

   // create the gauge field
   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
   cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;

   // create the device fields
   profilePhase.TPSTOP(QUDA_PROFILE_INIT);

   if (param->use_resident_gauge) {
     if (!gaugePrecise) errorQuda("No resident gauge field to use");
   } else {
     profilePhase.TPSTART(QUDA_PROFILE_H2D);
     profilePhase.TPSTOP(QUDA_PROFILE_H2D);
   }

   profilePhase.TPSTART(QUDA_PROFILE_COMPUTE);
   *num_failures_h = 0;

   // apply / remove phase as appropriate

   profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE);

   profilePhase.TPSTART(QUDA_PROFILE_D2H);
   profilePhase.TPSTOP(QUDA_PROFILE_D2H);

   if (param->make_resident_gauge) {
     if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise;
   } else {
     delete cudaGauge;
   }

   profilePhase.TPSTART(QUDA_PROFILE_FREE);
   if (cpuGauge) delete cpuGauge;
   profilePhase.TPSTOP(QUDA_PROFILE_FREE);

   profilePhase.TPSTOP(QUDA_PROFILE_TOTAL);
 }
5240 
5241 // evaluate the momentum action
// Compute and return the momentum contribution to the HMC action,
// sum_{x,mu} Tr[mom^2], either from the host momentum buffer or the
// device-resident momentum field (param->use_resident_mom).
// NOTE(review): extraction dropped the cpuMom/cudaMom declarations and the
// host->device momentum load — restore from upstream before compiling.
double momActionQuda(void* momentum, QudaGaugeParam* param)
{
  profileMomAction.TPSTART(QUDA_PROFILE_TOTAL);

  profileMomAction.TPSTART(QUDA_PROFILE_INIT);
  checkGaugeParam(param);

  // create the momentum fields


  // create the device fields


  profileMomAction.TPSTOP(QUDA_PROFILE_INIT);

  profileMomAction.TPSTART(QUDA_PROFILE_H2D);
  if (!param->use_resident_mom) {
  } else {
    if (!momResident) errorQuda("No resident mom field allocated");
    cudaMom = momResident;
  }
  profileMomAction.TPSTOP(QUDA_PROFILE_H2D);

  // perform the update
  profileMomAction.TPSTART(QUDA_PROFILE_COMPUTE);
  double action = computeMomAction(*cudaMom);
  profileMomAction.TPSTOP(QUDA_PROFILE_COMPUTE);

  profileMomAction.TPSTART(QUDA_PROFILE_FREE);
  if (param->make_resident_mom) {
    // keep the (possibly newly uploaded) momentum on the device
    if (momResident != nullptr && momResident != cudaMom) delete momResident;
    momResident = cudaMom;
  } else {
    delete cudaMom;
    momResident = nullptr;
  }
  if (cpuMom) {
    delete cpuMom;
  }

  profileMomAction.TPSTOP(QUDA_PROFILE_FREE);
  profileMomAction.TPSTOP(QUDA_PROFILE_TOTAL);

  return action;
}
5298 
5299 /*
5300  The following functions are for the Fortran interface.
5301 */
5302 
// Fortran interface wrappers: trailing-underscore entry points that forward to
// the corresponding C API functions, dereferencing pointer-passed scalars
// (Fortran passes all arguments by reference).
void init_quda_(int *dev) { initQuda(*dev); }
void init_quda_device_(int *dev) { initQudaDevice(*dev); }
void end_quda_() { endQuda(); }
void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param) { loadGaugeQuda(h_gauge, param); }
void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
{ loadCloverQuda(h_clover, h_clovinv, inv_param); }
void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
		  QudaParity *parity) { dslashQuda(h_out, h_in, inv_param, *parity); }
void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
		  QudaParity *parity, int *inverse) { cloverQuda(h_out, h_in, inv_param, *parity, *inverse); }
void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
{ MatQuda(h_out, h_in, inv_param); }
void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
{ MatDagMatQuda(h_out, h_in, inv_param); }
5321 void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param) {
5322  fflush(stdout);
5323  // ensure that fifth dimension is set to 1
5324  if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) param->Ls = 1;
5325  invertQuda(hp_x, hp_b, param);
5326  fflush(stdout);
5327 }
5328 
5329 void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param)
5330 {
5331  // ensure that fifth dimension is set to 1
5332  if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) param->Ls = 1;
5333 
5334  if (!gaugePrecise) errorQuda("Resident gauge field not allocated");
5335 
5336  // get data into array of pointers
5337  int nSpin = (param->dslash_type == QUDA_STAGGERED_DSLASH || param->dslash_type == QUDA_ASQTAD_DSLASH) ? 1 : 4;
5338 
5339  // compute offset assuming TIFR padded ordering (FIXME)
5340  if (param->dirac_order != QUDA_TIFR_PADDED_DIRAC_ORDER)
5341  errorQuda("Fortran multi-shift solver presently only supports QUDA_TIFR_PADDED_DIRAC_ORDER and not %d", param->dirac_order);
5342 
5343  const int *X = gaugePrecise->X();
5344  size_t cb_offset = (X[0]/2) * X[1] * (X[2] + 4) * X[3] * gaugePrecise->Ncolor() * nSpin * 2 * param->cpu_prec;
5345  void *hp_x[QUDA_MAX_MULTI_SHIFT];
5346  for (int i=0; i<param->num_offset; i++) hp_x[i] = static_cast<char*>(h_x) + i*cb_offset;
5347 
5348  invertMultiShiftQuda(hp_x, hp_b, param);
5349 }
5350 
5351 void flush_chrono_quda_(int *index) { flushChronoQuda(*index); }
5352 
// Fortran interface: page-lock (pin) *bytes bytes of host memory at ptr so
// that subsequent host<->device transfers of this buffer can be asynchronous.
void register_pinned_quda_(void *ptr, size_t *bytes) {
  cudaHostRegister(ptr, *bytes, cudaHostRegisterDefault);
  checkCudaError();
}
5357 
// Fortran interface: release the page-lock previously established by
// register_pinned_quda_ on the host buffer at ptr.
void unregister_pinned_quda_(void *ptr) {
  cudaHostUnregister(ptr);
  checkCudaError();
}
5362 
5364  *param = newQudaGaugeParam();
5365 }
5368 }
5369 
5370 void update_gauge_field_quda_(void *gauge, void *momentum, double *dt,
5371  bool *conj_mom, bool *exact,
5372  QudaGaugeParam *param) {
5373  updateGaugeFieldQuda(gauge, momentum, *dt, (int)*conj_mom, (int)*exact, param);
5374 }
5375 
5376 static inline int opp(int dir) { return 7-dir; }
5377 
// Fill `paths` with the link-paths of the gauge-force loops attached to
// direction `dir`, encoded as sequences of direction indices (0-3 forward,
// 4-7 backward via opp()). num_loop_types selects which loop families are
// emitted: >=1 plaquettes (6 paths of length 3), >=2 rectangles (18 paths of
// length 5), >=3 "staples"/chairs (24 paths of length 5). The fill order must
// match the coefficient layout built in compute_gauge_force_quda_.
static void createGaugeForcePaths(int **paths, int dir, int num_loop_types){

  int index=0;
  // Plaquette paths
  if (num_loop_types >= 1)
    for(int i=0; i<4; ++i){
      if(i==dir) continue;
      paths[index][0] = i;        paths[index][1] = opp(dir);   paths[index++][2] = opp(i);
      paths[index][0] = opp(i);   paths[index][1] = opp(dir);   paths[index++][2] = i;
    }

  // Rectangle Paths
  if (num_loop_types >= 2)
    for(int i=0; i<4; ++i){
      if(i==dir) continue;
      paths[index][0] = paths[index][1] = i;       paths[index][2] = opp(dir);   paths[index][3] = paths[index][4] = opp(i);
      index++;
      paths[index][0] = paths[index][1] = opp(i);  paths[index][2] = opp(dir);   paths[index][3] = paths[index][4] = i;
      index++;
      paths[index][0] = dir; paths[index][1] = i;  paths[index][2] = paths[index][3] = opp(dir); paths[index][4] = opp(i);
      index++;
      paths[index][0] = dir; paths[index][1] = opp(i);  paths[index][2] = paths[index][3] = opp(dir); paths[index][4] = i;
      index++;
      paths[index][0] = i;  paths[index][1] = paths[index][2] = opp(dir); paths[index][3] = opp(i); paths[index][4] = dir;
      index++;
      paths[index][0] = opp(i);  paths[index][1] = paths[index][2] = opp(dir); paths[index][3] = i; paths[index][4] = dir;
      index++;
    }

  if (num_loop_types >= 3) {
    // Staple paths
    // chair-like loops built from two distinct transverse directions i, j
    for(int i=0; i<4; ++i){
      for(int j=0; j<4; ++j){
	if(i==dir || j==dir || i==j) continue;
	paths[index][0] = i; paths[index][1] = j; paths[index][2] = opp(dir); paths[index][3] = opp(i), paths[index][4] = opp(j);
	index++;
	paths[index][0] = i; paths[index][1] = opp(j); paths[index][2] = opp(dir); paths[index][3] = opp(i), paths[index][4] = j;
	index++;
	paths[index][0] = opp(i); paths[index][1] = j; paths[index][2] = opp(dir); paths[index][3] = i, paths[index][4] = opp(j);
	index++;
	paths[index][0] = opp(i); paths[index][1] = opp(j); paths[index][2] = opp(dir); paths[index][3] = i, paths[index][4] = j;
	index++;
      }
    }
  }

}
5425 
// Fortran interface to the gauge-force computation. Builds the per-direction
// path tables (via createGaugeForcePaths) and the matching coefficient/length
// arrays, then forwards to computeGaugeForceQuda().
//   num_loop_types: 1 = plaquette only, 2 = +rectangles, 3 = +staples
//   coeff: one coefficient per loop family (so up to 3 entries are read)
void compute_gauge_force_quda_(void *mom, void *gauge, int *num_loop_types, double *coeff, double *dt,
			       QudaGaugeParam *param) {

  // total number of paths per direction for the requested loop families
  int numPaths = 0;
  switch (*num_loop_types) {
  case 1:
    numPaths = 6;
    break;
  case 2:
    numPaths = 24;
    break;
  case 3:
    numPaths = 48;
    break;
  default:
    errorQuda("Invalid num_loop_types = %d\n", *num_loop_types);
  }

  auto *loop_coeff = static_cast<double*>(safe_malloc(numPaths*sizeof(double)));
  int *path_length = static_cast<int*>(safe_malloc(numPaths*sizeof(int)));

  // coefficient/length layout must mirror the fill order in createGaugeForcePaths:
  // [0,6) plaquettes (length 3), [6,24) rectangles, [24,48) staples (length 5)
  if (*num_loop_types >= 1) for(int i= 0; i< 6; ++i) {
    loop_coeff[i] = coeff[0];
    path_length[i] = 3;
  }
  if (*num_loop_types >= 2) for(int i= 6; i<24; ++i) {
    loop_coeff[i] = coeff[1];
    path_length[i] = 5;
  }
  if (*num_loop_types >= 3) for(int i=24; i<48; ++i) {
    loop_coeff[i] = coeff[2];
    path_length[i] = 5;
  }

  // one table of paths per direction
  int** input_path_buf[4];
  for(int dir=0; dir<4; ++dir){
    input_path_buf[dir] = static_cast<int**>(safe_malloc(numPaths*sizeof(int*)));
    for(int i=0; i<numPaths; ++i){
      input_path_buf[dir][i] = static_cast<int*>(safe_malloc(path_length[i]*sizeof(int)));
    }
    createGaugeForcePaths(input_path_buf[dir], dir, *num_loop_types);
  }

  int max_length = 6;

  computeGaugeForceQuda(mom, gauge, input_path_buf, path_length, loop_coeff, numPaths, max_length, *dt, param);

  // release the path tables
  for(auto & dir : input_path_buf){
    for(int i=0; i<numPaths; ++i) host_free(dir[i]);
    host_free(dir);
  }

  host_free(path_length);
  host_free(loop_coeff);
}
5481 
// Fortran interface to computeStaggeredForceQuda(). The quark field x is
// forwarded with a (void**) cast — presumably the Fortran caller passes an
// array of field pointers; TODO(review) confirm against the Fortran binding.
void compute_staggered_force_quda_(void* h_mom, double *dt, double *delta, void *gauge, void *x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param) {
  computeStaggeredForceQuda(h_mom, *dt, *delta, gauge, (void**)x, gauge_param, inv_param);
}
5485 
5486 // apply the staggered phases
5488  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("applying staggered phase\n");
5489  if (gaugePrecise) {
5491  } else {
5492  errorQuda("No persistent gauge field");
5493  }
5494 }
5495 
5496 // remove the staggered phases
5498  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("removing staggered phase\n");
5499  if (gaugePrecise) {
5501  } else {
5502  errorQuda("No persistent gauge field");
5503  }
5505 }
5506 
5507 // evaluate the kinetic term
// Fortran interface: store the momentum (kinetic) action in *kin.
void kinetic_quda_(double *kin, void* momentum, QudaGaugeParam* param) {
  *kin = momActionQuda(momentum, param);
}
5511 
5512 
5516 #ifdef MULTI_GPU
// Map a 4D grid coordinate to an MPI rank using BQCD's lexicographical
// convention: dimension 3 varies slowest, dimension 0 fastest.
// fdata points at the int[4] grid dimensions.
static int bqcd_rank_from_coords(const int *coords, void *fdata)
{
  const int *dims = static_cast<int *>(fdata);

  // Horner-style flattening, folding in one dimension per iteration
  int rank = coords[3];
  for (int i = 3; i > 0; i--) rank = rank * dims[i - 1] + coords[i - 1];
  return rank;
}
5527 #endif
5528 
// Fortran interface: establish the 4D communications grid. The grid array is
// passed both as the dimensions and as the user data for the BQCD rank mapper;
// a no-op in single-GPU builds.
void comm_set_gridsize_(int *grid)
{
#ifdef MULTI_GPU
  initCommsGridQuda(4, grid, bqcd_rank_from_coords, static_cast<void *>(grid));
#endif
}
5535 
5539 void set_kernel_pack_t_(int* pack)
5540 {
5541  bool pack_ = *pack ? true : false;
5542  setKernelPackT(pack_);
5543 }
5544 
// Overwrite the resident gauge field with Gaussian-distributed links
// (gaugeGauss with the given RNG seed and width sigma), then refresh the halo
// of the extended resident field if one exists.
// NOTE(review): extraction dropped at least two lines here (a param setting
// after the GaugeFieldParam copy, and the copy of gaugePrecise into
// extendedGaugeResident before the ghost exchange); as shown, the temporary
// field `u` is created but never used — confirm against upstream.
void gaussGaugeQuda(unsigned long long seed, double sigma)
{
  profileGauss.TPSTART(QUDA_PROFILE_TOTAL);

  if (!gaugePrecise) errorQuda("Cannot generate Gauss GaugeField as there is no resident gauge field");

  cudaGaugeField *data = gaugePrecise;

  GaugeFieldParam param(*data);
  param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
  cudaGaugeField u(param);

  profileGauss.TPSTART(QUDA_PROFILE_COMPUTE);
  quda::gaugeGauss(*data, seed, sigma);
  profileGauss.TPSTOP(QUDA_PROFILE_COMPUTE);

  if (extendedGaugeResident) {
    // keep the extended field's halo consistent with the new links
    extendedGaugeResident->exchangeExtendedGhost(R, profileGauss, redundant_comms);
  }

  profileGauss.TPSTOP(QUDA_PROFILE_TOTAL);
}
5569 
5570 
5571 /*
5572  * Computes the total, spatial and temporal plaquette averages of the loaded gauge configuration.
5573  */
// Fortran interface: forward to plaqQuda() (total/spatial/temporal plaquette).
void plaq_quda_(double plaq[3]) {
  plaqQuda(plaq);
}
5577 
// Compute the plaquette averages of the resident gauge field:
// plaq[0] = total, plaq[1] = spatial, plaq[2] = temporal.
// NOTE(review): extraction dropped the declaration of `data` (presumably the
// extended gauge field created from gaugePrecise) — restore from upstream.
void plaqQuda(double plaq[3])
{
  profilePlaq.TPSTART(QUDA_PROFILE_TOTAL);

  if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field");

  // cache the extended field for reuse by later calls
  extendedGaugeResident = data;

  profilePlaq.TPSTART(QUDA_PROFILE_COMPUTE);
  double3 plaq3 = quda::plaquette(*data);
  plaq[0] = plaq3.x;
  plaq[1] = plaq3.y;
  plaq[2] = plaq3.z;
  profilePlaq.TPSTOP(QUDA_PROFILE_COMPUTE);

  profilePlaq.TPSTOP(QUDA_PROFILE_TOTAL);
}
5596 
5597 /*
5598  * Performs a deep copy from the internal extendedGaugeResident field.
5599  */
// Deep-copy the internal extended resident gauge field into the caller-owned
// field. resident_gauge must actually point at a cudaGaugeField — the void*
// is reinterpreted directly; loc selects the location for the copy kernel.
// NOTE(review): extraction dropped the declaration of `data` (the extended
// field derived from gaugePrecise) — restore from upstream.
void copyExtendedResidentGaugeQuda(void* resident_gauge, QudaFieldLocation loc)
{
  //profilePlaq.TPSTART(QUDA_PROFILE_TOTAL);

  if (!gaugePrecise) errorQuda("Cannot perform deep copy of resident gauge field as there is no resident gauge field");

  extendedGaugeResident = data;

  auto* io_gauge = (cudaGaugeField*)resident_gauge;

  copyExtendedGauge(*io_gauge, *extendedGaugeResident, loc);

  //profilePlaq.TPSTOP(QUDA_PROFILE_TOTAL);
}
5615 
// Apply n_steps of Wuppertal (Gaussian) smearing with parameter alpha to the
// host spinor h_in and write the result to h_out. Uses gaugeSmeared when
// present, otherwise gaugePrecise.
// NOTE(review): extraction dropped several lines (the pushVerbosity call
// matching the popVerbosity below, and the gParam construction / copy used to
// build `precise` from gaugeSmeared) — restore from upstream.
void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int n_steps, double alpha)
{
  profileWuppertal.TPSTART(QUDA_PROFILE_TOTAL);

  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");


  cudaGaugeField *precise = nullptr;

  if (gaugeSmeared != nullptr) {
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Wuppertal smearing done with gaugeSmeared\n");
    precise = new cudaGaugeField(gParam);
    precise->exchangeGhost();
  } else {
    if (getVerbosity() >= QUDA_VERBOSE)
      printfQuda("Wuppertal smearing done with gaugePrecise\n");
    precise = gaugePrecise;
  }

  // wrap the host input and mirror it on the device
  ColorSpinorParam cpuParam(h_in, *inv_param, precise->X(), false, inv_param->input_location);
  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);

  ColorSpinorParam cudaParam(cpuParam, *inv_param);
  cudaColorSpinorField in(*in_h, cudaParam);

  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
    double cpu = blas::norm2(*in_h);
    double gpu = blas::norm2(in);
    printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
  }

  cudaParam.create = QUDA_NULL_FIELD_CREATE;
  cudaColorSpinorField out(in, cudaParam);
  int parity = 0;

  // Computes out(x) = 1/(1+6*alpha)*(in(x) + alpha*\sum_mu (U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)))
  double a = alpha / (1. + 6. * alpha);
  double b = 1. / (1. + 6. * alpha);

  for (unsigned int i = 0; i < n_steps; i++) {
    if (i) in = out; // previous output seeds the next iteration
    ApplyLaplace(out, in, *precise, 3, a, b, in, parity, false, nullptr, profileWuppertal);
    if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
      double norm = blas::norm2(out);
      printfQuda("Step %d, vector norm %e\n", i, norm);
    }
  }

  // copy the smeared field back to the host
  cpuParam.v = h_out;
  cpuParam.location = inv_param->output_location;
  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
  *out_h = out;

  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
    double cpu = blas::norm2(*out_h);
    double gpu = blas::norm2(out);
    printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
  }

  if (gaugeSmeared != nullptr)
    delete precise; // only owned when built from gaugeSmeared

  delete out_h;
  delete in_h;

  popVerbosity();

  profileWuppertal.TPSTOP(QUDA_PROFILE_TOTAL);
}
5690 
// Apply n_steps of APE smearing (parameter alpha) to the resident gauge field,
// storing the result in gaugeSmeared; optionally reports the topological
// charge every meas_interval steps.
// NOTE(review): extraction dropped the gParam and QudaGaugeObservableParam
// declarations and the gaugeObservablesQuda-style measurement calls —
// restore from upstream.
void performAPEnStep(unsigned int n_steps, double alpha, int meas_interval)
{
  profileAPE.TPSTART(QUDA_PROFILE_TOTAL);

  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");

  // (re)create the smeared copy from the resident field
  if (gaugeSmeared != nullptr) delete gaugeSmeared;
  gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileAPE);

  auto *cudaGaugeTemp = new cudaGaugeField(gParam);

  param.compute_qcharge = QUDA_BOOLEAN_TRUE;

  if (getVerbosity() >= QUDA_SUMMARIZE) {
    printfQuda("Q charge at step %03d = %+.16e\n", 0, param.qcharge);
  }

  for (unsigned int i = 0; i < n_steps; i++) {
    profileAPE.TPSTART(QUDA_PROFILE_COMPUTE);
    APEStep(*gaugeSmeared, *cudaGaugeTemp, alpha);
    profileAPE.TPSTOP(QUDA_PROFILE_COMPUTE);
    if ((i + 1) % meas_interval == 0 && getVerbosity() >= QUDA_VERBOSE) {
      printfQuda("Q charge at step %03d = %+.16e\n", i + 1, param.qcharge);
    }
  }

  delete cudaGaugeTemp;
  profileAPE.TPSTOP(QUDA_PROFILE_TOTAL);
}
5724 
// Apply n_steps of stout smearing (parameter rho) to the resident gauge field,
// storing the result in gaugeSmeared; optionally reports the topological
// charge every meas_interval steps.
// NOTE(review): extraction dropped the gParam / observable-param declarations
// and the measurement calls — restore from upstream.
void performSTOUTnStep(unsigned int n_steps, double rho, int meas_interval)
{
  profileSTOUT.TPSTART(QUDA_PROFILE_TOTAL);

  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");

  // (re)create the smeared copy from the resident field
  if (gaugeSmeared != nullptr) delete gaugeSmeared;
  gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileSTOUT);

  auto *cudaGaugeTemp = new cudaGaugeField(gParam);

  param.compute_qcharge = QUDA_BOOLEAN_TRUE;

  if (getVerbosity() >= QUDA_SUMMARIZE) {
    printfQuda("Q charge at step %03d = %+.16e\n", 0, param.qcharge);
  }

  for (unsigned int i = 0; i < n_steps; i++) {
    profileSTOUT.TPSTART(QUDA_PROFILE_COMPUTE);
    STOUTStep(*gaugeSmeared, *cudaGaugeTemp, rho);
    profileSTOUT.TPSTOP(QUDA_PROFILE_COMPUTE);
    if ((i + 1) % meas_interval == 0 && getVerbosity() >= QUDA_VERBOSE) {
      printfQuda("Q charge at step %03d = %+.16e\n", i + 1, param.qcharge);
    }
  }

  delete cudaGaugeTemp;
  profileSTOUT.TPSTOP(QUDA_PROFILE_TOTAL);
}
5758 
// Apply n_steps of over-improved stout smearing (parameters rho, epsilon) to
// the resident gauge field, storing the result in gaugeSmeared; optionally
// reports the topological charge every meas_interval steps.
// NOTE(review): extraction dropped the gParam / observable-param declarations
// and the measurement calls — restore from upstream.
void performOvrImpSTOUTnStep(unsigned int n_steps, double rho, double epsilon, int meas_interval)
{
  profileOvrImpSTOUT.TPSTART(QUDA_PROFILE_TOTAL);

  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");

  // (re)create the smeared copy from the resident field
  if (gaugeSmeared != nullptr) delete gaugeSmeared;
  gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileOvrImpSTOUT);

  auto *cudaGaugeTemp = new cudaGaugeField(gParam);

  param.compute_qcharge = QUDA_BOOLEAN_TRUE;

  if (getVerbosity() >= QUDA_SUMMARIZE) {
    printfQuda("Q charge at step %03d = %+.16e\n", 0, param.qcharge);
  }

  for (unsigned int i = 0; i < n_steps; i++) {
    profileOvrImpSTOUT.TPSTART(QUDA_PROFILE_COMPUTE);
    OvrImpSTOUTStep(*gaugeSmeared, *cudaGaugeTemp, rho, epsilon);
    profileOvrImpSTOUT.TPSTOP(QUDA_PROFILE_COMPUTE);
    if ((i + 1) % meas_interval == 0 && getVerbosity() >= QUDA_VERBOSE) {
      printfQuda("Q charge at step %03d = %+.16e\n", i + 1, param.qcharge);
    }
  }

  delete cudaGaugeTemp;
  profileOvrImpSTOUT.TPSTOP(QUDA_PROFILE_TOTAL);
}
5792 
// Integrate n_steps of Wilson/Symanzik gradient flow (type selected by
// wflow_type) with step size step_size, starting from the resident gauge
// field; prints plaquette, energy and topological charge every meas_interval
// steps. The final flowed field is left in whichever of gaugeSmeared/gaugeAux
// `out` points at (ping-pong buffering).
// NOTE(review): extraction dropped the gParam and observable-param
// declarations — restore from upstream.
void performWFlownStep(unsigned int n_steps, double step_size, int meas_interval, QudaWFlowType wflow_type)
{
  pushOutputPrefix("performWFlownStep: ");
  profileWFlow.TPSTART(QUDA_PROFILE_TOTAL);

  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");

  if (gaugeSmeared != nullptr) delete gaugeSmeared;
  gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileWFlow);

  // auxiliary extended field for the ping-pong update
  GaugeFieldParam gParamEx(*gaugeSmeared);
  auto *gaugeAux = GaugeField::Create(gParamEx);

  gParam.reconstruct = QUDA_RECONSTRUCT_NO; // temporary field is not on manifold so cannot use reconstruct
  auto *gaugeTemp = GaugeField::Create(gParam);

  GaugeField *in = gaugeSmeared;
  GaugeField *out = gaugeAux;

  param.compute_plaquette = QUDA_BOOLEAN_TRUE;
  param.compute_qcharge = QUDA_BOOLEAN_TRUE;

  if (getVerbosity() >= QUDA_SUMMARIZE) {
    // report the observables at flow time zero
    gaugeObservables(*in, param, profileWFlow);
    printfQuda("flow t, plaquette, E_tot, E_spatial, E_temporal, Q charge\n");
    printfQuda("%le %.16e %+.16e %+.16e %+.16e %+.16e\n", 0.0, param.plaquette[0], param.energy[0], param.energy[1],
               param.energy[2], param.qcharge);
  }

  for (unsigned int i = 0; i < n_steps; i++) {
    // Perform W1, W2, and Vt Wilson Flow steps as defined in
    // https://arxiv.org/abs/1006.4518v3
    profileWFlow.TPSTART(QUDA_PROFILE_COMPUTE);
    if (i > 0) std::swap(in, out); // output from prior step becomes input for next step

    WFlowStep(*out, *gaugeTemp, *in, step_size, wflow_type);
    profileWFlow.TPSTOP(QUDA_PROFILE_COMPUTE);

    if ((i + 1) % meas_interval == 0 && getVerbosity() >= QUDA_SUMMARIZE) {
      gaugeObservables(*out, param, profileWFlow);
      printfQuda("%le %.16e %+.16e %+.16e %+.16e %+.16e\n", step_size * (i + 1), param.plaquette[0], param.energy[0],
                 param.energy[1], param.energy[2], param.qcharge);
    }
  }

  delete gaugeTemp;
  delete gaugeAux;
  profileWFlow.TPSTOP(QUDA_PROFILE_TOTAL);
  popOutputPrefix();
}
5845 
// Fix the gauge of the host field `gauge` with the overrelaxation algorithm
// (Landau gauge_dir==4, Coulomb gauge_dir==3 — convention per the underlying
// gaugeFixingOVR; confirm in its header). Single-rank runs fix in place;
// multi-rank runs go via an extended (halo-carrying) copy. The fixed field is
// written back to the host; if make_resident_gauge it also replaces
// gaugePrecise. timeinfo (if non-null) receives H2D/compute/D2H seconds.
// Always returns 0.
// NOTE(review): extraction dropped the gParam adjustments before the device
// allocation; also cpuGauge is never deleted and, on the multi-GPU path,
// cudaInGaugeEx leaks — confirm/fix against upstream.
int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps,
                              const unsigned int verbose_interval, const double relax_boost, const double tolerance,
                              const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param,
                              double *timeinfo)
{
  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_TOTAL);

  checkGaugeParam(param);

  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_INIT);
  GaugeFieldParam gParam(gauge, *param);
  auto *cpuGauge = new cpuGaugeField(gParam);

  // gParam.pad = getFatLinkPadding(param->X);
  auto *cudaInGauge = new cudaGaugeField(gParam);

  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_INIT);
  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_H2D);

  cudaInGauge->loadCPUField(*cpuGauge);
  /* } else { // or use resident fields already present
     if (!gaugePrecise) errorQuda("No resident gauge field allocated");
     cudaInGauge = gaugePrecise;
     gaugePrecise = nullptr;
     } */

  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_H2D);

  if (comm_size() == 1) {
    // perform the update
    GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_COMPUTE);
    gaugeFixingOVR(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval,
                   stopWtheta);
    GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_COMPUTE);
  } else {
    // multi-rank: fix on an extended field so halos stay consistent
    cudaGaugeField *cudaInGaugeEx = createExtendedGauge(*cudaInGauge, R, GaugeFixOVRQuda);

    // perform the update
    GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_COMPUTE);
    gaugeFixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval,
                   stopWtheta);
    GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_COMPUTE);

    //HOW TO COPY BACK TO CPU: cudaInGaugeEx->cpuGauge
    copyExtendedGauge(*cudaInGauge, *cudaInGaugeEx, QUDA_CUDA_FIELD_LOCATION);
  }

  // copy the gauge field back to the host
  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_D2H);
  cudaInGauge->saveCPUField(*cpuGauge);
  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_D2H);

  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_TOTAL);

  if (param->make_resident_gauge) {
    if (gaugePrecise != nullptr) delete gaugePrecise;
    gaugePrecise = cudaInGauge;
  } else {
    delete cudaInGauge;
  }

  if(timeinfo){
    timeinfo[0] = GaugeFixOVRQuda.Last(QUDA_PROFILE_H2D);
    timeinfo[1] = GaugeFixOVRQuda.Last(QUDA_PROFILE_COMPUTE);
    timeinfo[2] = GaugeFixOVRQuda.Last(QUDA_PROFILE_D2H);
  }

  return 0;
}
5920 
// Fix the gauge of a host-resident gauge field using the steepest-descent
// method with FFT acceleration.  Per the interface documentation this path
// supports a single GPU only (note there is no comm_size() branch below,
// unlike the overrelaxation variant).  Returns 0; when timeinfo is non-null,
// the H2D / compute / D2H times (seconds) are written into timeinfo[0..2].
// NOTE(review): this Doxygen listing omits original lines 5935-5938
// (additional gParam setup), so the excerpt below is not byte-complete -
// confirm against the full source file.
5921 int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir, const unsigned int Nsteps, \
5922  const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, \
5923  const unsigned int stopWtheta, QudaGaugeParam* param , double* timeinfo)
5924 {
5925  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_TOTAL);
5926 
 // validate the caller-supplied gauge parameters before touching any fields
5927  checkGaugeParam(param);
5928 
5929  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_INIT);
5930 
 // wrap the caller's host pointer, then mirror it with a device-side field
5931  GaugeFieldParam gParam(gauge, *param);
5932  auto *cpuGauge = new cpuGaugeField(gParam);
5933 
5934  //gParam.pad = getFatLinkPadding(param->X);
5939  auto *cudaInGauge = new cudaGaugeField(gParam);
5940 
5941 
5942  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_INIT);
5943 
5944  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_H2D);
5945 
5946  //if (!param->use_resident_gauge) { // load fields onto the device
 // download the host gauge field onto the device
5947  cudaInGauge->loadCPUField(*cpuGauge);
5948  /*} else { // or use resident fields already present
5949  if (!gaugePrecise) errorQuda("No resident gauge field allocated");
5950  cudaInGauge = gaugePrecise;
5951  gaugePrecise = nullptr;
5952  } */
5953 
5954  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_H2D);
5955 
5956  // perform the update
5957  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_COMPUTE);
5958 
5959  gaugeFixingFFT(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
5960 
5961  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_COMPUTE);
5962 
5963  // copy the gauge field back to the host
5964  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_D2H);
5965  cudaInGauge->saveCPUField(*cpuGauge);
5966  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_D2H);
5967 
5968  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_TOTAL);
5969 
 // either hand the device field over as the resident gauge, or release it
5970  if (param->make_resident_gauge) {
5971  if (gaugePrecise != nullptr) delete gaugePrecise;
5972  gaugePrecise = cudaInGauge;
5973  } else {
5974  delete cudaInGauge;
5975  }
5976 
 // NOTE(review): cpuGauge (the host wrapper) is not deleted in this excerpt -
 // possible leak; confirm against the full source
 // report per-phase timings to the caller if requested
5977  if (timeinfo) {
5978  timeinfo[0] = GaugeFixFFTQuda.Last(QUDA_PROFILE_H2D);
5979  timeinfo[1] = GaugeFixFFTQuda.Last(QUDA_PROFILE_COMPUTE);
5980  timeinfo[2] = GaugeFixFFTQuda.Last(QUDA_PROFILE_D2H);
5981  }
5982 
5983  return 0;
5984 }
5985 
// Host-facing contraction entry point: wraps the two host spinor pointers
// hp_x / hp_y, copies them to device fields, runs the contraction kernel for
// the requested contraction type cType, and copies the result buffer back
// into h_result.  The device result buffer holds
// Volume * Nspin^2 complex numbers at the fields' precision (see data_bytes).
// NOTE(review): this Doxygen listing omits original line 6007 (presumably the
// gamma-basis assignment described by the comment just above it) - confirm
// against the full source file.
5986 void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const QudaContractType cType,
5987  QudaInvertParam *param, const int *X)
5988 {
5989  // DMH: Easiest way to construct ColorSpinorField? Do we require the user
5990  // to declare and fill and invert_param, or can it just be hacked?.
5991 
5992  profileContract.TPSTART(QUDA_PROFILE_TOTAL);
5993  profileContract.TPSTART(QUDA_PROFILE_INIT);
5994  // wrap CPU host side pointers
5995  ColorSpinorParam cpuParam((void *)hp_x, *param, X, false, param->input_location);
5996  ColorSpinorField *h_x = ColorSpinorField::Create(cpuParam);
5997 
 // reuse the same parameter struct for the second host field; only the data
 // pointer differs
5998  cpuParam.v = (void *)hp_y;
5999  ColorSpinorField *h_y = ColorSpinorField::Create(cpuParam);
6000 
6001  // Create device parameter
6002  ColorSpinorParam cudaParam(cpuParam);
6003  cudaParam.location = QUDA_CUDA_FIELD_LOCATION;
6004  cudaParam.create = QUDA_NULL_FIELD_CREATE;
6005  // Quda uses Degrand-Rossi gamma basis for contractions and will
6006  // automatically reorder data if necessary.
 // force a native field order on the device (third argument) while keeping
 // the host precision
6008  cudaParam.setPrecision(cpuParam.Precision(), cpuParam.Precision(), true);
6009 
6010  std::vector<ColorSpinorField *> x, y;
6011  x.push_back(ColorSpinorField::Create(cudaParam));
6012  y.push_back(ColorSpinorField::Create(cudaParam));
6013 
 // result buffer: Volume * Nspin * Nspin complex values (the factor 2) at
 // the field precision (bytes per real component)
6014  size_t data_bytes = x[0]->Volume() * x[0]->Nspin() * x[0]->Nspin() * 2 * x[0]->Precision();
6015  void *d_result = pool_device_malloc(data_bytes);
6016  profileContract.TPSTOP(QUDA_PROFILE_INIT);
6017 
 // copy the wrapped host fields into the device fields
6018  profileContract.TPSTART(QUDA_PROFILE_H2D);
6019  *x[0] = *h_x;
6020  *y[0] = *h_y;
6021  profileContract.TPSTOP(QUDA_PROFILE_H2D);
6022 
 // launch the contraction kernel (device-side overload of contractQuda)
6023  profileContract.TPSTART(QUDA_PROFILE_COMPUTE);
6024  contractQuda(*x[0], *y[0], d_result, cType);
6025  profileContract.TPSTOP(QUDA_PROFILE_COMPUTE);
6026 
 // return the result to the caller's host buffer
6027  profileContract.TPSTART(QUDA_PROFILE_D2H);
6028  qudaMemcpy(h_result, d_result, data_bytes, cudaMemcpyDeviceToHost);
6029  profileContract.TPSTOP(QUDA_PROFILE_D2H);
6030 
 // release the device buffer and all four temporary fields
6031  profileContract.TPSTART(QUDA_PROFILE_FREE);
6032  pool_device_free(d_result);
6033  delete x[0];
6034  delete y[0];
6035  delete h_y;
6036  delete h_x;
6037  profileContract.TPSTOP(QUDA_PROFILE_FREE);
6038 
6039  profileContract.TPSTOP(QUDA_PROFILE_TOTAL);
6040 }
6041 
// NOTE(review): the listing drops original line 6042 - per the index this is
// the signature "void gaugeObservablesQuda(QudaGaugeObservableParam *param)" -
// and original line 6049 (inside the !gaugeSmeared branch), so this excerpt
// is not compilable as shown; confirm against the full source file.
// Computes gauge-field observables (plaquette, energy, topological charge,
// etc., as requested in *param) on the smeared gauge field when one is
// loaded, otherwise on the extended resident gauge field.
6043 {
6044  profileGaugeObs.TPSTART(QUDA_PROFILE_TOTAL);
6045  checkGaugeObservableParam(param);
6046 
 // choose which device field to measure: prefer the smeared field if present
6047  cudaGaugeField *gauge = nullptr;
6048  if (!gaugeSmeared) {
6050  gauge = extendedGaugeResident;
6051  } else {
6052  gauge = gaugeSmeared;
6053  }
6054 
6055  gaugeObservables(*gauge, *param, profileGaugeObs);
6056  profileGaugeObs.TPSTOP(QUDA_PROFILE_TOTAL);
6057 }
void CloseMagma()
void OpenMagma()
Conjugate-Gradient Solver.
Definition: invert_quda.h:639
double * TrLog() const
Definition: clover_field.h:157
double Coeff() const
Definition: clover_field.h:202
static CloverField * Create(const CloverFieldParam &param)
void setRho(double rho)
Bakes in the rho factor into the clover field, (for real diagonal additive Hasenbusch),...
void * V(bool inverse=false)
Definition: clover_field.h:138
double Csw() const
Definition: clover_field.h:197
size_t Bytes() const
Definition: clover_field.h:167
const ColorSpinorField & Odd() const
QudaSiteSubset SiteSubset() const
static ColorSpinorField * Create(const ColorSpinorParam &param)
const ColorSpinorField & Even() const
QudaFieldLocation location
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
virtual void Dslash4(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
Apply the local MdagM operator: equivalent to applying zero Dirichlet boundary condition to MdagM on ...
Definition: dirac_quda.h:243
virtual void prefetch(QudaFieldLocation mem_space, qudaStream_t stream=0) const
If managed memory and prefetch is enabled, prefetch the gauge field and temporary spinors to the CPU ...
Definition: dirac.cpp:305
void setMass(double mass)
Definition: dirac_quda.h:282
void Dagger(QudaDagType dag) const
sets whether operator is daggered or not
Definition: dirac_quda.h:333
static Dirac * create(const DiracParam &param)
Creates a subclass from parameters.
Definition: dirac.cpp:151
void Mdag(ColorSpinorField &out, const ColorSpinorField &in) const
Apply Mdag (daggered operator of M.
Definition: dirac.cpp:92
double shift
Shift term added onto operator (M/M^dag M/M M^dag + shift)
Definition: dirac_quda.h:1967
Complex b_5[QUDA_MAX_DWF_LS]
Definition: dirac_quda.h:29
ColorSpinorField * tmp1
Definition: dirac_quda.h:52
cudaGaugeField * gauge
Definition: dirac_quda.h:41
int commDim[QUDA_MAX_DIM]
Definition: dirac_quda.h:55
QudaDiracType type
Definition: dirac_quda.h:24
cudaCloverField * clover
Definition: dirac_quda.h:45
Complex c_5[QUDA_MAX_DWF_LS]
Definition: dirac_quda.h:30
cudaGaugeField * longGauge
Definition: dirac_quda.h:43
double eofa_shift
Definition: dirac_quda.h:33
cudaGaugeField * fatGauge
Definition: dirac_quda.h:42
QudaPrecision halo_precision
Definition: dirac_quda.h:57
QudaMatPCType matpcType
Definition: dirac_quda.h:39
QudaDagType dagger
Definition: dirac_quda.h:40
static EigenSolver * create(QudaEigParam *eig_param, const DiracMatrix &mat, TimeProfile &profile)
Creates the eigensolver using the parameters given and the matrix.
virtual void prepare(ColorSpinorField *&src, ColorSpinorField *&sol, ColorSpinorField &x, ColorSpinorField &b, const QudaSolutionType) const
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const
Apply M for the dirac op. E.g. the Schur Complement operator.
virtual void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
Apply MdagM operator which may be optimized.
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
apply 'dslash' operator for the DiracOp. This may be e.g. AD
virtual void reconstruct(ColorSpinorField &x, const ColorSpinorField &b, const QudaSolutionType) const
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:294
QudaStaggeredPhase StaggeredPhase() const
Definition: gauge_field.h:295
void removeStaggeredPhase()
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:287
size_t Bytes() const
Definition: gauge_field.h:352
static GaugeField * Create(const GaugeFieldParam &param)
Create the gauge field, with meta data specified in the parameter struct.
void applyStaggeredPhase(QudaStaggeredPhase phase=QUDA_STAGGERED_PHASE_INVALID)
int Ncolor() const
Definition: gauge_field.h:285
virtual void * Gauge_p()
Definition: gauge_field.h:358
uint64_t checksum(bool mini=false) const
double Anisotropy() const
Definition: gauge_field.h:288
bool StaggeredPhaseApplied() const
Definition: gauge_field.h:296
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:286
QudaPrecision Precision() const
const int * X() const
static void freeGhostBuffer(void)
Free statically allocated ghost buffers.
This computes the optimum guess for the system Ax=b in the L2 residual norm. For use in the HMD force...
Definition: invert_quda.h:1303
Multi-Shift Conjugate Gradient Solver.
Definition: invert_quda.h:1258
static Solver * create(SolverParam &param, const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon, const DiracMatrix &matEig, TimeProfile &profile)
Solver factory.
Definition: solver.cpp:42
void Print()
Definition: timer.cpp:7
static void PrintGlobal()
Definition: timer.cpp:84
double Last(QudaProfileType idx)
Definition: timer.h:254
void copy(const CloverField &src, bool inverse=true)
Copy into this CloverField from the generic CloverField src.
void copy(const GaugeField &src)
void exchangeGhost(QudaLinkDirection link_direction=QUDA_LINK_BACKWARDS)
Exchange the ghost and store store in the padded region.
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
void comm_barrier(void)
int comm_size(void)
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data, bool user_set_comm_handle=false, void *user_comm=nullptr)
Initialize the communications, implemented in comm_single.cpp, comm_qmp.cpp, and comm_mpi....
int comm_dim_partitioned(int dim)
int(* QudaCommsMap)(const int *coords, void *fdata)
Definition: comm_quda.h:12
int comm_dim(int dim)
int commDimPartitioned(int dir)
void comm_finalize(void)
int comm_gpuid(void)
QudaWFlowType wflow_type
double kappa
QudaReconstructType link_recon_sloppy
double tol
QudaReconstructType link_recon
QudaVerbosity verbosity
QudaReconstructType link_recon_precondition
double tol_hq
double epsilon
QudaPrecision prec
constexpr quda::CommKey default_comm_key
void push_communicator(const quda::CommKey &split_key)
int V
Definition: host_utils.cpp:37
GaugeCovDev * dirac
Definition: covdev_test.cpp:42
QudaParity parity
Definition: covdev_test.cpp:40
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:34
QudaGaugeParam gauge_param
Definition: covdev_test.cpp:26
QudaInvertParam inv_param
Definition: covdev_test.cpp:27
@ QUDA_SOURCE_NORMALIZATION
Definition: enum_quda.h:234
@ QUDA_TWISTED_MASSPC_DIRAC
Definition: enum_quda.h:312
@ QUDA_GAUGE_LAPLACE_DIRAC
Definition: enum_quda.h:317
@ QUDA_GAUGE_COVDEV_DIRAC
Definition: enum_quda.h:319
@ QUDA_TWISTED_CLOVERPC_DIRAC
Definition: enum_quda.h:314
@ QUDA_MOBIUS_DOMAIN_WALLPC_EOFA_DIRAC
Definition: enum_quda.h:304
@ QUDA_CLOVER_HASENBUSCH_TWIST_DIRAC
Definition: enum_quda.h:295
@ QUDA_TWISTED_MASS_DIRAC
Definition: enum_quda.h:311
@ QUDA_STAGGERED_DIRAC
Definition: enum_quda.h:305
@ QUDA_CLOVER_HASENBUSCH_TWISTPC_DIRAC
Definition: enum_quda.h:296
@ QUDA_DOMAIN_WALL_4D_DIRAC
Definition: enum_quda.h:299
@ QUDA_ASQTAD_DIRAC
Definition: enum_quda.h:308
@ QUDA_STAGGEREDPC_DIRAC
Definition: enum_quda.h:306
@ QUDA_MOBIUS_DOMAIN_WALL_EOFA_DIRAC
Definition: enum_quda.h:303
@ QUDA_ASQTADPC_DIRAC
Definition: enum_quda.h:309
@ QUDA_GAUGE_LAPLACEPC_DIRAC
Definition: enum_quda.h:318
@ QUDA_MOBIUS_DOMAIN_WALLPC_DIRAC
Definition: enum_quda.h:302
@ QUDA_TWISTED_CLOVER_DIRAC
Definition: enum_quda.h:313
@ QUDA_DOMAIN_WALL_4DPC_DIRAC
Definition: enum_quda.h:300
@ QUDA_CLOVER_DIRAC
Definition: enum_quda.h:293
@ QUDA_MOBIUS_DOMAIN_WALL_DIRAC
Definition: enum_quda.h:301
@ QUDA_DOMAIN_WALLPC_DIRAC
Definition: enum_quda.h:298
@ QUDA_DOMAIN_WALL_DIRAC
Definition: enum_quda.h:297
@ QUDA_WILSONPC_DIRAC
Definition: enum_quda.h:292
@ QUDA_CLOVERPC_DIRAC
Definition: enum_quda.h:294
@ QUDA_WILSON_DIRAC
Definition: enum_quda.h:291
enum QudaWFlowType_s QudaWFlowType
enum QudaPrecision_s QudaPrecision
@ QUDA_STAGGERED_PHASE_NO
Definition: enum_quda.h:515
@ QUDA_COVDEV_DSLASH
Definition: enum_quda.h:102
@ QUDA_WILSON_DSLASH
Definition: enum_quda.h:90
@ QUDA_TWISTED_CLOVER_DSLASH
Definition: enum_quda.h:100
@ QUDA_STAGGERED_DSLASH
Definition: enum_quda.h:97
@ QUDA_MOBIUS_DWF_DSLASH
Definition: enum_quda.h:95
@ QUDA_CLOVER_WILSON_DSLASH
Definition: enum_quda.h:91
@ QUDA_TWISTED_MASS_DSLASH
Definition: enum_quda.h:99
@ QUDA_DOMAIN_WALL_DSLASH
Definition: enum_quda.h:93
@ QUDA_ASQTAD_DSLASH
Definition: enum_quda.h:98
@ QUDA_MOBIUS_DWF_EOFA_DSLASH
Definition: enum_quda.h:96
@ QUDA_LAPLACE_DSLASH
Definition: enum_quda.h:101
@ QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH
Definition: enum_quda.h:92
@ QUDA_DOMAIN_WALL_4D_DSLASH
Definition: enum_quda.h:94
@ QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:326
@ QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:325
@ QUDA_KAPPA_NORMALIZATION
Definition: enum_quda.h:226
@ QUDA_ASYMMETRIC_MASS_NORMALIZATION
Definition: enum_quda.h:228
@ QUDA_MASS_NORMALIZATION
Definition: enum_quda.h:227
@ QUDA_DAG_NO
Definition: enum_quda.h:223
@ QUDA_DAG_YES
Definition: enum_quda.h:223
@ QUDA_USE_INIT_GUESS_YES
Definition: enum_quda.h:430
@ QUDA_DEBUG_VERBOSE
Definition: enum_quda.h:268
@ QUDA_SUMMARIZE
Definition: enum_quda.h:266
@ QUDA_VERBOSE
Definition: enum_quda.h:267
@ QUDA_FULL_SITE_SUBSET
Definition: enum_quda.h:333
@ QUDA_PARITY_SITE_SUBSET
Definition: enum_quda.h:332
@ QUDA_BOOLEAN_TRUE
Definition: enum_quda.h:461
@ QUDA_DEGRAND_ROSSI_GAMMA_BASIS
Definition: enum_quda.h:368
@ QUDA_UKQCD_GAMMA_BASIS
Definition: enum_quda.h:369
@ QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:70
@ QUDA_RECONSTRUCT_12
Definition: enum_quda.h:71
@ QUDA_RECONSTRUCT_8
Definition: enum_quda.h:72
@ QUDA_RECONSTRUCT_10
Definition: enum_quda.h:75
@ QUDA_PERIODIC_T
Definition: enum_quda.h:57
enum QudaPCType_s QudaPCType
@ QUDA_EVEN_PARITY
Definition: enum_quda.h:284
@ QUDA_ODD_PARITY
Definition: enum_quda.h:284
@ QUDA_MEMORY_MAPPED
Definition: enum_quda.h:15
@ QUDA_MEMORY_PINNED
Definition: enum_quda.h:14
@ QUDA_MEMORY_DEVICE
Definition: enum_quda.h:13
@ QUDA_TIFR_PADDED_DIRAC_ORDER
Definition: enum_quda.h:250
@ QUDA_CPS_WILSON_DIRAC_ORDER
Definition: enum_quda.h:248
@ QUDA_HEAVY_QUARK_RESIDUAL
Definition: enum_quda.h:195
@ QUDA_SCALAR_GEOMETRY
Definition: enum_quda.h:500
@ QUDA_VECTOR_GEOMETRY
Definition: enum_quda.h:501
@ QUDA_TENSOR_GEOMETRY
Definition: enum_quda.h:502
enum QudaFieldGeometry_s QudaFieldGeometry
@ QUDA_TRANSFER_COARSE_KD
Definition: enum_quda.h:454
@ QUDA_TRANSFER_OPTIMIZED_KD
Definition: enum_quda.h:455
enum QudaFieldLocation_s QudaFieldLocation
@ QUDA_GHOST_EXCHANGE_EXTENDED
Definition: enum_quda.h:510
@ QUDA_GHOST_EXCHANGE_NO
Definition: enum_quda.h:508
@ QUDA_GHOST_EXCHANGE_PAD
Definition: enum_quda.h:509
@ QUDA_MATPC_ODD_ODD_ASYMMETRIC
Definition: enum_quda.h:219
@ QUDA_MATPC_EVEN_EVEN_ASYMMETRIC
Definition: enum_quda.h:218
@ QUDA_INC_EIGCG_INVERTER
Definition: enum_quda.h:117
@ QUDA_PCG_INVERTER
Definition: enum_quda.h:114
@ QUDA_INVALID_INVERTER
Definition: enum_quda.h:133
@ QUDA_EIGCG_INVERTER
Definition: enum_quda.h:116
@ QUDA_MG_INVERTER
Definition: enum_quda.h:122
@ QUDA_EVEN_ODD_SITE_ORDER
Definition: enum_quda.h:340
enum QudaReconstructType_s QudaReconstructType
@ QUDA_MATPC_SOLUTION
Definition: enum_quda.h:159
@ QUDA_MATDAG_MAT_SOLUTION
Definition: enum_quda.h:158
@ QUDA_MATPCDAG_MATPC_SOLUTION
Definition: enum_quda.h:161
@ QUDA_MAT_SOLUTION
Definition: enum_quda.h:157
@ QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:65
@ QUDA_SINGLE_PRECISION
Definition: enum_quda.h:64
@ QUDA_5D_PC
Definition: enum_quda.h:397
@ QUDA_4D_PC
Definition: enum_quda.h:397
@ QUDA_INVALID_SCHWARZ
Definition: enum_quda.h:189
@ QUDA_FLOAT2_GAUGE_ORDER
Definition: enum_quda.h:40
@ QUDA_BQCD_GAUGE_ORDER
Definition: enum_quda.h:49
@ QUDA_TIFR_GAUGE_ORDER
Definition: enum_quda.h:50
@ QUDA_TIFR_PADDED_GAUGE_ORDER
Definition: enum_quda.h:51
@ QUDA_MILC_GAUGE_ORDER
Definition: enum_quda.h:47
enum QudaContractType_s QudaContractType
@ QUDA_SPECTRUM_SI_EIG
Definition: enum_quda.h:152
@ QUDA_SPECTRUM_LI_EIG
Definition: enum_quda.h:151
@ QUDA_FLOAT2_FIELD_ORDER
Definition: enum_quda.h:348
@ QUDA_SPACE_COLOR_SPIN_FIELD_ORDER
Definition: enum_quda.h:352
@ QUDA_SPACE_SPIN_COLOR_FIELD_ORDER
Definition: enum_quda.h:351
enum QudaVerbosity_s QudaVerbosity
@ QUDA_ZERO_FIELD_CREATE
Definition: enum_quda.h:361
@ QUDA_COPY_FIELD_CREATE
Definition: enum_quda.h:362
@ QUDA_REFERENCE_FIELD_CREATE
Definition: enum_quda.h:363
@ QUDA_NULL_FIELD_CREATE
Definition: enum_quda.h:360
@ QUDA_TWIST_SINGLET
Definition: enum_quda.h:400
@ QUDA_TWIST_NONDEG_DOUBLET
Definition: enum_quda.h:401
@ QUDA_DIRECT_SOLVE
Definition: enum_quda.h:167
@ QUDA_NORMERR_SOLVE
Definition: enum_quda.h:171
@ QUDA_NORMERR_PC_SOLVE
Definition: enum_quda.h:172
@ QUDA_NORMOP_PC_SOLVE
Definition: enum_quda.h:170
@ QUDA_DIRECT_PC_SOLVE
Definition: enum_quda.h:169
enum QudaParity_s QudaParity
@ QUDA_SU3_LINKS
Definition: enum_quda.h:24
@ QUDA_ASQTAD_MOM_LINKS
Definition: enum_quda.h:33
@ QUDA_ASQTAD_LONG_LINKS
Definition: enum_quda.h:32
@ QUDA_GENERAL_LINKS
Definition: enum_quda.h:25
@ QUDA_WILSON_LINKS
Definition: enum_quda.h:30
@ QUDA_SMEARED_LINKS
Definition: enum_quda.h:29
@ QUDA_ASQTAD_FAT_LINKS
Definition: enum_quda.h:31
cudaGaugeField * cudaMom
cpuGaugeField * cpuMom
GaugeFieldParam gParam
cpuGaugeField * cpuGauge
cudaGaugeField * cudaForce
cudaGaugeField * cudaGauge
cudaGaugeField * cudaFatLink
cpuGaugeField * cpuFatLink
double kappa5
Definition: host_utils.cpp:51
void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param)
void initQudaMemory()
void computeHISQForceQuda(void *const milc_momentum, double dt, const double level2_coeff[6], const double fat7_coeff[6], const void *const w_link, const void *const v_link, const void *const u_link, void **fermion, int num_terms, int num_naik_terms, double **coeff, QudaGaugeParam *gParam)
double momActionQuda(void *momentum, QudaGaugeParam *param)
void compute_gauge_force_quda_(void *mom, void *gauge, int *num_loop_types, double *coeff, double *dt, QudaGaugeParam *param)
Compute the gauge force and update the momentum field.
void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, QudaGaugeParam *gauge_param)
Perform the solve like @invertQuda but for multiple rhs by splitting the comm grid into sub-partitions...
void gaussGaugeQuda(unsigned long long seed, double sigma)
Generate Gaussian distributed fields and store in the resident gauge field. We create a Gaussian-dist...
void initQuda(int dev)
void update_gauge_field_quda_(void *gauge, void *momentum, double *dt, bool *conj_mom, bool *exact, QudaGaugeParam *param)
quda::cudaGaugeField * checkGauge(QudaInvertParam *param)
std::vector< cudaColorSpinorField * > solutionResident
#define STR(x)
void * createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param)
cudaGaugeField * gaugeFatPrecise
void new_quda_gauge_param_(QudaGaugeParam *param)
void destroyGaugeFieldQuda(void *gauge)
cudaGaugeField * momResident
void set_kernel_pack_t_(int *pack)
Temporary function exposed for TIFR benchmarking
void new_quda_invert_param_(QudaInvertParam *param)
void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param)
void * newDeflationQuda(QudaEigParam *eig_param)
void end_quda_()
void apply_staggered_phase_quda_()
Apply the staggered phase factors to the resident gauge field.
void checkClover(QudaInvertParam *param)
std::vector< std::vector< ColorSpinorField * > > chronoResident(QUDA_MAX_CHRONO)
void free_sloppy_gauge_quda_()
void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
void momResidentQuda(void *mom, QudaGaugeParam *param)
cudaGaugeField * gaugeLongPrecise
cudaGaugeField * gaugeSloppy
void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
void init_quda_memory_()
void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam)
cudaGaugeField * gaugeLongPrecondition
cudaGaugeField * extendedGaugeResident
cudaCloverField * cloverPrecondition
void setMPICommHandleQuda(void *mycomm)
cudaGaugeField * gaugeRefinement
void plaqQuda(double plaq[3])
void performAPEnStep(unsigned int n_steps, double alpha, int meas_interval)
void invertMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv)
Really the same with @invertMultiSrcQuda but for clover-style fermions, by accepting pointers to dire...
void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param)
cudaGaugeField * gaugeFatExtended
#define QUDA_MAX_CHRONO
void destroyDeflationQuda(void *df)
void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
void freeSloppyCloverQuda()
cudaGaugeField * gaugeSmeared
void loadSloppyCloverQuda(const QudaPrecision prec[])
void init_quda_(int *dev)
void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param)
void freeSloppyGaugeQuda()
void freeGaugeQuda(void)
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
cudaCloverField * cloverSloppy
void * newMultigridQuda(QudaMultigridParam *mg_param)
void openMagma()
TimeProfile & getProfileBLAS()
Profiler for covariant derivative.
void invertMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param)
Really the same with @invertMultiSrcQuda but for staggered-style fermions, by accepting pointers to f...
cudaGaugeField * gaugeLongRefinement
void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv)
Really the same with @dslashMultiSrcQuda but for clover-style fermions, by accepting pointers to dire...
int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
void dslashMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, QudaGaugeParam *gauge_param)
Perform the solve like @dslashQuda but for multiple rhs by splitting the comm grid into sub-partitions...
cudaGaugeField * gaugeFatRefinement
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void destroyMultigridQuda(void *mg)
Free resources allocated by the multigrid solver.
void initQudaDevice(int dev)
void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
cudaGaugeField * gaugeLongExtended
cudaGaugeField * gaugeLongSloppy
int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with overrelaxation with support for single and multi GPU.
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void free_clover_quda_(void)
void comm_set_gridsize_(int *grid)
void freeCloverQuda(void)
void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *recon)
cudaGaugeField * gaugeEigensolver
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
void performWFlownStep(unsigned int n_steps, double step_size, int meas_interval, QudaWFlowType wflow_type)
void register_pinned_quda_(void *ptr, size_t *bytes)
Pin a pre-existing memory allocation.
void kinetic_quda_(double *kin, void *momentum, QudaGaugeParam *param)
Evaluate the kinetic (momentum) contribution to classical Hamiltonian for Hybrid Monte Carlo.
cudaGaugeField * gaugePrecise
cudaGaugeField * gaugeFatSloppy
void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity)
void dslashMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param)
Really the same with @dslashMultiSrcQuda but for staggered-style fermions, by accepting pointers to f...
char * gitversion
Definition: version.cpp:4
cudaCloverField * cloverPrecise
cudaGaugeField * gaugeFatPrecondition
void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, void **x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void flushChronoQuda(int i)
Flush the chronological history for the given index.
void endQuda(void)
void init_quda_device_(int *dev)
void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv, Interface op, Args... args)
void copyExtendedResidentGaugeQuda(void *resident_gauge, QudaFieldLocation loc)
void loadFatLongGaugeQuda(QudaInvertParam *inv_param, QudaGaugeParam *gauge_param, void *milc_fatlinks, void *milc_longlinks)
void free_gauge_quda_()
void gaugeObservablesQuda(QudaGaugeObservableParam *param)
Calculates a variety of gauge-field observables. If a smeared gauge field is presently loaded (in gau...
void remove_staggered_phase_quda_()
Remove the staggered phase factors to the resident gauge field.
cudaGaugeField * gaugePrecondition
void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
Dump the null-space vectors to disk.
void unregister_pinned_quda_(void *ptr)
Unpin a pre-existing memory allocation.
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
cudaCloverField * cloverEigensolver
void performSTOUTnStep(unsigned int n_steps, double rho, int meas_interval)
int getGaugePadding(GaugeFieldParam &param)
void createCloverQuda(QudaInvertParam *invertParam)
void updateR()
update the radius for halos.
void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **h_p, double *coeff, double kappa2, double ck, int nvector, double multiplicity, void *gauge, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void performOvrImpSTOUTnStep(unsigned int n_steps, double rho, double epsilon, int meas_interval)
void flush_chrono_quda_(int *index)
Flush the chronological history for the given index.
cudaCloverField * cloverRefinement
cudaGaugeField * gaugeLongEigensolver
void checkBLASParam(QudaBLASParam &param)
void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity, int *inverse)
void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
cudaGaugeField * gaugeExtended
void plaq_quda_(double plaq[3])
#define MAX(a, b)
void compute_staggered_force_quda_(void *h_mom, double *dt, double *delta, void *gauge, void *x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int n_steps, double alpha)
void closeMagma()
cudaGaugeField * gaugeFatEigensolver
void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
Updates the multigrid preconditioner for the new gauge / clover field.
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
#define pool_device_malloc(size)
Definition: malloc_quda.h:170
#define safe_malloc(size)
Definition: malloc_quda.h:106
#define pool_device_free(ptr)
Definition: malloc_quda.h:171
#define get_mapped_device_pointer(ptr)
Definition: malloc_quda.h:116
#define host_free(ptr)
Definition: malloc_quda.h:115
#define mapped_malloc(size)
Definition: malloc_quda.h:108
void destroy()
Destroy the BLAS context.
void init()
Create the BLAS context.
void destroy()
Destroy the BLAS context.
void set_native(bool native)
void destroy()
void init()
unsigned long long bytes
void ax(double a, ColorSpinorField &x)
void zero(ColorSpinorField &a)
double norm2(const ColorSpinorField &a)
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: blas_quda.h:24
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
void stop()
Stop profiling.
Definition: device.cpp:228
void start()
Start profiling.
Definition: device.cpp:226
void create_context()
Create the streams associated with parallel execution.
Definition: device.cpp:185
void init(int dev)
Create the device context. Called by initQuda when initializing the library.
Definition: device.cpp:25
void destroy()
Free any persistent context state. Called by endQuda when tearing down the library.
Definition: device.cpp:200
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
Set the constant parameters for the force unitarization.
void hisqCompleteForce(GaugeField &oprod, const GaugeField &link)
Multiply the computed the force matrix by the gauge field and perform traceless anti-hermitian projec...
void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, double coeff)
Compute the long-link contribution to the fermion force.
void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff[6])
Compute the fat-link contribution to the fermion force.
void unitarizeForce(GaugeField &newForce, const GaugeField &oldForce, const GaugeField &gauge, int *unitarization_failed)
Unitarize the fermion force.
void init()
Initialize the memory pool allocator.
Definition: malloc.cpp:632
void flush_pinned()
Free all outstanding pinned-memory allocations.
Definition: malloc.cpp:753
void flush_device()
Free all outstanding device-memory allocations.
Definition: malloc.cpp:761
void applyU(GaugeField &force, GaugeField &U)
void APEStep(GaugeField &dataDs, GaugeField &dataOr, double alpha)
Apply APE smearing to the gauge field.
bool canReuseResidentGauge(QudaInvertParam *inv_param)
void createDslashEvents()
void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
void setKernelPackT(bool pack)
void saveTuneCache(bool error=false)
Definition: tune.cpp:439
void gaugeObservables(GaugeField &u, QudaGaugeObservableParam &param, TimeProfile &profile)
Calculates a variety of gauge-field observables.
void arpack_solve(std::vector< ColorSpinorField * > &h_evecs, std::vector< Complex > &h_evals, const DiracMatrix &mat, QudaEigParam *eig_param, TimeProfile &profile)
The QUDA interface function. One passes two allocated arrays to hold the eigenmode data,...
void loadTuneCache()
Definition: tune.cpp:337
void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
double computeMomAction(const GaugeField &mom)
Compute and return global the momentum action 1/2 mom^2.
constexpr int product(const CommKey &input)
Definition: comm_key.h:28
void massRescale(cudaColorSpinorField &b, QudaInvertParam &param, bool for_multishift)
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
double3 plaquette(const GaugeField &U)
Compute the plaquette of the gauge field.
void join_field(std::vector< Field * > &v_base_field, const Field &collect_field, const CommKey &comm_key, QudaPCType pc_type=QUDA_4D_PC)
Definition: split_grid.h:121
bool forceMonitor()
Whether we are monitoring the force or not.
void computeCloverSigmaTrace(GaugeField &output, const CloverField &clover, double coeff)
Compute the matrix tensor field necessary for the force calculation from the clover trace action....
void split_field(Field &collect_field, std::vector< Field * > &v_base_field, const CommKey &comm_key, QudaPCType pc_type=QUDA_4D_PC)
Definition: split_grid.h:17
void longKSLink(GaugeField *lng, const GaugeField &u, const double *coeff)
Compute the long links for improved staggered (Kogut-Susskind) fermions.
void createDiracWithRefine(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam &param, const bool pc_solve)
void fatKSLink(GaugeField *fat, const GaugeField &u, const double *coeff)
Compute the fat links for improved staggered (Kogut-Susskind) fermions.
void destroyDslashEvents()
void printPeakMemUsage()
Definition: malloc.cpp:539
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
Definition: quda_matrix.h:605
void printAPIProfile()
Print out the timer profile for CUDA API calls.
Definition: quda_api.cpp:495
void OvrImpSTOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, double epsilon)
Apply Over Improved STOUT smearing to the gauge field.
std::complex< double > Complex
Definition: quda_internal.h:86
void gaugeFixingFFT(GaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha, const int autotune, const double tolerance, const int stopWtheta)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
void WFlowStep(GaugeField &out, GaugeField &temp, GaugeField &in, double epsilon, QudaWFlowType wflow_type)
Apply Wilson Flow steps W1, W2, Vt to the gauge field. This routine assumes that the input and output...
void flushForceMonitor()
Flush any outstanding force monitoring information.
void computeCloverSigmaOprod(GaugeField &oprod, std::vector< ColorSpinorField * > &x, std::vector< ColorSpinorField * > &p, std::vector< std::vector< double > > &coeff)
Compute the outer product from the solver solution fields arising from the diagonal term of the fermi...
void updateGaugeField(GaugeField &out, double dt, const GaugeField &in, const GaugeField &mom, bool conj_mom, bool exact)
void ApplyLaplace(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, double a, double b, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override, TimeProfile &profile)
Driver for applying the Laplace stencil.
@ QUDA_PROFILE_INIT
Definition: timer.h:106
@ QUDA_PROFILE_EPILOGUE
Definition: timer.h:110
@ QUDA_PROFILE_COMPUTE
Definition: timer.h:108
@ QUDA_PROFILE_TOTAL
Definition: timer.h:149
@ QUDA_PROFILE_FREE
Definition: timer.h:111
@ QUDA_PROFILE_PREAMBLE
Definition: timer.h:107
@ QUDA_PROFILE_CHRONO
Definition: timer.h:113
@ QUDA_PROFILE_H2D
Definition: timer.h:104
@ QUDA_PROFILE_D2H
Definition: timer.h:105
cudaGaugeField * createExtendedGauge(cudaGaugeField &in, const int *R, TimeProfile &profile, bool redundant_comms=false, QudaReconstructType recon=QUDA_RECONSTRUCT_INVALID)
void computeClover(CloverField &clover, const GaugeField &fmunu, double coeff)
Driver for computing the clover field from the field strength tensor.
void gaugeGauss(GaugeField &U, RNG &rngstate, double epsilon)
Generate Gaussian distributed su(N) or SU(N) fields. If U is a momentum field, then we generate rando...
void gaugeFixingOVR(GaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta)
Gauge fixing with overrelaxation with support for single and multi GPU.
void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff, QudaParity parity)
Compute the derivative of the clover matrix in the direction mu,nu and compute the resulting force gi...
void cloverInvert(CloverField &clover, bool computeTraceLog)
This function compute the Cholesky decomposition of each clover matrix and stores the clover inverse ...
void setDiracEigParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
void unitarizeLinks(GaugeField &outfield, const GaugeField &infield, int *fails)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:111
void reorder_location_set(QudaFieldLocation reorder_location_)
Set whether data is reordered on the CPU or GPU. This can be set at QUDA initialization using the environ...
void flushProfile()
Flush profile contents, setting all counts to zero.
Definition: tune.cpp:522
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
void computeStaggeredOprod(GaugeField *out[], ColorSpinorField &in, const double coeff[], int nFace)
Compute the outer-product field between the staggered quark field's one and (for HISQ and ASQTAD) thr...
void gamma5(ColorSpinorField &out, const ColorSpinorField &in)
Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
void printLaunchTimer()
Definition: tune.cpp:880
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
__host__ __device__ std::enable_if<!isFixed< T1 >::value &&!isFixed< T2 >::value, void >::type copy(T1 &a, const T2 &b)
Copy function which is trivial between floating point types. When converting to an integer type,...
Definition: convert.h:64
void computeFmunu(GaugeField &Fmunu, const GaugeField &gauge)
Compute the Fmunu tensor.
void assertAllMemFree()
Definition: malloc.cpp:549
constexpr bool dynamic_clover_inverse()
Helper function that returns whether we have enabled dynamic clover inversion or not.
Definition: clover_field.h:518
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname)
void STOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho)
Apply STOUT smearing to the gauge field.
void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam &param, const bool pc_solve)
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
void projectSU3(GaugeField &U, double tol, int *fails)
Project the input gauge field onto the SU(3) group. This is a destructive operation....
void gaugeForce(GaugeField &mom, const GaugeField &u, double coeff, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
Compute the gauge-force contribution to the momentum.
void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
void saveProfile(const std::string label="")
Save profile to disk.
Definition: tune.cpp:532
void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector< ColorSpinorField * > &x, std::vector< ColorSpinorField * > &p, std::vector< double > &coeff)
Compute the force contribution from the solver solution fields.
void contractQuda(const ColorSpinorField &x, const ColorSpinorField &y, void *result, QudaContractType cType)
::std::string string
Definition: gtest-port.h:891
ColorSpinorParam csParam
Definition: pack_test.cpp:25
QudaGaugeParam param
Definition: pack_test.cpp:18
Main header file for the QUDA library.
void printQudaMultigridParam(QudaMultigridParam *param)
Definition: check_params.h:689
void printQudaInvertParam(QudaInvertParam *param)
Definition: check_params.h:342
QudaGaugeParam newQudaGaugeParam(void)
void printQudaEigParam(QudaEigParam *param)
Definition: check_params.h:158
QudaGaugeObservableParam newQudaGaugeObservableParam(void)
QudaInvertParam newQudaInvertParam(void)
void printQudaGaugeParam(QudaGaugeParam *param)
Definition: check_params.h:40
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_api.h:204
#define qudaMemset(ptr, value, count)
Definition: quda_api.h:218
#define qudaDeviceSynchronize()
Definition: quda_api.h:250
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5.
#define QUDA_VERSION_SUBMINOR
Definition: quda_constants.h:3
#define QUDA_VERSION_MAJOR
Definition: quda_constants.h:1
#define QUDA_VERSION_MINOR
Definition: quda_constants.h:2
#define QUDA_MAX_MULTI_SHIFT
Maximum number of shifts supported by the multi-shift solver. This number may be changed if need be.
Fortran interface functions.
int dims[QUDA_MAX_DIM]
QudaEigSpectrumType spectrum
Definition: quda.h:466
QudaBoolean use_dagger
Definition: quda.h:449
QudaBoolean arpack_check
Definition: quda.h:492
double secs
Definition: quda.h:539
QudaBoolean use_norm_op
Definition: quda.h:450
QudaPrecision cuda_prec_ritz
Definition: quda.h:510
QudaBoolean compute_gamma5
Definition: quda.h:460
QudaFieldLocation location
Definition: quda.h:516
double gflops
Definition: quda.h:536
QudaInvertParam * invert_param
Definition: quda.h:413
QudaMemoryType mem_type_ritz
Definition: quda.h:513
int n_conv
Definition: quda.h:475
QudaReconstructType reconstruct_precondition
Definition: quda.h:58
size_t mom_offset
Definition: quda.h:90
QudaReconstructType reconstruct
Definition: quda.h:49
QudaPrecision cuda_prec_precondition
Definition: quda.h:57
int ga_pad
Definition: quda.h:65
QudaLinkType type
Definition: quda.h:41
int make_resident_mom
Definition: quda.h:85
int return_result_mom
Definition: quda.h:87
size_t gauge_offset
Definition: quda.h:89
int use_resident_gauge
Definition: quda.h:82
QudaPrecision cuda_prec_refinement_sloppy
Definition: quda.h:54
QudaFieldLocation location
Definition: quda.h:33
QudaPrecision cuda_prec_sloppy
Definition: quda.h:51
QudaReconstructType reconstruct_sloppy
Definition: quda.h:52
QudaGaugeFieldOrder gauge_order
Definition: quda.h:42
int make_resident_gauge
Definition: quda.h:84
QudaPrecision cuda_prec
Definition: quda.h:48
size_t site_size
Definition: quda.h:91
QudaReconstructType reconstruct_eigensolver
Definition: quda.h:61
QudaStaggeredPhase staggered_phase_type
Definition: quda.h:73
int X[4]
Definition: quda.h:35
QudaPrecision cpu_prec
Definition: quda.h:46
int use_resident_mom
Definition: quda.h:83
int overlap
Definition: quda.h:78
QudaReconstructType reconstruct_refinement_sloppy
Definition: quda.h:55
int staggered_phase_applied
Definition: quda.h:74
int return_result_gauge
Definition: quda.h:86
int overwrite_mom
Definition: quda.h:80
QudaPrecision cuda_prec_eigensolver
Definition: quda.h:60
int compute_clover
Definition: quda.h:266
QudaSolveType solve_type
Definition: quda.h:229
double gflops
Definition: quda.h:277
QudaPrecision cuda_prec_refinement_sloppy
Definition: quda.h:240
QudaSolutionType solution_type
Definition: quda.h:228
QudaCloverFieldOrder clover_order
Definition: quda.h:256
QudaMassNormalization mass_normalization
Definition: quda.h:232
QudaFieldLocation clover_location
Definition: quda.h:248
double mq2
Definition: quda.h:128
QudaPrecision clover_cuda_prec_refinement_sloppy
Definition: quda.h:252
int num_offset
Definition: quda.h:186
QudaPrecision cuda_prec_eigensolver
Definition: quda.h:242
double mass
Definition: quda.h:109
QudaPrecision clover_cuda_prec
Definition: quda.h:250
double m5
Definition: quda.h:112
int return_clover
Definition: quda.h:268
int compute_clover_inverse
Definition: quda.h:267
QudaTwistFlavorType twist_flavor
Definition: quda.h:134
QudaPrecision clover_cpu_prec
Definition: quda.h:249
QudaDslashType dslash_type
Definition: quda.h:106
int return_clover_inverse
Definition: quda.h:269
double mq1
Definition: quda.h:127
int compute_clover_trlog
Definition: quda.h:263
QudaPrecision clover_cuda_prec_precondition
Definition: quda.h:253
QudaPrecision cuda_prec
Definition: quda.h:238
double mu
Definition: quda.h:131
QudaVerbosity verbosity
Definition: quda.h:271
double_complex b_5[QUDA_MAX_DWF_LS]
Definition: quda.h:115
double clover_rho
Definition: quda.h:261
QudaDslashType dslash_type_precondition
Definition: quda.h:312
double clover_coeff
Definition: quda.h:260
double trlogA[2]
Definition: quda.h:264
double eofa_shift
Definition: quda.h:125
double secs
Definition: quda.h:278
QudaPrecision clover_cuda_prec_eigensolver
Definition: quda.h:254
QudaDagType dagger
Definition: quda.h:231
QudaInverterType inv_type
Definition: quda.h:107
double epsilon
Definition: quda.h:132
double clover_csw
Definition: quda.h:259
double residue[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:218
QudaMatPCType matpc_type
Definition: quda.h:230
double_complex c_5[QUDA_MAX_DWF_LS]
Definition: quda.h:116
QudaPrecision clover_cuda_prec_sloppy
Definition: quda.h:251
double mq3
Definition: quda.h:129
QudaPrecision cuda_prec_sloppy
Definition: quda.h:239
QudaFieldLocation input_location
Definition: quda.h:103
QudaFieldLocation output_location
Definition: quda.h:104
QudaPrecision cuda_prec_precondition
Definition: quda.h:241
int use_resident_solution
Definition: quda.h:378
QudaDiracFieldOrder dirac_order
Definition: quda.h:244
double kappa
Definition: quda.h:110
QudaBoolean thin_update_only
Definition: quda.h:733
QudaPrecision precision_null[QUDA_MAX_MG_LEVEL]
Definition: quda.h:568
int n_vec[QUDA_MAX_MG_LEVEL]
Definition: quda.h:565
QudaTransferType transfer_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:727
QudaFieldLocation setup_location[QUDA_MAX_MG_LEVEL]
Definition: quda.h:674
QudaSolveType smoother_solve_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:662
QudaBoolean setup_minimize_memory
Definition: quda.h:682
QudaSchwarzType smoother_schwarz_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:652
QudaInvertParam * invert_param
Definition: quda.h:551
QudaPrecision smoother_halo_precision[QUDA_MAX_MG_LEVEL]
Definition: quda.h:649
bool twisted
Overall clover coefficient.
Definition: clover_field.h:44
double coeff
C_sw clover coefficient.
Definition: clover_field.h:43
QudaCloverFieldOrder order
Definition: clover_field.h:48
QudaFieldLocation location
Definition: clover_field.h:51
QudaFieldCreate create
Definition: clover_field.h:49
void setPrecision(QudaPrecision precision, bool force_native=false)
Helper function for setting the precision and corresponding field order for QUDA internal fields.
Definition: clover_field.h:59
static constexpr int n_dim
Definition: comm_key.h:8
constexpr bool is_valid() const
Definition: comm_key.h:22
QudaReconstructType reconstruct
Definition: gauge_field.h:50
QudaGaugeFieldOrder order
Definition: gauge_field.h:51
QudaFieldGeometry geometry
Definition: gauge_field.h:62
void setPrecision(QudaPrecision precision, bool force_native=false)
Helper function for setting the precision and corresponding field order for QUDA internal fields.
Definition: gauge_field.h:173
QudaLinkType link_type
Definition: gauge_field.h:53
QudaFieldCreate create
Definition: gauge_field.h:60
QudaTboundary t_boundary
Definition: gauge_field.h:54
QudaMemoryType mem_type
Definition: lattice_field.h:74
QudaGhostExchange ghostExchange
Definition: lattice_field.h:77
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:68
QudaSiteSubset siteSubset
Definition: lattice_field.h:72
QudaPrecision Precision() const
Definition: lattice_field.h:59
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:178
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:184
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:58
void updateInvertParam(QudaInvertParam &param, int offset=-1)
Definition: invert_quda.h:428
TimeProfile & profile
Definition: deflation.h:190
Deflation * defl
Definition: deflation.h:189
deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
ColorSpinorField * RV
Definition: deflation.h:185
DiracMatrix * m
Definition: deflation.h:183
DeflationParam * deflParam
Definition: deflation.h:187
TimeProfile & profile
Definition: multigrid.h:517
multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile)
std::vector< ColorSpinorField * > B
Definition: multigrid.h:512
DEVICEHOST void swap(Real &a, Real &b)
Definition: svd_quda.h:134
void pushVerbosity(QudaVerbosity verbosity)
Push a new verbosity onto the stack.
Definition: util_quda.cpp:83
void popOutputPrefix()
Pop the output prefix restoring the prior one on the stack.
Definition: util_quda.cpp:121
#define printfQuda(...)
Definition: util_quda.h:114
#define checkCudaError()
Definition: util_quda.h:158
void popVerbosity()
Pop the verbosity restoring the prior one on the stack.
Definition: util_quda.cpp:94
void pushOutputPrefix(const char *prefix)
Push a new output prefix onto the stack.
Definition: util_quda.cpp:105
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define warningQuda(...)
Definition: util_quda.h:132
void setVerbosity(QudaVerbosity verbosity)
Definition: util_quda.cpp:25
void setOutputPrefix(const char *prefix)
Definition: util_quda.cpp:69
#define errorQuda(...)
Definition: util_quda.h:120
void setOutputFile(FILE *outfile)
Definition: util_quda.cpp:75