QUDA  0.9.0
interface_quda.cpp
Go to the documentation of this file.
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <math.h>
5 #include <string.h>
6 #include <sys/time.h>
7 
8 #include <quda.h>
9 #include <quda_fortran.h>
10 #include <quda_internal.h>
11 #include <comm_quda.h>
12 #include <tune_quda.h>
13 #include <blas_quda.h>
14 #include <gauge_field.h>
15 #include <dirac_quda.h>
16 #include <ritz_quda.h>
17 #include <dslash_quda.h>
18 #include <invert_quda.h>
19 #include <lanczos_quda.h>
20 #include <color_spinor_field.h>
21 #include <eig_variables.h>
22 #include <clover_field.h>
23 #include <llfat_quda.h>
24 #include <unitarization_links.h>
25 #include <algorithm>
26 #include <staggered_oprod.h>
27 #include <ks_improved_force.h>
28 #include <ks_force_quda.h>
29 #include <random_quda.h>
30 
31 #include <multigrid.h>
32 
33 #include <deflation.h>
34 
35 #ifdef NUMA_NVML
36 #include <numa_affinity.h>
37 #endif
38 
39 #include <cuda.h>
40 
41 #ifdef MULTI_GPU
42 extern void exchange_cpu_sitelink_ex(int* X, int *R, void** sitelink, QudaGaugeFieldOrder cpu_order,
43  QudaPrecision gPrecision, int optflag, int geom);
44 #endif // MULTI_GPU
45 
46 #include <ks_force_quda.h>
47 
48 #ifdef GPU_GAUGE_FORCE
49 #include <gauge_force_quda.h>
50 #endif
51 #include <gauge_update_quda.h>
52 
53 #define MAX(a,b) ((a)>(b)? (a):(b))
54 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
55 
56 #define spinorSiteSize 24 // real numbers per spinor
57 
58 #define MAX_GPU_NUM_PER_NODE 16
59 
60 // define newQudaGaugeParam() and newQudaInvertParam()
61 #define INIT_PARAM
62 #include "check_params.h"
63 #undef INIT_PARAM
64 
65 // define (static) checkGaugeParam() and checkInvertParam()
66 #define CHECK_PARAM
67 #include "check_params.h"
68 #undef CHECK_PARAM
69 
70 // define printQudaGaugeParam() and printQudaInvertParam()
71 #define PRINT_PARAM
72 #include "check_params.h"
73 #undef PRINT_PARAM
74 
75 #include <gauge_tools.h>
76 #include <contractQuda.h>
77 
78 #include <momentum.h>
79 
80 
81 using namespace quda;
82 
// Halo (border) widths for extended gauge fields; recomputed from the
// process-grid partitioning via updateR() below.
83 static int R[4] = {0, 0, 0, 0};
84 // setting this to false prevents redundant halo exchange but isn't yet compatible with HISQ / ASQTAD kernels
85 static bool redundant_comms = false;
86 
87 //for MAGMA lib:
88 #include <blas_magma.h>
89 
// Tracks whether OpenMagma() has been called (see openMagma()/closeMagma() below).
90 static bool InitMagma = false;
91 
92 void openMagma() {
93 
94  if (!InitMagma) {
95  OpenMagma();
96  InitMagma = true;
97  } else {
98  printfQuda("\nMAGMA library was already initialized..\n");
99  }
100 
101 }
102 
103 void closeMagma(){
104 
105  if (InitMagma) {
106  CloseMagma();
107  InitMagma = false;
108  } else {
109  printfQuda("\nMAGMA library was not initialized..\n");
110  }
111 
112  return;
113 }
114 
119 
120 // It's important that these alias the above so that constants are set correctly in Dirac::Dirac()
125 
126 
131 
133 
137 
140 
// Solver solutions kept resident on the device between interface calls
// (presumably reused by invertQuda/HMC code paths -- the consumers are
// outside this listing, confirm against them).
141 std::vector<cudaColorSpinorField*> solutionResident;
142 
143 // vector of spinors used for forecasting solutions in HMC
144 #define QUDA_MAX_CHRONO 2
145 // each entry is a pair for both p and Ap storage
146 std::vector< std::vector< std::pair<ColorSpinorField*,ColorSpinorField*> > > chronoResident(QUDA_MAX_CHRONO);
147 
148 // Mapped memory buffer used to hold unitarization failures
// (num_failures_h is the host pointer; num_failures_d is the matching device
// alias obtained via cudaHostGetDevicePointer in initQudaMemory)
149 static int *num_failures_h = NULL;
150 static int *num_failures_d = NULL;
151 
// Properties of the CUDA device selected in initQudaDevice().
152 cudaDeviceProp deviceProp;
// Stream pool allocated in initQudaMemory() (Nstream entries).
153 cudaStream_t *streams;
154 #ifdef PTHREADS
155 pthread_mutex_t pthread_mutex;
156 #endif
157 
// Set once by initQudaDevice() to guard against double initialization.
158 static bool initialized = false;
// Per-call time profiles; each label names the interface function it times.
161 static TimeProfile profileInit("initQuda");
162 
164 static TimeProfile profileGauge("loadGaugeQuda");
165 
167 static TimeProfile profileClover("loadCloverQuda");
168 
170 static TimeProfile profileDslash("dslashQuda");
171 
173 static TimeProfile profileInvert("invertQuda");
174 
176 static TimeProfile profileMulti("invertMultiShiftQuda");
177 
179 static TimeProfile profileFatLink("computeKSLinkQuda");
180 
182 static TimeProfile profileGaugeForce("computeGaugeForceQuda");
183 
185 static TimeProfile profileGaugeUpdate("updateGaugeFieldQuda");
186 
188 static TimeProfile profileExtendedGauge("createExtendedGaugeField");
189 
191 static TimeProfile profileCloverForce("computeCloverForceQuda");
192 
194 static TimeProfile profileStaggeredForce("computeStaggeredForceQuda");
195 
197 static TimeProfile profileHISQForce("computeHISQForceQuda");
198 
200 static TimeProfile profilePlaq("plaqQuda");
201 
203 static TimeProfile profileWuppertal("wuppertalQuda");
204 
206 static TimeProfile profileGauss("gaussQuda");
207 
209 static TimeProfile profileQCharge("qChargeQuda");
210 
212 static TimeProfile profileAPE("APEQuda");
213 
215 static TimeProfile profileSTOUT("STOUTQuda");
216 
218 static TimeProfile profileOvrImpSTOUT("OvrImpSTOUTQuda");
219 
221 static TimeProfile profileProject("projectSU3Quda");
222 
224 static TimeProfile profilePhase("staggeredPhaseQuda");
225 
227 static TimeProfile profileContract("contractQuda");
228 
230 static TimeProfile profileCovDev("covDevQuda");
231 
233 static TimeProfile profileMomAction("momActionQuda");
234 
236 static TimeProfile profileEnd("endQuda");
237 
239 static TimeProfile GaugeFixFFTQuda("GaugeFixFFTQuda");
240 static TimeProfile GaugeFixOVRQuda("GaugeFixOVRQuda");
241 
242 
243 
// Aggregate profile spanning the whole initQuda()..endQuda() lifetime.
245 static TimeProfile profileInit2End("initQuda-endQuda",false);
247 namespace quda {
// forward declaration only; the implementation lives elsewhere in the library
248  void printLaunchTimer();
249 }
250 
// Set QUDA's global verbosity, output prefix, and output file.
// NOTE(review): the 'verbosity' argument is not used in the lines visible
// here; the listing skips source line 253, which presumably carried the
// setVerbosity(verbosity) call -- confirm against the upstream source.
251 void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
252 {
254  setOutputPrefix(prefix);
255  setOutputFile(outfile);
256 }
257 
258 
// User-data payload for lex_rank_from_coords(): describes the process grid.
259 typedef struct {
260  int ndim; // number of grid dimensions (always 4 in practice; see the check in initCommsGridQuda)
261  int dims[QUDA_MAX_DIM]; // extent of the process grid in each dimension
262 } LexMapData;
263 
267 static int lex_rank_from_coords(const int *coords, void *fdata)
268 {
269  LexMapData *md = static_cast<LexMapData *>(fdata);
270 
271  int rank = coords[0];
272  for (int i = 1; i < md->ndim; i++) {
273  rank = md->dims[i] * rank + coords[i];
274  }
275  return rank;
276 }
277 
278 #ifdef QMP_COMMS
279 
// QudaCommsMap callback that defers the coordinate-to-rank lookup to QMP's
// declared logical topology; the fdata argument is unused here.
282 static int qmp_rank_from_coords(const int *coords, void *fdata)
283 {
284  return QMP_get_node_number_from(coords);
285 }
286 #endif
287 
288 
// Set once comm_init() has been called (checked by initQudaDevice()).
289 static bool comms_initialized = false;
290 
// Declare the 4-d communications grid. If no rank-mapping function is
// supplied, fall back to QMP's declared topology (when available) or to a
// default lexicographical ordering.
// NOTE(review): the listing skips source line 322, which presumably set
// func = lex_rank_from_coords in the default path -- without it comm_init()
// would receive a null func here; confirm against the upstream source.
291 void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
292 {
293  if (nDim != 4) {
294  errorQuda("Number of communication grid dimensions must be 4");
295  }
296 
297  LexMapData map_data;
298  if (!func) {
299 
300 #if QMP_COMMS
301  if (QMP_logical_topology_is_declared()) {
302  if (QMP_get_logical_number_of_dimensions() != 4) {
303  errorQuda("QMP logical topology must have 4 dimensions");
304  }
305  for (int i=0; i<nDim; i++) {
306  int qdim = QMP_get_logical_dimensions()[i];
307  if(qdim != dims[i]) {
308  errorQuda("QMP logical dims[%d]=%d does not match dims[%d]=%d argument", i, qdim, i, dims[i]);
309  }
310  }
311  fdata = NULL;
312  func = qmp_rank_from_coords;
313  } else {
314  warningQuda("QMP logical topology is undeclared; using default lexicographical ordering");
315 #endif
316 
317  map_data.ndim = nDim;
318  for (int i=0; i<nDim; i++) {
319  map_data.dims[i] = dims[i];
320  }
321  fdata = (void *) &map_data;
323 
324 #if QMP_COMMS
325  }
326 #endif
327 
328  }
329  comm_init(nDim, dims, func, fdata);
330  comms_initialized = true;
331 }
332 
333 
// Initialize the comms grid when the user never called initCommsGridQuda():
// QMP builds -> use the declared QMP topology (error if undeclared);
// MPI builds -> always an error (explicit init is required);
// single-GPU builds -> trivial 1x1x1x1 grid.
334 static void init_default_comms()
335 {
336 #if defined(QMP_COMMS)
337  if (QMP_logical_topology_is_declared()) {
338  int ndim = QMP_get_logical_number_of_dimensions();
339  const int *dims = QMP_get_logical_dimensions();
340  initCommsGridQuda(ndim, dims, NULL, NULL);
341  } else {
342  errorQuda("initQuda() called without prior call to initCommsGridQuda(),"
343  " and QMP logical topology has not been declared");
344  }
345 #elif defined(MPI_COMMS)
346  errorQuda("When using MPI for communications, initCommsGridQuda() must be called before initQuda()");
347 #else // single-GPU
348  const int dims[4] = {1, 1, 1, 1};
349  initCommsGridQuda(4, dims, NULL, NULL);
350 #endif
351 }
352 
353 
// Two-level stringization helpers (expand the macro argument, then quote it).
354 #define STR_(x) #x
355 #define STR(x) STR_(x)
// NOTE(review): the listing skips source line 356, which presumably used
// STR(...) to build the quda_version string referenced in initQudaDevice()
// -- confirm against the upstream source.
357 #undef STR
358 #undef STR_
359 
// Git revision string, generated at build time (used when GITVERSION is set).
360 extern char* gitversion;
361 
362 /*
363  * Set the device that QUDA uses. Selects the CUDA device (from comms rank
364  * if dev < 0 under MULTI_GPU), validates the device's compute capability
365  * against the capability QUDA was built for, and applies NUMA affinity and
366  * cache-configuration settings. Safe to call only once; later calls return
367  * immediately.
368  */
365 void initQudaDevice(int dev) {
366 
367  //static bool initialized = false;
368  if (initialized) return;
369  initialized = true;
// NOTE(review): the listing skips source lines 370-373 here (presumably
// profiling start and the quda_version declaration) -- confirm upstream.
374 
375  if (getVerbosity() >= QUDA_SUMMARIZE) {
376 #ifdef GITVERSION
377  printfQuda("QUDA %s (git %s)\n",quda_version.c_str(),gitversion);
378 #else
379  printfQuda("QUDA %s\n",quda_version.c_str());
380 #endif
381  }
382 
383 #if defined(MULTI_GPU) && (CUDA_VERSION == 4000)
384  //check if CUDA_NIC_INTEROP is set to 1 in the environment
385  // not needed for CUDA >= 4.1
386  char* cni_str = getenv("CUDA_NIC_INTEROP");
387  if(cni_str == NULL){
388  errorQuda("Environment variable CUDA_NIC_INTEROP is not set");
389  }
390  int cni_int = atoi(cni_str);
391  if (cni_int != 1){
392  errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1");
393  }
394 #endif
395 
396  int deviceCount;
397  cudaGetDeviceCount(&deviceCount);
398  if (deviceCount == 0) {
399  errorQuda("No CUDA devices found");
400  }
401 
// enumerate all visible devices (also leaves deviceProp holding the last one
// until the selected device is queried below)
402  for(int i=0; i<deviceCount; i++) {
403  cudaGetDeviceProperties(&deviceProp, i);
404  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
405  if (getVerbosity() >= QUDA_SUMMARIZE) {
406  printfQuda("Found device %d: %s\n", i, deviceProp.name);
407  }
408  }
409 
410 #ifdef MULTI_GPU
// a negative ordinal means "derive the device from the comms rank"
411  if (dev < 0) {
412  if (!comms_initialized) {
413  errorQuda("initDeviceQuda() called with a negative device ordinal, but comms have not been initialized");
414  }
415  dev = comm_gpuid();
416  }
417 #else
418  if (dev < 0 || dev >= 16) errorQuda("Invalid device number %d", dev);
419 #endif
420 
421  cudaGetDeviceProperties(&deviceProp, dev);
422  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
423  if (deviceProp.major < 1) {
424  errorQuda("Device %d does not support CUDA", dev);
425  }
426 
427 
428 // Check GPU and QUDA build compatibility
429 // 4 cases:
430 // a) QUDA and GPU match: great
431 // b) QUDA built for higher compute capability: error
432 // c) QUDA built for lower major compute capability: warn if QUDA_ALLOW_JIT, else error
433 // d) QUDA built for same major compute capability but lower minor: warn
434 
435  const int my_major = __COMPUTE_CAPABILITY__ / 100;
436  const int my_minor = (__COMPUTE_CAPABILITY__ - my_major * 100) / 10;
437 // b) QUDA was compiled for a higher compute capability
438  if (deviceProp.major * 100 + deviceProp.minor * 10 < __COMPUTE_CAPABILITY__)
439  errorQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. ** \n --- Please set the correct QUDA_GPU_ARCH when running cmake.\n", deviceProp.major, deviceProp.minor, my_major, my_minor);
440 
441 
442 // c) QUDA was compiled for a lower compute capability
443  if (deviceProp.major < my_major) {
444  char *allow_jit_env = getenv("QUDA_ALLOW_JIT");
445  if (allow_jit_env && strcmp(allow_jit_env, "1") == 0) {
446  if (getVerbosity() > QUDA_SILENT) warningQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- Jitting the PTX since QUDA_ALLOW_JIT=1 was set. Note that this will take some time.\n", deviceProp.major, deviceProp.minor, my_major, my_minor);
447  } else {
448  errorQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n --- Please set the correct QUDA_GPU_ARCH when running cmake.\n If you want the PTX to be jitted for your current GPU arch please set the enviroment variable QUDA_ALLOW_JIT=1.", deviceProp.major, deviceProp.minor, my_major, my_minor);
449  }
450  }
451 // d) QUDA built for same major compute capability but lower minor
452  if (deviceProp.major == my_major and deviceProp.minor > my_minor) {
453  warningQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- This might result in a lower performance. Please consider adjusting QUDA_GPU_ARCH when running cmake.\n", deviceProp.major, deviceProp.minor, my_major, my_minor);
454  }
455 
456  if (getVerbosity() >= QUDA_SUMMARIZE) {
457  printfQuda("Using device %d: %s\n", dev, deviceProp.name);
458  }
459 #ifndef USE_QDPJIT
460  cudaSetDevice(dev);
461  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
462 #endif
463 
464 
465 #if ((CUDA_VERSION >= 6000) && defined NUMA_NVML)
// NUMA affinity is applied unless explicitly disabled via QUDA_ENABLE_NUMA=0
466  char *enable_numa_env = getenv("QUDA_ENABLE_NUMA");
467  if (enable_numa_env && strcmp(enable_numa_env, "0") == 0) {
468  if (getVerbosity() > QUDA_SILENT) printfQuda("Disabling numa_affinity\n");
469  }
470  else{
471  setNumaAffinityNVML(dev);
472  }
473 #endif
474 
475 
476 
477  cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
478  //cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
479  // cudaGetDeviceProperties(&deviceProp, dev);
480 
481  { // determine if we will do CPU or GPU data reordering (default is GPU)
482  char *reorder_str = getenv("QUDA_REORDER_LOCATION");
483 
484  if (!reorder_str || (strcmp(reorder_str,"CPU") && strcmp(reorder_str,"cpu")) ) {
485  warningQuda("Data reordering done on GPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
487  } else {
488  warningQuda("Data reordering done on CPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
490  }
491  }
492 
495 }
496 
497 /*
498  * Any persistent memory allocations that QUDA uses are done here:
499  * the stream pool, the blas and pool allocators, the mapped
 *  unitarization-failure counter, the tune cache, and the extended-field
 *  border widths R. NOTE(review): the function signature (source line 500,
 *  initQudaMemory -- see the call in initQuda() below) is elided in this
 *  listing.
 */
501 {
504 
506 
507  streams = new cudaStream_t[Nstream];
508 
509 #if (CUDA_VERSION >= 5050)
// give all but the last stream the greatest priority
510  int greatestPriority;
511  int leastPriority;
512  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
513  for (int i=0; i<Nstream-1; i++) {
514  cudaStreamCreateWithPriority(&streams[i], cudaStreamDefault, greatestPriority);
515  }
516  cudaStreamCreateWithPriority(&streams[Nstream-1], cudaStreamDefault, leastPriority);
517 #else
518  for (int i=0; i<Nstream; i++) {
519  cudaStreamCreate(&streams[i]);
520  }
521 #endif
522 
523  checkCudaError();
525  blas::init();
526 
527  // initialize the memory pool allocators
528  pool::init();
529 
// mapped (zero-copy) failure counter: host pointer plus its device alias
530  num_failures_h = static_cast<int*>(mapped_malloc(sizeof(int)));
531  cudaHostGetDevicePointer(&num_failures_d, num_failures_h, 0);
532 
533  loadTuneCache();
534 
// width-2 halo in every partitioned dimension (or all, if redundant_comms)
535  for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
536 
539 }
540 
541 void updateR()
542 {
543  for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
544 }
545 
// Top-level initialization entry point: selects the device, performs the
// persistent memory allocations, and (with PTHREADS) creates the global
// recursive mutex.
546 void initQuda(int dev)
547 {
548  // initialize communications topology, if not already done explicitly via initCommsGridQuda()
// NOTE(review): the listing skips source line 549, which presumably carried
// the guarded init_default_comms() call described by the comment above --
// confirm against the upstream source.
550 
551  // set the device that QUDA uses
552  initQudaDevice(dev);
553 
554  // set the persistent memory allocations that QUDA uses (Blas, streams, etc.)
555  initQudaMemory();
556 
557 #ifdef PTHREADS
// recursive mutex so QUDA interface calls may re-enter while holding the lock
558  pthread_mutexattr_t mutex_attr;
559  pthread_mutexattr_init(&mutex_attr);
560  pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
561  pthread_mutex_init(&pthread_mutex, &mutex_attr);
562 #endif
563 }
564 
565 // helper for creating extended gauge fields: allocates a device field
// enlarged by 2*R[d] sites in each direction with extended ghost exchange,
// copies the input field in, and fills the halo regions.
// NOTE(review): the signature (source lines 566-567) is elided in this
// listing; from the call site in loadGaugeQuda below it is
// createExtendedGauge(const cudaGaugeField &in, const int *R,
// TimeProfile &profile, ... recon) -- confirm upstream.
568 {
569  profile.TPSTART(QUDA_PROFILE_INIT);
570  int y[4];
571  for (int dir=0; dir<4; ++dir) y[dir] = in.X()[dir] + 2*R[dir];
572  int pad = 0;
573 
574  GaugeFieldParam gParamEx(y, in.Precision(), recon != QUDA_RECONSTRUCT_INVALID ? recon : in.Reconstruct(), pad,
575  in.Geometry(), QUDA_GHOST_EXCHANGE_EXTENDED);
576  gParamEx.create = QUDA_ZERO_FIELD_CREATE;
577  gParamEx.order = in.Order();
579  gParamEx.t_boundary = in.TBoundary();
580  gParamEx.nFace = 1;
581  gParamEx.tadpole = in.Tadpole();
582  for (int d=0; d<4; d++) gParamEx.r[d] = R[d];
583 
584  cudaGaugeField *out = new cudaGaugeField(gParamEx);
585 
586  // copy input field into the extended device gauge field
// NOTE(review): the copy itself (source line 587) is elided in this listing.
588 
589  profile.TPSTOP(QUDA_PROFILE_INIT);
590 
591  // now fill up the halos
592  out->exchangeExtendedGhost(R,profile,redundant_comms);
593 
594  return out;
595 }
596 
597 // This is a flag used to signal when we have downloaded new gauge
598 // field. Set by loadGaugeQuda and consumed by loadCloverQuda as one
599 // possible flag to indicate we need to recompute the clover field
600 static bool invalidate_clover = true;
601 
// Download a host gauge field and create the resident precise/sloppy/
// preconditioner (and optionally extended) device copies for the given
// link type. NOTE(review): this listing elides a number of source lines
// (e.g. 644-656 in the free switch, 669-676 around the precise-field
// parameter setup) -- the code below is reproduced as extracted; consult
// the upstream source before modifying.
602 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
603 {
605 
606  if (!initialized) errorQuda("QUDA not initialized");
608 
609  checkGaugeParam(param);
610 
612  // Set the specific input parameters and create the cpu gauge field
613  GaugeFieldParam gauge_param(h_gauge, *param);
614 
615  // if we are using half precision then we need to compute the fat
616  // link maximum while still on the cpu
617  // FIXME get a kernel for this
619  gauge_param.compute_fat_link_max = true;
620 
621  if (gauge_param.order <= 4) gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
623  static_cast<GaugeField*>(new cpuGaugeField(gauge_param)) :
624  static_cast<GaugeField*>(new cudaGaugeField(gauge_param));
625 
// BQCD-order fields carry a checksum: skip the (re)load entirely when the
// field content is unchanged since the previous call
626  if (in->Order() == QUDA_BQCD_GAUGE_ORDER) {
627  static size_t checksum = SIZE_MAX;
628  size_t in_checksum = in->checksum(true);
629  if (in_checksum == checksum) {
630  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached gauge field %lu\n", checksum);
633  delete in;
634  invalidate_clover = false;
635  return;
636  }
637  checksum = in_checksum;
638  invalidate_clover = true;
639  }
640 
641  // free any current gauge field before new allocations to reduce memory overhead
642  switch (param->type) {
643  case QUDA_WILSON_LINKS:
647  break;
652  break;
657  break;
658  case QUDA_SMEARED_LINKS:
659  if (gaugeSmeared) delete gaugeSmeared;
660  break;
661  default:
662  errorQuda("Invalid gauge type %d", param->type);
663  }
664 
665  // if not preserving then copy the gauge field passed in
666  cudaGaugeField *precise = NULL;
667 
668  // switch the parameters for creating the mirror precise cuda gauge field
670  gauge_param.precision = param->cuda_prec;
672  gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
673  gauge_param.pad = param->ga_pad;
674  gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||
677 
678  precise = new cudaGaugeField(gauge_param);
679 
680  if (param->use_resident_gauge) {
681  if(gaugePrecise == NULL) errorQuda("No resident gauge field");
682  // copy rather than point at to ensure that the padded region is filled in
683  precise->copy(*gaugePrecise);
684  precise->exchangeGhost();
685  delete gaugePrecise;
686  gaugePrecise = NULL;
688  } else {
691  precise->copy(*in);
693  }
694 
695  param->gaugeGiB += precise->GBytes();
696 
697  // for gaugeSmeared we are interested only in the precise version
698  if (param->type == QUDA_SMEARED_LINKS) {
700 
702  delete precise;
703  delete in;
705 
707  return;
708  }
709 
710  // creating sloppy fields isn't really compute, but it is work done on the gpu
712 
713  // switch the parameters for creating the mirror sloppy cuda gauge field
714  gauge_param.precision = param->cuda_prec_sloppy;
716  gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||
719  cudaGaugeField *sloppy = NULL;
722  sloppy = new cudaGaugeField(gauge_param);
723  sloppy->copy(*precise);
724  param->gaugeGiB += sloppy->GBytes();
725  } else {
// sloppy precision matches precise: alias rather than duplicate
726  sloppy = precise;
727  }
728 
729  // switch the parameters for creating the mirror preconditioner cuda gauge field
732  gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||
735  cudaGaugeField *precondition = NULL;
738  precondition = new cudaGaugeField(gauge_param);
739  precondition->copy(*sloppy);
740  param->gaugeGiB += precondition->GBytes();
741  } else {
// preconditioner precision matches sloppy: alias rather than duplicate
742  precondition = sloppy;
743  }
744 
746 
747  // create an extended preconditioning field
748  cudaGaugeField* extended = nullptr;
749  if (param->overlap){
750  int R[4]; // domain-overlap widths in different directions
751  for (int i=0; i<4; ++i) R[i] = param->overlap*commDimPartitioned(i);
752  extended = createExtendedGauge(*precondition, R, profileGauge);
753  }
754 
// publish the new fields into the resident globals for the given link type
755  switch (param->type) {
756  case QUDA_WILSON_LINKS:
757  gaugePrecise = precise;
758  gaugeSloppy = sloppy;
759  gaugePrecondition = precondition;
760 
761  if(param->overlap) gaugeExtended = extended;
762  break;
764  gaugeFatPrecise = precise;
765  gaugeFatSloppy = sloppy;
766  gaugeFatPrecondition = precondition;
767 
768  if(param->overlap){
769  if(gaugeFatExtended) errorQuda("Extended gauge fat field already allocated");
770  gaugeFatExtended = extended;
771  }
772  break;
774  gaugeLongPrecise = precise;
775  gaugeLongSloppy = sloppy;
776  gaugeLongPrecondition = precondition;
777 
778  if(param->overlap){
779  if(gaugeLongExtended) errorQuda("Extended gauge long field already allocated");
780  gaugeLongExtended = extended;
781  }
782  break;
783  default:
784  errorQuda("Invalid gauge type %d", param->type);
785  }
786 
788  delete in;
790 
791  if (extendedGaugeResident) {
792  // updated the resident gauge field if needed
793  const int *R_ = extendedGaugeResident->R();
794  const int R[] = { R_[0], R_[1], R_[2], R_[3] };
796  delete extendedGaugeResident;
798 
799  }
800 
802 }
803 
// Copy a resident device gauge field back into the host application's
// buffer (cpu output location only). For QUDA_SMEARED_LINKS a temporary
// padded device copy is created and freed after the save.
// NOTE(review): this listing elides several source lines (e.g. 820-826 in
// the type switch and 843-846 around the actual save) -- reproduced as
// extracted; consult the upstream source before modifying.
804 void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
805 {
807 
809  errorQuda("Non-cpu output location not yet supported");
810 
811  if (!initialized) errorQuda("QUDA not initialized");
812  checkGaugeParam(param);
813 
814  // Set the specific cpu parameters and create the cpu gauge field
815  GaugeFieldParam gauge_param(h_gauge, *param);
817  cudaGaugeField *cudaGauge = NULL;
818  switch (param->type) {
819  case QUDA_WILSON_LINKS:
821  break;
824  break;
827  break;
828  case QUDA_SMEARED_LINKS:
830  gauge_param.precision = param->cuda_prec;
832  gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
833  gauge_param.pad = param->ga_pad;
834  gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||
839  break;
840  default:
841  errorQuda("Invalid gauge type");
842  }
843 
847 
848  if(param->type == QUDA_SMEARED_LINKS) {
849  delete cudaGauge;
850  }
851 
853 }
854 
855 
857 void freeSloppyCloverQuda();
858 
// Download (or compute on the device) the clover term and, as needed, its
// inverse; optionally copies the results back to the host when
// inv_param->return_clover / return_clover_inverse are set. With neither
// host pointer supplied, or with compute_clover set, the field is computed
// on the device (requires an isotropic resident gauge field).
// NOTE(review): this listing elides a number of source lines (e.g. 860-866
// in the prologue, 952-953, 960-966 around the copy/compute branches) --
// reproduced as extracted; consult the upstream source before modifying.
859 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
860 {
863  bool device_calc = false; // calculate clover and inverse on the device?
864 
867 
868  if (!initialized) errorQuda("QUDA not initialized");
869 
870  if ( (!h_clover && !h_clovinv) || inv_param->compute_clover ) {
871  device_calc = true;
872  if (inv_param->clover_coeff == 0.0) errorQuda("called with neither clover term nor inverse and clover coefficient not set");
873  if (gaugePrecise->Anisotropy() != 1.0) errorQuda("cannot compute anisotropic clover field");
874  }
875 
876  if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) errorQuda("Half precision not supported on CPU");
877  if (gaugePrecise == NULL) errorQuda("Gauge field must be loaded before clover");
879  errorQuda("Wrong dslash_type %d in loadCloverQuda()", inv_param->dslash_type);
880  }
881 
882  // determines whether operator is preconditioned when calling invertQuda()
883  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE ||
886 
887  // determines whether operator is preconditioned when calling MatQuda() or MatDagMatQuda()
888  bool pc_solution = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
890 
891  bool asymmetric = (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
893 
894  // uninverted clover term is required when applying unpreconditioned operator,
895  // but note that dslashQuda() is always preconditioned
896  if (!h_clover && !pc_solve && !pc_solution) {
897  //warningQuda("Uninverted clover term not loaded");
898  }
899 
900  // uninverted clover term is also required for "asymmetric" preconditioning
901  if (!h_clover && pc_solve && pc_solution && asymmetric && !device_calc) {
902  warningQuda("Uninverted clover term not loaded");
903  }
904 
905  bool twisted = inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH ? true : false;
906 #ifdef DYNAMIC_CLOVER
907  bool dynamic_clover = twisted ? true : false; // dynamic clover only supported on twisted clover currently
908 #else
909  bool dynamic_clover = false;
910 #endif
911 
912  CloverFieldParam clover_param;
913  clover_param.nDim = 4;
914  clover_param.csw = inv_param->clover_coeff;
915  clover_param.twisted = twisted;
916  clover_param.mu2 = twisted ? 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu : 0.0;
917  clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;
918  for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i];
919  clover_param.pad = inv_param->cl_pad;
920  clover_param.create = QUDA_NULL_FIELD_CREATE;
921  clover_param.norm = nullptr;
922  clover_param.invNorm = nullptr;
923  clover_param.setPrecision(inv_param->clover_cuda_prec);
924  clover_param.direct = h_clover || device_calc ? true : false;
925  clover_param.inverse = (h_clovinv || pc_solve) && !dynamic_clover ? true : false;
926  CloverField *in = nullptr;
928 
929  // FIXME do we need to make this more robust to changing other meta data (compare cloverPrecise against clover_param)
930  bool clover_update = false;
931  double csw_old = cloverPrecise ? cloverPrecise->Csw() : 0.0;
932  if (!cloverPrecise || invalidate_clover || inv_param->clover_coeff != csw_old) clover_update = true;
933 
934  // compute or download clover field only if gauge field has been updated or clover field doesn't exist
935  if (clover_update) {
936  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating new clover field\n");
938  if (cloverPrecise) delete cloverPrecise;
939 
941  cloverPrecise = new cudaCloverField(clover_param);
942 
943  if (!device_calc || inv_param->return_clover || inv_param->return_clover_inverse) {
944  // create a param for the cpu clover field
945  CloverFieldParam inParam(clover_param);
947  inParam.order = inv_param->clover_order;
948  inParam.direct = h_clover ? true : false;
949  inParam.inverse = h_clovinv ? true : false;
950  inParam.clover = h_clover;
951  inParam.cloverInv = h_clovinv;
954  static_cast<CloverField*>(new cpuCloverField(inParam)) :
955  static_cast<CloverField*>(new cudaCloverField(inParam));
956  }
958 
959  if (!device_calc) {
961  cloverPrecise->copy(*in, h_clovinv && !inv_param->compute_clover_inverse ? true : false);
963  } else {
967  }
968 
969  // inverted clover term is required when applying preconditioned operator
970  if ((!h_clovinv || inv_param->compute_clover_inverse) && pc_solve) {
972  if (!dynamic_clover) {
975  inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
976  inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
977  }
978  }
980  }
981  } else {
982  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached clover field\n");
983  }
984 
986 
987  clover_param.direct = true;
988  clover_param.inverse = dynamic_clover ? false : true;
989 
991 
993 
994  // if requested, copy back the clover / inverse field
996  if (!h_clover && !h_clovinv) errorQuda("Requested clover field return but no clover host pointers set");
997 
998  // copy the inverted clover term into host application order on the device
999  clover_param.setPrecision(inv_param->clover_cpu_prec);
1000  clover_param.direct = (h_clover && inv_param->return_clover);
1001  clover_param.inverse = (h_clovinv && inv_param->return_clover_inverse);
1002 
1003  // this isn't really "epilogue" but this label suffices
1005  cudaCloverField *hack = nullptr;
1006  if (!dynamic_clover) {
1007  clover_param.order = inv_param->clover_order;
1008  hack = new cudaCloverField(clover_param);
1009  hack->copy(*cloverPrecise); // FIXME this can lead to an redundant copies if we're not copying back direct + inverse
1010  } else {
// dynamic clover: reconstruct the explicit inverse via a staging field
1011  cudaCloverField *hackOfTheHack = new cudaCloverField(clover_param); // Hack of the hack
1012  hackOfTheHack->copy(*cloverPrecise, false);
1015  inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
1016  inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
1017  }
1018  clover_param.order = inv_param->clover_order;
1019  hack = new cudaCloverField(clover_param);
1020  hack->copy(*hackOfTheHack); // FIXME this can lead to an redundant copies if we're not copying back direct + inverse
1021  delete hackOfTheHack;
1022  }
1024 
1025  // copy the field into the host application's clover field
1027  if (inv_param->return_clover) {
1028  qudaMemcpy((char*)(in->V(false)), (char*)(hack->V(false)), in->Bytes(), cudaMemcpyDeviceToHost);
1029  }
1031  qudaMemcpy((char*)(in->V(true)), (char*)(hack->V(true)), in->Bytes(), cudaMemcpyDeviceToHost);
1032  }
1033 
1035 
1036  delete hack;
1037  checkCudaError();
1038  }
1039 
1041  if (in) delete in; // delete object referencing input field
1043 
1044  popVerbosity();
1045 
1047 }
1048 
1049 void freeSloppyCloverQuda(void);
1050 
// Create the sloppy and preconditioner mirrors of the resident precise
// clover field, aliasing the next-higher-precision field when the requested
// precision matches it.
// NOTE(review): the function signature (source line 1051) is elided in this
// listing; from the body it takes the sloppy and preconditioner precisions
// (prec_sloppy, prec_precondition) -- confirm against the upstream source.
// The alias assignments in the else branches (lines 1073, 1084) are also
// elided.
1052 {
1054 
1055  if (cloverPrecise) {
1056  // create the mirror sloppy clover field
1057  CloverFieldParam clover_param(*cloverPrecise);
1058 
1059  clover_param.setPrecision(prec_sloppy);
1060 
// direct term is only present when V(false) and V(true) are distinct buffers
1061  if (cloverPrecise->V(false) != cloverPrecise->V(true)) {
1062  clover_param.direct = true;
1063  clover_param.inverse = true;
1064  } else {
1065  clover_param.direct = false;
1066  clover_param.inverse = true;
1067  }
1068 
1069  if (clover_param.precision != cloverPrecise->Precision()) {
1070  cloverSloppy = new cudaCloverField(clover_param);
1071  cloverSloppy->copy(*cloverPrecise, clover_param.inverse);
1072  } else {
1074  }
1075 
1076  // switch the parameters for creating the mirror preconditioner clover field
1077  clover_param.setPrecision(prec_precondition);
1078 
1079  // create the mirror preconditioner clover field
1080  if (clover_param.precision != cloverSloppy->Precision()) {
1081  cloverPrecondition = new cudaCloverField(clover_param);
1082  cloverPrecondition->copy(*cloverSloppy, clover_param.inverse);
1083  } else {
1085  }
1086  }
1087 
1088 }
1089 
// Free every resident device gauge field (Wilson, long, fat, smeared, and
// the extended resident field) and null the corresponding globals. Fields
// that alias each other are guarded against double deletion.
// NOTE(review): this listing elides some deletion lines (e.g. 1093,
// 1103-1106, 1113-1114, apparently the precondition/sloppy/extended deletes
// for the long and fat families) -- confirm against the upstream source.
1090 void freeGaugeQuda(void)
1091 {
1092  if (!initialized) errorQuda("QUDA not initialized");
1094  if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy;
1095  if (gaugePrecise) delete gaugePrecise;
1096  if (gaugeExtended) delete gaugeExtended;
1097 
1098  gaugePrecondition = NULL;
1099  gaugeSloppy = NULL;
1100  gaugePrecise = NULL;
1101  gaugeExtended = NULL;
1102 
1105  if (gaugeLongPrecise) delete gaugeLongPrecise;
1107 
1108  gaugeLongPrecondition = NULL;
1109  gaugeLongSloppy = NULL;
1110  gaugeLongPrecise = NULL;
1111  gaugeLongExtended = NULL;
1112 
1115  if (gaugeFatPrecise) delete gaugeFatPrecise;
1116 
1117  gaugeFatPrecondition = NULL;
1118  gaugeFatSloppy = NULL;
1119  gaugeFatPrecise = NULL;
1120  gaugeFatExtended = NULL;
1121 
1122  if (gaugeSmeared) delete gaugeSmeared;
1123 
1124  gaugeSmeared = NULL;
1125  // Need to merge extendedGaugeResident and gaugeFatPrecise/gaugePrecise
1126  if (extendedGaugeResident) {
1127  delete extendedGaugeResident;
1128  extendedGaugeResident = NULL;
1129  }
1130 }
1131 
1132 // just free the sloppy fields used in mixed-precision solvers
// NOTE(review): the function signature (source line 1133, presumably
// freeSloppyGaugeQuda) and the long/fat sloppy deletion lines (1142-1143,
// 1148-1149) are elided in this listing -- confirm upstream.
1134 {
1135  if (!initialized) errorQuda("QUDA not initialized");
// delete only when the sloppy field does not alias the precise field
1137  if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy;
1138 
1139  gaugePrecondition = NULL;
1140  gaugeSloppy = NULL;
1141 
1144 
1145  gaugeLongPrecondition = NULL;
1146  gaugeLongSloppy = NULL;
1147 
1150 
1151  gaugeFatPrecondition = NULL;
1152  gaugeFatSloppy = NULL;
1153 }
1154 
1155 
1157 {
1158  // first do SU3 links (if they exist)
1159  if (gaugePrecise) {
1161  gauge_param.setPrecision(prec_sloppy);
1162  //gauge_param.reconstruct = param->reconstruct_sloppy; // FIXME
1163 
1164  if (gaugeSloppy) errorQuda("gaugeSloppy already exists");
1165 
1166  if (gauge_param.precision != gaugePrecise->Precision() ||
1170  } else {
1172  }
1173 
1174  // switch the parameters for creating the mirror preconditioner cuda gauge field
1175  gauge_param.setPrecision(prec_precondition);
1176  //gauge_param.reconstruct = param->reconstruct_precondition; // FIXME
1177 
1178  if (gaugePrecondition) errorQuda("gaugePrecondition already exists");
1179 
1180  if (gauge_param.precision != gaugeSloppy->Precision() ||
1184  } else {
1186  }
1187  }
1188 
1189  // fat links (if they exist)
1190  if (gaugeFatPrecise) {
1192 
1193  if (gaugeFatSloppy != gaugeSloppy) {
1194  gauge_param.setPrecision(prec_sloppy);
1195  //gauge_param.reconstruct = param->reconstruct_sloppy; // FIXME
1196 
1197  if (gaugeFatSloppy) errorQuda("gaugeFatSloppy already exists");
1199 
1200  if (gauge_param.precision != gaugeFatPrecise->Precision() ||
1204  } else {
1206  }
1207  }
1208 
1210  // switch the parameters for creating the mirror preconditioner cuda gauge field
1211  gauge_param.setPrecision(prec_precondition);
1212  //gauge_param.reconstruct = param->reconstruct_precondition; // FIXME
1213 
1214  if (gaugeFatPrecondition) errorQuda("gaugeFatPrecondition already exists\n");
1215 
1216  if (gauge_param.precision != gaugeFatSloppy->Precision() ||
1220  } else {
1222  }
1223  }
1224  }
1225 
1226  // long links (if they exist)
1227  if (gaugeLongPrecise) {
1229  gauge_param.setPrecision(prec_sloppy);
1230  //gauge_param.reconstruct = param->reconstruct_sloppy; // FIXME
1231 
1232  if (gaugeLongSloppy) errorQuda("gaugeLongSloppy already exists");
1234 
1235  if (gauge_param.precision != gaugeLongPrecise->Precision() ||
1239  } else {
1241  }
1242 
1243  // switch the parameters for creating the mirror preconditioner cuda gauge field
1244  gauge_param.setPrecision(prec_precondition);
1245  //gauge_param.reconstruct = param->reconstruct_precondition; // FIXME
1246 
1247  if (gaugeLongPrecondition) warningQuda("gaugeLongPrecondition already exists\n");
1248 
1249  if (gauge_param.precision != gaugeLongSloppy->Precision() ||
1253  } else {
1255  }
1256  }
1257 }
1258 
// Free the sloppy/preconditioner clover fields and reset the resident
// pointers. NOTE(review): the signature and the delete statements are elided
// in this listing (lines 1259, 1262-1263) — only the pointer resets survive.
1260 {
1261  if (!initialized) errorQuda("QUDA not initialized");
1264  cloverPrecondition = nullptr;
1265  cloverSloppy = nullptr;
1266 }
1267 
// Free the resident precise clover field (line 1271, elided here, presumably
// frees the sloppy/preconditioner copies first — TODO confirm).
1268 void freeCloverQuda(void)
1269 {
1270  if (!initialized) errorQuda("QUDA not initialized");
1272  if (cloverPrecise) delete cloverPrecise;
// Reset so a stale pointer is never reused by later solves.
1273  cloverPrecise = nullptr;
1274 }
1275 
// Release the chronological-forecasting basis stored at index i, deleting both
// fields of every stored pair and emptying the container. The signature
// (line 1276) is elided in this listing; `i` is the chrono index argument.
1277 {
1278  if (i >= QUDA_MAX_CHRONO)
1279  errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO);
1280 
1281  auto &basis = chronoResident[i];
1282 
// Each basis entry is a pair of owned field pointers; guard against nulls.
1283  for (unsigned int j=0; j<basis.size(); j++) {
1284  if (basis[j].first) delete basis[j].first;
1285  if (basis[j].second) delete basis[j].second;
1286  }
1287  basis.clear();
1288 }
1289 
// Tear down the QUDA library: free all resident fields, shut down the blas
// and communications subsystems, destroy CUDA streams, dump profiling data,
// and optionally reset the CUDA device. Safe to call when not initialized
// (returns immediately). NOTE(review): several lines of this listing are
// elided (non-contiguous numbering), so some cleanup steps are not visible.
1290 void endQuda(void)
1291 {
1292  profileEnd.TPSTART(QUDA_PROFILE_TOTAL);
1293 
1294  if (!initialized) return;
1295 
// Release all resident gauge/clover fields and chrono bases.
1296  freeGaugeQuda();
1297  freeCloverQuda();
1298 
1299  for (int i=0; i<QUDA_MAX_CHRONO; i++) flushChronoQuda(i);
1300 
1301  for (auto v : solutionResident) if (v) delete v;
1302  solutionResident.clear();
1303 
1304  if(momResident) delete momResident;
1305 
1308 
1309  blas::end();
1310 
1313 
// Failure counters were (presumably) freed in the elided lines above;
// clear the pointers here.
1315  num_failures_h = NULL;
1316  num_failures_d = NULL;
1317 
1318  if (streams) {
1319  for (int i=0; i<Nstream; i++) cudaStreamDestroy(streams[i]);
1320  delete []streams;
1321  streams = NULL;
1322  }
1324 
// Persist autotuning results and profile data before shutting down comms.
1325  saveTuneCache();
1326  saveProfile();
1327 
1328  initialized = false;
1329 
1330  comm_finalize();
1331  comms_initialized = false;
1332 
1335 
1336  // print out the profile information of the lifetime of the library
1337  if (getVerbosity() >= QUDA_SUMMARIZE) {
1338  profileInit.Print();
1339  profileGauge.Print();
1340  profileClover.Print();
1341  profileDslash.Print();
1342  profileInvert.Print();
1343  profileMulti.Print();
1352  profileCovDev.Print();
1353  profilePlaq.Print();
1355  profileAPE.Print();
1356  profileSTOUT.Print();
1358  profilePhase.Print();
1360  profileEnd.Print();
1361 
1364 
1365  printLaunchTimer();
1366  printAPIProfile();
1367 
1368  printfQuda("\n");
1370  printfQuda("\n");
1371  }
1372 
// Abort if any device/host allocations tracked by QUDA were leaked.
1373  assertAllMemFree();
1374 
1375  char *device_reset_env = getenv("QUDA_DEVICE_RESET");
1376  if (device_reset_env && strcmp(device_reset_env,"1") == 0) {
1377  // end this CUDA context
1378  cudaDeviceReset();
1379  }
1380 
1381 }
1382 
1383 
1384 namespace quda {
1385 
// Translate a QudaInvertParam into a DiracParam describing the precise
// (full-precision) Dirac operator: pick the operator type from dslash_type
// (with even-odd-preconditioned variants when pc is true), copy the physics
// parameters, and attach the resident precise gauge/clover fields.
// NOTE(review): several case labels and branch bodies are elided in this
// listing (non-contiguous numbering) — e.g. the clover, domain-wall and
// twisted-mass case labels are missing their `case` lines.
1386  void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
1387  {
1388  double kappa = inv_param->kappa;
1391  }
1392 
1393  switch (inv_param->dslash_type) {
1394  case QUDA_WILSON_DSLASH:
1395  diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC;
1396  break;
1398  diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC;
1399  break;
1402  diracParam.Ls = inv_param->Ls;
1403  break;
// 4D-preconditioned domain wall requires even-odd preconditioning.
1405  if(pc) {
1406  diracParam.type = QUDA_DOMAIN_WALL_4DPC_DIRAC;
1407  diracParam.Ls = inv_param->Ls;
1408  } else errorQuda("For 4D type of DWF dslash, pc must be turned on, %d", inv_param->dslash_type);
1409  break;
1411  if (inv_param->Ls > QUDA_MAX_DWF_LS)
1412  errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS);
1414  diracParam.Ls = inv_param->Ls;
// Mobius coefficients: copy the per-slice b_5/c_5 arrays.
1415  memcpy(diracParam.b_5, inv_param->b_5, sizeof(double)*inv_param->Ls);
1416  memcpy(diracParam.c_5, inv_param->c_5, sizeof(double)*inv_param->Ls);
1417  break;
1418  case QUDA_STAGGERED_DSLASH:
1419  diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
1420  break;
1421  case QUDA_ASQTAD_DSLASH:
1422  diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC;
1423  break;
// Twisted-mass/clover flavors (case labels elided): Ls encodes the flavor
// count — 1 for a single flavor, 2 for the non-degenerate doublet.
1427  diracParam.Ls = 1;
1428  diracParam.epsilon = 0.0;
1429  } else {
1430  diracParam.Ls = 2;
1432  }
1433  break;
1437  diracParam.Ls = 1;
1438  diracParam.epsilon = 0.0;
1439  } else {
1440  diracParam.Ls = 2;
1442  }
1443  break;
1444  case QUDA_LAPLACE_DSLASH:
1446  break;
1447  case QUDA_COVDEV_DSLASH:
1448  diracParam.type = QUDA_GAUGE_COVDEV_DIRAC;
1449  break;
1450  default:
1451  errorQuda("Unsupported dslash_type %d", inv_param->dslash_type);
1452  }
1453 
// Attach physics parameters and the resident precise fields.
1454  diracParam.matpcType = inv_param->matpc_type;
1455  diracParam.dagger = inv_param->dagger;
1456  diracParam.gauge = gaugePrecise;
1457  diracParam.fatGauge = gaugeFatPrecise;
1458  diracParam.longGauge = gaugeLongPrecise;
1459  diracParam.clover = cloverPrecise;
1460  diracParam.kappa = kappa;
1461  diracParam.mass = inv_param->mass;
1462  diracParam.m5 = inv_param->m5;
1463  diracParam.mu = inv_param->mu;
1464 
1465  for (int i=0; i<4; i++) diracParam.commDim[i] = 1; // comms are always on
1466  }
1467 
1468 
1469  void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
1470  {
1471  setDiracParam(diracParam, inv_param, pc);
1472 
1473  diracParam.gauge = gaugeSloppy;
1474  diracParam.fatGauge = gaugeFatSloppy;
1475  diracParam.longGauge = gaugeLongSloppy;
1476  diracParam.clover = cloverSloppy;
1477 
1478  for (int i=0; i<4; i++) {
1479  diracParam.commDim[i] = 1; // comms are always on
1480  }
1481 
1482  }
1483 
1484  // The preconditioner currently mimicks the sloppy operator with no comms
// Populate a DiracParam for the preconditioner operator: reuse the precise
// settings, then attach either the extended fields (overlap preconditioning)
// or the dedicated preconditioner fields, and enable/disable communications
// per the `comms` flag. NOTE(review): the condition of the final `if` (lines
// 1505-1506) is elided — it presumably tests for a staggered-type
// preconditioner dslash; TODO confirm against the full source.
1485  void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
1486  {
1487  setDiracParam(diracParam, inv_param, pc);
1488 
1489  if(inv_param->overlap){
1490  diracParam.gauge = gaugeExtended;
1491  diracParam.fatGauge = gaugeFatExtended;
1492  diracParam.longGauge = gaugeLongExtended;
1493  }else{
1494  diracParam.gauge = gaugePrecondition;
1495  diracParam.fatGauge = gaugeFatPrecondition;
1496  diracParam.longGauge = gaugeLongPrecondition;
1497  }
1498  diracParam.clover = cloverPrecondition;
1499 
1500  for (int i=0; i<4; i++) {
1501  diracParam.commDim[i] = comms ? 1 : 0;
1502  }
1503 
1504  // In the preconditioned staggered CG allow a different dslash type in the preconditioning
1507  diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
1508  diracParam.gauge = gaugeFatPrecondition;
1509  }
1510  }
1511 
1512 
1513  void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
1514  {
1515  DiracParam diracParam;
1516  DiracParam diracSloppyParam;
1517  DiracParam diracPreParam;
1518 
1519  setDiracParam(diracParam, &param, pc_solve);
1520  setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
1521  bool comms_flag = (param.inv_type != QUDA_INC_EIGCG_INVERTER) ? false : true ;//inc eigCG needs 2 sloppy precisions.
1522  setDiracPreParam(diracPreParam, &param, pc_solve, comms_flag);
1523 
1524 
1525  d = Dirac::create(diracParam); // create the Dirac operator
1526  dSloppy = Dirac::create(diracSloppyParam);
1527  dPre = Dirac::create(diracPreParam);
1528  }
1529 
// Rescale the source vector b (and any multi-shift offsets) to compensate for
// the normalization convention of the Dirac operator being solved.
// NOTE(review): the function signature and `{` are elided in this listing
// (lines 1530/1532) — presumably `massRescale(cudaColorSpinorField &b,
// QudaInvertParam &param)`; TODO confirm against the full source. Some case
// labels are also elided (e.g. the MATDAG_MAT / MATPCDAG_MATPC cases).
1531 
1533 
// Domain-wall variants use the 5d kappa built from m5; others use param.kappa.
1534  double kappa5 = (0.5/(5.0 + param.m5));
1535  double kappa = (param.dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
1536  param.dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
1537  param.dslash_type == QUDA_MOBIUS_DWF_DSLASH) ? kappa5 : param.kappa;
1538 
1540  printfQuda("Mass rescale: Kappa is: %g\n", kappa);
1541  printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);
1542  double nin = blas::norm2(b);
1543  printfQuda("Mass rescale: norm of source in = %g\n", nin);
1544  }
1545 
1546  // staggered dslash uses mass normalization internally
1547  if (param.dslash_type == QUDA_ASQTAD_DSLASH || param.dslash_type == QUDA_STAGGERED_DSLASH) {
1548  switch (param.solution_type) {
1549  case QUDA_MAT_SOLUTION:
1550  case QUDA_MATPC_SOLUTION:
1551  if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(2.0*param.mass, b);
1552  break;
1555  if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(4.0*param.mass*param.mass, b);
1556  break;
1557  default:
1558  errorQuda("Not implemented");
1559  }
1560  return;
1561  }
1562 
// Remember the unscaled shifts so they can be restored after the solve.
1563  for(int i=0; i<param.num_offset; i++) {
1564  unscaled_shifts[i] = param.offset[i];
1565  }
1566 
1567  // multiply the source to compensate for normalization of the Dirac operator, if necessary
1568  switch (param.solution_type) {
1569  case QUDA_MAT_SOLUTION:
1570  if (param.mass_normalization == QUDA_MASS_NORMALIZATION ||
1571  param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1572  blas::ax(2.0*kappa, b);
1573  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 2.0*kappa;
1574  }
1575  break;
1577  if (param.mass_normalization == QUDA_MASS_NORMALIZATION ||
1578  param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1579  blas::ax(4.0*kappa*kappa, b);
1580  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 4.0*kappa*kappa;
1581  }
1582  break;
1583  case QUDA_MATPC_SOLUTION:
1584  if (param.mass_normalization == QUDA_MASS_NORMALIZATION) {
1585  blas::ax(4.0*kappa*kappa, b);
1586  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 4.0*kappa*kappa;
1587  } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1588  blas::ax(2.0*kappa, b);
1589  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 2.0*kappa;
1590  }
1591  break;
1593  if (param.mass_normalization == QUDA_MASS_NORMALIZATION) {
1594  blas::ax(16.0*std::pow(kappa,4), b);
1595  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 16.0*std::pow(kappa,4);
1596  } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1597  blas::ax(4.0*kappa*kappa, b);
1598  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 4.0*kappa*kappa;
1599  }
1600  break;
1601  default:
1602  errorQuda("Solution type %d not supported", param.solution_type);
1603  }
1604 
1605  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n");
1606  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1607  printfQuda("Mass rescale: Kappa is: %g\n", kappa);
1608  printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);
1609  double nin = blas::norm2(b);
1610  printfQuda("Mass rescale: norm of source out = %g\n", nin);
1611  }
1612 
1613  }
1614 }
1615 
// Apply a single parity dslash to the host spinor h_in, writing the result to
// h_out. Wraps the host pointers in ColorSpinorFields, transfers to the
// device, applies the operator, and transfers back. NOTE(review): several
// lines are elided (profiling, parameter checks, twisted-clover conditional
// headers), so some guards are only partially visible.
1616 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
1617 {
1619 
1624 
1625  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1627  errorQuda("Clover field not allocated");
1628 
1631 
// Wrap host input, then reuse cpuParam (with v/location retargeted) for output.
1633  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
1634  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1635 
1636  cpuParam.v = h_out;
1637  cpuParam.location = inv_param->output_location;
1638  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
1639 
1640  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1641  cudaColorSpinorField in(*in_h, cudaParam);
1642  cudaColorSpinorField out(in, cudaParam);
1643 
1644  bool pc = true;
1645  DiracParam diracParam;
1646  setDiracParam(diracParam, inv_param, pc);
1647 
1649 
// download the source field to the device
1651  in = *in_h;
1653 
1655 
1656  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1657  double cpu = blas::norm2(*in_h);
1658  double gpu = blas::norm2(in);
1659  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1660  }
1661 
// Mass-normalization rescaling (the guarding condition is elided here).
1665  blas::ax(1.0/(2.0*inv_param->mass), in);
1666 
1668  if (parity == QUDA_EVEN_PARITY) {
1670  } else {
1672  }
1674  }
1675 
1676  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
// Twisted-clover path (condition elided): apply the inverse clover-twist
// to the opposite parity before the hopping term.
1678  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1679  cudaColorSpinorField tmp1(in, cudaParam);
1680  ((DiracTwistedCloverPC*) dirac)->TwistCloverInv(tmp1, in, (parity+1)%2); // apply the clover-twist
1681  dirac->Dslash(out, tmp1, parity); // apply the operator
1682  } else {
1683  dirac->Dslash(out, in, parity); // apply the operator
1684  }
1686 
// upload the result back to the host wrapper
1688  *out_h = out;
1690 
1691  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1692  double cpu = blas::norm2(*out_h);
1693  double gpu = blas::norm2(out);
1694  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1695  }
1696 
1698  delete dirac; // clean up
1699 
1700  delete out_h;
1701  delete in_h;
1703 
1704  popVerbosity();
1706 }
1707 
// Test entry point applying individual pieces of the 4D-preconditioned
// domain-wall operator (Dslash4 / Dslash5 / Dslash5inv) selected by
// test_type. NOTE(review): the signature is elided in this listing —
// presumably it takes (h_out, h_in, inv_param, parity, test_type); the
// leading dslash_type check's `if` line is also elided.
1709 {
1711  setKernelPackT(true);
1712  else
1713  errorQuda("This type of dslashQuda operator is defined for QUDA_DOMAIN_WALL_$D_DSLASH and QUDA_MOBIUS_DWF_DSLASH only");
1714 
1715  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1716 
1719 
1721  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
1722 
1723  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1724  cudaColorSpinorField in(*in_h, cudaParam);
1725 
1726  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1727  double cpu = blas::norm2(*in_h);
1728  double gpu = blas::norm2(in);
1729  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1730  }
1731 
1732  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1733  cudaColorSpinorField out(in, cudaParam);
1734 
// Parity remapping block (condition and bodies elided in this listing).
1736  if (parity == QUDA_EVEN_PARITY) {
1738  } else {
1740  }
1742  }
1743  bool pc = true;
1744 
1745  DiracParam diracParam;
1746  setDiracParam(diracParam, inv_param, pc);
1747 
1748  DiracDomainWall4DPC dirac(diracParam); // create the Dirac operator
1749  printfQuda("kappa for QUDA input : %e\n",inv_param->kappa);
// Select which component of the operator to exercise.
1750  switch (test_type) {
1751  case 0:
1752  dirac.Dslash4(out, in, parity);
1753  break;
1754  case 1:
1755  dirac.Dslash5(out, in, parity);
1756  break;
1757  case 2:
1758  dirac.Dslash5inv(out, in, parity, inv_param->kappa);
1759  break;
1760  }
1761 
1762  cpuParam.v = h_out;
1763  cpuParam.location = inv_param->output_location;
1764  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
1765  *out_h = out;
1766 
1767  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1768  double cpu = blas::norm2(*out_h);
1769  double gpu = blas::norm2(out);
1770  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1771  }
1772 
1773  delete out_h;
1774  delete in_h;
1775 
1776  popVerbosity();
1777 }
1778 
// Test entry point applying individual pieces of the Mobius domain-wall
// operator (Dslash4 / Dslash5 / Dslash4pre / Dslash5inv) selected by
// test_type. NOTE(review): the signature is elided in this listing —
// presumably (h_out, h_in, inv_param, parity, test_type); the leading
// dslash_type check's `if` line is also elided.
1780 {
1782  setKernelPackT(true);
1783  else
1784  errorQuda("This type of dslashQuda operator is defined for QUDA_DOMAIN_WALL_$D_DSLASH and QUDA_MOBIUS_DWF_DSLASH only");
1785 
1786  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1787 
1790 
1792  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
1793 
1794  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1795  cudaColorSpinorField in(*in_h, cudaParam);
1796 
1797  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1798  double cpu = blas::norm2(*in_h);
1799  double gpu = blas::norm2(in);
1800  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1801  }
1802 
1803  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1804  cudaColorSpinorField out(in, cudaParam);
1805 
// Parity remapping block (condition and bodies elided in this listing).
1807  if (parity == QUDA_EVEN_PARITY) {
1809  } else {
1811  }
1813  }
1814  bool pc = true;
1815 
1816  DiracParam diracParam;
1817  setDiracParam(diracParam, inv_param, pc);
1818 
1819  DiracMobiusPC dirac(diracParam); // create the Dirac operator
// Select which component of the Mobius operator to exercise.
1820  switch (test_type) {
1821  case 0:
1822  dirac.Dslash4(out, in, parity);
1823  break;
1824  case 1:
1825  dirac.Dslash5(out, in, parity);
1826  break;
1827  case 2:
1828  dirac.Dslash4pre(out, in, parity);
1829  break;
1830  case 3:
1831  dirac.Dslash5inv(out, in, parity);
1832  break;
1833  }
1834 
1835  cpuParam.v = h_out;
1836  cpuParam.location = inv_param->output_location;
1837  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
1838  *out_h = out;
1839 
1840  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1841  double cpu = blas::norm2(*out_h);
1842  double gpu = blas::norm2(out);
1843  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1844  }
1845 
1846  delete out_h;
1847  delete in_h;
1848 
1849  popVerbosity();
1850 }
1851 
1852 
// Apply the full Dirac matrix M to the host spinor h_in, writing M*h_in to
// h_out, with post-hoc rescaling to match the requested mass normalization.
// NOTE(review): several lines are elided (initialization checks, the second
// half of the `pc` condition, and the normalization conditionals around the
// blas::ax calls), so the rescale branches are only partially visible.
1853 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
1854 {
1856 
1860 
1861  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1863  errorQuda("Clover field not allocated");
1865 
// pc is true for preconditioned (checkerboard) solution types.
1866  bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
1868 
1869  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), pc, inv_param->input_location);
1870  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
1871 
1872  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1873  cudaColorSpinorField in(*in_h, cudaParam);
1874 
1875  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1876  double cpu = blas::norm2(*in_h);
1877  double gpu = blas::norm2(in);
1878  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1879  }
1880 
1881  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1882  cudaColorSpinorField out(in, cudaParam);
1883 
1884  DiracParam diracParam;
1885  setDiracParam(diracParam, inv_param, pc);
1886 
1887  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
1888  dirac->M(out, in); // apply the operator
1889  delete dirac; // clean up
1890 
// Undo the kappa normalization of the operator (conditions elided).
1891  double kappa = inv_param->kappa;
1892  if (pc) {
1894  blas::ax(0.25/(kappa*kappa), out);
1896  blas::ax(0.5/kappa, out);
1897  }
1898  } else {
1901  blas::ax(0.5/kappa, out);
1902  }
1903  }
1904 
1905  cpuParam.v = h_out;
1906  cpuParam.location = inv_param->output_location;
1907  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
1908  *out_h = out;
1909 
1910  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1911  double cpu = blas::norm2(*out_h);
1912  double gpu = blas::norm2(out);
1913  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1914  }
1915 
1916  delete out_h;
1917  delete in_h;
1918 
1919  popVerbosity();
1920 }
1921 
1922 
// Apply the normal operator M^dag M to the host spinor h_in, writing the
// result to h_out, with rescaling for the requested mass normalization.
// NOTE(review): the second half of the `pc` condition and the normalization
// conditionals around the blas::ax calls are elided in this listing.
1923 void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
1924 {
1926 
1930 
1931  if (!initialized) errorQuda("QUDA not initialized");
1932  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1934  errorQuda("Clover field not allocated");
1936 
1937  bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
1939 
1940  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), pc, inv_param->input_location);
1941  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
1942 
1943  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1944  cudaColorSpinorField in(*in_h, cudaParam);
1945 
1946  if (getVerbosity() >= QUDA_DEBUG_VERBOSE){
1947  double cpu = blas::norm2(*in_h);
1948  double gpu = blas::norm2(in);
1949  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1950  }
1951 
1952  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1953  cudaColorSpinorField out(in, cudaParam);
1954 
1955  // double kappa = inv_param->kappa;
1956  // if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= gaugePrecise->anisotropy;
1957 
1958  DiracParam diracParam;
1959  setDiracParam(diracParam, inv_param, pc);
1960 
1961  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
1962  dirac->MdagM(out, in); // apply the operator
1963  delete dirac; // clean up
1964 
// Undo the kappa normalization; M^dag M needs kappa powers squared relative
// to the single-application case in MatQuda (conditions elided).
1965  double kappa = inv_param->kappa;
1966  if (pc) {
1968  blas::ax(1.0/std::pow(2.0*kappa,4), out);
1970  blas::ax(0.25/(kappa*kappa), out);
1971  }
1972  } else {
1975  blas::ax(0.25/(kappa*kappa), out);
1976  }
1977  }
1978 
1979  cpuParam.v = h_out;
1980  cpuParam.location = inv_param->output_location;
1981  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
1982  *out_h = out;
1983 
1984  if (getVerbosity() >= QUDA_DEBUG_VERBOSE){
1985  double cpu = blas::norm2(*out_h);
1986  double gpu = blas::norm2(out);
1987  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1988  }
1989 
1990  delete out_h;
1991  delete in_h;
1992 
1993  popVerbosity();
1994 }
1995 
// Helper reporting whether the resident precise gauge field exists and
// matches the requested solve precision. NOTE(review): the function signature
// (line 1997) is elided in this listing — only the return expression is
// visible; `param` is presumably a QudaInvertParam*.
1996 namespace quda{
1998  return (gaugePrecise != NULL) and param->cuda_prec == gaugePrecise->Precision();
1999 }
2000 }
2001 
// Validate that the resident clover fields required by a clover-type dslash
// exist and match the requested precision; no-op for non-clover operators.
// NOTE(review): the signature (line 2002) and an intermediate block
// (lines 2012-2015) are elided in this listing. Also note the error messages
// below say "gauge field" although they test clover fields — likely a
// copy/paste slip worth fixing upstream.
2003 
2004  if (param->dslash_type != QUDA_CLOVER_WILSON_DSLASH && param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) {
2005  return;
2006  }
2007 
2008  if (param->cuda_prec != cloverPrecise->Precision()) {
2009  errorQuda("Solve precision %d doesn't match clover precision %d", param->cuda_prec, cloverPrecise->Precision());
2010  }
2011 
2016  }
2017 
2018  if (cloverPrecise == nullptr) errorQuda("Precise gauge field doesn't exist");
2019  if (cloverSloppy == nullptr) errorQuda("Sloppy gauge field doesn't exist");
2020  if (cloverPrecondition == nullptr) errorQuda("Precondition gauge field doesn't exist");
2021 }
2022 
2024 
2025  if (param->cuda_prec != gaugePrecise->Precision()) {
2026  errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugePrecise->Precision());
2027  }
2028 
2030  if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
2035  }
2036 
2037  if (gaugePrecise == NULL) errorQuda("Precise gauge field doesn't exist");
2038  if (gaugeSloppy == NULL) errorQuda("Sloppy gauge field doesn't exist");
2039  if (gaugePrecondition == NULL) errorQuda("Precondition gauge field doesn't exist");
2040  if (param->overlap) {
2041  if (gaugeExtended == NULL) errorQuda("Extended gauge field doesn't exist");
2042  }
2044  } else {
2051  }
2052 
2053  if (gaugeFatPrecise == NULL) errorQuda("Precise gauge fat field doesn't exist");
2054  if (gaugeFatSloppy == NULL) errorQuda("Sloppy gauge fat field doesn't exist");
2055  if (gaugeFatPrecondition == NULL) errorQuda("Precondition gauge fat field doesn't exist");
2056  if (param->overlap) {
2057  if(gaugeFatExtended == NULL) errorQuda("Extended gauge fat field doesn't exist");
2058  }
2059 
2060  if (gaugeLongPrecise == NULL) errorQuda("Precise gauge long field doesn't exist");
2061  if (gaugeLongSloppy == NULL) errorQuda("Sloppy gauge long field doesn't exist");
2062  if (gaugeLongPrecondition == NULL) errorQuda("Precondition gauge long field doesn't exist");
2063  if (param->overlap) {
2064  if(gaugeLongExtended == NULL) errorQuda("Extended gauge long field doesn't exist");
2065  }
2067  }
2068 
2069  checkClover(param);
2070 
2071  return cudaGauge;
2072 }
2073 
// Apply the clover term (or its inverse, when `inverse` is non-zero) to the
// single-parity host spinor h_in, writing the result to h_out.
// NOTE(review): some lines are elided (the dslash_type guard's `if`, the
// input-location conditional header, the parity remap block), so those
// guards are only partially visible.
2074 void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
2075 {
2077 
2078  if (!initialized) errorQuda("QUDA not initialized");
2079  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
2080  if (cloverPrecise == NULL) errorQuda("Clover field not allocated");
2081 
2083 
2085  errorQuda("Cannot apply the clover term for a non Wilson-clover or Twisted-mass-clover dslash");
2086 
2087  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), 1);
2088 
// Wrap the host pointer as a cpu or cuda field depending on input location
// (the selecting condition is elided in this listing).
2090  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2091  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2092 
2093  ColorSpinorParam cudaParam(cpuParam, *inv_param);
2094  cudaColorSpinorField in(*in_h, cudaParam);
2095 
2096  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
2097  double cpu = blas::norm2(*in_h);
2098  double gpu = blas::norm2(in);
2099  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
2100  }
2101 
2102  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2103  cudaColorSpinorField out(in, cudaParam);
2104 
// Parity remapping block (condition and bodies elided in this listing).
2106  if (parity == QUDA_EVEN_PARITY) {
2108  } else {
2110  }
2112  }
2113  bool pc = true;
2114 
2115  DiracParam diracParam;
2116  setDiracParam(diracParam, inv_param, pc);
2117  //FIXME: Do we need this for twisted clover???
2118  DiracCloverPC dirac(diracParam); // create the Dirac operator
2119  if (!inverse) dirac.Clover(out, in, parity); // apply the clover operator
2120  else dirac.CloverInv(out, in, parity);
2121 
2122  cpuParam.v = h_out;
2123  cpuParam.location = inv_param->output_location;
2124  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
2125  *out_h = out;
2126 
2127  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
2128  double cpu = blas::norm2(*out_h);
2129  double gpu = blas::norm2(out);
2130  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
2131  }
2132 
2133  /*for (int i=0; i<in_h->Volume(); i++) {
2134  ((cpuColorSpinorField*)out_h)->PrintVector(i);
2135  }*/
2136 
2137  delete out_h;
2138  delete in_h;
2139 
2140  popVerbosity();
2141 }
2142 
2143 
// Run a Lanczos eigensolve: wraps the host-side residual (hp_r), operator
// image (hp_Apsi) and m Ritz vectors (hp_V) as spinor fields, downloads them
// to the device, builds the requested Ritz matrix from the Dirac operator,
// runs the eigensolver writing alpha/beta coefficients to hp_alpha/hp_beta,
// and uploads the results back to the host buffers.
//   k0 - starting Lanczos index; m - total basis size.
// NOTE(review): some lines are elided (profiling calls, a checkGauge-style
// call around line 2166, and an `else if` header at 2254), so control flow
// around those points is only partially visible.
2144 void lanczosQuda(int k0, int m, void *hp_Apsi, void *hp_r, void *hp_V,
2145  void *hp_alpha, void *hp_beta, QudaEigParam *eig_param)
2146 {
2148  param = eig_param->invert_param;
2149 
// Domain-wall variants require T-dimension kernel packing.
2150  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
2151  param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
2152  param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);
2153  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
2154 
2156 
2157  if (!initialized) errorQuda("QUDA not initialized");
2158 
2159  pushVerbosity(param->verbosity);
2161 
2162  checkInvertParam(param);
2163 
2164  // check the gauge fields have been created
2166 
2167  checkEigParam(eig_param);
2168 
2169  bool pc_solution = (param->solution_type == QUDA_MATPC_DAG_SOLUTION) ||
2170  (param->solution_type == QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION);
2171 
2172  // create the dirac operator
2173  DiracParam diracParam;
2174  setDiracParam(diracParam, param, pc_solution);
2175  Dirac *d = Dirac::create(diracParam); // create the Dirac operator
2176 
2177  Dirac &dirac = *d;
2178 
2180 
2181  cudaColorSpinorField *r = NULL;
2182  cudaColorSpinorField *Apsi = NULL;
2183  const int *X = cudaGauge->X();
2184 
2185  // wrap CPU host side pointers
2186  ColorSpinorParam cpuParam(hp_r, *param, X, pc_solution);
2187  ColorSpinorField *h_r = (param->input_location == QUDA_CPU_FIELD_LOCATION) ?
2188  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2189  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2190 
2191  cpuParam.v = hp_Apsi;
2192  ColorSpinorField *h_Apsi = (param->input_location == QUDA_CPU_FIELD_LOCATION) ?
2193  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2194  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2195 
2196  //Make Eigen vector data set
2197  cpuColorSpinorField **h_Eig_Vec;
2198  h_Eig_Vec =(cpuColorSpinorField **)safe_malloc( m*sizeof(cpuColorSpinorField*));
2199  for( int k = 0 ; k < m ; k++)
2200  {
2201  cpuParam.v = ((double**)hp_V)[k];
2202  h_Eig_Vec[k] = new cpuColorSpinorField(cpuParam);
2203  }
2204 
2205  // download source
2206  ColorSpinorParam cudaParam(cpuParam, *param);
2207  cudaParam.create = QUDA_COPY_FIELD_CREATE;
2208  r = new cudaColorSpinorField(*h_r, cudaParam);
2209  Apsi = new cudaColorSpinorField(*h_Apsi, cudaParam);
2210 
2211  double cpu;
2212  double gpu;
2213 
2214  if (getVerbosity() >= QUDA_VERBOSE) {
2215  cpu = blas::norm2(*h_r);
2216  gpu = blas::norm2(*r);
2217  printfQuda("r vector CPU %1.14e CUDA %1.14e\n", cpu, gpu);
2218  cpu = blas::norm2(*h_Apsi);
2219  gpu = blas::norm2(*Apsi);
2220  printfQuda("Apsi vector CPU %1.14e CUDA %1.14e\n", cpu, gpu);
2221  }
2222 
2223  // download Eigen vector set
2224  cudaColorSpinorField **Eig_Vec;
2225  Eig_Vec = (cudaColorSpinorField **)safe_malloc( m*sizeof(cudaColorSpinorField*));
2226 
2227  for( int k = 0 ; k < m ; k++)
2228  {
2229  Eig_Vec[k] = new cudaColorSpinorField(*h_Eig_Vec[k], cudaParam);
2230  if (getVerbosity() >= QUDA_VERBOSE) {
2231  cpu = blas::norm2(*h_Eig_Vec[k]);
2232  gpu = blas::norm2(*Eig_Vec[k]);
2233  printfQuda("Eig_Vec[%d] CPU %1.14e CUDA %1.14e\n", k, cpu, gpu);
2234  }
2235  }
2237 
// Choose the Ritz matrix wrapping of the Dirac operator and run the solver.
2238  if(eig_param->RitzMat_lanczos == QUDA_MATPC_DAG_SOLUTION)
2239  {
2240  DiracMdag mat(dirac);
2241  RitzMat ritz_mat(mat,*eig_param);
2242  Eig_Solver *eig_solve = Eig_Solver::create(*eig_param, ritz_mat, profileInvert);
2243  (*eig_solve)((double*)hp_alpha, (double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
2244  delete eig_solve;
2245  }
2246  else if(eig_param->RitzMat_lanczos == QUDA_MATPCDAG_MATPC_SOLUTION)
2247  {
2248  DiracMdagM mat(dirac);
2249  RitzMat ritz_mat(mat,*eig_param);
2250  Eig_Solver *eig_solve = Eig_Solver::create(*eig_param, ritz_mat, profileInvert);
2251  (*eig_solve)((double*)hp_alpha, (double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
2252  delete eig_solve;
2253  }
// Third branch's `else if` header (line 2254) is elided in this listing.
2255  {
2256  DiracMdagM mat(dirac);
2257  RitzMat ritz_mat(mat,*eig_param);
2258  Eig_Solver *eig_solve = Eig_Solver::create(*eig_param, ritz_mat, profileInvert);
2259  (*eig_solve)((double*)hp_alpha, (double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
2260  delete eig_solve;
2261  }
2262  else
2263  {
2264  errorQuda("invalid ritz matrix type\n");
2265  exit(0);
2266  }
2267 
2268  //Write back calculated eigen vector
2270  for( int k = 0 ; k < m ; k++)
2271  {
2272  *h_Eig_Vec[k] = *Eig_Vec[k];
2273  }
2274  *h_r = *r;
2275  *h_Apsi = *Apsi;
2277 
2278 
// Release all temporaries: host wrappers, device copies, and the operator.
2279  delete h_r;
2280  delete h_Apsi;
2281  for( int k = 0 ; k < m ; k++)
2282  {
2283  delete Eig_Vec[k];
2284  delete h_Eig_Vec[k];
2285  }
2286  host_free(Eig_Vec);
2287  host_free(h_Eig_Vec);
2288 
2289  delete d;
2290 
2291  popVerbosity();
2292 
2293  saveTuneCache();
2295 }
2296 
// multigrid_solver constructor (signature at line 2297 elided in this
// listing): validates the multigrid parameters, builds the three fine-grid
// Dirac operators (residual, smoother, sloppy smoother), allocates the
// null-space vector set B, and constructs the MG preconditioner hierarchy.
2298  : profile(profile) {
2299  profile.TPSTART(QUDA_PROFILE_INIT);
2300  QudaInvertParam *param = mg_param.invert_param;
2301 
2303  checkMultigridParam(&mg_param);
2304 
2305  // check MG params (needs to go somewhere else)
2306  if (mg_param.n_level > QUDA_MAX_MG_LEVEL)
2307  errorQuda("Requested MG levels %d greater than allowed maximum %d", mg_param.n_level, QUDA_MAX_MG_LEVEL);
// The per-level smoother-solve-type check's condition (line 2309) is elided.
2308  for (int i=0; i<mg_param.n_level; i++) {
2310  errorQuda("Unsupported smoother solve type %d on level %d", mg_param.smoother_solve_type[i], i);
2311  }
2312  if (param->solve_type != QUDA_DIRECT_SOLVE)
2313  errorQuda("Outer MG solver can only use QUDA_DIRECT_SOLVE at present");
2314 
2315  pushVerbosity(param->verbosity);
2317  mg_param.secs = 0;
2318  mg_param.gflops = 0;
2319 
2320  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
2321  (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
2322 
2323  bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2324  (param->solve_type == QUDA_NORMOP_PC_SOLVE);
2325 
2326  // create the dirac operators for the fine grid
2327 
2328  // this is the Dirac operator we use for inter-grid residual computation
2329  DiracParam diracParam;
2330  setDiracSloppyParam(diracParam, param, outer_pc_solve);
2331  d = Dirac::create(diracParam);
2332  m = new DiracM(*d);
2333 
2334  // this is the Dirac operator we use for smoothing
2335  DiracParam diracSmoothParam;
2336  bool fine_grid_pc_solve = (mg_param.smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE) ||
2337  (mg_param.smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE);
2338  setDiracSloppyParam(diracSmoothParam, param, fine_grid_pc_solve);
2339  dSmooth = Dirac::create(diracSmoothParam);
2340  mSmooth = new DiracM(*dSmooth);
2341 
2342  // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this)
2343  DiracParam diracSmoothSloppyParam;
2344  setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve, true);
2345  dSmoothSloppy = Dirac::create(diracSmoothSloppyParam);;
2347 
2348  printfQuda("Creating vector of null space fields of length %d\n", mg_param.n_vec[0]);
2349 
// Allocate the zero-initialized null-space vectors at sloppy precision.
2350  ColorSpinorParam cpuParam(0, *param, cudaGauge->X(), pc_solution, QUDA_CPU_FIELD_LOCATION);
2351  cpuParam.create = QUDA_ZERO_FIELD_CREATE;
2352  cpuParam.precision = param->cuda_prec_sloppy;
2353  B.resize(mg_param.n_vec[0]);
2354  for (int i=0; i<mg_param.n_vec[0]; i++) B[i] = new cpuColorSpinorField(cpuParam);
2355 
2356  // fill out the MG parameters for the fine level
2357  mgParam = new MGParam(mg_param, B, m, mSmooth, mSmoothSloppy);
2358 
2359  mg = new MG(*mgParam, profile);
2361  profile.TPSTOP(QUDA_PROFILE_INIT);
2362 }
2363 
2366 
 // Allocate a new multigrid solver from the supplied parameter set and hand
 // it back as an opaque handle.  (NOTE(review): the function signature line
 // is elided from this listing; from the body this is newMultigridQuda,
 // returning the solver cast to void*.)
2367  multigrid_solver *mg = new multigrid_solver(*mg_param, profileInvert);
2368 
2370 
 // persist profiling data and the kernel tune cache so the (potentially
 // expensive) multigrid setup survives an interrupted job
2371  saveProfile(__func__);
2372  flushProfile();
2373  saveTuneCache();
2374  return static_cast<void*>(mg);
2375 }
2376 
2377 void destroyMultigridQuda(void *mg) {
2378  delete static_cast<multigrid_solver*>(mg);
2379 }
2380 
 // Rebuild the fine-grid Dirac operators inside an existing multigrid
 // solver (e.g. after the underlying gauge/clover fields have changed) and
 // then recreate the fine-level smoothers.  The coarse-level setup
 // (null-space vectors, coarse operators) is not regenerated here.
 //
 // @param mg_      opaque handle returned by newMultigridQuda()
 // @param mg_param multigrid parameter set whose invert_param describes the
 //                 outer solver
2381 void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param) {
2382  multigrid_solver *mg = static_cast<multigrid_solver*>(mg_);
2383 
2384  QudaInvertParam *param = mg_param->invert_param;
 // sanity-check the gauge field and the multigrid parameters before rebuilding
2385  checkGauge(param);
2386  checkMultigridParam(mg_param);
2387 
 // is the outer solve even/odd preconditioned?
2388  bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2389  (param->solve_type == QUDA_NORMOP_PC_SOLVE);
2390 
2391  // free the previous dirac operators (matrix wrappers first, then the Dirac instances)
2392  if (mg->m) delete mg->m;
2393  if (mg->mSmooth) delete mg->mSmooth;
2394  if (mg->mSmoothSloppy) delete mg->mSmoothSloppy;
2395 
2396  if (mg->d) delete mg->d;
2397  if (mg->dSmooth) delete mg->dSmooth;
 // dSmoothSloppy may alias dSmooth, so guard against a double delete
2398  if (mg->dSmoothSloppy && mg->dSmoothSloppy != mg->dSmooth) delete mg->dSmoothSloppy;
2399 
2400  // create new fine dirac operators
2401 
2402  // this is the Dirac operator we use for inter-grid residual computation
2403  DiracParam diracParam;
2404  setDiracSloppyParam(diracParam, param, outer_pc_solve);
2405  mg->d = Dirac::create(diracParam);
2406  mg->m = new DiracM(*(mg->d));
2407 
2408  // this is the Dirac operator we use for smoothing
2409  DiracParam diracSmoothParam;
2410  bool fine_grid_pc_solve = (mg_param->smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE) ||
2411  (mg_param->smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE);
2412  setDiracSloppyParam(diracSmoothParam, param, fine_grid_pc_solve);
2413  mg->dSmooth = Dirac::create(diracSmoothParam);
2414  mg->mSmooth = new DiracM(*(mg->dSmooth));
2415 
2416  // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this)
2417  DiracParam diracSmoothSloppyParam;
2418  setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve, true);
2419  mg->dSmoothSloppy = Dirac::create(diracSmoothSloppyParam);;
2420  mg->mSmoothSloppy = new DiracM(*(mg->dSmoothSloppy));
2421 
 // point the stored MG parameters at the freshly created operators
 // (NOTE(review): a matSmoothSloppy assignment appears to be elided from
 // this listing — confirm against the full source)
2422  mg->mgParam->matResidual = mg->m;
2423  mg->mgParam->matSmooth = mg->mSmooth;
2425 
2426  // recreate the smoothers on the fine level
2427  mg->mg->destroySmoother();
2428  mg->mg->createSmoother();
2429 
2430  //mgParam = new MGParam(mg_param, B, *m, *mSmooth, *mSmoothSloppy);
2431  //mg = new MG(*mgParam, profile);
2433 }
2434 
 // deflated_solver constructor body: builds the Dirac operator, the Ritz
 // vector set and the Deflation object used by the (incremental) eigCG
 // solvers.  (NOTE(review): the constructor signature line is elided from
 // this listing.)
2436  : d(nullptr), m(nullptr), RV(nullptr), deflParam(nullptr), defl(nullptr), profile(profile) {
2437 
2438  QudaInvertParam *param = eig_param.invert_param;
2439 
 // deflation is only constructed for the (incremental) eigCG inverters;
 // for anything else all members stay nullptr
2440  if(param->inv_type != QUDA_EIGCG_INVERTER && param->inv_type != QUDA_INC_EIGCG_INVERTER) return;
2441 
2442  profile.TPSTART(QUDA_PROFILE_INIT);
2443 
2445  eig_param.secs = 0;
2446  eig_param.gflops = 0;
2447 
 // use the full-precision Dirac fields when the Ritz precision matches the
 // solver precision, otherwise fall back to the sloppy fields
2448  DiracParam diracParam;
2449  if(eig_param.cuda_prec_ritz == param->cuda_prec)
2450  {
2451  setDiracParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE));
2452  } else {
2453  setDiracSloppyParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE));
2454  }
2455 
2456  const bool pc_solve = (param->solve_type == QUDA_NORMOP_PC_SOLVE);
2457 
 // wrap the normal operator M^dag M for normal-equation PC solves,
 // otherwise wrap M itself
2458  d = Dirac::create(diracParam);
2459  m = pc_solve ? static_cast<DiracMatrix*>( new DiracMdagM(*d) ) : static_cast<DiracMatrix*>( new DiracM(*d));
2460 
 // parameter template for the composite field holding all Ritz vectors
2461  ColorSpinorParam ritzParam(0, *param, cudaGauge->X(), pc_solve, eig_param.location);
2462 
2463  ritzParam.create = QUDA_ZERO_FIELD_CREATE;
2464  ritzParam.is_composite = true;
2465  ritzParam.is_component = false;
2466  ritzParam.composite_dim = param->nev*param->deflation_grid;
2467  ritzParam.setPrecision(param->cuda_prec_ritz);
2468 
2469  if (ritzParam.location==QUDA_CUDA_FIELD_LOCATION) {
2470  ritzParam.fieldOrder = (param->cuda_prec_ritz == QUDA_DOUBLE_PRECISION ) ? QUDA_FLOAT2_FIELD_ORDER : QUDA_FLOAT4_FIELD_ORDER;
2471  if(ritzParam.nSpin != 1) ritzParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
2472 
2473  //select memory location here, by default ritz vectors will be allocated on the device
2474  //but if not sufficient device memory, then the user may choose mapped type of memory
2475  ritzParam.mem_type = eig_param.mem_type_ritz;
2476  } else { //host location
2477  ritzParam.mem_type = QUDA_MEMORY_PINNED;
2478  }
2479 
 // total number of lattice sites per Ritz vector, used only for the debug
 // size estimate below
2480  int ritzVolume = 1;
 // NOTE(review): loop index 'd' shadows the Dirac pointer 'd' assigned
 // above — harmless here, but easy to misread
2481  for(int d = 0; d < ritzParam.nDim; d++) ritzVolume *= ritzParam.x[d];
2482 
2483  if( getVerbosity() == QUDA_DEBUG_VERBOSE ) {
2484 
2485  size_t byte_estimate = (size_t)ritzParam.composite_dim*(size_t)ritzVolume*(ritzParam.nColor*ritzParam.nSpin*ritzParam.precision);
 // NOTE(review): this format string has no trailing '\n'
2486  printfQuda("allocating bytes: %lu (lattice volume %d, prec %d)" , byte_estimate, ritzVolume, ritzParam.precision);
2487  if(ritzParam.mem_type == QUDA_MEMORY_DEVICE) printfQuda("Using device memory type.\n");
2488  else if (ritzParam.mem_type == QUDA_MEMORY_MAPPED) printfQuda("Using mapped memory type.\n");
2489  }
2490 
2491  RV = ColorSpinorField::Create(ritzParam);
2492 
2493  deflParam = new DeflationParam(eig_param, RV, *m);
2494 
2495  defl = new Deflation(*deflParam, profile);
2496 
2497  profile.TPSTOP(QUDA_PROFILE_INIT);
2498 }
2499 
 // Create a deflated (eigCG / incremental eigCG) solver instance and return
 // it as an opaque handle, to be released later with destroyDeflationQuda().
2500 void* newDeflationQuda(QudaEigParam *eig_param) {
2502 #ifdef MAGMA_LIB
 // initialize MAGMA when QUDA was built against it
2503  openMagma();
2504 #endif
2505  deflated_solver *defl = new deflated_solver(*eig_param, profileInvert);
2508 
 // persist profiling data accumulated during setup
2509  saveProfile(__func__);
2510  flushProfile();
2511  return static_cast<void*>(defl);
2512 }
2513 
2514 void destroyDeflationQuda(void *df) {
2515 #ifdef MAGMA_LIB
2516  closeMagma();
2517 #endif
2518  delete static_cast<deflated_solver*>(df);
2519 }
2520 
 // Solve the linear system described by param for the host-resident source
 // hp_b, writing the solution to hp_x.  The routine wraps the host
 // pointers, transfers the source to the device, runs the requested solve
 // (direct, normal-operator, normal-error, or the two-pass MATDAG_MAT
 // variants), reconstructs the full solution and copies it back to the host
 // unless the solution is kept resident on the device.
 //
 // NOTE(review): this listing elides several original source lines; in
 // particular the definitions of cudaGauge, tmp and tmp2 used below are not
 // visible here — confirm against the full source.
2521 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
2522 {
 // enable kernel packing in the T dimension for the 5-d (domain-wall type)
 // operators
2523  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
2524  param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
2525  param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);
2526 
2528 
2529  if (!initialized) errorQuda("QUDA not initialized");
2530 
2531  pushVerbosity(param->verbosity);
2533 
2534  checkInvertParam(param);
2535 
2536  // check the gauge fields have been created
2538 
2539  // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
2540  // solve_type and solution_type, rather than in separate members of QudaInvertParam. We're stuck with it
2541  // for now, though, so here we factorize everything for convenience.
2542 
2543  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
2544  (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
2545  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2546  (param->solve_type == QUDA_NORMOP_PC_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2547  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||
2548  (param->solution_type == QUDA_MATPC_SOLUTION);
2549  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||
2550  (param->solve_type == QUDA_DIRECT_PC_SOLVE);
2551  bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||
2552  (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2553 
 // rough estimate (in GiB) of the spinor memory footprint, reported back
 // to the caller via param->spinorGiB
2554  param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
2555  if (!pc_solve) param->spinorGiB *= 2;
2556  param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float));
2557  if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) {
2558  param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(double)(1<<30);
2559  } else {
2560  param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 8 : 9)/(double)(1<<30);
2561  }
2562 
 // reset the timing / flop / iteration counters reported to the caller
2563  param->secs = 0;
2564  param->gflops = 0;
2565  param->iter = 0;
2566 
2567  Dirac *d = NULL;
2568  Dirac *dSloppy = NULL;
2569  Dirac *dPre = NULL;
2570 
2571  // create the dirac operator
2572  createDirac(d, dSloppy, dPre, *param, pc_solve);
2573 
2574  Dirac &dirac = *d;
2575  Dirac &diracSloppy = *dSloppy;
2576  Dirac &diracPre = *dPre;
2577 
2579 
2580  ColorSpinorField *b = NULL;
2581  ColorSpinorField *x = NULL;
2582  ColorSpinorField *in = NULL;
2583  ColorSpinorField *out = NULL;
2584 
2585  const int *X = cudaGauge->X();
2586 
2587  // wrap CPU host side pointers
2588  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution, param->input_location);
2589  ColorSpinorField *h_b = ColorSpinorField::Create(cpuParam);
2590 
2591  cpuParam.v = hp_x;
2592  cpuParam.location = param->output_location;
2593  ColorSpinorField *h_x = ColorSpinorField::Create(cpuParam);
2594 
2595  // download source
2596  ColorSpinorParam cudaParam(cpuParam, *param);
2597  cudaParam.create = QUDA_COPY_FIELD_CREATE;
2598  b = new cudaColorSpinorField(*h_b, cudaParam);
2599 
2600  // now check if we need to invalidate the solutionResident vectors
 // (they must match the precision of the current solve to be reusable)
2601  bool invalidate = false;
2602  for (auto v : solutionResident)
2603  if (cudaParam.precision != v->Precision()) { invalidate = true; break; }
2604 
2605  if (invalidate) {
2606  for (auto v : solutionResident) if (v) delete v;
2607  solutionResident.clear();
2608  }
2609 
2610  if (!solutionResident.size()) {
2611  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2612  solutionResident.push_back(new cudaColorSpinorField(cudaParam)); // solution
2613  }
 // x aliases the resident solution vector; it is freed (if at all) via
 // solutionResident below, never directly
2614  x = solutionResident[0];
2615 
2616  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
2617  // initial guess only supported for single-pass solvers
2618  if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) &&
2619  (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
2620  errorQuda("Initial guess not supported for two-pass solver");
2621  }
2622 
2623  *x = *h_x; // solution
2624  } else { // zero initial guess
2625  blas::zero(*x);
2626  }
2627 
2629 
2630  double nb = blas::norm2(*b);
2631  if (nb==0.0) errorQuda("Source has zero norm");
2632 
2633  if (getVerbosity() >= QUDA_VERBOSE) {
2634  double nh_b = blas::norm2(*h_b);
2635  double nh_x = blas::norm2(*h_x);
2636  double nx = blas::norm2(*x);
2637  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2638  printfQuda("Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
2639  }
2640 
2641  // rescale the source and solution vectors to help prevent the onset of underflow
2642  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
2643  blas::ax(1.0/sqrt(nb), *b);
2644  blas::ax(1.0/sqrt(nb), *x);
2645  }
2646 
2647  massRescale(*static_cast<cudaColorSpinorField*>(b), *param);
2648 
 // let the Dirac operator select the actual in/out fields for the solve
 // (handles the even/odd preconditioned source/solution bookkeeping)
2649  dirac.prepare(in, out, *x, *b, param->solution_type);
2650 
2651  if (getVerbosity() >= QUDA_VERBOSE) {
2652  double nin = blas::norm2(*in);
2653  double nout = blas::norm2(*out);
2654  printfQuda("Prepared source = %g\n", nin);
2655  printfQuda("Prepared solution = %g\n", nout);
2656  }
2657 
2658  if (getVerbosity() >= QUDA_VERBOSE) {
2659  double nin = blas::norm2(*in);
2660  printfQuda("Prepared source post mass rescale = %g\n", nin);
2661  }
2662 
2663  // solution_type specifies *what* system is to be solved.
2664  // solve_type specifies *how* the system is to be solved.
2665  //
2666  // We have the following four cases (plus preconditioned variants):
2667  //
2668  // solution_type solve_type Effect
2669  // ------------- ---------- ------
2670  // MAT DIRECT Solve Ax=b
2671  // MATDAG_MAT DIRECT Solve A^dag y = b, followed by Ax=y
2672  // MAT NORMOP Solve (A^dag A) x = (A^dag b)
2673  // MATDAG_MAT NORMOP Solve (A^dag A) x = b
2674  // MAT NORMERR Solve (A A^dag) y = b, then x = A^dag y
2675  //
2676  // We generally require that the solution_type and solve_type
2677  // preconditioning match. As an exception, the unpreconditioned MAT
2678  // solution_type may be used with any solve_type, including
2679  // DIRECT_PC and NORMOP_PC. In these cases, preparation of the
2680  // preconditioned source and reconstruction of the full solution are
2681  // taken care of by Dirac::prepare() and Dirac::reconstruct(),
2682  // respectively.
2683 
2684  if (pc_solution && !pc_solve) {
2685  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
2686  }
2687 
2688  if (!mat_solution && !pc_solution && pc_solve) {
2689  errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
2690  }
2691 
2692  if (!mat_solution && norm_error_solve) {
2693  errorQuda("Normal-error solve requires Mat solution");
2694  }
2695 
2696  if (param->inv_type_precondition == QUDA_MG_INVERTER && (!direct_solve || !mat_solution)) {
2697  errorQuda("Multigrid preconditioning only supported for direct solves");
2698  }
2699 
2700  if (param->use_resident_chrono && (direct_solve || norm_error_solve) ){
2701  errorQuda("Chronological forcasting only presently supported for M^dagger M solver");
2702  }
2703 
2704  if (mat_solution && !direct_solve && !norm_error_solve) { // prepare source: b' = A^dag b
2706  dirac.Mdag(*in, tmp);
2707  } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b
2708  DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2709  SolverParam solverParam(*param);
2710  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2711  (*solve)(*out, *in);
 // the intermediate solution y becomes the source for the second solve
2712  blas::copy(*in, *out);
2713  solverParam.updateInvertParam(*param);
2714  delete solve;
2715  }
2716 
2717  if (direct_solve) {
2718  DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2719  SolverParam solverParam(*param);
2720  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2721  (*solve)(*out, *in);
2722  solverParam.updateInvertParam(*param);
2723  delete solve;
2724  } else if (!norm_error_solve) {
2725  DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2726  SolverParam solverParam(*param);
2727 
2728  // chronological forecasting
 // (use past solutions as an extrapolation basis for the initial guess)
2729  if (param->use_resident_chrono && chronoResident[param->chrono_index].size() > 0) {
2730  auto &basis = chronoResident[param->chrono_index];
2731 
2733 
2734  for (unsigned int j=0; j<basis.size(); j++) m(*basis[j].second, *basis[j].first, tmp, tmp2);
2735 
2736  bool orthogonal = true;
2737  bool apply_mat = false;
2738  MinResExt mre(m, orthogonal, apply_mat, profileInvert);
2739  blas::copy(tmp, *in);
2740 
2741  mre(*out, tmp, basis);
2742  }
2743 
2744  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2745  (*solve)(*out, *in);
2746  solverParam.updateInvertParam(*param);
2747  delete solve;
2748  } else { // norm_error_solve
2749  DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2751  SolverParam solverParam(*param);
2752  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2753  (*solve)(tmp, *in); // y = (M M^\dag) b
2754  dirac.Mdag(*out, tmp); // x = M^dag y
2755  solverParam.updateInvertParam(*param);
2756  delete solve;
2757  }
2758 
2759  if (getVerbosity() >= QUDA_VERBOSE){
2760  double nx = blas::norm2(*x);
2761  printfQuda("Solution = %g\n",nx);
2762  }
2763 
 // reconstruct the full solution from the (possibly preconditioned) solve
2765  dirac.reconstruct(*x, *b, param->solution_type);
2766 
2767  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
2768  // rescale the solution
2769  blas::ax(sqrt(nb), *x);
2770  }
2772 
 // copy the solution back to the host unless it is being kept resident
2773  if (!param->make_resident_solution) {
2775  *h_x = *x;
2777  }
2778 
2780 
 // update the chronological basis with the new solution, rotating the
 // oldest entry to the front once the basis is full
2781  if (param->make_resident_chrono) {
2782  int i = param->chrono_index;
2783  if (i >= QUDA_MAX_CHRONO)
2784  errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO);
2785 
2786  auto &basis = chronoResident[i];
2787 
2788  // if we have not yet filled the space, just augment
2789  if ((int)basis.size() < param->max_chrono_dim) {
2790  ColorSpinorParam cs_param(*x);
2791  basis.push_back(std::pair<ColorSpinorField*,ColorSpinorField*>(ColorSpinorField::Create(cs_param),ColorSpinorField::Create(cs_param)));
2792  }
2793 
2794  // shuffle every entry down one and bring the last to the front
2795  ColorSpinorField *tmp = basis[basis.size()-1].first;
2796  for (unsigned int j=basis.size()-1; j>0; j--) basis[j].first = basis[j-1].first;
2797  basis[0].first = tmp;
2798  *(basis[0]).first = *x; // set first entry to new solution
2799  }
2800 
 // optionally report the fermionic action <b, x> back to the caller
2801  if (param->compute_action) {
2802  Complex action = blas::cDotProduct(*b, *x);
2803  param->action[0] = action.real();
2804  param->action[1] = action.imag();
2805  }
2806 
2807  if (getVerbosity() >= QUDA_VERBOSE){
2808  double nx = blas::norm2(*x);
2809  double nh_x = blas::norm2(*h_x);
2810  printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
2811  }
2813 
2815 
2816  delete h_b;
2817  delete h_x;
2818  delete b;
2819 
 // when the solution is not kept resident, release the cached vectors
 // (this also frees x, which aliases solutionResident[0])
2820  if (!param->make_resident_solution) {
2821  for (auto v: solutionResident) if (v) delete v;
2822  solutionResident.clear();
2823  }
2824 
2825  delete d;
2826  delete dSloppy;
2827  delete dPre;
2828 
2830 
2831  popVerbosity();
2832 
2833  // cache is written out even if a long benchmarking job gets interrupted
2834  saveTuneCache();
2835 
2837 }
2838 
2839 
 // Experimental multi-source variant of invertQuda(): intended to solve the
 // same operator against param->num_src right-hand sides held in composite
 // fields.  Per the comment just below, this is an unfinished adaptation of
 // invertQuda and is not expected to work yet.
 //
 // NOTE(review): this listing elides several original lines; in particular
 // the declarations of the composite fields 'b' and 'x' and the creation of
 // 'b' are not visible here — confirm against the full source.
2848 void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param)
2849 {
2850 
2851  // currently that code is just a copy of invertQuda and cannot work
2852 
 // enable kernel packing in the T dimension for the 5-d (domain-wall type)
 // operators
2853  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
2854  param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
2855  param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);
2856 
2858 
2859  if (!initialized) errorQuda("QUDA not initialized");
2860 
2861  pushVerbosity(param->verbosity);
2863 
2864  checkInvertParam(param);
2865 
2866  // check the gauge fields have been created
2868 
2869  // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
2870  // solve_type and solution_type, rather than in separate members of QudaInvertParam. We're stuck with it
2871  // for now, though, so here we factorize everything for convenience.
2872 
2873  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
2874  (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
2875  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2876  (param->solve_type == QUDA_NORMOP_PC_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2877  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||
2878  (param->solution_type == QUDA_MATPC_SOLUTION);
2879  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||
2880  (param->solve_type == QUDA_DIRECT_PC_SOLVE);
2881  bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||
2882  (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2883 
 // rough per-solve spinor memory estimate (GiB), reported to the caller
2884  param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
2885  if (!pc_solve) param->spinorGiB *= 2;
2886  param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float));
2887  if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) {
2888  param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(double)(1<<30);
2889  } else {
2890  param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 8 : 9)/(double)(1<<30);
2891  }
2892 
2893  param->secs = 0;
2894  param->gflops = 0;
2895  param->iter = 0;
2896 
2897  Dirac *d = NULL;
2898  Dirac *dSloppy = NULL;
2899  Dirac *dPre = NULL;
2900 
2901  // create the dirac operator
2902  createDirac(d, dSloppy, dPre, *param, pc_solve);
2903 
2904  Dirac &dirac = *d;
2905  Dirac &diracSloppy = *dSloppy;
2906  Dirac &diracPre = *dPre;
2907 
2909 
2910  // std::vector<ColorSpinorField*> b; // Cuda Solutions
2911  // b.resize(param->num_src);
2912  // std::vector<ColorSpinorField*> x; // Cuda Solutions
2913  // x.resize(param->num_src);
2914  ColorSpinorField* in; // = NULL;
2915  //in.resize(param->num_src);
2916  ColorSpinorField* out; // = NULL;
2917  //out.resize(param->num_src);
2918 
2919  // for(int i=0;i < param->num_src;i++){
2920  // in[i] = NULL;
2921  // out[i] = NULL;
2922  // }
2923 
2924  const int *X = cudaGauge->X();
2925 
2926 
2927  // Host pointers for x, take a copy of the input host pointers
2928  void** hp_x;
2929  hp_x = new void* [ param->num_src ];
2930 
2931  void** hp_b;
2932  hp_b = new void* [param->num_src];
2933 
2934  for(int i=0;i < param->num_src;i++){
2935  hp_x[i] = _hp_x[i];
2936  hp_b[i] = _hp_b[i];
2937  }
2938 
2939  // wrap CPU host side pointers
2940  ColorSpinorParam cpuParam(hp_b[0], *param, X, pc_solution, param->input_location);
2941  std::vector<ColorSpinorField*> h_b;
2942  h_b.resize(param->num_src);
2943  for(int i=0; i < param->num_src; i++) {
2944  cpuParam.v = hp_b[i]; //MW: seems weird to set this in the loop
2945  h_b[i] = ColorSpinorField::Create(cpuParam);
2946  }
2947 
2948  // cpuParam.v = hp_x;
2949  cpuParam.location = param->output_location;
2950  std::vector<ColorSpinorField*> h_x;
2951  h_x.resize(param->num_src);
2952 //
2953  for(int i=0; i < param->num_src; i++) {
2954  cpuParam.v = hp_x[i]; //MW: seems weird to set this in the loop
2955  h_x[i] = ColorSpinorField::Create(cpuParam);
2956  }
2957 
2958 
2959  // MW currently checked until here
2960 
2961  // download source
2962  printfQuda("Setup b\n");
2963  ColorSpinorParam cudaParam(cpuParam, *param);
2964  cudaParam.create = QUDA_NULL_FIELD_CREATE;
 // one composite device field holds all num_src sources
2965  cudaParam.is_composite = true;
2966  cudaParam.composite_dim = param->num_src;
2967 
2968  printfQuda("Create b \n");
2970 
2971 
2972 
2973 
2974  for(int i=0; i < param->num_src; i++) {
2975  b->Component(i) = *h_b[i];
2976  }
2977  printfQuda("Done b \n");
2978 
2980  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
2981  // initial guess only supported for single-pass solvers
2982  if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) &&
2983  (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
2984  errorQuda("Initial guess not supported for two-pass solver");
2985  }
2986  cudaParam.is_composite = true;
2987  cudaParam.is_component = false;
2988  cudaParam.composite_dim = param->num_src;
2989 
2990  x = ColorSpinorField::Create(cudaParam);
2991  for(int i=0; i < param->num_src; i++) {
2992  x->Component(i) = *h_x[i];
2993  }
2994 
2995  } else { // zero initial guess
2996  // Create the solution fields filled with zero
2997  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
2998  printfQuda("Create x \n");
2999  x = ColorSpinorField::Create(cudaParam);
3000  printfQuda("Done x \n");
3001  // solution
3002  }
3003 
3005 
 // NOTE(review): nb is never freed in the visible code — likely a leak
3006  double * nb = new double[param->num_src];
3007  for(int i=0; i < param->num_src; i++) {
3008  nb[i] = blas::norm2(b->Component(i));
3009  printfQuda("Source %i: CPU = %g, CUDA copy = %g\n", i, nb[i], nb[i]);
3010  if (nb[i]==0.0) errorQuda("Source has zero norm");
3011 
3012  if (getVerbosity() >= QUDA_VERBOSE) {
3013  double nh_b = blas::norm2(*h_b[i]);
3014  double nh_x = blas::norm2(*h_x[i]);
3015  double nx = blas::norm2(x->Component(i));
3016  printfQuda("Source %i: CPU = %g, CUDA copy = %g\n", i, nh_b, nb[i]);
3017  printfQuda("Solution %i: CPU = %g, CUDA copy = %g\n", i, nh_x, nx);
3018  }
3019  }
3020 
3021  // MW checked until here so far
3022 
3023  // rescale the source and solution vectors to help prevent the onset of underflow
3024  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
3025  for(int i=0; i < param->num_src; i++) {
3026  blas::ax(1.0/sqrt(nb[i]), b->Component(i));
3027  blas::ax(1.0/sqrt(nb[i]), x->Component(i));
3028  }
3029  }
3030 
3031  for(int i=0; i < param->num_src; i++) {
3032  massRescale(dynamic_cast<cudaColorSpinorField&>( b->Component(i) ), *param);
3033  }
3034 
3035  // MW: need to check what dirac.prepare does
3036  // for now let's just try looping of num_rhs already here???
3037  // for(int i=0; i < param->num_src; i++) {
3038  dirac.prepare(in, out, *x, *b, param->solution_type);
3039 for(int i=0; i < param->num_src; i++) {
3040  if (getVerbosity() >= QUDA_VERBOSE) {
3041  double nin = blas::norm2((in->Component(i)));
3042  double nout = blas::norm2((out->Component(i)));
3043  printfQuda("Prepared source %i = %g\n", i, nin);
3044  printfQuda("Prepared solution %i = %g\n", i, nout);
3045  }
3046 
3047  if (getVerbosity() >= QUDA_VERBOSE) {
3048  double nin = blas::norm2(in->Component(i));
3049  printfQuda("Prepared source %i post mass rescale = %g\n", i, nin);
3050  }
3051  }
3052 
3053  // solution_type specifies *what* system is to be solved.
3054  // solve_type specifies *how* the system is to be solved.
3055  //
3056  // We have the following four cases (plus preconditioned variants):
3057  //
3058  // solution_type solve_type Effect
3059  // ------------- ---------- ------
3060  // MAT DIRECT Solve Ax=b
3061  // MATDAG_MAT DIRECT Solve A^dag y = b, followed by Ax=y
3062  // MAT NORMOP Solve (A^dag A) x = (A^dag b)
3063  // MATDAG_MAT NORMOP Solve (A^dag A) x = b
3064  // MAT NORMERR Solve (A A^dag) y = b, then x = A^dag y
3065  //
3066  // We generally require that the solution_type and solve_type
3067  // preconditioning match. As an exception, the unpreconditioned MAT
3068  // solution_type may be used with any solve_type, including
3069  // DIRECT_PC and NORMOP_PC. In these cases, preparation of the
3070  // preconditioned source and reconstruction of the full solution are
3071  // taken care of by Dirac::prepare() and Dirac::reconstruct(),
3072  // respectively.
3073 
3074  if (pc_solution && !pc_solve) {
3075  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
3076  }
3077 
3078  if (!mat_solution && !pc_solution && pc_solve) {
3079  errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
3080  }
3081 
3082  if (!mat_solution && norm_error_solve) {
3083  errorQuda("Normal-error solve requires Mat solution");
3084  }
3085 
3086  if (param->inv_type_precondition == QUDA_MG_INVERTER && (pc_solve || pc_solution || !direct_solve || !mat_solution))
3087  errorQuda("Multigrid preconditioning only supported for direct non-red-black solve");
3088 
 // NOTE(review): 'tmp' used below is declared on a line elided from this
 // listing
3089  if (mat_solution && !direct_solve && !norm_error_solve) { // prepare source: b' = A^dag b
3090  for(int i=0; i < param->num_src; i++) {
3092  dirac.Mdag(in->Component(i), tmp);
3093  }
3094  } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b
3095  DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3096  SolverParam solverParam(*param);
3097  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3098  solve->solve(*out,*in);
3099  for(int i=0; i < param->num_src; i++) {
3101  }
3102  solverParam.updateInvertParam(*param);
3103  delete solve;
3104  }
3105 
3106  if (direct_solve) {
3107  DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3108  SolverParam solverParam(*param);
3109  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3110  solve->solve(*out,*in);
3111  solverParam.updateInvertParam(*param);
3112  delete solve;
3113  } else if (!norm_error_solve) {
3114  DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3115  SolverParam solverParam(*param);
3116  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3117  solve->solve(*out,*in);
3118  solverParam.updateInvertParam(*param);
3119  delete solve;
3120  } else { // norm_error_solve
3121  DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3122  errorQuda("norm_error_solve not supported in multi source solve");
3123  //cudaColorSpinorField tmp(*out);
3124  // SolverParam solverParam(*param);
3125  //Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3126  //(*solve)(tmp, *in); // y = (M M^\dag) b
3127  //dirac.Mdag(*out, tmp); // x = M^dag y
3128  //solverParam.updateInvertParam(*param,i,i);
3129  // delete solve;
3130  }
3131 
3132  if (getVerbosity() >= QUDA_VERBOSE){
3133  for(int i=0; i < param->num_src; i++) {
3134  double nx = blas::norm2(x->Component(i));
3135  printfQuda("Solution %i = %g\n",i, nx);
3136  }
3137  }
3138 
3139 
 // reconstruct the full solution for every source component
3141  for(int i=0; i< param->num_src; i++){
3142  dirac.reconstruct(x->Component(i), b->Component(i), param->solution_type);
3143  }
3145 
3146  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
3147  for(int i=0; i< param->num_src; i++){
3148  // rescale the solution
3149  blas::ax(sqrt(nb[i]), x->Component(i));
3150  }
3151  }
3152 
3153  // MW -- not sure how to handle that here
3154  if (!param->make_resident_solution) {
3156  for(int i=0; i< param->num_src; i++){
3157  *h_x[i] = x->Component(i);
3158  }
3160  }
3161 
3162  if (getVerbosity() >= QUDA_VERBOSE){
3163  for(int i=0; i< param->num_src; i++){
3164  double nx = blas::norm2(x->Component(i));
3165  double nh_x = blas::norm2(*h_x[i]);
3166  printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
3167  }
3168  }
3169 
3170  //FIX need to make sure all deletes are correct again
3171  for(int i=0; i < param->num_src; i++){
3172  delete h_x[i];
3173  // delete x[i];
3174  delete h_b[i];
3175  // delete b[i];
3176  }
3177  delete [] hp_b;
3178  delete [] hp_x;
3179 // delete [] b;
3180 // if (!param->make_resident_solution) delete x; // FIXME make this cleaner
3181 
3182  delete d;
3183  delete dSloppy;
3184  delete dPre;
3185  delete x;
3186  delete b;
3187 
3188  popVerbosity();
3189 
3190  // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted
3191  saveTuneCache();
3192 
3194 }
3195 
3196 
3197 
// Multi-shift CG solver entry point: solves (M^dag M + offset[i]) x[i] = b for a set of
// ordered shifts in a single Krylov sequence, with optional per-shift sequential-CG
// refinement to the requested tolerances.
// NOTE(review): this doxygen listing omits several original source lines (the embedded
// line numbering jumps, e.g. 3209-3210, 3221, 3329, ...), so profiling start/stop calls,
// the gauge-field check, and some declarations (e.g. cudaGauge, unscaled_shifts) are not
// visible here even though the code below uses them.
3206 void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
3207 {
3208 
3211 
 // Domain-wall-type operators require kernel packing in T for halo exchange.
3212  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
3213  param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
3214  param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);
3215 
3216  if (!initialized) errorQuda("QUDA not initialized");
3217 
3218  checkInvertParam(param);
3219 
3220  // check the gauge fields have been created
3222 
 // Shift count is bounded by the fixed-size arrays in QudaInvertParam.
3223  if (param->num_offset > QUDA_MAX_MULTI_SHIFT)
3224  errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",
3225  param->num_offset, QUDA_MAX_MULTI_SHIFT);
3226 
3227  pushVerbosity(param->verbosity);
3228 
 // Decode the requested solution/solve types; multi-shift only supports
 // normal-equation (MATDAG_MAT / MATPCDAG_MATPC) solves.
3229  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
3230  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);
3231  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION);
3232  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE);
3233 
3234  if (mat_solution) {
3235  errorQuda("Multi-shift solver does not support MAT or MATPC solution types");
3236  }
3237  if (direct_solve) {
3238  errorQuda("Multi-shift solver does not support DIRECT or DIRECT_PC solve types");
3239  }
 // NOTE(review): bitwise '&' on bools is well-defined here, but '&&' would be the
 // idiomatic (and short-circuiting) choice.
3240  if (pc_solution & !pc_solve) {
3241  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
3242  }
3243  if (!pc_solution & pc_solve) {
3244  errorQuda("In multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
3245  }
3246 
3247  // No of GiB in a checkerboard of a spinor
 // NOTE(review): cudaGauge is declared in a line omitted from this listing.
3248  param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
3249  if( !pc_solve) param->spinorGiB *= 2; // Double volume for non PC solve
3250 
3251  // **** WARNING *** this may not match implementation...
3252  if( param->inv_type == QUDA_CG_INVERTER ) {
3253  // CG-M needs 5 vectors for the smallest shift + 2 for each additional shift
3254  param->spinorGiB *= (5 + 2*(param->num_offset-1))/(double)(1<<30);
3255  } else {
3256  errorQuda("QUDA only currently supports multi-shift CG");
3257  // BiCGStab-M needs 7 for the original shift + 2 for each additional shift + 1 auxiliary
3258  // (Jegerlehner hep-lat/9612014 eq (3.13)
3259  param->spinorGiB *= (7 + 2*(param->num_offset-1))/(double)(1<<30);
3260  }
3261 
3262  // Timing and FLOP counters
3263  param->secs = 0;
3264  param->gflops = 0;
3265  param->iter = 0;
3266 
 // Shifted CG requires the offsets in ascending order; validate all pairs.
3267  for (int i=0; i<param->num_offset-1; i++) {
3268  for (int j=i+1; j<param->num_offset; j++) {
3269  if (param->offset[i] > param->offset[j])
3270  errorQuda("Offsets must be ordered from smallest to largest");
3271  }
3272  }
3273 
3274  // Host pointers for x, take a copy of the input host pointers
3275  void** hp_x;
3276  hp_x = new void* [ param->num_offset ];
3277 
3278  void* hp_b = _hp_b;
3279  for(int i=0;i < param->num_offset;i++){
3280  hp_x[i] = _hp_x[i];
3281  }
3282 
3283  // Create the matrix.
3284  // The way this works is that createDirac will create 'd' and 'dSloppy'
3285  // which are global. We then grab these with references...
3286  //
3287  // Balint: Isn't there a nice construction pattern we could use here? This is
3288  // expedient but yucky.
3289  // DiracParam diracParam;
 // For staggered-type operators the smallest shift is folded into the mass term
 // (offset = 4*m^2 for the staggered normal operator).
3290  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3291  param->dslash_type == QUDA_STAGGERED_DSLASH){
3292  param->mass = sqrt(param->offset[0]/4);
3293  }
3294 
3295  Dirac *d = NULL;
3296  Dirac *dSloppy = NULL;
3297  Dirac *dPre = NULL;
3298 
3299  // create the dirac operator
3300  createDirac(d, dSloppy, dPre, *param, pc_solve);
3301  Dirac &dirac = *d;
3302  Dirac &diracSloppy = *dSloppy;
3303 
3304  cudaColorSpinorField *b = NULL; // Cuda RHS
3305  std::vector<ColorSpinorField*> x; // Cuda Solutions
3306  x.resize(param->num_offset);
3307 
3308  // Grab the dimension array of the input gauge field.
3309  const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ?
3310  gaugeFatPrecise->X() : gaugePrecise->X();
3311 
3312  // This creates a ColorSpinorParam struct, from the host data
3313  // pointer, the definitions in param, the dimensions X, and whether
3314  // the solution is on a checkerboard instruction or not. These can
3315  // then be used as 'instructions' to create the actual
3316  // ColorSpinorField
3317  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution, param->input_location);
3318  ColorSpinorField *h_b = ColorSpinorField::Create(cpuParam);
3319 
3320  std::vector<ColorSpinorField*> h_x;
3321  h_x.resize(param->num_offset);
3322 
 // Wrap each host solution pointer in a (CPU-side) ColorSpinorField.
3323  cpuParam.location = param->output_location;
3324  for(int i=0; i < param->num_offset; i++) {
3325  cpuParam.v = hp_x[i];
3326  h_x[i] = ColorSpinorField::Create(cpuParam);
3327  }
3328 
3330  profileMulti.TPSTART(QUDA_PROFILE_H2D);
3331  // Now I need a colorSpinorParam for the device
3332  ColorSpinorParam cudaParam(cpuParam, *param);
3333  // This setting will download a host vector
3334  cudaParam.create = QUDA_COPY_FIELD_CREATE;
3335  b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it
3337 
3339  // Create the solution fields filled with zero
3340  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
3341 
3342  // now check if we need to invalidate the solutionResident vectors
 // (precision mismatch means the cached fields cannot be reused)
3343  bool invalidate = false;
3344  for (auto v : solutionResident)
3345  if (cudaParam.precision != v->Precision()) { invalidate = true; break; }
3346 
3347  if (invalidate) {
3348  for (auto v : solutionResident) delete v;
3349  solutionResident.clear();
3350  }
3351 
3352  // grow resident solutions to be big enough
3353  for (int i=solutionResident.size(); i < param->num_offset; i++) {
3354  solutionResident.push_back(new cudaColorSpinorField(cudaParam));
3355  }
3356  for (int i=0; i < param->num_offset; i++) x[i] = solutionResident[i];
3357 
3359 
3360 
3362 
3363  // Check source norms
3364  double nb = blas::norm2(*b);
3365  if (nb==0.0) errorQuda("Source has zero norm");
3366 
3367  if(getVerbosity() >= QUDA_VERBOSE ) {
3368  double nh_b = blas::norm2(*h_b);
3369  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
3370  }
3371 
3372  // rescale the source vector to help prevent the onset of underflow
3373  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
3374  blas::ax(1.0/sqrt(nb), *b);
3375  }
3376 
 // Apply mass/normalization rescaling conventions to the source.
3377  massRescale(*b, *param);
3379 
3380  // use multi-shift CG
3381  {
3382  DiracMdagM m(dirac), mSloppy(diracSloppy);
3383  SolverParam solverParam(*param);
3384  MultiShiftCG cg_m(m, mSloppy, solverParam, profileMulti);
3385  cg_m(x, *b);
3386  solverParam.updateInvertParam(*param);
3387  }
3388 
3389  if (param->compute_true_res) {
3390  // check each shift has the desired tolerance and use sequential CG to refine
3392  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
3393  cudaColorSpinorField r(*b, cudaParam);
3395 
 // Refinement order: increasing mass (small->large shift) when defined.
3396 #define REFINE_INCREASING_MASS
3397 #ifdef REFINE_INCREASING_MASS
3398  for(int i=0; i < param->num_offset; i++) {
3399 #else
3400  for(int i=param->num_offset-1; i >= 0; i--) {
3401 #endif
3402  double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
3403  param->true_res_hq_offset[i] : 0;
3404  double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
3405  param->tol_hq_offset[i] : 0;
3406 
3407  /*
3408  In the case where the shifted systems have zero tolerance
3409  specified, we refine these systems until either the limit of
3410  precision is reached (prec_tol) or until the tolerance reaches
3411  the iterated residual tolerance of the previous multi-shift
3412  solver (iter_res_offset[i]), which ever is greater.
3413  */
3414  const double prec_tol = std::pow(10.,(-2*(int)param->cuda_prec+2));
3415  const double iter_tol = (param->iter_res_offset[i] < prec_tol ? prec_tol : (param->iter_res_offset[i] *1.1));
3416  const double refine_tol = (param->tol_offset[i] == 0.0 ? iter_tol : param->tol_offset[i]);
3417  // refine if either L2 or heavy quark residual tolerances have not been met, only if desired residual is > 0
3418  if ((param->true_res_offset[i] > refine_tol || rsd_hq > tol_hq)) {
3419  if (getVerbosity() >= QUDA_SUMMARIZE)
3420  printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
3421  i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);
3422 
3423  // for staggered the shift is just a change in mass term (FIXME: for twisted mass also)
3424  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3425  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3426  dirac.setMass(sqrt(param->offset[i]/4));
3427  diracSloppy.setMass(sqrt(param->offset[i]/4));
3428  }
3429 
3430  DiracMdagM m(dirac), mSloppy(diracSloppy);
3431 
3432  // need to curry in the shift if we are not doing staggered
3433  if (param->dslash_type != QUDA_ASQTAD_DSLASH &&
3434  param->dslash_type != QUDA_STAGGERED_DSLASH) {
3435  m.shift = param->offset[i];
3436  mSloppy.shift = param->offset[i];
3437  }
3438 
 // Disabled experimental code path (kept for reference).
3439  if (0) { // experimenting with Minimum residual extrapolation
3440  // only perform MRE using current and previously refined solutions
3441 #ifdef REFINE_INCREASING_MASS
3442  const int nRefine = i+1;
3443 #else
3444  const int nRefine = param->num_offset - i + 1;
3445 #endif
3446 
3447  std::vector<ColorSpinorField*> q;
3448  q.resize(nRefine);
3449  std::vector<ColorSpinorField*> z;
3450  z.resize(nRefine);
3451  cudaParam.create = QUDA_NULL_FIELD_CREATE;
3452  cudaColorSpinorField tmp(cudaParam);
3453 
3454  for(int j=0; j < nRefine; j++) {
3455  q[j] = new cudaColorSpinorField(cudaParam);
3456  z[j] = new cudaColorSpinorField(cudaParam);
3457  }
3458 
3459  *z[0] = *x[0]; // zero solution already solved
3460 #ifdef REFINE_INCREASING_MASS
3461  for (int j=1; j<nRefine; j++) *z[j] = *x[j];
3462 #else
3463  for (int j=1; j<nRefine; j++) *z[j] = *x[param->num_offset-j];
3464 #endif
3465 
3466  bool orthogonal = true;
3467  bool apply_mat = true;
3468  MinResExt mre(m, orthogonal, apply_mat, profileMulti);
3469  blas::copy(tmp, *b);
3470  mre(*x[i], tmp, z, q);
3471 
3472  for(int j=0; j < nRefine; j++) {
3473  delete q[j];
3474  delete z[j];
3475  }
3476  }
3477 
 // Sequential CG refinement of this single shift.
3478  SolverParam solverParam(*param);
3479  solverParam.iter = 0;
3481  solverParam.tol = (param->tol_offset[i] > 0.0 ? param->tol_offset[i] : iter_tol); // set L2 tolerance
3482  solverParam.tol_hq = param->tol_hq_offset[i]; // set heavy quark tolerance
3483 
3484  CG cg(m, mSloppy, solverParam, profileMulti);
3485  cg(*x[i], *b);
3486 
3487  solverParam.true_res_offset[i] = solverParam.true_res;
3488  solverParam.true_res_hq_offset[i] = solverParam.true_res_hq;
3489  solverParam.updateInvertParam(*param,i);
3490 
3491  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3492  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3493  dirac.setMass(sqrt(param->offset[0]/4)); // restore just in case
3494  diracSloppy.setMass(sqrt(param->offset[0]/4)); // restore just in case
3495  }
3496 
3497  }
3498  }
3499  }
3500 
3501  // restore shifts -- avoid side effects
 // NOTE(review): unscaled_shifts is captured from param->offset in a line omitted
 // from this listing (before massRescale modifies the shifts).
3502  for(int i=0; i < param->num_offset; i++) {
3503  param->offset[i] = unscaled_shifts[i];
3504  }
3505 
3506  profileMulti.TPSTART(QUDA_PROFILE_D2H);
3507 
 // Optionally compute the fermionic action contribution sum_i residue[i] <b, x[i]>.
3508  if (param->compute_action) {
3509  Complex action(0);
3510  for (int i=0; i<param->num_offset; i++) action += param->residue[i] * blas::cDotProduct(*b, *x[i]);
3511  param->action[0] = action.real();
3512  param->action[1] = action.imag();
3513  }
3514 
3515  for(int i=0; i < param->num_offset; i++) {
3516  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) { // rescale the solution
3517  blas::ax(sqrt(nb), *x[i]);
3518  }
3519 
3520  if (getVerbosity() >= QUDA_VERBOSE){
3521  double nx = blas::norm2(*x[i]);
3522  printfQuda("Solution %d = %g\n", i, nx);
3523  }
3524 
 // Upload to host unless the solutions are kept resident on the device.
3525  if (!param->make_resident_solution) *h_x[i] = *x[i];
3526  }
3528 
3530 
3531  if (!param->make_resident_solution) {
3532  for (auto v: solutionResident) if (v) delete v;
3533  solutionResident.clear();
3534  }
3535 
3537 
 // Clean up host wrappers and device fields.
3539  for(int i=0; i < param->num_offset; i++){
3540  delete h_x[i];
3541  //if (!param->make_resident_solution) delete x[i];
3542  }
3543 
3544  delete h_b;
3545  delete b;
3546 
3547  delete [] hp_x;
3548 
3549  delete d;
3550  delete dSloppy;
3551  delete dPre;
3553 
3554  popVerbosity();
3555 
3556  // cache is written out even if a long benchmarking job gets interrupted
3557  saveTuneCache();
3558 
3560 }
3561 
// Compute HISQ/asqtad fat links (and optionally long links and unitarized links)
// from the input site links on the GPU, returning the results to host memory.
// NOTE(review): this listing omits several original lines (gParam construction,
// the setUnitarizeLinksConstants(...) call, device fat-link creation, profiling
// calls), which the code below relies on.
3562 void computeKSLinkQuda(void* fatlink, void* longlink, void* ulink, void* inlink, double *path_coeff, QudaGaugeParam *param) {
3563 
3564 #ifdef GPU_FATLINK
3567 
3568  checkGaugeParam(param);
3569 
 // Unitarization tolerances (consumed by an omitted setUnitarizeLinksConstants call).
3570  if (ulink) {
3571  const double unitarize_eps = 1e-14;
3572  const double max_error = 1e-10;
3573  const int reunit_allow_svd = 1;
3574  const int reunit_svd_only = 0;
3575  const double svd_rel_error = 1e-6;
3576  const double svd_abs_error = 1e-6;
3579  }
3580 
 // Wrap the host-side output/input buffers as cpuGaugeFields.
3582  cpuGaugeField cpuFatLink(gParam); // create the host fatlink
3583  gParam.gauge = longlink;
3584  cpuGaugeField cpuLongLink(gParam); // create the host longlink
3585  gParam.gauge = ulink;
3586  cpuGaugeField cpuUnitarizedLink(gParam);
3588  gParam.gauge = inlink;
3589  cpuGaugeField cpuInLink(gParam); // create the host sitelink
3590 
3591  // create the device fields
3595  cudaGaugeField *cudaInLink = new cudaGaugeField(gParam);
3596 
3598 
3600  cudaInLink->loadCPUField(cpuInLink);
3602 
 // Extend the input links with a halo region R for multi-GPU fattening.
3603  cudaGaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink);
3604 
3606  delete cudaInLink;
3608 
 // Only allocate the optional outputs that were actually requested.
3615  cudaGaugeField *cudaUnitarizedLink = ulink ? new cudaGaugeField(gParam) : nullptr;
3616  cudaGaugeField *cudaLongLink = longlink ? new cudaGaugeField(gParam) : nullptr;
3617 
3619  fatLongKSLink(cudaFatLink, cudaLongLink, *cudaInLinkEx, path_coeff);
3621 
3622  if (ulink) {
 // num_failures_h/_d are host/device counters for links that failed to unitarize.
3624  *num_failures_h = 0;
3625  quda::unitarizeLinks(*cudaUnitarizedLink, *cudaFatLink, num_failures_d); // unitarize on the gpu
3626  if (*num_failures_h>0) errorQuda("Error in unitarization component of the hisq fattening: %d failures\n", *num_failures_h);
3628  }
3629 
 // Copy results back to the host buffers.
3631  if (ulink) cudaUnitarizedLink->saveCPUField(cpuUnitarizedLink);
3633  if (longlink) cudaLongLink->saveCPUField(cpuLongLink);
3635 
3637  delete cudaFatLink;
3638  if (longlink) delete cudaLongLink;
3639  if (ulink) delete cudaUnitarizedLink;
3640  delete cudaInLinkEx;
3642 
3644 #else
3645  errorQuda("Fat-link has not been built");
3646 #endif // GPU_FATLINK
3647 
3648  return;
3649 }
3650 
 // NOTE(review): the enclosing function signature (original line 3651) is omitted
 // from this listing; from the body it computes the gauge-field pad as the largest
 // checkerboard face size of the local lattice (0 for single-GPU builds).
3652  int pad = 0;
3653 #ifdef MULTI_GPU
3654  int volume = param.x[0]*param.x[1]*param.x[2]*param.x[3];
3655  int face_size[4];
 // Face size in direction dir is (volume / extent in dir) / 2 (checkerboarded).
3656  for(int dir=0; dir<4; ++dir) face_size[dir] = (volume/param.x[dir])/2;
3657  pad = *std::max_element(face_size, face_size+4);
3658 #endif
3659 
3660  return pad;
3661 }
3662 
// Compute the gauge force contribution to the momentum field from a set of Wilson
// loops (input_path_buf) with coefficients loop_coeff, scaled by eb3.
// NOTE(review): this listing omits several original lines, including the opening
// `if (use_resident_gauge/mom)` conditions whose closing braces are visible below,
// the gParam/gParamMom declarations, and profiling calls.
3663 int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int* path_length,
3664  double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam)
3665 {
3666 #ifdef GPU_GAUGE_FORCE
3669 
3670  checkGaugeParam(qudaGaugeParam);
3671 
 // Host site-link wrapper is only needed when not using the resident gauge field.
3675  cpuGaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new cpuGaugeField(gParam) : NULL;
3676 
3677  cudaGaugeField* cudaSiteLink = NULL;
3678 
 // (omitted `if (use_resident_gauge) {` opener) -- reuse the resident device gauge
3680  if (!gaugePrecise) errorQuda("No resident gauge field to use");
3681  cudaSiteLink = gaugePrecise;
3683  } else {
3689 
3690  cudaSiteLink = new cudaGaugeField(gParam);
3692 
3694  cudaSiteLink->loadCPUField(*cpuSiteLink);
3696 
3698  }
3699 
3701  // FIXME - test program always uses MILC for mom but can use QDP for gauge
3702  if (gParamMom.order == QUDA_QDP_GAUGE_ORDER) gParamMom.order = QUDA_MILC_GAUGE_ORDER;
 // TIFR orders store the full momentum matrix; others use 10-real compression.
3703  if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
3704  else gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
3705 
3706  gParamMom.site_offset = qudaGaugeParam->mom_offset;
3707  gParamMom.site_size = qudaGaugeParam->site_size;
3708  cpuGaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? new cpuGaugeField(gParamMom) : NULL;
3709 
3710  cudaGaugeField* cudaMom = NULL;
 // (omitted `if (use_resident_mom) {` opener) -- reuse the resident momentum
3712  if (!momResident) errorQuda("No resident momentum field to use");
3713  cudaMom = momResident;
3716  } else {
3718  gParamMom.order = QUDA_FLOAT2_GAUGE_ORDER;
3719  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
3720  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
3721  gParamMom.precision = qudaGaugeParam->cuda_prec;
3722  gParamMom.create = QUDA_ZERO_FIELD_CREATE;
3723  cudaMom = new cudaGaugeField(gParamMom);
 // When accumulating (not overwriting), the existing host momentum is loaded
 // in lines omitted from this listing.
3725  if (!qudaGaugeParam->overwrite_mom) {
3729  }
3730  }
3731 
3733 
3734  // actually do the computation
3736  gaugeForce(*cudaMom, *cudaGauge, eb3, input_path_buf, path_length, loop_coeff, num_paths, max_length);
3738 
3743  }
3744 
 // (omitted `if (make_resident_gauge) {` opener) -- hand ownership to gaugePrecise
3747  if (gaugePrecise && gaugePrecise != cudaSiteLink) delete gaugePrecise;
3748  gaugePrecise = cudaSiteLink;
3749  } else {
3750  delete cudaSiteLink;
3751  }
3752 
 // (omitted `if (make_resident_mom) {` opener) -- hand ownership to momResident
3754  if (momResident && momResident != cudaMom) delete momResident;
3755  momResident = cudaMom;
3756  } else {
3757  delete cudaMom;
3758  }
3759 
3760  if (cpuSiteLink) delete cpuSiteLink;
3761  if (cpuMom) delete cpuMom;
3762 
3766  } else {
3767  delete cudaGauge;
3768  }
3770 
3772 
3773  checkCudaError();
3774 #else
3775  errorQuda("Gauge force has not been built");
3776 #endif // GPU_GAUGE_FORCE
3777  return 0;
3778 }
3779 
// NOTE(review): the function signature (original line 3780) is omitted from this
// listing; the body computes the field-strength tensor Fmunu from an extended
// gauge field (presumably as part of clover-term construction -- confirm against
// the full source).
3781 {
3783  if (!cloverPrecise) errorQuda("Clover field not allocated");
3784 
3786  // for clover we optimize to only send depth 1 halos in y/z/t (FIXME - make work for x, make robust in general)
3787  int R[4];
 // Halo depth: 2 in x, 1 in y/z/t, and only where communication is required.
3788  for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d));
3790 
3792  // create the Fmunu field
 // NOTE(review): tensorParam construction is in an omitted line.
3794  tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
3795  tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
3796  tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
3797  cudaGaugeField Fmunu(tensorParam);
3799 
3801  computeFmunu(Fmunu, *gauge, QUDA_CUDA_FIELD_LOCATION);
3804 
3806 
3807  // FIXME always preserve the extended gauge
3808  extendedGaugeResident = gauge;
3809 
3810  return;
3811 }
3812 
// Allocate a device gauge field of the given geometry and, if a host pointer is
// supplied, initialize it from the host data; returns an opaque device-field handle
// (to be released via destroyGaugeFieldQuda).
// NOTE(review): the gParam declaration and the device-field creation/load lines
// are omitted from this listing.
3813 void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
3814 {
3816  gParam.geometry = static_cast<QudaFieldGeometry>(geometry);
3817  if (geometry != QUDA_SCALAR_GEOMETRY && geometry != QUDA_VECTOR_GEOMETRY)
3818  errorQuda("Only scalar and vector geometries are supported\n");
3819 
 // Wrap the host data only when a host pointer was provided.
3820  cpuGaugeField *cpuGauge = nullptr;
3821  if (gauge) cpuGauge = new cpuGaugeField(gParam);
3822 
3826 
3827  if (gauge) {
 // (device field load from cpuGauge happens in an omitted line)
3829  delete cpuGauge;
3830  }
3831 
3832  return cudaGauge;
3833 }
3834 
3835 
// Copy a device gauge field (handle created by createGaugeFieldQuda) back into a
// host buffer. NOTE(review): the cpuGaugeField wrapper construction and the
// saveCPUField call are in lines omitted from this listing.
3836 void saveGaugeFieldQuda(void* gauge, void* inGauge, QudaGaugeParam* param){
3837 
3838  cudaGaugeField* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);
3839 
3842 
3845 
3846 }
3847 
3848 
3849 void destroyGaugeFieldQuda(void* gauge){
3850  cudaGaugeField* g = reinterpret_cast<cudaGaugeField*>(gauge);
3851  delete g;
3852 }
3853 
3854 
// Compute the staggered fermion force contribution and accumulate it into the
// momentum field: mom += delta * [U * force]TA, using the resident solutions as
// the quark fields. NOTE(review): the trailing parameters of the signature and
// many body lines (gParam/cudaMom declarations, profiling, momentum download,
// the resident-solution check opener) are omitted from this listing.
3855 void computeStaggeredForceQuda(void* h_mom, double dt, double delta, void *h_force, void **x,
3857 {
3860 
3862 
3863  // create the host momentum field
3867 
3868  // create the host momentum field
 // NOTE(review): comment above appears stale in the original -- this wraps the
 // host *force* buffer, not the momentum.
3870  gParam.gauge = h_force;
3872 
3873  // create the device momentum field
3879 
3880  // create temporary field for quark-field outer product
3885  GaugeField *cudaForce_[2] = {&cudaForce};
3886 
 // Describe the device quark fields (staggered: 1 spin component, 3 colors).
3887  ColorSpinorParam qParam;
3889  qParam.nColor = 3;
3890  qParam.nSpin = 1;
3893  qParam.nDim = 5; // 5 since staggered mrhs
3894  qParam.precision = gParam.precision;
3895  qParam.pad = 0;
3896  for(int dir=0; dir<4; ++dir) qParam.x[dir] = gParam.x[dir];
3897  qParam.x[4] = 1;
3898  qParam.create = QUDA_NULL_FIELD_CREATE;
3901 
3904 
 // (omitted `if (use_resident_mom) {` opener)
3906  if (!momResident) errorQuda("Cannot use resident momentum field since none appears resident");
3907  cudaMom = momResident;
3908  } else {
3909  // download the initial momentum (FIXME make an option just to return?)
3911  }
3912 
3913  // resident gauge field is required
3915  errorQuda("Resident gauge field is required");
3916 
3919 
 // One quark field per shift/offset of the preceding multi-shift solve.
3920  const int nvector = inv_param->num_offset;
3921  std::vector<ColorSpinorField*> X(nvector);
3922  for ( int i=0; i<nvector; i++) X[i] = ColorSpinorField::Create(qParam);
3923 
3925  if (solutionResident.size() < (unsigned int)nvector)
3926  errorQuda("solutionResident.size() %lu does not match number of shifts %d",
3927  solutionResident.size(), nvector);
3928  }
3929 
3930  // create the staggered operator
3931  DiracParam diracParam;
3933  Dirac *dirac = Dirac::create(diracParam);
3934 
3937 
 // Reconstruct the full solution: even parity from the resident solve, odd
 // parity via one application of Dslash.
3938  for (int i=0; i<nvector; i++) {
3939  ColorSpinorField &x = *(X[i]);
3940 
3941  if (inv_param->use_resident_solution) x.Even() = *(solutionResident[i]);
3942  else errorQuda("%s requires resident solution", __func__);
3943 
3944  // set the odd solution component
3945  dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY);
3946  }
3947 
3950 
3951 #if 0
3953  for (auto v : solutionResident) if (v) delete solutionResident[i];
3954  solutionResident.clear();
3955  }
3956 #endif
3957  delete dirac;
3958 
3961 
3962  // compute quark-field outer product
3963  for (int i=0; i<nvector; i++) {
3964  ColorSpinorField &x = *(X[i]);
3965  // second component is zero since we have no three hop term
3966  double coeff[2] = {dt * inv_param->residue[i], 0.0};
3967 
3968  // Operate on even-parity sites
3969  computeStaggeredOprod(cudaForce_, x, coeff, 1);
3970  }
3971 
3972  // mom += delta * [U * force]TA
3976 
3979 
3981  // copy the momentum field back to the host
3983  }
3984 
3986  // make the momentum field resident
3987  momResident = cudaMom;
3988  } else {
3989  delete cudaMom;
3990  }
3991 
3994 
3995  for (int i=0; i<nvector; i++) delete X[i];
3996 
3999 
4000  checkCudaError();
4001  return;
4002 }
4003 
// Compute the full HISQ fermion force: staggered outer products from the quark
// fields, level-2 staple force on the w-links, unitarization derivative through
// the v-links, fat7 staple force on the u-links, and final momentum completion.
// NOTE(review): this listing omits many original lines (profiling calls, several
// field constructions/loads, the momentum accumulation into momResident), which
// the surviving code depends on.
4004 void computeHISQForceQuda(void* const milc_momentum,
4005  long long *flops,
4006  const double level2_coeff[6],
4007  const double fat7_coeff[6],
4008  const void* const w_link,
4009  const void* const v_link,
4010  const void* const u_link,
4011  void **fermion,
4012  int num_terms,
4013  int num_naik_terms,
4014  double **coeff,
4016 {
4017 #ifdef GPU_STAGGERED_OPROD
4018  using namespace quda;
4019  using namespace quda::fermion_force;
4021  if (gParam->gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported input field order %d", gParam->gauge_order);
4022 
4023  checkGaugeParam(gParam);
4024 
4026 
4027  // create the device outer-product field
4029  oParam.nFace = 0;
4030  oParam.create = QUDA_ZERO_FIELD_CREATE;
4031  oParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4032  cudaGaugeField *stapleOprod = new cudaGaugeField(oParam);
4033  cudaGaugeField *oneLinkOprod = new cudaGaugeField(oParam);
4034  cudaGaugeField *naikOprod = new cudaGaugeField(oParam);
4035 
4036  {
4037  // default settings for the unitarization
4038  const double unitarize_eps = 1e-14;
4039  const double hisq_force_filter = 5e-5;
4040  const double max_det_error = 1e-10;
4041  const bool allow_svd = true;
4042  const bool svd_only = false;
4043  const double svd_rel_err = 1e-8;
4044  const double svd_abs_err = 1e-8;
4045 
4046  setUnitarizeForceConstants(unitarize_eps, hisq_force_filter, max_det_error, allow_svd, svd_only, svd_rel_err, svd_abs_err);
4047  }
4048 
 // One-link coefficient is pre-absorbed; positions 0/1 are fixed to 0/1.
4049  double act_path_coeff[6] = {0,1,level2_coeff[2],level2_coeff[3],level2_coeff[4],level2_coeff[5]};
4050  // You have to look at the MILC routine to understand the following
4051  // Basically, I have already absorbed the one-link coefficient
4052 
4054  //param.nFace = 0;
4055  param.order = QUDA_MILC_GAUGE_ORDER;
4057  param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
4058  cpuGaugeField* cpuMom = (!gParam->use_resident_mom) ? new cpuGaugeField(param) : NULL;
4059 
 // Wrap the three host link fields (w/v/u levels of the HISQ smearing).
4060  param.link_type = QUDA_GENERAL_LINKS;
4062  param.gauge = (void*)w_link;
4063  cpuGaugeField cpuWLink(param);
4064  param.gauge = (void*)v_link;
4065  cpuGaugeField cpuVLink(param);
4066  param.gauge = (void*)u_link;
4068 
4069  param.create = QUDA_ZERO_FIELD_CREATE;
4071  param.link_type = QUDA_ASQTAD_MOM_LINKS;
4073  GaugeFieldParam momParam(param);
4074 
4075  param.create = QUDA_ZERO_FIELD_CREATE;
4076  param.link_type = QUDA_GENERAL_LINKS;
4077  param.precision = gParam->cpu_prec;
4079 
 // Extend the local volume by the halo R in every direction.
4081  for (int dir=0; dir<4; ++dir) {
4082  param.x[dir] += 2*R[dir];
4083  param.r[dir] = R[dir];
4084  }
4085 
4087  param.create = QUDA_ZERO_FIELD_CREATE;
4088  param.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
4089 
4091 
4092  { // do outer-product computation
4093  ColorSpinorParam qParam;
4094  qParam.nColor = 3;
4095  qParam.nSpin = 1;
4098  qParam.nDim = 4;
4099  qParam.precision = oParam.precision;
4100  qParam.pad = 0;
4101  for (int dir=0; dir<4; ++dir) qParam.x[dir] = oParam.x[dir];
4102 
4103  // create the device quark field
4104  qParam.create = QUDA_NULL_FIELD_CREATE;
4106  cudaColorSpinorField cudaQuark(qParam);
4107 
4108  // create the host quark field
4111  qParam.v = fermion[0];
4112 
4113  { // regular terms
4114  GaugeField *oprod[2] = {stapleOprod, naikOprod};
4115 
4116  // loop over different quark fields
4117  for(int i=0; i<num_terms; ++i){
4118 
4119  // Wrap the MILC quark field
4121  qParam.v = fermion[i];
4122  cpuColorSpinorField cpuQuark(qParam); // create host quark field
4124 
4126  cudaQuark = cpuQuark;
4128 
 // Accumulate 1-link and 3-link (naik) outer products for this term.
4130  computeStaggeredOprod(oprod, cudaQuark, coeff[i], 3);
4132  }
4133  }
4134 
4135  { // naik terms
 // The one-link oprod starts as a scaled copy of the staple oprod.
4136  oneLinkOprod->copy(*stapleOprod);
4137  ax(level2_coeff[0], *oneLinkOprod);
4138  GaugeField *oprod[2] = {oneLinkOprod, naikOprod};
4139 
4140  // loop over different quark fields
4141  for(int i=0; i<num_naik_terms; ++i){
4142 
4143  // Wrap the MILC quark field
 // Naik terms use the trailing subset of the fermion array.
4145  qParam.v = fermion[i + num_terms - num_naik_terms];
4146  cpuColorSpinorField cpuQuark(qParam); // create host quark field
4148 
4150  cudaQuark = cpuQuark;
4152 
4154  computeStaggeredOprod(oprod, cudaQuark, coeff[i], 3);
4156  }
4157  }
4158  }
4159 
 // Move outer products into extended fields; originals freed immediately.
4161  cudaGaugeField* cudaInForce = new cudaGaugeField(param);
4162  copyExtendedGauge(*cudaInForce, *stapleOprod, QUDA_CUDA_FIELD_LOCATION);
4163  delete stapleOprod;
4164 
4165  cudaGaugeField* cudaOutForce = new cudaGaugeField(param);
4166  copyExtendedGauge(*cudaOutForce, *oneLinkOprod, QUDA_CUDA_FIELD_LOCATION);
4167  delete oneLinkOprod;
4168 
4171 
4173 
4174  cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true);
4176  cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true);
4177 
 // Level-2 staple force with respect to the w-links.
4179  hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaGauge, act_path_coeff, flops);
4181 
4182  // Load naik outer product
4183  copyExtendedGauge(*cudaInForce, *naikOprod, QUDA_CUDA_FIELD_LOCATION);
4184  cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true);
4185  delete naikOprod;
4186 
4187  // Compute Naik three-link term
4189  hisqLongLinkForce(*cudaOutForce, *cudaInForce, *cudaGauge, act_path_coeff[1], flops);
4191 
4192  cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true);
4193 
4194  // load v-link
4197 
 // Differentiate through the link unitarization (w = unitarize(v)).
4199  *num_failures_h = 0;
4200  unitarizeForce(*cudaInForce, *cudaOutForce, *cudaGauge, num_failures_d, flops);
4202 
4203  if (*num_failures_h>0) errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h);
4204 
 // NOTE(review): the (void**) cast is spurious -- cudaMemset takes void*.
4205  cudaMemset((void**)(cudaOutForce->Gauge_p()), 0, cudaOutForce->Bytes());
4206 
4207  // read in u-link
4210 
4211  // Compute Fat7-staple term
4213  hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaGauge, fat7_coeff, flops);
4215 
4216  delete cudaInForce;
4217  cudaGaugeField* cudaMom = new cudaGaugeField(momParam);
4218 
 // Close the force paths and project to the anti-hermitian momentum.
4220  hisqCompleteForce(*cudaMom, *cudaOutForce, *cudaGauge, flops);
4222 
4223  if (gParam->use_resident_mom) {
4224  if (!momResident) errorQuda("No resident momentum field to use");
 // (momentum accumulation into momResident is in an omitted line)
4226  }
4227 
 // NOTE(review): the nested `if (gParam->return_result_mom)` duplicates the
 // enclosing condition and is redundant.
4228  if (gParam->return_result_mom) {
4229  // Close the paths, make anti-hermitian, and store in compressed format
4230  if (gParam->return_result_mom) cudaMom->saveCPUField(*cpuMom, profileHISQForce);
4231  }
4232 
4234 
4235  if (cpuMom) delete cpuMom;
4236 
 // NOTE(review): `delete momResident` is unconditional within this branch;
 // deleting a null pointer is safe, but resetting to nullptr afterwards is
 // what keeps the global handle coherent.
4237  if (!gParam->make_resident_mom) {
4238  delete momResident;
4239  momResident = nullptr;
4240  }
4241  if (cudaMom) delete cudaMom;
4242  delete cudaOutForce;
4243  delete cudaGauge;
4245 
4247 
4248  return;
4249 #else
4250  errorQuda("HISQ force has not been built");
4251 #endif
4252 }
4253 
// Compute the clover (Wilson-clover) fermion force: outer products of the quark
// solutions and their Mdag images, the clover sigma trace/oprod terms, and the
// clover derivative, accumulated into the host momentum field.
// NOTE(review): this listing omits many original lines (the trailing signature
// parameters, fParam declaration, gaugeEx/oprodEx construction, momentum
// save/update, profiling), which the surviving code depends on.
4254 void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **h_p,
4255  double *coeff, double kappa2, double ck,
4256  int nvector, double multiplicity, void *gauge,
4258 
4259 
4260  using namespace quda;
4263 
4264  checkGaugeParam(gauge_param);
4265  if (!gaugePrecise) errorQuda("No resident gauge field");
4266 
4268  // create the host momentum field
4270  fParam.order = gauge_param->gauge_order;
4271  cpuGaugeField cpuMom(fParam);
4272 
4273  // create the device momentum field
4274  fParam.create = QUDA_ZERO_FIELD_CREATE;
4275  fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4276  cudaGaugeField cudaMom(fParam);
4277 
4278  // create the device force field
4279  fParam.link_type = QUDA_GENERAL_LINKS;
4280  fParam.create = QUDA_ZERO_FIELD_CREATE;
4281  fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4283  cudaGaugeField cudaForce(fParam);
4284 
 // Device quark fields: 4-spin, 3-color Wilson-type spinors.
4285  ColorSpinorParam qParam;
4287  qParam.nColor = 3;
4288  qParam.nSpin = 4;
4291  qParam.nDim = 4;
4292  qParam.precision = fParam.precision;
4293  qParam.pad = 0;
4294  for(int dir=0; dir<4; ++dir) qParam.x[dir] = fParam.x[dir];
4295 
4296  // create the device quark field
4297  qParam.create = QUDA_NULL_FIELD_CREATE;
4300 
 // quarkX holds the solutions x_i, quarkP the corresponding p_i = M x_i images.
4301  std::vector<ColorSpinorField*> quarkX, quarkP;
4302  for (int i=0; i<nvector; i++) {
4303  quarkX.push_back(ColorSpinorField::Create(qParam));
4304  quarkP.push_back(ColorSpinorField::Create(qParam));
4305  }
4306 
 // Single-parity temporary for the Dirac operator application.
4308  qParam.x[0] /= 2;
4309  cudaColorSpinorField tmp(qParam);
4310 
4311  // create the host quark field
4314  qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // need expose this to interface
4315 
4316  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
4318  DiracParam diracParam;
4319  setDiracParam(diracParam, inv_param, pc_solve);
4320  diracParam.tmp1 = &tmp; // use as temporary for dirac->M
4321  Dirac *dirac = Dirac::create(diracParam);
4322 
4324  if (solutionResident.size() < (unsigned int)nvector)
4325  errorQuda("solutionResident.size() %lu does not match number of shifts %d",
4326  solutionResident.size(), nvector);
4327  }
4328 
4330 
4331  // create oprod and trace fields
4332  fParam.geometry = QUDA_TENSOR_GEOMETRY;
4333  cudaGaugeField oprod(fParam);
4334 
4337 
4338  std::vector<double> force_coeff(nvector);
4339  // loop over different quark fields
4340  for(int i=0; i<nvector; i++){
4341  ColorSpinorField &x = *(quarkX[i]);
4342  ColorSpinorField &p = *(quarkP[i]);
4343 
4345  // for downloading x_e
4347  qParam.x[0] /= 2;
4348 
4349  // Wrap the even-parity MILC quark field
 // (the branch on use_resident_solution opens in an omitted line)
4352  qParam.v = h_x[i];
4353  cpuColorSpinorField cpuQuarkX(qParam); // create host quark field
4355 
4357  x.Even() = cpuQuarkX;
4359 
4361  gamma5(x.Even(), x.Even());
4362  } else {
4363  x.Even() = *(solutionResident[i]);
4364  }
4365 
 // Reconstruct full-parity x and p = M x (plus odd-parity Dslash images).
4366  dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY);
4367  dirac->M(p.Even(), x.Even());
4369  dirac->Dslash(p.Odd(), p.Even(), QUDA_ODD_PARITY);
4371 
4372  gamma5(x, x);
4373  gamma5(p, p);
4374 
4375  force_coeff[i] = 2.0*dt*coeff[i]*kappa2;
4376  }
4377 
4378  computeCloverForce(cudaForce, *gaugePrecise, quarkX, quarkP, force_coeff);
4379 
4380  // In double precision the clover derivative is faster with no reconstruct
4381  cudaGaugeField *u = &gaugeEx;
4382  if (gaugeEx.Reconstruct() == QUDA_RECONSTRUCT_12 && gaugeEx.Precision() == QUDA_DOUBLE_PRECISION) {
4383  GaugeFieldParam param(gaugeEx);
4385  u = new cudaGaugeField(param);
4386  u -> copy(gaugeEx);
4387  }
4388 
4389  computeCloverSigmaTrace(oprod, *cloverPrecise, 2.0*ck*multiplicity*dt);
4390 
4391  /* Now the U dA/dU terms */
4392  std::vector< std::vector<double> > ferm_epsilon(nvector);
4393  for (int shift = 0; shift < nvector; shift++) {
 // NOTE(review): BUG -- reserve(2) does not change size(), so the two
 // operator[] writes below index past the end (undefined behavior).
 // This should be resize(2) (or push_back); flagging rather than fixing
 // since this listing is incomplete.
4394  ferm_epsilon[shift].reserve(2);
4395  ferm_epsilon[shift][0] = 2.0*ck*coeff[shift]*dt;
4396  ferm_epsilon[shift][1] = -kappa2 * 2.0*ck*coeff[shift]*dt;
4397  }
4398 
4399  computeCloverSigmaOprod(oprod, quarkX, quarkP, ferm_epsilon);
4400 
4402 
4404 
 // Clover derivative applied separately on the two parities.
4405  cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_ODD_PARITY);
4406  cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_EVEN_PARITY);
4407 
4408  if (u != &gaugeEx) delete u;
4409 
4412 
4413  // copy the outer product field back to the host
4417 
4419 
4420  for (int i=0; i<nvector; i++) {
4421  delete quarkX[i];
4422  delete quarkP[i];
4423  }
4424 
4425 #if 0
4427  for (auto v : solutionResident) if (v) delete v;
4428  solutionResident.clear();
4429  }
4430 #endif
4431  delete dirac;
4433 
4434  checkCudaError();
4436  return;
4437 }
4438 
4439 
4440 
// Evolve the gauge field by the momentum: U <- exp(i dt mom) U (optionally with
// conjugated momentum and/or exact exponential), honoring the resident-field
// options in the param struct. NOTE(review): several original lines are omitted
// from this listing (trailing signature parameter, gParam/cudaMom declarations,
// momentum load, profiling calls).
4441 void updateGaugeFieldQuda(void* gauge,
4442  void* momentum,
4443  double dt,
4444  int conj_mom,
4445  int exact,
4447 {
4449 
4450  checkGaugeParam(param);
4451 
4453 
4454  // create the host fields
 // Host wrappers are only needed when we must read from or write back to host.
4458  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
4459  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
4460 
4461  GaugeFieldParam gParamMom(momentum, *param);
 // TIFR orders store the full matrix; others use the 10-real compression.
4462  gParamMom.reconstruct = (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ?
4464  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
4465  gParamMom.site_offset = param->mom_offset;
4466  gParamMom.site_size = param->site_size;
4467  cpuGaugeField *cpuMom = !param->use_resident_mom ? new cpuGaugeField(gParamMom) : NULL;
4468 
4469  // create the device fields
4475  gParam.pad = 0;
4477 
4480  cudaGaugeField *cudaInGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : NULL;
4481  cudaGaugeField *cudaOutGauge = new cudaGaugeField(gParam);
4482 
4484 
4486 
4487  if (!param->use_resident_gauge) { // load fields onto the device
4488  cudaInGauge->loadCPUField(*cpuGauge);
4489  } else { // or use resident fields already present
4490  if (!gaugePrecise) errorQuda("No resident gauge field allocated");
 // Take ownership of the resident gauge; global handle is re-established below.
4491  cudaInGauge = gaugePrecise;
4492  gaugePrecise = NULL;
4493  }
4494 
4495  if (!param->use_resident_mom) {
 // (momentum download from cpuMom happens in an omitted line)
4497  } else {
4498  if (!momResident) errorQuda("No resident mom field allocated");
4499  cudaMom = momResident;
4500  momResident = NULL;
4501  }
4502 
4504 
4505  // perform the update
4507  updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom,
4508  (bool)conj_mom, (bool)exact);
4510 
4511  if (param->return_result_gauge) {
4512  // copy the gauge field back to the host
4514  cudaOutGauge->saveCPUField(*cpuGauge);
4516  }
4517 
 // Hand the updated gauge / the momentum back to the resident globals, or free.
4519  if (param->make_resident_gauge) {
4520  if (gaugePrecise != NULL) delete gaugePrecise;
4521  gaugePrecise = cudaOutGauge;
4522  } else {
4523  delete cudaOutGauge;
4524  }
4525 
4526  if (param->make_resident_mom) {
4527  if (momResident != NULL && momResident != cudaMom) delete momResident;
4528  momResident = cudaMom;
4529  } else {
4530  delete cudaMom;
4531  }
4532 
4533  delete cudaInGauge;
4534  if (cpuMom) delete cpuMom;
4535  if (cpuGauge) delete cpuGauge;
4537 
4538  checkCudaError();
4539 
4541  return;
4542 }
4543 
// Re-unitarize the (host or resident) gauge field, projecting each link back
// onto SU(3) to within tolerance tol; aborts via errorQuda if any site fails.
// NOTE(review): extraction dropped the gParam/cudaGauge construction and the
// projectSU3() call lines; surviving tokens kept verbatim.
4544  void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) {
4546 
4548  checkGaugeParam(param);
4549 
4550  // create the gauge field
4554  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
4555  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
4556 
4557  // create the device fields
4563 
4564  if (param->use_resident_gauge) {
4565  if (!gaugePrecise) errorQuda("No resident gauge field to use");
4567  } else {
4571  }
4572 
// num_failures_h is a host-visible failure counter written by the projection kernel
4574  *num_failures_h = 0;
4575 
4576  // project onto SU(3)
4578 
4580 
4581  if(*num_failures_h>0)
4582  errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
4583 
4587 
4588  if (param->make_resident_gauge) {
4589  if (gaugePrecise != NULL && cudaGauge != gaugePrecise) delete gaugePrecise;
4591  } else {
4592  delete cudaGauge;
4593  }
4594 
4596  if (cpuGauge) delete cpuGauge;
4598 
4600  }
4601 
// Apply or remove the staggered phase factors on the (host or resident)
// gauge field.  NOTE(review): extraction dropped the gParam construction,
// the applyStaggeredPhase/removeStaggeredPhase dispatch and several profile
// lines; surviving tokens kept verbatim.
4602  void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) {
4604 
4606  checkGaugeParam(param);
4607 
4608  // create the gauge field
4610  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
4611  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
4612 
4613  // create the device fields
4619 
4620  if (param->use_resident_gauge) {
4621  if (!gaugePrecise) errorQuda("No resident gauge field to use");
4623  } else {
4624  profilePhase.TPSTART(QUDA_PROFILE_H2D);
4627  }
4628 
4630  *num_failures_h = 0;
4631 
4632  // apply / remove phase as appropriate
4635 
4637 
4638  profilePhase.TPSTART(QUDA_PROFILE_D2H);
4641 
4642  if (param->make_resident_gauge) {
4643  if (gaugePrecise != NULL && cudaGauge != gaugePrecise) delete gaugePrecise;
4645  } else {
4646  delete cudaGauge;
4647  }
4648 
4650  if (cpuGauge) delete cpuGauge;
4652 
4654  }
4655 
4656 // evaluate the momentum action
// Evaluate the momentum (kinetic) contribution to the HMC action from the
// supplied (or resident) momentum field; returns the summed action value.
// NOTE(review): extraction dropped the cpuMom/cudaMom field construction and
// the H2D upload branch; surviving tokens kept verbatim.
4657 double momActionQuda(void* momentum, QudaGaugeParam* param)
4658 {
4660 
4662  checkGaugeParam(param);
4663 
4664  // create the momentum fields
4668 
4670 
4671  // create the device fields
4675 
4677 
4679 
4681  if (!param->use_resident_mom) {
4683  } else {
4684  if (!momResident) errorQuda("No resident mom field allocated");
4685  cudaMom = momResident;
4686  }
4688 
4689  // perform the update
4691  double action = computeMomAction(*cudaMom);
4693 
// keep or drop the device momentum depending on make_resident_mom
4695  if (param->make_resident_mom) {
4696  if (momResident != NULL && momResident != cudaMom) delete momResident;
4697  momResident = cudaMom;
4698  } else {
4699  delete cudaMom;
4700  momResident = NULL;
4701  }
4702  if (cpuMom) {
4703  delete cpuMom;
4704  }
4705 
4707 
4708  checkCudaError();
4709 
4711  return action;
4712 }
4713 
4714 /*
4715  The following functions are for the Fortran interface.
4716 */
4717 
// Fortran interface shims: each trailing-underscore function simply
// dereferences its pointer arguments (Fortran passes by reference) and
// delegates to the corresponding C interface entry point.
4718 void init_quda_(int *dev) { initQuda(*dev); }
4719 void init_quda_device_(int *dev) { initQudaDevice(*dev); }
4721 void end_quda_() { endQuda(); }
4722 void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param) { loadGaugeQuda(h_gauge, param); }
4725 void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
4726 { loadCloverQuda(h_clover, h_clovinv, inv_param); }
4728 void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
4729  QudaParity *parity) { dslashQuda(h_out, h_in, inv_param, *parity); }
4730 void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
4731  QudaParity *parity, int *inverse) { cloverQuda(h_out, h_in, inv_param, *parity, *inverse); }
4732 void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
4733 { MatQuda(h_out, h_in, inv_param); }
4734 void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
4735 { MatDagMatQuda(h_out, h_in, inv_param); }
4736 void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param) {
4737  fflush(stdout);
4738  // ensure that fifth dimension is set to 1
4739  if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) param->Ls = 1;
4740  invertQuda(hp_x, hp_b, param);
4741  fflush(stdout);
4742 }
4743 
4744 void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param) {
4745  // ensure that fifth dimension is set to 1
4746  if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) param->Ls = 1;
4747 
4748  if (!gaugePrecise) errorQuda("Resident gauge field not allocated");
4749 
4750  // get data into array of pointers
4751  int nSpin = (param->dslash_type == QUDA_STAGGERED_DSLASH || param->dslash_type == QUDA_ASQTAD_DSLASH) ? 1 : 4;
4752 
4753  // compute offset assuming TIFR padded ordering (FIXME)
4754  if (param->dirac_order != QUDA_TIFR_PADDED_DIRAC_ORDER)
4755  errorQuda("Fortran multi-shift solver presently only supports QUDA_TIFR_PADDED_DIRAC_ORDER");
4756 
4757  const int *X = gaugePrecise->X();
4758  size_t cb_offset = (X[0]/2) * X[1] * (X[2] + 4) * X[3] * gaugePrecise->Ncolor() * nSpin * 2 * param->cpu_prec;
4759  void *hp_x[QUDA_MAX_MULTI_SHIFT];
4760  for (int i=0; i<param->num_offset; i++) hp_x[i] = static_cast<char*>(h_x) + i*cb_offset;
4761 
4762  invertMultiShiftQuda(hp_x, hp_b, param);
4763 }
4764 
4766 
// Fortran shim: page-lock (pin) an existing host allocation of *bytes bytes
// so subsequent device transfers from it can run asynchronously/faster.
4767 void register_pinned_quda_(void *ptr, size_t *bytes) {
4768  cudaHostRegister(ptr, *bytes, cudaHostRegisterDefault);
4769  checkCudaError();
4770 }
4771 
4773  cudaHostUnregister(ptr);
4774  checkCudaError();
4775 }
4776 
4778  *param = newQudaGaugeParam();
4779 }
4782 }
4783 
4784 void update_gauge_field_quda_(void *gauge, void *momentum, double *dt,
4785  bool *conj_mom, bool *exact,
4786  QudaGaugeParam *param) {
4787  updateGaugeFieldQuda(gauge, momentum, *dt, (int)*conj_mom, (int)*exact, param);
4788 }
4789 
// Map a direction index to its "opposite" (backward) link encoding.
static inline int opp(int dir) { return 7-dir; }

// Fill paths[] with the staple paths entering the gauge-force computation for
// link direction dir: 6 length-3 plaquette staples, then (num_loop_types>=2)
// 18 length-5 rectangle staples, then (num_loop_types>=3) 24 length-5 bent
// staples.  Each paths[k] must already point at storage of the path's length.
static void createGaugeForcePaths(int **paths, int dir, int num_loop_types){

  int index = 0;
  // emit a length-3 path
  auto put3 = [&](int a, int b, int c) {
    paths[index][0] = a; paths[index][1] = b; paths[index][2] = c;
    ++index;
  };
  // emit a length-5 path
  auto put5 = [&](int a, int b, int c, int d, int e) {
    paths[index][0] = a; paths[index][1] = b; paths[index][2] = c;
    paths[index][3] = d; paths[index][4] = e;
    ++index;
  };

  // Plaquette paths
  if (num_loop_types >= 1) {
    for (int i = 0; i < 4; ++i) {
      if (i == dir) continue;
      put3(i, opp(dir), opp(i));
      put3(opp(i), opp(dir), i);
    }
  }

  // Rectangle paths
  if (num_loop_types >= 2) {
    for (int i = 0; i < 4; ++i) {
      if (i == dir) continue;
      put5(i, i, opp(dir), opp(i), opp(i));
      put5(opp(i), opp(i), opp(dir), i, i);
      put5(dir, i, opp(dir), opp(dir), opp(i));
      put5(dir, opp(i), opp(dir), opp(dir), i);
      put5(i, opp(dir), opp(dir), opp(i), dir);
      put5(opp(i), opp(dir), opp(dir), i, dir);
    }
  }

  // Staple (bent rectangle) paths
  if (num_loop_types >= 3) {
    for (int i = 0; i < 4; ++i) {
      for (int j = 0; j < 4; ++j) {
        if (i == dir || j == dir || i == j) continue;
        put5(i, j, opp(dir), opp(i), opp(j));
        put5(i, opp(j), opp(dir), opp(i), j);
        put5(opp(i), j, opp(dir), i, opp(j));
        put5(opp(i), opp(j), opp(dir), i, j);
      }
    }
  }
}
4839 
4840 void compute_gauge_force_quda_(void *mom, void *gauge, int *num_loop_types, double *coeff, double *dt,
4841  QudaGaugeParam *param) {
4842 
4843  int numPaths = 0;
4844  switch (*num_loop_types) {
4845  case 1:
4846  numPaths = 6;
4847  break;
4848  case 2:
4849  numPaths = 24;
4850  break;
4851  case 3:
4852  numPaths = 48;
4853  break;
4854  default:
4855  errorQuda("Invalid num_loop_types = %d\n", *num_loop_types);
4856  }
4857 
4858  double *loop_coeff = static_cast<double*>(safe_malloc(numPaths*sizeof(double)));
4859  int *path_length = static_cast<int*>(safe_malloc(numPaths*sizeof(int)));
4860 
4861  if (*num_loop_types >= 1) for(int i= 0; i< 6; ++i) {
4862  loop_coeff[i] = coeff[0];
4863  path_length[i] = 3;
4864  }
4865  if (*num_loop_types >= 2) for(int i= 6; i<24; ++i) {
4866  loop_coeff[i] = coeff[1];
4867  path_length[i] = 5;
4868  }
4869  if (*num_loop_types >= 3) for(int i=24; i<48; ++i) {
4870  loop_coeff[i] = coeff[2];
4871  path_length[i] = 5;
4872  }
4873 
4874  int** input_path_buf[4];
4875  for(int dir=0; dir<4; ++dir){
4876  input_path_buf[dir] = static_cast<int**>(safe_malloc(numPaths*sizeof(int*)));
4877  for(int i=0; i<numPaths; ++i){
4878  input_path_buf[dir][i] = static_cast<int*>(safe_malloc(path_length[i]*sizeof(int)));
4879  }
4880  createGaugeForcePaths(input_path_buf[dir], dir, *num_loop_types);
4881  }
4882 
4883  int max_length = 6;
4884 
4885  computeGaugeForceQuda(mom, gauge, input_path_buf, path_length, loop_coeff, numPaths, max_length, *dt, param);
4886 
4887  for(int dir=0; dir<4; ++dir){
4888  for(int i=0; i<numPaths; ++i) host_free(input_path_buf[dir][i]);
4889  host_free(input_path_buf[dir]);
4890  }
4891 
4892  host_free(path_length);
4893  host_free(loop_coeff);
4894 }
4895 
// Fortran shim for the staggered fermion force.
// NOTE(review): x is received as void* but forwarded as void** -- presumably
// the Fortran side passes an array of field pointers; confirm against callers.
4896 void compute_staggered_force_quda_(void* h_mom, double *dt, double *delta, void *gauge, void *x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param) {
4897  computeStaggeredForceQuda(h_mom, *dt, *delta, gauge, (void**)x, gauge_param, inv_param);
4898 }
4899 
4900 // apply the staggered phases
4902  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("applying staggered phase\n");
4903  if (gaugePrecise) {
4905  } else {
4906  errorQuda("No persistent gauge field");
4907  }
4908 }
4909 
4910 // remove the staggered phases
4912  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("removing staggered phase\n");
4913  if (gaugePrecise) {
4915  } else {
4916  errorQuda("No persistent gauge field");
4917  }
4919 }
4920 
4921 // evaluate the kinetic term
// Fortran shim: evaluate the momentum (kinetic) action and return it via *kin.
4922 void kinetic_quda_(double *kin, void* momentum, QudaGaugeParam* param) {
4923  *kin = momActionQuda(momentum, param);
4924 }
4925 
4926 
4930 #ifdef MULTI_GPU
// Rank mapping used by the BQCD Fortran code: lexicographic rank with the
// t coordinate (coords[3]) slowest-varying and x (coords[0]) fastest.
// fdata points at the 4-element grid-dimension array.
static int bqcd_rank_from_coords(const int *coords, void *fdata)
{
  int *dims = static_cast<int *>(fdata);

  // Horner-style accumulation over the three fastest dimensions
  int rank = coords[3];
  rank = dims[2] * rank + coords[2];
  rank = dims[1] * rank + coords[1];
  rank = dims[0] * rank + coords[0];
  return rank;
}
4941 #endif
4942 
// Fortran shim: establish the 4-d process grid.  grid doubles as both the
// dimension array and the callback payload for bqcd_rank_from_coords.
// No-op in single-GPU builds.
4943 void comm_set_gridsize_(int *grid)
4944 {
4945 #ifdef MULTI_GPU
4946  initCommsGridQuda(4, grid, bqcd_rank_from_coords, static_cast<void *>(grid));
4947 #endif
4948 }
4949 
4954 {
4955  bool pack_ = *pack ? true : false;
4956  setKernelPackT(pack_);
4957 }
4958 
4959 
4960 
// Overwrite the resident gauge field with Gaussian-distributed random links
// seeded by `seed`.  Requires a GPU_GAUGE_TOOLS build and a resident field.
// NOTE(review): extraction dropped several lines (profile markers and the
// refresh of extendedGaugeResident); surviving tokens kept verbatim.  The
// error string "not build" looks like a typo for "not built" -- fixing it
// would change a runtime string, so it is only flagged here.
4961 void gaussGaugeQuda(long seed)
4962 {
4963 #ifdef GPU_GAUGE_TOOLS
4965 
4967  if (!gaugePrecise)
4968  errorQuda("Cannot generate Gauss GaugeField as there is no resident gauge field");
4969 
4970  cudaGaugeField *data = NULL;
4971  data = gaugePrecise;
4972 
4974 
// per-site RNG states sized by the field volume
4976  RNG* randstates = new RNG(data->Volume(), seed, data->X());
4977  randstates->Init();
4978  quda::gaugeGauss(*data, *randstates);
4979  randstates->Release();
4980  delete randstates;
4982 
4984 
4985  if (extendedGaugeResident) {
4988  }
4989 #else
4990  errorQuda("Gauge tools are not build");
4991 #endif
4992 }
4993 
4994 
4995 /*
4996  * Computes the total, spatial and temporal plaquette averages of the loaded gauge configuration.
4997  */
// Fortran shim: compute total/spatial/temporal plaquette averages into plaq[3].
4998 void plaq_quda_(double plaq[3]) {
4999  plaqQuda(plaq);
5000 }
5001 
5002 
// Compute the plaquette averages of the resident gauge field:
// plq[0] = total, plq[1]/plq[2] = the two components of quda::plaquette's
// double3 result (spatial/temporal split).  NOTE(review): extraction dropped
// the line constructing `data` (presumably an extended copy of gaugePrecise
// -- confirm against the full source); surviving tokens kept verbatim.
5003 void plaqQuda (double plq[3])
5004 {
5006 
5007  if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field");
5009 
// cache the extended field for reuse by later calls
5010  extendedGaugeResident = data;
5011 
5013  double3 plaq = quda::plaquette(*data, QUDA_CUDA_FIELD_LOCATION);
5014  plq[0] = plaq.x;
5015  plq[1] = plaq.y;
5016  plq[2] = plaq.z;
5018 
5020  return;
5021 }
5022 
// Apply nSteps of Wuppertal smearing with weight alpha to the host spinor
// h_in, writing the result to h_out.  Uses gaugeSmeared (copied and
// ghost-exchanged) if present, otherwise gaugePrecise.
// NOTE(review): extraction dropped a few lines (profile markers, the gParam
// setup for the smeared copy); surviving tokens kept verbatim.
5023 void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param,
5024  unsigned int nSteps, double alpha)
5025 {
5027 
5028  if (gaugePrecise == NULL) errorQuda("Gauge field must be loaded");
5029 
5032 
5033  cudaGaugeField *precise = NULL;
5034 
5035  if (gaugeSmeared != NULL) {
5036  if (getVerbosity() >= QUDA_VERBOSE)
5037  printfQuda("Wuppertal smearing done with gaugeSmeared\n");
5040  precise = new cudaGaugeField(gParam);
5042  precise->exchangeGhost();
5043  } else {
5044  if (getVerbosity() >= QUDA_VERBOSE)
5045  printfQuda("Wuppertal smearing done with gaugePrecise\n");
5046  precise = gaugePrecise;
5047  }
5048 
// wrap the host input and mirror it on the device
5049  ColorSpinorParam cpuParam(h_in, *inv_param, precise->X(), 0, inv_param->input_location);
5050  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
5051 
5052  ColorSpinorParam cudaParam(cpuParam, *inv_param);
5053  cudaColorSpinorField in(*in_h, cudaParam);
5054 
5055  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
5056  double cpu = blas::norm2(*in_h);
5057  double gpu = blas::norm2(in);
5058  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
5059  }
5060 
5061  cudaParam.create = QUDA_NULL_FIELD_CREATE;
5062  cudaColorSpinorField out(in, cudaParam);
5063  int parity = 0;
5064 
// iterate: each step smears `in` into `out`, then feeds `out` back in
5065  for (unsigned int i=0; i<nSteps; i++) {
5066  if(i) in = out;
5067  wuppertalStep(out, in, parity, *precise, alpha);
5068  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
5069  double norm = blas::norm2(out);
5070  printfQuda("Step %d, vector norm %e\n", i, norm);
5071  }
5072  }
5073 
// reuse cpuParam to wrap the output host buffer
5074  cpuParam.v = h_out;
5075  cpuParam.location = inv_param->output_location;
5076  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
5077  *out_h = out;
5078 
5079  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
5080  double cpu = blas::norm2(*out_h);
5081  double gpu = blas::norm2(out);
5082  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
5083  }
5084 
// only delete `precise` when we own the copy made from gaugeSmeared
5085  if (gaugeSmeared != NULL)
5086  delete precise;
5087 
5088  delete out_h;
5089  delete in_h;
5090 
5091  popVerbosity();
5092 
5094 }
5095 
// Apply nSteps of APE smearing with parameter alpha to the resident gauge
// field, storing the result in gaugeSmeared (any previous smeared field is
// discarded).  NOTE(review): extraction dropped the lines creating
// gaugeSmeared / computing the quoted plaquettes; surviving tokens verbatim.
5096 void performAPEnStep(unsigned int nSteps, double alpha)
5097 {
5098  profileAPE.TPSTART(QUDA_PROFILE_TOTAL);
5099 
5100  if (gaugePrecise == NULL) errorQuda("Gauge field must be loaded");
5101 
5102  if (gaugeSmeared != NULL) delete gaugeSmeared;
5104 
5106  cudaGaugeField *cudaGaugeTemp = new cudaGaugeField(gParam);
5107 
5108  if (getVerbosity() == QUDA_VERBOSE) {
5110  printfQuda("Plaquette after 0 APE steps: %le %le %le\n", plq.x, plq.y, plq.z);
5111  }
5112 
// ping-pong: copy current smeared field to temp, smear temp back into gaugeSmeared
5113  for (unsigned int i=0; i<nSteps; i++) {
5114  cudaGaugeTemp->copy(*gaugeSmeared);
5116  APEStep(*gaugeSmeared, *cudaGaugeTemp, alpha);
5117  }
5118 
5119  delete cudaGaugeTemp;
5120 
5122 
5123  if (getVerbosity() == QUDA_VERBOSE) {
5125  printfQuda("Plaquette after %d APE steps: %le %le %le\n", nSteps, plq.x, plq.y, plq.z);
5126  }
5127 
5129 }
5130 
// Apply nSteps of stout smearing with parameter rho to the resident gauge
// field, storing the result in gaugeSmeared.  NOTE(review): extraction
// dropped the gaugeSmeared creation and plaquette computation lines;
// surviving tokens kept verbatim.
5131 void performSTOUTnStep(unsigned int nSteps, double rho)
5132 {
5134 
5135  if (gaugePrecise == NULL) errorQuda("Gauge field must be loaded");
5136 
5137  if (gaugeSmeared != NULL) delete gaugeSmeared;
5139 
5141  cudaGaugeField *cudaGaugeTemp = new cudaGaugeField(gParam);
5142 
5143  if (getVerbosity() == QUDA_VERBOSE) {
5145  printfQuda("Plaquette after 0 STOUT steps: %le %le %le\n", plq.x, plq.y, plq.z);
5146  }
5147 
5148  for (unsigned int i=0; i<nSteps; i++) {
5149  cudaGaugeTemp->copy(*gaugeSmeared);
5151  STOUTStep(*gaugeSmeared, *cudaGaugeTemp, rho);
5152  }
5153 
5154  delete cudaGaugeTemp;
5155 
5157 
5158  if (getVerbosity() == QUDA_VERBOSE) {
5160  printfQuda("Plaquette after %d STOUT steps: %le %le %le\n", nSteps, plq.x, plq.y, plq.z);
5161  }
5162 
5164 }
5165 
// Apply nSteps of over-improved stout smearing (parameters rho, epsilon) to
// the resident gauge field, storing the result in gaugeSmeared.
// NOTE(review): extraction dropped the gaugeSmeared creation and plaquette
// computation lines; surviving tokens kept verbatim.
5166  void performOvrImpSTOUTnStep(unsigned int nSteps, double rho, double epsilon)
5167 {
5169 
5170  if (gaugePrecise == NULL) errorQuda("Gauge field must be loaded");
5171 
5172  if (gaugeSmeared != NULL) delete gaugeSmeared;
5174 
5176  cudaGaugeField *cudaGaugeTemp = new cudaGaugeField(gParam);
5177 
5178  if (getVerbosity() == QUDA_VERBOSE) {
5180  printfQuda("Plaquette after 0 OvrImpSTOUT steps: %le %le %le\n", plq.x, plq.y, plq.z);
5181  }
5182 
5183  for (unsigned int i=0; i<nSteps; i++) {
5184  cudaGaugeTemp->copy(*gaugeSmeared);
5186  OvrImpSTOUTStep(*gaugeSmeared, *cudaGaugeTemp, rho, epsilon);
5187  }
5188 
5189  delete cudaGaugeTemp;
5190 
5192 
5193  if (getVerbosity() == QUDA_VERBOSE) {
5195  printfQuda("Plaquette after %d OvrImpSTOUT steps: %le %le %le\n", nSteps, plq.x, plq.y, plq.z);
5196  }
5197 
5199 }
5200 
5201 
// Fix the gauge of the host field via overrelaxation; single-rank runs fix
// the field directly, multi-rank runs go through an extended (halo'd) copy.
// Optionally reports H2D/compute/D2H timings via timeinfo[0..2].
// Always returns 0.  NOTE(review): extraction dropped the cpuGauge
// construction and several profile lines; surviving tokens kept verbatim.
5202 int computeGaugeFixingOVRQuda(void* gauge, const unsigned int gauge_dir, const unsigned int Nsteps, \
5203  const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, \
5204  const unsigned int stopWtheta, QudaGaugeParam* param , double* timeinfo)
5205 {
5206 
5208 
5209  checkGaugeParam(param);
5210 
5212  GaugeFieldParam gParam(gauge, *param);
5214 
5215  //gParam.pad = getFatLinkPadding(param->X);
5221  cudaGaugeField *cudaInGauge = new cudaGaugeField(gParam);
5222 
5224 
5226 
5227 
5229  cudaInGauge->loadCPUField(*cpuGauge);
5230  /* } else { // or use resident fields already present
5231  if (!gaugePrecise) errorQuda("No resident gauge field allocated");
5232  cudaInGauge = gaugePrecise;
5233  gaugePrecise = NULL;
5234  } */
5235 
5237 
5238  checkCudaError();
5239 
5240  if (comm_size() == 1) {
5241  // perform the update
5243  gaugefixingOVR(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, \
5244  reunit_interval, stopWtheta);
5246  } else {
// multi-GPU path: fix on an extended field, then copy the interior back
5247  cudaGaugeField *cudaInGaugeEx = createExtendedGauge(*cudaInGauge, R, GaugeFixOVRQuda);
5248 
5249  // perform the update
5251  gaugefixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, \
5252  reunit_interval, stopWtheta);
5254 
5255  //HOW TO COPY BACK TO CPU: cudaInGaugeEx->cpuGauge
5256  copyExtendedGauge(*cudaInGauge, *cudaInGaugeEx, QUDA_CUDA_FIELD_LOCATION);
5257  }
5258 
5259  checkCudaError();
5260  // copy the gauge field back to the host
5262  cudaInGauge->saveCPUField(*cpuGauge);
5264 
5266 
5267  if (param->make_resident_gauge) {
5268  if (gaugePrecise != NULL) delete gaugePrecise;
5269  gaugePrecise = cudaInGauge;
5270  } else {
5271  delete cudaInGauge;
5272  }
5273 
5274  if(timeinfo){
5275  timeinfo[0] = GaugeFixOVRQuda.Last(QUDA_PROFILE_H2D);
5276  timeinfo[1] = GaugeFixOVRQuda.Last(QUDA_PROFILE_COMPUTE);
5277  timeinfo[2] = GaugeFixOVRQuda.Last(QUDA_PROFILE_D2H);
5278  }
5279 
5280  checkCudaError();
5281  return 0;
5282 }
5283 
5284 
5285 
5286 
// Fix the gauge of the host field via the FFT-accelerated steepest-descent
// method.  Optionally reports H2D/compute/D2H timings via timeinfo[0..2].
// Always returns 0.  NOTE(review): extraction dropped the cpuGauge
// construction and several profile lines; surviving tokens kept verbatim.
5287 int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir, const unsigned int Nsteps, \
5288  const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, \
5289  const unsigned int stopWtheta, QudaGaugeParam* param , double* timeinfo)
5290 {
5291 
5293 
5294  checkGaugeParam(param);
5295 
5297 
5298  GaugeFieldParam gParam(gauge, *param);
5300 
5301  //gParam.pad = getFatLinkPadding(param->X);
5307 
5308  cudaGaugeField *cudaInGauge = new cudaGaugeField(gParam);
5309 
5310 
5312 
5314 
5315  //if (!param->use_resident_gauge) { // load fields onto the device
5316  cudaInGauge->loadCPUField(*cpuGauge);
5317  /*} else { // or use resident fields already present
5318  if (!gaugePrecise) errorQuda("No resident gauge field allocated");
5319  cudaInGauge = gaugePrecise;
5320  gaugePrecise = NULL;
5321  } */
5322 
5323 
5325 
5326  // perform the update
5328  checkCudaError();
5329 
5330  gaugefixingFFT(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
5331 
5333 
5334  checkCudaError();
5335  // copy the gauge field back to the host
5337  checkCudaError();
5338  cudaInGauge->saveCPUField(*cpuGauge);
5340  checkCudaError();
5341 
5343 
5344  if (param->make_resident_gauge) {
5345  if (gaugePrecise != NULL) delete gaugePrecise;
5346  gaugePrecise = cudaInGauge;
5347  } else {
5348  delete cudaInGauge;
5349  }
5350 
5351  if(timeinfo){
5352  timeinfo[0] = GaugeFixFFTQuda.Last(QUDA_PROFILE_H2D);
5353  timeinfo[1] = GaugeFixFFTQuda.Last(QUDA_PROFILE_COMPUTE);
5354  timeinfo[2] = GaugeFixFFTQuda.Last(QUDA_PROFILE_D2H);
5355  }
5356 
5357  checkCudaError();
5358  return 0;
5359 }
5360 
5369 void contract(const cudaColorSpinorField x, const cudaColorSpinorField y, void *ctrn, const QudaContractType cType)
5370 {
5371  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
5372  contractCuda(x.Even(), y.Even(), ((double2*)ctrn), cType, QUDA_EVEN_PARITY, profileContract);
5373  contractCuda(x.Odd(), y.Odd(), ((double2*)ctrn), cType, QUDA_ODD_PARITY, profileContract);
5374  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
5375  contractCuda(x.Even(), y.Even(), ((float2*) ctrn), cType, QUDA_EVEN_PARITY, profileContract);
5376  contractCuda(x.Odd(), y.Odd(), ((float2*) ctrn), cType, QUDA_ODD_PARITY, profileContract);
5377  } else {
5378  errorQuda("Precision not supported for contractions\n");
5379  }
5380 }
5381 
5382 void contract(const cudaColorSpinorField x, const cudaColorSpinorField y, void *ctrn, const QudaContractType cType, const int tC)
5383 {
5384  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
5385  contractCuda(x.Even(), y.Even(), ((double2*)ctrn), cType, tC, QUDA_EVEN_PARITY, profileContract);
5386  contractCuda(x.Odd(), y.Odd(), ((double2*)ctrn), cType, tC, QUDA_ODD_PARITY, profileContract);
5387  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
5388  contractCuda(x.Even(), y.Even(), ((float2*) ctrn), cType, tC, QUDA_EVEN_PARITY, profileContract);
5389  contractCuda(x.Odd(), y.Odd(), ((float2*) ctrn), cType, tC, QUDA_ODD_PARITY, profileContract);
5390  } else {
5391  errorQuda("Precision not supported for contractions\n");
5392  }
5393 }
5394 
// Compute the topological charge of the resident (smeared if available)
// gauge field via the field-strength tensor Fmunu.
// NOTE(review): extraction dropped the lines constructing tensorParam and the
// branch filling extendedGaugeResident; surviving tokens kept verbatim.
5395 double qChargeCuda ()
5396 {
5398 
// prefer the smeared field when one exists
5399  cudaGaugeField *gauge = nullptr;
5400  if (!gaugeSmeared) {
5402  gauge = extendedGaugeResident;
5403  } else {
5404  gauge = gaugeSmeared;
5405  }
5406  // Do we keep the smeared extended field on memory, or the unsmeared one?
5407 
5409  // create the Fmunu field
5410 
5412  tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
5413  tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
5414  tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
5415  cudaGaugeField Fmunu(tensorParam);
5416 
5419 
5420  computeFmunu(Fmunu, *gauge, QUDA_CUDA_FIELD_LOCATION);
5421  double charge = quda::computeQCharge(Fmunu, QUDA_CUDA_FIELD_LOCATION);
5422 
5425 
5426  return charge;
5427 }
void new_quda_invert_param_(QudaInvertParam *param)
QudaCloverFieldOrder order
Definition: clover_field.h:21
static QudaGaugeParam qudaGaugeParam
void setRho(double rho)
Bakes in the rho factor into the clover field, (for real diagonal additive Hasenbusch), e.g., A + rho.
void contract(const cudaColorSpinorField x, const cudaColorSpinorField y, void *ctrn, const QudaContractType cType)
QudaTboundary t_boundary
Definition: gauge_field.h:18
QudaDiracFieldOrder dirac_order
Definition: quda.h:195
QudaMassNormalization mass_normalization
Definition: quda.h:185
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32
QudaReconstructType reconstruct_sloppy
Definition: quda.h:46
void comm_finalize(void)
double c_5[QUDA_MAX_DWF_LS]
NEW: used by mobius domain wall only.
Definition: dirac_quda.h:28
void Init()
Initialize CURAND RNG states.
Definition: random.cu:146
DiracMatrix * matSmoothSloppy
Definition: multigrid.h:78
void fatLongKSLink(cudaGaugeField *fat, cudaGaugeField *lng, const cudaGaugeField &gauge, const double *coeff)
Compute the fat and long links for an improved staggered (Kogut-Susskind) fermions.
Definition: llfat_quda.cu:524
void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff, QudaParity parity)
Compute the derivative of the clover matrix in the direction mu,nu and compute the resulting force gi...
QudaGhostExchange ghostExchange
Definition: lattice_field.h:60
void freeCloverQuda(void)
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
static TimeProfile profileStaggeredForce("computeStaggeredForceQuda")
Profiler for computeHISQForceQuda.
void * createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param)
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)
double b_5[QUDA_MAX_DWF_LS]
Definition: quda.h:102
int commDimPartitioned(int dir)
double * TrLog() const
Definition: clover_field.h:87
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
double3 plaquette(const GaugeField &U, QudaFieldLocation location)
Definition: gauge_plaq.cu:138
QudaFieldLocation clover_location
Definition: quda.h:199
QudaSolveType solve_type
Definition: quda.h:182
QudaVerbosity verbosity
enum QudaPrecision_s QudaPrecision
double tol_hq
Definition: test_util.cpp:1648
void freeGaugeQuda(void)
void kinetic_quda_(double *kin, void *momentum, QudaGaugeParam *param)
Evaluate the kinetic (momentum) contribution to classical Hamiltonian for Hybrid Monte Carlo...
static TimeProfile profileFatLink("computeKSLinkQuda")
Profiler for computeGaugeForceQuda.
int ga_pad
Definition: quda.h:53
void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param)
int make_resident_mom
Definition: quda.h:74
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *h_force, void **x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
double momActionQuda(void *momentum, QudaGaugeParam *param)
size_t gauge_offset
Definition: quda.h:78
double mu
Definition: quda.h:105
void * V(bool inverse=false)
Definition: clover_field.h:73
void computeHISQForceQuda(void *const milc_momentum, long long *flops, const double level2_coeff[6], const double fat7_coeff[6], const void *const w_link, const void *const v_link, const void *const u_link, void **fermion, int num_terms, int num_naik_terms, double **coeff, QudaGaugeParam *gParam)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:896
static TimeProfile profileGaugeUpdate("updateGaugeFieldQuda")
Profiler for createExtendedGaugeField.
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
Set the constant parameters for the force unitarization.
#define QUDA_MAX_MULTI_SHIFT
Maximum number of shifts supported by the multi-shift solver. This number may be changed if need be...
cudaGaugeField * gaugeExtended
void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
void printQudaGaugeParam(QudaGaugeParam *param)
Definition: check_params.h:40
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
const void * func
QudaLinkType type
Definition: quda.h:35
int fflush(FILE *)
double kappa
Definition: quda.h:97
void end(void)
Definition: blas_quda.cu:70
void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< double > &coeff)
Compute the force contribution from the solver solution fields.
static TimeProfile profileQCharge("qChargeQuda")
Profiler for APEQuda.
void createSmoother()
Create the smoothers.
Definition: multigrid.cpp:213
#define errorQuda(...)
Definition: util_quda.h:90
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:241
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
void init()
Definition: blas_quda.cu:64
QudaDslashType dslash_type
Definition: quda.h:93
void setPrecision(QudaPrecision precision)
Helper function for setting the precision and corresponding field order for QUDA internal fields...
Definition: gauge_field.h:113
QudaReconstructType reconstruct_precondition
Definition: quda.h:49
QudaFieldCreate create
Definition: clover_field.h:22
QudaInverterType inv_type
Definition: quda.h:94
Fortran interface functions.
double c_5[QUDA_MAX_DWF_LS]
Definition: quda.h:103
int return_clover_inverse
Definition: quda.h:217
#define host_free(ptr)
Definition: malloc_quda.h:59
void computeFmunu(GaugeField &Fmunu, const GaugeField &gauge, QudaFieldLocation location)
QudaSolveType smoother_solve_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:426
void destroySmoother()
Free the smoothers.
Definition: multigrid.cpp:254
void performSTOUTnStep(unsigned int nSteps, double rho)
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
cudaGaugeField *& gaugeFatExtended
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
Definition: reduce_quda.cu:500
void STOUTStep(GaugeField &dataDs, const GaugeField &dataOr, double rho)
Definition: gauge_stout.cu:300
cudaGaugeField * cudaMom
std::complex< double > Complex
Definition: eig_variables.h:13
void setOutputPrefix(const char *prefix)
Definition: util_quda.cpp:68
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void gaussGaugeQuda(long seed)
void plaq_quda_(double plaq[3])
static TimeProfile profileMulti("invertMultiShiftQuda")
Profiler for computeFatLinkQuda.
static TimeProfile profileOvrImpSTOUT("OvrImpSTOUTQuda")
Profiler for projectSU3Quda.
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
cudaGaugeField * gaugeLongPrecise
static int rank
Definition: comm_mpi.cpp:42
static TimeProfile profileGaugeForce("computeGaugeForceQuda")
Profiler for updateGaugeFieldQuda.
static ColorSpinorField * Create(const ColorSpinorParam &param)
static TimeProfile profileAPE("APEQuda")
Profiler for STOUTQuda.
const int Nstream
#define QUDA_VERSION_MINOR
Definition: quda_constants.h:2
double trlogA[2]
Definition: quda.h:212
void assertAllMemFree()
Definition: malloc.cpp:379
__host__ __device__ void copy(T1 &a, const T2 &b)
QudaPrecision precision
Definition: lattice_field.h:54
void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param)
static int R[4]
QudaDagType dagger
Definition: quda.h:184
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: copy_quda.cu:263
void ax(const double &a, ColorSpinorField &x)
Definition: blas_quda.cu:209
cpuGaugeField * cpuMom
static TimeProfile profilePlaq("plaqQuda")
Profiler for wuppertalQuda.
void free_clover_quda_(void)
QudaGaugeParam gauge_param
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:212
static cudaGaugeField * createExtendedGauge(cudaGaugeField &in, const int *R, TimeProfile &profile, bool redundant_comms=false, QudaReconstructType recon=QUDA_RECONSTRUCT_INVALID)
double pow(double, double)
int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with overrelaxation with support for single and multi GPU.
void massRescale(cudaColorSpinorField &b, QudaInvertParam &param)
ColorSpinorField & Component(const int idx) const
std::vector< std::vector< std::pair< ColorSpinorField *, ColorSpinorField * > > > chronoResident(QUDA_MAX_CHRONO)
void loadSloppyGaugeQuda(QudaPrecision prec_sloppy, QudaPrecision prec_precondition)
void destroyDeflationQuda(void *df)
QudaGaugeFieldOrder gauge_order
Definition: quda.h:36
void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **h_p, double *coeff, double kappa2, double ck, int nvector, double multiplicity, void *gauge, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
size_t mom_offset
Definition: quda.h:79
static TimeProfile profileInit2End("initQuda-endQuda", false)
int comm_gpuid(void)
Definition: comm_mpi.cpp:132
int return_clover
Definition: quda.h:216
void gaugeForce(GaugeField &mom, const GaugeField &u, double coeff, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
Compute the gauge-force contribution to the momentum.
Definition: gauge_force.cu:339
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void gaugefixingOVR(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta)
Gauge fixing with overrelaxation with support for single and multi GPU.
#define spinorSiteSize
void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param)
int overwrite_mom
Definition: quda.h:69
static TimeProfile profileMomAction("momActionQuda")
Profiler for endQuda.
static bool InitMagma
int compute_clover_trlog
Definition: quda.h:211
QudaSiteSubset siteSubset
Definition: lattice_field.h:55
char * gitversion
Definition: version.cpp:4
void exit(int) __attribute__((noreturn))
QudaPrecision clover_cuda_prec_sloppy
Definition: quda.h:202
int * greatestPriority
cudaGaugeField * gaugeLongExtended
QudaFieldLocation input_location
Definition: quda.h:90
void solve(Complex *psi, std::vector< ColorSpinorField *> &p, std::vector< ColorSpinorField *> &q, ColorSpinorField &b)
Solve the equation A p_k psi_k = b by minimizing the residual and using Gaussian elimination.
Definition: inv_mre.cpp:64
void destroyGaugeFieldQuda(void *gauge)
void computeCloverSigmaTrace(GaugeField &output, const CloverField &clover, double coeff)
Compute the matrix tensor field necessary for the force calculation from the clover trace action...
void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
cudaGaugeField * gauge
Definition: dirac_quda.h:31
size_t site_size
Definition: quda.h:80
void CloseMagma()
Definition: blas_magma.cu:326
int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
cudaCloverField * cloverPrecondition
DeflationParam * deflParam
Definition: deflation.h:187
int n_vec[QUDA_MAX_MG_LEVEL]
Definition: quda.h:407
QudaPrecision prec_sloppy
Definition: test_util.cpp:1616
void init_quda_(int *dev)
void flush_pinned()
Free all outstanding pinned-memory allocations.
Definition: malloc.cpp:533
int getGaugePadding(GaugeFieldParam &param)
void gaugeGauss(GaugeField &dataDs, RNG &rngstate)
void APEStep(GaugeField &dataDs, const GaugeField &dataOr, double alpha)
Definition: gauge_ape.cu:240
char * index(const char *, int)
QudaGaugeParam param
Definition: pack_test.cpp:17
void openMagma()
double computeMomAction(const GaugeField &mom)
Compute and return global the momentum action 1/2 mom^2.
Definition: momentum.cu:113
#define b
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
Definition: comm_mpi.cpp:61
static int ndim
Definition: layout_hyper.c:53
QudaSolutionType solution_type
Definition: quda.h:181
QudaMemoryType mem_type_ritz
Definition: quda.h:367
int Ncolor() const
Definition: gauge_field.h:202
int strcmp(const char *__s1, const char *__s2)
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:50
QudaPrecision clover_cuda_prec
Definition: quda.h:201
const int * R() const
std::vector< cudaColorSpinorField * > solutionResident
void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
void * longlink[4]
bool is_composite
for deflation solvers:
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
double Last(QudaProfileType idx)
void initQuda(int dev)
int comm_size(void)
Definition: comm_mpi.cpp:126
QudaInvertParam * invert_param
Definition: quda.h:346
cudaDeviceProp deviceProp
void cloverInvert(CloverField &clover, bool computeTraceLog, QudaFieldLocation location)
This function compute the Cholesky decomposition of each clover matrix and stores the clover inverse ...
void ax(const double &a, GaugeField &u)
Scale the gauge field by the scalar a.
double secs
Definition: quda.h:385
double tol
Definition: test_util.cpp:1647
static unsigned int delta
void init()
Initialize the memory pool allocator.
Definition: malloc.cpp:424
void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, double coeff, long long *flops=nullptr)
Compute the long-link contribution to the fermion force.
QudaFieldLocation output_location
Definition: quda.h:91
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
QudaPrecision clover_cuda_prec_precondition
Definition: quda.h:203
QudaFieldLocation location
QudaInvertParam inv_param
Definition: covdev_test.cpp:37
int setNumaAffinityNVML(int deviceid)
bool canReuseResidentGauge(QudaInvertParam *inv_param)
void apply_staggered_phase_quda_()
Apply the staggered phase factors to the resident gauge field.
TimeProfile & profile
Definition: deflation.h:190
VOLATILE spinorFloat kappa
void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff[6], long long *flops=nullptr)
Compute the fat-link contribution to the fermion force.
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:151
void updateInvertParam(QudaInvertParam &param, int offset=-1)
Definition: invert_quda.h:296
double m5
Definition: quda.h:99
size_t Bytes() const
Definition: gauge_field.h:242
cpuGaugeField * cpuFatLink
bool StaggeredPhaseApplied() const
Definition: gauge_field.h:214
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
Updates the multigrid preconditioner for the new gauge / clover field.
bool Csw() const
Definition: clover_field.h:107
void flushProfile()
Flush profile contents, setting all counts to zero.
Definition: tune.cpp:462
DiracMatrix * m
Definition: deflation.h:183
static bool initialized
Profiler for initQuda.
void Release()
Release Device memory for CURAND RNG states.
Definition: random.cu:168
cudaCloverField * cloverSloppy
multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile)
QudaVerbosity verbosity
Definition: quda.h:219
#define tmp2
Definition: tmc_core.h:16
void free_gauge_quda_()
int commDim[QUDA_MAX_DIM]
Definition: dirac_quda.h:43
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
static TimeProfile profileSTOUT("STOUTQuda")
Profiler for OvrImpSTOUTQuda.
void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
QudaDagType dagger
Definition: dirac_quda.h:30
cpuColorSpinorField * in
cudaGaugeField *& gaugeFatPrecondition
QudaInvertParam newQudaInvertParam(void)
cudaGaugeField * cudaFatLink
static const std::string quda_version
Definition: tune.cpp:96
TimeProfile & profile
Definition: multigrid.h:396
void setPrecision(QudaPrecision precision)
Definition: clover_field.h:23
void flush_device()
Free all outstanding device-memory allocations.
Definition: malloc.cpp:545
void Dagger(QudaDagType dag) const
Definition: dirac_quda.h:153
static Solver * create(SolverParam &param, DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matPrecon, TimeProfile &profile)
Definition: solver.cpp:13
for(int s=0;s< param.dc.Ls;s++)
double gflops
Definition: quda.h:382
void Print()
Definition: timer.cpp:6
QudaPrecision cuda_prec_precondition
Definition: quda.h:48
void free_sloppy_gauge_quda_()
QudaCloverFieldOrder clover_order
Definition: quda.h:205
Class declaration to initialize and hold CURAND RNG states.
Definition: random_quda.h:23
static __inline__ size_t p
void createDslashEvents()
Definition: dslash_quda.cu:86
void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
double Anisotropy() const
Definition: gauge_field.h:205
DiracMatrix * matSmooth
Definition: multigrid.h:75
void remove_staggered_phase_quda_()
Remove the staggered phase factors to the resident gauge field.
static int lex_rank_from_coords(const int *coords, void *fdata)
void freeSloppyCloverQuda()
QudaGaugeFieldOrder order
Definition: gauge_field.h:15
void exchangeExtendedGhost(cudaColorSpinorField *spinor, int R[], int parity, cudaStream_t *stream_p)
#define warningQuda(...)
Definition: util_quda.h:101
void performAPEnStep(unsigned int nSteps, double alpha)
static TimeProfile profileClover("loadCloverQuda")
Profiler for dslashQuda.
void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int nSteps, double alpha)
static unsigned int unsigned int shift
double b_5[QUDA_MAX_DWF_LS]
NEW: used by domain wall and twisted mass.
Definition: dirac_quda.h:27
#define QUDA_VERSION_SUBMINOR
Definition: quda_constants.h:3
QudaDiracType type
Definition: dirac_quda.h:22
#define QUDA_MAX_CHRONO
void OpenMagma()
Definition: blas_magma.cu:310
void unregister_pinned_quda_(void *ptr)
Pinned a pre-existing memory allocation.
QudaPrecision cuda_prec_sloppy
Definition: quda.h:45
void exchangeGhost(QudaLinkDirection link_direction=QUDA_LINK_BACKWARDS)
Exchange the ghost and store store in the padded region.
static bool invalidate_clover
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
int Volume() const
#define tmp1
Definition: tmc_core.h:15
QudaMatPCType matpcType
NEW: used by mobius domain wall only.
Definition: dirac_quda.h:29
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:145
void comm_set_gridsize_(int *grid)
void projectSU3(cudaGaugeField &U, double tol, int *fails)
Project the input gauge field onto the SU(3) group. This is a destructive operation. The number of link failures is reported so appropriate action can be taken.
int(* QudaCommsMap)(const int *coords, void *fdata)
Definition: comm_quda.h:12
void initQudaMemory()
static TimeProfile profileDslash("dslashQuda")
Profiler for invertQuda.
void saveProfile(const std::string label="")
Save profile to disk.
Definition: tune.cpp:472
void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param)
QudaSolutionType RitzMat_lanczos
Definition: quda.h:348
void init_quda_memory_()
static TimeProfile profileGauge("loadGaugeQuda")
Profile for loadCloverQuda.
cudaCloverField * cloverPrecise
enum QudaParity_s QudaParity
void register_pinned_quda_(void *ptr, size_t *bytes)
Pinned a pre-existing memory allocation.
QudaReconstructType reconstruct
Definition: quda.h:43
void lanczosQuda(int k0, int m, void *hp_Apsi, void *hp_r, void *hp_V, void *hp_alpha, void *hp_beta, QudaEigParam *eig_param)
long unsigned int size_t
QudaPrecision cuda_prec
Definition: quda.h:42
void applyU(GaugeField &force, GaugeField &U)
Definition: momentum.cu:340
static bool comms_initialized
void OvrImpSTOUTStep(GaugeField &dataDs, const GaugeField &dataOr, double rho, double epsilon)
Definition: gauge_stout.cu:801
static int * num_failures_h
double mass
Definition: quda.h:96
void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
static Eig_Solver * create(QudaEigParam &param, RitzMat &ritz_mat, TimeProfile &profile)
Definition: eig_solver.cpp:12
QudaFieldLocation location
Definition: quda.h:370
Deflation * defl
Definition: deflation.h:189
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
static TimeProfile profileGauss("gaussQuda")
Profiler for plaqQuda.
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
static TimeProfile profileProject("projectSU3Quda")
Profiler for staggeredPhaseQuda.
void * memcpy(void *__dst, const void *__src, size_t __n)
void unitarizeForce(cudaGaugeField &newForce, const cudaGaugeField &oldForce, const cudaGaugeField &gauge, int *unitarization_failed, long long *flops=NULL)
Unitarize the fermion force.
const void * ptr
void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
#define safe_malloc(size)
Definition: malloc_quda.h:54
void zero(ColorSpinorField &a)
Definition: blas_quda.cu:45
void copy(const CloverField &src, bool inverse=true)
Copy into this CloverField from the generic CloverField src.
int dims[QUDA_MAX_DIM]
double shift
Shift term added onto operator (M^dag M + shift)
Definition: dirac_quda.h:1058
static void init_default_comms()
void setMass(double mass)
Definition: dirac_quda.h:140
void pushVerbosity(QudaVerbosity verbosity)
Definition: util_quda.cpp:82
double qChargeCuda()
static int * num_failures_d
void init_quda_device_(int *dev)
cudaGaugeField * gaugeLongSloppy
int compute_clover_inverse
Definition: quda.h:215
QudaPrecision prec_precondition
Definition: test_util.cpp:1617
void loadSloppyCloverQuda(QudaPrecision prec_sloppy, QudaPrecision prec_precondition)
#define checkCudaErrorNoSync()
Definition: util_quda.h:113
void update_gauge_field_quda_(void *gauge, void *momentum, double *dt, bool *conj_mom, bool *exact, QudaGaugeParam *param)
void Mdag(ColorSpinorField &out, const ColorSpinorField &in) const
Definition: dirac.cpp:73
void plaqQuda(double plq[3])
void gaugefixingFFT(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha, const int autotune, const double tolerance, const int stopWtheta)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
void printQudaInvertParam(QudaInvertParam *param)
Definition: check_params.h:191
#define STR(x)
void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity, int *inverse)
void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity, const GaugeField &U, double A, double B)
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const
void hisqCompleteForce(GaugeField &momentum, const GaugeField &oprod, const GaugeField &link, long long *flops=nullptr)
Multiply the computed the force matrix by the gauge field and perform traceless anti-hermitian projec...
QudaFieldLocation location
Definition: quda.h:27
double clover_rho
Definition: quda.h:209
GaugeCovDev * dirac
Definition: covdev_test.cpp:75
static TimeProfile profileInvert("invertQuda")
Profiler for invertMultiShiftQuda.
cpuColorSpinorField * out
double gaugeGiB
Definition: quda.h:60
static TimeProfile profilePhase("staggeredPhaseQuda")
Profiler for contractions.
static TimeProfile GaugeFixOVRQuda("GaugeFixOVRQuda")
Profiler for toal time spend between init and end.
cudaGaugeField * gaugePrecondition
bool twisted
Clover coefficient.
Definition: clover_field.h:17
static TimeProfile profileCovDev("covDevQuda")
Profiler for contractions.
GaugeFieldParam gParam
void * fatlink[4]
deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
int r[QUDA_MAX_DIM]
static TimeProfile profileInit("initQuda")
Profile for loadGaugeQuda / saveGaugeQuda.
cudaCloverField * clover
Definition: dirac_quda.h:34
static bool redundant_comms
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
QudaLinkType link_type
Definition: gauge_field.h:17
std::vector< ColorSpinorField * > B
Definition: multigrid.h:391
void set_kernel_pack_t_(int *pack)
fTemporary function exposed for TIFR benchmarking
virtual void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
void printPeakMemUsage()
Definition: malloc.cpp:371
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
void applyStaggeredPhase()
void loadTuneCache()
Definition: tune.cpp:302
void * newDeflationQuda(QudaEigParam *eig_param)
void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
void printQudaMultigridParam(QudaMultigridParam *param)
Definition: check_params.h:504
void freeSloppyGaugeQuda(void)
double computeQCharge(GaugeField &Fmunu, QudaFieldLocation location)
void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
if(err !=cudaSuccess)
virtual void prepare(ColorSpinorField *&src, ColorSpinorField *&sol, ColorSpinorField &x, ColorSpinorField &b, const QudaSolutionType) const
int use_resident_gauge
Definition: quda.h:71
#define printfQuda(...)
Definition: util_quda.h:84
void new_quda_gauge_param_(QudaGaugeParam *param)
void contractCuda(const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaContractType contract_type, const QudaParity parity, TimeProfile &profile)
Definition: contract.cu:202
cudaGaugeField * fatGauge
Definition: dirac_quda.h:32
QudaTwistFlavorType twist_flavor
Definition: quda.h:108
int atoi(const char *)
void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
int VolumeCB() const
int return_result_gauge
Definition: quda.h:75
cudaGaugeField * gaugeSmeared
static TimeProfile GaugeFixFFTQuda("GaugeFixFFTQuda")
double residue[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:171
unsigned long long flops
Definition: blas_quda.cu:42
cudaGaugeField * cudaGauge
static TimeProfile profileExtendedGauge("createExtendedGaugeField")
Profiler for computeCloverForceQuda.
quda::cudaGaugeField * checkGauge(QudaInvertParam *param)
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force)
Definition: momentum.cu:224
cudaStream_t * streams
int use_resident_mom
Definition: quda.h:72
QudaReconstructType reconstruct
Definition: gauge_field.h:14
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:59
void closeMagma()
QudaFieldCreate create
Definition: gauge_field.h:25
void printAPIProfile()
Print out the timer profile for CUDA API calls.
void printLaunchTimer()
Definition: tune.cpp:797
cudaGaugeField *& gaugeFatSloppy
void gamma5(ColorSpinorField &out, const ColorSpinorField &in)
Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
Definition: dslash_quda.cu:427
enum QudaContractType_s QudaContractType
static TimeProfile profileEnd("endQuda")
Profiler for GaugeFixing.
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203
const int * X() const
enum QudaFieldGeometry_s QudaFieldGeometry
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:50
int num_offset
Definition: quda.h:146
void flushChronoQuda(int i)
Flush the chronological history for the given index.
cudaGaugeField * longGauge
Definition: dirac_quda.h:33
void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity)
void popVerbosity()
Definition: util_quda.cpp:93
enum QudaVerbosity_s QudaVerbosity
double cloverGiB
Definition: quda.h:226
void updateGaugeField(GaugeField &out, double dt, const GaugeField &in, const GaugeField &mom, bool conj_mom, bool exact)
void createCloverQuda(QudaInvertParam *invertParam)
int return_result_mom
Definition: quda.h:76
int test_type
Definition: test_util.cpp:1634
void end_quda_()
int compute_clover
Definition: quda.h:214
int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam)
void computeStaggeredOprod(GaugeField *out[], ColorSpinorField &in, const double coeff[], int nFace)
Compute the outer-product field between the staggered quark field&#39;s one and (for HISQ and ASQTAD) thr...
double epsilon
Definition: quda.h:106
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
cpuGaugeField * cpuGauge
#define checkCudaError()
Definition: util_quda.h:129
QudaFieldGeometry geometry
Definition: gauge_field.h:27
static void PrintGlobal()
Definition: timer.cpp:55
void setOutputFile(FILE *outfile)
Definition: util_quda.cpp:74
cudaGaugeField * gaugePrecise
#define mapped_malloc(size)
Definition: malloc_quda.h:56
int use_resident_solution
Definition: quda.h:316
static Dirac * create(const DiracParam &param)
Definition: dirac.cpp:142
cudaGaugeField *& gaugeFatPrecise
static double unscaled_shifts[QUDA_MAX_MULTI_SHIFT]
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
void compute_gauge_force_quda_(void *mom, void *gauge, int *num_loop_types, double *coeff, double *dt, QudaGaugeParam *param)
Compute the gauge force and update the mometum field.
void updateR()
update the radius for halos.
static TimeProfile profileWuppertal("wuppertalQuda")
Profiler for gaussQuda.
cudaGaugeField * cudaForce
void flush_chrono_quda_(int *index)
Flush the chronological history for the given index.
static TimeProfile profileContract("contractQuda")
Profiler for contractions.
int make_resident_gauge
Definition: quda.h:73
void copy(const GaugeField &src)
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
static __inline__ size_t size_t d
void computeCloverSigmaOprod(GaugeField &oprod, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< std::vector< double > > &coeff)
Compute the outer product from the solver solution fields arising from the diagonal term of the fermi...
QudaPrecision Precision() const
cudaGaugeField * extendedGaugeResident
void computeClover(CloverField &clover, const GaugeField &gauge, double coeff, QudaFieldLocation location)
Definition: clover_quda.cu:204
void saveTuneCache()
Definition: tune.cpp:388
QudaDslashType dslash_type_precondition
Definition: quda.h:259
QudaPrecision clover_cpu_prec
Definition: quda.h:200
QudaPrecision cuda_prec_ritz
Definition: quda.h:364
QudaParity parity
Definition: covdev_test.cpp:53
static int opp(int dir)
static TimeProfile profileHISQForce("computeHISQForceQuda")
Profiler for plaqQuda.
void reorder_location_set(QudaFieldLocation reorder_location_)
Set whether data is reorderd on the CPU or GPU. This can set at QUDA initialization using the environ...
static void freeGhostBuffer(void)
Free statically allocated ghost buffers.
void destroyMultigridQuda(void *mg)
Free resources allocated by the multigrid solver.
void destroyDslashEvents()
Definition: dslash_quda.cu:118
char * getenv(const char *)
ColorSpinorField * RV
Definition: deflation.h:185
void performOvrImpSTOUTnStep(unsigned int nSteps, double rho, double epsilon)
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
#define QUDA_VERSION_MAJOR
Definition: quda_constants.h:1
void setVerbosity(const QudaVerbosity verbosity)
Definition: util_quda.cpp:24
QudaMatPCType matpc_type
Definition: quda.h:183
cudaGaugeField * momResident
ColorSpinorField * tmp1
Definition: dirac_quda.h:40
DiracMatrix * matResidual
Definition: multigrid.h:72
virtual void reconstruct(ColorSpinorField &x, const ColorSpinorField &b, const QudaSolutionType) const
cpuGaugeField * cpuForce
static TimeProfile profileCloverForce("computeCloverForceQuda")
Profiler for computeStaggeredForceQuda.
double kappa5
static void createGaugeForcePaths(int **paths, int dir, int num_loop_types)
void * newMultigridQuda(QudaMultigridParam *mg_param)
unsigned long long bytes
Definition: blas_quda.cu:43
QudaPrecision cpu_prec
Definition: quda.h:40
cudaGaugeField * gaugeSloppy
int comm_dim_partitioned(int dim)
void initQudaDevice(int dev)
void endQuda(void)
int overlap
Definition: quda.h:67
QudaGaugeParam newQudaGaugeParam(void)
size_t GBytes() const
const int * X() const
QudaInvertParam * invert_param
Definition: quda.h:395
void checkClover(QudaInvertParam *param)
void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
double clover_coeff
Definition: quda.h:208
void compute_staggered_force_quda_(void *h_mom, double *dt, double *delta, void *gauge, void *x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
cudaGaugeField * gaugeLongPrecondition
void removeStaggeredPhase()