quda-ref/v0.7.0/interface__quda_8cpp_source.html

 #include <iostream>

 #include <stdio.h>

 #include <stdlib.h>

 #include <math.h>

 #include <string.h>

 #include <sys/time.h>


 #include <quda.h>

 #include <quda_fortran.h>

 #include <quda_internal.h>

 #include <comm_quda.h>

 #include <tune_quda.h>

 #include <blas_quda.h>

 #include <gauge_field.h>

 #include <dirac_quda.h>

 #include <ritz_quda.h>

 #include <dslash_quda.h>

 #include <invert_quda.h>

 #include <lanczos_quda.h>

 #include <color_spinor_field.h>

 #include <eig_variables.h>

 #include <clover_field.h>

 #include <llfat_quda.h>

 #include <fat_force_quda.h>

 #include <hisq_links_quda.h>

 #include <algorithm>

 #include <staggered_oprod.h>

 #include <ks_improved_force.h>

 #include <ks_force_quda.h>


 #ifdef NUMA_AFFINITY

 #include <numa_affinity.h>

 #endif


 #include <cuda.h>

 #include "face_quda.h"


 #ifdef MULTI_GPU

 extern void exchange_cpu_sitelink_ex(int* X, int *R, void** sitelink, QudaGaugeFieldOrder cpu_order,

     QudaPrecision gPrecision, int optflag, int geom);

 #endif // MULTI_GPU


 #include <ks_force_quda.h>


 #ifdef GPU_GAUGE_FORCE

 #include <gauge_force_quda.h>

 #endif

 #include <gauge_update_quda.h>


 #define MAX(a,b) ((a)>(b)? (a):(b))

 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))


 #define spinorSiteSize 24 // real numbers per spinor


 #define MAX_GPU_NUM_PER_NODE 16


 // define newQudaGaugeParam() and newQudaInvertParam()

 #define INIT_PARAM

 #include "check_params.h"

 #undef INIT_PARAM


 // define (static) checkGaugeParam() and checkInvertParam()

 #define CHECK_PARAM

 #include "check_params.h"

 #undef CHECK_PARAM


 // define printQudaGaugeParam() and printQudaInvertParam()

 #define PRINT_PARAM

 #include "check_params.h"

 #undef PRINT_PARAM


 #include <gauge_tools.h>


 int numa_affinity_enabled = 1;


 using namespace quda;


 static cudaGaugeField* cudaStapleField = NULL;

 static cudaGaugeField* cudaStapleField1 = NULL;


 //for MAGMA lib:

 #include <blas_magma.h>


 static bool InitMagma = false;


 void openMagma(){


    if(!InitMagma){

       BlasMagmaArgs::OpenMagma();

       InitMagma = true;

    }

    else printfQuda("\nMAGMA library was already initialized..\n");


    return;

 }


 void closeMagma(){


    if(InitMagma) BlasMagmaArgs::CloseMagma();

    else printfQuda("\nMAGMA library was not initialized..\n");


    return;

 }


 cudaGaugeField *gaugePrecise = NULL;

 cudaGaugeField *gaugeSloppy = NULL;

 cudaGaugeField *gaugePrecondition = NULL;

 cudaGaugeField *gaugeExtended = NULL;


 // It's important that these alias the above so that constants are set correctly in Dirac::Dirac()

 cudaGaugeField *&gaugeFatPrecise = gaugePrecise;

 cudaGaugeField *&gaugeFatSloppy = gaugeSloppy;

 cudaGaugeField *&gaugeFatPrecondition = gaugePrecondition;

 cudaGaugeField *&gaugeFatExtended = gaugeExtended;


 cudaGaugeField *gaugeLongExtended = NULL;

 cudaGaugeField *gaugeLongPrecise = NULL;

 cudaGaugeField *gaugeLongSloppy = NULL;

 cudaGaugeField *gaugeLongPrecondition = NULL;


 cudaGaugeField *gaugeSmeared = NULL;


 cudaCloverField *cloverPrecise = NULL;

 cudaCloverField *cloverSloppy = NULL;

 cudaCloverField *cloverPrecondition = NULL;


 cudaCloverField *cloverInvPrecise = NULL;

 cudaCloverField *cloverInvSloppy = NULL;

 cudaCloverField *cloverInvPrecondition = NULL;


 cudaGaugeField *momResident = NULL;

 cudaGaugeField *extendedGaugeResident = NULL;


 cudaColorSpinorField *solutionResident = NULL;


 cudaDeviceProp deviceProp;

 cudaStream_t *streams;

 #ifdef PTHREADS

 pthread_mutex_t pthread_mutex;

 #endif


 static bool initialized = false;


 static TimeProfile profileInit("initQuda");


 static TimeProfile profileGauge("loadGaugeQuda");


 static TimeProfile profileClover("loadCloverQuda");


 static TimeProfile profileInvert("invertQuda");


 static TimeProfile profileMulti("invertMultiShiftQuda");


 static TimeProfile profileMultiMixed("invertMultiShiftMixedQuda");


 static TimeProfile profileFatLink("computeKSLinkQuda");


 static TimeProfile profileGaugeForce("computeGaugeForceQuda");


 static TimeProfile profileGaugeUpdate("updateGaugeFieldQuda");


 static TimeProfile profileExtendedGauge("createExtendedGaugeField");


 static TimeProfile profileCloverCreate("createCloverQuda");


 static TimeProfile profileCloverDerivative("computeCloverDerivativeQuda");


 static TimeProfile profileCloverTrace("computeCloverTraceQuda");


 static TimeProfile profileStaggeredOprod("computeStaggeredOprodQuda");


 static TimeProfile profileAsqtadForce("computeAsqtadForceQuda");


 static TimeProfile profileHISQForce("computeHISQForceQuda");


 static TimeProfile profileHISQForceComplete("computeHISQForceCompleteQuda");


 static TimeProfile profileAPE("APEQuda");


 static TimeProfile profileContract("contractQuda");


 static TimeProfile profileCovDev("covDevCuda");


 static TimeProfile profileEnd("endQuda");


 namespace quda {

   void printLaunchTimer();

 }


 void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)

 {

   setVerbosity(verbosity);

   setOutputPrefix(prefix);

   setOutputFile(outfile);

 }


 typedef struct {

   int ndim;

   int dims[QUDA_MAX_DIM];

 } LexMapData;


 static int lex_rank_from_coords(const int *coords, void *fdata)

 {

   LexMapData *md = static_cast<LexMapData *>(fdata);


   int rank = coords[0];

   for (int i = 1; i < md->ndim; i++) {

     rank = md->dims[i] * rank + coords[i];

   }

   return rank;

 }


 #ifdef QMP_COMMS


 static int qmp_rank_from_coords(const int *coords, void *fdata)

 {

   return QMP_get_node_number_from(coords);

 }

 #endif


 static bool comms_initialized = false;


 void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)

 {

   if (nDim != 4) {

     errorQuda("Number of communication grid dimensions must be 4");

   }


   LexMapData map_data;

   if (!func) {


 #if QMP_COMMS

     if (QMP_logical_topology_is_declared()) {

       if (QMP_get_logical_number_of_dimensions() != 4) {

         errorQuda("QMP logical topology must have 4 dimensions");

       }

       for (int i=0; i<nDim; i++) {

         int qdim = QMP_get_logical_dimensions()[i];

         if(qdim != dims[i]) {

           errorQuda("QMP logical dims[%d]=%d does not match dims[%d]=%d argument", i, qdim, i, dims[i]);

         }

       }

       fdata = NULL;

       func = qmp_rank_from_coords;

     } else {

       warningQuda("QMP logical topology is undeclared; using default lexicographical ordering");

 #endif


       map_data.ndim = nDim;

       for (int i=0; i<nDim; i++) {

         map_data.dims[i] = dims[i];

       }

       fdata = (void *) &map_data;

       func = lex_rank_from_coords;


 #if QMP_COMMS

     }

 #endif


   }

   comm_init(nDim, dims, func, fdata);

   comms_initialized = true;

 }


 static void init_default_comms()

 {

 #if defined(QMP_COMMS)

   if (QMP_logical_topology_is_declared()) {

     int ndim = QMP_get_logical_number_of_dimensions();

     const int *dims = QMP_get_logical_dimensions();

     initCommsGridQuda(ndim, dims, NULL, NULL);

   } else {

     errorQuda("initQuda() called without prior call to initCommsGridQuda(),"

         " and QMP logical topology has not been declared");

   }

 #elif defined(MPI_COMMS)

   errorQuda("When using MPI for communications, initCommsGridQuda() must be called before initQuda()");

 #else // single-GPU

   const int dims[4] = {1, 1, 1, 1};

   initCommsGridQuda(4, dims, NULL, NULL);

 #endif

 }


 /*

  * Set the device that QUDA uses.

  */

 void initQudaDevice(int dev) {


   //static bool initialized = false;

   if (initialized) return;

   initialized = true;


 #if defined(GPU_DIRECT) && defined(MULTI_GPU) && (CUDA_VERSION == 4000)

   //check if CUDA_NIC_INTEROP is set to 1 in the enviroment

   // not needed for CUDA >= 4.1

   char* cni_str = getenv("CUDA_NIC_INTEROP");

   if(cni_str == NULL){

     errorQuda("Environment variable CUDA_NIC_INTEROP is not set");

   }

   int cni_int = atoi(cni_str);

   if (cni_int != 1){

     errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1");

   }

 #endif


   int deviceCount;

   cudaGetDeviceCount(&deviceCount);

   if (deviceCount == 0) {

     errorQuda("No CUDA devices found");

   }


   for(int i=0; i<deviceCount; i++) {

     cudaGetDeviceProperties(&deviceProp, i);

     checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode

     if (getVerbosity() >= QUDA_SUMMARIZE) {

       printfQuda("Found device %d: %s\n", i, deviceProp.name);

     }

   }


 #ifdef MULTI_GPU

   if (dev < 0) {

     if (!comms_initialized) {

       errorQuda("initDeviceQuda() called with a negative device ordinal, but comms have not been initialized");

     }

     dev = comm_gpuid();

   }

 #else

   if (dev < 0 || dev >= 16) errorQuda("Invalid device number %d", dev);

 #endif


   cudaGetDeviceProperties(&deviceProp, dev);

   checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode

   if (deviceProp.major < 1) {

     errorQuda("Device %d does not support CUDA", dev);

   }


   if (getVerbosity() >= QUDA_SUMMARIZE) {

     printfQuda("Using device %d: %s\n", dev, deviceProp.name);

   }

 #ifndef USE_QDPJIT

   cudaSetDevice(dev);

   checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode

 #endif


 #ifdef NUMA_AFFINITY

   if(numa_affinity_enabled){

     setNumaAffinity(dev);

   }

 #endif


   // if the device supports host-mapped memory, then enable this

 #ifndef USE_QDPJIT

   if(deviceProp.canMapHostMemory) cudaSetDeviceFlags(cudaDeviceMapHost);

   checkCudaError();

 #endif


   cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);

   //cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

   cudaGetDeviceProperties(&deviceProp, dev);

 }


 /*

  * Any persistent memory allocations that QUDA uses are done here.

  */

 void initQudaMemory()

 {

   if (!comms_initialized) init_default_comms();


   streams = new cudaStream_t[Nstream];


 #if (CUDA_VERSION >= 5050)

   int greatestPriority;

   int leastPriority;

   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);

   for (int i=0; i<Nstream-1; i++) {

     cudaStreamCreateWithPriority(&streams[i], cudaStreamDefault, greatestPriority);

   }

   cudaStreamCreateWithPriority(&streams[Nstream-1], cudaStreamDefault, leastPriority);

 #else

   for (int i=0; i<Nstream; i++) {

     cudaStreamCreate(&streams[i]);

   }

 #endif


   checkCudaError();

   createDslashEvents();

 #ifdef GPU_STAGGERED_OPROD

   createStaggeredOprodEvents();

 #endif

   initBlas();


   loadTuneCache(getVerbosity());

 }


 void initQuda(int dev)

 {

   profileInit.Start(QUDA_PROFILE_TOTAL);


   // initialize communications topology, if not already done explicitly via initCommsGridQuda()

   if (!comms_initialized) init_default_comms();


   // set the device that QUDA uses

   initQudaDevice(dev);


   // set the persistant memory allocations that QUDA uses (Blas, streams, etc.)

   initQudaMemory();


 #ifdef PTHREADS

   pthread_mutexattr_t mutex_attr;

   pthread_mutexattr_init(&mutex_attr);

   pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);

   pthread_mutex_init(&pthread_mutex, &mutex_attr);

 #endif


   profileInit.Stop(QUDA_PROFILE_TOTAL);

 }


 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)

 {

   //printfQuda("loadGaugeQuda use_resident_gauge = %d phase=%d\n",

   //param->use_resident_gauge, param->staggered_phase_applied);


   profileGauge.Start(QUDA_PROFILE_TOTAL);


   if (!initialized) errorQuda("QUDA not initialized");

   if (getVerbosity() == QUDA_DEBUG_VERBOSE) printQudaGaugeParam(param);


   checkGaugeParam(param);


   profileGauge.Start(QUDA_PROFILE_INIT);

   // Set the specific input parameters and create the cpu gauge field

   GaugeFieldParam gauge_param(h_gauge, *param);


   // if we are using half precision then we need to compute the fat

   // link maximum while still on the cpu

   // FIXME get a kernel for this

   if ((param->cuda_prec == QUDA_HALF_PRECISION ||

         param->cuda_prec_sloppy == QUDA_HALF_PRECISION ||

         param->cuda_prec_precondition == QUDA_HALF_PRECISION) &&

       param->type == QUDA_ASQTAD_FAT_LINKS)

     gauge_param.compute_fat_link_max = true;


   GaugeField *in = (param->location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<GaugeField*>(new cpuGaugeField(gauge_param)) :

     static_cast<GaugeField*>(new cudaGaugeField(gauge_param));


   // if not preserving then copy the gauge field passed in

   cudaGaugeField *precise = NULL;


   // switch the parameters for creating the mirror precise cuda gauge field

   gauge_param.create = QUDA_NULL_FIELD_CREATE;

   gauge_param.precision = param->cuda_prec;

   gauge_param.reconstruct = param->reconstruct;

   gauge_param.pad = param->ga_pad;

   gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||

                        gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ?

     QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;


   precise = new cudaGaugeField(gauge_param);


   if (param->use_resident_gauge) {

     if(gaugePrecise == NULL) errorQuda("No resident gauge field");

     // copy rather than point at to ensure that the padded region is filled in

     precise->copy(*gaugePrecise);

     precise->exchangeGhost();

     delete gaugePrecise;

     gaugePrecise = NULL;

     profileGauge.Stop(QUDA_PROFILE_INIT);

   } else {

     profileGauge.Stop(QUDA_PROFILE_INIT);

     profileGauge.Start(QUDA_PROFILE_H2D);

     precise->copy(*in);

     profileGauge.Stop(QUDA_PROFILE_H2D);

   }


   param->gaugeGiB += precise->GBytes();


   // creating sloppy fields isn't really compute, but it is work done on the gpu

   profileGauge.Start(QUDA_PROFILE_COMPUTE);


   // switch the parameters for creating the mirror sloppy cuda gauge field

   gauge_param.precision = param->cuda_prec_sloppy;

   gauge_param.reconstruct = param->reconstruct_sloppy;

   gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||

       gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ?

     QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;

   cudaGaugeField *sloppy = NULL;

   if (param->cuda_prec != param->cuda_prec_sloppy ||

       param->reconstruct != param->reconstruct_sloppy) {

     sloppy = new cudaGaugeField(gauge_param);

 #if (__COMPUTE_CAPABILITY__ >= 200)

     sloppy->copy(*precise);

 #else

     sloppy->copy(*in);

 #endif

     param->gaugeGiB += sloppy->GBytes();

   } else {

     sloppy = precise;

   }


   // switch the parameters for creating the mirror preconditioner cuda gauge field

   gauge_param.precision = param->cuda_prec_precondition;

   gauge_param.reconstruct = param->reconstruct_precondition;

   gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||

       gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ?

     QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;

   cudaGaugeField *precondition = NULL;

   if (param->cuda_prec_sloppy != param->cuda_prec_precondition ||

       param->reconstruct_sloppy != param->reconstruct_precondition) {

     precondition = new cudaGaugeField(gauge_param);

 #if (__COMPUTE_CAPABILITY__ >= 200)

     precondition->copy(*sloppy);

 #else

     precondition->copy(*in);

 #endif

     param->gaugeGiB += precondition->GBytes();

   } else {

     precondition = sloppy;

   }


   // create an extended preconditioning field

   cudaGaugeField* extended = NULL;

   if(param->overlap){

     int R[4]; // domain-overlap widths in different directions

     for(int i=0; i<4; ++i){

       R[i] = param->overlap*commDimPartitioned(i);

       gauge_param.x[i] += 2*R[i];

     }

     // the extended field does not require any ghost padding

     gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

     extended = new cudaGaugeField(gauge_param);


     // copy the unextended preconditioning field into the interior of the extended field

     copyExtendedGauge(*extended, *precondition, QUDA_CUDA_FIELD_LOCATION);

     // now perform communication and fill the overlap regions

     extended->exchangeExtendedGhost(R);

   }


   profileGauge.Stop(QUDA_PROFILE_COMPUTE);


   switch (param->type) {

     case QUDA_WILSON_LINKS:

       //if (gaugePrecise) errorQuda("Precise gauge field already allocated");

       gaugePrecise = precise;

       //if (gaugeSloppy) errorQuda("Sloppy gauge field already allocated");

       gaugeSloppy = sloppy;

       //if (gaugePrecondition) errorQuda("Precondition gauge field already allocated");

       gaugePrecondition = precondition;


       if(param->overlap) gaugeExtended = extended;

       break;

     case QUDA_ASQTAD_FAT_LINKS:

       if (gaugeFatPrecise) errorQuda("Precise gauge fat field already allocated");

       gaugeFatPrecise = precise;

       if (gaugeFatSloppy) errorQuda("Sloppy gauge fat field already allocated");

       gaugeFatSloppy = sloppy;

       if (gaugeFatPrecondition) errorQuda("Precondition gauge fat field already allocated");

       gaugeFatPrecondition = precondition;


       if(param->overlap){

         if(gaugeFatExtended) errorQuda("Extended gauge fat field already allocated");

         gaugeFatExtended = extended;

       }

       break;

     case QUDA_ASQTAD_LONG_LINKS:

       if (gaugeLongPrecise) errorQuda("Precise gauge long field already allocated");

       gaugeLongPrecise = precise;

       if (gaugeLongSloppy) errorQuda("Sloppy gauge long field already allocated");

       gaugeLongSloppy = sloppy;

       if (gaugeLongPrecondition) errorQuda("Precondition gauge long field already allocated");

       gaugeLongPrecondition = precondition;

       if(param->overlap){

         if(gaugeLongExtended) errorQuda("Extended gauge long field already allocated");

         gaugeLongExtended = extended;

       }

       break;

     default:

       errorQuda("Invalid gauge type");

   }


   profileGauge.Start(QUDA_PROFILE_FREE);

   delete in;

   profileGauge.Stop(QUDA_PROFILE_FREE);


   profileGauge.Stop(QUDA_PROFILE_TOTAL);

 }


 void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)

 {

   profileGauge.Start(QUDA_PROFILE_TOTAL);


   if (param->location != QUDA_CPU_FIELD_LOCATION)

     errorQuda("Non-cpu output location not yet supported");


   if (!initialized) errorQuda("QUDA not initialized");

   checkGaugeParam(param);


   // Set the specific cpu parameters and create the cpu gauge field

   GaugeFieldParam gauge_param(h_gauge, *param);

   cpuGaugeField cpuGauge(gauge_param);

   cudaGaugeField *cudaGauge = NULL;

   switch (param->type) {

     case QUDA_WILSON_LINKS:

       cudaGauge = gaugePrecise;

       break;

     case QUDA_ASQTAD_FAT_LINKS:

       cudaGauge = gaugeFatPrecise;

       break;

     case QUDA_ASQTAD_LONG_LINKS:

       cudaGauge = gaugeLongPrecise;

       break;

     default:

       errorQuda("Invalid gauge type");

   }


   profileGauge.Start(QUDA_PROFILE_D2H);

   cudaGauge->saveCPUField(cpuGauge, QUDA_CPU_FIELD_LOCATION);

   profileGauge.Stop(QUDA_PROFILE_D2H);


   profileGauge.Stop(QUDA_PROFILE_TOTAL);

 }


 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)

 {

   profileClover.Start(QUDA_PROFILE_TOTAL);

   bool device_calc = false; // calculate clover and inverse on the device?


   pushVerbosity(inv_param->verbosity);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);


   if (!initialized) errorQuda("QUDA not initialized");


   if (!h_clover && !h_clovinv) {

     printfQuda("clover_coeff: %lf\n", inv_param->clover_coeff);

     if(inv_param->clover_coeff != 0){

       device_calc = true;

     }else{

       errorQuda("loadCloverQuda() called with neither clover term nor inverse");

     }

   }


   if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) {

     errorQuda("Half precision not supported on CPU");

   }

   if (gaugePrecise == NULL) {

     errorQuda("Gauge field must be loaded before clover");

   }

   if ((inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) && (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH)) {

     errorQuda("Wrong dslash_type in loadCloverQuda()");

   }


   // determines whether operator is preconditioned when calling invertQuda()

   bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE ||

       inv_param->solve_type == QUDA_NORMOP_PC_SOLVE);


   // determines whether operator is preconditioned when calling MatQuda() or MatDagMatQuda()

   bool pc_solution = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||

       inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);


   bool asymmetric = (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||

       inv_param->matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC);


   // We issue a warning only when it seems likely that the user is screwing up:


   // uninverted clover term is required when applying unpreconditioned operator,

   // but note that dslashQuda() is always preconditioned

   if (!h_clover && !pc_solve && !pc_solution) {

     //warningQuda("Uninverted clover term not loaded");

   }


   // uninverted clover term is also required for "asymmetric" preconditioning

   if (!h_clover && pc_solve && pc_solution && asymmetric && !device_calc) {

     warningQuda("Uninverted clover term not loaded");

   }


   CloverFieldParam clover_param;

   CloverField *in=NULL, *inInv=NULL;


   if(!device_calc){

     // create a param for the cpu clover field

     profileClover.Start(QUDA_PROFILE_INIT);

     CloverFieldParam cpuParam;

     cpuParam.nDim = 4;

     for (int i=0; i<4; i++) cpuParam.x[i] = gaugePrecise->X()[i];

     cpuParam.precision = inv_param->clover_cpu_prec;

     cpuParam.order = inv_param->clover_order;

     cpuParam.direct = h_clover ? true : false;

     cpuParam.inverse = h_clovinv ? true : false;

     cpuParam.clover = h_clover;

     cpuParam.norm = 0;

     cpuParam.cloverInv = h_clovinv;

     cpuParam.invNorm = 0;

     cpuParam.create = QUDA_REFERENCE_FIELD_CREATE;

     cpuParam.siteSubset = QUDA_FULL_SITE_SUBSET;

     cpuParam.twisted = false;

     cpuParam.mu2 = 0.;


     if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {

       cpuParam.direct = true;

       cpuParam.inverse = false;

       cpuParam.cloverInv = NULL;

       cpuParam.clover = h_clover;

       in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ?

         static_cast<CloverField*>(new cpuCloverField(cpuParam)) :

         static_cast<CloverField*>(new cudaCloverField(cpuParam));


       cpuParam.cloverInv = h_clovinv;

       cpuParam.clover = NULL;

       cpuParam.twisted = true;

       cpuParam.direct = true;

       cpuParam.inverse = false;

       cpuParam.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu;


       inInv = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ?

         static_cast<CloverField*>(new cpuCloverField(cpuParam)) :

         static_cast<CloverField*>(new cudaCloverField(cpuParam));

     } else {

       in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ?

         static_cast<CloverField*>(new cpuCloverField(cpuParam)) :

         static_cast<CloverField*>(new cudaCloverField(cpuParam));

     }


     clover_param.nDim = 4;

     for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i];

     clover_param.setPrecision(inv_param->clover_cuda_prec);

     clover_param.pad = inv_param->cl_pad;

     clover_param.direct = h_clover ? true : false;

     clover_param.inverse = (h_clovinv || pc_solve) ? true : false;

     clover_param.create = QUDA_NULL_FIELD_CREATE;

     clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;


     if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {

       clover_param.direct = true;

       clover_param.inverse = false;

       cloverPrecise = new cudaCloverField(clover_param);

       clover_param.direct = false;

       clover_param.inverse = true;

       clover_param.twisted = true;

       cloverInvPrecise = new cudaCloverField(clover_param);

       clover_param.twisted = false;

     } else {

       cloverPrecise = new cudaCloverField(clover_param);

     }


     profileClover.Stop(QUDA_PROFILE_INIT);


     profileClover.Start(QUDA_PROFILE_H2D);

     if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {

       cloverPrecise->copy(*in, false);

       cloverInvPrecise->copy(*in, true);

       cloverInvert(*cloverInvPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION);

     } else {

       cloverPrecise->copy(*in, h_clovinv ? true : false);

     }


     profileClover.Stop(QUDA_PROFILE_H2D);

   } else {

     profileClover.Start(QUDA_PROFILE_COMPUTE);


     createCloverQuda(inv_param);


     if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {

       cloverInvert(*cloverInvPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION);

       if (inv_param->compute_clover_trlog) {

         inv_param->trlogA[0] = cloverInvPrecise->TrLog()[0];

         inv_param->trlogA[1] = cloverInvPrecise->TrLog()[1];

       }

     }

     profileClover.Stop(QUDA_PROFILE_COMPUTE);

   }


   // inverted clover term is required when applying preconditioned operator

   if ((!h_clovinv && pc_solve) && inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) {

     profileClover.Start(QUDA_PROFILE_COMPUTE);

     cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION);

     profileClover.Stop(QUDA_PROFILE_COMPUTE);

     if (inv_param->compute_clover_trlog) {

       inv_param->trlogA[0] = cloverPrecise->TrLog()[0];

       inv_param->trlogA[1] = cloverPrecise->TrLog()[1];

     }

   }


   if (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH)

     inv_param->cloverGiB = cloverPrecise->GBytes();

   else

     inv_param->cloverGiB = cloverPrecise->GBytes() + cloverInvPrecise->GBytes();


   clover_param.norm    = 0;

   clover_param.invNorm = 0;

   clover_param.mu2 = 0.;

   clover_param.nDim = 4;

   for(int dir=0; dir<4; ++dir) clover_param.x[dir] = gaugePrecise->X()[dir];

   clover_param.pad = inv_param->cl_pad;

   clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;

   clover_param.create = QUDA_NULL_FIELD_CREATE;

   clover_param.direct = true;

   clover_param.inverse = true;


   // create the mirror sloppy clover field

   if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) {

     profileClover.Start(QUDA_PROFILE_INIT);

     clover_param.setPrecision(inv_param->clover_cuda_prec_sloppy);


     if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {

       clover_param.direct = false;

       clover_param.inverse = true;

       clover_param.twisted = true;

       clover_param.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu;

       cloverInvSloppy = new cudaCloverField(clover_param);

       cloverInvSloppy->copy(*cloverInvPrecise, true);

       clover_param.direct = true;

       clover_param.inverse = false;

       clover_param.twisted = false;

       cloverSloppy = new cudaCloverField(clover_param);

       cloverSloppy->copy(*cloverPrecise);

       inv_param->cloverGiB += cloverSloppy->GBytes() + cloverInvSloppy->GBytes();

     } else {

       cloverSloppy = new cudaCloverField(clover_param);

       cloverSloppy->copy(*cloverPrecise);

       inv_param->cloverGiB += cloverSloppy->GBytes();

     }

     profileClover.Stop(QUDA_PROFILE_INIT);

   } else {

     cloverSloppy = cloverPrecise;

     if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)

       cloverInvSloppy = cloverInvPrecise;

   }


   // create the mirror preconditioner clover field

   if (inv_param->clover_cuda_prec_sloppy != inv_param->clover_cuda_prec_precondition &&

       inv_param->clover_cuda_prec_precondition != QUDA_INVALID_PRECISION) {

     profileClover.Start(QUDA_PROFILE_INIT);

     clover_param.setPrecision(inv_param->clover_cuda_prec_precondition);

     if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {

       clover_param.direct = true;

       clover_param.inverse = false;

       clover_param.twisted = false;

       cloverPrecondition = new cudaCloverField(clover_param);

       cloverPrecondition->copy(*cloverSloppy);

       clover_param.direct = false;

       clover_param.inverse = true;

       clover_param.twisted = true;

       cloverInvPrecondition = new cudaCloverField(clover_param);

       cloverInvPrecondition->copy(*cloverInvSloppy, true);

       inv_param->cloverGiB += cloverPrecondition->GBytes() + cloverInvPrecondition->GBytes();

     } else {

       cloverPrecondition = new cudaCloverField(clover_param);

       cloverPrecondition->copy(*cloverSloppy);

       inv_param->cloverGiB += cloverPrecondition->GBytes();

     }

     profileClover.Stop(QUDA_PROFILE_INIT);

   } else {

     cloverPrecondition = cloverSloppy;

     if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)

       cloverInvPrecondition = cloverInvSloppy;

   }


   // need to copy back the odd inverse field into the application clover field

   if (!h_clovinv && pc_solve && !device_calc) {

     // copy the inverted clover term into host application order on the device

     clover_param.setPrecision(inv_param->clover_cpu_prec);

     clover_param.direct = false;

     clover_param.inverse = true;

     clover_param.order = inv_param->clover_order;


     // this isn't really "epilogue" but this label suffices

     profileClover.Start(QUDA_PROFILE_EPILOGUE);

     cudaCloverField hack(clover_param);

     hack.copy(*cloverPrecise);

     profileClover.Stop(QUDA_PROFILE_EPILOGUE);


     // copy the odd components into the host application's clover field

     profileClover.Start(QUDA_PROFILE_D2H);

     cudaMemcpy((char*)(in->V(false))+in->Bytes()/2, (char*)(hack.V(true))+hack.Bytes()/2,

         in->Bytes()/2, cudaMemcpyDeviceToHost);

     profileClover.Stop(QUDA_PROFILE_D2H);


     checkCudaError();

   }


   if(!device_calc)

   {

     if (in) delete in; // delete object referencing input field

     if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inInv) delete inInv;

   }


   popVerbosity();


   profileClover.Stop(QUDA_PROFILE_TOTAL);

 }


 void freeGaugeQuda(void)

 {

   if (!initialized) errorQuda("QUDA not initialized");

   if (gaugeSloppy != gaugePrecondition && gaugePrecondition) delete gaugePrecondition;

   if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy;

   if (gaugePrecise) delete gaugePrecise;

   if (gaugeExtended) delete gaugeExtended;


   gaugePrecondition = NULL;

   gaugeSloppy = NULL;

   gaugePrecise = NULL;

   gaugeExtended = NULL;


   if (gaugeLongSloppy != gaugeLongPrecondition && gaugeLongPrecondition) delete gaugeLongPrecondition;

   if (gaugeLongPrecise != gaugeLongSloppy && gaugeLongSloppy) delete gaugeLongSloppy;

   if (gaugeLongPrecise) delete gaugeLongPrecise;

   if (gaugeLongExtended) delete gaugeLongExtended;


   gaugeLongPrecondition = NULL;

   gaugeLongSloppy = NULL;

   gaugeLongPrecise = NULL;

   gaugeLongExtended = NULL;


   if (gaugeFatSloppy != gaugeFatPrecondition && gaugeFatPrecondition) delete gaugeFatPrecondition;

   if (gaugeFatPrecise != gaugeFatSloppy && gaugeFatSloppy) delete gaugeFatSloppy;

   if (gaugeFatPrecise) delete gaugeFatPrecise;


   gaugeFatPrecondition = NULL;

   gaugeFatSloppy = NULL;

   gaugeFatPrecise = NULL;

   gaugeFatExtended = NULL;


   if (gaugeSmeared) delete gaugeSmeared;


   gaugeSmeared = NULL;

   // Need to merge extendedGaugeResident and gaugeFatPrecise/gaugePrecise

   if (extendedGaugeResident) {

     delete extendedGaugeResident;

     extendedGaugeResident = NULL;

   }

 }


 // just free the sloppy fields used in mixed-precision solvers

 void freeSloppyGaugeQuda(void)

 {

   if (!initialized) errorQuda("QUDA not initialized");

   if (gaugeSloppy != gaugePrecondition && gaugePrecondition) delete gaugePrecondition;

   if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy;


   gaugePrecondition = NULL;

   gaugeSloppy = NULL;


   if (gaugeLongSloppy != gaugeLongPrecondition && gaugeLongPrecondition) delete gaugeLongPrecondition;

   if (gaugeLongPrecise != gaugeLongSloppy && gaugeLongSloppy) delete gaugeLongSloppy;


   gaugeLongPrecondition = NULL;

   gaugeLongSloppy = NULL;


   if (gaugeFatSloppy != gaugeFatPrecondition && gaugeFatPrecondition) delete gaugeFatPrecondition;

   if (gaugeFatPrecise != gaugeFatSloppy && gaugeFatSloppy) delete gaugeFatSloppy;


   gaugeFatPrecondition = NULL;

   gaugeFatSloppy = NULL;

 }


 void freeCloverQuda(void)

 {

   if (!initialized) errorQuda("QUDA not initialized");

   if (cloverPrecondition != cloverSloppy && cloverPrecondition) delete cloverPrecondition;

   if (cloverSloppy != cloverPrecise && cloverSloppy) delete cloverSloppy;

   if (cloverPrecise) delete cloverPrecise;


   cloverPrecondition = NULL;

   cloverSloppy = NULL;

   cloverPrecise = NULL;


   if (cloverInvPrecise != NULL) {

      if (cloverInvPrecondition != cloverInvSloppy && cloverInvPrecondition) delete cloverInvPrecondition;

      if (cloverInvSloppy != cloverInvPrecise && cloverInvSloppy) delete cloverInvSloppy;

      if (cloverInvPrecise) delete cloverInvPrecise;


      cloverInvPrecondition = NULL;

      cloverInvSloppy = NULL;

      cloverInvPrecise = NULL;

   }

 }


 void endQuda(void)

 {

   profileEnd.Start(QUDA_PROFILE_TOTAL);


   if (!initialized) return;


   LatticeField::freeBuffer(0);

   LatticeField::freeBuffer(1);

   cudaColorSpinorField::freeBuffer(0);

   cudaColorSpinorField::freeBuffer(1);

   cudaColorSpinorField::freeGhostBuffer();

   cpuColorSpinorField::freeGhostBuffer();

   FaceBuffer::flushPinnedCache();

   freeGaugeQuda();

   freeCloverQuda();


   if(cudaStapleField) delete cudaStapleField; cudaStapleField=NULL;

   if(cudaStapleField1) delete cudaStapleField1; cudaStapleField1=NULL;


   if(solutionResident) delete solutionResident;

   if(momResident) delete momResident;


   endBlas();


   if (streams) {

     for (int i=0; i<Nstream; i++) cudaStreamDestroy(streams[i]);

     delete []streams;

     streams = NULL;

   }

   destroyDslashEvents();


 #ifdef GPU_STAGGERED_OPROD

   destroyStaggeredOprodEvents();

 #endif


   saveTuneCache(getVerbosity());


 #if (!defined(USE_QDPJIT) && !defined(GPU_COMMS))

   // end this CUDA context

   cudaDeviceReset();

 #endif


   initialized = false;


   comm_finalize();

   comms_initialized = false;


   profileEnd.Stop(QUDA_PROFILE_TOTAL);


   // print out the profile information of the lifetime of the library

   if (getVerbosity() >= QUDA_SUMMARIZE) {

     profileInit.Print();

     profileGauge.Print();

     profileCloverCreate.Print();

     profileClover.Print();

     profileInvert.Print();

     profileMulti.Print();

     profileMultiMixed.Print();

     profileFatLink.Print();

     profileGaugeForce.Print();

     profileGaugeUpdate.Print();

     profileExtendedGauge.Print();

     profileCloverDerivative.Print();

     profileCloverTrace.Print();

     profileStaggeredOprod.Print();

     profileAsqtadForce.Print();

     profileHISQForce.Print();

     profileContract.Print();

     profileCovDev.Print();

     profileEnd.Print();


     printLaunchTimer();


     printfQuda("\n");

     printPeakMemUsage();

     printfQuda("\n");

   }


   assertAllMemFree();

 }


 namespace quda {


   void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)

   {

     double kappa = inv_param->kappa;

     if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {

       kappa *= gaugePrecise->Anisotropy();

     }


     switch (inv_param->dslash_type) {

     case QUDA_WILSON_DSLASH:

       diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC;

       break;

     case QUDA_CLOVER_WILSON_DSLASH:

       diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC;

       break;

     case QUDA_DOMAIN_WALL_DSLASH:

       diracParam.type = pc ? QUDA_DOMAIN_WALLPC_DIRAC : QUDA_DOMAIN_WALL_DIRAC;

       diracParam.Ls = inv_param->Ls;

       break;

     case QUDA_DOMAIN_WALL_4D_DSLASH:

       if(pc) {

         diracParam.type = QUDA_DOMAIN_WALL_4DPC_DIRAC;

         diracParam.Ls = inv_param->Ls;

       } else errorQuda("For 4D type of DWF dslash, pc must be turned on, %d", inv_param->dslash_type);

       break;

     case QUDA_MOBIUS_DWF_DSLASH:

       if (inv_param->Ls > QUDA_MAX_DWF_LS)

         errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS);

       if(pc) {

         diracParam.type = QUDA_MOBIUS_DOMAIN_WALLPC_DIRAC;

         diracParam.Ls = inv_param->Ls;

         memcpy(diracParam.b_5, inv_param->b_5, sizeof(double)*inv_param->Ls);

         memcpy(diracParam.c_5, inv_param->c_5, sizeof(double)*inv_param->Ls);

       } else errorQuda("At currently, only preconditioned Mobius DWF is supported, %d", inv_param->dslash_type);

       break;

     case QUDA_STAGGERED_DSLASH:

       diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;

       break;

     case QUDA_ASQTAD_DSLASH:

       diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC;

       break;

     case QUDA_TWISTED_MASS_DSLASH:

       diracParam.type = pc ? QUDA_TWISTED_MASSPC_DIRAC : QUDA_TWISTED_MASS_DIRAC;

       if (inv_param->twist_flavor == QUDA_TWIST_MINUS || inv_param->twist_flavor == QUDA_TWIST_PLUS) {

         diracParam.Ls = 1;

         diracParam.epsilon = 0.0;

       } else {

         diracParam.Ls = 2;

         diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0;

       }

       break;

     case QUDA_TWISTED_CLOVER_DSLASH:

       diracParam.type = pc ? QUDA_TWISTED_CLOVERPC_DIRAC : QUDA_TWISTED_CLOVER_DIRAC;

       if (inv_param->twist_flavor == QUDA_TWIST_MINUS || inv_param->twist_flavor == QUDA_TWIST_PLUS)  {

         diracParam.Ls = 1;

         diracParam.epsilon = 0.0;

       } else {

         diracParam.Ls = 2;

         diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0;

       }

       break;

     default:

       errorQuda("Unsupported dslash_type %d", inv_param->dslash_type);

     }


     diracParam.matpcType = inv_param->matpc_type;

     diracParam.dagger = inv_param->dagger;

     diracParam.gauge = gaugePrecise;

     diracParam.fatGauge = gaugeFatPrecise;

     diracParam.longGauge = gaugeLongPrecise;

     diracParam.clover = cloverPrecise;

     diracParam.cloverInv = cloverInvPrecise;

     diracParam.kappa = kappa;

     diracParam.mass = inv_param->mass;

     diracParam.m5 = inv_param->m5;

     diracParam.mu = inv_param->mu;


     for (int i=0; i<4; i++) diracParam.commDim[i] = 1;   // comms are always on

   }


   void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)

   {

     setDiracParam(diracParam, inv_param, pc);


     diracParam.gauge = gaugeSloppy;

     diracParam.fatGauge = gaugeFatSloppy;

     diracParam.longGauge = gaugeLongSloppy;

     diracParam.clover = cloverSloppy;

     diracParam.cloverInv = cloverInvSloppy;


     for (int i=0; i<4; i++) {

       diracParam.commDim[i] = 1;   // comms are always on

     }


   }


   // The preconditioner currently mimicks the sloppy operator with no comms

   void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)

   {

     setDiracParam(diracParam, inv_param, pc);


     if(inv_param->overlap){

       diracParam.gauge = gaugeExtended;

       diracParam.fatGauge = gaugeFatExtended;

       diracParam.longGauge = gaugeLongExtended;

     }else{

       diracParam.gauge = gaugePrecondition;

       diracParam.fatGauge = gaugeFatPrecondition;

       diracParam.longGauge = gaugeLongPrecondition;

     }

     diracParam.clover = cloverPrecondition;

     diracParam.cloverInv = cloverInvPrecondition;


     for (int i=0; i<4; i++) {

       diracParam.commDim[i] = 0; // comms are always off

     }


     // In the preconditioned staggered CG allow a different dlsash type in the preconditioning

     if(inv_param->inv_type == QUDA_PCG_INVERTER && inv_param->dslash_type == QUDA_ASQTAD_DSLASH

        && inv_param->dslash_type_precondition == QUDA_STAGGERED_DSLASH) {

        diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;

        diracParam.gauge = gaugeFatPrecondition;

     }

   }


   void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)

   {

     DiracParam diracParam;

     DiracParam diracSloppyParam;

     DiracParam diracPreParam;


     setDiracParam(diracParam, &param, pc_solve);

     setDiracSloppyParam(diracSloppyParam, &param, pc_solve);

     setDiracPreParam(diracPreParam, &param, pc_solve);


     d = Dirac::create(diracParam); // create the Dirac operator

     dSloppy = Dirac::create(diracSloppyParam);

     dPre = Dirac::create(diracPreParam);

   }


   static double unscaled_shifts[QUDA_MAX_MULTI_SHIFT];


   void massRescale(cudaColorSpinorField &b, QudaInvertParam &param) {


     double kappa5 = (0.5/(5.0 + param.m5));

     double kappa = (param.dslash_type == QUDA_DOMAIN_WALL_DSLASH ||

                     param.dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||

                     param.dslash_type == QUDA_MOBIUS_DWF_DSLASH) ? kappa5 : param.kappa;


     if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {

       printfQuda("Mass rescale: Kappa is: %g\n", kappa);

       printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);

       double nin = norm2(b);

       printfQuda("Mass rescale: norm of source in = %g\n", nin);

     }


     // staggered dslash uses mass normalization internally

     if (param.dslash_type == QUDA_ASQTAD_DSLASH || param.dslash_type == QUDA_STAGGERED_DSLASH) {

       switch (param.solution_type) {

         case QUDA_MAT_SOLUTION:

         case QUDA_MATPC_SOLUTION:

           if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) axCuda(2.0*param.mass, b);

           break;

         case QUDA_MATDAG_MAT_SOLUTION:

         case QUDA_MATPCDAG_MATPC_SOLUTION:

           if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) axCuda(4.0*param.mass*param.mass, b);

           break;

         default:

           errorQuda("Not implemented");

       }

       return;

     }


     for(int i=0; i<param.num_offset; i++) {

       unscaled_shifts[i] = param.offset[i];

     }


     // multiply the source to compensate for normalization of the Dirac operator, if necessary

     switch (param.solution_type) {

       case QUDA_MAT_SOLUTION:

         if (param.mass_normalization == QUDA_MASS_NORMALIZATION ||

             param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {

           axCuda(2.0*kappa, b);

           for(int i=0; i<param.num_offset; i++)  param.offset[i] *= 2.0*kappa;

         }

         break;

       case QUDA_MATDAG_MAT_SOLUTION:

         if (param.mass_normalization == QUDA_MASS_NORMALIZATION ||

             param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {

           axCuda(4.0*kappa*kappa, b);

           for(int i=0; i<param.num_offset; i++)  param.offset[i] *= 4.0*kappa*kappa;

         }

         break;

       case QUDA_MATPC_SOLUTION:

         if (param.mass_normalization == QUDA_MASS_NORMALIZATION) {

           axCuda(4.0*kappa*kappa, b);

           for(int i=0; i<param.num_offset; i++)  param.offset[i] *= 4.0*kappa*kappa;

         } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {

           axCuda(2.0*kappa, b);

           for(int i=0; i<param.num_offset; i++)  param.offset[i] *= 2.0*kappa;

         }

         break;

       case QUDA_MATPCDAG_MATPC_SOLUTION:

         if (param.mass_normalization == QUDA_MASS_NORMALIZATION) {

           axCuda(16.0*pow(kappa,4), b);

           for(int i=0; i<param.num_offset; i++)  param.offset[i] *= 16.0*pow(kappa,4);

         } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {

           axCuda(4.0*kappa*kappa, b);

           for(int i=0; i<param.num_offset; i++)  param.offset[i] *= 4.0*kappa*kappa;

         }

         break;

       default:

         errorQuda("Solution type %d not supported", param.solution_type);

     }


     if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n");

     if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {

       printfQuda("Mass rescale: Kappa is: %g\n", kappa);

       printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);

       double nin = norm2(b);

       printfQuda("Mass rescale: norm of source out = %g\n", nin);

     }


   }

 }


 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)

 {

   if (inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||

       inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||

       inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);


   if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");

   if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))

     errorQuda("Clover field not allocated");

   if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)

     errorQuda("Clover field not allocated");


   pushVerbosity(inv_param->verbosity);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);


   ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), 1);


   ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   ColorSpinorParam cudaParam(cpuParam, *inv_param);

   cudaColorSpinorField in(*in_h, cudaParam);


   if (getVerbosity() >= QUDA_VERBOSE) {

     double cpu = norm2(*in_h);

     double gpu = norm2(in);

     printfQuda("In CPU %e CUDA %e\n", cpu, gpu);

   }


   if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION &&

       (inv_param->dslash_type == QUDA_STAGGERED_DSLASH ||

        inv_param->dslash_type == QUDA_ASQTAD_DSLASH) )

     axCuda(1.0/(2.0*inv_param->mass), in);


   cudaParam.create = QUDA_NULL_FIELD_CREATE;

   cudaColorSpinorField out(in, cudaParam);


   if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {

     if (parity == QUDA_EVEN_PARITY) {

       parity = QUDA_ODD_PARITY;

     } else {

       parity = QUDA_EVEN_PARITY;

     }

     axCuda(gaugePrecise->Anisotropy(), in);

   }

   bool pc = true;


   DiracParam diracParam;

   setDiracParam(diracParam, inv_param, pc);


   Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator

   if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inv_param->dagger) {

     cudaParam.create = QUDA_NULL_FIELD_CREATE;

     cudaColorSpinorField tmp1(in, cudaParam);

     ((DiracTwistedCloverPC*) dirac)->TwistCloverInv(tmp1, in, (parity+1)%2); // apply the clover-twist

     dirac->Dslash(out, tmp1, parity); // apply the operator

   } else {

     dirac->Dslash(out, in, parity); // apply the operator

   }


   delete dirac; // clean up


   cpuParam.v = h_out;


   ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

   *out_h = out;


   if (getVerbosity() >= QUDA_VERBOSE) {

     double cpu = norm2(*out_h);

     double gpu = norm2(out);

     printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);

   }


   delete out_h;

   delete in_h;


   popVerbosity();

 }


 void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)

 {

   if (inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH )

     setKernelPackT(true);

   else

     errorQuda("This type of dslashQuda operator is defined for QUDA_DOMAIN_WALL_$D_DSLASH and QUDA_MOBIUS_DWF_DSLASH only");


   if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");


   pushVerbosity(inv_param->verbosity);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);


   ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), 1);


   ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   ColorSpinorParam cudaParam(cpuParam, *inv_param);

   cudaColorSpinorField in(*in_h, cudaParam);


   if (getVerbosity() >= QUDA_VERBOSE) {

     double cpu = norm2(*in_h);

     double gpu = norm2(in);

     printfQuda("In CPU %e CUDA %e\n", cpu, gpu);

   }


   cudaParam.create = QUDA_NULL_FIELD_CREATE;

   cudaColorSpinorField out(in, cudaParam);


   if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {

     if (parity == QUDA_EVEN_PARITY) {

       parity = QUDA_ODD_PARITY;

     } else {

       parity = QUDA_EVEN_PARITY;

     }

     axCuda(gaugePrecise->Anisotropy(), in);

   }

   bool pc = true;


   DiracParam diracParam;

   setDiracParam(diracParam, inv_param, pc);


   DiracDomainWall4DPC dirac(diracParam); // create the Dirac operator

   printfQuda("kappa for QUDA input : %e\n",inv_param->kappa);

   switch (test_type) {

     case 0:

       dirac.Dslash4(out, in, parity);

       break;

     case 1:

       dirac.Dslash5(out, in, parity);

       break;

     case 2:

       dirac.Dslash5inv(out, in, parity, inv_param->kappa);

       break;

   }


   cpuParam.v = h_out;


   ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

   *out_h = out;


   if (getVerbosity() >= QUDA_VERBOSE) {

     double cpu = norm2(*out_h);

     double gpu = norm2(out);

     printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);

   }


   delete out_h;

   delete in_h;


   popVerbosity();

 }


 void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)

 {

   if ( inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH)

     setKernelPackT(true);

   else

     errorQuda("This type of dslashQuda operator is defined for QUDA_DOMAIN_WALL_$D_DSLASH and QUDA_MOBIUS_DWF_DSLASH only");


   if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");


   pushVerbosity(inv_param->verbosity);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);


   ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), 1);


   ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   ColorSpinorParam cudaParam(cpuParam, *inv_param);

   cudaColorSpinorField in(*in_h, cudaParam);


   if (getVerbosity() >= QUDA_VERBOSE) {

     double cpu = norm2(*in_h);

     double gpu = norm2(in);

     printfQuda("In CPU %e CUDA %e\n", cpu, gpu);

   }


   cudaParam.create = QUDA_NULL_FIELD_CREATE;

   cudaColorSpinorField out(in, cudaParam);


   if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {

     if (parity == QUDA_EVEN_PARITY) {

       parity = QUDA_ODD_PARITY;

     } else {

       parity = QUDA_EVEN_PARITY;

     }

     axCuda(gaugePrecise->Anisotropy(), in);

   }

   bool pc = true;


   DiracParam diracParam;

   setDiracParam(diracParam, inv_param, pc);


   DiracMobiusDomainWallPC dirac(diracParam); // create the Dirac operator

   double kappa5 = 0.0;  // Kappa5 is dummy argument

   switch (test_type) {

     case 0:

       dirac.Dslash4(out, in, parity);

       break;

     case 1:

       dirac.Dslash5(out, in, parity);

       break;

     case 2:

       dirac.Dslash4pre(out, in, parity);

       break;

     case 3:

       dirac.Dslash5inv(out, in, parity, kappa5);

       break;

   }


   cpuParam.v = h_out;


   ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

   *out_h = out;


   if (getVerbosity() >= QUDA_VERBOSE) {

     double cpu = norm2(*out_h);

     double gpu = norm2(out);

     printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);

   }


   delete out_h;

   delete in_h;


   popVerbosity();

 }


 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)

 {

   pushVerbosity(inv_param->verbosity);


   if (inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||

       inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||

       inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);


   if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");

   if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))

     errorQuda("Clover field not allocated");

   if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)

     errorQuda("Clover field not allocated");

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);


   bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||

       inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);


   ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), pc);

   ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   ColorSpinorParam cudaParam(cpuParam, *inv_param);

   cudaColorSpinorField in(*in_h, cudaParam);


   if (getVerbosity() >= QUDA_VERBOSE) {

     double cpu = norm2(*in_h);

     double gpu = norm2(in);

     printfQuda("In CPU %e CUDA %e\n", cpu, gpu);

   }


   cudaParam.create = QUDA_NULL_FIELD_CREATE;

   cudaColorSpinorField out(in, cudaParam);


   DiracParam diracParam;

   setDiracParam(diracParam, inv_param, pc);


   Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator

   dirac->M(out, in); // apply the operator

   delete dirac; // clean up


   double kappa = inv_param->kappa;

   if (pc) {

     if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) {

       axCuda(0.25/(kappa*kappa), out);

     } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {

       axCuda(0.5/kappa, out);

     }

   } else {

     if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION ||

         inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {

       axCuda(0.5/kappa, out);

     }

   }


   cpuParam.v = h_out;


   ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

   *out_h = out;


   if (getVerbosity() >= QUDA_VERBOSE) {

     double cpu = norm2(*out_h);

     double gpu = norm2(out);

     printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);

   }


   delete out_h;

   delete in_h;


   popVerbosity();

 }


 void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)

 {

   pushVerbosity(inv_param->verbosity);


   if (inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||

       inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||

       inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);


   if (!initialized) errorQuda("QUDA not initialized");

   if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");

   if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))

     errorQuda("Clover field not allocated");

   if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)

     errorQuda("Clover field not allocated");

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);


   bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||

       inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);


   ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), pc);

   ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   ColorSpinorParam cudaParam(cpuParam, *inv_param);

   cudaColorSpinorField in(*in_h, cudaParam);


   if (getVerbosity() >= QUDA_VERBOSE){

     double cpu = norm2(*in_h);

     double gpu = norm2(in);

     printfQuda("In CPU %e CUDA %e\n", cpu, gpu);

   }


   cudaParam.create = QUDA_NULL_FIELD_CREATE;

   cudaColorSpinorField out(in, cudaParam);


   //  double kappa = inv_param->kappa;

   //  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= gaugePrecise->anisotropy;


   DiracParam diracParam;

   setDiracParam(diracParam, inv_param, pc);


   Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator

   dirac->MdagM(out, in); // apply the operator

   delete dirac; // clean up


   double kappa = inv_param->kappa;

   if (pc) {

     if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) {

       axCuda(1.0/pow(2.0*kappa,4), out);

     } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {

       axCuda(0.25/(kappa*kappa), out);

     }

   } else {

     if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION ||

         inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {

       axCuda(0.25/(kappa*kappa), out);

     }

   }


   cpuParam.v = h_out;


   ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

   *out_h = out;


   if (getVerbosity() >= QUDA_VERBOSE){

     double cpu = norm2(*out_h);

     double gpu = norm2(out);

     printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);

   }


   delete out_h;

   delete in_h;


   popVerbosity();

 }


 quda::cudaGaugeField* checkGauge(QudaInvertParam *param) {

   quda::cudaGaugeField *cudaGauge = NULL;

   if (param->dslash_type != QUDA_ASQTAD_DSLASH) {

     if (gaugePrecise == NULL) errorQuda("Precise gauge field doesn't exist");

     if (gaugeSloppy == NULL) errorQuda("Sloppy gauge field doesn't exist");

     if (gaugePrecondition == NULL) errorQuda("Precondition gauge field doesn't exist");

     if(param->overlap){

       if(gaugeExtended == NULL) errorQuda("Extended gauge field doesn't exist");

     }

     cudaGauge = gaugePrecise;

   } else {

     if (gaugeFatPrecise == NULL) errorQuda("Precise gauge fat field doesn't exist");

     if (gaugeFatSloppy == NULL) errorQuda("Sloppy gauge fat field doesn't exist");

     if (gaugeFatPrecondition == NULL) errorQuda("Precondition gauge fat field doesn't exist");

     if(param->overlap){

       if(gaugeFatExtended == NULL) errorQuda("Extended gauge fat field doesn't exist");

     }


     if (gaugeLongPrecise == NULL) errorQuda("Precise gauge long field doesn't exist");

     if (gaugeLongSloppy == NULL) errorQuda("Sloppy gauge long field doesn't exist");

     if (gaugeLongPrecondition == NULL) errorQuda("Precondition gauge long field doesn't exist");

     if(param->overlap){

       if(gaugeLongExtended == NULL) errorQuda("Extended gauge long field doesn't exist");

     }

     cudaGauge = gaugeFatPrecise;

   }

   return cudaGauge;

 }


 void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)

 {

   pushVerbosity(inv_param->verbosity);


   if (!initialized) errorQuda("QUDA not initialized");

   if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");

   if (cloverPrecise == NULL) errorQuda("Clover field not allocated");


   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);


   if ((inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) && (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH))

     errorQuda("Cannot apply the clover term for a non Wilson-clover or Twisted-mass-clover dslash");


   ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), 1);


   ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   ColorSpinorParam cudaParam(cpuParam, *inv_param);

   cudaColorSpinorField in(*in_h, cudaParam);


   if (getVerbosity() >= QUDA_VERBOSE) {

     double cpu = norm2(*in_h);

     double gpu = norm2(in);

     printfQuda("In CPU %e CUDA %e\n", cpu, gpu);

   }


   cudaParam.create = QUDA_NULL_FIELD_CREATE;

   cudaColorSpinorField out(in, cudaParam);


   if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {

     if (parity == QUDA_EVEN_PARITY) {

       parity = QUDA_ODD_PARITY;

     } else {

       parity = QUDA_EVEN_PARITY;

     }

     axCuda(gaugePrecise->Anisotropy(), in);

   }

   bool pc = true;


   DiracParam diracParam;

   setDiracParam(diracParam, inv_param, pc);

         //FIXME: Do we need this for twisted clover???

   DiracCloverPC dirac(diracParam); // create the Dirac operator

   if (!inverse) dirac.Clover(out, in, parity); // apply the clover operator

   else dirac.CloverInv(out, in, parity);


   cpuParam.v = h_out;


   ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

   *out_h = out;


   if (getVerbosity() >= QUDA_VERBOSE) {

     double cpu = norm2(*out_h);

     double gpu = norm2(out);

     printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);

   }


   /*for (int i=0; i<in_h->Volume(); i++) {

     ((cpuColorSpinorField*)out_h)->PrintVector(i);

     }*/


   delete out_h;

   delete in_h;


   popVerbosity();

 }


 void lanczosQuda(int k0, int m, void *hp_Apsi, void *hp_r, void *hp_V,

                  void *hp_alpha, void *hp_beta, QudaEigParam *eig_param)

 {

   QudaInvertParam *param;

   param = eig_param->invert_param;

   setTuning(param->tune);


   if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||

       param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||

       param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);

   if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");


   profileInvert.Start(QUDA_PROFILE_TOTAL);


   if (!initialized) errorQuda("QUDA not initialized");


   pushVerbosity(param->verbosity);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(param);


   // check the gauge fields have been created

   cudaGaugeField *cudaGauge = checkGauge(param);


   checkInvertParam(param);

   checkEigParam(eig_param);


   bool pc_solution = (param->solution_type == QUDA_MATPC_DAG_SOLUTION) ||

                      (param->solution_type == QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION);


   // create the dirac operator

   DiracParam diracParam;

   setDiracParam(diracParam, param, pc_solution);

   Dirac *d = Dirac::create(diracParam); // create the Dirac operator


   Dirac &dirac = *d;


   profileInvert.Start(QUDA_PROFILE_H2D);


   cudaColorSpinorField *r = NULL;

   cudaColorSpinorField *Apsi = NULL;

   const int *X = cudaGauge->X();


   // wrap CPU host side pointers

   ColorSpinorParam cpuParam(hp_r, *param, X, pc_solution);

   ColorSpinorField *h_r = (param->input_location == QUDA_CPU_FIELD_LOCATION) ?

                           static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

                           static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   cpuParam.v = hp_Apsi;

   ColorSpinorField *h_Apsi = (param->input_location == QUDA_CPU_FIELD_LOCATION) ?

                              static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

                              static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   //Make Eigen vector data set

   cpuColorSpinorField **h_Eig_Vec;

   h_Eig_Vec =(cpuColorSpinorField **)safe_malloc( m*sizeof(cpuColorSpinorField*));

   for( int k = 0 ; k < m ; k++)

   {

     cpuParam.v = ((double**)hp_V)[k];

     h_Eig_Vec[k] = new cpuColorSpinorField(cpuParam);

   }


   // download source

   ColorSpinorParam cudaParam(cpuParam, *param);

   cudaParam.create = QUDA_COPY_FIELD_CREATE;

   r = new cudaColorSpinorField(*h_r, cudaParam);

   Apsi = new cudaColorSpinorField(*h_Apsi, cudaParam);


   double cpu;

   double gpu;


   if (getVerbosity() >= QUDA_VERBOSE) {

     cpu = norm2(*h_r);

     gpu = norm2(*r);

     printfQuda("r vector CPU %1.14e CUDA %1.14e\n", cpu, gpu);

     cpu = norm2(*h_Apsi);

     gpu = norm2(*Apsi);

     printfQuda("Apsi vector CPU %1.14e CUDA %1.14e\n", cpu, gpu);

   }


   // download Eigen vector set

   cudaColorSpinorField **Eig_Vec;

   Eig_Vec = (cudaColorSpinorField **)safe_malloc( m*sizeof(cudaColorSpinorField*));


   for( int k = 0 ; k < m ; k++)

   {

     Eig_Vec[k] = new cudaColorSpinorField(*h_Eig_Vec[k], cudaParam);

     if (getVerbosity() >= QUDA_VERBOSE) {

       cpu = norm2(*h_Eig_Vec[k]);

       gpu = norm2(*Eig_Vec[k]);

       printfQuda("Eig_Vec[%d] CPU %1.14e CUDA %1.14e\n", k, cpu, gpu);

     }

   }

   profileInvert.Stop(QUDA_PROFILE_H2D);


   if(eig_param->RitzMat_lanczos == QUDA_MATPC_DAG_SOLUTION)

   {

     DiracMdag mat(dirac);

     RitzMat ritz_mat(mat,*eig_param);

     Eig_Solver *eig_solve = Eig_Solver::create(*eig_param, ritz_mat, profileInvert);

     (*eig_solve)((double*)hp_alpha, (double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);

     delete eig_solve;

   }

   else if(eig_param->RitzMat_lanczos == QUDA_MATPCDAG_MATPC_SOLUTION)

   {

     DiracMdagM mat(dirac);

     RitzMat ritz_mat(mat,*eig_param);

     Eig_Solver *eig_solve = Eig_Solver::create(*eig_param, ritz_mat, profileInvert);

     (*eig_solve)((double*)hp_alpha, (double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);

     delete eig_solve;

   }

   else if(eig_param->RitzMat_lanczos == QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION)

   {

     DiracMdagM mat(dirac);

     RitzMat ritz_mat(mat,*eig_param);

     Eig_Solver *eig_solve = Eig_Solver::create(*eig_param, ritz_mat, profileInvert);

     (*eig_solve)((double*)hp_alpha, (double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);

     delete eig_solve;

   }

   else

   {

     errorQuda("invalid ritz matrix type\n");

     exit(0);

   }


   //Write back calculated eigen vector

   profileInvert.Start(QUDA_PROFILE_D2H);

   for( int k = 0 ; k < m ; k++)

   {

     *h_Eig_Vec[k] = *Eig_Vec[k];

   }

   *h_r = *r;

   *h_Apsi = *Apsi;

   profileInvert.Stop(QUDA_PROFILE_D2H);


   delete h_r;

   delete h_Apsi;

   for( int k = 0 ; k < m ; k++)

   {

     delete Eig_Vec[k];

     delete h_Eig_Vec[k];

   }

   host_free(Eig_Vec);

   host_free(h_Eig_Vec);


   delete d;


   popVerbosity();


   saveTuneCache(getVerbosity());

   profileInvert.Stop(QUDA_PROFILE_TOTAL);

 }


 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)

 {

   setTuning(param->tune);


   if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||

       param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||

       param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);


   profileInvert.Start(QUDA_PROFILE_TOTAL);


   if (!initialized) errorQuda("QUDA not initialized");


   pushVerbosity(param->verbosity);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(param);


   // check the gauge fields have been created

   cudaGaugeField *cudaGauge = checkGauge(param);


   checkInvertParam(param);


   // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in

   // solve_type and solution_type, rather than in separate members of QudaInvertParam.  We're stuck with it

   // for now, though, so here we factorize everything for convenience.


   bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||

     (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);

   bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||

     (param->solve_type == QUDA_NORMOP_PC_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE);

   bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||

     (param->solution_type ==  QUDA_MATPC_SOLUTION);

   bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||

     (param->solve_type == QUDA_DIRECT_PC_SOLVE);

   bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||

     (param->solve_type == QUDA_NORMERR_PC_SOLVE);


   param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;

   if (!pc_solve) param->spinorGiB *= 2;

   param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float));

   if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) {

     param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(double)(1<<30);

   } else {

     param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 8 : 9)/(double)(1<<30);

   }


   param->secs = 0;

   param->gflops = 0;

   param->iter = 0;


   Dirac *d = NULL;

   Dirac *dSloppy = NULL;

   Dirac *dPre = NULL;


   // create the dirac operator

   createDirac(d, dSloppy, dPre, *param, pc_solve);


   Dirac &dirac = *d;

   Dirac &diracSloppy = *dSloppy;

   Dirac &diracPre = *dPre;


   profileInvert.Start(QUDA_PROFILE_H2D);


   cudaColorSpinorField *b = NULL;

   cudaColorSpinorField *x = NULL;

   cudaColorSpinorField *in = NULL;

   cudaColorSpinorField *out = NULL;


   const int *X = cudaGauge->X();


   // wrap CPU host side pointers

   ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution);

   ColorSpinorField *h_b = (param->input_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   cpuParam.v = hp_x;

   ColorSpinorField *h_x = (param->output_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   // download source

   ColorSpinorParam cudaParam(cpuParam, *param);

   cudaParam.create = QUDA_COPY_FIELD_CREATE;

   b = new cudaColorSpinorField(*h_b, cudaParam);


   if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess

     // initial guess only supported for single-pass solvers

     if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) &&

         (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {

       errorQuda("Initial guess not supported for two-pass solver");

     }


     x = new cudaColorSpinorField(*h_x, cudaParam); // solution

   } else { // zero initial guess

     cudaParam.create = QUDA_ZERO_FIELD_CREATE;

     x = new cudaColorSpinorField(cudaParam); // solution

   }


   profileInvert.Stop(QUDA_PROFILE_H2D);


   double nb = norm2(*b);

   if (nb==0.0) errorQuda("Source has zero norm");


   if (getVerbosity() >= QUDA_VERBOSE) {

     double nh_b = norm2(*h_b);

     double nh_x = norm2(*h_x);

     double nx = norm2(*x);

     printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);

     printfQuda("Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);

   }


   // rescale the source and solution vectors to help prevent the onset of underflow

   if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {

     axCuda(1.0/sqrt(nb), *b);

     axCuda(1.0/sqrt(nb), *x);

   }


   massRescale(*b, *param);


   dirac.prepare(in, out, *x, *b, param->solution_type);

   if (getVerbosity() >= QUDA_VERBOSE) {

     double nin = norm2(*in);

     double nout = norm2(*out);

     printfQuda("Prepared source = %g\n", nin);

     printfQuda("Prepared solution = %g\n", nout);

   }


   if (getVerbosity() >= QUDA_VERBOSE) {

     double nin = norm2(*in);

     printfQuda("Prepared source post mass rescale = %g\n", nin);

   }


   // solution_type specifies *what* system is to be solved.

   // solve_type specifies *how* the system is to be solved.

   //

   // We have the following four cases (plus preconditioned variants):

   //

   // solution_type    solve_type    Effect

   // -------------    ----------    ------

   // MAT              DIRECT        Solve Ax=b

   // MATDAG_MAT       DIRECT        Solve A^dag y = b, followed by Ax=y

   // MAT              NORMOP        Solve (A^dag A) x = (A^dag b)

   // MATDAG_MAT       NORMOP        Solve (A^dag A) x = b

   // MAT              NORMERR       Solve (A A^dag) y = b, then x = A^dag y

   //

   // We generally require that the solution_type and solve_type

   // preconditioning match.  As an exception, the unpreconditioned MAT

   // solution_type may be used with any solve_type, including

   // DIRECT_PC and NORMOP_PC.  In these cases, preparation of the

   // preconditioned source and reconstruction of the full solution are

   // taken care of by Dirac::prepare() and Dirac::reconstruct(),

   // respectively.


   if (pc_solution && !pc_solve) {

     errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");

   }


   if (!mat_solution && !pc_solution && pc_solve) {

     errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");

   }


   if (!mat_solution && norm_error_solve) {

     errorQuda("Normal-error solve requires Mat solution");

   }


   if (mat_solution && !direct_solve && !norm_error_solve) { // prepare source: b' = A^dag b

     cudaColorSpinorField tmp(*in);

     dirac.Mdag(*in, tmp);

   } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b

     DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);

     SolverParam solverParam(*param);

     Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);

     (*solve)(*out, *in);

     copyCuda(*in, *out);

     solverParam.updateInvertParam(*param);

     delete solve;

   }


   if (direct_solve) {

     DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);

     SolverParam solverParam(*param);

     Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);

     (*solve)(*out, *in);

     solverParam.updateInvertParam(*param);

     delete solve;

   } else if (!norm_error_solve) {

     DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);

     SolverParam solverParam(*param);

     Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);

     (*solve)(*out, *in);

     solverParam.updateInvertParam(*param);

     delete solve;

   } else { // norm_error_solve

     DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);

     cudaColorSpinorField tmp(*out);

     SolverParam solverParam(*param);

     Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);

     (*solve)(tmp, *in); // y = (M M^\dag) b

     dirac.Mdag(*out, tmp);  // x = M^dag y

     solverParam.updateInvertParam(*param);

     delete solve;

   }


   if (getVerbosity() >= QUDA_VERBOSE){

     double nx = norm2(*x);

     printfQuda("Solution = %g\n",nx);

   }

   dirac.reconstruct(*x, *b, param->solution_type);


   if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {

     // rescale the solution

     axCuda(sqrt(nb), *x);

   }


   profileInvert.Start(QUDA_PROFILE_D2H);

   *h_x = *x;

   profileInvert.Stop(QUDA_PROFILE_D2H);


   if (getVerbosity() >= QUDA_VERBOSE){

     double nx = norm2(*x);

     double nh_x = norm2(*h_x);

     printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);

   }


   delete h_b;

   delete h_x;

   delete b;

   delete x;


   delete d;

   delete dSloppy;

   delete dPre;


   popVerbosity();


   // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted

   saveTuneCache(getVerbosity());


   profileInvert.Stop(QUDA_PROFILE_TOTAL);

 }


 void invertMDQuda(void *hp_x, void *hp_b, QudaInvertParam *param)

 {

   setTuning(param->tune);


   if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH) setKernelPackT(true);


   profileInvert.Start(QUDA_PROFILE_TOTAL);


   if (!initialized) errorQuda("QUDA not initialized");


   pushVerbosity(param->verbosity);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(param);


   // check the gauge fields have been created

   cudaGaugeField *cudaGauge = checkGauge(param);


   checkInvertParam(param);


   // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in

   // solve_type and solution_type, rather than in separate members of QudaInvertParam.  We're stuck with it

   // for now, though, so here we factorize everything for convenience.


   bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||

     (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);

   bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||

     (param->solve_type == QUDA_NORMOP_PC_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE);

   bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||

     (param->solution_type ==  QUDA_MATPC_SOLUTION);

   bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||

     (param->solve_type == QUDA_DIRECT_PC_SOLVE);

   bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||

     (param->solve_type == QUDA_NORMERR_PC_SOLVE);


   param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;

   if (!pc_solve) param->spinorGiB *= 2;

   param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float));

   if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) {

     param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(double)(1<<30);

   } else {

     param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 8 : 9)/(double)(1<<30);

   }


   param->secs = 0;

   param->gflops = 0;

   param->iter = 0;


   Dirac *d = NULL;

   Dirac *dSloppy = NULL;

   Dirac *dPre = NULL;


   // create the dirac operator

   createDirac(d, dSloppy, dPre, *param, pc_solve);


   Dirac &dirac = *d;

   Dirac &diracSloppy = *dSloppy;

   Dirac &diracPre = *dPre;


   profileInvert.Start(QUDA_PROFILE_H2D);


   cudaColorSpinorField *b = NULL;

   cudaColorSpinorField *x = NULL;

   cudaColorSpinorField *in = NULL;

   cudaColorSpinorField *out = NULL;


   const int *X = cudaGauge->X();


   // wrap CPU host side pointers

   ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution);

   ColorSpinorField *h_b = (param->input_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   cpuParam.v = hp_x;

   ColorSpinorField *h_x = (param->output_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   // download source

   ColorSpinorParam cudaParam(cpuParam, *param);

   cudaParam.create = QUDA_COPY_FIELD_CREATE;

   b = new cudaColorSpinorField(*h_b, cudaParam);


   if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess

     // initial guess only supported for single-pass solvers

     if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) &&

         (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {

       errorQuda("Initial guess not supported for two-pass solver");

     }


     x = new cudaColorSpinorField(*h_x, cudaParam); // solution

   } else { // zero initial guess

     cudaParam.create = QUDA_ZERO_FIELD_CREATE;

     x = new cudaColorSpinorField(cudaParam); // solution

   }


   profileInvert.Stop(QUDA_PROFILE_H2D);


   double nb = norm2(*b);

   if (nb==0.0) errorQuda("Source has zero norm");


   if (getVerbosity() >= QUDA_VERBOSE) {

     double nh_b = norm2(*h_b);

     double nh_x = norm2(*h_x);

     double nx = norm2(*x);

     printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);

     printfQuda("Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);

   }


   // rescale the source and solution vectors to help prevent the onset of underflow

   if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {

     axCuda(1.0/sqrt(nb), *b);

     axCuda(1.0/sqrt(nb), *x);

   }


   massRescale(*b, *param);


   dirac.prepare(in, out, *x, *b, param->solution_type);

   if (getVerbosity() >= QUDA_VERBOSE) {

     double nin = norm2(*in);

     double nout = norm2(*out);

     printfQuda("Prepared source = %g\n", nin);

     printfQuda("Prepared solution = %g\n", nout);

   }


   if (getVerbosity() >= QUDA_VERBOSE) {

     double nin = norm2(*in);

     printfQuda("Prepared source post mass rescale = %g\n", nin);

   }


   // solution_type specifies *what* system is to be solved.

   // solve_type specifies *how* the system is to be solved.

   //

   // We have the following four cases (plus preconditioned variants):

   //

   // solution_type    solve_type    Effect

   // -------------    ----------    ------

   // MAT              DIRECT        Solve Ax=b

   // MATDAG_MAT       DIRECT        Solve A^dag y = b, followed by Ax=y

   // MAT              NORMOP        Solve (A^dag A) x = (A^dag b)

   // MATDAG_MAT       NORMOP        Solve (A^dag A) x = b

   // MAT              NORMERR       Solve (A A^dag) y = b, then x = A^dag y

   //

   // We generally require that the solution_type and solve_type

   // preconditioning match.  As an exception, the unpreconditioned MAT

   // solution_type may be used with any solve_type, including

   // DIRECT_PC and NORMOP_PC.  In these cases, preparation of the

   // preconditioned source and reconstruction of the full solution are

   // taken care of by Dirac::prepare() and Dirac::reconstruct(),

   // respectively.


   if (pc_solution && !pc_solve) {

     errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");

   }


   if (!mat_solution && !pc_solution && pc_solve) {

     errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");

   }


   if (!mat_solution && norm_error_solve) {

     errorQuda("Normal-error solve requires Mat solution");

   }


   if (mat_solution && !direct_solve && !norm_error_solve) { // prepare source: b' = A^dag b

     cudaColorSpinorField tmp(*in);

     dirac.Mdag(*in, tmp);

   } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b

     DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);

     SolverParam solverParam(*param);

     Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);

     (*solve)(*out, *in);

     copyCuda(*in, *out);

     solverParam.updateInvertParam(*param);

     delete solve;

   }


   if (direct_solve) {

     DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);

     SolverParam solverParam(*param);

     Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);

     (*solve)(*out, *in);

     solverParam.updateInvertParam(*param);

     delete solve;

   } else if (!norm_error_solve){

     DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);

     SolverParam solverParam(*param);

     Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);

     (*solve)(*out, *in);

     solverParam.updateInvertParam(*param);

     delete solve;

   } else { // norm_error_solve

     DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);

     cudaColorSpinorField tmp(*out);

     SolverParam solverParam(*param);

     Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);

     (*solve)(tmp, *in); // y = (M M^\dag) b

     dirac.Mdag(*out, tmp);  // x = M^dag y

     solverParam.updateInvertParam(*param);

     delete solve;

   }


   if (getVerbosity() >= QUDA_VERBOSE){

     double nx = norm2(*x);

     printfQuda("Solution = %g\n",nx);

   }

   dirac.reconstruct(*x, *b, param->solution_type);


   if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {

     // rescale the solution

     axCuda(sqrt(nb), *x);

   }


   if (solutionResident) delete solutionResident;

   //errorQuda("solutionResident already allocated");

   cudaParam.siteSubset = QUDA_FULL_SITE_SUBSET;

   cudaParam.x[0] *= 2;

   cudaParam.create = QUDA_NULL_FIELD_CREATE;

   solutionResident = new cudaColorSpinorField(cudaParam);


   dirac.Dslash(solutionResident->Odd(), solutionResident->Even(), QUDA_ODD_PARITY);


   profileInvert.Start(QUDA_PROFILE_D2H);

   *h_x = *x;

   profileInvert.Stop(QUDA_PROFILE_D2H);


   if (getVerbosity() >= QUDA_VERBOSE){

     double nx = norm2(*x);

     double nh_x = norm2(*h_x);

     printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);

   }


   delete h_b;

   delete h_x;

   delete b;

   delete x;


   delete d;

   delete dSloppy;

   delete dPre;


   popVerbosity();


   // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted

   saveTuneCache(getVerbosity());


   profileInvert.Stop(QUDA_PROFILE_TOTAL);

 }


 void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)

 {

   setTuning(param->tune);


   profileMulti.Start(QUDA_PROFILE_TOTAL);


   if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||

       param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||

       param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);


   if (!initialized) errorQuda("QUDA not initialized");

   // check the gauge fields have been created

   cudaGaugeField *cudaGauge = checkGauge(param);

   checkInvertParam(param);


   if (param->num_offset > QUDA_MAX_MULTI_SHIFT)

     errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",

         param->num_offset, QUDA_MAX_MULTI_SHIFT);


   pushVerbosity(param->verbosity);


   bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);

   bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);

   bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type ==  QUDA_MATPC_SOLUTION);

   bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE);


   if (mat_solution) {

     errorQuda("Multi-shift solver does not support MAT or MATPC solution types");

   }

   if (direct_solve) {

     errorQuda("Multi-shift solver does not support DIRECT or DIRECT_PC solve types");

   }

   if (pc_solution & !pc_solve) {

     errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");

   }

   if (!pc_solution & pc_solve) {

     errorQuda("In multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");

   }


   // No of GiB in a checkerboard of a spinor

   param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;

   if( !pc_solve) param->spinorGiB *= 2; // Double volume for non PC solve


   // **** WARNING *** this may not match implementation...

   if( param->inv_type == QUDA_CG_INVERTER ) {

     // CG-M needs 5 vectors for the smallest shift + 2 for each additional shift

     param->spinorGiB *= (5 + 2*(param->num_offset-1))/(double)(1<<30);

   } else {

     errorQuda("QUDA only currently supports multi-shift CG");

     // BiCGStab-M needs 7 for the original shift + 2 for each additional shift + 1 auxiliary

     // (Jegerlehner hep-lat/9612014 eq (3.13)

     param->spinorGiB *= (7 + 2*(param->num_offset-1))/(double)(1<<30);

   }


   // Timing and FLOP counters

   param->secs = 0;

   param->gflops = 0;

   param->iter = 0;


   for (int i=0; i<param->num_offset-1; i++) {

     for (int j=i+1; j<param->num_offset; j++) {

       if (param->offset[i] > param->offset[j])

         errorQuda("Offsets must be ordered from smallest to largest");

     }

   }


   // Host pointers for x, take a copy of the input host pointers

   void** hp_x;

   hp_x = new void* [ param->num_offset ];


   void* hp_b = _hp_b;

   for(int i=0;i < param->num_offset;i++){

     hp_x[i] = _hp_x[i];

   }


   // Create the matrix.

   // The way this works is that createDirac will create 'd' and 'dSloppy'

   // which are global. We then grab these with references...

   //

   // Balint: Isn't there a nice construction pattern we could use here? This is

   // expedient but yucky.

   //  DiracParam diracParam;

   if (param->dslash_type == QUDA_ASQTAD_DSLASH ||

       param->dslash_type == QUDA_STAGGERED_DSLASH){

     param->mass = sqrt(param->offset[0]/4);

   }


   Dirac *d = NULL;

   Dirac *dSloppy = NULL;

   Dirac *dPre = NULL;


   // create the dirac operator

   createDirac(d, dSloppy, dPre, *param, pc_solve);

   Dirac &dirac = *d;

   Dirac &diracSloppy = *dSloppy;


   cudaColorSpinorField *b = NULL;   // Cuda RHS

   cudaColorSpinorField **x = NULL;  // Cuda Solutions


   // Grab the dimension array of the input gauge field.

   const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ?

     gaugeFatPrecise->X() : gaugePrecise->X();


   // This creates a ColorSpinorParam struct, from the host data

   // pointer, the definitions in param, the dimensions X, and whether

   // the solution is on a checkerboard instruction or not. These can

   // then be used as 'instructions' to create the actual

   // ColorSpinorField

   ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution);

   ColorSpinorField *h_b = (param->input_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   ColorSpinorField **h_x = new ColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION

   for(int i=0; i < param->num_offset; i++) {

     cpuParam.v = hp_x[i];

     h_x[i] = (param->output_location == QUDA_CPU_FIELD_LOCATION) ?

       static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

       static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

   }


   profileMulti.Start(QUDA_PROFILE_H2D);

   // Now I need a colorSpinorParam for the device

   ColorSpinorParam cudaParam(cpuParam, *param);

   // This setting will download a host vector

   cudaParam.create = QUDA_COPY_FIELD_CREATE;

   b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it

   profileMulti.Stop(QUDA_PROFILE_H2D);


   // Create the solution fields filled with zero

   x = new cudaColorSpinorField* [ param->num_offset ];

   cudaParam.create = QUDA_ZERO_FIELD_CREATE;

   for(int i=0; i < param->num_offset; i++) {

     x[i] = new cudaColorSpinorField(cudaParam);

   }


   // Check source norms

   double nb = norm2(*b);

   if (nb==0.0) errorQuda("Solution has zero norm");


   if(getVerbosity() >= QUDA_VERBOSE ) {

     double nh_b = norm2(*h_b);

     printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);

   }


   // rescale the source vector to help prevent the onset of underflow

   if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {

     axCuda(1.0/sqrt(nb), *b);

   }


   massRescale(*b, *param);


   // use multi-shift CG

   {

     DiracMdagM m(dirac), mSloppy(diracSloppy);

     SolverParam solverParam(*param);

     MultiShiftCG cg_m(m, mSloppy, solverParam, profileMulti);

     cg_m(x, *b);

     solverParam.updateInvertParam(*param);

   }


   // experimenting with Minimum residual extrapolation

   /*

      cudaColorSpinorField **q = new cudaColorSpinorField* [ param->num_offset ];

      cudaColorSpinorField **z = new cudaColorSpinorField* [ param->num_offset ];

      cudaColorSpinorField tmp(cudaParam);


      for(int i=0; i < param->num_offset; i++) {

      cudaParam.create = QUDA_ZERO_FIELD_CREATE;

      q[i] = new cudaColorSpinorField(cudaParam);

      cudaParam.create = QUDA_COPY_FIELD_CREATE;

      z[i] = new cudaColorSpinorField(*x[i], cudaParam);

      }


      for(int i=0; i < param->num_offset; i++) {

      dirac.setMass(sqrt(param->offset[i]/4));

      DiracMdagM m(dirac);

      MinResExt mre(m, profileMulti);

      copyCuda(tmp, *b);

      mre(*x[i], tmp, z, q, param -> num_offset);

      dirac.setMass(sqrt(param->offset[0]/4));

      }


      for(int i=0; i < param->num_offset; i++) {

      delete q[i];

      delete z[i];

      }

      delete []q;

      delete []z;

      */


   // check each shift has the desired tolerance and use sequential CG to refine


   cudaParam.create = QUDA_ZERO_FIELD_CREATE;

   cudaColorSpinorField r(*b, cudaParam);

   for(int i=0; i < param->num_offset; i++) {

     double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?

       param->true_res_hq_offset[i] : 0;

     double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?

       param->tol_hq_offset[i] : 0;


     // refine if either L2 or heavy quark residual tolerances have not been met, only if desired residual is > 0

     if (param->tol_offset[i] > 0 && (param->true_res_offset[i] > param->tol_offset[i] || rsd_hq > tol_hq)) {

       if (getVerbosity() >= QUDA_VERBOSE)

         printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",

             i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);


       // for staggered the shift is just a change in mass term (FIXME: for twisted mass also)

       if (param->dslash_type == QUDA_ASQTAD_DSLASH ||

           param->dslash_type == QUDA_STAGGERED_DSLASH) {

         dirac.setMass(sqrt(param->offset[i]/4));

         diracSloppy.setMass(sqrt(param->offset[i]/4));

       }


       DiracMdagM m(dirac), mSloppy(diracSloppy);


       // need to curry in the shift if we are not doing staggered

       if (param->dslash_type != QUDA_ASQTAD_DSLASH &&

           param->dslash_type != QUDA_STAGGERED_DSLASH) {

         m.shift = param->offset[i];

         mSloppy.shift = param->offset[i];

       }


       SolverParam solverParam(*param);

       solverParam.iter = 0;

       solverParam.use_init_guess = QUDA_USE_INIT_GUESS_YES;

       solverParam.tol = param->tol_offset[i]; // set L2 tolerance

       solverParam.tol_hq = param->tol_hq_offset[i]; // set heavy quark tolerance


       CG cg(m, mSloppy, solverParam, profileMulti);

       cg(*x[i], *b);


       solverParam.true_res_offset[i] = solverParam.true_res;

       solverParam.true_res_hq_offset[i] = solverParam.true_res_hq;

       solverParam.updateInvertParam(*param,i);


       if (param->dslash_type == QUDA_ASQTAD_DSLASH ||

           param->dslash_type == QUDA_STAGGERED_DSLASH) {

         dirac.setMass(sqrt(param->offset[0]/4)); // restore just in case

         diracSloppy.setMass(sqrt(param->offset[0]/4)); // restore just in case

       }

     }

   }


   // restore shifts -- avoid side effects

   for(int i=0; i < param->num_offset; i++) {

     param->offset[i] = unscaled_shifts[i];

   }


   profileMulti.Start(QUDA_PROFILE_D2H);

   for(int i=0; i < param->num_offset; i++) {

     if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) { // rescale the solution

       axCuda(sqrt(nb), *x[i]);

     }


     if (getVerbosity() >= QUDA_VERBOSE){

       double nx = norm2(*x[i]);

       printfQuda("Solution %d = %g\n", i, nx);

     }


     *h_x[i] = *x[i];

   }

   profileMulti.Stop(QUDA_PROFILE_D2H);


   for(int i=0; i < param->num_offset; i++){

     delete h_x[i];

     delete x[i];

   }


   delete h_b;

   delete b;


   delete [] h_x;

   delete [] x;


   delete [] hp_x;


   delete d;

   delete dSloppy;

   delete dPre;


   popVerbosity();


   // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted

   saveTuneCache(getVerbosity());


   profileMulti.Stop(QUDA_PROFILE_TOTAL);

 }


 /*

  * Hacked multi-shift solver for Wilson RHMC molecular dynamics

  * FIXME!!

  */

 void invertMultiShiftMDQuda(void **_hp_xe, void **_hp_xo, void **_hp_ye, void **_hp_yo,

     void *_hp_b, QudaInvertParam *param)

 {

   setTuning(param->tune);


   profileMulti.Start(QUDA_PROFILE_TOTAL);


   if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH) setKernelPackT(true);


   if (!initialized) errorQuda("QUDA not initialized");

   // check the gauge fields have been created

   cudaGaugeField *cudaGauge = checkGauge(param);

   checkInvertParam(param);


   if (param->num_offset > QUDA_MAX_MULTI_SHIFT)

     errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",

         param->num_offset, QUDA_MAX_MULTI_SHIFT);


   pushVerbosity(param->verbosity);


   bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);

   bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);

   bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type ==  QUDA_MATPC_SOLUTION);

   bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE);


   if (mat_solution) {

     errorQuda("Multi-shift solver does not support MAT or MATPC solution types");

   }

   if (direct_solve) {

     errorQuda("Multi-shift solver does not support DIRECT or DIRECT_PC solve types");

   }

   if (pc_solution & !pc_solve) {

     errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");

   }

   if (!pc_solution & pc_solve) {

     errorQuda("In multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");

   }


   // No of GiB in a checkerboard of a spinor

   param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;

   if( !pc_solve) param->spinorGiB *= 2; // Double volume for non PC solve


   // **** WARNING *** this may not match implementation...

   if( param->inv_type == QUDA_CG_INVERTER ) {

     // CG-M needs 5 vectors for the smallest shift + 2 for each additional shift

     param->spinorGiB *= (5 + 2*(param->num_offset-1))/(double)(1<<30);

   } else {

     errorQuda("QUDA only currently supports multi-shift CG");

     // BiCGStab-M needs 7 for the original shift + 2 for each additional shift + 1 auxiliary

     // (Jegerlehner hep-lat/9612014 eq (3.13)

     param->spinorGiB *= (7 + 2*(param->num_offset-1))/(double)(1<<30);

   }


   // Timing and FLOP counters

   param->secs = 0;

   param->gflops = 0;

   param->iter = 0;


   for (int i=0; i<param->num_offset-1; i++) {

     for (int j=i+1; j<param->num_offset; j++) {

       if (param->offset[i] > param->offset[j])

         errorQuda("Offsets must be ordered from smallest to largest");

     }

   }


   // Host pointers for x, take a copy of the input host pointers

   void **hp_xe = new void* [ param->num_offset ];

   void **hp_xo = new void* [ param->num_offset ];

   void **hp_ye = new void* [ param->num_offset ];

   void **hp_yo = new void* [ param->num_offset ];


   void* hp_b = _hp_b;

   for(int i=0;i < param->num_offset;i++){

     hp_xe[i] = _hp_xe[i];

     hp_xo[i] = _hp_xo[i];

     hp_ye[i] = _hp_ye[i];

     hp_yo[i] = _hp_yo[i];

   }


   // Create the matrix.

   // The way this works is that createDirac will create 'd' and 'dSloppy'

   // which are global. We then grab these with references...

   if (param->dslash_type == QUDA_ASQTAD_DSLASH ||

       param->dslash_type == QUDA_STAGGERED_DSLASH){

     param->mass = sqrt(param->offset[0]/4);

   }


   Dirac *d = NULL;

   Dirac *dSloppy = NULL;

   Dirac *dPre = NULL;


   // create the dirac operator

   createDirac(d, dSloppy, dPre, *param, pc_solve);

   Dirac &dirac = *d;

   Dirac &diracSloppy = *dSloppy;


   cudaColorSpinorField *b = NULL;   // Cuda RHS

   cudaColorSpinorField **xe = NULL;  // Cuda Solutions

   cudaColorSpinorField *xo, *ye, *yo = NULL;  // Cuda Solutions


   // Grab the dimension array of the input gauge field.

   const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ?

     gaugeFatPrecise->X() : gaugePrecise->X();


   // This creates a ColorSpinorParam struct, from the host data

   // pointer, the definitions in param, the dimensions X, and whether

   // the solution is on a checkerboard instruction or not. These can

   // then be used as 'instructions' to create the actual

   // ColorSpinorField

   ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution);

   ColorSpinorField *h_b = (param->input_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   ColorSpinorField **h_xe = new ColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION

   ColorSpinorField **h_xo = new ColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION

   ColorSpinorField **h_ye = new ColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION

   ColorSpinorField **h_yo = new ColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION

   for(int i=0; i < param->num_offset; i++) {

     cpuParam.v = hp_xe[i];

     h_xe[i] = (param->output_location == QUDA_CPU_FIELD_LOCATION) ?

       static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

       static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


     cpuParam.v = hp_xo[i];

     h_xo[i] = (param->output_location == QUDA_CPU_FIELD_LOCATION) ?

       static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

       static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


     cpuParam.v = hp_ye[i];

     h_ye[i] = (param->output_location == QUDA_CPU_FIELD_LOCATION) ?

       static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

       static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


     cpuParam.v = hp_yo[i];

     h_yo[i] = (param->output_location == QUDA_CPU_FIELD_LOCATION) ?

       static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

       static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

   }


   profileMulti.Start(QUDA_PROFILE_H2D);

   // Now I need a colorSpinorParam for the device

   ColorSpinorParam cudaParam(cpuParam, *param);

   // This setting will download a host vector

   cudaParam.create = QUDA_COPY_FIELD_CREATE;

   b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it

   profileMulti.Stop(QUDA_PROFILE_H2D);


   // Create the solution fields filled with zero

   xe = new cudaColorSpinorField* [ param->num_offset ];

   cudaParam.create = QUDA_ZERO_FIELD_CREATE;

   for(int i=0; i < param->num_offset; i++) {

     xe[i] = new cudaColorSpinorField(cudaParam);

   }


   xo = new cudaColorSpinorField(cudaParam);

   ye = new cudaColorSpinorField(cudaParam);

   yo = new cudaColorSpinorField(cudaParam);


   // Check source norms

   double nb = norm2(*b);

   if (nb==0.0) errorQuda("Solution has zero norm");


   if(getVerbosity() >= QUDA_VERBOSE ) {

     double nh_b = norm2(*h_b);

     printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);

   }


   // rescale the source vector to help prevent the onset of underflow

   if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {

     axCuda(1.0/sqrt(nb), *b);

   }


   massRescale(*b, *param);


   // use multi-shift CG

   {

     DiracMdagM m(dirac), mSloppy(diracSloppy);

     SolverParam solverParam(*param);

     MultiShiftCG cg_m(m, mSloppy, solverParam, profileMulti);

     cg_m(xe, *b);

     solverParam.updateInvertParam(*param);

   }


   // check each shift has the desired tolerance and use sequential CG to refine


   cudaParam.create = QUDA_ZERO_FIELD_CREATE;

   cudaColorSpinorField r(*b, cudaParam);

   for(int i=0; i < param->num_offset; i++) {

     double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?

       param->true_res_hq_offset[i] : 0;


     double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?

       param->tol_hq_offset[i] : 0;


     // refine if either L2 or heavy quark residual tolerances have not been met

     if (param->true_res_offset[i] > param->tol_offset[i] || rsd_hq > tol_hq) {

       if (getVerbosity() >= QUDA_VERBOSE)

         printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",

             i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);


       // for staggered the shift is just a change in mass term (FIXME: for twisted mass also)

       if (param->dslash_type == QUDA_ASQTAD_DSLASH ||

           param->dslash_type == QUDA_STAGGERED_DSLASH) {

         dirac.setMass(sqrt(param->offset[i]/4));

         diracSloppy.setMass(sqrt(param->offset[i]/4));

       }


       DiracMdagM m(dirac), mSloppy(diracSloppy);


       // need to curry in the shift if we are not doing staggered

       if (param->dslash_type != QUDA_ASQTAD_DSLASH &&

           param->dslash_type != QUDA_STAGGERED_DSLASH) {

         m.shift = param->offset[i];

         mSloppy.shift = param->offset[i];

       }


       SolverParam solverParam(*param);

       solverParam.use_init_guess = QUDA_USE_INIT_GUESS_YES;

       solverParam.tol = param->tol_offset[i]; // set L2 tolerance

       solverParam.tol_hq = param->tol_hq_offset[i]; // set heavy quark tolerance


       CG cg(m, mSloppy, solverParam, profileMulti);

       cg(*xe[i], *b);


       solverParam.updateInvertParam(*param);

       param->true_res_offset[i] = param->true_res;

       param->true_res_hq_offset[i] = param->true_res_hq;


       if (param->dslash_type == QUDA_ASQTAD_DSLASH ||

           param->dslash_type == QUDA_STAGGERED_DSLASH) {

         dirac.setMass(sqrt(param->offset[0]/4)); // restore just in case

         diracSloppy.setMass(sqrt(param->offset[0]/4)); // restore just in case

       }

     }

   }


   // restore shifts -- avoid side effects

   for(int i=0; i < param->num_offset; i++) {

     param->offset[i] = unscaled_shifts[i];

   }


   profileMulti.Start(QUDA_PROFILE_D2H);

   for(int i=0; i < param->num_offset; i++) {

     if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) { // rescale the solution

       axCuda(sqrt(nb), *xe[i]);

     }


     if (getVerbosity() >= QUDA_VERBOSE){

       double nx = norm2(*xe[i]);

       printfQuda("Solution %d = %g\n", i, nx);

     }


     dirac.Dslash(*xo, *xe[i], QUDA_ODD_PARITY);

     dirac.M(*ye, *xe[i]);

     dirac.Dagger(QUDA_DAG_YES);

     dirac.Dslash(*yo, *ye, QUDA_ODD_PARITY);

     dirac.Dagger(QUDA_DAG_NO);


     *h_xe[i] = *xe[i];

     *h_xo[i] = *xo;

     *h_ye[i] = *ye;

     *h_yo[i] = *yo;

   }

   profileMulti.Stop(QUDA_PROFILE_D2H);


   for(int i=0; i < param->num_offset; i++){

     delete h_xe[i];

     delete h_xo[i];

     delete h_ye[i];

     delete h_yo[i];

     delete xe[i];

   }


   delete h_b;

   delete b;


   delete [] h_xe;

   delete [] h_xo;

   delete [] h_ye;

   delete [] h_yo;


   delete [] xe;

   delete xo;

   delete ye;

   delete yo;


   delete [] hp_xe;

   delete [] hp_xo;

   delete [] hp_ye;

   delete [] hp_yo;


   delete d;

   delete dSloppy;

   delete dPre;


   popVerbosity();


   // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted

   saveTuneCache(getVerbosity());


   profileMulti.Stop(QUDA_PROFILE_TOTAL);

 }


 void incrementalEigQuda(void *_h_x, void *_h_b, QudaInvertParam *param, void *_h_u, double *inv_eigenvals, int last_rhs)

 {

   setTuning(param->tune);


   if(!InitMagma) openMagma();


   if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH) setKernelPackT(true);


   profileInvert.Start(QUDA_PROFILE_TOTAL);


   if (!initialized) errorQuda("QUDA not initialized");


   pushVerbosity(param->verbosity);

   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(param);


   // check the gauge fields have been created

   cudaGaugeField *cudaGauge = checkGauge(param);


   checkInvertParam(param);


   // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in

   // solve_type and solution_type, rather than in separate members of QudaInvertParam.  We're stuck with it

   // for now, though, so here we factorize everything for convenience.


   bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||

     (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);

   bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||

     (param->solve_type == QUDA_NORMOP_PC_SOLVE);

   bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||

     (param->solution_type ==  QUDA_MATPC_SOLUTION);

   bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||

     (param->solve_type == QUDA_DIRECT_PC_SOLVE);


   param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;

   if (!pc_solve) param->spinorGiB *= 2;

   param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float));

   if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) {

     param->spinorGiB *= ((param->inv_type == QUDA_EIGCG_INVERTER || param->inv_type == QUDA_INC_EIGCG_INVERTER) ? 5 : 7)/(double)(1<<30);

   } else {

     param->spinorGiB *= ((param->inv_type == QUDA_EIGCG_INVERTER || param->inv_type == QUDA_INC_EIGCG_INVERTER) ? 8 : 9)/(double)(1<<30);

   }


   param->secs = 0;

   param->gflops = 0;

   param->iter = 0;


   DiracParam diracParam;

   DiracParam diracSloppyParam;

   //DiracParam diracDeflateParam;

   DiracParam diracHalfPrecParam;//sloppy precision for initCG

   setDiracParam(diracParam, param, pc_solve);

   setDiracSloppyParam(diracSloppyParam, param, pc_solve);


   if(param->cuda_prec_precondition != QUDA_HALF_PRECISION)

   {

      errorQuda("\nInitCG requires sloppy gauge field in half precision. It seems that the half precision field is not loaded,\n please check you cuda_prec_precondition parameter.\n");

   }


   setDiracParam(diracHalfPrecParam, param, pc_solve);


   diracHalfPrecParam.gauge = gaugePrecondition;

   diracHalfPrecParam.fatGauge = gaugeFatPrecondition;

   diracHalfPrecParam.longGauge = gaugeLongPrecondition;


   diracHalfPrecParam.clover = cloverPrecondition;

   diracHalfPrecParam.cloverInv = cloverInvPrecondition;


   for (int i=0; i<4; i++) {

       diracHalfPrecParam.commDim[i] = 1; // comms are on.

   }


   Dirac *d        = Dirac::create(diracParam); // create the Dirac operator

   Dirac *dSloppy  = Dirac::create(diracSloppyParam);

   //Dirac *dDeflate = Dirac::create(diracPreParam);

   Dirac *dHalfPrec = Dirac::create(diracHalfPrecParam);


   Dirac &dirac = *d;

   //Dirac &diracSloppy = param->rhs_idx < param->deflation_grid ? *d : *dSloppy; //hack!!!

   //Dirac &diracSloppy   = param->rhs_idx < param->deflation_grid ? *dSloppy : *dHalfPrec;

   Dirac &diracSloppy   = *dSloppy;

   Dirac &diracHalf     = *dHalfPrec;

   Dirac &diracDeflate  = *d;//full precision deflation

   //Dirac &diracHalfPrec = *dHalfPrec;


   profileInvert.Start(QUDA_PROFILE_H2D);


   cudaColorSpinorField *b = NULL;

   cudaColorSpinorField *x = NULL;

   cudaColorSpinorField *in = NULL;

   cudaColorSpinorField *out = NULL;


   const int *X = cudaGauge->X();


   // wrap CPU host side pointers

   ColorSpinorParam cpuParam(_h_b, *param, X, pc_solution);

   ColorSpinorField *h_b = (param->input_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   cpuParam.v = _h_x;

   ColorSpinorField *h_x = (param->output_location == QUDA_CPU_FIELD_LOCATION) ?

     static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :

     static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));


   // download source

   ColorSpinorParam cudaParam(cpuParam, *param);

   cudaParam.create = QUDA_COPY_FIELD_CREATE;

   b = new cudaColorSpinorField(*h_b, cudaParam);


   if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess

     // initial guess only supported for single-pass solvers

     if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) &&

         (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {

       errorQuda("Initial guess not supported for two-pass solver");

     }


     x = new cudaColorSpinorField(*h_x, cudaParam); // solution

   } else { // zero initial guess

     cudaParam.create = QUDA_ZERO_FIELD_CREATE;

     x = new cudaColorSpinorField(cudaParam); // solution

   }


   profileInvert.Stop(QUDA_PROFILE_H2D);


   double nb = norm2(*b);

   if (nb==0.0) errorQuda("Source has zero norm");


   if (getVerbosity() >= QUDA_VERBOSE) {

     double nh_b = norm2(*h_b);

     double nh_x = norm2(*h_x);

     double nx = norm2(*x);

     printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);

     printfQuda("Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);

   }


   // rescale the source and solution vectors to help prevent the onset of underflow

   if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {

     axCuda(1.0/sqrt(nb), *b);

     axCuda(1.0/sqrt(nb), *x);

   }


   massRescale(*b, *param);


   dirac.prepare(in, out, *x, *b, param->solution_type);

 //here...

   if (getVerbosity() >= QUDA_VERBOSE) {

     double nin = norm2(*in);

     double nout = norm2(*out);

     printfQuda("Prepared source = %g\n", nin);

     printfQuda("Prepared solution = %g\n", nout);

   }


   if (getVerbosity() >= QUDA_VERBOSE) {

     double nin = norm2(*in);

     printfQuda("Prepared source post mass rescale = %g\n", nin);

   }


   if (param->max_search_dim == 0 || param->nev == 0 || (param->max_search_dim < param->nev))

      errorQuda("\nIncorrect eigenvector space setup...\n");


   if (pc_solution && !pc_solve) {

     errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");

   }


   if (!mat_solution && !pc_solution && pc_solve) {

     errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");

   }


   if (mat_solution && !direct_solve) { // prepare source: b' = A^dag b

     cudaColorSpinorField tmp(*in);

     dirac.Mdag(*in, tmp);

   }


   if(param->inv_type == QUDA_INC_EIGCG_INVERTER || param->inv_type == QUDA_EIGCG_INVERTER)

   {

     DiracMdagM m(dirac), mSloppy(diracSloppy), mHalf(diracHalf), mDeflate(diracDeflate);

     SolverParam solverParam(*param);


     DeflatedSolver *solve = DeflatedSolver::create(solverParam, m, mSloppy, mHalf, mDeflate, profileInvert);


     (*solve)(out, in);//run solver


     solverParam.updateInvertParam(*param);//will update rhs_idx as well...


     if(last_rhs)

     {

       if(_h_u) solve->StoreRitzVecs(_h_u, inv_eigenvals, X, param, param->nev);

       printfQuda("\nDelete incremental EigCG solver resources...\n");

       //clean resources:

       solve->CleanResources();

       //

       printfQuda("\n...done.\n");

     }


     delete solve;

   }

   else

   {

     errorQuda("\nUnknown deflated solver...\n");

   }


   if (getVerbosity() >= QUDA_VERBOSE){

     double nx = norm2(*x);

     printfQuda("Solution = %g\n",nx);

   }

   dirac.reconstruct(*x, *b, param->solution_type);


   if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {

     // rescale the solution

     axCuda(sqrt(nb), *x);

   }


   profileInvert.Start(QUDA_PROFILE_D2H);

   *h_x = *x;

   profileInvert.Stop(QUDA_PROFILE_D2H);


   if (getVerbosity() >= QUDA_VERBOSE){

     double nx = norm2(*x);

     double nh_x = norm2(*h_x);

     printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);

   }


   delete h_b;

   delete h_x;

   delete b;

   delete x;


   delete d;

   delete dSloppy;

 //  delete dDeflate;

   delete dHalfPrec;


   popVerbosity();


   // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted

   saveTuneCache(getVerbosity());


   profileInvert.Stop(QUDA_PROFILE_TOTAL);

 }


 #ifdef GPU_FATLINK

 /*   @method

  *   QUDA_COMPUTE_FAT_STANDARD: standard method (default)

  *   QUDA_COMPUTE_FAT_EXTENDED_VOLUME, extended volume method

  *

  */

 #include <sys/time.h>


 void setFatLinkPadding(QudaComputeFatMethod method, QudaGaugeParam* param)

 {

   int* X    = param->X;

 #ifdef MULTI_GPU

   int Vsh_x = X[1]*X[2]*X[3]/2;

   int Vsh_y = X[0]*X[2]*X[3]/2;

   int Vsh_z = X[0]*X[1]*X[3]/2;

 #endif

   int Vsh_t = X[0]*X[1]*X[2]/2;


   int E[4];

   for (int i=0; i<4; i++) E[i] = X[i] + 4;


   // fat-link padding

   param->llfat_ga_pad = Vsh_t;


   // site-link padding

   if(method ==  QUDA_COMPUTE_FAT_STANDARD) {

 #ifdef MULTI_GPU

     int Vh_2d_max = MAX(X[0]*X[1]/2, X[0]*X[2]/2);

     Vh_2d_max = MAX(Vh_2d_max, X[0]*X[3]/2);

     Vh_2d_max = MAX(Vh_2d_max, X[1]*X[2]/2);

     Vh_2d_max = MAX(Vh_2d_max, X[1]*X[3]/2);

     Vh_2d_max = MAX(Vh_2d_max, X[2]*X[3]/2);

     param->site_ga_pad = 3*(Vsh_x+Vsh_y+Vsh_z+Vsh_t) + 4*Vh_2d_max;

 #else

     param->site_ga_pad = Vsh_t;

 #endif

   } else {

     param->site_ga_pad = (E[0]*E[1]*E[2]/2)*3;

   }

   param->ga_pad = param->site_ga_pad;


   // staple padding

   if(method == QUDA_COMPUTE_FAT_STANDARD) {

 #ifdef MULTI_GPU

     param->staple_pad = 3*(Vsh_x + Vsh_y + Vsh_z+ Vsh_t);

 #else

     param->staple_pad = 3*Vsh_t;

 #endif

   } else {

     param->staple_pad = (E[0]*E[1]*E[2]/2)*3;

   }


   return;

 }


 namespace quda {

   void computeFatLinkCore(cudaGaugeField* cudaSiteLink, double* act_path_coeff,

                           QudaGaugeParam* qudaGaugeParam, QudaComputeFatMethod method,

                           cudaGaugeField* cudaFatLink, cudaGaugeField* cudaLongLink,

                           TimeProfile &profile)

   {


     profile.Start(QUDA_PROFILE_INIT);

     const int flag = qudaGaugeParam->preserve_gauge;

     GaugeFieldParam gParam(0,*qudaGaugeParam);


     if (method == QUDA_COMPUTE_FAT_STANDARD) {

       for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam->X[dir];

     } else {

       for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam->X[dir] + 4;

     }


     if (cudaStapleField == NULL || cudaStapleField1 == NULL) {

       gParam.pad    = qudaGaugeParam->staple_pad;

       gParam.create = QUDA_NULL_FIELD_CREATE;

       gParam.reconstruct = QUDA_RECONSTRUCT_NO;

       gParam.geometry = QUDA_SCALAR_GEOMETRY; // only require a scalar matrix field for the staple

       gParam.order = QUDA_FLOAT2_GAUGE_ORDER;

 #ifdef MULTI_GPU

       if(method == QUDA_COMPUTE_FAT_EXTENDED_VOLUME) gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

 #else

       gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

 #endif

       cudaStapleField  = new cudaGaugeField(gParam);

       cudaStapleField1 = new cudaGaugeField(gParam);

     }

     profile.Stop(QUDA_PROFILE_INIT);


     profile.Start(QUDA_PROFILE_COMPUTE);

     if (method == QUDA_COMPUTE_FAT_STANDARD) {

       llfat_cuda(cudaFatLink, cudaLongLink, *cudaSiteLink, *cudaStapleField, *cudaStapleField1, qudaGaugeParam, act_path_coeff);

     } else { //method == QUDA_COMPUTE_FAT_EXTENDED_VOLUME

       llfat_cuda_ex(cudaFatLink, cudaLongLink, *cudaSiteLink, *cudaStapleField, *cudaStapleField1, qudaGaugeParam, act_path_coeff);

     }

     profile.Stop(QUDA_PROFILE_COMPUTE);


     profile.Start(QUDA_PROFILE_FREE);

     if (!(flag & QUDA_FAT_PRESERVE_GPU_GAUGE) ){

       delete cudaStapleField; cudaStapleField = NULL;

       delete cudaStapleField1; cudaStapleField1 = NULL;

     }

     profile.Stop(QUDA_PROFILE_FREE);


     return;

   }

 } // namespace quda


 namespace quda {

   namespace fatlink {

 #include <dslash_init.cuh>

   }

 }


 void computeKSLinkQuda(void* fatlink, void* longlink, void* ulink, void* inlink, double *path_coeff, QudaGaugeParam *param, QudaComputeFatMethod method)

 {

   profileFatLink.Start(QUDA_PROFILE_TOTAL);

   profileFatLink.Start(QUDA_PROFILE_INIT);

   // Initialize unitarization parameters

   if(ulink){

     const double unitarize_eps = 1e-14;

     const double max_error = 1e-10;

     const int reunit_allow_svd = 1;

     const int reunit_svd_only  = 0;

     const double svd_rel_error = 1e-6;

     const double svd_abs_error = 1e-6;

     quda::setUnitarizeLinksConstants(unitarize_eps, max_error,

         reunit_allow_svd, reunit_svd_only,

         svd_rel_error, svd_abs_error);

   }


   cudaGaugeField* cudaFatLink        = NULL;

   cudaGaugeField* cudaLongLink       = NULL;

   cudaGaugeField* cudaUnitarizedLink = NULL;

   cudaGaugeField* cudaInLinkEx       = NULL;


   QudaGaugeParam qudaGaugeParam_ex_buf;

   QudaGaugeParam* qudaGaugeParam_ex = &qudaGaugeParam_ex_buf;

   memcpy(qudaGaugeParam_ex, param, sizeof(QudaGaugeParam));

   for(int dir=0; dir<4; ++dir){ qudaGaugeParam_ex->X[dir] = param->X[dir]+4; }


   // fat-link padding

   setFatLinkPadding(method, param);

   qudaGaugeParam_ex->llfat_ga_pad = param->llfat_ga_pad;

   qudaGaugeParam_ex->staple_pad   = param->staple_pad;

   qudaGaugeParam_ex->site_ga_pad  = param->site_ga_pad;


   GaugeFieldParam gParam(0, *param);

   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

   // create the host fatlink

   gParam.create = QUDA_REFERENCE_FIELD_CREATE;

   gParam.link_type = QUDA_GENERAL_LINKS;

   gParam.order = QUDA_MILC_GAUGE_ORDER;

   gParam.gauge = fatlink;

   cpuGaugeField cpuFatLink(gParam);

   gParam.gauge = longlink;

   cpuGaugeField cpuLongLink(gParam);

   gParam.gauge = ulink;

   cpuGaugeField cpuUnitarizedLink(gParam);


   // create the device fatlink

   gParam.pad    = param->llfat_ga_pad;

   gParam.create = QUDA_ZERO_FIELD_CREATE;

   gParam.link_type = QUDA_GENERAL_LINKS;

   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;

   gParam.reconstruct = QUDA_RECONSTRUCT_NO;

   cudaFatLink = new cudaGaugeField(gParam);

   if(longlink) cudaLongLink = new cudaGaugeField(gParam);

   if(ulink){

     cudaUnitarizedLink = new cudaGaugeField(gParam);

     quda::setUnitarizeLinksPadding(param->llfat_ga_pad,param->llfat_ga_pad);

   }

   // create the host sitelink

   gParam.pad = 0;

   gParam.create    = QUDA_REFERENCE_FIELD_CREATE;

   gParam.link_type = param->type;

   gParam.order = QUDA_MILC_GAUGE_ORDER;

   gParam.gauge     = inlink;

   cpuGaugeField cpuInLink(gParam);


   gParam.pad         = param->site_ga_pad;

   gParam.create      = QUDA_NULL_FIELD_CREATE;

   gParam.link_type   = param->type;

   gParam.reconstruct = param->reconstruct;

   gParam.order       = (param->reconstruct == QUDA_RECONSTRUCT_12) ? QUDA_FLOAT4_GAUGE_ORDER : QUDA_FLOAT2_GAUGE_ORDER;

   cudaGaugeField* cudaInLink = new cudaGaugeField(gParam);


   if(method == QUDA_COMPUTE_FAT_EXTENDED_VOLUME){

     for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam_ex->X[dir];

     gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

     cudaInLinkEx = new cudaGaugeField(gParam);

   }


   profileFatLink.Stop(QUDA_PROFILE_INIT);

   fatlink::initLatticeConstants(*cudaFatLink, profileFatLink);

   profileFatLink.Start(QUDA_PROFILE_INIT);


   cudaGaugeField* inlinkPtr;

   if(method == QUDA_COMPUTE_FAT_STANDARD){

     llfat_init_cuda(param);

     param->ga_pad = param->site_ga_pad;

     inlinkPtr = cudaInLink;

   }else{

     llfat_init_cuda_ex(qudaGaugeParam_ex);

     inlinkPtr = cudaInLinkEx;

   }

   profileFatLink.Stop(QUDA_PROFILE_INIT);


   profileFatLink.Start(QUDA_PROFILE_H2D);

   cudaInLink->loadCPUField(cpuInLink, QUDA_CPU_FIELD_LOCATION);

   profileFatLink.Stop(QUDA_PROFILE_H2D);


   if(method != QUDA_COMPUTE_FAT_STANDARD){

     profileFatLink.Start(QUDA_PROFILE_COMMS);

     copyExtendedGauge(*cudaInLinkEx, *cudaInLink, QUDA_CUDA_FIELD_LOCATION);

 #ifdef MULTI_GPU

     int R[4] = {2, 2, 2, 2};

     cudaInLinkEx->exchangeExtendedGhost(R,true); // instead of exchange_cpu_sitelink_ex

 #endif

     profileFatLink.Stop(QUDA_PROFILE_COMMS);

   } // Initialise and load siteLinks


   quda::computeFatLinkCore(inlinkPtr, const_cast<double*>(path_coeff), param, method, cudaFatLink, cudaLongLink, profileFatLink);


   if(ulink){

     profileFatLink.Start(QUDA_PROFILE_INIT);

     int num_failures=0;

     int* num_failures_dev;

     cudaMalloc((void**)&num_failures_dev, sizeof(int));

     cudaMemset(num_failures_dev, 0, sizeof(int));

     if(num_failures_dev == NULL) errorQuda("cudaMalloc fialed for dev_pointer\n");

     profileFatLink.Stop(QUDA_PROFILE_INIT);


     profileFatLink.Start(QUDA_PROFILE_COMPUTE);

     quda::unitarizeLinksCuda(*param, *cudaFatLink, cudaUnitarizedLink, num_failures_dev); // unitarize on the gpu

     profileFatLink.Stop(QUDA_PROFILE_COMPUTE);


     profileFatLink.Start(QUDA_PROFILE_D2H);

     cudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);

     profileFatLink.Stop(QUDA_PROFILE_D2H);

     cudaFree(num_failures_dev);

     if(num_failures>0){

       errorQuda("Error in the unitarization component of the hisq fattening\n");

     }

     profileFatLink.Start(QUDA_PROFILE_D2H);

     cudaUnitarizedLink->saveCPUField(cpuUnitarizedLink, QUDA_CPU_FIELD_LOCATION);

     profileFatLink.Stop(QUDA_PROFILE_D2H);

   }


   profileFatLink.Start(QUDA_PROFILE_D2H);

   if(fatlink) cudaFatLink->saveCPUField(cpuFatLink, QUDA_CPU_FIELD_LOCATION);

   if(longlink) cudaLongLink->saveCPUField(cpuLongLink, QUDA_CPU_FIELD_LOCATION);

   profileFatLink.Stop(QUDA_PROFILE_D2H);


   profileFatLink.Start(QUDA_PROFILE_FREE);

   if(longlink) delete cudaLongLink;

   delete cudaFatLink;

   delete cudaInLink;

   delete cudaUnitarizedLink;

   if(cudaInLinkEx) delete cudaInLinkEx;

   profileFatLink.Stop(QUDA_PROFILE_FREE);


   profileFatLink.Stop(QUDA_PROFILE_TOTAL);


   return;

 }


 #endif // GPU_FATLINK


 int getGaugePadding(GaugeFieldParam& param){

   int pad = 0;

 #ifdef MULTI_GPU

   int volume = param.x[0]*param.x[1]*param.x[2]*param.x[3];

   int face_size[4];

   for(int dir=0; dir<4; ++dir) face_size[dir] = (volume/param.x[dir])/2;

   pad = *std::max_element(face_size, face_size+4);

 #endif


   return pad;

 }


 #ifdef GPU_GAUGE_FORCE

 namespace quda {

   namespace gaugeforce {

 #include <dslash_init.cuh>

   }

 }

 #endif


 int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int* path_length,

                           double* loop_coeff, int num_paths, int max_length, double eb3,

                           QudaGaugeParam* qudaGaugeParam, double* timeinfo)

 {


   /*printfQuda("GaugeForce: use_resident_gauge = %d, make_resident_gauge = %d\n",

     qudaGaugeParam->use_resident_gauge, qudaGaugeParam->make_resident_gauge);

     printfQuda("GaugeForce: use_resident_mom = %d, make_resident_mom = %d\n",

     qudaGaugeParam->use_resident_mom, qudaGaugeParam->make_resident_mom);*/


 #ifdef GPU_GAUGE_FORCE

   profileGaugeForce.Start(QUDA_PROFILE_TOTAL);

   profileGaugeForce.Start(QUDA_PROFILE_INIT);


   checkGaugeParam(qudaGaugeParam);


   GaugeFieldParam gParam(0, *qudaGaugeParam);

   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

   gParam.pad = 0;


 #ifdef MULTI_GPU

   GaugeFieldParam gParamEx(gParam);

   for (int d=0; d<4; d++) gParamEx.x[d] = gParam.x[d] + 4;

 #endif


   gParam.create = QUDA_REFERENCE_FIELD_CREATE;

   gParam.gauge = siteLink;

   cpuGaugeField *cpuSiteLink = new cpuGaugeField(gParam);


   cudaGaugeField* cudaSiteLink = NULL;


   if (qudaGaugeParam->use_resident_gauge) {

     if (!gaugePrecise) errorQuda("No resident gauge field to use");

     cudaSiteLink = gaugePrecise;

     profileGaugeForce.Stop(QUDA_PROFILE_INIT);

     printfQuda("GaugeForce: Using resident gauge field\n");

   } else {

     gParam.create = QUDA_NULL_FIELD_CREATE;

     gParam.reconstruct = qudaGaugeParam->reconstruct;

     gParam.order = (qudaGaugeParam->reconstruct == QUDA_RECONSTRUCT_NO ||

         qudaGaugeParam->cuda_prec == QUDA_DOUBLE_PRECISION) ?

       QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;


     cudaSiteLink = new cudaGaugeField(gParam);

     profileGaugeForce.Stop(QUDA_PROFILE_INIT);


     profileGaugeForce.Start(QUDA_PROFILE_H2D);

     cudaSiteLink->loadCPUField(*cpuSiteLink, QUDA_CPU_FIELD_LOCATION);

     profileGaugeForce.Stop(QUDA_PROFILE_H2D);

   }


   profileGaugeForce.Start(QUDA_PROFILE_INIT);


 #ifndef MULTI_GPU

   cudaGaugeField *cudaGauge = cudaSiteLink;

   qudaGaugeParam->site_ga_pad = gParam.pad; //need to set this value

 #else


   gParamEx.create = QUDA_ZERO_FIELD_CREATE;

   gParamEx.reconstruct = qudaGaugeParam->reconstruct;

   gParamEx.order = (qudaGaugeParam->reconstruct == QUDA_RECONSTRUCT_NO ||

       qudaGaugeParam->cuda_prec == QUDA_DOUBLE_PRECISION) ?

     QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;

   qudaGaugeParam->site_ga_pad = gParamEx.pad;//need to set this value


   cudaGaugeField *cudaGauge = new cudaGaugeField(gParamEx);


   copyExtendedGauge(*cudaGauge, *cudaSiteLink, QUDA_CUDA_FIELD_LOCATION);

   int R[4] = {2, 2, 2, 2}; // radius of the extended region in each dimension / direction


   profileGaugeForce.Stop(QUDA_PROFILE_INIT);


   profileGaugeForce.Start(QUDA_PROFILE_COMMS);

   cudaGauge->exchangeExtendedGhost(R);

   profileGaugeForce.Stop(QUDA_PROFILE_COMMS);

   profileGaugeForce.Start(QUDA_PROFILE_INIT);

 #endif


   GaugeFieldParam &gParamMom = gParam;

   gParamMom.order = qudaGaugeParam->gauge_order;

   // FIXME - test program always uses MILC for mom but can use QDP for gauge

   if (gParamMom.order == QUDA_QDP_GAUGE_ORDER) gParamMom.order = QUDA_MILC_GAUGE_ORDER;

   gParamMom.precision = qudaGaugeParam->cpu_prec;

   gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;

   gParamMom.create = QUDA_REFERENCE_FIELD_CREATE;

   gParamMom.gauge=mom;

   if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER) gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;

   else gParamMom.reconstruct = QUDA_RECONSTRUCT_10;


   cpuGaugeField* cpuMom = new cpuGaugeField(gParamMom);


   cudaGaugeField* cudaMom = NULL;

   if (qudaGaugeParam->use_resident_mom) {

     if (!gaugePrecise) errorQuda("No resident momentum field to use");

     cudaMom = momResident;

     printfQuda("GaugeForce: Using resident mom field\n");

     profileGaugeForce.Stop(QUDA_PROFILE_INIT);

   } else {

     gParamMom.create = QUDA_ZERO_FIELD_CREATE;

     gParamMom.order = QUDA_FLOAT2_GAUGE_ORDER;

     gParamMom.reconstruct = QUDA_RECONSTRUCT_10;

     gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;

     gParamMom.precision = qudaGaugeParam->cuda_prec;

     cudaMom = new cudaGaugeField(gParamMom);

     profileGaugeForce.Stop(QUDA_PROFILE_INIT);


     profileGaugeForce.Start(QUDA_PROFILE_H2D);

     cudaMom->loadCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);

     profileGaugeForce.Stop(QUDA_PROFILE_H2D);

   }


   gaugeforce::initLatticeConstants(*cudaMom, profileGaugeForce);


   profileGaugeForce.Start(QUDA_PROFILE_CONSTANT);

   qudaGaugeParam->mom_ga_pad = gParamMom.pad; //need to set this (until we use order classes)

   gauge_force_init_cuda(qudaGaugeParam, max_length);

   profileGaugeForce.Stop(QUDA_PROFILE_CONSTANT);


   // actually do the computation

   profileGaugeForce.Start(QUDA_PROFILE_COMPUTE);

   gauge_force_cuda(*cudaMom, eb3, *cudaGauge, qudaGaugeParam, input_path_buf,

       path_length, loop_coeff, num_paths, max_length);

   profileGaugeForce.Stop(QUDA_PROFILE_COMPUTE);


   // still need to copy this back even when preserving

   profileGaugeForce.Start(QUDA_PROFILE_D2H);

   cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);

   profileGaugeForce.Stop(QUDA_PROFILE_D2H);


   profileGaugeForce.Start(QUDA_PROFILE_FREE);

   if (qudaGaugeParam->make_resident_gauge) {

     if (gaugePrecise && gaugePrecise != cudaSiteLink) delete gaugePrecise;

     gaugePrecise = cudaSiteLink;

   } else {

     delete cudaSiteLink;

   }


   if (qudaGaugeParam->make_resident_mom) {

     if (momResident && momResident != cudaMom) delete momResident;

     momResident = cudaMom;

   } else {

     delete cudaMom;

   }


   delete cpuSiteLink;

   delete cpuMom;


 #ifdef MULTI_GPU

   delete cudaGauge;

 #endif

   profileGaugeForce.Stop(QUDA_PROFILE_FREE);


   profileGaugeForce.Stop(QUDA_PROFILE_TOTAL);


   if(timeinfo){

     timeinfo[0] = profileGaugeForce.Last(QUDA_PROFILE_H2D);

     timeinfo[1] = profileGaugeForce.Last(QUDA_PROFILE_COMPUTE);

     timeinfo[2] = profileGaugeForce.Last(QUDA_PROFILE_D2H);

   }


   checkCudaError();

 #else

   errorQuda("Gauge force has not been built");

 #endif // GPU_GAUGE_FORCE

   return 0;

 }


 void createCloverQuda(QudaInvertParam* invertParam)

 {

   profileCloverCreate.Start(QUDA_PROFILE_TOTAL);

   profileCloverCreate.Start(QUDA_PROFILE_INIT);

   if(!cloverPrecise){

     printfQuda("About to create cloverPrecise\n");

     CloverFieldParam cloverParam;

     cloverParam.nDim = 4;

     for(int dir=0; dir<4; ++dir) cloverParam.x[dir] = gaugePrecise->X()[dir];

     cloverParam.setPrecision(invertParam->clover_cuda_prec);

     cloverParam.pad = invertParam->cl_pad;

     cloverParam.direct = true;

     cloverParam.inverse = true;

     cloverParam.norm    = 0;

     cloverParam.invNorm = 0;

     cloverParam.twisted = false;

     cloverParam.create = QUDA_NULL_FIELD_CREATE;

     cloverParam.siteSubset = QUDA_FULL_SITE_SUBSET;

     cloverParam.setPrecision(invertParam->cuda_prec);

     if (invertParam->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)

     {

       cloverParam.direct = true;

       cloverParam.inverse = false;

       cloverPrecise = new cudaCloverField(cloverParam);

       cloverParam.inverse = true;

       cloverParam.direct = false;

       cloverParam.twisted = true;

       cloverParam.mu2 = 4.*invertParam->kappa*invertParam->kappa*invertParam->mu*invertParam->mu;

       cloverInvPrecise = new cudaCloverField(cloverParam);      //FIXME Only with tmClover

     } else {

       cloverPrecise = new cudaCloverField(cloverParam);

     }

   }


   int R[4] = {2,2,2,2}; // radius of the extended region in each dimension / direction

   int y[4];

   for(int dir=0; dir<4; ++dir) y[dir] = gaugePrecise->X()[dir] + 2*R[dir];

   int pad = 0;

   // clover creation not supported from 8-reconstruct presently so convert to 12

   QudaReconstructType recon = (gaugePrecise->Reconstruct() == QUDA_RECONSTRUCT_8) ?

     QUDA_RECONSTRUCT_12 : gaugePrecise->Reconstruct();

   GaugeFieldParam gParamEx(y, gaugePrecise->Precision(), recon, pad,

                            QUDA_VECTOR_GEOMETRY, QUDA_GHOST_EXCHANGE_EXTENDED);

   gParamEx.create = QUDA_ZERO_FIELD_CREATE;

   gParamEx.order = gaugePrecise->Order();

   gParamEx.siteSubset = QUDA_FULL_SITE_SUBSET;

   gParamEx.t_boundary = gaugePrecise->TBoundary();

   gParamEx.nFace = 1;

   for (int d=0; d<4; d++) gParamEx.r[d] = R[d];


   cudaGaugeField *cudaGaugeExtended = NULL;

   if (extendedGaugeResident) {

     cudaGaugeExtended = extendedGaugeResident;

     profileCloverCreate.Stop(QUDA_PROFILE_INIT);

   } else {

     cudaGaugeExtended = new cudaGaugeField(gParamEx);


     // copy gaugePrecise into the extended device gauge field

     copyExtendedGauge(*cudaGaugeExtended, *gaugePrecise, QUDA_CUDA_FIELD_LOCATION);

 #if 1

     profileCloverCreate.Stop(QUDA_PROFILE_INIT);

     profileCloverCreate.Start(QUDA_PROFILE_COMMS);

     cudaGaugeExtended->exchangeExtendedGhost(R,true);

     profileCloverCreate.Stop(QUDA_PROFILE_COMMS);

 #else


     GaugeFieldParam gParam(gaugePrecise->X(), gaugePrecise->Precision(), QUDA_RECONSTRUCT_NO,

         pad, QUDA_VECTOR_GEOMETRY, QUDA_GHOST_EXCHANGE_NO);

     gParam.create = QUDA_ZERO_FIELD_CREATE;

     gParam.order = QUDA_MILC_GAUGE_ORDER;

     gParam.siteSubset = QUDA_FULL_SITE_SUBSET;

     gParam.t_boundary = gaugePrecise->TBoundary();

     gParam.nFace = 1;


     // create an extended gauge field on the host

     for(int dir=0; dir<4; ++dir) gParam.x[dir] += 4;

     cpuGaugeField cpuGaugeExtended(gParam);

     cudaGaugeExtended->saveCPUField(cpuGaugeExtended, QUDA_CPU_FIELD_LOCATION);


     profileCloverCreate.Stop(QUDA_PROFILE_INIT);

     // communicate data

     profileCloverCreate.Start(QUDA_PROFILE_COMMS);

     //exchange_cpu_sitelink_ex(const_cast<int*>(gaugePrecise->X()), R, (void**)cpuGaugeExtended.Gauge_p(),

     //                     cpuGaugeExtended.Order(),cpuGaugeExtended.Precision(), 0, 4);

     cpuGaugeExtended.exchangeExtendedGhost(R,true);


     cudaGaugeExtended->loadCPUField(cpuGaugeExtended, QUDA_CPU_FIELD_LOCATION);

     profileCloverCreate.Stop(QUDA_PROFILE_COMMS);

 #endif

   }


   profileCloverCreate.Start(QUDA_PROFILE_COMPUTE);

 #ifdef MULTI_GPU

   computeClover(*cloverPrecise, *cudaGaugeExtended, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION);

 #else

   computeClover(*cloverPrecise, *gaugePrecise, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION);

 #endif


   if (invertParam->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)

 #ifdef MULTI_GPU

     computeClover(*cloverInvPrecise, *cudaGaugeExtended, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION);  //FIXME Only with tmClover

 #else

     computeClover(*cloverInvPrecise, *gaugePrecise, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION);       //FIXME Only with tmClover

 #endif


   profileCloverCreate.Stop(QUDA_PROFILE_COMPUTE);


   profileCloverCreate.Stop(QUDA_PROFILE_TOTAL);


   // FIXME always preserve the extended gauge

   extendedGaugeResident = cudaGaugeExtended;


   return;

 }


 void* createGaugeField(void* gauge, int geometry, QudaGaugeParam* param)

 {


   GaugeFieldParam gParam(0,*param);

   if(geometry == 1){

     gParam.geometry = QUDA_SCALAR_GEOMETRY;

   }else if(geometry == 4){

     gParam.geometry = QUDA_VECTOR_GEOMETRY;

   }else{

     errorQuda("Only scalar and vector geometries are supported\n");

   }

   gParam.pad = 0;

   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

   gParam.gauge = gauge;

   gParam.link_type = QUDA_GENERAL_LINKS;


   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;

   gParam.create = QUDA_ZERO_FIELD_CREATE;

   cudaGaugeField* cudaGauge = new cudaGaugeField(gParam);

   if(gauge){

     gParam.order = QUDA_MILC_GAUGE_ORDER;

     gParam.create = QUDA_REFERENCE_FIELD_CREATE;

     cpuGaugeField cpuGauge(gParam);

     cudaGauge->loadCPUField(cpuGauge,QUDA_CPU_FIELD_LOCATION);

   }

   return cudaGauge;

 }


 void saveGaugeField(void* gauge, void* inGauge, QudaGaugeParam* param){


   cudaGaugeField* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);


   GaugeFieldParam gParam(0,*param);

   gParam.geometry = cudaGauge->Geometry();

   gParam.pad = 0;

   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

   gParam.gauge = gauge;

   gParam.link_type = QUDA_GENERAL_LINKS;

   gParam.order = QUDA_MILC_GAUGE_ORDER;

   gParam.create = QUDA_REFERENCE_FIELD_CREATE;


   cpuGaugeField cpuGauge(gParam);

   cudaGauge->saveCPUField(cpuGauge,QUDA_CPU_FIELD_LOCATION);

 }


 void* createExtendedGaugeField(void* gauge, int geometry, QudaGaugeParam* param)

 {

   profileExtendedGauge.Start(QUDA_PROFILE_TOTAL);


   if (param->use_resident_gauge && extendedGaugeResident && geometry == 4) {

     profileExtendedGauge.Stop(QUDA_PROFILE_TOTAL);

     return extendedGaugeResident;

   }


   profileExtendedGauge.Start(QUDA_PROFILE_INIT);


   QudaFieldGeometry geom = QUDA_INVALID_GEOMETRY;

   if (geometry == 1) {

     geom = QUDA_SCALAR_GEOMETRY;

   } else if(geometry == 4) {

     geom = QUDA_VECTOR_GEOMETRY;

   } else {

     errorQuda("Only scalar and vector geometries are supported");

   }


   cpuGaugeField* cpuGauge = NULL;

   cudaGaugeField* cudaGauge = NULL;


   // Create the unextended cpu field

   GaugeFieldParam gParam(0, *param);

   gParam.order          =  QUDA_MILC_GAUGE_ORDER;

   gParam.pad            = 0;

   gParam.link_type      = param->type;

   gParam.ghostExchange  = QUDA_GHOST_EXCHANGE_NO;

   gParam.create         = QUDA_REFERENCE_FIELD_CREATE;

   gParam.gauge          = gauge;

   gParam.geometry       = geom;


   if(gauge){

     cpuGauge  = new cpuGaugeField(gParam);

     // Create the unextended GPU field

     gParam.order  = QUDA_FLOAT2_GAUGE_ORDER;

     gParam.create = QUDA_NULL_FIELD_CREATE;

     cudaGauge     = new cudaGaugeField(gParam);

     profileExtendedGauge.Stop(QUDA_PROFILE_INIT);


     // load the data into the unextended device field

     profileExtendedGauge.Start(QUDA_PROFILE_H2D);

     cudaGauge->loadCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);

     profileExtendedGauge.Stop(QUDA_PROFILE_H2D);


     profileExtendedGauge.Start(QUDA_PROFILE_INIT);

   }


   QudaGaugeParam param_ex;

   memcpy(&param_ex, param, sizeof(QudaGaugeParam));

   for(int dir=0; dir<4; ++dir) param_ex.X[dir] = param->X[dir]+4;

   GaugeFieldParam gParam_ex(0, param_ex);

   gParam_ex.link_type     = param->type;

   gParam_ex.geometry      = geom;

   gParam_ex.order         = QUDA_FLOAT2_GAUGE_ORDER;

   gParam_ex.create        = QUDA_ZERO_FIELD_CREATE;

   gParam_ex.pad           = 0;

   gParam_ex.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

   // create the extended gauge field

   cudaGaugeField* cudaGaugeEx = new cudaGaugeField(gParam_ex);


   // copy data from the interior into the border region

   if(gauge) copyExtendedGauge(*cudaGaugeEx, *cudaGauge, QUDA_CUDA_FIELD_LOCATION);


   profileExtendedGauge.Stop(QUDA_PROFILE_INIT);

   if(gauge){

     int R[4] = {2,2,2,2};

     // communicate

     profileExtendedGauge.Start(QUDA_PROFILE_COMMS);

     cudaGaugeEx->exchangeExtendedGhost(R, true);

     profileExtendedGauge.Stop(QUDA_PROFILE_COMMS);

     if (cpuGauge) delete cpuGauge;

     if (cudaGauge) delete cudaGauge;

   }

   profileExtendedGauge.Stop(QUDA_PROFILE_TOTAL);


   return cudaGaugeEx;

 }


 // extend field on the GPU

 void extendGaugeField(void* out, void* in){

   cudaGaugeField* inGauge   = reinterpret_cast<cudaGaugeField*>(in);

   cudaGaugeField* outGauge  = reinterpret_cast<cudaGaugeField*>(out);


   copyExtendedGauge(*outGauge, *inGauge, QUDA_CUDA_FIELD_LOCATION);


   int R[4] = {2,2,2,2};

   outGauge->exchangeExtendedGhost(R,true);


   return;

 }


 void destroyQudaGaugeField(void* gauge){

   cudaGaugeField* g = reinterpret_cast<cudaGaugeField*>(gauge);

   delete g;

 }


 void computeCloverTraceQuda(void *out,

     void *clov,

     int mu,

     int nu,

     int dim[4])

 {


   profileCloverTrace.Start(QUDA_PROFILE_TOTAL);


   cudaGaugeField* cudaGauge = reinterpret_cast<cudaGaugeField*>(out);


   if(cloverPrecise){

     computeCloverSigmaTrace(*cudaGauge, *cloverPrecise, mu, nu,  QUDA_CUDA_FIELD_LOCATION);

     //computeCloverSigmaTrace(*cudaGauge, cudaClover, mu, nu,  QUDA_CUDA_FIELD_LOCATION);

   }else{

     errorQuda("cloverPrecise not set\n");

   }

   profileCloverTrace.Stop(QUDA_PROFILE_TOTAL);

   return;

 }


 void computeCloverDerivativeQuda(void* out,

     void* gauge,

     void* oprod,

     int mu, int nu,

     double coeff,

     QudaParity parity,

     QudaGaugeParam* param,

     int conjugate)

 {

   profileCloverDerivative.Start(QUDA_PROFILE_TOTAL);


   checkGaugeParam(param);


   profileCloverDerivative.Start(QUDA_PROFILE_INIT);

 #ifndef USE_EXTENDED_VOLUME

 #define USE_EXTENDED_VOLUME

 #endif


   // create host fields

   GaugeFieldParam gParam(0, *param);

   gParam.order = QUDA_MILC_GAUGE_ORDER;

   gParam.pad = 0;

   gParam.geometry = QUDA_SCALAR_GEOMETRY;

   gParam.link_type = QUDA_GENERAL_LINKS;

   gParam.create = QUDA_REFERENCE_FIELD_CREATE;

   //  gParam.gauge = out;

   //  cpuGaugeField cpuOut(gParam);

 #ifndef USE_EXTENDED_VOLUME

   gParam.geometry = QUDA_SCALAR_GEOMETRY;

   gParam.link_type = QUDA_GENERAL_LINKS;

   gParam.gauge = oprod;

   cpuGaugeField cpuOprod(gParam);


   gParam.geometry = QUDA_VECTOR_GEOMETRY;

   gParam.link_type = QUDA_SU3_LINKS;

   gParam.gauge = gauge;

   cpuGaugeField cpuGauge(gParam);

 #endif


   /*

   // create device fields

   gParam.geometry = QUDA_SCALAR_GEOMETRY;

   gParam.link_type = QUDA_GENERAL_LINKS;

   gParam.create = QUDA_NULL_FIELD_CREATE;

   //  gParam.pad = getGaugePadding(gParam);

   gParam.pad = 0;

   gParam.ghostExchange  = QUDA_GHOST_EXCHANGE_NO;

   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;

   gParam.create = QUDA_ZERO_FIELD_CREATE;

   //  cudaGaugeField cudaOut(gParam);

   */


 #ifndef USE_EXTENDED_VOLUME

   cudaGaugeField cudaOprod(gParam);


   gParam.geometry = QUDA_VECTOR_GEOMETRY;

   gParam.link_type = QUDA_SU3_LINKS;

   cudaGaugeField cudaGauge(gParam);

 #endif

   profileCloverDerivative.Stop(QUDA_PROFILE_INIT);


   cudaGaugeField* cudaOut = reinterpret_cast<cudaGaugeField*>(out);

   cudaGaugeField* gPointer = reinterpret_cast<cudaGaugeField*>(gauge);

   cudaGaugeField* oPointer = reinterpret_cast<cudaGaugeField*>(oprod);


   profileCloverDerivative.Start(QUDA_PROFILE_COMPUTE);

   cloverDerivative(*cudaOut, *gPointer, *oPointer, mu, nu, coeff, parity, conjugate);

   profileCloverDerivative.Stop(QUDA_PROFILE_COMPUTE);


   profileCloverDerivative.Start(QUDA_PROFILE_D2H);


   //  saveGaugeField(out, cudaOut, param);

   //  cudaOut->saveCPUField(cpuOut, QUDA_CPU_FIELD_LOCATION);

   profileCloverDerivative.Stop(QUDA_PROFILE_D2H);

   checkCudaError();


   //  delete cudaOut;


   profileCloverDerivative.Stop(QUDA_PROFILE_TOTAL);


   return;

 }


 void computeKSOprodQuda(void* oprod,

     void* fermion,

     double coeff,

     int X[4],

     QudaPrecision prec)


 {

 /*

   using namespace quda;


   cudaGaugeField* cudaOprod;

   cudaColorSpinorField* cudaQuark;


   const int Ls = 1;

   const int Ninternal = 6;

 #ifdef BUILD_TIFR_INTERFACE

   const int Nface = 1;

 #else

   const int Nface = 3;

 #endif

   FaceBuffer fB(X, 4, Ninternal, Nface, prec, Ls);

   cudaOprod = reinterpret_cast<cudaGaugeField*>(oprod);

   cudaQuark = reinterpret_cast<cudaColorSpinorField*>(fermion);


   double new_coeff[2] = {0,0};

   new_coeff[0] = coeff;

   // Operate on even-parity sites

   computeStaggeredOprod(*cudaOprod, *cudaOprod, *cudaQuark, fB, 0, new_coeff);


   // Operator on odd-parity sites

   computeStaggeredOprod(*cudaOprod, *cudaOprod, *cudaQuark, fB, 1, new_coeff);


 */

   return;

 }


 void computeStaggeredForceQuda(void* cudaMom, void* qudaQuark, double coeff)

 {

   bool use_resident_solution = false;

   if (solutionResident) {

     qudaQuark = solutionResident;

     use_resident_solution = true;

   } else {

     errorQuda("No input quark field defined");

   }


   if (momResident) {

     cudaMom = momResident;

   } else {

     errorQuda("No input momentum defined");

   }


   if (!gaugePrecise) {

     errorQuda("No resident gauge field");

   }


   int pad = 0;

   GaugeFieldParam oParam(gaugePrecise->X(), gaugePrecise->Precision(), QUDA_RECONSTRUCT_NO,

       pad, QUDA_VECTOR_GEOMETRY, QUDA_GHOST_EXCHANGE_NO);

   oParam.create = QUDA_ZERO_FIELD_CREATE;

   oParam.order  = QUDA_FLOAT2_GAUGE_ORDER;

   oParam.siteSubset = QUDA_FULL_SITE_SUBSET;

   oParam.t_boundary = QUDA_PERIODIC_T;

   oParam.nFace = 1;


   // create temporary field for quark-field outer product

   cudaGaugeField cudaOprod(oParam);


   // compute quark-field outer product

   computeKSOprodQuda(&cudaOprod, qudaQuark, coeff,

       const_cast<int*>(gaugePrecise->X()),

       gaugePrecise->Precision());


   cudaGaugeField* mom = reinterpret_cast<cudaGaugeField*>(cudaMom);


   completeKSForce(*mom, cudaOprod, *gaugePrecise, QUDA_CUDA_FIELD_LOCATION);


   if (use_resident_solution) {

     delete solutionResident;

     solutionResident = NULL;

   }


   return;

 }


 void computeAsqtadForceQuda(void* const milc_momentum,

     long long *flops,

     const double act_path_coeff[6],

     const void* const one_link_src[4],

     const void* const naik_src[4],

     const void* const link,

     const QudaGaugeParam* gParam)

 {


 #ifdef GPU_HISQ_FORCE

   long long partialFlops;

   using namespace quda::fermion_force;

   profileAsqtadForce.Start(QUDA_PROFILE_TOTAL);

   profileAsqtadForce.Start(QUDA_PROFILE_INIT);


   cudaGaugeField *cudaGauge = NULL;

   cpuGaugeField *cpuGauge = NULL;

   cudaGaugeField *cudaInForce = NULL;

   cpuGaugeField *cpuOneLinkInForce = NULL;

   cpuGaugeField *cpuNaikInForce = NULL;

   cudaGaugeField *cudaOutForce = NULL;

   cudaGaugeField *cudaMom = NULL;

   cpuGaugeField *cpuMom = NULL;


 #ifdef MULTI_GPU

   cudaGaugeField *cudaGauge_ex = NULL;

   cudaGaugeField *cudaInForce_ex = NULL;

   cudaGaugeField *cudaOutForce_ex = NULL;

 #endif


   GaugeFieldParam param(0, *gParam);

   param.create = QUDA_NULL_FIELD_CREATE;

   param.anisotropy = 1.0;

   param.siteSubset = QUDA_FULL_SITE_SUBSET;

   param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

   param.t_boundary = QUDA_PERIODIC_T;

   param.nFace = 1;


   param.link_type = QUDA_GENERAL_LINKS;

   param.reconstruct = QUDA_RECONSTRUCT_NO;

   param.create = QUDA_REFERENCE_FIELD_CREATE;


   // create host fields

   param.gauge = (void*)link;

   cpuGauge = new cpuGaugeField(param);


   param.order = QUDA_QDP_GAUGE_ORDER;

   param.gauge = (void*)one_link_src;

   cpuOneLinkInForce = new cpuGaugeField(param);


   param.gauge = (void*)naik_src;

   cpuNaikInForce = new cpuGaugeField(param);


   param.order = QUDA_MILC_GAUGE_ORDER;

   param.link_type = QUDA_ASQTAD_MOM_LINKS;

   param.reconstruct = QUDA_RECONSTRUCT_10;

   param.gauge = milc_momentum;

   cpuMom = new cpuGaugeField(param);


   // create device fields

   param.create = QUDA_NULL_FIELD_CREATE;

   param.link_type = QUDA_GENERAL_LINKS;

   param.reconstruct = QUDA_RECONSTRUCT_NO;

   param.order =  QUDA_FLOAT2_GAUGE_ORDER;


   cudaGauge    = new cudaGaugeField(param);

   cudaInForce  = new cudaGaugeField(param);

   cudaOutForce = new cudaGaugeField(param);


   param.link_type = QUDA_ASQTAD_MOM_LINKS;

   param.reconstruct = QUDA_RECONSTRUCT_10;

   cudaMom = new cudaGaugeField(param);


 #ifdef MULTI_GPU

   for(int dir=0; dir<4; ++dir) param.x[dir] += 4;

   param.link_type = QUDA_GENERAL_LINKS;

   param.create = QUDA_ZERO_FIELD_CREATE;

   param.reconstruct = QUDA_RECONSTRUCT_NO;


   cudaGauge_ex    = new cudaGaugeField(param);

   cudaInForce_ex  = new cudaGaugeField(param);

   cudaOutForce_ex = new cudaGaugeField(param);

 #endif

   profileAsqtadForce.Stop(QUDA_PROFILE_INIT);


 #ifdef MULTI_GPU

   int R[4] = {2, 2, 2, 2};

 #endif


   profileAsqtadForce.Start(QUDA_PROFILE_H2D);

   cudaGauge->loadCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);

   profileAsqtadForce.Stop(QUDA_PROFILE_H2D);

 #ifdef MULTI_GPU

   cudaMemset((void**)(cudaInForce_ex->Gauge_p()), 0, cudaInForce_ex->Bytes());

   copyExtendedGauge(*cudaGauge_ex, *cudaGauge, QUDA_CUDA_FIELD_LOCATION);

   cudaGauge_ex->exchangeExtendedGhost(R,true);

 #endif


   profileAsqtadForce.Start(QUDA_PROFILE_H2D);

   cudaInForce->loadCPUField(*cpuOneLinkInForce, QUDA_CPU_FIELD_LOCATION);

   profileAsqtadForce.Stop(QUDA_PROFILE_H2D);

 #ifdef MULTI_GPU

   cudaMemset((void**)(cudaInForce_ex->Gauge_p()), 0, cudaInForce_ex->Bytes());

   copyExtendedGauge(*cudaInForce_ex, *cudaInForce, QUDA_CUDA_FIELD_LOCATION);

   cudaInForce_ex->exchangeExtendedGhost(R,true);

 #endif


   cudaMemset((void**)(cudaOutForce->Gauge_p()), 0, cudaOutForce->Bytes());

   profileAsqtadForce.Start(QUDA_PROFILE_COMPUTE);

 #ifdef MULTI_GPU

   cudaMemset((void**)(cudaOutForce_ex->Gauge_p()), 0, cudaOutForce_ex->Bytes());

   hisqStaplesForceCuda(act_path_coeff, *gParam, *cudaInForce_ex, *cudaGauge_ex, cudaOutForce_ex, &partialFlops);

   *flops += partialFlops;

 #else

   hisqStaplesForceCuda(act_path_coeff, *gParam, *cudaInForce, *cudaGauge, cudaOutForce, &partialFlops);

   *flops += partialFlops;

 #endif

   profileAsqtadForce.Stop(QUDA_PROFILE_COMPUTE);


   profileAsqtadForce.Start(QUDA_PROFILE_H2D);

   cudaInForce->loadCPUField(*cpuNaikInForce, QUDA_CPU_FIELD_LOCATION);

 #ifdef MULTI_GPU

   copyExtendedGauge(*cudaInForce_ex, *cudaInForce, QUDA_CUDA_FIELD_LOCATION);

   cudaInForce_ex->exchangeExtendedGhost(R,true);

 #endif

   profileAsqtadForce.Stop(QUDA_PROFILE_H2D);


   profileAsqtadForce.Start(QUDA_PROFILE_COMPUTE);

 #ifdef MULTI_GPU

   hisqLongLinkForceCuda(act_path_coeff[1], *gParam, *cudaInForce_ex, *cudaGauge_ex, cudaOutForce_ex, &partialFlops);

   *flops += partialFlops;

   completeKSForce(*cudaMom, *cudaOutForce_ex, *cudaGauge_ex, QUDA_CUDA_FIELD_LOCATION, &partialFlops);

   *flops += partialFlops;

 #else

   hisqLongLinkForceCuda(act_path_coeff[1], *gParam, *cudaInForce, *cudaGauge, cudaOutForce, &partialFlops);

   *flops += partialFlops;

   hisqCompleteForceCuda(*gParam, *cudaOutForce, *cudaGauge, cudaMom, &partialFlops);

   *flops += partialFlops;

 #endif

   profileAsqtadForce.Stop(QUDA_PROFILE_COMPUTE);


   profileAsqtadForce.Start(QUDA_PROFILE_D2H);

   cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);

   profileAsqtadForce.Stop(QUDA_PROFILE_D2H);


   profileAsqtadForce.Start(QUDA_PROFILE_FREE);

   delete cudaInForce;

   delete cudaOutForce;

   delete cudaGauge;

   delete cudaMom;

 #ifdef MULTI_GPU

   delete cudaInForce_ex;

   delete cudaOutForce_ex;

   delete cudaGauge_ex;

 #endif


   delete cpuGauge;

   delete cpuOneLinkInForce;

   delete cpuNaikInForce;

   delete cpuMom;


   profileAsqtadForce.Stop(QUDA_PROFILE_FREE);


   profileAsqtadForce.Stop(QUDA_PROFILE_TOTAL);

   return;


 #else

   errorQuda("HISQ force has not been built");

 #endif


 }


 void

 computeHISQForceCompleteQuda(void* const milc_momentum,

                              const double level2_coeff[6],

                              const double fat7_coeff[6],

                              void** quark_array,

                              int num_terms,

                              double** quark_coeff,

                              const void* const w_link,

                              const void* const v_link,

                              const void* const u_link,

                              const QudaGaugeParam* gParam)

 {


 /*

   void* oprod[2];


   computeStaggeredOprodQuda(void** oprod,

     void** fermion,

     int num_terms,

     double** coeff,

     QudaGaugeParam* gParam)


   computeHISQForceQuda(milc_momentum,

                        level2_coeff,

                        fat7_coeff,

                        staple_src,

                        one_link_src,

                        naik_src,

                        w_link,

                        v_link,

                        u_link,

                        gParam);


 */

   return;

 }


   void

 computeHISQForceQuda(void* const milc_momentum,

     long long *flops,

     const double level2_coeff[6],

     const double fat7_coeff[6],

     const void* const staple_src[4],

     const void* const one_link_src[4],

     const void* const naik_src[4],

     const void* const w_link,

     const void* const v_link,

     const void* const u_link,

     const QudaGaugeParam* gParam)

 {

 #ifdef GPU_HISQ_FORCE


   long long partialFlops;


   using namespace quda::fermion_force;

   profileHISQForce.Start(QUDA_PROFILE_TOTAL);

   profileHISQForce.Start(QUDA_PROFILE_INIT);


   double act_path_coeff[6] = {0,1,level2_coeff[2],level2_coeff[3],level2_coeff[4],level2_coeff[5]};

   // You have to look at the MILC routine to understand the following

   // Basically, I have already absorbed the one-link coefficient


   GaugeFieldParam param(0, *gParam);

   param.create = QUDA_REFERENCE_FIELD_CREATE;

   param.order  = QUDA_MILC_GAUGE_ORDER;

   param.link_type = QUDA_ASQTAD_MOM_LINKS;

   param.reconstruct = QUDA_RECONSTRUCT_10;

   param.gauge = (void*)milc_momentum;

   cpuGaugeField* cpuMom = new cpuGaugeField(param);


   param.create = QUDA_ZERO_FIELD_CREATE;

   param.order  = QUDA_FLOAT2_GAUGE_ORDER;

   cudaGaugeField* cudaMom = new cudaGaugeField(param);


   param.order = QUDA_MILC_GAUGE_ORDER;

   param.link_type = QUDA_GENERAL_LINKS;

   param.reconstruct = QUDA_RECONSTRUCT_NO;

   param.create = QUDA_REFERENCE_FIELD_CREATE;

   param.gauge = (void*)w_link;

   cpuGaugeField cpuWLink(param);

   param.gauge = (void*)v_link;

   cpuGaugeField cpuVLink(param);

   param.gauge = (void*)u_link;

   cpuGaugeField cpuULink(param);

   param.create = QUDA_ZERO_FIELD_CREATE;


   param.ghostExchange =  QUDA_GHOST_EXCHANGE_NO;

   param.order = QUDA_FLOAT2_GAUGE_ORDER;

   cudaGaugeField* cudaGauge = new cudaGaugeField(param);


   cpuGaugeField* cpuStapleForce;

   cpuGaugeField* cpuOneLinkForce;

   cpuGaugeField* cpuNaikForce;


   param.order = QUDA_QDP_GAUGE_ORDER;

   param.create = QUDA_REFERENCE_FIELD_CREATE;

   param.gauge = (void*)staple_src;

   cpuStapleForce = new cpuGaugeField(param);

   param.gauge = (void*)one_link_src;

   cpuOneLinkForce = new cpuGaugeField(param);

   param.gauge = (void*)naik_src;

   cpuNaikForce = new cpuGaugeField(param);

   param.create = QUDA_ZERO_FIELD_CREATE;


   param.ghostExchange =  QUDA_GHOST_EXCHANGE_NO;

   param.link_type = QUDA_GENERAL_LINKS;

   param.precision = gParam->cpu_prec;


   param.order = QUDA_FLOAT2_GAUGE_ORDER;

   cudaGaugeField* cudaInForce  = new cudaGaugeField(param);


 #ifdef MULTI_GPU

   for(int dir=0; dir<4; ++dir) param.x[dir] += 4;

   param.reconstruct = QUDA_RECONSTRUCT_NO;

   param.create = QUDA_ZERO_FIELD_CREATE;

   cudaGaugeField* cudaGaugeEx = new cudaGaugeField(param);

   cudaGaugeField* cudaInForceEx = new cudaGaugeField(param);

   cudaGaugeField* cudaOutForceEx = new cudaGaugeField(param);

   cudaGaugeField* gaugePtr = cudaGaugeEx;

   cudaGaugeField* inForcePtr = cudaInForceEx;

   cudaGaugeField* outForcePtr = cudaOutForceEx;

 #else

   cudaGaugeField* cudaOutForce = new cudaGaugeField(param);

   cudaGaugeField* gaugePtr = cudaGauge;

   cudaGaugeField* inForcePtr = cudaInForce;

   cudaGaugeField* outForcePtr = cudaOutForce;

 #endif


   {

     // default settings for the unitarization

     const double unitarize_eps = 1e-14;

     const double hisq_force_filter = 5e-5;

     const double max_det_error = 1e-10;

     const bool   allow_svd = true;

     const bool   svd_only = false;

     const double svd_rel_err = 1e-8;

     const double svd_abs_err = 1e-8;


     setUnitarizeForceConstants(unitarize_eps,

         hisq_force_filter,

         max_det_error,

         allow_svd,

         svd_only,

         svd_rel_err,

         svd_abs_err);

   }

   profileHISQForce.Stop(QUDA_PROFILE_INIT);


   profileHISQForce.Start(QUDA_PROFILE_H2D);

   cudaGauge->loadCPUField(cpuWLink, QUDA_CPU_FIELD_LOCATION);

   profileHISQForce.Stop(QUDA_PROFILE_H2D);

 #ifdef MULTI_GPU

   int R[4] = {2, 2, 2, 2};

   profileHISQForce.Start(QUDA_PROFILE_COMMS);

   copyExtendedGauge(*cudaGaugeEx, *cudaGauge, QUDA_CUDA_FIELD_LOCATION);

   cudaGaugeEx->exchangeExtendedGhost(R,true);

   profileHISQForce.Stop(QUDA_PROFILE_COMMS);

 #endif


   profileHISQForce.Start(QUDA_PROFILE_H2D);

   cudaInForce->loadCPUField(*cpuStapleForce, QUDA_CPU_FIELD_LOCATION);

   profileHISQForce.Stop(QUDA_PROFILE_H2D);

 #ifdef MULTI_GPU

   profileHISQForce.Start(QUDA_PROFILE_COMMS);

   copyExtendedGauge(*cudaInForceEx, *cudaInForce, QUDA_CUDA_FIELD_LOCATION);

   cudaInForceEx->exchangeExtendedGhost(R,true);

   profileHISQForce.Stop(QUDA_PROFILE_COMMS);

   profileHISQForce.Start(QUDA_PROFILE_H2D);

   cudaInForce->loadCPUField(*cpuOneLinkForce, QUDA_CPU_FIELD_LOCATION);

   profileHISQForce.Stop(QUDA_PROFILE_H2D);

   profileHISQForce.Start(QUDA_PROFILE_COMMS);

   copyExtendedGauge(*cudaOutForceEx, *cudaInForce, QUDA_CUDA_FIELD_LOCATION);

   cudaOutForceEx->exchangeExtendedGhost(R,true);

   profileHISQForce.Stop(QUDA_PROFILE_COMMS);

 #else

   profileHISQForce.Start(QUDA_PROFILE_H2D);

   cudaOutForce->loadCPUField(*cpuOneLinkForce, QUDA_CPU_FIELD_LOCATION);

   profileHISQForce.Stop(QUDA_PROFILE_H2D);

 #endif


   profileHISQForce.Start(QUDA_PROFILE_COMPUTE);

   hisqStaplesForceCuda(act_path_coeff, *gParam, *inForcePtr, *gaugePtr, outForcePtr, &partialFlops);

   *flops += partialFlops;

   profileHISQForce.Stop(QUDA_PROFILE_COMPUTE);


   // Load naik outer product

   profileHISQForce.Start(QUDA_PROFILE_H2D);

   cudaInForce->loadCPUField(*cpuNaikForce, QUDA_CPU_FIELD_LOCATION);

   profileHISQForce.Stop(QUDA_PROFILE_H2D);

 #ifdef MULTI_GPU

   profileHISQForce.Start(QUDA_PROFILE_COMMS);

   copyExtendedGauge(*cudaInForceEx, *cudaInForce, QUDA_CUDA_FIELD_LOCATION);

   cudaInForceEx->exchangeExtendedGhost(R,true);

   profileHISQForce.Stop(QUDA_PROFILE_COMMS);

 #endif


   // Compute Naik three-link term

   profileHISQForce.Start(QUDA_PROFILE_COMPUTE);

   hisqLongLinkForceCuda(act_path_coeff[1], *gParam, *inForcePtr, *gaugePtr, outForcePtr, &partialFlops);

   *flops += partialFlops;

   profileHISQForce.Stop(QUDA_PROFILE_COMPUTE);

 #ifdef MULTI_GPU

   profileHISQForce.Start(QUDA_PROFILE_COMMS);

   cudaOutForceEx->exchangeExtendedGhost(R,true);

   profileHISQForce.Stop(QUDA_PROFILE_COMMS);

 #endif

   // load v-link

   profileHISQForce.Start(QUDA_PROFILE_H2D);

   cudaGauge->loadCPUField(cpuVLink, QUDA_CPU_FIELD_LOCATION);

   profileHISQForce.Stop(QUDA_PROFILE_H2D);

 #ifdef MULTI_GPU

   profileHISQForce.Start(QUDA_PROFILE_COMMS);

   copyExtendedGauge(*cudaGaugeEx, *cudaGauge, QUDA_CUDA_FIELD_LOCATION);

   cudaGaugeEx->exchangeExtendedGhost(R,true);

   profileHISQForce.Stop(QUDA_PROFILE_COMMS);

 #endif

   // Done with cudaInForce. It becomes the output force. Oops!

   profileHISQForce.Start(QUDA_PROFILE_INIT);

   int numFailures = 0;

   int* numFailuresDev;


   if(cudaMalloc((void**)&numFailuresDev, sizeof(int)) == cudaErrorMemoryAllocation){

     errorQuda("cudaMalloc failed for numFailuresDev\n");

   }

   cudaMemset(numFailuresDev, 0, sizeof(int));

   profileHISQForce.Stop(QUDA_PROFILE_INIT);


   profileHISQForce.Start(QUDA_PROFILE_COMPUTE);

   unitarizeForceCuda(*outForcePtr, *gaugePtr, inForcePtr, numFailuresDev, &partialFlops);

   *flops += partialFlops;

   profileHISQForce.Stop(QUDA_PROFILE_COMPUTE);

   profileHISQForce.Start(QUDA_PROFILE_D2H);

   cudaMemcpy(&numFailures, numFailuresDev, sizeof(int), cudaMemcpyDeviceToHost);

   profileHISQForce.Stop(QUDA_PROFILE_D2H);

   cudaFree(numFailuresDev);


   if(numFailures>0){

     errorQuda("Error in the unitarization component of the hisq fermion force\n");

     exit(1);

   }

   cudaMemset((void**)(outForcePtr->Gauge_p()), 0, outForcePtr->Bytes());

   // read in u-link

   profileHISQForce.Start(QUDA_PROFILE_COMPUTE);

   cudaGauge->loadCPUField(cpuULink, QUDA_CPU_FIELD_LOCATION);

   profileHISQForce.Stop(QUDA_PROFILE_COMPUTE);

 #ifdef MULTI_GPU

   profileHISQForce.Start(QUDA_PROFILE_COMMS);

   copyExtendedGauge(*cudaGaugeEx, *cudaGauge, QUDA_CUDA_FIELD_LOCATION);

   cudaGaugeEx->exchangeExtendedGhost(R,true);

   profileHISQForce.Stop(QUDA_PROFILE_COMMS);

 #endif

   // Compute Fat7-staple term

   profileHISQForce.Start(QUDA_PROFILE_COMPUTE);

   hisqStaplesForceCuda(fat7_coeff, *gParam, *inForcePtr, *gaugePtr, outForcePtr, &partialFlops);

   *flops += partialFlops;

   hisqCompleteForceCuda(*gParam, *outForcePtr, *gaugePtr, cudaMom, &partialFlops);

   *flops += partialFlops;

   profileHISQForce.Stop(QUDA_PROFILE_COMPUTE);


   profileHISQForce.Start(QUDA_PROFILE_D2H);

   // Close the paths, make anti-hermitian, and store in compressed format

   cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);

   profileHISQForce.Stop(QUDA_PROFILE_D2H);


   profileHISQForce.Start(QUDA_PROFILE_FREE);


   delete cpuStapleForce;

   delete cpuOneLinkForce;

   delete cpuNaikForce;

   delete cpuMom;


   delete cudaInForce;

   delete cudaGauge;

   delete cudaMom;


 #ifdef MULTI_GPU

   delete cudaInForceEx;

   delete cudaOutForceEx;

   delete cudaGaugeEx;

 #else

   delete cudaOutForce;

 #endif

   profileHISQForce.Stop(QUDA_PROFILE_FREE);

   profileHISQForce.Stop(QUDA_PROFILE_TOTAL);

   return;

 #else

   errorQuda("HISQ force has not been built");

 #endif

 }


 void computeStaggeredOprodQuda(void** oprod,

     void** fermion,

     int num_terms,

     double** coeff,

     QudaGaugeParam* param)

 {

   using namespace quda;

   profileStaggeredOprod.Start(QUDA_PROFILE_TOTAL);


   checkGaugeParam(param);


   profileStaggeredOprod.Start(QUDA_PROFILE_INIT);

   GaugeFieldParam oParam(0, *param);


   oParam.nDim = 4;

   oParam.nFace = 0;

   // create the host outer-product field

   oParam.pad = 0;

   oParam.create = QUDA_REFERENCE_FIELD_CREATE;

   oParam.link_type = QUDA_GENERAL_LINKS;

   oParam.reconstruct = QUDA_RECONSTRUCT_NO;

   oParam.order = QUDA_QDP_GAUGE_ORDER;

   oParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

   oParam.gauge = oprod[0];

   oParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; // no need for ghost exchange here

   cpuGaugeField cpuOprod0(oParam);


   oParam.gauge = oprod[1];

   cpuGaugeField cpuOprod1(oParam);


   // create the device outer-product field

   oParam.create = QUDA_ZERO_FIELD_CREATE;

   oParam.order = QUDA_FLOAT2_GAUGE_ORDER;

   cudaGaugeField cudaOprod0(oParam);

   cudaGaugeField cudaOprod1(oParam);

   profileStaggeredOprod.Stop(QUDA_PROFILE_INIT);


   //initLatticeConstants(cudaOprod0, profileStaggeredOprod);


   profileStaggeredOprod.Start(QUDA_PROFILE_H2D);

   cudaOprod0.loadCPUField(cpuOprod0,QUDA_CPU_FIELD_LOCATION);

   cudaOprod1.loadCPUField(cpuOprod1,QUDA_CPU_FIELD_LOCATION);

   profileStaggeredOprod.Stop(QUDA_PROFILE_H2D);


   profileStaggeredOprod.Start(QUDA_PROFILE_INIT);


   ColorSpinorParam qParam;

   qParam.nColor = 3;

   qParam.nSpin = 1;

   qParam.siteSubset = QUDA_PARITY_SITE_SUBSET;

   qParam.fieldOrder = QUDA_SPACE_COLOR_SPIN_FIELD_ORDER;

   qParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;

   qParam.nDim = 4;

   qParam.precision = oParam.precision;

   qParam.pad = 0;

   for(int dir=0; dir<4; ++dir) qParam.x[dir] = oParam.x[dir];

   qParam.x[0] /= 2;


   // create the device quark field

   qParam.create = QUDA_NULL_FIELD_CREATE;

   qParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;

   cudaColorSpinorField cudaQuarkEven(qParam);

   cudaColorSpinorField cudaQuarkOdd(qParam);


   // create the host quark field

   qParam.create = QUDA_REFERENCE_FIELD_CREATE;

   qParam.fieldOrder = QUDA_SPACE_COLOR_SPIN_FIELD_ORDER;


   const int Ls = 1;

   const int Ninternal = 6;

   FaceBuffer faceBuffer1(cudaOprod0.X(), 4, Ninternal, 3, cudaOprod0.Precision(), Ls);

   FaceBuffer faceBuffer2(cudaOprod0.X(), 4, Ninternal, 3, cudaOprod0.Precision(), Ls);

   profileStaggeredOprod.Stop(QUDA_PROFILE_INIT);


   // loop over different quark fields

   for(int i=0; i<num_terms; ++i){


     // Wrap the even-parity MILC quark field

     profileStaggeredOprod.Start(QUDA_PROFILE_INIT);

     qParam.v = fermion[i];

     cpuColorSpinorField cpuQuarkEven(qParam); // create host quark field

     qParam.v = (char*)fermion[i] + cpuQuarkEven.RealLength()*cpuQuarkEven.Precision();

     cpuColorSpinorField cpuQuarkOdd(qParam); // create host field

     profileStaggeredOprod.Stop(QUDA_PROFILE_INIT);


     profileStaggeredOprod.Start(QUDA_PROFILE_H2D);

     cudaQuarkEven = cpuQuarkEven;

     cudaQuarkOdd = cpuQuarkOdd;

     profileStaggeredOprod.Stop(QUDA_PROFILE_H2D);


     profileStaggeredOprod.Start(QUDA_PROFILE_COMPUTE);

     // Operate on even-parity sites

     computeStaggeredOprod(cudaOprod0, cudaOprod1, cudaQuarkEven, cudaQuarkOdd, faceBuffer1, 0, coeff[i]);


     // Operate on odd-parity sites

     computeStaggeredOprod(cudaOprod0, cudaOprod1, cudaQuarkEven, cudaQuarkOdd, faceBuffer2, 1, coeff[i]);

     profileStaggeredOprod.Stop(QUDA_PROFILE_COMPUTE);

   }


   // copy the outer product field back to the host

   profileStaggeredOprod.Start(QUDA_PROFILE_D2H);

   cudaOprod0.saveCPUField(cpuOprod0,QUDA_CPU_FIELD_LOCATION);

   cudaOprod1.saveCPUField(cpuOprod1,QUDA_CPU_FIELD_LOCATION);

   profileStaggeredOprod.Stop(QUDA_PROFILE_D2H);


   profileStaggeredOprod.Stop(QUDA_PROFILE_TOTAL);


   checkCudaError();

   return;

 }


 /*

    void computeStaggeredOprodQuda(void** oprod,

    void** fermion,

    int num_terms,

    double** coeff,

    QudaGaugeParam* param)

    {

    using namespace quda;

    profileStaggeredOprod.Start(QUDA_PROFILE_TOTAL);


    checkGaugeParam(param);


    profileStaggeredOprod.Start(QUDA_PROFILE_INIT);

    GaugeFieldParam oParam(0, *param);


    oParam.nDim = 4;

    oParam.nFace = 0;

 // create the host outer-product field

 oParam.pad = 0;

 oParam.create = QUDA_REFERENCE_FIELD_CREATE;

 oParam.link_type = QUDA_GENERAL_LINKS;

 oParam.reconstruct = QUDA_RECONSTRUCT_NO;

 oParam.order = QUDA_QDP_GAUGE_ORDER;

 oParam.gauge = oprod[0];

 cpuGaugeField cpuOprod0(oParam);


 oParam.gauge = oprod[1];

 cpuGaugeField cpuOprod1(oParam);


 // create the device outer-product field

 oParam.create = QUDA_ZERO_FIELD_CREATE;

 oParam.order = QUDA_FLOAT2_GAUGE_ORDER;

 cudaGaugeField cudaOprod0(oParam);

 cudaGaugeField cudaOprod1(oParam);

 initLatticeConstants(cudaOprod0, profileStaggeredOprod);


 profileStaggeredOprod.Stop(QUDA_PROFILE_INIT);


 profileStaggeredOprod.Start(QUDA_PROFILE_H2D);

 cudaOprod0.loadCPUField(cpuOprod0,QUDA_CPU_FIELD_LOCATION);

 cudaOprod1.loadCPUField(cpuOprod1,QUDA_CPU_FIELD_LOCATION);

 profileStaggeredOprod.Stop(QUDA_PROFILE_H2D);


 ColorSpinorParam qParam;

 qParam.nColor = 3;

 qParam.nSpin = 1;

 qParam.siteSubset = QUDA_FULL_SITE_SUBSET;

 qParam.fieldOrder = QUDA_SPACE_COLOR_SPIN_FIELD_ORDER;

 qParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;

 qParam.nDim = 4;

 qParam.precision = oParam.precision;

 qParam.pad = 0;

 for(int dir=0; dir<4; ++dir) qParam.x[dir] = oParam.x[dir];


 // create the device quark field

 qParam.create = QUDA_NULL_FIELD_CREATE;

 qParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;

 cudaColorSpinorField cudaQuark(qParam);


 cudaColorSpinorField**dQuark = new cudaColorSpinorField*[num_terms];

 for(int i=0; i<num_terms; ++i){

 dQuark[i] = new cudaColorSpinorField(qParam);

 }


 double* new_coeff  = new double[num_terms];


 // create the host quark field

 qParam.create = QUDA_REFERENCE_FIELD_CREATE;

 qParam.fieldOrder = QUDA_SPACE_COLOR_SPIN_FIELD_ORDER;

 for(int i=0; i<num_terms; ++i){

   qParam.v = fermion[i];

   cpuColorSpinorField cpuQuark(qParam);

   *(dQuark[i]) = cpuQuark;

   new_coeff[i] = coeff[i][0];

 }


 // loop over different quark fields

 for(int i=0; i<num_terms; ++i){

   computeKSOprodQuda(&cudaOprod0, dQuark[i], new_coeff[i], oParam.x, oParam.precision);

 }


 // copy the outer product field back to the host

 profileStaggeredOprod.Start(QUDA_PROFILE_D2H);

 cudaOprod0.saveCPUField(cpuOprod0,QUDA_CPU_FIELD_LOCATION);

 cudaOprod1.saveCPUField(cpuOprod1,QUDA_CPU_FIELD_LOCATION);

 profileStaggeredOprod.Stop(QUDA_PROFILE_D2H);


 for(int i=0; i<num_terms; ++i){

   delete dQuark[i];

 }

 delete[] dQuark;

 delete[] new_coeff;


 profileStaggeredOprod.Stop(QUDA_PROFILE_TOTAL);


 checkCudaError();

 return;

 }

 */


 void updateGaugeFieldQuda(void* gauge,

     void* momentum,

     double dt,

     int conj_mom,

     int exact,

     QudaGaugeParam* param)

 {

   profileGaugeUpdate.Start(QUDA_PROFILE_TOTAL);


   checkGaugeParam(param);


   profileGaugeUpdate.Start(QUDA_PROFILE_INIT);

   GaugeFieldParam gParam(0, *param);


   // create the host fields

   gParam.pad = 0;

   gParam.create = QUDA_REFERENCE_FIELD_CREATE;

   gParam.link_type = QUDA_SU3_LINKS;

   gParam.reconstruct = QUDA_RECONSTRUCT_NO;

   gParam.gauge = gauge;

   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

   cpuGaugeField *cpuGauge = new cpuGaugeField(gParam);


   if (gParam.order == QUDA_TIFR_GAUGE_ORDER) {

     gParam.reconstruct = QUDA_RECONSTRUCT_NO;

   } else {

     gParam.reconstruct = QUDA_RECONSTRUCT_10;

   }

   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;


   gParam.gauge = momentum;


   cpuGaugeField *cpuMom = new cpuGaugeField(gParam);


   // create the device fields

   gParam.create = QUDA_NULL_FIELD_CREATE;

   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;

   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;

   gParam.reconstruct = QUDA_RECONSTRUCT_10;


   cudaGaugeField *cudaMom = !param->use_resident_mom ? new cudaGaugeField(gParam) : NULL;


   gParam.pad = param->ga_pad;

   gParam.link_type = QUDA_SU3_LINKS;

   gParam.reconstruct = param->reconstruct;


   cudaGaugeField *cudaInGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : NULL;

   cudaGaugeField *cudaOutGauge = new cudaGaugeField(gParam);


   profileGaugeUpdate.Stop(QUDA_PROFILE_INIT);


   profileGaugeUpdate.Start(QUDA_PROFILE_H2D);


   /*printfQuda("UpdateGaugeFieldQuda use_resident_gauge = %d, make_resident_gauge = %d\n",

     param->use_resident_gauge, param->make_resident_gauge);

     printfQuda("UpdateGaugeFieldQuda use_resident_mom = %d, make_resident_mom = %d\n",

     param->use_resident_mom, param->make_resident_mom);*/


   if (!param->use_resident_gauge) {   // load fields onto the device

     cudaInGauge->loadCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);

   } else { // or use resident fields already present

     if (!gaugePrecise) errorQuda("No resident gauge field allocated");

     cudaInGauge = gaugePrecise;

     gaugePrecise = NULL;

   }


   if (!param->use_resident_mom) {

     cudaMom->loadCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);

   } else {

     if (!momResident) errorQuda("No resident mom field allocated");

     cudaMom = momResident;

     momResident = NULL;

   }


   profileGaugeUpdate.Stop(QUDA_PROFILE_H2D);


   // perform the update

   profileGaugeUpdate.Start(QUDA_PROFILE_COMPUTE);

   updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom,

       (bool)conj_mom, (bool)exact);

   profileGaugeUpdate.Stop(QUDA_PROFILE_COMPUTE);


   // copy the gauge field back to the host

   profileGaugeUpdate.Start(QUDA_PROFILE_D2H);

   cudaOutGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);

   profileGaugeUpdate.Stop(QUDA_PROFILE_D2H);


   profileGaugeUpdate.Stop(QUDA_PROFILE_TOTAL);


   if (param->make_resident_gauge) {

     if (gaugePrecise != NULL) delete gaugePrecise;

     gaugePrecise = cudaOutGauge;

   } else {

     delete cudaOutGauge;

   }


   if (param->make_resident_mom) {

     if (momResident != NULL && momResident != cudaMom) delete momResident;

     momResident = cudaMom;

   } else {

     delete cudaMom;

   }


   delete cudaInGauge;

   delete cpuMom;

   delete cpuGauge;


   checkCudaError();

   return;

 }


 /*

    The following functions are for the Fortran interface.

    */


 void init_quda_(int *dev) { initQuda(*dev); }

 void init_quda_device_(int *dev) { initQudaDevice(*dev); }

 void init_quda_memory_() { initQudaMemory(); }

 void end_quda_() { endQuda(); }

 void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param) { loadGaugeQuda(h_gauge, param); }

 void free_gauge_quda_() { freeGaugeQuda(); }

 void free_sloppy_gauge_quda_() { freeSloppyGaugeQuda(); }

 void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)

 { loadCloverQuda(h_clover, h_clovinv, inv_param); }

 void free_clover_quda_(void) { freeCloverQuda(); }

 void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,

     QudaParity *parity) { dslashQuda(h_out, h_in, inv_param, *parity); }

 void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,

     QudaParity *parity, int *inverse) { cloverQuda(h_out, h_in, inv_param, *parity, *inverse); }

 void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)

 { MatQuda(h_out, h_in, inv_param); }

 void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)

 { MatDagMatQuda(h_out, h_in, inv_param); }

 void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)

 { invertQuda(hp_x, hp_b, param); }

 void invert_md_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)

 { invertMDQuda(hp_x, hp_b, param); }

 void invert_multishift_quda_(void *hp_x[QUDA_MAX_MULTI_SHIFT], void *hp_b, QudaInvertParam *param)

 { invertMultiShiftQuda(hp_x, hp_b, param); }

 void new_quda_gauge_param_(QudaGaugeParam *param) {

   *param = newQudaGaugeParam();

 }

 void new_quda_invert_param_(QudaInvertParam *param) {

   *param = newQudaInvertParam();

 }


 void update_gauge_field_quda_(void *gauge, void *momentum, double *dt,

     bool *conj_mom, bool *exact,

     QudaGaugeParam *param) {

   updateGaugeFieldQuda(gauge, momentum, *dt, (int)*conj_mom, (int)*exact, param);

 }


 int compute_gauge_force_quda_(void *mom, void *gauge,  int *input_path_buf, int *path_length,

     double *loop_coeff, int *num_paths, int *max_length, double *dt,

     QudaGaugeParam *param) {


   // fortran uses multi-dimensional arrays which we have convert into an array of pointers to pointers

   const int dim = 4;

   int ***input_path = (int***)safe_malloc(dim*sizeof(int**));

   for (int i=0; i<dim; i++) {

     input_path[i] = (int**)safe_malloc(*num_paths*sizeof(int*));

     for (int j=0; j<*num_paths; j++) {

       input_path[i][j] = (int*)safe_malloc(path_length[j]*sizeof(int));

       for (int k=0; k<path_length[j]; k++) {

         input_path[i][j][k] = input_path_buf[(i* (*num_paths) + j)* (*max_length) + k];

       }

     }

   }


   computeGaugeForceQuda(mom, gauge, input_path, path_length, loop_coeff, *num_paths, *max_length, *dt, param, 0);


   for (int i=0; i<dim; i++) {

     for (int j=0; j<*num_paths; j++) { host_free(input_path[i][j]); }

     host_free(input_path[i]);

   }

   host_free(input_path);


   return 0;

 }


 void compute_staggered_force_quda_(void* cudaMom, void* qudaQuark, double *coeff) {

   computeStaggeredForceQuda(cudaMom, qudaQuark, *coeff);

 }


 // apply the staggered phases

 void apply_staggered_phase_quda_() {

   printfQuda("applying staggered phase\n");

   if (gaugePrecise) {

     gaugePrecise->applyStaggeredPhase();

   } else {

     errorQuda("No persistent gauge field");

   }

 }


 // remove the staggered phases

 void remove_staggered_phase_quda_() {

   printfQuda("removing staggered phase\n");

   if (gaugePrecise) {

     gaugePrecise->removeStaggeredPhase();

   } else {

     errorQuda("No persistent gauge field");

   }

   cudaDeviceSynchronize();

 }


 #ifdef MULTI_GPU

 static int bqcd_rank_from_coords(const int *coords, void *fdata)

 {

   int *dims = static_cast<int *>(fdata);


   int rank = coords[3];

   for (int i = 2; i >= 0; i--) {

     rank = dims[i] * rank + coords[i];

   }

   return rank;

 }

 #endif


 void comm_set_gridsize_(int *grid)

 {

 #ifdef MULTI_GPU

   initCommsGridQuda(4, grid, bqcd_rank_from_coords, static_cast<void *>(grid));

 #endif

 }


 void set_kernel_pack_t_(int* pack)

 {

   bool pack_ = *pack ? true : false;

   setKernelPackT(pack_);

 }


 double plaqCuda ()

 {

   cudaGaugeField *data = NULL;

   #ifndef MULTI_GPU

 //    return quda::plaquette(*gaugePrecise, QUDA_CUDA_FIELD_LOCATION);

     data = gaugePrecise;

   #else

     if (extendedGaugeResident) {

       data = extendedGaugeResident;

     } else {

       int y[4];

       for(int dir=0; dir<4; ++dir) y[dir] = gaugePrecise->X()[dir] + 4;

       int pad = 0;

       GaugeFieldParam gParamEx(y, gaugePrecise->Precision(), gaugePrecise->Reconstruct(),

           pad, QUDA_VECTOR_GEOMETRY, QUDA_GHOST_EXCHANGE_NO);

       gParamEx.create = QUDA_ZERO_FIELD_CREATE;

       gParamEx.order = gaugePrecise->Order();

       gParamEx.siteSubset = QUDA_FULL_SITE_SUBSET;

       gParamEx.t_boundary = gaugePrecise->TBoundary();

       gParamEx.nFace = 1;


       data = new cudaGaugeField(gParamEx);


       copyExtendedGauge(*data, *gaugePrecise, QUDA_CUDA_FIELD_LOCATION);

       int R[4] = {2,2,2,2}; // radius of the extended region in each dimension / direction

       data->exchangeExtendedGhost(R,true);

       extendedGaugeResident = data;

       cudaDeviceSynchronize();

     }

 //    return quda::plaquette(*extendedGaugeResident, QUDA_CUDA_FIELD_LOCATION);

   #endif

   return quda::plaquette(*data, QUDA_CUDA_FIELD_LOCATION);

 }


 void performAPEnStep(unsigned int nSteps, double alpha)

 {

   profileAPE.Start(QUDA_PROFILE_TOTAL);


   if (gaugePrecise == NULL) {

     errorQuda("Gauge field must be loaded");

   }


 #ifdef MULTI_GPU

   if (extendedGaugeResident == NULL)

   {

     int y[4];

     for(int dir=0; dir<4; ++dir) y[dir] = gaugePrecise->X()[dir] + 4;

     int pad = 0;

     GaugeFieldParam gParamEx(y, gaugePrecise->Precision(), gaugePrecise->Reconstruct(),

         pad, QUDA_VECTOR_GEOMETRY, QUDA_GHOST_EXCHANGE_NO);

     gParamEx.create = QUDA_ZERO_FIELD_CREATE;

     gParamEx.order = gaugePrecise->Order();

     gParamEx.siteSubset = QUDA_FULL_SITE_SUBSET;

     gParamEx.t_boundary = gaugePrecise->TBoundary();

     gParamEx.nFace = 1;


     extendedGaugeResident = new cudaGaugeField(gParamEx);


     copyExtendedGauge(*extendedGaugeResident, *gaugePrecise, QUDA_CUDA_FIELD_LOCATION);

     int R[4] = {2,2,2,2}; // radius of the extended region in each dimension / direction

     extendedGaugeResident->exchangeExtendedGhost(R,true);

   }

 #endif


   int pad = 0;

   int y[4];


 #ifdef MULTI_GPU

     int R[4] = {2,2,2,2}; // radius of the extended region in each dimension / direction

     for (int dir=0; dir<4; ++dir) y[dir] = gaugePrecise->X()[dir] + 4;

 #else

     for (int dir=0; dir<4; ++dir) y[dir] = gaugePrecise->X()[dir];

 #endif


   GaugeFieldParam gParam(y, gaugePrecise->Precision(), gaugePrecise->Reconstruct(),

       pad, QUDA_VECTOR_GEOMETRY, QUDA_GHOST_EXCHANGE_NO);

   gParam.create = QUDA_ZERO_FIELD_CREATE;

   gParam.order = gaugePrecise->Order();

   gParam.siteSubset = QUDA_FULL_SITE_SUBSET;

   gParam.t_boundary = gaugePrecise->TBoundary();

   gParam.nFace = 1;

   gParam.tadpole = gaugePrecise->Tadpole();


   if (gaugeSmeared == NULL) {

 //    gaugeSmeared = new cudaGaugeField(gParamEx);

     gaugeSmeared = new cudaGaugeField(gParam);

     #ifdef MULTI_GPU

       copyExtendedGauge(*gaugeSmeared, *extendedGaugeResident, QUDA_CUDA_FIELD_LOCATION);

       gaugeSmeared->exchangeExtendedGhost(R,true);

     #else

       gaugeSmeared->copy(*gaugePrecise);

     #endif

   }


   cudaGaugeField *cudaGaugeTemp = NULL;

   cudaGaugeTemp = new cudaGaugeField(gParam);


   printfQuda("Plaquette after 0 APE steps: %le\n", plaquette(*gaugeSmeared, QUDA_CUDA_FIELD_LOCATION));


   for (unsigned int i=0; i<nSteps; i++) {

     #ifdef MULTI_GPU

       copyExtendedGauge(*cudaGaugeTemp, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION);

       cudaGaugeTemp->exchangeExtendedGhost(R,true);

       APEStep(*gaugeSmeared, *cudaGaugeTemp, alpha, QUDA_CUDA_FIELD_LOCATION);

 //      gaugeSmeared->exchangeExtendedGhost(R,true);    FIXME I'm not entirely sure whether I can remove this...

     #else

       cudaGaugeTemp->copy(*gaugeSmeared);

       APEStep(*gaugeSmeared, *cudaGaugeTemp, alpha, QUDA_CUDA_FIELD_LOCATION);

     #endif

   }


   delete cudaGaugeTemp;


   #ifdef MULTI_GPU

     gaugeSmeared->exchangeExtendedGhost(R,true);

   #endif


   printfQuda("Plaquette after %d APE steps: %le\n", nSteps, plaquette(*gaugeSmeared, QUDA_CUDA_FIELD_LOCATION));


   profileAPE.Stop(QUDA_PROFILE_TOTAL);

 }


 //#include"contractions.cpp"    Contraction interface, to be added soon

new_quda_invert_param_
void new_quda_invert_param_(QudaInvertParam *param)
Definition: interface_quda.cpp:5081

quda::CloverFieldParam::order
QudaCloverFieldOrder order
Definition: clover_field.h:21

gauge_param
QudaGaugeParam gauge_param
Definition: dslash_test.cpp:37

quda::DiracDomainWall4DPC::Dslash5
void Dslash5(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
Definition: dirac_domain_wall_4d.cpp:53

destroyQudaGaugeField
void destroyQudaGaugeField(void *gauge)
Definition: interface_quda.cpp:4035

invert_quda.h

QudaInvertParam_s::secs
double secs
Definition: quda.h:183

quda::Eig_Solver
Definition: lanczos_quda.h:16

quda::GaugeFieldParam::t_boundary
QudaTboundary t_boundary
Definition: gauge_field.h:18

quda::GaugeField::Tadpole
double Tadpole() const
Definition: gauge_field.h:171

QudaInvertParam_s::dirac_order
QudaDiracFieldOrder dirac_order
Definition: quda.h:156

QUDA_VERBOSE
Definition: enum_quda.h:217

QudaInvertParam_s::mass_normalization
QudaMassNormalization mass_normalization
Definition: quda.h:146

QudaInvertParam_s::tol_hq_offset
double tol_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:134

quda::destroyStaggeredOprodEvents
void destroyStaggeredOprodEvents()

QudaGaugeParam_s::reconstruct_sloppy
QudaReconstructType reconstruct_sloppy
Definition: quda.h:46

comm_finalize
void comm_finalize(void)
Definition: comm_common.cpp:274

QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:55

quda::DiracParam::c_5
double c_5[QUDA_MAX_DWF_LS]
NEW: used by mobius domain wall only.
Definition: dirac_quda.h:27

plaqCuda
double plaqCuda()
Definition: interface_quda.cpp:5176

cpuOprod
cpuGaugeField * cpuOprod
Definition: hisq_paths_force_test.cpp:41

QUDA_RECONSTRUCT_10
Definition: enum_quda.h:60

freeCloverQuda
void freeCloverQuda(void)
Definition: interface_quda.cpp:996

createExtendedGaugeField
void * createExtendedGaugeField(void *gauge, int geometry, QudaGaugeParam *param)
Definition: interface_quda.cpp:3939

compute_gauge_force_quda_
int compute_gauge_force_quda_(void *mom, void *gauge, int *input_path_buf, int *path_length, double *loop_coeff, int *num_paths, int *max_length, double *dt, QudaGaugeParam *param)
Definition: interface_quda.cpp:5091

quda::DiracTwistedCloverPC
Definition: dirac_quda.h:445

quda::setDiracSloppyParam
void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
Definition: interface_quda.cpp:1182

QudaInvertParam_s::b_5
double b_5[QUDA_MAX_DWF_LS]
Definition: quda.h:94

QUDA_MAT_SOLUTION
Definition: enum_quda.h:119

invert_multishift_quda_
void invert_multishift_quda_(void *hp_x[QUDA_MAX_MULTI_SHIFT], void *hp_b, QudaInvertParam *param)
Definition: interface_quda.cpp:5076

computeKSOprodQuda
void computeKSOprodQuda(void *oprod, void *fermion, double coeff, int X[4], QudaPrecision prec)
Definition: interface_quda.cpp:4149

loadCloverQuda
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:659

quda::QUDA_PROFILE_CONSTANT
Definition: quda_internal.h:165

QUDA_DOMAIN_WALL_DIRAC
Definition: enum_quda.h:253

setVerbosityQuda
void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
Definition: interface_quda.cpp:214

QudaInvertParam_s::clover_location
QudaFieldLocation clover_location
Definition: quda.h:160

quda::llfat_cuda
void llfat_cuda(cudaGaugeField *cudaFatLink, cudaGaugeField *cudaLongLink, cudaGaugeField &cudaSiteLink, cudaGaugeField &cudaStaple, cudaGaugeField &cudaStaple1, QudaGaugeParam *param, double *act_path_coeff)
Definition: llfat_quda_itf.cpp:23

QudaInvertParam_s::solve_type
QudaSolveType solve_type
Definition: quda.h:143

computeCloverTraceQuda
void computeCloverTraceQuda(void *out, void *clov, int mu, int nu, int dim[4])
Definition: interface_quda.cpp:4041

QudaPrecision
enum QudaPrecision_s QudaPrecision

freeGaugeQuda
void freeGaugeQuda(void)
Definition: interface_quda.cpp:929

saveGaugeField
void saveGaugeField(void *gauge, void *inGauge, QudaGaugeParam *param)
Definition: interface_quda.cpp:3921

commDimPartitioned
int commDimPartitioned(int dir)
Definition: face_buffer.cpp:539

QudaGaugeParam_s::ga_pad
int ga_pad
Definition: quda.h:53

QUDA_WILSONPC_DIRAC
Definition: enum_quda.h:250

load_gauge_quda_
void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param)
Definition: interface_quda.cpp:5058

y
int y[4]
Definition: staggered_dslash_core.h:356

invertMDQuda
void invertMDQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
Definition: interface_quda.cpp:2217

QudaGaugeParam_s::make_resident_mom
int make_resident_mom
Definition: quda.h:72

quda::ColorSpinorField
Definition: color_spinor_field.h:141

quda::DeflatedSolver::StoreRitzVecs
virtual void StoreRitzVecs(void *host_buffer, double *inv_eigenvals, const int *X, QudaInvertParam *inv_par, const int nev, bool cleanResources=false)=0

QudaInvertParam_s::mu
double mu
Definition: quda.h:97

quda::CloverField::V
void * V(bool inverse=false)
Definition: clover_field.h:59

QudaInvertParam_s::tune
QudaTune tune
Definition: quda.h:185

quda::MultiShiftCG
Definition: invert_quda.h:487

quda::LatticeField::VolumeCB
int VolumeCB() const
Definition: lattice_field.h:164

quda::fermion_force::setUnitarizeForceConstants
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)

quda::DiracMobiusDomainWallPC::Dslash5inv
void Dslash5inv(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity, const double &k) const
Definition: dirac_mobius.cpp:87

QUDA_MAX_MULTI_SHIFT
#define QUDA_MAX_MULTI_SHIFT
Maximum number of shifts supported by the multi-shift solver. This number may be changed if need be...
Definition: quda_constants.h:24

QUDA_MASS_NORMALIZATION
Definition: enum_quda.h:178

gaugeExtended
cudaGaugeField * gaugeExtended
Definition: interface_quda.cpp:109

quda::createDirac
void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
Definition: interface_quda.cpp:1228

quda::ColorSpinorParam::nColor
int nColor
Definition: color_spinor_field.h:16

printQudaGaugeParam
void printQudaGaugeParam(QudaGaugeParam *param)
Definition: check_params.h:40

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20

QUDA_INVALID_PRECISION
Definition: enum_quda.h:51

QUDA_ASQTAD_DSLASH
Definition: enum_quda.h:81

QUDA_TWIST_PLUS
Definition: enum_quda.h:332

QudaGaugeParam_s::type
QudaLinkType type
Definition: quda.h:35

quda::fermion_force::unitarizeForceCuda
void unitarizeForceCuda(cudaGaugeField &cudaOldForce, cudaGaugeField &cudaGauge, cudaGaugeField *cudaNewForce, int *unitarization_failed, long long *flops=NULL)

QudaInvertParam_s::kappa
double kappa
Definition: quda.h:89

quda::CloverField::Bytes
size_t Bytes() const
Definition: clover_field.h:67

gauge_tools.h

quda::DiracClover::Clover
void Clover(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
Definition: dirac_clover.cpp:67

quda::QUDA_PROFILE_FREE
Definition: quda_internal.h:150

QUDA_ASQTAD_MOM_LINKS
Definition: enum_quda.h:23

quda::TimeProfile::Print
void Print()
Definition: timer.cpp:6

check_params.h

quda::cudaGaugeField::saveCPUField
void saveCPUField(cpuGaugeField &, const QudaFieldLocation &) const
Definition: cuda_gauge_field.cu:439

quda::DiracM
Definition: dirac_quda.h:607

errorQuda
#define errorQuda(...)
Definition: util_quda.h:73

color_spinor_field.h

QUDA_COMPUTE_FAT_STANDARD
Definition: enum_quda.h:373

QudaInvertParam_s::dslash_type
QudaDslashType dslash_type
Definition: quda.h:85

quda::setUnitarizeLinksConstants
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error, bool check_unitarization=true)

quda::LatticeField::X
const int * X() const
Definition: lattice_field.h:162

QudaGaugeParam_s::reconstruct_precondition
QudaReconstructType reconstruct_precondition
Definition: quda.h:49

quda::CloverFieldParam::create
QudaFieldCreate create
Definition: clover_field.h:22

QudaInvertParam_s::inv_type
QudaInverterType inv_type
Definition: quda.h:86

quda_fortran.h
Fortran interface functions.

computeHISQForceCompleteQuda
void computeHISQForceCompleteQuda(void *const milc_momentum, const double level2_coeff[6], const double fat7_coeff[6], void **quark_array, int num_terms, double **quark_coeff, const void *const w_link, const void *const v_link, const void *const u_link, const QudaGaugeParam *gParam)
Definition: interface_quda.cpp:4408

QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:271

quda::ColorSpinorField::RealLength
int RealLength() const
Definition: color_spinor_field.h:249

quda::cudaGaugeField::exchangeGhost
void exchangeGhost()
Definition: cuda_gauge_field.cu:160

QudaInvertParam_s::cuda_prec
QudaPrecision cuda_prec
Definition: quda.h:152

ritz_quda.h

QudaInvertParam_s::c_5
double c_5[QUDA_MAX_DWF_LS]
Definition: quda.h:95

host_free
#define host_free(ptr)
Definition: malloc_quda.h:29

QUDA_ODD_PARITY
Definition: enum_quda.h:240

computeKSLinkQuda
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param, QudaComputeFatMethod method)

QUDA_INVALID_GEOMETRY
Definition: enum_quda.h:388

quda::SolverParam::true_res_hq
double true_res_hq
Definition: invert_quda.h:72

QUDA_HALF_PRECISION
Definition: enum_quda.h:48

QUDA_QDP_GAUGE_ORDER
Definition: enum_quda.h:32

quda::sqrt
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105

QUDA_TWISTED_CLOVERPC_DIRAC
Definition: enum_quda.h:264

quda::CloverField
Definition: clover_field.h:32

gaugeFatExtended
cudaGaugeField *& gaugeFatExtended
Definition: interface_quda.cpp:115

mu
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
Definition: hisq_paths_force_core.h:82

QUDA_WILSON_LINKS
Definition: enum_quda.h:20

quda::GaugeField::Order
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:169

setOutputPrefix
void setOutputPrefix(const char *prefix)
Definition: util_quda.cpp:38

loadGaugeQuda
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
Definition: interface_quda.cpp:452

quda::DeflatedSolver
Definition: invert_quda.h:530

quda::DiracCloverPC::CloverInv
void CloverInv(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
Definition: dirac_clover.cpp:136

QUDA_SUMMARIZE
Definition: enum_quda.h:216

QUDA_FLOAT2_GAUGE_ORDER
Definition: enum_quda.h:30

tmp1
cudaColorSpinorField * tmp1
Definition: dslash_test.cpp:41

gaugeLongPrecise
cudaGaugeField * gaugeLongPrecise
Definition: interface_quda.cpp:119

QUDA_FULL_SITE_SUBSET
Definition: enum_quda.h:277

QUDA_WILSON_DIRAC
Definition: enum_quda.h:249

QUDA_NORMERR_SOLVE
Definition: enum_quda.h:133

mat
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
Definition: staggered_dslash_reference.cpp:136

QUDA_DIRECT_SOLVE
Definition: enum_quda.h:129

quda::completeKSForce
void completeKSForce(GaugeField &mom, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location, long long *flops=NULL)
Definition: ks_force_quda.cu:206

quda::Nstream
const int Nstream
Definition: quda_internal.h:217

QUDA_NULL_FIELD_CREATE
Definition: enum_quda.h:303

QudaInvertParam_s::trlogA
double trlogA[2]
Definition: quda.h:172

QUDA_EIGCG_INVERTER
Definition: enum_quda.h:107

quda::assertAllMemFree
void assertAllMemFree()
Definition: malloc.cpp:294

quda::CloverFieldParam::clover
void * clover
Definition: clover_field.h:12

quda::LatticeFieldParam::precision
QudaPrecision precision
Definition: lattice_field.h:41

quda::Dirac::reconstruct
virtual void reconstruct(cudaColorSpinorField &x, const cudaColorSpinorField &b, const QudaSolutionType) const =0

QudaInvertParam_s::dagger
QudaDagType dagger
Definition: quda.h:145

quda::cudaColorSpinorField
Definition: color_spinor_field.h:302

test_type
int test_type
Definition: test_util.cpp:1564

quda::DiracDomainWall4DPC::Dslash5inv
void Dslash5inv(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity, const double &k) const
Definition: dirac_domain_wall_4d.cpp:70

free_clover_quda_
void free_clover_quda_(void)
Definition: interface_quda.cpp:5063

initCommsGridQuda
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
Definition: interface_quda.cpp:254

quda::plaquette
double plaquette(const GaugeField &data, QudaFieldLocation location)
Definition: gauge_plaq.cu:242

QUDA_CG_INVERTER
Definition: enum_quda.h:98

quda::massRescale
void massRescale(cudaColorSpinorField &b, QudaInvertParam &param)
Definition: interface_quda.cpp:1245

quda::GaugeFieldParam::anisotropy
double anisotropy
Definition: gauge_field.h:20

quda::DeflatedSolver::CleanResources
virtual void CleanResources()=0

QudaGaugeParam_s::gauge_order
QudaGaugeFieldOrder gauge_order
Definition: quda.h:36

QudaInvertParam_s::true_res
double true_res
Definition: quda.h:105

quda::CloverFieldParam::inverse
bool inverse
Definition: clover_field.h:11

quda::SolverParam::iter
int iter
Definition: invert_quda.h:78

QUDA_TWIST_NONDEG_DOUBLET
Definition: enum_quda.h:333

quda::computeFatLinkCore
void computeFatLinkCore(cudaGaugeField *cudaSiteLink, double *act_path_coeff, QudaGaugeParam *qudaGaugeParam, QudaComputeFatMethod method, cudaGaugeField *cudaFatLink, cudaGaugeField *cudaLongLink, TimeProfile &profile)

cpuMom
cpuGaugeField * cpuMom
Definition: fermion_force_test.cpp:24

comm_gpuid
int comm_gpuid(void)
Definition: comm_mpi.cpp:92

LexMapData::ndim
int ndim
Definition: interface_quda.cpp:223

setFatLinkPadding
void setFatLinkPadding(QudaComputeFatMethod method, QudaGaugeParam *param)

quda::cloverDerivative
void cloverDerivative(cudaGaugeField &out, cudaGaugeField &gauge, cudaGaugeField &oprod, int mu, int nu, double coeff, QudaParity parity, int conjugate)
Definition: clover_deriv_quda.cu:369

saveGaugeQuda
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
Definition: interface_quda.cpp:623

quda::CloverFieldParam::invNorm
void * invNorm
Definition: clover_field.h:15

QUDA_DAG_YES
Definition: enum_quda.h:172

quda::llfat_init_cuda_ex
void llfat_init_cuda_ex(QudaGaugeParam *param_ex)

spinorSiteSize
#define spinorSiteSize
Definition: interface_quda.cpp:54

Vsh_t
#define Vsh_t
Definition: llfat_core.h:4

quda::setUnitarizeLinksPadding
void setUnitarizeLinksPadding(int input_padding, int output_padding)

quda::TimeProfile
Definition: quda_internal.h:171

gauge_force_quda.h

quda::LatticeField::GBytes
size_t GBytes() const
Definition: lattice_field.h:178

solutionResident
cudaColorSpinorField * solutionResident
Definition: interface_quda.cpp:136

quda::CloverFieldParam::mu2
double mu2
Definition: clover_field.h:19

QudaInvertParam_s::compute_clover_trlog
int compute_clover_trlog
Definition: quda.h:171

quda::LatticeFieldParam::siteSubset
QudaSiteSubset siteSubset
Definition: lattice_field.h:42

cudaGauge
cudaGaugeField * cudaGauge
Definition: fermion_force_test.cpp:20

QudaInvertParam_s::clover_cuda_prec_sloppy
QudaPrecision clover_cuda_prec_sloppy
Definition: quda.h:163

gaugeLongExtended
cudaGaugeField * gaugeLongExtended
Definition: interface_quda.cpp:118

QudaInvertParam_s::input_location
QudaFieldLocation input_location
Definition: quda.h:82

dslashQuda_4dpc
void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
Definition: interface_quda.cpp:1411

QudaInvertParam_s::true_res_hq_offset
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:140

blas_magma.h

longlink
void * longlink[4]
Definition: staggered_dslash_test.cpp:51

quda::DiracParam::gauge
cudaGaugeField * gauge
Definition: dirac_quda.h:30

dirac_quda.h

Vsh_z
#define Vsh_z
Definition: llfat_core.h:3

quda::FaceBuffer
Definition: face_quda.h:10

cloverPrecondition
cudaCloverField * cloverPrecondition
Definition: interface_quda.cpp:127

QudaInvertParam_s::use_init_guess
QudaUseInitGuess use_init_guess
Definition: quda.h:167

QUDA_REFERENCE_FIELD_CREATE
Definition: enum_quda.h:306

Ls
int Ls
Definition: test_util.cpp:40

init_quda_
void init_quda_(int *dev)
Definition: interface_quda.cpp:5054

getGaugePadding
int getGaugePadding(GaugeFieldParam &param)
Definition: interface_quda.cpp:3588

lanczos_quda.h

param
QudaGaugeParam param
Definition: pack_test.cpp:17

comm_quda.h

openMagma
void openMagma()
Definition: interface_quda.cpp:87

QudaGaugeParam_s::llfat_ga_pad
int llfat_ga_pad
Definition: quda.h:58

quda::setDiracParam
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
Definition: interface_quda.cpp:1102

comm_init
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
Definition: comm_mpi.cpp:31

quda::cudaColorSpinorField::Odd
cudaColorSpinorField & Odd() const
Definition: cuda_color_spinor_field.cu:403

QudaInvertParam_s::solution_type
QudaSolutionType solution_type
Definition: quda.h:142

QUDA_MATPC_SOLUTION
Definition: enum_quda.h:121

QudaInvertParam_s::solver_normalization
QudaSolverNormalization solver_normalization
Definition: quda.h:147

quda::QUDA_PROFILE_D2H
Definition: quda_internal.h:145

numa_affinity_enabled
int numa_affinity_enabled
Definition: interface_quda.cpp:75

E
int E[4]
Definition: hisq_paths_force_core.h:133

quda::Dirac::Dslash
virtual void Dslash(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const =0

quda::LatticeFieldParam::x
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:38

QudaInvertParam_s::clover_cuda_prec
QudaPrecision clover_cuda_prec
Definition: quda.h:162

QUDA_TWISTED_MASSPC_DIRAC
Definition: enum_quda.h:262

staggered_oprod.h

cloverInvPrecise
cudaCloverField * cloverInvPrecise
Definition: interface_quda.cpp:129

QUDA_MATPC_EVEN_EVEN_ASYMMETRIC
Definition: enum_quda.h:165

quda::LatticeField::Precision
QudaPrecision Precision() const
Definition: lattice_field.h:176

quda::GaugeFieldParam
Definition: gauge_field.h:10

invertMultiShiftQuda
void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
Definition: interface_quda.cpp:2473

QUDA_TWISTED_MASS_DIRAC
Definition: enum_quda.h:261

QUDA_GHOST_EXCHANGE_EXTENDED
Definition: enum_quda.h:394

quda::DiracParam
Definition: dirac_quda.h:18

initQuda
void initQuda(int dev)
Definition: interface_quda.cpp:428

quda::DiracParam::mu
double mu
Definition: dirac_quda.h:36

computeAsqtadForceQuda
void computeAsqtadForceQuda(void *const milc_momentum, long long *flops, const double act_path_coeff[6], const void *const one_link_src[4], const void *const naik_src[4], const void *const link, const QudaGaugeParam *gParam)
Definition: interface_quda.cpp:4235

QudaEigParam_s::invert_param
QudaInvertParam * invert_param
Definition: quda.h:261

setTuning
void setTuning(QudaTune tune)
Definition: util_quda.cpp:33

quda::cudaCloverField
Definition: clover_field.h:74

deviceProp
cudaDeviceProp deviceProp
Definition: interface_quda.cpp:138

QUDA_COPY_FIELD_CREATE
Definition: enum_quda.h:305

QudaInvertParam_s::spinorGiB
double spinorGiB
Definition: quda.h:180

quda::cloverInvert
void cloverInvert(CloverField &clover, bool computeTraceLog, QudaFieldLocation location)
Definition: clover_invert.cu:298

quda::DiracMobiusDomainWallPC::Dslash4
void Dslash4(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
Definition: dirac_mobius.cpp:39

exchange_cpu_sitelink_ex
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)

quda::DiracMobiusDomainWallPC::Dslash5
void Dslash5(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
Definition: dirac_mobius.cpp:70

tmp
cudaColorSpinorField * tmp
Definition: staggered_dslash_test.cpp:48

QUDA_EVEN_ODD_SITE_ORDER
Definition: enum_quda.h:285

quda::DiracParam::mass
double mass
Definition: dirac_quda.h:23

QudaInvertParam_s::output_location
QudaFieldLocation output_location
Definition: quda.h:83

QUDA_USE_INIT_GUESS_YES
Definition: enum_quda.h:362

QudaInvertParam_s::clover_cuda_prec_precondition
QudaPrecision clover_cuda_prec_precondition
Definition: quda.h:164

QUDA_RECONSTRUCT_12
Definition: enum_quda.h:56

QudaGaugeParam_s::site_ga_pad
int site_ga_pad
Definition: quda.h:55

quda::CloverFieldParam
Definition: clover_field.h:9

QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION
Definition: enum_quda.h:124

quda::DiracMdag
Definition: dirac_quda.h:707

apply_staggered_phase_quda_
void apply_staggered_phase_quda_()
Definition: interface_quda.cpp:5124

kappa
VOLATILE spinorFloat kappa
Definition: dw_dslash5inv_core.h:142

LexMapData
Definition: interface_quda.cpp:222

QUDA_CLOVERPC_DIRAC
Definition: enum_quda.h:252

quda::Dirac::Dagger
void Dagger(QudaDagType dag)
Definition: dirac_quda.h:140

quda::SolverParam::true_res_hq_offset
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:113

quda::SolverParam::updateInvertParam
void updateInvertParam(QudaInvertParam &param, int offset=-1)
Definition: invert_quda.h:194

QudaInvertParam_s::m5
double m5
Definition: quda.h:91

quda::llfat_cuda_ex
void llfat_cuda_ex(cudaGaugeField *cudaFatLink, cudaGaugeField *cudaLongLink, cudaGaugeField &cudaSiteLink, cudaGaugeField &cudaStaple, cudaGaugeField &cudaStaple1, QudaGaugeParam *param, double *act_path_coeff)
Definition: llfat_quda_itf.cpp:276

QUDA_PRESERVE_SOURCE_NO
Definition: enum_quda.h:189

QudaGaugeParam_s
Definition: quda.h:25

quda::DiracParam::Ls
int Ls
Definition: dirac_quda.h:25

cpuFatLink
cpuGaugeField * cpuFatLink
Definition: hisq_unitarize_force_test.cpp:19

quda::cudaGaugeField::exchangeExtendedGhost
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
Definition: cuda_gauge_field.cu:191

cloverSloppy
cudaCloverField * cloverSloppy
Definition: interface_quda.cpp:126

QudaInvertParam_s::verbosity
QudaVerbosity verbosity
Definition: quda.h:174

quda::QUDA_PROFILE_EPILOGUE
Definition: quda_internal.h:149

free_gauge_quda_
void free_gauge_quda_()
Definition: interface_quda.cpp:5059

quda::FaceBuffer::flushPinnedCache
static void flushPinnedCache()
Definition: face_buffer.cpp:196

QudaInvertParam_s::tol_offset
double tol_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:131

QUDA_MILC_GAUGE_ORDER
Definition: enum_quda.h:35

quda::DiracParam::commDim
int commDim[QUDA_MAX_DIM]
Definition: dirac_quda.h:42

QudaInvertParam_s::true_res_offset
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:137

MatQuda
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:1567

quda::QUDA_PROFILE_H2D
Definition: quda_internal.h:144

load_clover_quda_
void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:5061

quda::DiracParam::dagger
QudaDagType dagger
Definition: dirac_quda.h:29

in
cpuColorSpinorField * in
Definition: staggered_invert_test.cpp:50

gaugeFatPrecondition
cudaGaugeField *& gaugeFatPrecondition
Definition: interface_quda.cpp:114

newQudaInvertParam
QudaInvertParam newQudaInvertParam(void)

cudaFatLink
cudaGaugeField * cudaFatLink
Definition: hisq_unitarize_force_test.cpp:18

quda::CloverFieldParam::setPrecision
void setPrecision(QudaPrecision precision)
Definition: clover_field.h:23

QudaInvertParam_s::gflops
double gflops
Definition: quda.h:182

quda::Solver::create
static Solver * create(SolverParam &param, DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matPrecon, TimeProfile &profile)
Definition: solver.cpp:12

quda::fermion_force::hisqCompleteForceCuda
void hisqCompleteForceCuda(const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *force, long long *flops=NULL)

quda::LatticeField::freeBuffer
static void freeBuffer(int index=0)
Definition: lattice_field.cpp:129

quda::cpuGaugeField
Definition: gauge_field.h:278

QudaGaugeParam_s::cuda_prec_precondition
QudaPrecision cuda_prec_precondition
Definition: quda.h:48

free_sloppy_gauge_quda_
void free_sloppy_gauge_quda_()
Definition: interface_quda.cpp:5060

QUDA_DOMAIN_WALL_4DPC_DIRAC
Definition: enum_quda.h:255

QudaGaugeParam_s::staple_pad
int staple_pad
Definition: quda.h:57

QudaInvertParam_s::clover_order
QudaCloverFieldOrder clover_order
Definition: quda.h:166

quda::createDslashEvents
void createDslashEvents()
Definition: dslash_quda.cu:108

updateGaugeFieldQuda
void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
Definition: interface_quda.cpp:4936

face_quda.h

QudaGaugeParam_s::preserve_gauge
int preserve_gauge
Definition: quda.h:62

remove_staggered_phase_quda_
void remove_staggered_phase_quda_()
Definition: interface_quda.cpp:5134

QudaInvertParam_s::nev
int nev
Definition: quda.h:240

dim
int dim
Definition: tm_ndeg_fused_exterior_dslash_core.h:195

invert_md_quda_
void invert_md_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
Definition: interface_quda.cpp:5074

QUDA_PERIODIC_T
Definition: enum_quda.h:43

quda::QUDA_PROFILE_COMPUTE
Definition: quda_internal.h:148

QUDA_CLOVER_WILSON_DSLASH
Definition: enum_quda.h:76

quda::GaugeFieldParam::order
QudaGaugeFieldOrder order
Definition: gauge_field.h:15

cloverInvPrecondition
cudaCloverField * cloverInvPrecondition
Definition: interface_quda.cpp:131

quda::Dirac::MdagM
virtual void MdagM(cudaColorSpinorField &out, const cudaColorSpinorField &in) const =0

quda::GaugeField::Reconstruct
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:168

warningQuda
#define warningQuda(...)
Definition: util_quda.h:84

QudaInvertParam_s::true_res_hq
double true_res_hq
Definition: quda.h:106

quda::cudaGaugeField
Definition: gauge_field.h:216

performAPEnStep
void performAPEnStep(unsigned int nSteps, double alpha)
Definition: interface_quda.cpp:5210

quda::pow
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:100

quda::cudaColorSpinorField::freeGhostBuffer
static void freeGhostBuffer(void)
Definition: cuda_color_spinor_field.cu:584

quda::DiracParam::b_5
double b_5[QUDA_MAX_DWF_LS]
NEW: used by domain wall and twisted mass.
Definition: dirac_quda.h:26

QUDA_PARITY_SITE_SUBSET
Definition: enum_quda.h:278

quda::DiracParam::type
QudaDiracType type
Definition: dirac_quda.h:21

QUDA_FLOAT2_FIELD_ORDER
Definition: enum_quda.h:293

QUDA_PCG_INVERTER
Definition: enum_quda.h:105

quda::copyCuda
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
Definition: copy_quda.cu:235

coeff
__constant__ double coeff
Definition: dslash_constants.h:180

quda::SolverParam::tol_hq
double tol_hq
Definition: invert_quda.h:66

QudaGaugeParam_s::cuda_prec_sloppy
QudaPrecision cuda_prec_sloppy
Definition: quda.h:45

quda::GaugeFieldParam::tadpole
double tadpole
Definition: gauge_field.h:21

QUDA_FAT_PRESERVE_GPU_GAUGE
Definition: enum_quda.h:380

QudaInvertParam_s::max_search_dim
int max_search_dim
Definition: quda.h:242

dirac
Dirac * dirac
Definition: dslash_test.cpp:45

QUDA_GHOST_EXCHANGE_NO
Definition: enum_quda.h:392

quda::cudaGaugeField::loadCPUField
void loadCPUField(const cpuGaugeField &, const QudaFieldLocation &)
Definition: cuda_gauge_field.cu:372

quda::fatlink::initLatticeConstants
void initLatticeConstants(const LatticeField &lat, TimeProfile &profile)

BlasMagmaArgs::CloseMagma
static void CloseMagma()
Definition: blas_magma.cpp:58

QudaGaugeFieldOrder
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder

gauge_update_quda.h

quda::DiracParam::matpcType
QudaMatPCType matpcType
NEW: used by mobius domain wall only.
Definition: dirac_quda.h:28

QUDA_COMPUTE_FAT_EXTENDED_VOLUME
Definition: enum_quda.h:374

quda::SolverParam::true_res_offset
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:110

comm_set_gridsize_
void comm_set_gridsize_(int *grid)
Definition: interface_quda.cpp:5160

QudaInvertParam_s::offset
double offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:128

quda::gauge_force_cuda
void gauge_force_cuda(cudaGaugeField &cudaMom, double eb3, cudaGaugeField &cudaSiteLink, QudaGaugeParam *param, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
Definition: gauge_force_quda.cu:328

initQudaMemory
void initQudaMemory()
Definition: interface_quda.cpp:398

QudaEigParam_s::RitzMat_lanczos
QudaSolutionType RitzMat_lanczos
Definition: quda.h:262

init_quda_memory_
void init_quda_memory_()
Definition: interface_quda.cpp:5056

cloverPrecise
cudaCloverField * cloverPrecise
Definition: interface_quda.cpp:125

compute_staggered_force_quda_
void compute_staggered_force_quda_(void *cudaMom, void *qudaQuark, double *coeff)
Definition: interface_quda.cpp:5119

QudaParity
enum QudaParity_s QudaParity

cpuGauge
cpuGaugeField * cpuGauge
Definition: fermion_force_test.cpp:21

QudaGaugeParam_s::reconstruct
QudaReconstructType reconstruct
Definition: quda.h:43

lanczosQuda
void lanczosQuda(int k0, int m, void *hp_Apsi, void *hp_r, void *hp_V, void *hp_alpha, void *hp_beta, QudaEigParam *eig_param)
Definition: interface_quda.cpp:1824

QudaGaugeParam_s::cuda_prec
QudaPrecision cuda_prec
Definition: quda.h:42

quda::DiracCloverPC
Definition: dirac_quda.h:221

QUDA_KAPPA_NORMALIZATION
Definition: enum_quda.h:177

extendGaugeField
void extendGaugeField(void *out, void *in)
Definition: interface_quda.cpp:4021

QUDA_ASQTAD_LONG_LINKS
Definition: enum_quda.h:22

QudaGaugeParam_s::X
int X[4]
Definition: quda.h:29

quda::DiracParam::cloverInv
cudaCloverField * cloverInv
Definition: dirac_quda.h:34

quda::computeStaggeredOprod
void computeStaggeredOprod(cudaGaugeField &out, cudaColorSpinorField &in, FaceBuffer &facebuffer, const unsigned int parity, const double coeff, const unsigned int displacement)

quda::DiracDomainWall4DPC
Definition: dirac_quda.h:298

QudaInvertParam_s::mass
double mass
Definition: quda.h:88

dslashQuda_mdwf
void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
Definition: interface_quda.cpp:1487

quda::Eig_Solver::create
static Eig_Solver * create(QudaEigParam &param, RitzMat &ritz_mat, TimeProfile &profile)
Definition: eig_solver.cpp:12

quda::unitarizeLinksCuda
void unitarizeLinksCuda(const QudaGaugeParam &param, cudaGaugeField &infield, cudaGaugeField *outfield, int *num_failures)

quda::computeCloverSigmaTrace
void computeCloverSigmaTrace(GaugeField &gauge, const CloverField &clover, int dir1, int dir2, QudaFieldLocation location)
Definition: clover_trace_quda.cu:310

QUDA_HEAVY_QUARK_RESIDUAL
Definition: enum_quda.h:149

cudaMom
cudaGaugeField * cudaMom
Definition: fermion_force_test.cpp:23

quda::cpuColorSpinorField::freeGhostBuffer
static void freeGhostBuffer(void)
Definition: cpu_color_spinor_field.cpp:224

clover_field.h

quda::GaugeFieldParam::nFace
int nFace
Definition: gauge_field.h:12

computeStaggeredForceQuda
void computeStaggeredForceQuda(void *cudaMom, void *qudaQuark, double coeff)
Definition: interface_quda.cpp:4185

MatDagMatQuda
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:1643

invertMultiShiftMDQuda
void invertMultiShiftMDQuda(void **_hp_xe, void **_hp_xo, void **_hp_ye, void **_hp_yo, void *_hp_b, QudaInvertParam *param)
Definition: interface_quda.cpp:2767

QudaEigParam_s
Definition: quda.h:259

QUDA_VECTOR_GEOMETRY
Definition: enum_quda.h:386

invert_quda_
void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
Definition: interface_quda.cpp:5072

safe_malloc
#define safe_malloc(size)
Definition: malloc_quda.h:25

quda::DiracParam::kappa
double kappa
Definition: dirac_quda.h:22

quda::cudaCloverField::copy
void copy(const CloverField &src, bool inverse=true)
Definition: clover_field.cpp:206

QUDA_TWISTED_CLOVER_DSLASH
Definition: enum_quda.h:83

LexMapData::dims
int dims[QUDA_MAX_DIM]
Definition: interface_quda.cpp:224

x
int x[4]
Definition: hisq_paths_force_core.h:99

quda::LatticeFieldParam::nDim
int nDim
Definition: lattice_field.h:37

quda::DiracMdagM::shift
double shift
Shift term added onto operator (M^dag M + shift)
Definition: dirac_quda.h:643

tune_quda.h

quda::Dirac::setMass
void setMass(double mass)
Definition: dirac_quda.h:132

pushVerbosity
void pushVerbosity(QudaVerbosity verbosity)
Definition: util_quda.cpp:52

QUDA_ASQTADPC_DIRAC
Definition: enum_quda.h:260

init_quda_device_
void init_quda_device_(int *dev)
Definition: interface_quda.cpp:5055

gaugeLongSloppy
cudaGaugeField * gaugeLongSloppy
Definition: interface_quda.cpp:120

quda::DiracDomainWall4DPC::Dslash4
void Dslash4(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
Definition: dirac_domain_wall_4d.cpp:39

quda::GaugeFieldParam::ghostExchange
QudaGhostExchange ghostExchange
Definition: gauge_field.h:40

QUDA_TIFR_GAUGE_ORDER
Definition: enum_quda.h:37

QUDA_SPACE_COLOR_SPIN_FIELD_ORDER
Definition: enum_quda.h:296

checkCudaErrorNoSync
#define checkCudaErrorNoSync()
Definition: util_quda.h:94

hisq_links_quda.h

X
int X
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:394

QUDA_WILSON_DSLASH
Definition: enum_quda.h:75

quda::fermion_force::hisqLongLinkForceCuda
void hisqLongLinkForceCuda(double coeff, const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)

quda::DiracMobiusDomainWallPC::Dslash4pre
void Dslash4pre(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
Definition: dirac_mobius.cpp:53

update_gauge_field_quda_
void update_gauge_field_quda_(void *gauge, void *momentum, double *dt, bool *conj_mom, bool *exact, QudaGaugeParam *param)
Definition: interface_quda.cpp:5085

fat_force_quda.h

quda::DiracMobiusDomainWallPC
Definition: dirac_quda.h:328

blas_quda.h

QUDA_RECONSTRUCT_8
Definition: enum_quda.h:57

quda::QUDA_PROFILE_INIT
Definition: quda_internal.h:146

quda::APEStep
void APEStep(GaugeField &dataDs, const GaugeField &dataOr, double alpha, QudaFieldLocation location)
Definition: gauge_ape.cu:497

quda::CloverFieldParam::cloverInv
void * cloverInv
Definition: clover_field.h:14

quda::SolverParam::true_res
double true_res
Definition: invert_quda.h:69

invertQuda
void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
Definition: interface_quda.cpp:1977

printQudaInvertParam
void printQudaInvertParam(QudaInvertParam *param)
Definition: check_params.h:162

QUDA_TWISTED_CLOVER_DIRAC
Definition: enum_quda.h:263

computeHISQForceQuda
void computeHISQForceQuda(void *const milc_momentum, long long *flops, const double level2_coeff[6], const double fat7_coeff[6], const void *const staple_src[4], const void *const one_link_src[4], const void *const naik_src[4], const void *const w_link, const void *const v_link, const void *const u_link, const QudaGaugeParam *gParam)
Definition: interface_quda.cpp:4448

quda::Solver
Definition: invert_quda.h:220

QUDA_DEBUG_VERBOSE
Definition: enum_quda.h:218

clover_quda_
void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity, int *inverse)
Definition: interface_quda.cpp:5066

quda::ColorSpinorParam
Definition: color_spinor_field.h:14

quda::fermion_force::hisqStaplesForceCuda
void hisqStaplesForceCuda(const double path_coeff[6], const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:50

incrementalEigQuda
void incrementalEigQuda(void *_h_x, void *_h_b, QudaInvertParam *param, void *_h_u, double *inv_eigenvals, int last_rhs)
Definition: interface_quda.cpp:3071

cudaGauge_ex
cudaGaugeField * cudaGauge_ex
Definition: hisq_paths_force_test.cpp:63

QUDA_NORMERR_PC_SOLVE
Definition: enum_quda.h:134

QudaGaugeParam_s::location
QudaFieldLocation location
Definition: quda.h:27

inv_param
QudaInvertParam inv_param
Definition: dslash_test.cpp:38

out
cpuColorSpinorField * out
Definition: staggered_invert_test.cpp:51

quda::TimeProfile::Stop
void Stop(QudaProfileType idx)
Definition: quda_internal.h:194

QudaGaugeParam_s::gaugeGiB
double gaugeGiB
Definition: quda.h:60

QudaInvertParam_s::cuda_prec_precondition
QudaPrecision cuda_prec_precondition
Definition: quda.h:154

gaugePrecondition
cudaGaugeField * gaugePrecondition
Definition: interface_quda.cpp:108

QUDA_TWISTED_MASS_DSLASH
Definition: enum_quda.h:82

quda::CloverFieldParam::twisted
bool twisted
Definition: clover_field.h:18

QUDA_MATPC_ODD_ODD_ASYMMETRIC
Definition: enum_quda.h:166

quda::endBlas
void endBlas(void)
Definition: blas_quda.cu:59

QudaInvertParam_s::iter
int iter
Definition: quda.h:179

QUDA_STAGGERED_DSLASH
Definition: enum_quda.h:80

quda::loadTuneCache
void loadTuneCache(QudaVerbosity verbosity)
Definition: tune.cpp:131

gParam
GaugeFieldParam gParam
Definition: hisq_paths_force_test.cpp:75

Vh_2d_max
__constant__ int Vh_2d_max
Definition: dslash_constants.h:147

if
if(x2 >=X2) return

QUDA_SOURCE_NORMALIZATION
Definition: enum_quda.h:185

quda::DiracMdagM
Definition: dirac_quda.h:636

quda::CG
Definition: invert_quda.h:287

QUDA_GENERAL_LINKS
Definition: enum_quda.h:17

QudaReconstructType
enum QudaReconstructType_s QudaReconstructType

quda.h
Main header file for the QUDA library.

quda::DeflatedSolver::create
static DeflatedSolver * create(SolverParam &param, DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matCGSloppy, DiracMatrix &matDeflate, TimeProfile &profile)
Definition: solver.cpp:150

createGaugeField
void * createGaugeField(void *gauge, int geometry, QudaGaugeParam *param)
Definition: interface_quda.cpp:3891

quda::Dirac::M
virtual void M(cudaColorSpinorField &out, const cudaColorSpinorField &in) const =0

QUDA_STAGGEREDPC_DIRAC
Definition: enum_quda.h:258

quda::DiracMMdag
Definition: dirac_quda.h:672

quda::DiracParam::clover
cudaCloverField * clover
Definition: dirac_quda.h:33

quda::GaugeFieldParam::link_type
QudaLinkType link_type
Definition: gauge_field.h:17

set_kernel_pack_t_
void set_kernel_pack_t_(int *pack)
Definition: interface_quda.cpp:5170

quda::ColorSpinorField::Precision
QudaPrecision Precision() const
Definition: color_spinor_field.h:242

quda::CloverFieldParam::direct
bool direct
Definition: clover_field.h:10

quda::Dirac::prepare
virtual void prepare(cudaColorSpinorField *&src, cudaColorSpinorField *&sol, cudaColorSpinorField &x, cudaColorSpinorField &b, const QudaSolutionType) const =0

quda::printPeakMemUsage
void printPeakMemUsage()
Definition: malloc.cpp:286

quda::TimeProfile::Last
double Last(QudaProfileType idx)
Definition: quda_internal.h:204

QUDA_MAX_DWF_LS
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
Definition: quda_constants.h:30

quda::GaugeField::applyStaggeredPhase
void applyStaggeredPhase()
Definition: gauge_field.cpp:66

QUDA_FLOAT4_GAUGE_ORDER
Definition: enum_quda.h:31

Vsh_y
#define Vsh_y
Definition: llfat_core.h:2

dslash_quda.h

QudaGaugeParam_s::mom_ga_pad
int mom_ga_pad
Definition: quda.h:59

quda::cpuCloverField
Definition: clover_field.h:141

freeSloppyGaugeQuda
void freeSloppyGaugeQuda(void)
Definition: interface_quda.cpp:973

mat_dag_mat_quda_
void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:5070

QudaGaugeParam_s::use_resident_gauge
int use_resident_gauge
Definition: quda.h:69

QudaInvertParam_s::Ls
int Ls
Definition: quda.h:92

quda::QUDA_PROFILE_TOTAL
Definition: quda_internal.h:167

printfQuda
#define printfQuda(...)
Definition: util_quda.h:67

new_quda_gauge_param_
void new_quda_gauge_param_(QudaGaugeParam *param)
Definition: interface_quda.cpp:5078

QUDA_DAG_NO
Definition: enum_quda.h:171

quda::DiracParam::fatGauge
cudaGaugeField * fatGauge
Definition: dirac_quda.h:31

QudaInvertParam_s::twist_flavor
QudaTwistFlavorType twist_flavor
Definition: quda.h:100

cloverQuda
void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
Definition: interface_quda.cpp:1752

gaugeSmeared
cudaGaugeField * gaugeSmeared
Definition: interface_quda.cpp:123

MAX
#define MAX(a, b)
Definition: interface_quda.cpp:51

QUDA_INC_EIGCG_INVERTER
Definition: enum_quda.h:108

quda::GaugeFieldParam::gauge
void * gauge
Definition: gauge_field.h:24

QudaInvertParam_s::overlap
int overlap
Definition: quda.h:125

QUDA_ASYMMETRIC_MASS_NORMALIZATION
Definition: enum_quda.h:179

checkGauge
quda::cudaGaugeField * checkGauge(QudaInvertParam *param)
Definition: interface_quda.cpp:1722

streams
cudaStream_t * streams
Definition: interface_quda.cpp:139

QudaGaugeParam_s::use_resident_mom
int use_resident_mom
Definition: quda.h:70

quda::ColorSpinorParam::v
void * v
Definition: color_spinor_field.h:29

QUDA_DOMAIN_WALLPC_DIRAC
Definition: enum_quda.h:254

quda::GaugeFieldParam::reconstruct
QudaReconstructType reconstruct
Definition: gauge_field.h:14

quda::setKernelPackT
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:82

closeMagma
void closeMagma()
Definition: interface_quda.cpp:98

quda::GaugeFieldParam::create
QudaFieldCreate create
Definition: gauge_field.h:26

quda::TimeProfile::Start
void Start(QudaProfileType idx)
Definition: quda_internal.h:184

quda::printLaunchTimer
void printLaunchTimer()
Definition: tune.cpp:437

gaugeFatSloppy
cudaGaugeField *& gaugeFatSloppy
Definition: interface_quda.cpp:113

quda::cudaGaugeField::copy
void copy(const GaugeField &)
Definition: cuda_gauge_field.cu:333

QudaInvertParam_s::cl_pad
int cl_pad
Definition: quda.h:177

QUDA_DIRECT_PC_SOLVE
Definition: enum_quda.h:131

QudaInvertParam_s::residual_type
QudaResidualType residual_type
Definition: quda.h:235

QUDA_MATPC_DAG_SOLUTION
Definition: enum_quda.h:122

QudaFieldGeometry
enum QudaFieldGeometry_s QudaFieldGeometry

quda::SolverParam::use_init_guess
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:38

QUDA_EVEN_PARITY
Definition: enum_quda.h:239

QudaInvertParam_s::num_offset
int num_offset
Definition: quda.h:123

quda::DiracParam::longGauge
cudaGaugeField * longGauge
Definition: dirac_quda.h:32

QUDA_MATDAG_MAT_SOLUTION
Definition: enum_quda.h:120

dslash_quda_
void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity)
Definition: interface_quda.cpp:5064

popVerbosity
void popVerbosity()
Definition: util_quda.cpp:63

Vsh_x
#define Vsh_x
Definition: llfat_core.h:1

QudaVerbosity
enum QudaVerbosity_s QudaVerbosity

QudaInvertParam_s::cloverGiB
double cloverGiB
Definition: quda.h:181

quda::updateGaugeField
void updateGaugeField(GaugeField &out, double dt, const GaugeField &in, const GaugeField &mom, bool conj_mom, bool exact)
Definition: gauge_update_quda.cu:348

QUDA_CLOVER_DIRAC
Definition: enum_quda.h:251

createCloverQuda
void createCloverQuda(QudaInvertParam *invertParam)
Definition: interface_quda.cpp:3776

QUDA_CPS_WILSON_DIRAC_ORDER
Definition: enum_quda.h:199

end_quda_
void end_quda_()
Definition: interface_quda.cpp:5057

computeStaggeredOprodQuda
void computeStaggeredOprodQuda(void **oprod, void **fermion, int num_terms, double **coeff, QudaGaugeParam *param)
Definition: interface_quda.cpp:4705

ks_force_quda.h

QudaInvertParam_s::epsilon
double epsilon
Definition: quda.h:98

numa_affinity.h

QUDA_MAX_DIM
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
Definition: quda_constants.h:17

cloverInvSloppy
cudaCloverField * cloverInvSloppy
Definition: interface_quda.cpp:130

checkCudaError
#define checkCudaError()
Definition: util_quda.h:110

quda::GaugeFieldParam::compute_fat_link_max
bool compute_fat_link_max
Definition: gauge_field.h:34

quda::GaugeFieldParam::geometry
QudaFieldGeometry geometry
Definition: gauge_field.h:28

QUDA_SCALAR_GEOMETRY
Definition: enum_quda.h:385

QUDA_ZERO_FIELD_CREATE
Definition: enum_quda.h:304

setOutputFile
void setOutputFile(FILE *outfile)
Definition: util_quda.cpp:44

gaugePrecise
cudaGaugeField * gaugePrecise
Definition: interface_quda.cpp:106

quda::RitzMat
Definition: ritz_quda.h:20

QudaCommsMap
int(* QudaCommsMap)(const int *coords, void *fdata)
Definition: comm_quda.h:12

quda::Dirac::create
static Dirac * create(const DiracParam &param)
Definition: dirac.cpp:134

gaugeFatPrecise
cudaGaugeField *& gaugeFatPrecise
Definition: interface_quda.cpp:112

QUDA_DOMAIN_WALL_DSLASH
Definition: enum_quda.h:77

QUDA_MOBIUS_DOMAIN_WALLPC_DIRAC
Definition: enum_quda.h:256

quda::GaugeField::Geometry
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:177

dslashQuda
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
Definition: interface_quda.cpp:1329

quda::llfat_init_cuda
void llfat_init_cuda(QudaGaugeParam *param)

quda::Dirac::Mdag
void Mdag(cudaColorSpinorField &out, const cudaColorSpinorField &in) const
Definition: dirac.cpp:68

quda::ColorSpinorParam::create
QudaFieldCreate create
Definition: color_spinor_field.h:25

quda::SolverParam
Definition: invert_quda.h:14

quda::DiracParam::m5
double m5
Definition: dirac_quda.h:24

QudaGaugeParam_s::make_resident_gauge
int make_resident_gauge
Definition: quda.h:71

cudaOprod
cudaGaugeField * cudaOprod
Definition: hisq_paths_force_test.cpp:42

QUDA_NORMOP_PC_SOLVE
Definition: enum_quda.h:132

prec
QudaPrecision prec
Definition: test_util.cpp:1551

QUDA_MATPCDAG_MATPC_SOLUTION
Definition: enum_quda.h:123

QUDA_ASQTAD_DIRAC
Definition: enum_quda.h:259

quda::CloverFieldParam::norm
void * norm
Definition: clover_field.h:13

quda::gauge_force_init_cuda
void gauge_force_init_cuda(QudaGaugeParam *param, int max_length)
Definition: gauge_force_quda.cu:112

dims
const int dims[]
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:408

quda::axCuda
void axCuda(const double &a, cudaColorSpinorField &x)
Definition: blas_quda.cu:171

extendedGaugeResident
cudaGaugeField * extendedGaugeResident
Definition: interface_quda.cpp:134

quda::computeClover
void computeClover(CloverField &clover, const GaugeField &gauge, double coeff, QudaFieldLocation location)
Definition: clover_quda.cu:602

QUDA_SU3_LINKS
Definition: enum_quda.h:16

QudaInvertParam_s::dslash_type_precondition
QudaDslashType dslash_type_precondition
Definition: quda.h:208

quda::norm2
double norm2(const ColorSpinorField &)
Definition: color_spinor_field.cpp:486

QudaInvertParam_s::clover_cpu_prec
QudaPrecision clover_cpu_prec
Definition: quda.h:161

QUDA_TWIST_MINUS
Definition: enum_quda.h:331

QudaInvertParam_s
Definition: quda.h:80

initLatticeConstants
void initLatticeConstants(const LatticeField &lat, TimeProfile &profile)
Definition: dslash_constants.h:206

quda::GaugeField::TBoundary
QudaTboundary TBoundary() const
Definition: gauge_field.h:173

QudaComputeFatMethod
enum QudaComputeFatMethod_s QudaComputeFatMethod

QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:270

computeCloverDerivativeQuda
void computeCloverDerivativeQuda(void *out, void *gauge, void *oprod, int mu, int nu, double coeff, QudaParity parity, QudaGaugeParam *param, int conjugate)
Definition: interface_quda.cpp:4064

quda::destroyDslashEvents
void destroyDslashEvents()
Definition: dslash_quda.cu:129

llfat_quda.h

quda::CloverField::TrLog
double * TrLog() const
Definition: clover_field.h:64

QUDA_ASQTAD_FAT_LINKS
Definition: enum_quda.h:21

parity
const QudaParity parity
Definition: dslash_test.cpp:29

quda::SolverParam::tol
double tol
Definition: invert_quda.h:60

gauge_field.h

gauge
void * gauge[4]
Definition: su3_test.cpp:15

quda::copyExtendedGauge
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
Definition: copy_gauge_extended.cu:337

eig_variables.h

setVerbosity
void setVerbosity(const QudaVerbosity verbosity)
Definition: util_quda.cpp:24

QudaInvertParam_s::matpc_type
QudaMatPCType matpc_type
Definition: quda.h:144

quda::initBlas
void initBlas()
Definition: blas_quda.cu:53

setNumaAffinity
int setNumaAffinity(int)
Definition: numa_affinity.cpp:169

QUDA_DOMAIN_WALL_4D_DSLASH
Definition: enum_quda.h:78

momResident
cudaGaugeField * momResident
Definition: interface_quda.cpp:133

quda::cudaColorSpinorField::Even
cudaColorSpinorField & Even() const
Definition: cuda_color_spinor_field.cu:394

quda::GaugeField::Anisotropy
double Anisotropy() const
Definition: gauge_field.h:170

BlasMagmaArgs::OpenMagma
static void OpenMagma()
Definition: blas_magma.cpp:39

kappa5
double kappa5
Definition: dslash_test.cpp:32

quda::saveTuneCache
void saveTuneCache(QudaVerbosity verbosity)
Definition: tune.cpp:205

quda::cpuColorSpinorField
Definition: color_spinor_field.h:517

QudaGaugeParam_s::cpu_prec
QudaPrecision cpu_prec
Definition: quda.h:40

gaugeSloppy
cudaGaugeField * gaugeSloppy
Definition: interface_quda.cpp:107

QUDA_STAGGERED_DIRAC
Definition: enum_quda.h:257

quda::createStaggeredOprodEvents
void createStaggeredOprodEvents()

ks_improved_force.h

initQudaDevice
void initQudaDevice(int dev)
Definition: interface_quda.cpp:320

endQuda
void endQuda(void)
Definition: interface_quda.cpp:1018

quda::Dirac
Definition: dirac_quda.h:79

computeGaugeForceQuda
int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam, double *timeinfo)
Definition: interface_quda.cpp:3608

QudaGaugeParam_s::overlap
int overlap
Definition: quda.h:67

quda::QUDA_PROFILE_COMMS
Definition: quda_internal.h:161

quda::setDiracPreParam
void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
Definition: interface_quda.cpp:1199

newQudaGaugeParam
QudaGaugeParam newQudaGaugeParam(void)

QudaInvertParam_s::preserve_source
QudaPreserveSource preserve_source
Definition: quda.h:149

quda_internal.h

quda::LatticeFieldParam::pad
int pad
Definition: lattice_field.h:39

mat_quda_
void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
Definition: interface_quda.cpp:5068

QudaInvertParam_s::clover_coeff
double clover_coeff
Definition: quda.h:169

quda::GaugeField
Definition: gauge_field.h:118

fatlink
void * fatlink[4]
Definition: staggered_dslash_test.cpp:51

QUDA_MOBIUS_DWF_DSLASH
Definition: enum_quda.h:79

gaugeLongPrecondition
cudaGaugeField * gaugeLongPrecondition
Definition: interface_quda.cpp:121

quda::GaugeField::removeStaggeredPhase
void removeStaggeredPhase()
Definition: gauge_field.cpp:79

quda::DiracParam::epsilon
double epsilon
Definition: dirac_quda.h:37