v0.9.0/doc/gauge__stout_8cu_source.html

 #include <quda_internal.h>
 #include <quda_matrix.h>
 #include <su3_project.cuh>
 #include <tune_quda.h>
 #include <gauge_field.h>
 #include <gauge_field_order.h>
 #include <index_helper.cuh>

 // Tolerance values passed to the kernel argument structs in this file:
 // DOUBLE_TOL is selected when the origin field reports QUDA_DOUBLE_PRECISION,
 // SINGLE_TOL in every other case.
 #define  DOUBLE_TOL 1e-15
 #define  SINGLE_TOL 2e-6

 namespace quda {

 #ifdef GPU_GAUGE_TOOLS

   /**
      Argument bundle handed by value to the stout-smearing kernel.

      The constructor derives the interior lattice extents from the field
      metadata: each extent is data.X() with the border data.R() removed on
      both sides.  The thread count is half of the interior volume, i.e. the
      number of sites of a single parity.
   */
   template <typename Float, typename GaugeOr, typename GaugeDs>
   struct GaugeSTOUTArg {
     int threads;           // number of active threads required (interior volume / 2)
     int X[4];              // interior grid dimensions (border excluded)
     int border[4];         // border width in each dimension, copied from data.R()
     GaugeOr origin;        // accessor used to read the input links
     const Float rho;       // weight multiplying the staple sum
     const Float tolerance; // stored only; nothing in this struct reads it

     GaugeDs dest;          // accessor used to write the smeared links

     GaugeSTOUTArg(GaugeOr &origin, GaugeDs &dest, const GaugeField &data, const Float rho, const Float tolerance)
       : threads(1), origin(origin), rho(rho), tolerance(tolerance), dest(dest) {
       int volume = 1;
       for (int d = 0; d < 4; d++) {
         border[d] = data.R()[d];
         X[d] = data.X()[d] - 2 * border[d];
         volume *= X[d];
       }
       threads = volume / 2;
     }
   };


   /**
      Sum the length-3 staples attached to the link U_nu(x), nu = dir, at the
      checkerboard site (idx, parity).

      Only the directions mu = 0,1,2 with mu != dir contribute; for each such mu
      both the +mu and the -mu staple are added.  The result overwrites staple.

      @param arg     kernel arguments (interior extents, border and input accessor)
      @param idx     checkerboard index of the site inside the interior volume
      @param parity  parity of the site
      @param dir     direction nu of the link the staples belong to
      @param staple  output: sum of the staples
   */
   template <typename Float, typename GaugeOr, typename GaugeDs, typename Float2>
   __host__ __device__ void computeStaple(GaugeSTOUTArg<Float,GaugeOr,GaugeDs>& arg, int idx, int parity, int dir, Matrix<Float2,3> &staple) {

     typedef Matrix<complex<Float>,3> Link;

     // Site coordinates in the interior lattice, afterwards moved into the
     // bordered lattice by adding the border offset to the coordinates and
     // twice the border to the extents.
     int dims[4];
     int coord[4];
     for (int d = 0; d < 4; d++) dims[d] = arg.X[d];
     getCoords(coord, idx, dims, parity);
     for (int d = 0; d < 4; d++) {
       coord[d] += arg.border[d];
       dims[d] += 2 * arg.border[d];
     }

     setZero(&staple);

     const int nu = dir;
     const int other_parity = 1 - parity;

     // I believe most users won't want to include time staples in smearing
     for (int mu = 0; mu < 3; mu++) {

       // only directions orthogonal to the link contribute
       if (mu == dir) continue;

       int shift[4] = {0, 0, 0, 0};
       Link A, B, C;

       // ---- +mu staple: U_mu(x) * U_nu(x+mu) * U^dag_mu(x+nu) ----
       A = arg.origin(mu, linkIndexShift(coord, shift, dims), parity);

       shift[mu] = 1;
       B = arg.origin(nu, linkIndexShift(coord, shift, dims), other_parity);

       shift[mu] = 0;
       shift[nu] = 1;
       C = arg.origin(mu, linkIndexShift(coord, shift, dims), other_parity);

       staple = staple + A * B * conj(C);

       // ---- -mu staple: U^dag_mu(x-mu) * U_nu(x-mu) * U_mu(x-mu+nu) ----
       shift[mu] = -1;
       shift[nu] = 0;
       A = arg.origin(mu, linkIndexShift(coord, shift, dims), other_parity);
       B = arg.origin(nu, linkIndexShift(coord, shift, dims), other_parity);

       shift[nu] = 1;
       C = arg.origin(mu, linkIndexShift(coord, shift, dims), parity);

       staple = staple + conj(A) * B * C;
     }
   }

   /**
      Kernel performing one stout-smearing step on the links in directions 0..2.

      Thread layout: the x index selects the checkerboard site, the y index the
      parity and the z index the link direction.  Threads with
      idx >= arg.threads or dir >= 3 exit without doing any work.

      For its link U the thread
        - sums the staples (computeStaple),
        - forms Omega = (rho * staple) * U^{-1},
        - builds Q = i/2 * [ (conj(Omega) - Omega) - 1/3 Tr(conj(Omega) - Omega) ],
        - writes exp(iQ) * U into arg.dest.
   */
   template<typename Float, typename GaugeOr, typename GaugeDs>
     __global__ void computeSTOUTStep(GaugeSTOUTArg<Float,GaugeOr,GaugeDs> arg){

       const int idx    = blockIdx.x*blockDim.x + threadIdx.x;
       const int parity = blockIdx.y*blockDim.y + threadIdx.y;
       const int dir    = blockIdx.z*blockDim.z + threadIdx.z;

       // Only spatial dimensions are smeared; surplus threads leave here.
       if (idx >= arg.threads || dir >= 3) return;

       typedef complex<Float> Complex;
       typedef Matrix<complex<Float>,3> Link;

       // Site coordinates: interior first, then shifted into the bordered lattice.
       int dims[4];
       int coord[4];
       for (int d = 0; d < 4; d++) dims[d] = arg.X[d];
       getCoords(coord, idx, dims, parity);
       for (int d = 0; d < 4; d++) {
         coord[d] += arg.border[d];
         dims[d] += 2*arg.border[d];
       }

       // Index of this thread's own site (zero displacement), used for both
       // the load of U and the store of the smeared link.
       int no_shift[4] = {0, 0, 0, 0};
       const int site = linkIndexShift(coord, no_shift, dims);

       Link U, UDag, Stap, Omega, OmegaDiff, Unit, Q, exp_iQ;
       Complex traceThird;
       Complex i_2(0,0.5);

       // Stap = S_{mu,nu}, i.e. the sum of the staples of length 3:
       //
       // |- > -|                /- > -/                /- > -
       // ^     v               ^     v                ^
       // |     |              /     /                /- < -
       //         + |     |  +         +  /     /  +         +  - > -/
       //           v     ^              v     ^                    v
       //           |- > -|             /- > -/                - < -/
       computeStaple<Float,GaugeOr,GaugeDs,Complex>(arg,idx,parity,dir,Stap);

       // The link being smeared
       U = arg.origin(dir, site, parity);

       // Omega_{mu} = [Sum_{mu neq nu} rho_{mu,nu} C_{mu,nu}] * U_{mu}^dag,
       // with U^dag obtained through a matrix inverse.
       computeMatrixInverse(U,&UDag);
       Omega = (arg.rho * Stap) * UDag;

       // Q_{mu} = i/2 [Omega_{mu}^dag - Omega_{mu}
       //               - 1/3 Tr(Omega_{mu}^dag - Omega_{mu})]
       OmegaDiff = conj(Omega) - Omega;
       traceThird = getTrace(OmegaDiff);
       traceThird = (1.0/3.0) * traceThird;

       // The trace part is removed as a multiple of the identity
       setIdentity(&Unit);
       Q = OmegaDiff - traceThird * Unit;
       Q = i_2 * Q;
       // Q is now defined.

 #ifdef HOST_DEBUG
       // Tracelessness test (traceThird is reused as scratch space)
       traceThird = getTrace(Q);
       double error;
       error = traceThird.real();
       printf("Trace test %d %d %.15e\n", idx, dir, error);

       // Hermiticity test: conj(Q) - Q should be the zero matrix,
       // checked through ReTr((conj(Q) - Q)^2)
       Link Q_diff = conj(Q);
       Q_diff -= Q;
       Q_diff *= Q_diff;
       traceThird = getTrace(Q_diff);
       error = traceThird.real();
       printf("Herm test %d %d %.15e\n", idx, dir, error);
 #endif

       exponentiate_iQ(Q,&exp_iQ);

 #ifdef HOST_DEBUG
       // Unitarity test for exp(iQ)
       error = ErrorSU3(exp_iQ);
       printf("expiQ test %d %d %.15e\n", idx, dir, error);
 #endif

       U = exp_iQ * U;

 #ifdef HOST_DEBUG
       // Unitarity test for exp(iQ)*U
       error = ErrorSU3(U);
       printf("expiQ*u test %d %d %.15e\n", idx, dir, error);
 #endif

       arg.dest(dir, site, parity) = U;
   }

   /**
      Tunable launcher for computeSTOUTStep.

      The y thread dimension covers the two parities and the z thread
      dimension the three smeared directions.  The x extent is arg.threads
      (see minThreads()); grid dimensions are not tuned.
   */
   template<typename Float, typename GaugeOr, typename GaugeDs>
   class GaugeSTOUT : TunableVectorYZ {
       GaugeSTOUTArg<Float,GaugeOr,GaugeDs> arg; // by-value copy handed to the kernel
       const GaugeField &meta;                   // queried for location and volume string

       private:
       bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
       unsigned int minThreads() const { return arg.threads; }

       public:
     // (2,3) --- 2 for parity in the y thread dim, 3 corresponds to mapping direction to the z thread dim
     GaugeSTOUT(GaugeSTOUTArg<Float,GaugeOr,GaugeDs> &arg, const GaugeField &meta)
       : TunableVectorYZ(2,3), arg(arg), meta(meta) {}
       virtual ~GaugeSTOUT () {}

       /**
          Launch the smearing kernel.
          @param stream  CUDA stream the kernel is enqueued on.  The launch is
                         asynchronous; callers synchronize themselves.
       */
       void apply(const cudaStream_t &stream){
         if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
           TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
           // Enqueue on the stream requested by the caller rather than
           // unconditionally on the default stream.
           computeSTOUTStep<<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
         } else {
           errorQuda("CPU not supported yet\n");
           //computeSTOUTStepCPU(arg);
         }
       }

       // Tuning key: volume string, dynamic type name, thread count and precision.
       TuneKey tuneKey() const {
         std::stringstream aux;
         aux << "threads=" << arg.threads << ",prec="  << sizeof(Float);
         return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
       }

       long long flops() const { return 3*(2+2*4)*198ll*arg.threads; } // just counts matrix multiplication
       long long bytes() const { return 3*((1+2*6)*arg.origin.Bytes()+arg.dest.Bytes())*arg.threads; }
     }; // GaugeSTOUT

   /**
      Build the kernel arguments for the given accessors, launch one
      stout-smearing step on stream 0 and wait for the device to finish.

      The tolerance stored in the arguments is DOUBLE_TOL when dataOr is
      double precision and SINGLE_TOL otherwise.
   */
   template<typename Float,typename GaugeOr, typename GaugeDs>
   void STOUTStep(GaugeOr origin, GaugeDs dest, const GaugeField& dataOr, Float rho) {
     const bool is_double = (dataOr.Precision() == QUDA_DOUBLE_PRECISION);
     const Float tol = is_double ? DOUBLE_TOL : SINGLE_TOL;

     GaugeSTOUTArg<Float,GaugeOr,GaugeDs> arg(origin, dest, dataOr, rho, tol);
     GaugeSTOUT<Float,GaugeOr,GaugeDs> smear(arg, dataOr);
     smear.apply(0);
     qudaDeviceSynchronize();
   }

   /**
      Pick the accessor types matching the reconstruction of the destination
      and origin fields and forward to the accessor-level STOUTStep.

      Every combination of QUDA_RECONSTRUCT_NO / _12 / _8 is dispatched; any
      other reconstruction type is reported through errorQuda.
   */
   template<typename Float>
   void STOUTStep(GaugeField &dataDs, const GaugeField& dataOr, Float rho) {

     // One accessor type per supported reconstruction
     typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type GaugeNo;
     typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge12;
     typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge8;

     switch (dataDs.Reconstruct()) {

     case QUDA_RECONSTRUCT_NO:
       switch (dataOr.Reconstruct()) {
       case QUDA_RECONSTRUCT_NO: STOUTStep(GaugeNo(dataOr), GaugeNo(dataDs), dataOr, rho); break;
       case QUDA_RECONSTRUCT_12: STOUTStep(Gauge12(dataOr), GaugeNo(dataDs), dataOr, rho); break;
       case QUDA_RECONSTRUCT_8:  STOUTStep(Gauge8(dataOr), GaugeNo(dataDs), dataOr, rho); break;
       default:
         errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
         break;
       }
       break;

     case QUDA_RECONSTRUCT_12:
       switch (dataOr.Reconstruct()) {
       case QUDA_RECONSTRUCT_NO: STOUTStep(GaugeNo(dataOr), Gauge12(dataDs), dataOr, rho); break;
       case QUDA_RECONSTRUCT_12: STOUTStep(Gauge12(dataOr), Gauge12(dataDs), dataOr, rho); break;
       case QUDA_RECONSTRUCT_8:  STOUTStep(Gauge8(dataOr), Gauge12(dataDs), dataOr, rho); break;
       default:
         errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
         break;
       }
       break;

     case QUDA_RECONSTRUCT_8:
       switch (dataOr.Reconstruct()) {
       case QUDA_RECONSTRUCT_NO: STOUTStep(GaugeNo(dataOr), Gauge8(dataDs), dataOr, rho); break;
       case QUDA_RECONSTRUCT_12: STOUTStep(Gauge12(dataOr), Gauge8(dataDs), dataOr, rho); break;
       case QUDA_RECONSTRUCT_8:  STOUTStep(Gauge8(dataOr), Gauge8(dataDs), dataOr, rho); break;
       default:
         errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
         break;
       }
       break;

     default:
       errorQuda("Reconstruction type %d of destination gauge field not supported", dataDs.Reconstruct());
       break;
     }

   }

 #endif

   /**
      Public entry point: apply one stout-smearing step, reading dataOr and
      writing dataDs.

      Checks, in this order: both fields have the same precision, the precision
      is not half, the origin field is native, the destination field is native.
      Each failed check is reported through errorQuda.  Single and double
      precision are then dispatched to the templated implementation.  Without
      GPU_GAUGE_TOOLS the function only reports an error.

      @param dataDs  destination gauge field
      @param dataOr  origin gauge field
      @param rho     smearing weight (narrowed to float for single precision)
   */
   void STOUTStep(GaugeField &dataDs, const GaugeField& dataOr, double rho) {

 #ifdef GPU_GAUGE_TOOLS

     if (dataDs.Precision() != dataOr.Precision()) {
       errorQuda("Origin and destination fields must have the same precision\n");
     }

     if (QUDA_HALF_PRECISION == dataDs.Precision()) {
       errorQuda("Half precision not supported\n");
     }

     if (!dataOr.isNative()) {
       errorQuda("Order %d with %d reconstruct not supported", dataOr.Order(), dataOr.Reconstruct());
     }

     if (!dataDs.isNative()) {
       errorQuda("Order %d with %d reconstruct not supported", dataDs.Order(), dataDs.Reconstruct());
     }

     switch (dataDs.Precision()) {
     case QUDA_SINGLE_PRECISION:
       STOUTStep<float>(dataDs, dataOr, static_cast<float>(rho));
       break;
     case QUDA_DOUBLE_PRECISION:
       STOUTStep<double>(dataDs, dataOr, rho);
       break;
     default:
       errorQuda("Precision %d not supported", dataDs.Precision());
       break;
     }
 #else
   errorQuda("Gauge tools are not build");
 #endif
   }


   //------------------------//
   // Over-Improved routines //
   //------------------------//


   /**
      Argument bundle handed by value to the over-improved stout kernel.

      Identical in layout to the plain stout arguments plus the extra
      parameter epsilon.  The constructor derives the interior lattice extents
      (data.X() minus the border data.R() on both sides) and sets the thread
      count to half of the interior volume.
   */
   template <typename Float, typename GaugeOr, typename GaugeDs>
   struct GaugeOvrImpSTOUTArg {
     int threads;           // number of active threads required (interior volume / 2)
     int X[4];              // interior grid dimensions (border excluded)
     int border[4];         // border width in each dimension, copied from data.R()
     GaugeOr origin;        // accessor used to read the input links
     const Float rho;       // overall smearing weight
     const Float epsilon;   // enters the staple and rectangle coefficients in the kernel
     const Float tolerance; // stored only; nothing in this struct reads it

     GaugeDs dest;          // accessor used to write the smeared links

     GaugeOvrImpSTOUTArg(GaugeOr &origin, GaugeDs &dest, const GaugeField &data, const Float rho, const Float epsilon, const Float tolerance)
       : threads(1), origin(origin), rho(rho), epsilon(epsilon), tolerance(tolerance), dest(dest) {
       int volume = 1;
       for (int d = 0; d < 4; d++) {
         border[d] = data.R()[d];
         X[d] = data.X()[d] - 2 * border[d];
         volume *= X[d];
       }
       threads = volume / 2;
     }
   };


   /**
      For the link U_nu(x), nu = dir, at the checkerboard site (idx, parity),
      accumulate
        - staple:    the sum of the length-3 staples, and
        - rectangle: the sum of the length-5 (1x2 and 2x1) rectangle staples,
      over all four directions mu != nu, in both the +mu and -mu orientation.
      Both outputs are zeroed first.

      Parity bookkeeping: a link fetched at x + dx is read with the caller's
      parity when the displacement dx has an even number of unit steps and with
      1-parity when it has an odd number.

      @param arg        kernel arguments (interior extents, border, input accessor)
      @param idx        checkerboard index of the site inside the interior volume
      @param parity     parity of the site
      @param dir        direction nu of the link being smeared
      @param staple     output: sum of the staples
      @param rectangle  output: sum of the rectangles
   */
   template <typename Float, typename GaugeOr, typename GaugeDs, typename Float2>
   __host__ __device__ void computeStapleRectangle(GaugeOvrImpSTOUTArg<Float,GaugeOr,GaugeDs>& arg, int idx, int parity, int dir,
               Matrix<Float2,3> &staple, Matrix<Float2,3> &rectangle) {

     typedef Matrix<complex<Float>,3> Link;
     // compute spacetime dimensions and parity

     int X[4];
     for(int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];

     // Interior coordinates first, then shifted into the bordered lattice.
     int x[4];
     getCoords(x, idx, X, parity);
     for(int dr=0; dr<4; ++dr) {
       x[dr] += arg.border[dr];
       X[dr] += 2*arg.border[dr];
     }

     setZero(&staple);
     setZero(&rectangle);

     // Over-Improved stout is usually done for topological
     // measurements, so we include the temporal direction.
     for (int mu=0; mu<4; mu++) {

       //identify directions orthogonal to the link.
       if (mu != dir) {

         int nu = dir;

         //RECTANGLE calculation
         // This is done in three parts. For some link U_nu(x) there are
         // 1x2 rectangles (R12) and two sets of 2x1 rectangles, defined as
         // 'forward' (R21f)  and 'backward' (R21b).

         //STAPLE calculation
         // This is done part way through the computation of (R21f) as the
         // First two links of the staple are already in memory.

         //Memory usage and communications.
         // There are 10 unique links to be fetched per direction. 3 of these
         // links (the ones that form the simple staple) can be recycled on
         // the fly. The two links immediately succeeding and preceding
         // U_nu(x) in the nu direction are also reused when changing from
         // +ve to -ve mu.

         {
           int dx[4] = {0, 0, 0, 0};
           Link U1, U2, U3, U4, U5, U6, U7;

           //--------//
           // +ve mu //
           //--------//

           //----------------------------------------------------------------
           //R12 = U_mu(x)*U_mu(x+mu)*U_nu(x+2mu)*U^d_mu(x+nu+mu)*U^d_mu(x+nu)
           //Get link U_mu(x)
           U1 = arg.origin(mu, linkIndexShift(x,dx,X), parity);

           dx[mu]++;
           //Get link U_mu(x+mu)
           U2 = arg.origin(mu, linkIndexShift(x,dx,X), 1-parity);

           dx[mu]++;
           //Get link U_nu(x+2mu)
           U3 = arg.origin(nu, linkIndexShift(x,dx,X), parity);

           dx[mu]--;
           dx[nu]++;
           //Get link U_mu(x+nu+mu)
           U4 = arg.origin(mu, linkIndexShift(x,dx,X), parity);

           dx[mu]--;
           //Get link U_mu(x+nu)
           U5 = arg.origin(mu, linkIndexShift(x,dx,X), 1-parity);

           rectangle = rectangle + U1*U2*U3*conj(U4)*conj(U5);
           //---------------------------------------------------------------

           //reset dx
           dx[nu]--;
           //---------------------------------------------------------------
           //R21f=U_mu(x)*U_nu(x+mu)*U_nu(x+nu+mu)*U^d_mu(x+2nu)*U^d_nu(x+nu)
           //Get link U_mu(x)
           //Same as U1 from R12

           dx[mu]++;
           //Get link U_nu(x+mu)
           U2 = arg.origin(nu, linkIndexShift(x,dx,X), 1-parity);

           //Here we get the third link in the staple and compute.
           //Get U_mu(x+nu)
           //Same as U5 from R12
           staple = staple + U1*U2*conj(U5);

           dx[nu]++;
           //Get link U_nu(x+nu+mu)
           U3 = arg.origin(nu, linkIndexShift(x,dx,X), parity);

           dx[mu]--;
           dx[nu]++;
           //Get link U_mu(x+2nu)
           U4 = arg.origin(mu, linkIndexShift(x,dx,X), parity);

           dx[nu]--;
           //Get link U_nu(x+nu)
           U6 = arg.origin(nu, linkIndexShift(x,dx,X), 1-parity);

           rectangle = rectangle + U1 * U2 * U3 * conj(U4) * conj(U6);
           //---------------------------------------------------------------


           //reset dx
           dx[nu]--;
           //---------------------------------------------------------------
           //R21b=U^d_nu(x-nu)*U_mu(x-nu)*U_nu(x-nu+mu)*U_nu(x+mu)*U^d_mu(x+nu)

           //Get link U_nu(x-nu)
           dx[nu]--;
           U7 = arg.origin(nu, linkIndexShift(x,dx,X), 1-parity);

           //Get link U_mu(x-nu)
           U4 = arg.origin(mu, linkIndexShift(x,dx,X), 1-parity);

           //Get link U_nu(x-nu+mu)
           dx[mu]++;
           U3 = arg.origin(nu, linkIndexShift(x,dx,X), parity);

           //Get link U_nu(x+mu)
           //Same as U2 from R21f

           //Get link U_mu(x+nu)
           //Same as U5 from R12

           rectangle = rectangle + conj(U7) * U4 * U3 * U2 * conj(U5);
           //---------------------------------------------------------------


           //--------//
           // -ve mu //
           //--------//

           //reset dx
           dx[mu]--;
           dx[nu]++;
           //---------------------------------------------------------------
           // R12 = U^dag_mu(x-mu) * U^dag_mu(x-2mu) * U_nu(x-2mu) * U_mu(x-2mu+nu) * U_mu(x-mu+nu)

           dx[mu]--;
           //Get link U_mu(x-mu)
           U1 = arg.origin(mu, linkIndexShift(x,dx,X), 1-parity);

           dx[mu]--;
           //Get link U_mu(x-2mu)
           U2 = arg.origin(mu, linkIndexShift(x,dx,X), parity);

           //Get link U_nu(x-2mu)
           U3 = arg.origin(nu, linkIndexShift(x,dx,X), parity);

           dx[nu]++;
           //Get link U_mu(x-2mu+nu)
           U4 = arg.origin(mu, linkIndexShift(x,dx,X), 1-parity);

           dx[mu]++;
           //Get link U_mu(x-mu+nu)
           U5 = arg.origin(mu, linkIndexShift(x,dx,X), parity);

           rectangle = rectangle + conj(U1) * conj(U2) * U3 * U4 * U5;
           //---------------------------------------------------------------

           //reset dx
           dx[mu]++;
           dx[nu]--;
           //---------------------------------------------------------------
           // R21f = U^dag_mu(x-mu) * U_nu(x-mu) * U_nu(x-mu+nu) * U_mu(x-mu+2nu) * U^dag_nu(x+nu)

           //Get link U_mu(x-mu)
           //Same as U1 from R12

           dx[mu]--;
           //Get link U_nu(x-mu)
           U2 = arg.origin(nu, linkIndexShift(x,dx,X), 1-parity);

           //Here we get the third link in the staple and compute.
           //Get U_mu(x-mu+nu)
           //Same as U5 from R12
           staple = staple + conj(U1) * U2 * U5;

           dx[nu]++;
           //Get link U_nu(x-mu+nu)
           U3 = arg.origin(nu, linkIndexShift(x,dx,X), parity);

           dx[nu]++;
           //Get link U_mu(x-mu+2nu)
           U4 = arg.origin(mu, linkIndexShift(x,dx,X), 1-parity);

           //Get link U_nu(x+nu)
           //Same as U6 from +ve R21f

           rectangle = rectangle + conj(U1) * U2 * U3 * U4 * conj(U6);
           //---------------------------------------------------------------

           //reset dx
           dx[nu]--;
           dx[nu]--;
           dx[mu]++;
           //---------------------------------------------------------------
           // R21b= U^dag_nu(x-nu) * U^dag_mu(x-mu-nu) * U_nu(x-mu-nu) * U_nu(x-mu) * U_mu(x-mu+nu)

           //Get link U_nu(x-nu)
           //Same as U7 from +ve R21b

           //Get link U_mu(x-mu-nu)
           // x-mu-nu is two unit steps away from x, so it lies on the same
           // parity as x (exactly like the U_nu fetch at this site below).
           dx[nu]--;
           dx[mu]--;
           U4 = arg.origin(mu, linkIndexShift(x,dx,X), parity);

           //Get link U_nu(x-nu-mu)
           U3 = arg.origin(nu, linkIndexShift(x,dx,X), parity);

           //Get link U_nu(x-mu)
           //Same as U2 from R21f

           //Get link U_mu(x-mu+nu)
           //Same as U5 from R12

           rectangle = rectangle + conj(U7) * conj(U4) * U3 * U2 * U5;
           //---------------------------------------------------------------
         }
       }
     }
   }

   /**
      Kernel performing one over-improved stout-smearing step on all four link
      directions (see https://arxiv.org/abs/0801.1165).

      Thread layout: the x index selects the checkerboard site, the y index the
      parity and the z index the link direction (0..3).  Threads with
      idx >= arg.threads or dir >= 4 exit without doing any work.

      For its link U the thread
        - gathers the staple sum S and rectangle sum R (computeStapleRectangle),
        - forms Omega = rho * [ (5-2*eps)/3 * S - (1-eps)/12 * R ] * U^{-1},
        - builds Q = i/2 * [ (conj(Omega) - Omega) - 1/3 Tr(conj(Omega) - Omega) ],
        - writes exp(iQ) * U into arg.dest.
   */
   template<typename Float, typename GaugeOr, typename GaugeDs>
     __global__ void computeOvrImpSTOUTStep(GaugeOvrImpSTOUTArg<Float,GaugeOr,GaugeDs> arg){

       int idx = threadIdx.x + blockIdx.x*blockDim.x;
       int parity = threadIdx.y + blockIdx.y*blockDim.y;
       int dir = threadIdx.z + blockIdx.z*blockDim.z;
       if (idx >= arg.threads) return;
       // Only directions 0..3 exist.  The z extent of the launch is presumably
       // rounded up to a multiple of the block size by the tuner (the plain
       // stout kernel guards its direction index the same way), so surplus
       // threads must leave before indexing the field with an invalid direction.
       if (dir >= 4) return;
       typedef complex<Float> Complex;
       typedef Matrix<complex<Float>,3> Link;

       int X[4];
       for(int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];

       // Interior coordinates first, then shifted into the bordered lattice.
       int x[4];
       getCoords(x, idx, X, parity);
       for(int dr=0; dr<4; ++dr) {
         x[dr] += arg.border[dr];
         X[dr] += 2*arg.border[dr];
       }

       // Relative weights of the staple and rectangle terms
       double staple_coeff = (5.0 - 2.0*arg.epsilon)/3.0;
       double rectangle_coeff = (1.0 - arg.epsilon)/12.0;

       int dx[4] = {0, 0, 0, 0};
       //All dimensions are smeared
       {
         Link U, UDag, Stap, Rect, Omega, OmegaDiff, ODT, Q, exp_iQ;
         Complex OmegaDiffTr;
         Complex i_2(0,0.5);

         //This function gets stap = S_{mu,nu} i.e., the staple of length 3,
         //and the 1x2 and 2x1 rectangles of length 5. From the following paper:
         //https://arxiv.org/abs/0801.1165
         computeStapleRectangle<Float,GaugeOr,GaugeDs,Complex>(arg,idx,parity,dir,Stap,Rect);

         // Get link U
         U = arg.origin(dir, linkIndexShift(x,dx,X), parity);

         //Compute Omega_{mu}=[Sum_{mu neq nu}rho_{mu,nu}C_{mu,nu}]*U_{mu}^dag
         //-------------------------------------------------------------------

         //Get U^{\dagger}
         computeMatrixInverse(U,&UDag);

         //Compute \rho * staple_coeff * S
         Omega = (arg.rho*staple_coeff)*(Stap);

         //Compute \rho * rectangle_coeff * R
         Omega = Omega - (arg.rho*rectangle_coeff)*(Rect);
         Omega = Omega * UDag;

         //Compute \Q_{mu} = i/2[Omega_{mu}^dag - Omega_{mu}
         //                      - 1/3 Tr(Omega_{mu}^dag - Omega_{mu})]

         OmegaDiff = conj(Omega) - Omega;

         Q = OmegaDiff;
         OmegaDiffTr = getTrace(OmegaDiff);
         OmegaDiffTr = (1.0/3.0) * OmegaDiffTr;

         //Matrix proportional to OmegaDiffTr
         setIdentity(&ODT);

         Q = Q - OmegaDiffTr * ODT;
         Q = i_2 * Q;
         //Q is now defined.

 #ifdef HOST_DEBUG
         //Test for tracelessness:
         //reuse OmegaDiffTr
         OmegaDiffTr = getTrace(Q);
         double error;
         error = OmegaDiffTr.real();
         printf("Trace test %d %d %.15e\n", idx, dir, error);

         //Test for hermiticity:
         Link Q_diff = conj(Q);
         Q_diff -= Q; //This should be the zero matrix. Test by ReTr(Q_diff^2);
         Q_diff *= Q_diff;
         //reuse OmegaDiffTr
         OmegaDiffTr = getTrace(Q_diff);
         error = OmegaDiffTr.real();
         printf("Herm test %d %d %.15e\n", idx, dir, error);
 #endif

         exponentiate_iQ(Q,&exp_iQ);

 #ifdef HOST_DEBUG
         //Test for expiQ unitarity:
         error = ErrorSU3(exp_iQ);
         printf("expiQ test %d %d %.15e\n", idx, dir, error);
 #endif

         U = exp_iQ * U;
 #ifdef HOST_DEBUG
         //Test for expiQ*U unitarity:
         error = ErrorSU3(U);
         printf("expiQ*u test %d %d %.15e\n", idx, dir, error);
 #endif

         arg.dest(dir, linkIndexShift(x,dx,X), parity) = U;
       }
   }


   /**
      Tunable launcher for computeOvrImpSTOUTStep.

      The y thread dimension covers the two parities and the z thread
      dimension the four smeared directions, matching the factor 4 used in
      flops() and bytes().  The x extent is arg.threads (see minThreads());
      grid dimensions are not tuned.
   */
   template<typename Float, typename GaugeOr, typename GaugeDs>
     class GaugeOvrImpSTOUT : TunableVectorYZ {
       GaugeOvrImpSTOUTArg<Float,GaugeOr,GaugeDs> arg; // by-value copy handed to the kernel
       const GaugeField &meta;                         // queried for location and volume string

       private:
       bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
       unsigned int minThreads() const { return arg.threads; }

       public:
     // (2,4) --- 2 for parity in the y thread dim, 4 corresponds to mapping direction to the z thread dim
     GaugeOvrImpSTOUT(GaugeOvrImpSTOUTArg<Float,GaugeOr,GaugeDs> &arg, const GaugeField &meta)
       : TunableVectorYZ(2,4), arg(arg), meta(meta) {}
       virtual ~GaugeOvrImpSTOUT () {}

       /**
          Launch the smearing kernel.
          @param stream  CUDA stream the kernel is enqueued on.  The launch is
                         asynchronous; callers synchronize themselves.
       */
       void apply(const cudaStream_t &stream){
         if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
           TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
           // Enqueue on the stream requested by the caller rather than
           // unconditionally on the default stream.
           computeOvrImpSTOUTStep<<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
         } else {
           errorQuda("CPU not supported yet\n");
           //computeOvrImpSTOUTStepCPU(arg);
         }
       }

       // Tuning key: volume string, dynamic type name, thread count and precision.
       TuneKey tuneKey() const {
         std::stringstream aux;
         aux << "threads=" << arg.threads << ",prec="  << sizeof(Float);
         return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
       }

     long long flops() const { return 4*(18+2+2*4)*198ll*arg.threads; } // just counts matrix multiplication
     long long bytes() const { return 4*((1+2*12)*arg.origin.Bytes()+arg.dest.Bytes())*arg.threads; }
   }; // GaugeOvrImpSTOUT


   /**
      Build the kernel arguments for the given accessors, launch one
      over-improved stout-smearing step on stream 0 and wait for the device
      to finish.

      The tolerance stored in the arguments is DOUBLE_TOL when dataOr is
      double precision and SINGLE_TOL otherwise.
   */
   template<typename Float,typename GaugeOr, typename GaugeDs>
   void OvrImpSTOUTStep(GaugeOr origin, GaugeDs dest, const GaugeField& dataOr, Float rho, Float epsilon) {
     const bool is_double = (dataOr.Precision() == QUDA_DOUBLE_PRECISION);
     const Float tol = is_double ? DOUBLE_TOL : SINGLE_TOL;

     GaugeOvrImpSTOUTArg<Float,GaugeOr,GaugeDs> arg(origin, dest, dataOr, rho, epsilon, tol);
     GaugeOvrImpSTOUT<Float,GaugeOr,GaugeDs> smear(arg, dataOr);
     smear.apply(0);
     qudaDeviceSynchronize();
   }

   /**
      Pick the accessor types matching the reconstruction of the destination
      and origin fields and forward to the accessor-level OvrImpSTOUTStep.

      Every combination of QUDA_RECONSTRUCT_NO / _12 / _8 is dispatched; any
      other reconstruction type is reported through errorQuda.
   */
   template<typename Float>
   void OvrImpSTOUTStep(GaugeField &dataDs, const GaugeField& dataOr, Float rho, Float epsilon) {

     // One accessor type per supported reconstruction
     typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type GaugeNo;
     typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge12;
     typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge8;

     switch (dataDs.Reconstruct()) {

     case QUDA_RECONSTRUCT_NO:
       switch (dataOr.Reconstruct()) {
       case QUDA_RECONSTRUCT_NO: OvrImpSTOUTStep(GaugeNo(dataOr), GaugeNo(dataDs), dataOr, rho, epsilon); break;
       case QUDA_RECONSTRUCT_12: OvrImpSTOUTStep(Gauge12(dataOr), GaugeNo(dataDs), dataOr, rho, epsilon); break;
       case QUDA_RECONSTRUCT_8:  OvrImpSTOUTStep(Gauge8(dataOr), GaugeNo(dataDs), dataOr, rho, epsilon); break;
       default:
         errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
         break;
       }
       break;

     case QUDA_RECONSTRUCT_12:
       switch (dataOr.Reconstruct()) {
       case QUDA_RECONSTRUCT_NO: OvrImpSTOUTStep(GaugeNo(dataOr), Gauge12(dataDs), dataOr, rho, epsilon); break;
       case QUDA_RECONSTRUCT_12: OvrImpSTOUTStep(Gauge12(dataOr), Gauge12(dataDs), dataOr, rho, epsilon); break;
       case QUDA_RECONSTRUCT_8:  OvrImpSTOUTStep(Gauge8(dataOr), Gauge12(dataDs), dataOr, rho, epsilon); break;
       default:
         errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
         break;
       }
       break;

     case QUDA_RECONSTRUCT_8:
       switch (dataOr.Reconstruct()) {
       case QUDA_RECONSTRUCT_NO: OvrImpSTOUTStep(GaugeNo(dataOr), Gauge8(dataDs), dataOr, rho, epsilon); break;
       case QUDA_RECONSTRUCT_12: OvrImpSTOUTStep(Gauge12(dataOr), Gauge8(dataDs), dataOr, rho, epsilon); break;
       case QUDA_RECONSTRUCT_8:  OvrImpSTOUTStep(Gauge8(dataOr), Gauge8(dataDs), dataOr, rho, epsilon); break;
       default:
         errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
         break;
       }
       break;

     default:
       errorQuda("Reconstruction type %d of destination gauge field not supported", dataDs.Reconstruct());
       break;
     }

   }


   /**
      Public entry point: apply one over-improved stout-smearing step, reading
      dataOr and writing dataDs.

      Checks, in this order: both fields have the same precision, the precision
      is not half, the origin field is native, the destination field is native.
      Each failed check is reported through errorQuda.  Single and double
      precision are then dispatched to the templated implementation.  Without
      GPU_GAUGE_TOOLS the function only reports an error.

      @param dataDs   destination gauge field
      @param dataOr   origin gauge field
      @param rho      smearing weight
      @param epsilon  over-improvement parameter
                      (both scalars are narrowed to float for single precision)
   */
   void OvrImpSTOUTStep(GaugeField &dataDs, const GaugeField& dataOr, double rho, double epsilon) {

 #ifdef GPU_GAUGE_TOOLS

     if (dataDs.Precision() != dataOr.Precision()) {
       errorQuda("Origin and destination fields must have the same precision\n");
     }

     if (QUDA_HALF_PRECISION == dataDs.Precision()) {
       errorQuda("Half precision not supported\n");
     }

     if (!dataOr.isNative()) {
       errorQuda("Order %d with %d reconstruct not supported", dataOr.Order(), dataOr.Reconstruct());
     }

     if (!dataDs.isNative()) {
       errorQuda("Order %d with %d reconstruct not supported", dataDs.Order(), dataDs.Reconstruct());
     }

     switch (dataDs.Precision()) {
     case QUDA_SINGLE_PRECISION:
       OvrImpSTOUTStep<float>(dataDs, dataOr, static_cast<float>(rho), static_cast<float>(epsilon));
       break;
     case QUDA_DOUBLE_PRECISION:
       OvrImpSTOUTStep<double>(dataDs, dataOr, rho, epsilon);
       break;
     default:
       errorQuda("Precision %d not supported", dataDs.Precision());
       break;
     }
 #else
   errorQuda("Gauge tools are not build");
 #endif
   }
 }
quda::GaugeOvrImpSTOUTArg::dest
GaugeDs dest
Definition: gauge_stout.cu:347

QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:66

quda::TuneParam
Definition: tune_quda.h:17

blockDim
dim3 dim3 blockDim
Definition: CMakeCUDACompilerId.cpp1.ii:2471

quda::ErrorSU3
__device__ __host__ double ErrorSU3(const Matrix< Cmplx, 3 > &matrix)
Definition: quda_matrix.h:1083

mu
double mu
Definition: test_util.cpp:1643

quda::setZero
__device__ __host__ void setZero(Matrix< T, N > *m)
Definition: quda_matrix.h:592

quda::linkIndexShift
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
Definition: index_helper.cuh:13

quda::GaugeOvrImpSTOUTArg::tolerance
const Float tolerance
Definition: gauge_stout.cu:345

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20

errorQuda
#define errorQuda(...)
Definition: util_quda.h:90

QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:297

QUDA_HALF_PRECISION
Definition: enum_quda.h:59

quda::GaugeOvrImpSTOUT::~GaugeOvrImpSTOUT
virtual ~GaugeOvrImpSTOUT()
Definition: gauge_stout.cu:716

quda::STOUTStep
void STOUTStep(GaugeField &dataDs, const GaugeField &dataOr, double rho)
Definition: gauge_stout.cu:300

quda::Complex
std::complex< double > Complex
Definition: eig_variables.h:13

quda::GaugeOvrImpSTOUT::apply
void apply(const cudaStream_t &stream)
Definition: gauge_stout.cu:718

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cu:898

quda::GaugeOvrImpSTOUTArg::rho
const Float rho
Definition: gauge_stout.cu:343

quda::LatticeField::VolString
const char * VolString() const
Definition: lattice_field.h:524

quda::TuneParam::shared_bytes
int shared_bytes
Definition: tune_quda.h:22

quda::TuneParam::grid
dim3 grid
Definition: tune_quda.h:21

quda::GaugeOvrImpSTOUTArg::border
int border[4]
Definition: gauge_stout.cu:341

quda
Definition: blas_cublas.h:6

quda::GaugeOvrImpSTOUTArg::epsilon
const Float epsilon
Definition: gauge_stout.cu:344

quda::GaugeOvrImpSTOUTArg::X
int X[4]
Definition: gauge_stout.cu:340

quda::GaugeOvrImpSTOUT::tuneKey
TuneKey tuneKey() const
Definition: gauge_stout.cu:728

x
p x
Definition: CMakeCUDACompilerId.cpp1.ii:3011

quda::LatticeField::R
const int * R() const
Definition: lattice_field.h:452

SINGLE_TOL
#define SINGLE_TOL
Definition: gauge_stout.cu:10

QUDA_RECONSTRUCT_12
Definition: enum_quda.h:67

printf
int printf(const char *,...) __attribute__((__format__(__printf__

quda::GaugeOvrImpSTOUT::arg
GaugeOvrImpSTOUTArg< Float, GaugeOr, GaugeDs > arg
Definition: gauge_stout.cu:705

quda::GaugeOvrImpSTOUT
Definition: gauge_stout.cu:704

quda::TuneParam::block
dim3 block
Definition: tune_quda.h:20

quda::GaugeOvrImpSTOUT::GaugeOvrImpSTOUT
GaugeOvrImpSTOUT(GaugeOvrImpSTOUTArg< Float, GaugeOr, GaugeDs > &arg, const GaugeField &meta)
Definition: gauge_stout.cu:714

quda::GaugeOvrImpSTOUTArg::threads
int threads
Definition: gauge_stout.cu:339

quda::tuneLaunch
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603

quda::GaugeOvrImpSTOUT::meta
const GaugeField & meta
Definition: gauge_stout.cu:706

quda::GaugeOvrImpSTOUT::bytes
long long bytes() const
Definition: gauge_stout.cu:735

quda::GaugeOvrImpSTOUT::flops
long long flops() const
Definition: gauge_stout.cu:734

gauge_field_order.h
Main header file for host and device accessors to GaugeFields.

su3_project.cuh

quda::OvrImpSTOUTStep
void OvrImpSTOUTStep(GaugeField &dataDs, const GaugeField &dataOr, double rho, double epsilon)
Definition: gauge_stout.cu:801

quda::qudaDeviceSynchronize
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
Definition: quda_cuda_api.cpp:277

quda::computeStapleRectangle
__host__ __device__ void computeStapleRectangle(GaugeOvrImpSTOUTArg< Float, GaugeOr, GaugeDs > &arg, int idx, int parity, int dir, Matrix< Float2, 3 > &staple, Matrix< Float2, 3 > &rectangle)
Definition: gauge_stout.cu:362

quda::GaugeOvrImpSTOUT::tuneGridDim
bool tuneGridDim() const
Definition: gauge_stout.cu:709

quda::GaugeOvrImpSTOUTArg
Definition: gauge_stout.cu:338

tune_quda.h

quda::setIdentity
__device__ __host__ void setIdentity(Matrix< T, N > *m)
Definition: quda_matrix.h:543

quda::GaugeOvrImpSTOUTArg::origin
GaugeOr origin
Definition: gauge_stout.cu:342

X
int X
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:394

quda::LatticeField::Location
QudaFieldLocation Location() const
Definition: lattice_field.cpp:522

QUDA_RECONSTRUCT_8
Definition: enum_quda.h:68

Matrix
Definition: hisq_force_reference2.cpp:131

quda_matrix.h

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:61

quda::getTrace
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:305

index_helper.cuh

DOUBLE_TOL
#define DOUBLE_TOL
Definition: gauge_stout.cu:9

idx
int idx
Definition: staggered_fused_exterior_dslash_core.h:355

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:60

quda::TunableVectorYZ
Definition: tune_quda.h:400

quda::gauge_mapper
Definition: gauge_field_order.h:2083

quda::blas::flops
unsigned long long flops
Definition: blas_quda.cu:42

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880

quda::computeMatrixInverse
__device__ __host__ void computeMatrixInverse(const Matrix< T, 3 > &u, Matrix< T, 3 > *uinv)
Definition: quda_matrix.h:501

quda::computeOvrImpSTOUTStep
__global__ void computeOvrImpSTOUTStep(GaugeOvrImpSTOUTArg< Float, GaugeOr, GaugeDs > arg)
Definition: gauge_stout.cu:598

quda::GaugeOvrImpSTOUTArg::GaugeOvrImpSTOUTArg
GaugeOvrImpSTOUTArg(GaugeOr &origin, GaugeDs &dest, const GaugeField &data, const Float rho, const Float epsilon, const Float tolerance)
Definition: gauge_stout.cu:349

quda::GaugeField::Reconstruct
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203

quda::GaugeField::Order
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:204

float
float
Definition: CMakeCUDACompilerId.cpp1.ii:12791

quda::conj
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115

getTuning
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51

quda::LatticeField::Precision
QudaPrecision Precision() const
Definition: lattice_field.h:462

quda::GaugeField::isNative
bool isNative() const
Definition: gauge_field.cpp:138

parity
QudaParity parity
Definition: covdev_test.cpp:53

gauge_field.h

quda::Matrix
Definition: quda_matrix.h:68

quda::exponentiate_iQ
__device__ __host__ void exponentiate_iQ(const Matrix< T, 3 > &Q, Matrix< T, 3 > *exp_iQ)
Definition: quda_matrix.h:1110

quda::Tunable::aux
char aux[TuneKey::aux_n]
Definition: tune_quda.h:189

quda::TuneKey
Definition: tune_key.h:8

quda::blas::bytes
unsigned long long bytes
Definition: blas_quda.cu:43

quda::LatticeField::X
const int * X() const
Definition: lattice_field.h:415

quda_internal.h

quda::GaugeField
Definition: gauge_field.h:123

quda::getCoords
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)
Definition: index_helper.cuh:129

quda::GaugeOvrImpSTOUT::minThreads
unsigned int minThreads() const
Definition: gauge_stout.cu:710