v0.9.0/doc/hisq__paths__force__quda_8cu_source.html

 #include <quda_internal.h>
 #include <gauge_field.h>
 #include <ks_improved_force.h>
 #include <quda_matrix.h>
 #include <tune_quda.h>
 #include <index_helper.cuh>
 #include <gauge_field_order.h>

 #ifdef GPU_HISQ_FORCE

 namespace quda {

   namespace fermion_force {

     // Link directions. Values 0-3 are the "up" (forward) x,y,z,t directions and
     // 4-7 the matching "down" (backward) ones, numbered so that a direction and
     // its opposite always add up to 7.
     enum {
       XUP = 0, YUP = 1, ZUP = 2, TUP = 3,
       TDOWN = 4, ZDOWN = 5, YDOWN = 6, XDOWN = 7
     };

     // Selects which force contribution a FatLinkArg / FatLinkForce pair computes.
     enum HisqForceType {
       FORCE_ALL_LINK           = 0, // launched as allLinkKernel
       FORCE_MIDDLE_LINK        = 1, // launched as middleLinkKernel with pMu and qMu written
       FORCE_LEPAGE_MIDDLE_LINK = 2, // launched as middleLinkKernel without pMu/qMu output
       FORCE_SIDE_LINK          = 3, // launched as sideLinkKernel
       FORCE_SIDE_LINK_SHORT    = 4, // launched as sideLinkShortKernel
       FORCE_LONG_LINK          = 5, // not handled by FatLinkForce (falls into its error default)
       FORCE_COMPLETE           = 6, // not handled by FatLinkForce (falls into its error default)
       FORCE_ONE_LINK           = 7, // launched as oneLinkTermKernel
       FORCE_INVALID            = 8
     };

     // Opposite of a direction in the 0..7 numbering: a direction and its reverse sum to 7.
     __device__ __host__ constexpr inline int opp_dir(int dir) { return 7 - dir; }
     // Returns 1 for the directions numbered below 4 (the forward ones), 0 otherwise.
     __device__ __host__ constexpr inline int goes_forward(int dir) { return dir < 4; }
     // Returns 1 for the directions numbered 4 and above (the backward ones), 0 otherwise.
     __device__ __host__ constexpr inline int goes_backward(int dir) { return dir >= 4; }
     // Returns +1 when (pos_dir + odd_lattice) is even and -1 when it is odd.
     __device__ __host__ constexpr inline int CoeffSign(int pos_dir, int odd_lattice) { return ((pos_dir + odd_lattice + 1) & 1) ? 1 : -1; }
     // Returns +1 for parity 0 and -1 for any non-zero parity.
     __device__ __host__ constexpr inline int Sign(int parity) { return parity == 0 ? 1 : -1; }
     // Maps a direction onto its forward counterpart: 0..3 are returned unchanged, 4..7 become 7-dir.
     __device__ __host__ constexpr inline int posDir(int dir) { return (dir < 4) ? dir : 7 - dir; }

     // Shifts component `dir` (a compile-time index) of x by `shift`, reduced
     // modulo the extent arg.E[dir]. One extent is added before the modulo, so
     // the result is non-negative as long as x[dir] + shift >= -arg.E[dir].
     template <int dir, typename Arg>
     inline __device__ __host__ void updateCoords(int x[], int shift, const Arg &arg) {
       const int extent = arg.E[dir];
       x[dir] = (x[dir] + shift + extent) % extent;
     }

     // Run-time front end for the compile-time updateCoords<dir>: forwards to the
     // instantiation matching `dir` (0..3). Any other value of `dir` leaves x untouched.
     template <typename Arg>
     inline __device__ __host__ void updateCoords(int x[], int dir, int shift, const Arg &arg) {
       if (dir == 0)      updateCoords<0>(x, shift, arg);
       else if (dir == 1) updateCoords<1>(x, shift, arg);
       else if (dir == 2) updateCoords<2>(x, shift, arg);
       else if (dir == 3) updateCoords<3>(x, shift, arg);
     }

     // Holds the fattening path coefficients, each converted to `real`.
     //
     // The constructor reads six consecutive doubles laid out as
     //   [0] one, [1] naik, [2] three, [3] five, [4] seven, [5] lepage
     // Note that the array order differs from the member declaration order.
     template <typename real>
     struct PathCoefficients {
       const real one;
       const real three;
       const real five;
       const real seven;
       const real naik;
       const real lepage;

       // path_coeff_array must point to at least six readable doubles.
       // The initializers are listed in declaration order (the order in which
       // C++ actually runs them); each member still reads its own array slot.
       PathCoefficients(const double *path_coeff_array)
         : one(path_coeff_array[0]), three(path_coeff_array[2]),
           five(path_coeff_array[3]), seven(path_coeff_array[4]),
           naik(path_coeff_array[1]), lepage(path_coeff_array[5]) { }
     };

     // Kernel-argument base shared by the force kernels: the gauge-link accessor
     // plus the lattice geometry derived from the (extended) link field.
     template <typename real, QudaReconstructType reconstruct=QUDA_RECONSTRUCT_NO>
     struct BaseForceArg {
       typedef typename gauge_mapper<real,reconstruct>::type G;
       const G link;
       int threads;      // half the number of working-set sites (per-parity thread count)
       int X[4];         // regular grid dims: E minus the border on both sides
       int D[4];         // working set grid dims: X, widened by `overlap` per side in partitioned dims
       int E[4];         // extended grid dims, taken from link.X()

       int commDim[4];   // comm_dim_partitioned(d) for each dimension
       int border[4];    // taken from link.R()
       int base_idx[4];  // the offset of the working set into the extended field
       int oddness_change; // parity (0/1) of the summed base_idx offsets
       int mu;           // not set here; FatLinkForce's constructor assigns it
       int sig;          // not set here; FatLinkForce's constructor assigns it

       // link:    gauge field whose X() and R() supply the extended dims and border
       // overlap: extra sites per side added to the working set in partitioned dims
       BaseForceArg(const GaugeField &link, int overlap) : link(link), threads(1),
         commDim{ comm_dim_partitioned(0), comm_dim_partitioned(1), comm_dim_partitioned(2), comm_dim_partitioned(3) }
       {
         int offset_sum = 0;
         for (int dim=0; dim<4; dim++) {
           const bool partitioned = comm_dim_partitioned(dim);
           E[dim] = link.X()[dim];
           border[dim] = link.R()[dim];
           X[dim] = E[dim] - 2*border[dim];
           D[dim] = partitioned ? X[dim] + 2*overlap : X[dim];
           base_idx[dim] = partitioned ? border[dim] - overlap : 0;
           threads *= D[dim];
           offset_sum += base_idx[dim];
         }
         threads /= 2; // one thread per site of a single parity
         oddness_change = offset_sum & 1;
       }
     };

     // Kernel argument for all fat-link force kernels. One constructor exists per
     // HisqForceType; each binds the accessors that its kernel really uses and
     // fills the remaining accessor members with placeholder fields, since every
     // member must be constructed. Each constructor rejects a mismatching `type`.
     //
     // Member initializers are always listed in declaration order, which is the
     // order in which they are executed.
     template <typename real, QudaReconstructType reconstruct=QUDA_RECONSTRUCT_NO>
     struct FatLinkArg : public BaseForceArg<real,reconstruct> {

       typedef typename gauge_mapper<real,QUDA_RECONSTRUCT_NO>::type F;
       F outA;   // primary output (force / newOprod accumulator)
       F outB;   // secondary output (shortP accumulator in the side/all link kernels)
       F pMu;    // written by middleLinkKernel when its pMu flag is set
       F p3;     // written by middleLinkKernel, read by the side link kernels
       F qMu;    // written by middleLinkKernel when its qMu flag is set

       const F oProd;
       const F qProd;
       const F qPrev;
       const real coeff;         // coefficient applied to the outA update
       const real accumu_coeff;  // coefficient applied to the outB update

       // which optional fields are real; FatLinkForce::apply uses these to pick
       // the middleLinkKernel instantiation
       const bool p_mu;
       const bool q_mu;
       const bool q_prev;

       // FORCE_ONE_LINK: force += coeff * oProd. Uses overlap 0; all other
       // accessors are placeholders bound to oProd.
       FatLinkArg(GaugeField &force, const GaugeField &oProd, const GaugeField &link, real coeff, HisqForceType type)
         : BaseForceArg<real,reconstruct>(link, 0), outA(force), outB(force), pMu(oProd), p3(oProd), qMu(oProd),
         oProd(oProd), qProd(oProd), qPrev(oProd), coeff(coeff), accumu_coeff(0),
         p_mu(false), q_mu(false), q_prev(false)
       { if (type != FORCE_ONE_LINK) errorQuda("This constructor is for FORCE_ONE_LINK"); }

       // FORCE_MIDDLE_LINK with a previous Q field (p_mu, q_mu and q_prev all true).
       FatLinkArg(GaugeField &newOprod, GaugeField &pMu, GaugeField &P3, GaugeField &qMu,
                  const GaugeField &oProd, const GaugeField &qPrev, const GaugeField &link,
                  real coeff, int overlap, HisqForceType type)
         : BaseForceArg<real,reconstruct>(link, overlap), outA(newOprod), outB(newOprod), pMu(pMu), p3(P3), qMu(qMu),
         oProd(oProd), qProd(oProd), qPrev(qPrev), coeff(coeff), accumu_coeff(0), p_mu(true), q_mu(true), q_prev(true)
       { if (type != FORCE_MIDDLE_LINK) errorQuda("This constructor is for FORCE_MIDDLE_LINK"); }

       // FORCE_MIDDLE_LINK without a previous Q field (q_prev false); the qPrev
       // accessor is a placeholder bound to qMu.
       FatLinkArg(GaugeField &newOprod, GaugeField &pMu, GaugeField &P3, GaugeField &qMu,
                  const GaugeField &oProd, const GaugeField &link,
                  real coeff, int overlap, HisqForceType type)
         : BaseForceArg<real,reconstruct>(link, overlap), outA(newOprod), outB(newOprod), pMu(pMu), p3(P3), qMu(qMu),
         oProd(oProd), qProd(oProd), qPrev(qMu), coeff(coeff), accumu_coeff(0), p_mu(true), q_mu(true), q_prev(false)
       { if (type != FORCE_MIDDLE_LINK) errorQuda("This constructor is for FORCE_MIDDLE_LINK"); }

       // FORCE_LEPAGE_MIDDLE_LINK: no pMu/qMu output (p_mu and q_mu false), previous
       // Q field present (q_prev true). pMu and qMu are placeholders.
       FatLinkArg(GaugeField &newOprod, GaugeField &P3, const GaugeField &oProd,
                  const GaugeField &qPrev, const GaugeField &link,
                  real coeff, int overlap, HisqForceType type)
         : BaseForceArg<real,reconstruct>(link, overlap), outA(newOprod), outB(newOprod), pMu(P3), p3(P3), qMu(qPrev),
         oProd(oProd), qProd(oProd), qPrev(qPrev), coeff(coeff), accumu_coeff(0), p_mu(false), q_mu(false), q_prev(true)
       { if (type != FORCE_LEPAGE_MIDDLE_LINK) errorQuda("This constructor is for FORCE_LEPAGE_MIDDLE_LINK"); }

       // FORCE_SIDE_LINK: outA = newOprod, outB = shortP, inputs P3 and qProd.
       // (FORCE_ALL_LINK has its own constructor, taking the extra `dummy` flag.)
       FatLinkArg(GaugeField &newOprod, GaugeField &shortP, const GaugeField &P3,
                  const GaugeField &qProd, const GaugeField &link, real coeff, real accumu_coeff, int overlap, HisqForceType type)
         : BaseForceArg<real,reconstruct>(link, overlap), outA(newOprod), outB(shortP), pMu(P3), p3(P3), qMu(qProd), oProd(qProd), qProd(qProd),
         qPrev(qProd), coeff(coeff), accumu_coeff(accumu_coeff),
         p_mu(false), q_mu(false), q_prev(false)
       { if (type != FORCE_SIDE_LINK) errorQuda("This constructor is for FORCE_SIDE_LINK"); }

       // FORCE_SIDE_LINK_SHORT: outA += coeff * P3 (or its conjugate); everything
       // other than outA and p3 is a placeholder bound to P3.
       FatLinkArg(GaugeField &newOprod, GaugeField &P3, const GaugeField &link,
                  real coeff, int overlap, HisqForceType type)
         : BaseForceArg<real,reconstruct>(link, overlap), outA(newOprod), outB(newOprod),
         pMu(P3), p3(P3), qMu(P3), oProd(P3), qProd(P3), qPrev(P3), coeff(coeff), accumu_coeff(0.0),
         p_mu(false), q_mu(false), q_prev(false)
       { if (type != FORCE_SIDE_LINK_SHORT) errorQuda("This constructor is for FORCE_SIDE_LINK_SHORT"); }

       // FORCE_ALL_LINK: outA = newOprod, outB = shortP, inputs oProd and qPrev.
       // `dummy` is unused; it only distinguishes this overload from the
       // FORCE_SIDE_LINK constructor. pMu, p3, qMu and qProd are placeholders.
       FatLinkArg(GaugeField &newOprod, GaugeField &shortP, const GaugeField &oProd, const GaugeField &qPrev,
                  const GaugeField &link, real coeff, real accumu_coeff, int overlap, HisqForceType type, bool dummy)
         : BaseForceArg<real,reconstruct>(link, overlap), outA(newOprod), outB(shortP),
         pMu(shortP), p3(shortP), qMu(qPrev), // dummy
         oProd(oProd), qProd(qPrev), qPrev(qPrev),
         coeff(coeff), accumu_coeff(accumu_coeff), p_mu(false), q_mu(false), q_prev(false)
       { if (type != FORCE_ALL_LINK) errorQuda("This constructor is for FORCE_ALL_LINK"); }

     };

     // One-link term: for each site, parity and direction sig in 0..3,
     //   outA(sig) += coeff * oProd(sig).
     // Thread layout: x indexes the checkerboard site, y the parity, z the direction.
     template <typename real, typename Arg>
     __global__ void oneLinkTermKernel(Arg arg)
     {
       typedef Matrix<complex<real>,3> Link;

       const int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
       const int parity = blockIdx.y * blockDim.y + threadIdx.y;
       const int sig = blockIdx.z * blockDim.z + threadIdx.z;
       if (x_cb >= arg.threads || sig >= 4) return;

       // coordinates on the regular grid, then displaced by the border to
       // address the extended field
       int coords[4];
       getCoords(coords, x_cb, arg.X, parity);
 #pragma unroll
       for (int dim=0; dim<4; dim++) coords[dim] += arg.border[dim];
       const int e_cb = linkIndex(coords,arg.E);

       Link force = arg.outA(sig, e_cb, parity);
       Link w = arg.oProd(sig, e_cb, parity);
       force += arg.coeff * w;
       arg.outA(sig, e_cb, parity) = force;
     }


     /********************************allLinkKernel*********************************************
      *
      * In this function we need
      *   READ
      *     3 LINKS:         ad_link, ab_link, bc_link
      *     5 COLOR MATRIX:  Qprev_at_D, oprod_at_C, newOprod_at_A(sig), newOprod_at_D/newOprod_at_A(mu), shortP_at_D
      *   WRITE:
      *     3 COLOR MATRIX:  newOprod_at_A(sig), newOprod_at_D/newOprod_at_A(mu), shortP_at_D,
      *
      * If sig is negative, the color matrix newOprod_at_A(sig) is neither read nor written.
      *
      * Therefore the data traffic, in two-number pair (num_of_link, num_of_color_matrix)
      *
      *             if (sig is positive):    (3, 8)
      *             else               :     (3, 6)
      *
      * This function is called 384 times, half positive sig, half negative sig
      *
      * Flop count, in two-number pair (matrix_multi, matrix_add)
      *             if(sig is positive)      (6,3)
      *             else                     (4,2)
      *
      * Template flags: sig_positive / mu_positive are 1 when arg.sig / arg.mu is a
      * forward direction (this is how FatLinkForce::apply instantiates the kernel).
      *
      ************************************************************************************************/
     template<typename real, int sig_positive, int mu_positive, typename Arg>
     __global__ void allLinkKernel(Arg arg)
     {
       typedef Matrix<complex<real>,3> Link;

       const int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
       if (x_cb >= arg.threads) return;
       int parity = blockIdx.y * blockDim.y + threadIdx.y;

       // coordinates of the current point A inside the extended field
       int x[4];
       getCoords(x, x_cb, arg.D, parity);
       for (int d=0; d<4; d++) x[d] += arg.base_idx[d];
       const int e_cb = linkIndex(x,arg.E);
       parity = parity^arg.oddness_change;

       real mycoeff = CoeffSign(sig_positive,parity)*arg.coeff;

       // forward axis and hop length for each of the two directions
       const int sig_axis = posDir(arg.sig);
       const int sig_hop = sig_positive ? 1 : -1;
       const int mu = mu_positive ? arg.mu : opp_dir(arg.mu);
       const int mu_hop = mu_positive ? -1 : 1;

       /*            sig
        *         A________B
        *      mu  |      |
        *        D |      |C
        *
        *   A is the current point (sid)
        *
        */

       // B: one hop from A along sig
       int y[4] = {x[0], x[1], x[2], x[3]};
       updateCoords(y, sig_axis, sig_hop, arg);
       const int point_b = linkIndex(y,arg.E);

       // D: one hop from A along mu; C: one further hop from D along sig
       for (int d=0; d<4; d++) y[d] = x[d];
       updateCoords(y, mu, mu_hop, arg);
       const int point_d = linkIndex(y,arg.E);
       updateCoords(y, sig_axis, sig_hop, arg);
       const int point_c = linkIndex(y,arg.E);

       // a link is stored at whichever of its two end points it leaves in the
       // forward direction
       const int ab_site = sig_positive ? e_cb : point_b;
       const int ad_site = mu_positive ? point_d : e_cb;
       const int ad_parity = mu_positive ? 1-parity : parity;

       Link Uab = arg.link(sig_axis, ab_site, sig_positive^(1-parity));
       Link Uad = arg.link(mu, ad_site, ad_parity);
       Link Ubc = arg.link(mu, mu_positive ? point_c : point_b, mu_positive ? parity : 1-parity);
       Link Ox = arg.qPrev(0, point_d, 1-parity);
       Link Oy = arg.oProd(0, point_c, parity);
       Link Oz = mu_positive ? conj(Ubc)*Oy : Ubc*Oy;

       // the ad link in the orientation used by both products below
       Link Uad_path = mu_positive ? Uad : conj(Uad);

       if (sig_positive) {
         // contribution to the sig link at A (forward sig only)
         Link force = arg.outA(arg.sig, e_cb, parity);
         force += Sign(parity)*mycoeff*Oz*Ox* Uad_path;
         arg.outA(arg.sig, e_cb, parity) = force;
         Oy = Uab*Oz;
       } else {
         Oy = conj(Uab)*Oz;
       }

       // contribution to the mu link joining A and D
       Link force = arg.outA(mu, ad_site, ad_parity);
       force += Sign(ad_parity)*mycoeff* (mu_positive ? Oy*Ox : conj(Ox)*conj(Oy));
       arg.outA(mu, ad_site, ad_parity) = force;

       // accumulate into shortP at D
       Link shortP = arg.outB(0, point_d, 1-parity);
       shortP += arg.accumu_coeff* Uad_path *Oy;
       arg.outB(0, point_d, 1-parity) = shortP;
     }


     /**************************middleLinkKernel*****************************
      *
      *
      * Generally we need
      * READ
      *    3 LINKS:         ab_link,     bc_link,    ad_link
      *    3 COLOR MATRIX:  newOprod_at_A, oprod_at_C,  Qprod_at_D
      * WRITE
      *    4 COLOR MATRIX:  newOprod_at_A, P3_at_A, Pmu_at_B, Qmu_at_A
      *
      * Three call variations (selected by the pMu / qMu / qPrev template flags):
      *   1. qPrev == false:          Qprod_at_D does not exist and is not read in
      *   2. all flags true:          full read/write
      *   3. pMu == qMu == false:     Pmu_at_B and Qmu_at_A are not written out
      *
      *   In all three cases above, if the direction sig is negative, newOprod_at_A is
      *   not read in or written out.
      *
      * Therefore the data traffic, in two-number pair (num_of_link, num_of_color_matrix)
      *   Call 1:  (called 48 times, half positive sig, half negative sig)
      *             if (sig is positive):    (3, 6)
      *             else               :     (3, 4)
      *   Call 2:  (called 192 times, half positive sig, half negative sig)
      *             if (sig is positive):    (3, 7)
      *             else               :     (3, 5)
      *   Call 3:  (called 48 times, half positive sig, half negative sig)
      *             if (sig is positive):    (3, 5)
      *             else               :     (3, 2) no need to load Qprod_at_D in this case
      *
      * note: oprod_at_C could actually be read in from D when it is the fresh outer product
      *       and we call it oprod_at_C to simplify naming. This does not affect our data traffic analysis
      *
      * Flop count, in two-number pair (matrix_multi, matrix_add)
      *   call 1:     if (sig is positive)  (3, 1)
      *               else                  (2, 0)
      *   call 2:     if (sig is positive)  (4, 1)
      *               else                  (3, 0)
      *   call 3:     if (sig is positive)  (4, 1)
      *   (Lepage)    else                  (2, 0)
      *
      * Template flags: sig_positive / mu_positive are 1 when arg.sig / arg.mu is a
      * forward direction (this is how FatLinkForce::apply instantiates the kernel).
      *
      ****************************************************************************/
     template <typename real, int sig_positive, int mu_positive, bool pMu, bool qMu, bool qPrev, typename Arg>
     __global__ void middleLinkKernel(Arg arg)
     {
       typedef Matrix<complex<real>,3> Link;

       int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
       if (x_cb >= arg.threads) return;
       int parity = blockIdx.y * blockDim.y + threadIdx.y;

       // coordinates within the working set; shifted into the extended field below
       int x[4];
       getCoords(x, x_cb, arg.D, parity);

       /*        A________B
        *   mu   |        |
        *       D|        |C
        *
        *    A is the current point (sid)
        *
        */

       for (int d=0; d<4; d++) x[d] += arg.base_idx[d];
       int e_cb = linkIndex(x,arg.E);
       // the base_idx shift may flip the site parity
       parity = parity ^ arg.oddness_change;
       int y[4] = {x[0], x[1], x[2], x[3]};

       // D: one hop from A along mu (backwards when mu is a forward direction)
       int mymu = posDir(arg.mu);
       updateCoords(y, mymu, (mu_positive ? -1 : 1), arg);

       int point_d = linkIndex(y, arg.E);
       int ad_link_nbr_idx = mu_positive ? point_d : e_cb;

       // C: one further hop from D along sig
       int mysig = posDir(arg.sig);
       updateCoords(y, mysig, (sig_positive ? 1 : -1), arg);
       int point_c = linkIndex(y, arg.E);

       // B: one hop from A along sig
       for (int d=0; d<4; d++) y[d] = x[d];
       updateCoords(y, mysig, (sig_positive ? 1 : -1), arg);
       int point_b = linkIndex(y, arg.E);

       int bc_link_nbr_idx = mu_positive ? point_c : point_b;
       int ab_link_nbr_idx = sig_positive ? e_cb : point_b;

       // load the link variable connecting a and b
       Link Uab = arg.link(mysig, ab_link_nbr_idx, sig_positive^(1-parity));

       // load the link variable connecting b and c
       Link Ubc = arg.link(mymu, bc_link_nbr_idx, mu_positive^(1-parity));

       // Oy is reused several times below; at this point it holds the input outer product
       Link Oy;
       if (!qPrev) {
         // no previous Q field: read the sig component of oProd, at D for forward
         // sig and at C (then conjugated) for backward sig
         Oy = arg.oProd(posDir(arg.sig), sig_positive ? point_d : point_c, sig_positive^parity);
         if (!sig_positive) Oy = conj(Oy);
       } else { // a previous Q field is supplied: read component 0 of oProd at C
         Oy = arg.oProd(0, point_c, parity);
       }

       // transport the outer product along the bc link
       Link Ow = !mu_positive ? Ubc*Oy : conj(Ubc)*Oy;

       if (pMu) arg.pMu(0, point_b, 1-parity) = Ow;

       // P3 at A: Ow transported along the ab link
       arg.p3(0, e_cb, parity) = sig_positive ? Uab*Ow : conj(Uab)*Ow;

       // load the link variable connecting a and d, conjugated for backward mu
       Link Uad = arg.link(mymu, ad_link_nbr_idx, mu_positive^parity);
       if (!mu_positive)  Uad = conj(Uad);

       if (!qPrev) {
         // Oy now becomes the contribution to newOprod (only needed for forward sig)
         if (sig_positive) Oy = Ow*Uad;
         if ( qMu ) arg.qMu(0, e_cb, parity) = Uad;
       } else {
         // Ox is only assigned (and only used) when qMu or sig_positive holds
         Link Ox;
         if ( qMu || sig_positive ) {
           // Oy temporarily holds the previous Q field at D
           Oy = arg.qPrev(0, point_d, 1-parity);
           Ox = Oy*Uad;
         }
         if ( qMu ) arg.qMu(0, e_cb, parity) = Ox;
         if (sig_positive) Oy = Ow*Ox;
       }

       // accumulate the contribution into newOprod at A (forward sig only)
       if (sig_positive) {
         Link oprod = arg.outA(arg.sig, e_cb, parity);
         oprod += arg.coeff*Oy;
         arg.outA(arg.sig, e_cb, parity) = oprod;
       }

     }

     /***********************************sideLinkKernel***************************
      *
      * In general we need
      * READ
      *    1  LINK:          ad_link
      *    4  COLOR MATRIX:  shortP_at_D, newOprod, P3_at_A, Qprod_at_D,
      * WRITE
      *    2  COLOR MATRIX:  shortP_at_D, newOprod,
      *
      * Two call variations:
      *   1. full read/write
      *   2. when shortP == NULL && Qprod == NULL:
      *          no need to read ad_link/shortP_at_D or write shortP_at_D
      *          Qprod_at_D does not exist and is not read in
      *
      *
      * Therefore the data traffic, in two-number pair (num_of_links, num_of_color_matrix)
      *   Call 1:   (called 192 times)
      *                           (1, 6)
      *
      *   Call 2:   (called 48 times)
      *                           (0, 3)
      *
      * note: newOprod can be at point D or A, depending on whether mu is positive or negative
      *
      * Flop count, in two-number pair (matrix_multi, matrix_add)
      *   call 1:       (2, 2)
      *   call 2:       (0, 1)
      *
      * This kernel implements call 1; call 2 is handled by sideLinkShortKernel.
      *
      *********************************************************************************/
     template <typename real, int mu_positive, typename Arg>
     __global__ void sideLinkKernel(Arg arg)
     {
       typedef Matrix<complex<real>, 3> Link;

       const int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
       if (x_cb >= arg.threads) return;
       int parity = blockIdx.y * blockDim.y + threadIdx.y;

       // coordinates of the current point A inside the extended field
       int x[4];
       getCoords(x, x_cb ,arg.D, parity);
       for (int d=0; d<4; d++) x[d] += arg.base_idx[d];
       const int e_cb = linkIndex(x,arg.E);
       parity = parity ^ arg.oddness_change;

       /*      compute the side link contribution to the momentum
        *
        *             sig
        *          A________B
        *           |       |   mu
        *         D |       |C
        *
        *      A is the current point (x_cb)
        *
        */

       // D: one hop from A along mu (backwards when mu is a forward direction)
       const int mymu = posDir(arg.mu);
       int y[4] = {x[0], x[1], x[2], x[3]};
       updateCoords(y, mymu, (mu_positive ? -1 : 1), arg);
       const int point_d = linkIndex(y,arg.E);

       Link Oy = arg.p3(0, e_cb, parity);

       // shortP at D accumulates P3 transported along the ad link
       {
         Link Uad = arg.link(mymu, mu_positive ? point_d : e_cb, mu_positive^parity);
         Link Ow = mu_positive ? Uad*Oy : conj(Uad)*Oy;

         Link shortP = arg.outB(0, point_d, 1-parity);
         shortP += arg.accumu_coeff * Ow;
         arg.outB(0, point_d, 1-parity) = shortP;
       }

       // newOprod on the mu link (stored at D for forward mu, at A otherwise)
       {
         const int out_dir = mu_positive ? arg.mu : opp_dir(arg.mu);
         const int out_site = mu_positive ? point_d : e_cb;
         const int out_parity = mu_positive ? 1-parity : parity;

         Link Ox = arg.qProd(0, point_d, 1-parity);
         Link Ow = mu_positive ? Oy*Ox : conj(Ox)*conj(Oy);

         real mycoeff = CoeffSign(goes_forward(arg.sig), parity)*CoeffSign(goes_forward(arg.mu),parity)*arg.coeff;

         Link oprod = arg.outA(out_dir, out_site, out_parity);
         oprod += mycoeff * Ow;
         arg.outA(out_dir, out_site, out_parity) = oprod;
       }
     }

     // Short side-link update: adds mycoeff * P3_at_A (conjugated for backward mu)
     // to newOprod on the mu link. No link and no shortP field is touched.
     //
     // Flop count, in two-number pair (matrix_mult, matrix_add)
     //    (0,1)
     template<typename real, int mu_positive, typename Arg>
     __global__ void sideLinkShortKernel(Arg arg)
     {
       typedef Matrix<complex<real>,3> Link;

       const int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
       if (x_cb >= arg.threads) return;
       int parity = blockIdx.y * blockDim.y + threadIdx.y;

       // coordinates of the current point A inside the extended field
       int x[4];
       getCoords(x, x_cb, arg.D, parity);
       for (int d=0; d<4; d++) x[d] += arg.base_idx[d];
       const int e_cb = linkIndex(x,arg.E);
       parity = parity ^ arg.oddness_change;

       /*      compute the side link contribution to the momentum
        *
        *             sig
        *          A________B
        *           |       |   mu
        *         D |       |C
        *
        *      A is the current point (x_cb)
        *
        */

       // The mu link is stored at D (opposite parity) for forward mu and at A
       // itself otherwise, so the neighbour is only needed in the forward case.
       const int out_dir = posDir(arg.mu);
       int out_site = e_cb;
       int out_parity = parity;
       if (mu_positive) {
         int y[4] = {x[0], x[1], x[2], x[3]};
         updateCoords(y, out_dir, -1, arg);
         out_site = linkIndex(y,arg.E);
         out_parity = 1-parity;
       }

       real mycoeff = CoeffSign(goes_forward(arg.sig),parity)*CoeffSign(goes_forward(arg.mu),parity)*arg.coeff;

       Link Oy = arg.p3(0, e_cb, parity);
       Link oprod = arg.outA(out_dir, out_site, out_parity);
       if (mu_positive) oprod += mycoeff * Oy;
       else             oprod += mycoeff * conj(Oy);
       arg.outA(out_dir, out_site, out_parity) = oprod;
     }

     // Autotuned launcher for the fat-link force kernels.
     //
     // `type` selects the kernel; the signs of arg.sig and arg.mu (and, for the
     // middle link, the p_mu / q_mu / q_prev flags of arg) select its template
     // instantiation. The constructor stores sig and mu into the referenced
     // argument struct, so `arg` must outlive this object.
     //
     // The launch shape passed to TunableVectorYZ is (2, 4) for FORCE_ONE_LINK and
     // (2, 1) otherwise: the kernels take the parity from the y index and
     // oneLinkTermKernel additionally takes the direction from the z index.
     template <typename real, typename Arg>
     class FatLinkForce : public TunableVectorYZ {

     private:
       Arg &arg;
       const GaugeField &meta;
       const HisqForceType type;

       unsigned int minThreads() const { return arg.threads; }
       bool tuneGridDim() const { return false; }

     public:
       // arg:  kernel argument; its sig and mu members are overwritten here
       // meta: field whose VolString() labels the tune key
       // sig, mu: directions in the 0..7 numbering
       // type: which force contribution to launch
       FatLinkForce(Arg &arg, const GaugeField &meta, int sig, int mu, HisqForceType type)
         : TunableVectorYZ(2,type == FORCE_ONE_LINK ? 4 : 1), arg(arg), meta(meta), type(type) {
         arg.sig = sig;
         arg.mu = mu;
       }
       virtual ~FatLinkForce() { }

       // Builds the autotuning key. The key text is reproduced exactly as before
       // (including the "q_muu" spelling) so the generated keys do not change.
       // NOTE(review): FORCE_ALL_LINK also takes the final branch, which omits
       // sig, although apply() picks its kernel from the sign of sig -- confirm
       // that sharing one tuning entry across sig is intended.
       TuneKey tuneKey() const {
         std::stringstream aux;
         if (type == FORCE_ONE_LINK) aux << "threads=" << arg.threads;
         else if (type == FORCE_MIDDLE_LINK || type == FORCE_LEPAGE_MIDDLE_LINK)
           aux << "threads=" << arg.threads << ",sig=" << arg.sig << ",mu=" << arg.mu <<
             ",pMu=" << arg.p_mu << ",q_muu=" << arg.q_mu << ",q_prev=" << arg.q_prev;
         else
           aux << "threads=" << arg.threads << ",mu=" << arg.mu; // no sig dependence needed for side link

         switch (type) {
         case FORCE_ONE_LINK:           aux << ",ONE_LINK";           break;
         case FORCE_ALL_LINK:           aux << ",ALL_LINK";           break;
         case FORCE_MIDDLE_LINK:        aux << ",MIDDLE_LINK";        break;
         case FORCE_LEPAGE_MIDDLE_LINK: aux << ",LEPAGE_MIDDLE_LINK"; break;
         case FORCE_SIDE_LINK:          aux << ",SIDE_LINK";          break;
         case FORCE_SIDE_LINK_SHORT:    aux << ",SIDE_LINK_SHORT";    break;
         default: errorQuda("Undefined force type %d", type);
         }
         return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
       }

       // Launches the kernel selected by `type` on `stream` with the tuned
       // launch parameters. Template flags are 1/true for forward directions.
       void apply(const cudaStream_t &stream) {
         TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
         switch (type) {
         case FORCE_ONE_LINK:
           oneLinkTermKernel<real,Arg> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           break;
         case FORCE_ALL_LINK:
           if (goes_forward(arg.sig) && goes_forward(arg.mu))
             allLinkKernel<real,1,1,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           else if (goes_forward(arg.sig) && goes_backward(arg.mu))
             allLinkKernel<real,1,0,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           else if (goes_backward(arg.sig) && goes_forward(arg.mu))
             allLinkKernel<real,0,1,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           else
             allLinkKernel<real,0,0,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           break;
         case FORCE_MIDDLE_LINK:
           if (!arg.p_mu || !arg.q_mu) errorQuda("Expect p_mu=%d and q_mu=%d to both be true", arg.p_mu, arg.q_mu);
           if (arg.q_prev) {
             if (goes_forward(arg.sig) && goes_forward(arg.mu))
               middleLinkKernel<real,1,1,true,true,true,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
             else if (goes_forward(arg.sig) && goes_backward(arg.mu))
               middleLinkKernel<real,1,0,true,true,true,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
             else if (goes_backward(arg.sig) && goes_forward(arg.mu))
               middleLinkKernel<real,0,1,true,true,true,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
             else
               middleLinkKernel<real,0,0,true,true,true,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           } else {
             if (goes_forward(arg.sig) && goes_forward(arg.mu))
               middleLinkKernel<real,1,1,true,true,false,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
             else if (goes_forward(arg.sig) && goes_backward(arg.mu))
               middleLinkKernel<real,1,0,true,true,false,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
             else if (goes_backward(arg.sig) && goes_forward(arg.mu))
               middleLinkKernel<real,0,1,true,true,false,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
             else
               middleLinkKernel<real,0,0,true,true,false,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           }
           break;
         case FORCE_LEPAGE_MIDDLE_LINK:
           if (arg.p_mu || arg.q_mu || !arg.q_prev)
             errorQuda("Expect p_mu=%d and q_mu=%d to both be false and q_prev=%d true", arg.p_mu, arg.q_mu, arg.q_prev);
           if (goes_forward(arg.sig) && goes_forward(arg.mu))
             middleLinkKernel<real,1,1,false,false,true,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           else if (goes_forward(arg.sig) && goes_backward(arg.mu))
             middleLinkKernel<real,1,0,false,false,true,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           else if (goes_backward(arg.sig) && goes_forward(arg.mu))
             middleLinkKernel<real,0,1,false,false,true,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           else
             middleLinkKernel<real,0,0,false,false,true,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           break;
         case FORCE_SIDE_LINK:
           if (goes_forward(arg.mu)) sideLinkKernel<real,1,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           else                      sideLinkKernel<real,0,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           break;
         case FORCE_SIDE_LINK_SHORT:
           if (goes_forward(arg.mu)) sideLinkShortKernel<real,1,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           else                      sideLinkShortKernel<real,0,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
           break;
         default:
             errorQuda("Undefined force type %d", type);
         }
       }

       // Saves every field the selected kernel writes, so that the repeated
       // launches made while tuning can be undone by postTune().
       void preTune() {
         switch (type) {
         case FORCE_ONE_LINK:
           arg.outA.save();
           break;
         case FORCE_ALL_LINK:
           arg.outA.save();
           arg.outB.save();
           break;
         case FORCE_MIDDLE_LINK:
           arg.pMu.save();
           arg.qMu.save();
           // intentional fall-through: the middle link also writes outA and p3
         case FORCE_LEPAGE_MIDDLE_LINK:
           arg.outA.save();
           arg.p3.save();
           break;
         case FORCE_SIDE_LINK:
           arg.outB.save();
           // intentional fall-through: the side link also writes outA
         case FORCE_SIDE_LINK_SHORT:
           arg.outA.save();
           break;
         default: errorQuda("Undefined force type %d", type);
         }
       }

       // Restores the fields saved by preTune(); mirrors its case structure.
       void postTune() {
         switch (type) {
         case FORCE_ONE_LINK:
           arg.outA.load();
           break;
         case FORCE_ALL_LINK:
           arg.outA.load();
           arg.outB.load();
           break;
         case FORCE_MIDDLE_LINK:
           arg.pMu.load();
           arg.qMu.load();
           // intentional fall-through: the middle link also writes outA and p3
         case FORCE_LEPAGE_MIDDLE_LINK:
           arg.outA.load();
           arg.p3.load();
           break;
         case FORCE_SIDE_LINK:
           arg.outB.load();
           // intentional fall-through: the side link also writes outA
         case FORCE_SIDE_LINK_SHORT:
           arg.outA.load();
           break;
         default: errorQuda("Undefined force type %d", type);
         }
       }

       // Floating-point operation count of one launch.
       long long flops() const {
         // Number of sites over both parities, widened to 64 bits *before* it is
         // multiplied by the per-site cost: several per-site costs below are
         // plain ints, and an int product overflows once arg.threads is of the
         // order of a million or more.
         const long long sites = 2ll*arg.threads;
         switch (type) {
         case FORCE_ONE_LINK:
           return 4*sites*36;
         case FORCE_ALL_LINK:
           return sites*(goes_forward(arg.sig) ? 1242ll : 828ll);
         case FORCE_MIDDLE_LINK:
         case FORCE_LEPAGE_MIDDLE_LINK:
           return sites*(2 * 198 +
                         (!arg.q_prev && goes_forward(arg.sig) ? 198 : 0) +
                         (arg.q_prev && (arg.q_mu || goes_forward(arg.sig) ) ? 198 : 0) +
                         ((arg.q_prev && goes_forward(arg.sig) ) ?  198 : 0) +
                         ( goes_forward(arg.sig) ? 216 : 0) );
         case FORCE_SIDE_LINK:       return sites*2*234;
         case FORCE_SIDE_LINK_SHORT: return sites*36;
         default: errorQuda("Undefined force type %d", type);
         }
         return 0;
       }

       // Memory traffic of one launch, in bytes.
       long long bytes() const {
         // widened for the same reason as in flops()
         const long long sites = 2ll*arg.threads;
         switch (type) {
         case FORCE_ONE_LINK:
           return 4*sites*( arg.oProd.Bytes() + 2*arg.outA.Bytes() );
         case FORCE_ALL_LINK:
           return sites*( (goes_forward(arg.sig) ? 4 : 2)*arg.outA.Bytes() + 3*arg.link.Bytes()
                          + arg.oProd.Bytes() + arg.qPrev.Bytes() + 2*arg.outB.Bytes());
         case FORCE_MIDDLE_LINK:
         case FORCE_LEPAGE_MIDDLE_LINK:
           return sites*( ( goes_forward(arg.sig) ? 2*arg.outA.Bytes() : 0 ) +
                          (arg.p_mu ? arg.pMu.Bytes() : 0) +
                          (arg.q_mu ? arg.qMu.Bytes() : 0) +
                          ( ( goes_forward(arg.sig) || arg.q_mu ) ? arg.qPrev.Bytes() : 0) +
                          arg.p3.Bytes() + 3*arg.link.Bytes() + arg.oProd.Bytes() );
         case FORCE_SIDE_LINK:
           return sites*( 2*arg.outA.Bytes() + 2*arg.outB.Bytes() +
                          arg.p3.Bytes() + arg.link.Bytes() + arg.qProd.Bytes() );
         case FORCE_SIDE_LINK_SHORT:
           return sites*( 2*arg.outA.Bytes() + arg.p3.Bytes() );
         default: errorQuda("Undefined force type %d", type);
         }
         return 0;
       }
     };

     /**
        @brief Host-side driver that issues the sequence of fat-link
        force kernel launches (one-link, 3-, 5-, 7-link and Lepage terms).

        Directions sig, mu, nu and rho each run over 0..7; opp_dir(d)
        is 7-d, so the continue-tests below skip any direction that is
        parallel or anti-parallel to one already chosen in an
        enclosing loop.  Every launch is made through apply(0).

        The meaning of the individual FatLinkArg constructor arguments
        is defined outside this function; the roles given to the
        scratch fields below are inferred from their names and from
        the existing comments -- NOTE(review): confirm against
        FatLinkArg.

        @param Pmu    Scratch field passed to the 3-link, 5-link and Lepage middle-link launches
        @param P3     Scratch field passed to the 3-link middle-link and the side-link launches
        @param P5     Scratch field passed to the 5-link, 7-link, Lepage and side-link launches
        @param Pnumu  Scratch field passed to the 5-link middle-link and 7-link launches
        @param Qmu    Scratch field passed to the middle-link and side-link launches
        @param Qnumu  Scratch field passed to the 5-link middle-link and 7-link launches
        @param newOprod  Field passed as first argument to every launch (presumably the accumulated output)
        @param oprod  Input field passed to the one-link and 3-link middle-link launches
        @param link   Gauge field passed to every launch
        @param act_path_coeff  Path coefficients (one, three, five, seven, lepage are read here)
     */
     template<typename real>
     static void hisqStaplesForce(GaugeField &Pmu, GaugeField &P3, GaugeField &P5, GaugeField &Pnumu,
                                  GaugeField &Qmu, GaugeField &Qnumu, GaugeField &newOprod,
                                  const GaugeField &oprod, const GaugeField &link,
                                  const PathCoefficients<real> &act_path_coeff)
     {
       // local copies of the path coefficients together with the negated
       // values that are handed to some of the launches below
       real OneLink = act_path_coeff.one;
       real ThreeSt = act_path_coeff.three;
       real mThreeSt = -ThreeSt;
       real FiveSt  = act_path_coeff.five;
       real mFiveSt  = -FiveSt;
       real SevenSt = act_path_coeff.seven;
       real Lepage  = act_path_coeff.lepage;
       real mLepage  = -Lepage;

       // one-link term: a single launch, issued before the direction loops
       FatLinkArg<real> arg(newOprod, oprod, link, OneLink, FORCE_ONE_LINK);
       FatLinkForce<real, FatLinkArg<real> > oneLink(arg, link, 0, 0, FORCE_ONE_LINK);
       oneLink.apply(0);

       for (int sig=0; sig<8; sig++) {
         for (int mu=0; mu<8; mu++) {
           // mu must not lie along the sig axis
           if ( (mu == sig) || (mu == opp_dir(sig))) continue;

           //3-link
           //Kernel A: middle link
           FatLinkArg<real> middleLinkArg( newOprod, Pmu, P3, Qmu, oprod, link, mThreeSt, 2, FORCE_MIDDLE_LINK);
           FatLinkForce<real, FatLinkArg<real> > middleLink(middleLinkArg, link, sig, mu, FORCE_MIDDLE_LINK);
           middleLink.apply(0);

           for (int nu=0; nu < 8; nu++) {
             // nu must not lie along the sig or mu axes
             if (nu == sig || nu == opp_dir(sig) || nu == mu || nu == opp_dir(mu)) continue;

             //5-link: middle link
             //Kernel B
             // (these declarations shadow the 3-link middleLinkArg / middleLink of the mu scope)
             FatLinkArg<real> middleLinkArg( newOprod, Pnumu, P5, Qnumu, Pmu, Qmu, link, FiveSt, 1, FORCE_MIDDLE_LINK);
             FatLinkForce<real, FatLinkArg<real> > middleLink(middleLinkArg, link, sig, nu, FORCE_MIDDLE_LINK);
             middleLink.apply(0);

             for (int rho = 0; rho < 8; rho++) {
               // rho must not lie along the sig, mu or nu axes
               if (rho == sig || rho == opp_dir(sig) || rho == mu || rho == opp_dir(mu) || rho == nu || rho == opp_dir(nu)) continue;

               //7-link: middle link and side link
               // the ratio SevenSt/FiveSt is replaced by 0 when FiveSt is zero to avoid dividing by zero
               FatLinkArg<real> arg(newOprod, P5, Pnumu, Qnumu, link, SevenSt, FiveSt != 0 ? SevenSt/FiveSt : 0, 1, FORCE_ALL_LINK, true);
               FatLinkForce<real, FatLinkArg<real> > all(arg, link, sig, rho, FORCE_ALL_LINK);
               all.apply(0);

             }//rho

             //5-link: side link
             // the ratio FiveSt/ThreeSt is replaced by 0 when ThreeSt is zero to avoid dividing by zero
             FatLinkArg<real> arg(newOprod, P3, P5, Qmu, link, mFiveSt, (ThreeSt != 0 ? FiveSt/ThreeSt : 0), 1, FORCE_SIDE_LINK);
             FatLinkForce<real, FatLinkArg<real> > side(arg, link, sig, nu, FORCE_SIDE_LINK);
             side.apply(0);

           } //nu

           //lepage
           // both Lepage launches are skipped entirely when the coefficient is zero
           if (Lepage != 0.) {
             FatLinkArg<real> middleLinkArg( newOprod, P5, Pmu, Qmu, link, Lepage, 2, FORCE_LEPAGE_MIDDLE_LINK);
             FatLinkForce<real, FatLinkArg<real> > middleLink(middleLinkArg, link, sig, mu, FORCE_LEPAGE_MIDDLE_LINK);
             middleLink.apply(0);

             // the ratio Lepage/ThreeSt is replaced by 0 when ThreeSt is zero to avoid dividing by zero
             FatLinkArg<real> arg(newOprod, P3, P5, Qmu, link, mLepage, (ThreeSt != 0 ? Lepage/ThreeSt : 0), 2, FORCE_SIDE_LINK);
             FatLinkForce<real, FatLinkArg<real> > side(arg, link, sig, mu, FORCE_SIDE_LINK);
             side.apply(0);
           } // Lepage != 0.0

           // 3-link side link
           // note: P3 (not link) is passed as the second FatLinkForce argument for this launch
           FatLinkArg<real> arg(newOprod, P3, link, ThreeSt, 1, FORCE_SIDE_LINK_SHORT);
           FatLinkForce<real, FatLinkArg<real> > side(arg, P3, sig, mu, FORCE_SIDE_LINK_SHORT);
           side.apply(0);
         }//mu
       }//sig

     } // hisqStaplesForce

     /**
        @brief Compute the fat-link (staple) contribution to the fermion force.

        Validates the three fields (native order, not on the CPU,
        matching precision), allocates six temporary scalar-geometry
        fields modelled on link, and dispatches to the
        precision-templated hisqStaplesForce driver.  The device is
        synchronized and checked for errors before returning.

        @param[in,out] newOprod  Field handed to the kernels as their output
        @param[in] oprod  Input outer-product field
        @param[in] link   Gauge field
        @param[in] path_coeff_array  The six path coefficients used to build PathCoefficients
        @param[in,out] flops  If non-null, incremented by the flop estimate for this call
     */
     void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff_array[6], long long* flops)
     {
       if (!link.isNative()) errorQuda("Unsupported gauge order %d", link.Order());
       if (!oprod.isNative()) errorQuda("Unsupported gauge order %d", oprod.Order());
       if (!newOprod.isNative()) errorQuda("Unsupported gauge order %d", newOprod.Order());
       if (checkLocation(newOprod,oprod,link) == QUDA_CPU_FIELD_LOCATION) errorQuda("CPU not implemented");

       // temporary color-matrix fields: same parameters as link, but
       // uncompressed, float2-ordered and with a single matrix per site
       GaugeFieldParam gauge_param(link);
       gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
       gauge_param.order = QUDA_FLOAT2_GAUGE_ORDER;
       gauge_param.geometry = QUDA_SCALAR_GEOMETRY;

       cudaGaugeField Pmu(gauge_param);
       cudaGaugeField P3(gauge_param);
       cudaGaugeField P5(gauge_param);
       cudaGaugeField Pnumu(gauge_param);
       cudaGaugeField Qmu(gauge_param);
       cudaGaugeField Qnumu(gauge_param);

       QudaPrecision precision = checkPrecision(oprod, link, newOprod);
       if (precision ==  QUDA_DOUBLE_PRECISION) {
         PathCoefficients<double> act_path_coeff(path_coeff_array);
         hisqStaplesForce<double>(Pmu, P3, P5, Pnumu, Qmu, Qnumu, newOprod, oprod, link, act_path_coeff);
       } else if (precision == QUDA_SINGLE_PRECISION) {
         PathCoefficients<float> act_path_coeff(path_coeff_array);
         hisqStaplesForce<float>(Pmu, P3, P5, Pnumu, Qmu, Qnumu, newOprod, oprod, link, act_path_coeff);
       } else {
         errorQuda("Unsupported precision");
       }

       // wait for all launches to finish so that execution errors are visible to the check
       cudaDeviceSynchronize();
       checkCudaError();

       if (flops) {
         // physical volume for useful flops: the PRODUCT of the extents
         // with the border of width R removed on both sides.  Accumulated
         // in 64 bits so that large lattices cannot overflow.
         long long volume = 1;
         for (int d=0; d<4; d++) volume *= link.X()[d]-2*link.R()[d];
         // Middle Link, side link, short side link, AllLink, OneLink
         // (the final term is only counted when path_coeff_array[5] is non-zero)
         *flops += volume*(134784 + 24192 + 103680 + 864 + 397440 + 72 + (path_coeff_array[5] != 0 ? 28944 : 0));
       }

     }

     /**
        Argument bundle handed by value to completeForceKernel.

        The link-side state comes from BaseForceArg, constructed from
        link with a second argument of 0.  outA wraps the destination
        field through a FloatNOrder<real,18,2,11> accessor and oProd
        wraps the input outer product through the un-reconstructed
        gauge_mapper accessor.  coeff is always initialised to zero
        here.
     */
     template <typename real, QudaReconstructType reconstruct=QUDA_RECONSTRUCT_NO>
     struct CompleteForceArg : public BaseForceArg<real,reconstruct> {

       using M = typename gauge::FloatNOrder<real,18,2,11>;
       using F = typename gauge_mapper<real,QUDA_RECONSTRUCT_NO>::type;

       M outA;            // destination accessor (wraps force)
       const F oProd;     // input accessor (wraps oprod)
       const real coeff;  // set to zero by the constructor

       CompleteForceArg(GaugeField &force, const GaugeField &link, const GaugeField &oprod)
         : BaseForceArg<real,reconstruct>(link, 0),
           outA(force),
           oProd(oprod),
           coeff(0.0)
       { }

     };

     // Flops count: 4 matrix multiplications per lattice site = 792 Flops per site
     /**
        For each of the four directions sig, forms link(sig) * oProd(sig)
        at the current site, applies makeAntiHerm to the product and
        stores it in outA multiplied by +1 (parity 0) or -1 (parity 1).

        Thread mapping: the x index selects the checkerboard site
        (threads with x_cb >= arg.threads exit immediately) and the y
        index selects the parity.  link and oProd are read at the
        border-shifted index e_cb, whereas outA is written at the
        unshifted index x_cb.
     */
     template <typename real, typename Arg>
     __global__ void completeForceKernel(Arg arg)
     {
       typedef Matrix<complex<real>,3> Link;

       const int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
       if (x_cb >= arg.threads) return;
       const int parity = blockIdx.y * blockDim.y + threadIdx.y;

       // site coordinates, then shifted by the border to index the extended fields
       int coord[4];
       getCoords(coord, x_cb, arg.X, parity);
       for (int d=0; d<4; d++) coord[d] += arg.border[d];
       const int e_cb = linkIndex(coord,arg.E);

       // the sign depends only on the parity, so it is set once for all directions
       const real sign = (parity==1) ? -1.0 : 1.0;

 #pragma unroll
       for (int sig=0; sig<4; ++sig) {
         Link U = arg.link(sig, e_cb, parity);
         Link O = arg.oProd(sig, e_cb, parity);
         Link UO = U*O;

         makeAntiHerm(UO);

         arg.outA(sig, x_cb, parity) = sign*UO;
       }
     }

     /**
        Argument bundle handed by value to longLinkKernel.

        The link-side state comes from BaseForceArg, constructed from
        link with a second argument of 0.  Both outA (wrapping
        newOprod) and oProd (wrapping oprod) use the un-reconstructed
        gauge_mapper accessor type F; coeff is the scalar applied to
        the long-link term.  The alias M is declared but not used by
        any member of this struct.
     */
     template <typename real, QudaReconstructType reconstruct=QUDA_RECONSTRUCT_NO>
     struct LongLinkArg : public BaseForceArg<real,reconstruct> {

       using M = typename gauge::FloatNOrder<real,18,2,11>;
       using F = typename gauge_mapper<real,QUDA_RECONSTRUCT_NO>::type;

       F outA;            // accumulated output accessor (wraps newOprod)
       const F oProd;     // input accessor (wraps oprod)
       const real coeff;  // scale factor for the long-link contribution

       LongLinkArg(GaugeField &newOprod, const GaugeField &link, const GaugeField &oprod, real coeff)
         : BaseForceArg<real,reconstruct>(link,0),
           outA(newOprod),
           oProd(oprod),
           coeff(coeff)
       { }

     };

     // Flops count, in two-number pair (matrix_mult, matrix_add)
     //           (24, 12)
     // 4968 Flops per site in total
     /**
        Adds the long-link term to outA at the current site for each
        of the four forward directions.

        Thread mapping: the x index selects the checkerboard site
        (threads with x_cb >= arg.threads exit immediately) and the y
        index selects the parity.  All fields, including outA, are
        indexed with border-shifted indices.
     */
     template <typename real, typename Arg>
     __global__ void longLinkKernel(Arg arg)
     {
       typedef Matrix<complex<real>,3> Link;

       const int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
       if (x_cb >= arg.threads) return;
       const int parity = blockIdx.y * blockDim.y + threadIdx.y;

       int x[4];
       int dx[4] = {0,0,0,0};

       getCoords(x, x_cb, arg.X, parity);
       for (int d=0; d<4; d++) x[d] += arg.border[d];
       const int e_cb = linkIndex(x,arg.E);

       /*
        *   Sites along the sig direction, C being the current site:
        *
        *    A   B    C    D    E
        *    ---- ---- ---- ----
        *
        *   ---> sig direction
        *
        *   A = C-2, B = C-1, D = C+1, E = C+2.  Sites at odd distance
        *   (B, D) have the opposite parity, sites at even distance
        *   (A, E) have the same parity as C.
        */

       // compute the force for forward long links
 #pragma unroll
       for (int sig=0; sig<4; sig++) {
         // dx is zero in every direction but sig; it is reset before the next iteration
         dx[sig] = -2;
         const int idx_a = linkIndexShift(x,dx,arg.E);

         dx[sig] = -1;
         const int idx_b = linkIndexShift(x,dx,arg.E);

         dx[sig] = 1;
         const int idx_d = linkIndexShift(x,dx,arg.E);

         dx[sig] = 2;
         const int idx_e = linkIndexShift(x,dx,arg.E);

         dx[sig] = 0;
         const int idx_c = e_cb;

         const int other_parity = 1-parity;

         Link U_a = arg.link(sig, idx_a, parity);
         Link U_b = arg.link(sig, idx_b, other_parity);
         Link U_d = arg.link(sig, idx_d, other_parity);
         Link U_e = arg.link(sig, idx_e, parity);

         Link O_c = arg.oProd(sig, idx_c, parity);
         Link O_b = arg.oProd(sig, idx_b, other_parity);
         Link O_a = arg.oProd(sig, idx_a, parity);

         Link contribution = U_d*U_e*O_c - U_d*O_b*U_b + O_a*U_a*U_b;

         // read-modify-write: the scaled term is added to what is already stored
         Link previous = arg.outA(sig, e_cb, parity);
         arg.outA(sig, e_cb, parity) = previous + arg.coeff*contribution;
       } // loop over sig

     }

     /**
        Tunable launcher for the long-link and complete-force kernels.

        The y dimension of the launch has extent 2 (passed to
        TunableVectorY) and the minimum x extent is arg.threads.  Any
        force type other than FORCE_LONG_LINK or FORCE_COMPLETE is
        reported through errorQuda by every member that dispatches on
        the type.
     */
     template <typename real, typename Arg>
     class HisqForce : public TunableVectorY {

       Arg &arg;
       const GaugeField &meta;
       const HisqForceType type;

       unsigned int minThreads() const { return arg.threads; }
       bool tuneGridDim() const { return false; }

     public:
       /** Stores the references and records sig and mu in the argument struct. */
       HisqForce(Arg &arg, const GaugeField &meta, int sig, int mu, HisqForceType type)
         : TunableVectorY(2), arg(arg), meta(meta), type(type) {
         arg.sig = sig;
         arg.mu = mu;
       }
       virtual ~HisqForce() { }

       /** Fetches the launch parameters and launches the kernel matching the force type. */
       void apply(const cudaStream_t &stream) {
         TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
         if (type == FORCE_LONG_LINK) {
           longLinkKernel<real,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
         } else if (type == FORCE_COMPLETE) {
           completeForceKernel<real,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
         } else {
           errorQuda("Undefined force type %d", type);
         }
       }

       /** Tuning key: volume string, dynamic type name, and thread count / precision / kernel tag. */
       TuneKey tuneKey() const {
         std::stringstream aux;
         aux << "threads=" << arg.threads << ",prec=" << sizeof(real);
         if (type == FORCE_LONG_LINK) {
           aux << ",LONG_LINK";
         } else if (type == FORCE_COMPLETE) {
           aux << ",COMPLETE";
         } else {
           errorQuda("Undefined force type %d", type);
         }
         return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
       }

       /** The long-link kernel accumulates into outA, so outA is saved before tuning. */
       void preTune() {
         if (type == FORCE_LONG_LINK) {
           arg.outA.save();
         } else if (type == FORCE_COMPLETE) {
           // nothing to save for this kernel
         } else {
           errorQuda("Undefined force type %d", type);
         }
       }

       /** Restores what preTune saved. */
       void postTune() {
         if (type == FORCE_LONG_LINK) {
           arg.outA.load();
         } else if (type == FORCE_COMPLETE) {
           // nothing to restore for this kernel
         } else {
           errorQuda("Undefined force type %d", type);
         }
       }

       /** Flop estimate: per-site count times arg.threads sites times two parities. */
       long long flops() const {
         if (type == FORCE_LONG_LINK) return 2*arg.threads*4968ll;
         if (type == FORCE_COMPLETE)  return 2*arg.threads*792ll;
         errorQuda("Undefined force type %d", type);
         return 0;
       }

       /** Memory-traffic estimate: four directions, two parities, arg.threads sites. */
       long long bytes() const {
         if (type == FORCE_LONG_LINK)
           return 4*2*arg.threads*(2*arg.outA.Bytes() + 4*arg.link.Bytes() + 3*arg.oProd.Bytes());
         if (type == FORCE_COMPLETE)
           return 4*2*arg.threads*(arg.outA.Bytes() + arg.link.Bytes() + arg.oProd.Bytes());
         errorQuda("Undefined force type %d", type);
         return 0;
       }
     };

     /**
        @brief Compute the long-link contribution to the fermion force.

        Validates the three fields (native order, not on the CPU,
        matching precision, un-reconstructed link) and launches the
        long-link kernel, which adds coeff times the long-link term to
        newOprod.  The device is synchronized and checked for errors
        before returning.

        @param[in,out] newOprod  Field the long-link term is accumulated into
        @param[in] oldOprod  Input outer-product field
        @param[in] link  Gauge field; only QUDA_RECONSTRUCT_NO is supported
        @param[in] coeff  Scale factor applied to the long-link term
        @param[in,out] flops  If non-null, incremented by the kernel's flop estimate
     */
     void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oldOprod, const GaugeField &link, double coeff, long long* flops)
     {
       if (!link.isNative()) errorQuda("Unsupported gauge order %d", link.Order());
       if (!oldOprod.isNative()) errorQuda("Unsupported gauge order %d", oldOprod.Order());
       if (!newOprod.isNative()) errorQuda("Unsupported gauge order %d", newOprod.Order());
       if (checkLocation(newOprod,oldOprod,link) == QUDA_CPU_FIELD_LOCATION) errorQuda("CPU not implemented");

       QudaPrecision precision = checkPrecision(newOprod, link, oldOprod);
       if (precision == QUDA_DOUBLE_PRECISION) {
         if (link.Reconstruct() == QUDA_RECONSTRUCT_NO) {
           typedef LongLinkArg<double,QUDA_RECONSTRUCT_NO> Arg;
           Arg arg(newOprod, link, oldOprod, coeff);
           HisqForce<double,Arg> longLink(arg, link, 0, 0, FORCE_LONG_LINK);
           longLink.apply(0);
           if (flops) (*flops) += longLink.flops();
         } else {
           errorQuda("Reconstruct %d not supported", link.Reconstruct());
         }
       } else if (precision == QUDA_SINGLE_PRECISION) {
         if (link.Reconstruct() == QUDA_RECONSTRUCT_NO) {
           typedef LongLinkArg<float,QUDA_RECONSTRUCT_NO> Arg;
           Arg arg(newOprod, link, oldOprod, coeff);
           HisqForce<float, Arg> longLink(arg, link, 0, 0, FORCE_LONG_LINK);
           longLink.apply(0);
           if (flops) (*flops) += longLink.flops();
         } else {
           errorQuda("Reconstruct %d not supported", link.Reconstruct());
         }
       } else {
         errorQuda("Unsupported precision %d", precision);
       }

       // Kernel launches are asynchronous: synchronize first so that a
       // failure during kernel execution is visible to the error check
       // (same order as in hisqStaplesForce).
       cudaDeviceSynchronize();
       checkCudaError();
     }

     /**
        @brief Multiply the outer product by the gauge field and
        anti-hermitian project the result into the force field.

        Validates the three fields (native order, not on the CPU,
        matching precision, un-reconstructed link) and launches the
        complete-force kernel.  The device is synchronized and checked
        for errors before returning.

        @param[out] force  Field written by the kernel
        @param[in] oprod  Input outer-product field
        @param[in] link  Gauge field; only QUDA_RECONSTRUCT_NO is supported
        @param[in,out] flops  If non-null, incremented by the kernel's flop estimate
     */
     void hisqCompleteForce(GaugeField &force, const GaugeField &oprod, const GaugeField &link, long long* flops)
     {
       if (!link.isNative()) errorQuda("Unsupported gauge order %d", link.Order());
       if (!oprod.isNative()) errorQuda("Unsupported gauge order %d", oprod.Order());
       if (!force.isNative()) errorQuda("Unsupported gauge order %d", force.Order());
       if (checkLocation(force,oprod,link) == QUDA_CPU_FIELD_LOCATION) errorQuda("CPU not implemented");

       QudaPrecision precision = checkPrecision(oprod, link, force);
       if (precision == QUDA_DOUBLE_PRECISION) {
         if (link.Reconstruct() == QUDA_RECONSTRUCT_NO) {
           typedef CompleteForceArg<double,QUDA_RECONSTRUCT_NO> Arg;
           Arg arg(force, link, oprod);
           HisqForce<double,Arg> completeForce(arg, link, 0, 0, FORCE_COMPLETE);
           completeForce.apply(0);
           if (flops) *flops += completeForce.flops();
         } else {
           errorQuda("Reconstruct %d not supported", link.Reconstruct());
         }
       } else if (precision == QUDA_SINGLE_PRECISION) {
         if (link.Reconstruct() == QUDA_RECONSTRUCT_NO) {
           typedef CompleteForceArg<float,QUDA_RECONSTRUCT_NO> Arg;
           Arg arg(force, link, oprod);
           HisqForce<float, Arg> completeForce(arg, link, 0, 0, FORCE_COMPLETE);
           completeForce.apply(0);
           if (flops) *flops += completeForce.flops();
         } else {
           errorQuda("Reconstruct %d not supported", link.Reconstruct());
         }
       } else {
         errorQuda("Unsupported precision %d", precision);
       }

       // Kernel launches are asynchronous: synchronize first so that a
       // failure during kernel execution is visible to the error check
       // (same order as in hisqStaplesForce).
       cudaDeviceSynchronize();
       checkCudaError();
     }
   } // namespace fermion_force
 } // namespace quda

 #endif // GPU_HISQ_FORCE
QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:66

blockDim
dim3 dim3 blockDim
Definition: CMakeCUDACompilerId.cpp1.ii:2471

mu
double mu
Definition: test_util.cpp:1643

QudaPrecision
enum QudaPrecision_s QudaPrecision

quda::linkIndexShift
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
Definition: index_helper.cuh:13

quda::linkIndex
static __device__ __host__ int linkIndex(const int x[], const I X[4])
Definition: index_helper.cuh:46

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20

checkPrecision
#define checkPrecision(...)
Definition: lattice_field.h:592

errorQuda
#define errorQuda(...)
Definition: util_quda.h:90

TDOWN
#define TDOWN
Definition: misc.h:64

QUDA_FLOAT2_GAUGE_ORDER
Definition: enum_quda.h:39

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cu:898

PathCoefficients
Definition: hisq_force_reference2.cpp:1246

PathCoefficients::three
Real three
Definition: hisq_force_reference2.cpp:1249

XUP
#define XUP
Definition: llfat_reference.cpp:14

PathCoefficients::five
Real five
Definition: hisq_force_reference2.cpp:1250

gauge_param
QudaGaugeParam gauge_param
Definition: dslash_ctest.cpp:36

E
int E[4]
Definition: test_util.cpp:36

PathCoefficients::seven
Real seven
Definition: hisq_force_reference2.cpp:1251

YUP
#define YUP
Definition: llfat_reference.cpp:15

Qmu
#define Qmu
Definition: hisq_force_reference.cpp:720

quda
Definition: blas_cublas.h:6

x
p x
Definition: CMakeCUDACompilerId.cpp1.ii:3011

PathCoefficients::lepage
Real lepage
Definition: hisq_force_reference2.cpp:1253

quda::fermion_force::hisqLongLinkForce
void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, double coeff, long long *flops=nullptr)
Compute the long-link contribution to the fermion force.

PathCoefficients::one
Real one
Definition: hisq_force_reference2.cpp:1248

quda::fermion_force::hisqStaplesForce
void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff[6], long long *flops=nullptr)
Compute the fat-link contribution to the fermion force.

w
int int int w
Definition: CMakeCUDACompilerId.cpp1.ii:2637

ZUP
#define ZUP
Definition: llfat_reference.cpp:16

commDim
int commDim(int)
Definition: comm_common.cpp:670

quda::tuneLaunch
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603

fused_exterior_ndeg_tm_dslash_cuda_gen.i
int i
start here
Definition: fused_exterior_ndeg_tm_dslash_cuda_gen.py:816

checkLocation
#define checkLocation(...)
Definition: lattice_field.h:561

shift
static unsigned int unsigned int shift
Definition: CMakeCUDACompilerId.cpp1.ii:13738

gauge_field_order.h
Main header file for host and device accessors to GaugeFields.

QudaGaugeParam_s::reconstruct
QudaReconstructType reconstruct
Definition: quda.h:43

tune_quda.h

Sign
Definition: hisq_force_reference2.cpp:23

X
int X
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:394

TUP
#define TUP
Definition: llfat_reference.cpp:17

Matrix
Definition: hisq_force_reference2.cpp:131

quda_matrix.h

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:61

quda::fermion_force::hisqCompleteForce
void hisqCompleteForce(GaugeField &momentum, const GaugeField &oprod, const GaugeField &link, long long *flops=nullptr)
Multiply the computed force matrix by the gauge field and perform traceless anti-hermitian projection.

index_helper.cuh

dw_dslash_4D_cuda_gen.coeff
def coeff()
Definition: dw_dslash_4D_cuda_gen.py:1099

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:60

Pmu
#define Pmu
Definition: hisq_force_reference.cpp:708

P5
#define P5
Definition: hisq_force_reference.cpp:713

quda::blas::flops
unsigned long long flops
Definition: blas_quda.cu:42

Qnumu
#define Qnumu
Definition: hisq_force_reference.cpp:721

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880

XDOWN
#define XDOWN
Definition: misc.h:67

ZDOWN
#define ZDOWN
Definition: misc.h:65

quda::makeAntiHerm
__device__ __host__ void makeAntiHerm(Matrix< Complex, N > &m)
Definition: quda_matrix.h:636

checkCudaError
#define checkCudaError()
Definition: util_quda.h:129

QUDA_SCALAR_GEOMETRY
Definition: enum_quda.h:426

Pnumu
#define Pnumu
Definition: hisq_force_reference.cpp:709

quda::conj
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115

getTuning
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51

d
static __inline__ size_t size_t d
Definition: CMakeCUDACompilerId.cpp1.ii:3019

parity
QudaParity parity
Definition: covdev_test.cpp:53

QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:296

YDOWN
#define YDOWN
Definition: misc.h:66

gauge_field.h

quda::blas::bytes
unsigned long long bytes
Definition: blas_quda.cu:43

P3
#define P3
Definition: hisq_force_reference.cpp:714

comm_dim_partitioned
int comm_dim_partitioned(int dim)
Definition: comm_common.cpp:597

ks_improved_force.h

y
int y
Definition: CMakeCUDACompilerId.cpp1.ii:2637

quda_internal.h

quda::getCoords
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)
Definition: index_helper.cuh:129