quda-ref/v0.7.0/hisq__paths__force__core_8h_source.html

 //macro KERNEL_ENABLED is used to control compile time, debug purpose only

 // Select the kernel-name suffix EXT from the PRECISION and RECON macros, which
 // must be defined by whatever includes this file. EXT is handed to
 // HISQ_KERNEL_NAME(...) by every kernel below and is #undef'd at the end of the
 // file, so the file can be included once per (PRECISION, RECON) combination.
 // NOTE(review): judging by the suffixes, PRECISION 0/1 presumably means
 // double/single precision and RECON 18/12 the link reconstruction type -- confirm
 // against the including .cu file.
 #if (PRECISION == 0 && RECON == 18)

 #define EXT _dp_18_

 #elif (PRECISION == 0 && RECON == 12)

 #define EXT _dp_12_

 #elif (PRECISION == 1 && RECON == 18)

 #define EXT _sp_18_

 // Fallback: ANY other combination (including undefined PRECISION/RECON, which
 // evaluate to 0 inside #if) silently selects the _sp_12_ suffix.
 #else

 #define EXT _sp_12_

 #endif


 // Debug helper: prints the 3x3 complex matrix whose entries are held in the 18
 // scalar variables <mul>00_re, <mul>00_im, ..., <mul>22_re, <mul>22_im, one row
 // per output line.
 //
 // The three printf calls are grouped in a single compound statement so that the
 // macro behaves as one statement (e.g. under an unbraced `if`).  A plain brace
 // block is used instead of do { } while (0) so that existing invocations written
 // with or without a trailing semicolon both keep compiling.
 #define print_matrix(mul)                                               \
 {                                                                       \
   printf(" (%f %f) (%f %f) (%f %f)\n", mul##00_re, mul##00_im, mul##01_re, mul##01_im, mul##02_re, mul##02_im); \
   printf(" (%f %f) (%f %f) (%f %f)\n", mul##10_re, mul##10_im, mul##11_re, mul##11_im, mul##12_re, mul##12_im); \
   printf(" (%f %f) (%f %f) (%f %f)\n", mul##20_re, mul##20_im, mul##21_re, mul##21_im, mul##22_re, mul##22_im); \
 }


 /**************************do_middle_link_kernel*****************************

  *

  *

  * Generally we need

  * READ

  *    3 LINKS:         ab_link,     bc_link,    ad_link

  *    3 COLOR MATRIX:  newOprod_at_A, oprod_at_C,  Qprod_at_D

  * WRITE

  *    4 COLOR MATRIX:  newOprod_at_A, P3_at_A, Pmu_at_B, Qmu_at_A

  *

  * Three call variations:

  *   1. when Qprev == NULL:   Qprod_at_D does not exist and is not read in

  *   2. full read/write

  *   3. when Pmu/Qmu == NULL,   Pmu_at_B and Qmu_at_A are not written out

  *

  *   In all three cases above, if the direction sig is negative, newOprod_at_A is

  *   not read in or written out.

  *

  * Therefore the data traffic, in two-number pair (num_of_link, num_of_color_matrix)

  *   Call 1:  (called 48 times, half positive sig, half negative sig)

  *             if (sig is positive):    (3, 6)

  *             else               :     (3, 4)

  *   Call 2:  (called 192 times, half positive sig, half negative sig)

  *             if (sig is positive):    (3, 7)

  *             else               :     (3, 5)

  *   Call 3:  (called 48 times, half positive sig, half negative sig)

  *             if (sig is positive):    (3, 5)

  *             else               :     (3, 2) no need to load Qprod_at_D in this case

  *

  * note: oprod_at_C could actually be read in from D when it is the fresh outer product

  *       and we call it oprod_at_C to simplify naming. This does not affect our data traffic analysis

  *

  * Flop count, in two-number pair (matrix_multi, matrix_add)

  *   call 1:     if (sig is positive)  (3, 1)

  *               else                  (2, 0)

  *   call 2:     if (sig is positive)  (4, 1)

  *               else                  (3, 0)

  *   call 3:     if (sig is positive)  (4, 1)

  *               else                  (2, 0)

  *

  ****************************************************************************/

 // call 1: if (sig is positive) 612 Flops per site

 //         else                 396 Flops per site

 //

 // call 2: if (sig is positive) 810 Flops per site

 //         else                 594 Flops per site

 //

 // call 3: if (sig is positive) 810 Flops per site

 //         else                 396 Flops per site

 //

 // call 1: 24 times with +ve sig and 24 times with -ve sig

 //         24192 Flops per site for the full 48 calls

 //

 // call 2: 96 times with +ve sig and 96 times with -ve sig

 //         134784 Flops per site in total

 //

 // call 3 (Lepage)

 //      : 24 times with +ve sig and 24 times with -ve sig

 //      28944 Flops per site in total

 //

 /**
  * Middle-link kernel: one thread per site A (thread index sid, guarded by
  * kparam.threads).
  *
  * Neighbouring sites used below (see the sketch in the body):
  *   D = A shifted along mu, C = D shifted along sig, B = A shifted along sig.
  *
  * Per site the kernel
  *   - loads the links ab, bc, ad and one matrix from oprod (from C when QprevOdd
  *     is non-NULL; otherwise the sig-direction entry at D or C, conj()'d for
  *     negative sig),
  *   - forms Ow = Ubc * Oy (conj(Ubc) for positive mu) and, if PmuOdd is non-NULL,
  *     stores it to Pmu at B,
  *   - stores Uab * Ow (conj(Uab) for negative sig) to P3 at A,
  *   - if QmuEven is non-NULL, stores Uad (QprevOdd == NULL) or Qprev_at_D * Uad
  *     to Qmu at A,
  *   - if sig_positive, passes Ow times that product to addMatrixToNewOprod at A
  *     with weight coeff.
  *
  * sig_positive / mu_positive are compile-time flags; presumably they mirror the
  * signs of the runtime sig / mu arguments -- NOTE(review): not checked here,
  * confirm at the launch site.
  */
 template<class RealA, class RealB, int sig_positive, int mu_positive, int _oddBit, int oddness_change>
   __global__ void
                  HISQ_KERNEL_NAME(do_middle_link, EXT)(const RealA* const oprodEven, const RealA* const oprodOdd,
                      const RealA* const QprevEven, const RealA* const QprevOdd,
                      const RealB* const linkEven,  const RealB* const linkOdd,
                      int sig, int mu,
                      typename RealTypeId<RealA>::Type coeff,
                      RealA* const PmuEven, RealA* const PmuOdd,
                      RealA* const P3Even, RealA* const P3Odd,
                      RealA* const QmuEven, RealA* const QmuOdd,
                      RealA* const newOprodEven, RealA* const newOprodOdd,
                      hisq_kernel_param_t kparam)
 {
   int sid = blockIdx.x * blockDim.x + threadIdx.x;
   if (sid >= kparam.threads) return;

   int oddBit = _oddBit;
   int x[4];
   int dx[4] = {0, 0, 0, 0};   // always passed as an all-zero displacement to linkIndex
   getCoords(x, sid, kparam.D, oddBit);

   /*        A________B
    *   mu   |        |
    *       D|        |C
    *
    *      A is the current point (sid)
    */

 #ifdef MULTI_GPU
   // Index into the lattice extended by 4 in every dimension, after shifting the
   // local coordinates by kparam.base_idx.
   int E[4];
   for (int dir = 0; dir < 4; ++dir) {
     E[dir] = kparam.X[dir] + 4;
     x[dir] += kparam.base_idx[dir];
   }
   int new_sid = linkIndex(x, dx, E);
   oddBit = _oddBit ^ oddness_change;
 #else
   int E[4];
   for (int dir = 0; dir < 4; ++dir) E[dir] = kparam.X[dir];
   int new_sid = sid;
 #endif

   int mymu  = posDir(mu);
   int mysig = posDir(sig);
   const int mu_shift  = mu_positive  ? -1 :  1;
   const int sig_shift = sig_positive ?  1 : -1;

   // D: one step from A along mu.
   int y[4];
   for (int dir = 0; dir < 4; ++dir) y[dir] = x[dir];
   updateCoords(y, mymu, mu_shift, kparam.X, kparam.ghostDim[mymu]);
   int point_d = linkIndex(y, dx, E);

   // C: one further step from D along sig.
   updateCoords(y, mysig, sig_shift, kparam.X, kparam.ghostDim[mysig]);
   int point_c = linkIndex(y, dx, E);

   // B: one step from A along sig.
   for (int dir = 0; dir < 4; ++dir) y[dir] = x[dir];
   updateCoords(y, mysig, sig_shift, kparam.X, kparam.ghostDim[mysig]);
   int point_b = linkIndex(y, dx, E);

   // Site at which each link is looked up; it depends on the direction signs.
   int ad_link_nbr_idx = mu_positive  ? point_d : new_sid;
   int bc_link_nbr_idx = mu_positive  ? point_c : point_b;
   int ab_link_nbr_idx = sig_positive ? new_sid : point_b;

   Matrix<RealA,3> Uab, Ubc, Uad;
   Matrix<RealA,3> Ow, Ox, Oy;

   // Link connecting A and B.
   loadLink<18>(linkEven, linkOdd, mysig, ab_link_nbr_idx, Uab.data, sig_positive^(1-oddBit), kparam.thin_link_stride);

   // Link connecting B and C.
   loadLink<18>(linkEven, linkOdd, mymu, bc_link_nbr_idx, Ubc.data, mu_positive^(1-oddBit), kparam.thin_link_stride);

   if (QprevOdd != NULL) {
     loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride);
   } else {
     // No Qprev: read the sig-direction entry of oprod at D (positive sig) or C.
     loadMatrixFromField(oprodEven, oprodOdd, mysig, (sig_positive ? point_d : point_c), Oy.data, sig_positive^oddBit, kparam.color_matrix_stride);
     if (!sig_positive) Oy = conj(Oy);
   }

   if (mu_positive) {
     Ow = conj(Ubc)*Oy;
   } else {
     Ow = Ubc*Oy;
   }

   // Pmu is optional output.
   if (PmuOdd) {
     storeMatrixToField(Ow.data, point_b, PmuEven, PmuOdd, 1-oddBit, kparam.color_matrix_stride);
   }

   if (sig_positive) {
     Oy = Uab*Ow;
   } else {
     Oy = conj(Uab)*Ow;
   }
   storeMatrixToField(Oy.data, new_sid, P3Even, P3Odd, oddBit, kparam.color_matrix_stride);

   // Link connecting A and D.
   loadLink<18>(linkEven, linkOdd, mymu, ad_link_nbr_idx, Uad.data, mu_positive^oddBit, kparam.thin_link_stride);
   if (!mu_positive) Uad = conj(Uad);

   if (QprevOdd == NULL) {
     if (sig_positive) {
       Oy = Ow*Uad;
     }
     if (QmuEven) {
       Ox = Uad;
       storeMatrixToField(Ox.data, new_sid, QmuEven, QmuOdd, oddBit, kparam.color_matrix_stride);
     }
   } else {
     // Ox is only needed if it is written out (Qmu) or used for newOprod (positive sig).
     if (QmuEven || sig_positive) {
       loadMatrixFromField(QprevEven, QprevOdd, point_d, Oy.data, 1-oddBit, kparam.color_matrix_stride);
       Ox = Oy*Uad;
     }
     if (QmuEven) {
       storeMatrixToField(Ox.data, new_sid, QmuEven, QmuOdd, oddBit, kparam.color_matrix_stride);
     }
     if (sig_positive) {
       Oy = Ow*Ox;
     }
   }

   if (sig_positive) {
     addMatrixToNewOprod(Oy.data, sig, new_sid, coeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride);
   }
 }


 // Flop count, in two-number pair (matrix_multi, matrix_add)

 //  if (sig is positive)  (4, 1)

 //  else                  (2, 0)

 //  if(sig is positive) 810 flops per lattice site

 //  else 396 flops per lattice site

 /**
  * Lepage middle-link kernel: one thread per site A (thread index sid, guarded
  * by kparam.threads).
  *
  * With D = A shifted along mu, C = D shifted along sig and B = A shifted along
  * sig, the kernel loads oprod at C, multiplies it by the bc link and then the ab
  * link (conj()'d according to the direction signs) and stores the result to P3
  * at A.  For positive sig it additionally loads the ad link and Qprev at D and
  * passes Ow * (Qprev_at_D * Uad) to addMatrixToNewOprod at A with weight coeff.
  */
 template<class RealA, class RealB, int sig_positive, int mu_positive, int _oddBit, int oddness_change>
   __global__ void
 HISQ_KERNEL_NAME(do_lepage_middle_link, EXT)(const RealA* const oprodEven, const RealA* const oprodOdd,
     const RealA* const QprevEven, const RealA* const QprevOdd,
     const RealB* const linkEven,  const RealB* const linkOdd,
     int sig, int mu,
     typename RealTypeId<RealA>::Type coeff,
     RealA* const P3Even, RealA* const P3Odd,
     RealA* const newOprodEven, RealA* const newOprodOdd,
     hisq_kernel_param_t kparam)
 {
   int sid = blockIdx.x * blockDim.x + threadIdx.x;
   if (sid >= kparam.threads) return;

   int oddBit = _oddBit;
   int x[4];
   int dx[4] = {0, 0, 0, 0};   // always passed as an all-zero displacement to linkIndex
   getCoords(x, sid, kparam.D, oddBit);

   /*        A________B
    *   mu   |        |
    *       D|        |C
    *
    *   A is the current point (sid)
    */

 #ifdef MULTI_GPU
   // Extended lattice (4 extra sites per dimension); coordinates shifted by base_idx.
   int E[4];
   for (int dir = 0; dir < 4; ++dir) {
     E[dir] = kparam.X[dir] + 4;
     x[dir] += kparam.base_idx[dir];
   }
   int new_sid = linkIndex(x, dx, E);
   oddBit = _oddBit ^ oddness_change;
 #else
   int E[4];
   for (int dir = 0; dir < 4; ++dir) E[dir] = kparam.X[dir];
   int new_sid = sid;
 #endif

   int mymu  = posDir(mu);
   int mysig = posDir(sig);
   const int mu_shift  = mu_positive  ? -1 :  1;
   const int sig_shift = sig_positive ?  1 : -1;

   // Walk A -> D -> C, then restart from A for B.
   int y[4];
   for (int dir = 0; dir < 4; ++dir) y[dir] = x[dir];
   updateCoords(y, mymu, mu_shift, kparam.X, kparam.ghostDim[mymu]);
   int point_d = linkIndex(y, dx, E);

   updateCoords(y, mysig, sig_shift, kparam.X, kparam.ghostDim[mysig]);
   int point_c = linkIndex(y, dx, E);

   for (int dir = 0; dir < 4; ++dir) y[dir] = x[dir];
   updateCoords(y, mysig, sig_shift, kparam.X, kparam.ghostDim[mysig]);
   int point_b = linkIndex(y, dx, E);

   // Site at which each link is looked up; it depends on the direction signs.
   int ad_link_nbr_idx = mu_positive  ? point_d : new_sid;
   int bc_link_nbr_idx = mu_positive  ? point_c : point_b;
   int ab_link_nbr_idx = sig_positive ? new_sid : point_b;

   Matrix<RealA,3> Uab, Ubc, Uad;
   Matrix<RealA,3> Ow, Ox, Oy;

   // Link connecting A and B.
   loadLink<18>(linkEven, linkOdd, mysig, ab_link_nbr_idx, Uab.data, sig_positive^(1-oddBit), kparam.thin_link_stride);

   // Link connecting B and C.
   loadLink<18>(linkEven, linkOdd, mymu, bc_link_nbr_idx, Ubc.data, mu_positive^(1-oddBit), kparam.thin_link_stride);

   loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride);

   if (mu_positive) {
     Ow = conj(Ubc)*Oy;
   } else {
     Ow = Ubc*Oy;
   }

   if (sig_positive) {
     Oy = Uab*Ow;
   } else {
     Oy = conj(Uab)*Ow;
   }
   storeMatrixToField(Oy.data, new_sid, P3Even, P3Odd, oddBit, kparam.color_matrix_stride);

   // newOprod is only updated for positive sig, so the ad link and Qprev are
   // only loaded in that case.
   if (sig_positive) {
     loadLink<18>(linkEven, linkOdd, mymu, ad_link_nbr_idx, Uad.data, mu_positive^oddBit, kparam.thin_link_stride);
     if (!mu_positive) Uad = conj(Uad);

     loadMatrixFromField(QprevEven, QprevOdd, point_d, Oy.data, 1-oddBit, kparam.color_matrix_stride);

     Ox = Oy*Uad;
     Oy = Ow*Ox;

     addMatrixToNewOprod(Oy.data, sig, new_sid, coeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride);
   }
 }


 /***********************************do_side_link_kernel***************************

  *

  * In general we need

  * READ

  *    1  LINK:          ad_link

  *    4  COLOR MATRIX:  shortP_at_D, newOprod, P3_at_A, Qprod_at_D,

  * WRITE

  *    2  COLOR MATRIX:  shortP_at_D, newOprod,

  *

  * Two call variations:

  *   1. full read/write

  *   2. when shortP == NULL && Qprod == NULL:

  *          no need to read ad_link/shortP_at_D or write shortP_at_D

  *          Qprod_at_D does not exist and is not read in

  *

  *

  * Therefore the data traffic, in two-number pair (num_of_links, num_of_color_matrix)

  *   Call 1:   (called 192 times)

  *                           (1, 6)

  *

  *   Call 2:   (called 48 times)

  *                           (0, 3)

  *

  * note: newOprod can be at point D or A, depending on if mu is positive or negative

  *

  * Flop count, in two-number pair (matrix_multi, matrix_add)

  *   call 1:       (2, 2)

  *   call 2:       (0, 1)

  *

  *********************************************************************************/


 // Flop count, in two-number pair (matrix_mult, matrix_add)

 //              (2,2)

 // call 1: 432 Flops per site

 // call 2 (short)

 //      : 18 Flops per site

 //

 // call 1: 240 calls

 // call 2: 48 calls

 //

 // Aggregate Flops:

 // call 1: 103680

 // call 2: 864


 /**
  * Side-link kernel: one thread per site A (thread index sid, guarded by
  * kparam.threads).  D is A shifted along mu.
  *
  * Per site the kernel
  *   - loads P3 at A and the ad link,
  *   - adds Uad * P3 (conj(Uad) for negative mu), weighted by accumu_coeff, to
  *     shortP at D,
  *   - loads Qprod at D and passes P3 * Qprod (positive mu, at D in direction mu)
  *     or conj(Qprod) * conj(P3) (negative mu, at A in direction OPP_DIR(mu)) to
  *     addMatrixToNewOprod with a signed copy of coeff.
  *
  * The runtime argument sig is not read; only the compile-time flag
  * sig_positive enters, through CoeffSign.
  */
 template<class RealA, class RealB, int sig_positive, int mu_positive, int _oddBit, int oddness_change>
   __global__ void
 HISQ_KERNEL_NAME(do_side_link, EXT)(const RealA* const P3Even, const RealA* const P3Odd,
     const RealA* const QprodEven, const RealA* const QprodOdd,
     const RealB* const linkEven,  const RealB* const linkOdd,
     int sig, int mu,
     typename RealTypeId<RealA>::Type coeff,
     typename RealTypeId<RealA>::Type accumu_coeff,
     RealA* const shortPEven, RealA* const shortPOdd,
     RealA* const newOprodEven, RealA* const newOprodOdd,
     hisq_kernel_param_t kparam)
 {
   int sid = blockIdx.x * blockDim.x + threadIdx.x;
   if (sid >= kparam.threads) return;

   int oddBit = _oddBit;
   int x[4];
   int dx[4] = {0, 0, 0, 0};   // always passed as an all-zero displacement to linkIndex
   getCoords(x, sid, kparam.D, oddBit);

 #ifdef MULTI_GPU
   // Extended lattice (4 extra sites per dimension); coordinates shifted by base_idx.
   int E[4];
   for (int dir = 0; dir < 4; ++dir) {
     E[dir] = kparam.X[dir] + 4;
     x[dir] += kparam.base_idx[dir];
   }
   int new_sid = linkIndex(x, dx, E);
   oddBit = _oddBit ^ oddness_change;
 #else
   int E[4];
   for (int dir = 0; dir < 4; ++dir) E[dir] = kparam.X[dir];
   int new_sid = sid;
 #endif

   Matrix<RealA,3> Uad;
   Matrix<RealA,3> Ow, Ox, Oy;

   loadMatrixFromField(P3Even, P3Odd, new_sid, Oy.data, oddBit, kparam.color_matrix_stride);

   /*      compute the side link contribution to the momentum
    *
    *             sig
    *          A________B
    *           |       |   mu
    *         D |       |C
    *
    *      A is the current point (sid)
    */

   // D: one step from A along mu.
   int mymu = posDir(mu);
   int y[4];
   for (int dir = 0; dir < 4; ++dir) y[dir] = x[dir];
   updateCoords(y, mymu, (mu_positive ? -1 : 1), kparam.X, kparam.ghostDim[mymu]);
   int point_d = linkIndex(y, dx, E);

   int ad_link_nbr_idx = mu_positive ? point_d : new_sid;
   loadLink<18>(linkEven, linkOdd, mymu, ad_link_nbr_idx, Uad.data, mu_positive^oddBit, kparam.thin_link_stride);

   if (mu_positive) {
     Ow = Uad*Oy;
   } else {
     Ow = conj(Uad)*Oy;
   }
   addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit, kparam.color_matrix_stride);

   typename RealTypeId<RealA>::Type mycoeff = CoeffSign<sig_positive,_oddBit ^ oddness_change>::result*coeff;

   loadMatrixFromField(QprodEven, QprodOdd, point_d, Ox.data, 1-oddBit, kparam.color_matrix_stride);

   if (mu_positive) {
     Ow = Oy*Ox;
     if (oddBit == 0) mycoeff = -mycoeff;
     addMatrixToNewOprod(Ow.data, mu, point_d, mycoeff, newOprodEven, newOprodOdd, 1-oddBit, kparam.color_matrix_stride);
   } else {
     Ow = conj(Ox)*conj(Oy);
     if (oddBit != 0) mycoeff = -mycoeff;
     addMatrixToNewOprod(Ow.data, OPP_DIR(mu), new_sid, mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride);
   }
 }


 // Flop count, in two-number pair (matrix_mult, matrix_add)

 //              (0,1)


 /**
  * Short side-link kernel: one thread per site A (thread index sid, guarded by
  * kparam.threads).  D is A shifted along mu.
  *
  * Loads P3 at A and passes it to addMatrixToNewOprod with a signed copy of
  * coeff: unchanged, at D in direction mu, for positive mu; conj()'d, at A in
  * direction OPP_DIR(mu), for negative mu.  No link and no shortP/Qprod field is
  * touched; linkEven, linkOdd and sig are not read.
  */
 template<class RealA, class RealB, int sig_positive, int mu_positive, int _oddBit, int oddness_change>
   __global__ void
 HISQ_KERNEL_NAME(do_side_link_short, EXT)(const RealA* const P3Even, const RealA* const P3Odd,
     const RealB* const linkEven,  const RealB* const linkOdd,
     int sig, int mu,
     typename RealTypeId<RealA>::Type coeff,
     RealA* const newOprodEven, RealA* const newOprodOdd,
     hisq_kernel_param_t kparam)
 {
   int sid = blockIdx.x * blockDim.x + threadIdx.x;
   if (sid >= kparam.threads) return;

   int oddBit = _oddBit;
   int x[4];
   int dx[4] = {0, 0, 0, 0};   // always passed as an all-zero displacement to linkIndex
   getCoords(x, sid, kparam.D, oddBit);

 #ifdef MULTI_GPU
   // Extended lattice (4 extra sites per dimension); coordinates shifted by base_idx.
   int E[4];
   for (int dir = 0; dir < 4; ++dir) {
     E[dir] = kparam.X[dir] + 4;
     x[dir] += kparam.base_idx[dir];
   }
   int new_sid = linkIndex(x, dx, E);
   oddBit = _oddBit ^ oddness_change;
 #else
   int E[4];
   for (int dir = 0; dir < 4; ++dir) E[dir] = kparam.X[dir];
   int new_sid = sid;
 #endif

   /*      compute the side link contribution to the momentum
    *
    *             sig
    *          A________B
    *           |       |   mu
    *         D |       |C
    *
    *      A is the current point (sid)
    */

   Matrix<RealA,3> Ow, Oy;
   loadMatrixFromField(P3Even, P3Odd, new_sid, Oy.data, oddBit, kparam.color_matrix_stride);

   // D: one step from A along mu.
   int mymu = posDir(mu);
   int y[4];
   for (int dir = 0; dir < 4; ++dir) y[dir] = x[dir];
   updateCoords(y, mymu, (mu_positive ? -1 : 1), kparam.X, kparam.ghostDim[mymu]);
   int point_d = linkIndex(y, dx, E);

   typename RealTypeId<RealA>::Type mycoeff = CoeffSign<sig_positive,_oddBit ^ oddness_change>::result*coeff;

   if (mu_positive) {
     if (oddBit == 0) mycoeff = -mycoeff;   // need to change this to get away from oddBit
     addMatrixToNewOprod(Oy.data, mu, point_d, mycoeff, newOprodEven, newOprodOdd, 1-oddBit, kparam.color_matrix_stride);
   } else {
     if (oddBit != 0) mycoeff = -mycoeff;
     Ow = conj(Oy);
     addMatrixToNewOprod(Ow.data, OPP_DIR(mu), new_sid, mycoeff, newOprodEven, newOprodOdd,  oddBit, kparam.color_matrix_stride);
   }
 }


 /********************************do_all_link_kernel*********************************************

  *

  * In this function we need

  *   READ

  *     3 LINKS:         ad_link, ab_link, bc_link

  *     5 COLOR MATRIX:  Qprev_at_D, oprod_at_C, newOprod_at_A(sig), newOprod_at_D/newOprod_at_A(mu), shortP_at_D

  *   WRITE:

  *     3 COLOR MATRIX:  newOprod_at_A(sig), newOprod_at_D/newOprod_at_A(mu), shortP_at_D,

  *

  * If sig is negative, then we don't need to read/write the color matrix newOprod_at_A(sig)

  *

  * Therefore the data traffic, in two-number pair (num_of_link, num_of_color_matrix)

  *

  *             if (sig is positive):    (3, 8)

  *             else               :     (3, 6)

  *

  * This function is called 384 times, half positive sig, half negative sig

  *

  * Flop count, in two-number pair (matrix_multi, matrix_add)

  *             if(sig is positive)      (6,3)

  *             else                     (4,2)

  *

  ************************************************************************************************/


 // 198 flops per matrix multiply

 // 18 flops per matrix addition

 // if(sig is positive) 1242 Flops per lattice site

 // else 828 Flops per lattice site

 //

 // Aggregate Flops per site

 // 1242*192 + 828*192

 // = 397440 Flops per site


 /**
  * All-link kernel: one thread per site A (thread index sid, guarded by
  * kparam.threads).  B is A shifted along sig, D is A shifted along mu and C is
  * D shifted along sig.
  *
  * Per site the kernel loads Qprev at D, oprod at C and the ad, bc and ab links,
  * and then
  *   - for positive sig, calls addMatrixToNewOprod at A in direction sig,
  *   - calls addMatrixToNewOprod in the (positive) mu direction, at D for
  *     positive mu and at A for negative mu,
  *   - adds a matrix weighted by accumu_coeff to shortP at D.
  *
  * For negative mu the direction is flipped with OPP_DIR before it is used for
  * coordinate updates, link loads and the newOprod direction.
  */
 template<class RealA, class RealB, int sig_positive, int mu_positive, int _oddBit, int oddness_change>
   __global__ void
 HISQ_KERNEL_NAME(do_all_link, EXT)(const RealA* const oprodEven, const RealA* const oprodOdd,
     const RealA* const QprevEven, const RealA* const QprevOdd,
     const RealB* const linkEven, const RealB* const linkOdd,
     int sig, int mu,
     typename RealTypeId<RealA>::Type coeff,
     typename RealTypeId<RealA>::Type accumu_coeff,
     RealA* const shortPEven, RealA* const shortPOdd,
     RealA* const newOprodEven, RealA* const newOprodOdd,
     hisq_kernel_param_t kparam)
 {
   int sid = blockIdx.x * blockDim.x + threadIdx.x;
   if (sid >= kparam.threads) return;

   int oddBit = _oddBit;
   int x[4];
   int dx[4] = {0, 0, 0, 0};   // always passed as an all-zero displacement to linkIndex
   getCoords(x, sid, kparam.D, oddBit);

   /*            sig
    *         A________B
    *      mu  |      |
    *        D |      |C
    *
    *   A is the current point (sid)
    */

 #ifdef MULTI_GPU
   // Extended lattice (4 extra sites per dimension); coordinates shifted by base_idx.
   int E[4];
   for (int dir = 0; dir < 4; ++dir) {
     x[dir] += kparam.base_idx[dir];
     E[dir] = kparam.X[dir] + 4;
   }
   int new_sid = linkIndex(x, dx, E);
   oddBit = _oddBit ^ oddness_change;
 #else
   int E[4];
   for (int dir = 0; dir < 4; ++dir) E[dir] = kparam.X[dir];
   int new_sid = sid;
 #endif

   int mysig = posDir(sig);
   const int sig_shift = sig_positive ? 1 : -1;

   // B: one step from A along sig.
   int y[4];
   for (int dir = 0; dir < 4; ++dir) y[dir] = x[dir];
   updateCoords(y, mysig, sig_shift, kparam.X, kparam.ghostDim[mysig]);
   int point_b = linkIndex(y, dx, E);

   int ab_link_nbr_idx = sig_positive ? new_sid : point_b;

   typename RealTypeId<RealA>::Type mycoeff = CoeffSign<sig_positive,_oddBit ^ oddness_change>::result*coeff;

   // Direction index used for all mu-direction accesses: mu itself when it is
   // positive, its opposite otherwise.
   int mymu = mu_positive ? mu : OPP_DIR(mu);

   // D: one step from A along mu; C: one further step along sig.
   for (int dir = 0; dir < 4; ++dir) y[dir] = x[dir];
   updateCoords(y, mymu, (mu_positive ? -1 : 1), kparam.X, kparam.ghostDim[mymu]);
   int point_d = linkIndex(y, dx, E);

   updateCoords(y, mysig, sig_shift, kparam.X, kparam.ghostDim[mysig]);
   int point_c = linkIndex(y, dx, E);

   Matrix<RealA,3> Uab, Ubc, Uad;
   Matrix<RealA,3> Ow, Ox, Oy, Oz;

   loadMatrixFromField(QprevEven, QprevOdd, point_d, Ox.data, 1-oddBit, kparam.color_matrix_stride);

   if (mu_positive) { // positive mu

     loadLink<18>(linkEven, linkOdd, mymu, point_d, Uad.data, 1-oddBit, kparam.thin_link_stride);
     loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride);
     loadLink<18>(linkEven, linkOdd, mymu, point_c, Ubc.data, oddBit, kparam.thin_link_stride);

     Oz = conj(Ubc)*Oy;

     if (sig_positive) {
       Ow = Oz*Ox*Uad;
       addMatrixToNewOprod(Ow.data, sig, new_sid, Sign<_oddBit ^ oddness_change>::result*mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride);
     }

     loadLink<18>(linkEven, linkOdd, mysig, ab_link_nbr_idx, Uab.data, sig_positive^(1-oddBit), kparam.thin_link_stride);

     if (sig_positive) {
       Oy = Uab*Oz;
     } else {
       Oy = conj(Uab)*Oz;
     }

     // mu-direction contribution, accumulated at D with the opposite sign.
     Ow = Oy*Ox;
     addMatrixToNewOprod(Ow.data, mymu, point_d, -Sign<_oddBit ^ oddness_change>::result*mycoeff, newOprodEven, newOprodOdd, 1-oddBit, kparam.color_matrix_stride);

     Ow = Uad*Oy;
     addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit, kparam.color_matrix_stride);

   } else { // negative mu

     loadLink<18>(linkEven, linkOdd, mymu, new_sid, Uad.data, oddBit, kparam.thin_link_stride);
     loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride);
     loadLink<18>(linkEven, linkOdd, mymu, point_b, Ubc.data, 1-oddBit, kparam.thin_link_stride);

     Oz = Ubc*Oy;

     if (sig_positive) {
       Ow = Ox*conj(Uad);
       Oy = Oz*Ow;
       addMatrixToNewOprod(Oy.data, sig, new_sid, Sign<_oddBit ^ oddness_change>::result*mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride);
     }

     loadLink<18>(linkEven, linkOdd, mysig, ab_link_nbr_idx, Uab.data, sig_positive^(1-oddBit), kparam.thin_link_stride);

     if (sig_positive) {
       Oy = Uab*Oz;
     } else {
       Oy = conj(Uab)*Oz;
     }

     // mu-direction contribution, accumulated at A.
     Ow = conj(Ox)*conj(Oy);
     addMatrixToNewOprod(Ow.data, mymu, new_sid, Sign<_oddBit ^ oddness_change>::result*mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride);

     Ow = conj(Uad)*Oy;
     addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit, kparam.color_matrix_stride);
   }
 }


 // Flops count, in two-number pair (matrix_mult, matrix_add)

 //                                 (24, 12)

 // 4968 Flops per site in total

 /**
  * Long-link kernel: one thread per site C (thread index sid, guarded by
  * kparam.threads).
  *
  * For each of the four directions sig, with A, B, D, E the sites at offsets
  * -2, -1, +1, +2 from C along sig, the kernel loads the sig-direction links at
  * A, B, D, E and the sig-direction naikOprod matrices at A, B, C, and adds
  *     Ude*Uef*Oz - Ude*Oy*Ubc + Ox*Uab*Ubc
  * weighted by coeff to the sig-direction entry of the output field at C.
  */
 template<class RealA, class RealB,  int oddBit>
   __global__ void
 HISQ_KERNEL_NAME(do_longlink, EXT)(const RealB* const linkEven, const RealB* const linkOdd,
     const RealA* const naikOprodEven, const RealA* const naikOprodOdd,
     typename RealTypeId<RealA>::Type coeff,
     RealA* const outputEven, RealA* const outputOdd,
     hisq_kernel_param_t kparam)
 {
   int sid = blockIdx.x * blockDim.x + threadIdx.x;
   if (sid >= kparam.threads) return;

   int x[4];
   int dx[4] = {0, 0, 0, 0};
   getCoords(x, sid, kparam.X, oddBit);

 #ifdef MULTI_GPU
   // Extended lattice (4 extra sites per dimension); coordinates shifted by 2.
   int E[4];
   for (int i = 0; i < 4; ++i) {
     E[i] = kparam.X[i] + 4;
     x[i] += 2;
   }
   int new_sid = linkIndex(x, dx, E);
 #else
   int E[4];
   for (int i = 0; i < 4; ++i) E[i] = kparam.X[i];
   int new_sid = sid;
 #endif

   /*
    *    A   B    C    D    E
    *    ---- ---- ---- ----
    *
    *   ---> sig direction
    *
    *   C is the current point (sid)
    */
   const int point_c = new_sid;

   Matrix<RealA,3> Uab, Ubc, Ude, Uef;
   Matrix<RealA,3> Ox, Oy, Oz;

   // compute the force for forward long links
   for (int sig = 0; sig < 4; ++sig) {

     // Neighbour indices along sig; dx is restored to zero before the next direction.
     dx[sig] = 1;
     const int point_d = linkIndex(x, dx, E);
     dx[sig] = 2;
     const int point_e = linkIndex(x, dx, E);
     dx[sig] = -1;
     const int point_b = linkIndex(x, dx, E);
     dx[sig] = -2;
     const int point_a = linkIndex(x, dx, E);
     dx[sig] = 0;

     // A, C and E are addressed with oddBit; B and D with 1-oddBit.
     loadLink<18>(linkEven, linkOdd, sig, point_a, Uab.data, oddBit, kparam.thin_link_stride);
     loadLink<18>(linkEven, linkOdd, sig, point_b, Ubc.data, 1-oddBit, kparam.thin_link_stride);
     loadLink<18>(linkEven, linkOdd, sig, point_d, Ude.data, 1-oddBit, kparam.thin_link_stride);
     loadLink<18>(linkEven, linkOdd, sig, point_e, Uef.data, oddBit, kparam.thin_link_stride);

     loadMatrixFromField(naikOprodEven, naikOprodOdd, sig, point_c, Oz.data, oddBit, kparam.color_matrix_stride);
     loadMatrixFromField(naikOprodEven, naikOprodOdd, sig, point_b, Oy.data, 1-oddBit, kparam.color_matrix_stride);
     loadMatrixFromField(naikOprodEven, naikOprodOdd, sig, point_a, Ox.data, oddBit, kparam.color_matrix_stride);

     Matrix<RealA,3> temp = Ude*Uef*Oz - Ude*Oy*Ubc + Ox*Uab*Ubc;

     addMatrixToField(temp.data, sig, new_sid,  coeff, outputEven, outputOdd, oddBit, kparam.color_matrix_stride);
   } // loop over sig
 }


 // Flops count: 4 matrix multiplications per lattice site = 792 Flops per site

 /**
  * Complete-force kernel: one thread per site (thread index sid, guarded by
  * kparam.threads).
  *
  * For each of the four directions sig, loads the link and the oprod matrix at
  * the site, multiplies them (link on the left) and hands the product to
  * storeMatrixToMomentumField with coefficient -1 when oddBit == 1 and +1
  * otherwise.
  *
  * Note the two different site indices: links and oprod are read at new_sid
  * (the index into the extended lattice under MULTI_GPU), while the momentum
  * field is written at the plain thread index sid.
  */
 template<class RealA, class RealB, int oddBit>
   __global__ void
 HISQ_KERNEL_NAME(do_complete_force, EXT)(const RealB* const linkEven, const RealB* const linkOdd,
     const RealA* const oprodEven, const RealA* const oprodOdd,
     RealA* const forceEven, RealA* const forceOdd,
     hisq_kernel_param_t kparam)
 {
   int sid = blockIdx.x * blockDim.x + threadIdx.x;
   if (sid >= kparam.threads) return;

   int x[4];
   int dx[4] = {0, 0, 0, 0};
   getCoords(x, sid, kparam.X, oddBit);

 #ifdef MULTI_GPU
   // Extended lattice (4 extra sites per dimension); coordinates shifted by 2.
   int E[4];
   for (int dir = 0; dir < 4; ++dir) {
     x[dir] += 2;
     E[dir] = kparam.X[dir] + 4;
   }
   int new_sid = linkIndex(x, dx, E);
 #else
   int new_sid = sid;
 #endif

   // The sign depends only on the compile-time parity, so compute it once.
   typename RealTypeId<RealA>::Type coeff = (oddBit == 1) ? -1 : 1;

   for (int sig = 0; sig < 4; ++sig) {
     Matrix<RealA,3> Uw, Ow, Ox;

     loadLink<18>(linkEven, linkOdd, sig, new_sid, Uw.data, oddBit, kparam.thin_link_stride);
     loadMatrixFromField(oprodEven, oprodOdd, sig, new_sid, Ox.data, oddBit, kparam.color_matrix_stride);

     Ow = Uw*Ox;

     storeMatrixToMomentumField(Ow.data, sig, sid, coeff, forceEven, forceOdd, oddBit, kparam.momentum_stride);
   }
 }


 #undef EXT

quda::linkIndex
__device__ __host__ int linkIndex(int x[], int dx[], const int X[4])
Definition: ks_force_quda.cu:40

bc_link_nbr_idx
int bc_link_nbr_idx
Definition: hisq_paths_force_core.h:117

Uad
Matrix< RealA, 3 > Uad
Definition: hisq_paths_force_core.h:104

point_d
int point_d
Definition: hisq_paths_force_core.h:116

conj
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
Definition: hisq_force_reference2.cpp:231

QprevOdd
__global__ void const RealA *const const RealA *const const RealA *const QprevOdd
Definition: hisq_paths_force_core.h:82

QprodOdd
__global__ void const RealA *const const RealA *const const RealA *const QprodOdd
Definition: hisq_paths_force_core.h:435

mu
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
Definition: hisq_paths_force_core.h:82

addMatrixToField
addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit, kparam.color_matrix_stride)

P3Odd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const P3Odd
Definition: hisq_paths_force_core.h:82

newOprodEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const newOprodEven
Definition: hisq_paths_force_core.h:82

QmuEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const QmuEven
Definition: hisq_paths_force_core.h:82

naikOprodOdd
__global__ void const RealB *const const RealA *const const RealA *const naikOprodOdd
Definition: hisq_paths_force_core.h:803

point_b
int point_b
Definition: hisq_paths_force_core.h:116

forceOdd
__global__ void const RealB *const const RealA *const const RealA *const RealA *const RealA *const forceOdd
Definition: hisq_paths_force_core.h:885

sid
int sid
Definition: hisq_paths_force_core.h:96

HISQ_KERNEL_NAME
__global__ void HISQ_KERNEL_NAME(do_middle_link, EXT)(const RealA *const oprodEven

point_c
int point_c
Definition: hisq_paths_force_core.h:116

OPP_DIR
#define OPP_DIR(dir)
Definition: force_common.h:16

Uab
Matrix< RealA, 3 > Uab
Definition: hisq_paths_force_core.h:104

loadLink< 18 >
loadLink< 18 >(linkEven, linkOdd, mysig, ab_link_nbr_idx, Uab.data, sig_positive^(1-oddBit), kparam.thin_link_stride)

E
int E[4]
Definition: hisq_paths_force_core.h:133

oprodEven
__global__ void const RealB *const const RealA *const oprodEven
Definition: hisq_paths_force_core.h:885

QmuOdd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const QmuOdd
Definition: hisq_paths_force_core.h:82

addMatrixToNewOprod
addMatrixToNewOprod(Ow.data, OPP_DIR(mu), new_sid, mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride)

forceEven
__global__ void const RealB *const const RealA *const const RealA *const RealA *const forceEven
Definition: hisq_paths_force_core.h:885

linkOdd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const linkOdd
Definition: hisq_paths_force_core.h:82

QprevEven
__global__ void const RealA *const const RealA *const QprevEven
Definition: hisq_paths_force_core.h:82

outputEven
__global__ void const RealB *const const RealA *const const RealA *const RealTypeId< RealA >::Type RealA *const outputEven
Definition: hisq_paths_force_core.h:803

Ubc
Matrix< RealA, 3 > Ubc
Definition: hisq_paths_force_core.h:104

mysig
int mysig
Definition: hisq_paths_force_core.h:151

EXT
#define EXT
Definition: hisq_paths_force_core.h:10

coeff
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type coeff
Definition: hisq_paths_force_core.h:82

outputOdd
__global__ void const RealB *const const RealA *const const RealA *const RealTypeId< RealA >::Type RealA *const RealA *const outputOdd
Definition: hisq_paths_force_core.h:803

new_sid
int new_sid
Definition: hisq_paths_force_core.h:134

updateCoords
updateCoords(y, mymu,(mu_positive?-1:1), kparam.X, kparam.ghostDim[mymu])

accumu_coeff
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type accumu_coeff
Definition: hisq_paths_force_core.h:435

QprodEven
__global__ void const RealA *const const RealA *const QprodEven
Definition: hisq_paths_force_core.h:435

point_a
int point_a
Definition: hisq_paths_force_core.h:829

x
int x[4]
Definition: hisq_paths_force_core.h:99

Uef
Matrix< RealA, 3 > Uef
Definition: hisq_paths_force_core.h:844

shortPOdd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type RealA *const RealA *const shortPOdd
Definition: hisq_paths_force_core.h:435

Sign
Definition: hisq_force_reference2.cpp:23

PmuEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const PmuEven
Definition: hisq_paths_force_core.h:82

dx
int dx[4]
Definition: hisq_paths_force_core.h:98

Matrix
Definition: hisq_force_reference2.cpp:131

kparam
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const hisq_kernel_param_t kparam
Definition: hisq_paths_force_core.h:92

P3Even
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const P3Even
Definition: hisq_paths_force_core.h:82

naikOprodEven
__global__ void const RealB *const const RealA *const naikOprodEven
Definition: hisq_paths_force_core.h:803

point_e
int point_e
Definition: hisq_paths_force_core.h:829

ab_link_nbr_idx
int ab_link_nbr_idx
Definition: hisq_paths_force_core.h:117

Ox
Matrix< RealA, 3 > Ox
Definition: hisq_paths_force_core.h:105

storeMatrixToField
storeMatrixToField(Oy.data, new_sid, P3Even, P3Odd, oddBit, kparam.color_matrix_stride)

Oz
Matrix< RealA, 3 > Oz
Definition: hisq_paths_force_core.h:673

Ow
Matrix< RealA, 3 > Ow
Definition: hisq_paths_force_core.h:105

Ude
Matrix< RealA, 3 > Ude
Definition: hisq_paths_force_core.h:844

shortPEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type RealA *const shortPEven
Definition: hisq_paths_force_core.h:435

sig
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int sig
Definition: hisq_paths_force_core.h:82

newOprodOdd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const newOprodOdd
Definition: hisq_paths_force_core.h:82

Oy
Matrix< RealA, 3 > Oy
Definition: hisq_paths_force_core.h:105

linkEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const linkEven
Definition: hisq_paths_force_core.h:82

mycoeff
RealTypeId< RealA >::Type mycoeff
Definition: hisq_paths_force_core.h:496

mymu
int mymu
Definition: hisq_paths_force_core.h:118

getCoords
getCoords(x, sid, kparam.D, oddBit)

ad_link_nbr_idx
int ad_link_nbr_idx
Definition: hisq_paths_force_core.h:117

loadMatrixFromField
loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride)

y
int y[4]
Definition: hisq_paths_force_core.h:137

oddBit
int oddBit
Definition: hisq_paths_force_core.h:263

oprodOdd
__global__ void const RealA *const oprodOdd
Definition: hisq_paths_force_core.h:82

PmuOdd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const PmuOdd
Definition: hisq_paths_force_core.h:82