// Source listing: quda-ref/v0.7.0/hisq__paths__force__quda_8cu_source.html

 #include <quda_internal.h>

 #include <lattice_field.h>

 #include <read_gauge.h>

 #include <gauge_field.h>

 #include <ks_improved_force.h>

 #include <hw_quda.h>

 #include <hisq_force_macros.h>

 #include <utility>

 #include <quda_matrix.h>

 #include <force_common.h>

 #include <tune_quda.h>

 #include <color_spinor_field.h>


 #include <face_quda.h>


 #ifdef GPU_HISQ_FORCE


 //DEBUG : control compile

 #define COMPILE_HISQ_DP_18

 #define COMPILE_HISQ_DP_12

 #define COMPILE_HISQ_SP_18

 #define COMPILE_HISQ_SP_12


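 // Texture reads are currently enabled (both switches are 1); set a switch to 0 to
 // load directly through the field pointers instead. Need to revisit this.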

 #define HISQ_SITE_MATRIX_LOAD_TEX 1

 #define HISQ_NEW_OPROD_LOAD_TEX 1


 #ifdef USE_TEXTURE_OBJECTS

 #define TEX1DFETCH(type, tex, idx) tex1Dfetch<type>((tex), idx)

 #else

 #define TEX1DFETCH(type, tex, idx) tex1Dfetch((tex), idx)

 #endif


 #if (__COMPUTE_CAPABILITY__ >= 130)


   // Fetches the int2 texel at index i through TEX1DFETCH and reassembles it into
   // a double: the .y word supplies the high 32 bits, the .x word the low 32 bits.
   template<typename Tex>
 static __inline__ __device__ double fetch_double(Tex t, int i)
 {
   const int2 words = TEX1DFETCH(int2, t, i);
   const int hi = words.y;
   const int lo = words.x;
   return __hiloint2double(hi, lo);
 }


   // Fetches the int4 texel at index i through TEX1DFETCH and rebuilds a double2:
   // (y,x) form the high/low words of .x, (w,z) form the high/low words of .y.
   template <typename Tex>
 static __inline__ __device__ double2 fetch_double2(Tex t, int i)
 {
   const int4 words = TEX1DFETCH(int4, t, i);
   double2 result;
   result.x = __hiloint2double(words.y, words.x);
   result.y = __hiloint2double(words.w, words.z);
   return result;
 }


 // Same reconstruction as fetch_double2, but hard-wired to a texture<int4, 1>
 // reference and a direct tex1Dfetch call instead of the TEX1DFETCH macro.
 static __inline__ __device__ double2 fetch_double2_old(texture<int4, 1> t, int i)
 {
   const int4 words = tex1Dfetch(t,i);
   double2 result;
   result.x = __hiloint2double(words.y, words.x);
   result.y = __hiloint2double(words.w, words.z);
   return result;
 }


 #endif //__COMPUTE_CAPABILITY__ >= 130


 namespace quda {

   namespace fermion_force {


     // Plain-data parameter bundle passed by value to the HISQ force kernels.
     struct hisq_kernel_param_t{
       unsigned long threads;     // kernels return early for thread indices >= threads
       int X[4];                  // dimensions handed to getCoords() by the kernels
       int D[4];                  // dimensions used to build the tuning-key volume string
       int D1h;
       int base_idx[4];           // parity of the summed entries selects the kernel variant
       int ghostDim[4];
       int color_matrix_stride;
       int thin_link_stride;
       int momentum_stride;

       // Derives the three strides from the gauge parameters.
       // The link and colour-matrix strides are based on the checkerboard
       // half-volume; under MULTI_GPU that half-volume is taken over a lattice
       // enlarged by 4 sites in every dimension. The momentum stride always uses
       // the un-extended half-volume. Padding comes from site_ga_pad / mom_ga_pad.
       void setStride(const QudaGaugeParam& param){
         const int half_volume = param.X[0]*param.X[1]*param.X[2]*param.X[3]/2;
 #ifdef MULTI_GPU
         const int link_sites = (param.X[0]+4)*(param.X[1]+4)*(param.X[2]+4)*(param.X[3]+4)/2;
 #else
         const int link_sites = half_volume;
 #endif
         thin_link_stride    = link_sites + param.site_ga_pad;
         color_matrix_stride = link_sites;
         momentum_stride     = half_volume + param.mom_ga_pad;
       }
     };


     // File-scope texture references used by the load helpers in this file.
     // The 0/1 in each name matches the oddness selector used at the fetch sites
     // (oddness ? ...1... : ...0...), i.e. 0 is paired with the even field pointer
     // and 1 with the odd field pointer.

     // Double-precision site (thin) links, fetched as int4 texels.

     texture<int4, 1> thinLink0TexDouble;

     texture<int4, 1> thinLink1TexDouble;


     // Single-precision site (thin) links, one float2 per texel.

     texture<float2, 1, cudaReadModeElementType> thinLink0TexSingle;

     texture<float2, 1, cudaReadModeElementType> thinLink1TexSingle;


     // Single-precision site links fetched as float4 texels
     // (used by the float4 overload of loadLink).
     texture<float4, 1, cudaReadModeElementType> thinLink0TexSingle_recon;

     texture<float4, 1, cudaReadModeElementType> thinLink1TexSingle_recon;


     // "newOprod" accumulator fields read back by addMatrixToNewOprod,
     // in double (int4 texels) and single (float2 texels) precision.
     texture<int4, 1> newOprod0TexDouble;

     texture<int4, 1> newOprod1TexDouble;

     texture<float2, 1, cudaReadModeElementType>  newOprod0TexSingle;

     texture<float2, 1, cudaReadModeElementType> newOprod1TexSingle;


     // Returns the checkerboard (half-lattice) index of the site x + dx on a
     // lattice of extents X. Each coordinate is wrapped once into [0, X[i]) by
     // adding X[i] before the modulo, the four wrapped coordinates are combined
     // into a lexicographic index with x[0] running fastest, and the result is
     // halved.
     inline __device__  __host__ int linkIndex(int x[], int dx[], const int X[4]) {
       int lexical = 0;
       // Horner accumulation from the slowest (3) to the fastest (0) dimension.
       for (int d = 3; d >= 0; --d) {
         const int wrapped = (x[d] + dx[d] + X[d]) % X[d];
         lexical = lexical*X[d] + wrapped;
       }
       return lexical >> 1;
     }


     // Shifts coordinate x[dir] by `shift` in place.
     //  - MULTI_GPU build: only shifts of +1 / -1 are acted on. When the direction
     //    is not partitioned the coordinate wraps between 2 and X[dir]+1;
     //    otherwise it is simply incremented / decremented.
     //  - Otherwise: periodic wrap of x[dir]+shift into [0, X[dir]); `partitioned`
     //    is not consulted.
     inline __device__ __host__ void updateCoords(int x[], int dir, int shift, const int X[4], const int partitioned){
 #ifdef MULTI_GPU
       if (shift == 1) {
         if (!partitioned && x[dir] == X[dir]+1) {
           x[dir] = 2;
         } else {
           x[dir] = x[dir]+1;
         }
       } else if (shift == -1) {
         if (!partitioned && x[dir] == 2) {
           x[dir] = X[dir]+1;
         } else {
           x[dir] = x[dir]-1;
         }
       }
 #else
       const int extent = X[dir];
       x[dir] = (x[dir] + shift + extent) % extent;
 #endif
     }


     // Converts a checkerboard index plus parity into full 4-d coordinates.
     // x[3], x[2], x[1] are peeled off the index using the half-lattice sizes;
     // x[0] is twice the remaining index plus the bit that makes
     // (x[0]+x[1]+x[2]+x[3]) agree with `parity` modulo 2.
     __device__ __host__ inline void getCoords(int x[4], int cb_index, const int X[4], int parity)
     {
       const int row_sites    = X[0]/2;
       const int plane_sites  = X[1]*X[0]/2;
       const int volume_sites = X[2]*X[1]*X[0]/2;

       x[3] = cb_index / volume_sites;
       x[2] = (cb_index / plane_sites) % X[2];
       x[1] = (cb_index / row_sites) % X[1];

       const int offset = (x[3] + x[2] + x[1] + parity) & 1;
       x[0] = 2*(cb_index % row_sites) + offset;
     }


     // Maps a direction label to 7-dir when dir >= 4; labels below 4 are
     // returned unchanged.
     __device__ __host__ inline int posDir(int dir){
       if (dir >= 4) {
         return 7-dir;
       }
       return dir;
     }


     //struct for holding the fattening path coefficients

     // One coefficient per path type: one-, three-, five-, seven-link,
     // Naik and Lepage terms (in that member order).
     template<typename Real>
       struct PathCoefficients
       {
         Real one, three, five, seven;
         Real naik, lepage;
       };


     // Real scalar times a float2: both components are scaled by a.
     inline __device__ float2 operator*(float a, const float2 & b)
     {
       float2 scaled;
       scaled.x = a*b.x;
       scaled.y = a*b.y;
       return scaled;
     }

     // Real scalar times a double2: both components are scaled by a.
     inline __device__ double2 operator*(double a, const double2 & b)
     {
       double2 scaled;
       scaled.x = a*b.x;
       scaled.y = a*b.y;
       return scaled;
     }


     // Component-wise in-place addition for float2; returns a reference to a.
     inline __device__ const float2 & operator+=(float2 & a, const float2 & b)
     {
       a = make_float2(a.x + b.x, a.y + b.y);
       return a;
     }

     // Component-wise in-place addition for double2; returns a reference to a.
     inline __device__ const double2 & operator+=(double2 & a, const double2 & b)
     {
       a = make_double2(a.x + b.x, a.y + b.y);
       return a;
     }

     // Component-wise in-place addition for float4; returns a reference to a.
     inline __device__ const float4 & operator+=(float4 & a, const float4 & b)
     {
       a = make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
       return a;
     }


     // Replication of code

     // This structure is already defined in

     // unitarize_utilities.h


     // Maps a two-component vector type to its scalar component type.
     // Only float2 and double2 are supported; the primary template is left
     // undefined so any other type fails to compile.
     template<typename T> struct RealTypeId;

     template<> struct RealTypeId<float2>  { typedef float  Type; };

     template<> struct RealTypeId<double2> { typedef double Type; };


     // Replaces the 9-element (3x3, row-major) matrix `mat` by its conjugate
     // transpose, in place. Conj() is applied to every element; the diagonal
     // stays where it is and the off-diagonal pairs (1,3), (2,6), (5,7) swap.
     template<class T>
       inline __device__
       void adjointMatrix(T* mat)
       {
         // Diagonal entries: conjugate only.
         mat[0] = Conj(mat[0]);
         mat[4] = Conj(mat[4]);
         mat[8] = Conj(mat[8]);

         // Off-diagonal entries: conjugate and exchange across the diagonal.
         T held = Conj(mat[1]);
         mat[1] = Conj(mat[3]);
         mat[3] = held;

         held   = Conj(mat[2]);
         mat[2] = Conj(mat[6]);
         mat[6] = held;

         held   = Conj(mat[5]);
         mat[5] = Conj(mat[7]);
         mat[7] = held;
       }


     // Gathers N elements of the matrix stored for direction `dir` at site `idx`
     // into mat[0..N-1]. The field is chosen by `oddness` (non-zero -> field_odd);
     // consecutive elements are `stride` apart and each direction spans N*stride.
     template<int N, class T>
       inline __device__
       void loadMatrixFromField(const T* const field_even, const T* const field_odd,
           int dir, int idx, T* const mat, int oddness, int stride)
       {
         const T* const src = ((oddness) ? field_odd : field_even) + (idx + dir*N*stride);
         for (int k = 0; k < N; ++k) {
           mat[k] = src[k*stride];
         }
       }


     // Convenience overload: loads a full 9-element matrix for direction `dir`
     // at site `idx` by forwarding to the N-element loader with N = 9.
     template<class T>
       inline __device__
       void loadMatrixFromField(const T* const field_even, const T* const field_odd,
           int dir, int idx, T* const mat, int oddness, int stride)
       {
         loadMatrixFromField<9>(field_even, field_odd, dir, idx, mat, oddness, stride);
       }


     // Loads three float4 values for direction `dir` at site `idx` and unpacks
     // them into six float2 entries mat[0..5]: each float4 yields (x,y) then (z,w).
     // Each direction spans 3*stride float4 elements in the chosen field.
     inline __device__
       void loadMatrixFromField(const float4* const field_even, const float4* const field_odd,
           int dir, int idx, float2* const mat, int oddness, int stride)
       {
         const float4* const rows = (oddness ? field_odd : field_even) + (idx + dir*stride*3);
         for (int r = 0; r < 3; ++r) {
           const float4 packed = rows[r*stride];
           mat[2*r]     = make_float2(packed.x, packed.y);
           mat[2*r + 1] = make_float2(packed.z, packed.w);
         }
       }


     // Loads a 9-element matrix at site `idx` from a field without a direction
     // index; element k lives at idx + k*stride in the field chosen by `oddness`.
     template<class T>
       inline __device__
       void loadMatrixFromField(const T* const field_even, const T* const field_odd, int idx, T* const mat, int oddness, int stride)
       {
         const T* const src = ((oddness) ? field_odd : field_even) + idx;
         for (int k = 0; k < 9; ++k) {
           mat[k] = src[stride*k];
         }
       }


     // Accumulates coeff*mat into the 9-element matrix stored for direction `dir`
     // at site `idx` of the double-precision newOprod field:
     //     field[base + k*stride] = old[base + k*stride] + coeff*mat[k],  k = 0..8
     // with base = idx + dir*stride*9 and the field chosen by `oddness`
     // (non-zero -> field_odd).
     //
     // The old values are read through READ_DOUBLE2_TEXTURE when
     // HISQ_NEW_OPROD_LOAD_TEX == 1, otherwise directly from `field`.
     // All nine old values are read before anything is written back.
     template<class U>
       inline __device__
       void  addMatrixToNewOprod(const double2* const mat,  int dir, int idx, U coeff,
           double2* const field_even, double2* const field_odd, int oddness, int stride){
         double2* const field = (oddness)?field_odd: field_even;
         // Offset of element 0 of this site/direction matrix.
         const int base = idx + dir*stride*9;
         double2 value[9];

 #if (HISQ_NEW_OPROD_LOAD_TEX == 1)
         for(int i=0; i<9; ++i){
           value[i] = READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, base + i*stride);
         }
 #else
         // Fixed: the fallback used to read field[i], i.e. the first nine
         // elements of the field, instead of the elements being updated.
         for(int i=0; i<9; ++i) value[i] = field[base + i*stride];
 #endif

         for(int i=0; i<9; ++i){
           field[base + i*stride] = value[i] + coeff*mat[i];
         }

         return;
       }


     // Single-precision counterpart of the double2 overload: accumulates
     // coeff*mat into the 9-element matrix for direction `dir` at site `idx`:
     //     field[base + k*stride] = old[base + k*stride] + coeff*mat[k],  k = 0..8
     // with base = idx + dir*stride*9 and the field chosen by `oddness`
     // (non-zero -> field_odd).
     //
     // The old values are fetched from the newOprod{0,1}TexSingle textures when
     // HISQ_NEW_OPROD_LOAD_TEX == 1, otherwise directly from `field`.
     // All nine old values are read before anything is written back.
     template<class U>
       inline __device__
       void  addMatrixToNewOprod(const float2* const mat,  int dir, int idx, U coeff,
           float2* const field_even, float2* const field_odd, int oddness, int stride){
         float2* const field = (oddness)?field_odd: field_even;
         // Offset of element 0 of this site/direction matrix.
         const int base = idx + dir*stride*9;
         float2 value[9];

 #if (HISQ_NEW_OPROD_LOAD_TEX == 1)
         for(int i=0; i<9; ++i){
           value[i] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle),  base + i*stride);
         }
 #else
         // Fixed: the fallback used to read field[i], i.e. the first nine
         // elements of the field, instead of the elements being updated.
         for(int i=0; i<9; ++i) value[i] = field[base + i*stride];
 #endif

         for(int i=0; i<9; ++i){
           field[base + i*stride] = value[i] + coeff*mat[i];
         }

         return;
       }


     // only works if Promote<T,U>::Type = T


     // In-place accumulate: adds coeff*mat[k] to element k (k = 0..8) of the
     // matrix stored for direction `dir` at site `idx`. Elements are `stride`
     // apart, each direction spans 9*stride, and `oddness` picks the field.
     template<class T, class U>
       inline __device__
       void addMatrixToField(const T* const mat, int dir, int idx, U coeff,
           T* const field_even, T* const field_odd, int oddness, int stride)
       {
         T* const dest = ((oddness) ? field_odd : field_even) + (idx + dir*stride*9);
         for (int k = 0; k < 9; ++k) {
           dest[stride*k] += coeff*mat[k];
         }
       }


     // In-place accumulate for a field without a direction index: adds
     // coeff*mat[k] to the element at idx + k*stride (k = 0..8) of the field
     // selected by `oddness`.
     template<class T, class U>
       inline __device__
       void addMatrixToField(const T* const mat, int idx, U coeff, T* const field_even,
           T* const field_odd, int oddness, int stride)
       {
         T* const dest = ((oddness) ? field_odd : field_even) + idx;
         for (int k = 0; k < 9; ++k) {
           dest[stride*k] += coeff*mat[k];
         }
       }


     // Debug variant of the direction-less addMatrixToField: performs the same
     // accumulation and then, where device printf is available (host pass or
     // compute capability >= 2.0), prints coeff, mat[0].x and the updated
     // field[idx].x from every calling thread.
     template<class T, class U>
       inline __device__
       void addMatrixToField_test(const T* const mat, int idx, U coeff, T* const field_even,
           T* const field_odd, int oddness, int stride)
       {
         T* const field = (oddness)?field_odd: field_even;
         for (int k = 0; k < 9; ++k) {
           field[idx + stride*k] += coeff*mat[k];
         }

 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
         printf("value is  coeff(%f) * mat[0].x(%f)=%f\n", coeff, mat[0].x, field[idx].x);
 #endif
       }


     // Overwrites the 9-element matrix stored for direction `dir` at site `idx`
     // with mat[0..8]. Elements are `stride` apart, each direction spans
     // 9*stride, and `oddness` picks the destination field.
     template<class T>
       inline __device__
       void storeMatrixToField(const T* const mat, int dir, int idx, T* const field_even, T* const field_odd, int oddness, int stride)
       {
         T* const dest = ((oddness) ? field_odd : field_even) + (idx + dir*stride*9);
         for (int k = 0; k < 9; ++k) {
           dest[stride*k] = mat[k];
         }
       }


     // Overwrites the 9-element matrix at site `idx` of a field without a
     // direction index: mat[k] goes to idx + k*stride in the field picked by
     // `oddness`.
     template<class T>
       inline __device__
       void storeMatrixToField(const T* const mat, int idx, T* const field_even, T* const field_odd, int oddness, int stride)
       {
         T* const dest = ((oddness) ? field_odd : field_even) + idx;
         for (int k = 0; k < 9; ++k) {
           dest[stride*k] = mat[k];
         }
       }


     // Packs the 3x3 matrix `mat` into the 5-element momentum layout for
     // direction `dir` at site `idx` (elements `stride` apart, 5*stride per
     // direction, field picked by `oddness`):
     //   elements 0..2 : for the pairs (1,3), (2,6), (5,7):
     //                   x = (m_a.x - m_b.x)/2 * coeff,  y = (m_a.y + m_b.y)/2 * coeff
     //   element  3    : x = (m0.y - t)*coeff, y = (m4.y - t)*coeff
     //   element  4    : x = (m8.y - t)*coeff, y = 0
     // where t is one third of the sum of the diagonal imaginary parts.
     // NOTE(review): this looks like a scaled traceless anti-Hermitian projection
     // of mat -- confirm against the momentum-field convention.
     // The 0.5 and 1/3 literals are doubles, so for float2 the products are
     // evaluated in double before being narrowed on assignment.
     template<class T, class U>
       inline __device__
       void storeMatrixToMomentumField(const T* const mat, int dir, int idx, U coeff,
           T* const mom_even, T* const mom_odd, int oddness, int stride)
       {
         T* const out = ((oddness) ? mom_odd : mom_even) + (idx + dir*stride*5);
         T entry;

         // Off-diagonal pairs (upper index, lower index).
         const int upper[3] = {1, 2, 5};
         const int lower[3] = {3, 6, 7};
         for (int k = 0; k < 3; ++k) {
           entry.x = (mat[upper[k]].x - mat[lower[k]].x)*0.5*coeff;
           entry.y = (mat[upper[k]].y + mat[lower[k]].y)*0.5*coeff;
           out[stride*k] = entry;
         }

         // Diagonal: remove the mean imaginary part before scaling.
         const typename RealTypeId<T>::Type temp = (mat[0].y + mat[4].y + mat[8].y)*0.3333333333333333333333333;
         entry.x =  (mat[0].y-temp)*coeff;
         entry.y =  (mat[4].y-temp)*coeff;
         out[stride*3] = entry;

         entry.x = (mat[8].y - temp)*coeff;
         entry.y = 0.0;
         out[stride*4] = entry;
       }


     // Struct to determine the coefficient sign at compile time

     // Compile-time coefficient sign: +1 for <0,0> and <1,1>, -1 for every
     // other combination of template arguments.
     template<int pos_dir, int odd_lattice>
       struct CoeffSign
       {
         static const int result =
           ((pos_dir == 0 && odd_lattice == 0) || (pos_dir == 1 && odd_lattice == 1)) ? 1 : -1;
       };


     // Compile-time sign: -1 when odd_lattice is exactly 1, +1 otherwise.
     template<int odd_lattice>
       struct Sign
       {
         static const int result = (odd_lattice == 1) ? -1 : 1;
       };


     // Number of RealX elements used for a per-thread matrix buffer:
     // 9 for every type except float4, which uses 5.
     template<typename RealX>
       struct ArrayLength { static const int result = 9; };

     template<>
       struct ArrayLength<float4> { static const int result = 5; };


     // Flops: four matrix additions per lattice site = 72 Flops per lattice site

     // One-link term: for each of the four directions, loads the outer-product
     // matrix at this thread's site and adds coeff times it to the output field.
     // Launch: 1-d grid, one thread per checkerboard site; threads whose global
     // index is >= kparam.threads exit immediately. Uses no shared memory.
     // Under MULTI_GPU the site index is remapped onto a lattice extended by 4 in
     // every dimension, with the local coordinates offset by 2.
     template<class RealA, int oddBit>
       __global__ void
       do_one_link_term_kernel(const RealA* const oprodEven, const RealA* const oprodOdd,
           typename RealTypeId<RealA>::Type coeff,
           RealA* const outputEven, RealA* const outputOdd, hisq_kernel_param_t kparam)
       {
         const int sid = blockIdx.x * blockDim.x + threadIdx.x;
         if (sid >= kparam.threads) return;

 #ifdef MULTI_GPU
         int coords[4];
         getCoords(coords, sid, kparam.X, oddBit);

         int extended[4];
         for (int d = 0; d < 4; ++d) {
           extended[d] = kparam.X[d]+4;
           coords[d] += 2;
         }

         int no_shift[4] = {0,0,0,0};
         const int site = linkIndex(coords, no_shift, extended);
 #else
         const int site = sid;
 #endif

         for (int sig = 0; sig < 4; ++sig) {
           RealA oprod_mat[ArrayLength<RealA>::result];
           loadMatrixFromField(oprodEven, oprodOdd, sig, site, oprod_mat, oddBit, kparam.color_matrix_stride);
           addMatrixToField(oprod_mat, sig, site, coeff, outputEven, outputOdd, oddBit, kparam.color_matrix_stride);
         }
       }


     // Loads the double-precision link for direction `dir` at site `idx` into `var`.
     // With HISQ_SITE_MATRIX_LOAD_TEX == 1 the load goes through the
     // HISQ_LOAD_MATRIX_18_DOUBLE_TEX macro (defined outside this file), using the
     // thinLink{0,1}TexDouble texture and link pointer selected by `oddness`;
     // otherwise it falls back to the 9-element loadMatrixFromField.
     // The template parameter N is not used by this primary template; N == 12 is
     // handled by an explicit specialization.
     template<int N>

       __device__ void loadLink(const double2* const linkEven, const double2* const linkOdd, int dir, int idx, double2* const var, int oddness, int stride){

 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)

         HISQ_LOAD_MATRIX_18_DOUBLE_TEX((oddness)?thinLink1TexDouble:thinLink0TexDouble,  (oddness)?linkOdd:linkEven, dir, idx, var, stride);

 #else

         loadMatrixFromField(linkEven, linkOdd, dir, idx, var, oddness, stride);

 #endif

       }


     // Explicit specialization for N == 12 of the double-precision loadLink:
     // loads the link for direction `dir` at site `idx` into `var`, either through
     // the HISQ_LOAD_MATRIX_12_DOUBLE_TEX macro (HISQ_SITE_MATRIX_LOAD_TEX == 1) or
     // via the 6-element loadMatrixFromField fallback.
     // The execution-space qualifier is repeated here: an explicit specialization
     // does not take __device__ from the primary template, so without it this
     // would be declared as a host function, unlike every other loadLink overload.
     template<>
       __device__ void loadLink<12>(const double2* const linkEven, const double2* const linkOdd, int dir, int idx, double2* const var, int oddness, int stride){
 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
         HISQ_LOAD_MATRIX_12_DOUBLE_TEX((oddness)?thinLink1TexDouble:thinLink0TexDouble,  (oddness)?linkOdd:linkEven,dir, idx, var, stride);
 #else
         loadMatrixFromField<6>(linkEven, linkOdd, dir, idx, var, oddness, stride);
 #endif
       }


     // Single-precision loadLink for links stored as float4: fills the float2
     // array `var` for direction `dir` at site `idx`.
     // With HISQ_SITE_MATRIX_LOAD_TEX == 1 it always uses the
     // HISQ_LOAD_MATRIX_12_SINGLE_TEX macro (defined outside this file) on the
     // thinLink{0,1}TexSingle_recon texture selected by `oddness`; otherwise it
     // calls the float4 overload of loadMatrixFromField.
     // The template parameter N is not used in the body.
     template<int N>

       __device__ void loadLink(const float4* const linkEven, const float4* const linkOdd, int dir, int idx, float2* const var, int oddness, int stride){

 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)

         HISQ_LOAD_MATRIX_12_SINGLE_TEX((oddness)?thinLink1TexSingle_recon:thinLink0TexSingle_recon, dir, idx, var, stride);

 #else

         loadMatrixFromField(linkEven, linkOdd, dir, idx, var, oddness, stride);

 #endif

       }


     // Single-precision loadLink for links stored as float2: fills `var` for
     // direction `dir` at site `idx`.
     // With HISQ_SITE_MATRIX_LOAD_TEX == 1 it always uses the
     // HISQ_LOAD_MATRIX_18_SINGLE_TEX macro (defined outside this file) on the
     // thinLink{0,1}TexSingle texture selected by `oddness`; otherwise it calls
     // the 9-element loadMatrixFromField.
     // The template parameter N is not used in the body.
     template<int N>

       __device__ void loadLink(const float2* const linkEven, const float2* const linkOdd, int dir, int idx, float2* const var , int oddness, int stride){

 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)

         HISQ_LOAD_MATRIX_18_SINGLE_TEX((oddness)?thinLink1TexSingle:thinLink0TexSingle, dir, idx, var, stride);

 #else

         loadMatrixFromField(linkEven, linkOdd, dir, idx, var, oddness, stride);

 #endif

       }


 #define DD_CONCAT(n,r) n ## r ## kernel


 #define HISQ_KERNEL_NAME(a,b) DD_CONCAT(a,b)

     //precision: 0 is for double, 1 is for single


     //double precision, recon=18

 #define PRECISION 0

 #define RECON 18

 #include "hisq_paths_force_core.h"

 #undef PRECISION

 #undef RECON


     //double precision, recon=12

 #define PRECISION 0

 #define RECON 12

 #include "hisq_paths_force_core.h"

 #undef PRECISION

 #undef RECON


     //single precision, recon=18

 #define PRECISION 1

 #define RECON 18

 #include "hisq_paths_force_core.h"

 #undef PRECISION

 #undef RECON


     //single precision, recon=12

 #define PRECISION 1

 #define RECON 12

 #include "hisq_paths_force_core.h"

 #undef PRECISION

 #undef RECON


     // Tunable launcher for the do_middle_link_{sp,dp}_{18,12}_kernel family for
     // one (sig, mu) pair.
     //
     // apply() picks the kernel from:
     //   - sizeof(RealA): float2-sized -> single precision, otherwise double;
     //   - link.Reconstruct(): QUDA_RECONSTRUCT_NO -> "18" kernels, else "12";
     //   - GOES_FORWARDS/GOES_BACKWARDS of sig and mu (template flags 1/0);
     //   - the parity of the summed kparam.base_idx entries (last template flag).
     // Each apply() issues two launches, with the fifth template argument set to
     // 0 and then 1.
     //
     // All fields, `coeff` and `kparam` are held by reference, so the caller must
     // keep them alive for as long as this object is used.
     //
     // Fixed: kernels are now launched on the stream passed to apply(); before,
     // the stream argument was ignored and the default stream was always used.
     // Passing stream 0 reproduces the previous behaviour exactly.
     template<class RealA, class RealB>
       class MiddleLink : public Tunable {

         private:
           const cudaGaugeField &link;
           const cudaGaugeField &oprod;
           const cudaGaugeField &Qprev;   // aliases `link` when no Qprev was supplied
           const int sig;
           const int mu;
           const typename RealTypeId<RealA>::Type &coeff;
           cudaGaugeField &Pmu;
           cudaGaugeField &P3;
           cudaGaugeField &Qmu;
           cudaGaugeField &newOprod;
           const hisq_kernel_param_t &kparam;

           // The kernels are launched without dynamic shared memory.
           unsigned int sharedBytesPerThread() const { return 0; }
           unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

           // don't tune the grid dimension
           bool tuneGridDim() const { return false; }
           unsigned int minThreads() const { return kparam.threads; }

         public:
           MiddleLink(const cudaGaugeField &link,
               const cudaGaugeField &oprod,
               const cudaGaugeField &Qprev,
               int sig, int mu,
               const typename RealTypeId<RealA>::Type &coeff,
               cudaGaugeField &Pmu, // write only
               cudaGaugeField &P3,  // write only
               cudaGaugeField &Qmu,
               cudaGaugeField &newOprod,
               const hisq_kernel_param_t &kparam) :
             link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
             coeff(coeff), Pmu(Pmu), P3(P3), Qmu(Qmu), newOprod(newOprod), kparam(kparam)
         {       ; }

           // need alternative constructor to hack around null pointer passing:
           // Qprev is bound to `link` as a sentinel, and apply() then hands NULL
           // Qprev pointers to the kernels.
           MiddleLink(const cudaGaugeField &link,
               const cudaGaugeField &oprod,
               int sig, int mu,
               const typename RealTypeId<RealA>::Type &coeff,
               cudaGaugeField &Pmu, // write only
               cudaGaugeField &P3,  // write only
               cudaGaugeField &Qmu,
               cudaGaugeField &newOprod,
               const hisq_kernel_param_t &kparam) :
             link(link), oprod(oprod), Qprev(link), sig(sig), mu(mu),
             coeff(coeff), Pmu(Pmu), P3(P3), Qmu(Qmu), newOprod(newOprod), kparam(kparam)
         {       ; }

           virtual ~MiddleLink() { ; }

           // Tuning key: "D0xD1xD2xD3" volume string, the dynamic type name, and an
           // aux string with thread count, precision, reconstruct type, sig and mu.
           TuneKey tuneKey() const {
             std::stringstream vol, aux;
             vol << kparam.D[0] << "x";
             vol << kparam.D[1] << "x";
             vol << kparam.D[2] << "x";
             vol << kparam.D[3];
             aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
             aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
             return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
           }

 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block, 0, stream>>> \
           ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(),                       \
            (typeA*)Qprev_even, (typeA*)Qprev_odd,                               \
            (typeB*)link.Even_p(), (typeB*)link.Odd_p(),                 \
            sig, mu, coeff,                                                      \
            (typeA*)Pmu.Even_p(), (typeA*)Pmu.Odd_p(),                   \
            (typeA*)P3.Even_p(), (typeA*)P3.Odd_p(),                             \
            (typeA*)Qmu.Even_p(), (typeA*)Qmu.Odd_p(),                   \
            (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)

 #define CALL_MIDDLE_LINK_KERNEL(sig_sign, mu_sign)                      \
           if(oddness_change ==0 ){                                      \
             if(sizeof(RealA) == sizeof(float2)){                                \
               if(recon  == QUDA_RECONSTRUCT_NO){                                \
                 do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
                 do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
               }else{                                                    \
                 do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
                 do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
               }                                                         \
             }else{                                                      \
               if(recon  == QUDA_RECONSTRUCT_NO){                                \
                 do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
                 do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
               }else{                                                    \
                 do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
                 do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
               }                                                         \
             }                                                           \
           }else{                                                                \
             if(sizeof(RealA) == sizeof(float2)){                                \
               if(recon  == QUDA_RECONSTRUCT_NO){                                \
                 do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
                 do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
               }else{                                                    \
                 do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
                 do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
               }                                                         \
             }else{                                                      \
               if(recon  == QUDA_RECONSTRUCT_NO){                                \
                 do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
                 do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
               }else{                                                    \
                 do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
                 do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
               }                                                         \
             }                                                           \
           }

           // Launches the selected kernel pair on `stream` with the tuned
           // grid/block configuration.
           void apply(const cudaStream_t &stream) {
             TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
             QudaReconstructType recon = link.Reconstruct();
             // Parity of the summed base index selects the last template flag.
             int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
                 + kparam.base_idx[2] + kparam.base_idx[3])&1;

             // Qprev aliasing link means "no Qprev": pass NULL to the kernels.
             const void *Qprev_even = (&Qprev == &link) ? NULL : Qprev.Even_p();
             const void *Qprev_odd = (&Qprev == &link) ? NULL : Qprev.Odd_p();

             if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
               CALL_MIDDLE_LINK_KERNEL(1,1);
             }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
               CALL_MIDDLE_LINK_KERNEL(1,0);
             }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
               CALL_MIDDLE_LINK_KERNEL(0,1);
             }else{
               CALL_MIDDLE_LINK_KERNEL(0,0);
             }
           }

 #undef CALL_ARGUMENTS
 #undef CALL_MIDDLE_LINK_KERNEL

           // Back up every field the kernels write before tuning runs ...
           void preTune() {
             Pmu.backup();
             P3.backup();
             Qmu.backup();
             newOprod.backup();
           }

           // ... and restore them afterwards.
           void postTune() {
             Pmu.restore();
             P3.restore();
             Qmu.restore();
             newOprod.restore();
           }

           // No flop count is reported for this kernel.
           long long flops() const { return 0; }
       };


     /**
      * Host-side launcher (a Tunable) for the do_lepage_middle_link_* kernels.
      *
      * Kernel selection performed by apply():
      *  - sizeof(RealA) == sizeof(float2) picks the "sp" kernels, anything else
      *    the "dp" kernels;
      *  - link.Reconstruct() == QUDA_RECONSTRUCT_NO picks the "18" kernels,
      *    anything else the "12" kernels;
      *  - the direction signs of sig and mu, and the low bit of the summed
      *    kparam.base_idx entries, are passed as template arguments.
      * RealB is not used by this class itself.
      *
      * All fields, the coefficient and kparam are stored as references; the
      * caller must keep them alive for as long as this object is used.
      */
     template<class RealA, class RealB>
       class LepageMiddleLink : public Tunable {

         private:
           const cudaGaugeField &link;
           const cudaGaugeField &oprod;
           const cudaGaugeField &Qprev;
           const int sig;
           const int mu;
           const typename RealTypeId<RealA>::Type &coeff;
           cudaGaugeField &P3; // write only
           cudaGaugeField &newOprod;
           const hisq_kernel_param_t &kparam;

           // Shared-memory requirements reported to the tuner: none.
           unsigned int sharedBytesPerThread() const { return 0; }
           unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

           // don't tune the grid dimension
           bool tuneGridDim() const { return false; }
           unsigned int minThreads() const { return kparam.threads; }

         public:
           LepageMiddleLink(const cudaGaugeField &link,
               const cudaGaugeField &oprod,
               const cudaGaugeField &Qprev,
               int sig, int mu,
               const typename RealTypeId<RealA>::Type &coeff,
               cudaGaugeField &P3, cudaGaugeField &newOprod,
               const hisq_kernel_param_t &kparam) :
             link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
             coeff(coeff), P3(P3), newOprod(newOprod), kparam(kparam)
         { ; }

           virtual ~LepageMiddleLink() { ; }

           // Tune-cache key: "D0xD1xD2xD3" volume string, the dynamic type name,
           // and an aux string holding threads, precision, reconstruct, sig, mu.
           TuneKey tuneKey() const {
             std::stringstream vol, aux;
             vol << kparam.D[0] << "x";
             vol << kparam.D[1] << "x";
             vol << kparam.D[2] << "x";
             vol << kparam.D[3];
             aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
             aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
             return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
           }

 // Launch configuration plus kernel argument list.  The launch goes to the
 // stream handed to apply() (stream 0 reproduces the old default-stream launch)
 // and requests no dynamic shared memory, matching sharedBytesPer*() above.
 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block, 0, stream>>>  \
           ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(),                \
            (typeA*)Qprev.Even_p(), (typeA*)Qprev.Odd_p(),                \
            (typeB*)link.Even_p(), (typeB*)link.Odd_p(),                  \
            sig, mu, coeff,                                               \
            (typeA*)P3.Even_p(), (typeA*)P3.Odd_p(),                      \
            (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(),          \
            kparam)

 // Launches `kernel` twice: fifth template argument 0, then 1 (presumably the
 // site parity -- the kernel definitions are outside this class).
 #define HISQ_LAUNCH_PARITY_PAIR(kernel, typeA, typeB, sig_sign, mu_sign, odd_change) \
           do {                                                                         \
             kernel<typeA, typeB, sig_sign, mu_sign, 0, odd_change> CALL_ARGUMENTS(typeA, typeB); \
             kernel<typeA, typeB, sig_sign, mu_sign, 1, odd_change> CALL_ARGUMENTS(typeA, typeB); \
           } while (0)

 // Picks the sp/dp and 18/12 kernel variant from RealA and `recon`.
 #define HISQ_LAUNCH_BY_PRECISION(sig_sign, mu_sign, odd_change)          \
           do {                                                            \
             if (sizeof(RealA) == sizeof(float2)) {                        \
               if (recon == QUDA_RECONSTRUCT_NO) {                         \
                 HISQ_LAUNCH_PARITY_PAIR(do_lepage_middle_link_sp_18_kernel, float2, float2, sig_sign, mu_sign, odd_change); \
               } else {                                                    \
                 HISQ_LAUNCH_PARITY_PAIR(do_lepage_middle_link_sp_12_kernel, float2, float4, sig_sign, mu_sign, odd_change); \
               }                                                           \
             } else {                                                      \
               if (recon == QUDA_RECONSTRUCT_NO) {                         \
                 HISQ_LAUNCH_PARITY_PAIR(do_lepage_middle_link_dp_18_kernel, double2, double2, sig_sign, mu_sign, odd_change); \
               } else {                                                    \
                 HISQ_LAUNCH_PARITY_PAIR(do_lepage_middle_link_dp_12_kernel, double2, double2, sig_sign, mu_sign, odd_change); \
               }                                                           \
             }                                                             \
           } while (0)

 // Turns the run-time oddness_change bit into the sixth template argument.
 #define CALL_MIDDLE_LINK_KERNEL(sig_sign, mu_sign)                        \
           do {                                                            \
             if (oddness_change == 0) {                                    \
               HISQ_LAUNCH_BY_PRECISION(sig_sign, mu_sign, 0);             \
             } else {                                                      \
               HISQ_LAUNCH_BY_PRECISION(sig_sign, mu_sign, 1);             \
             }                                                             \
           } while (0)

           // Launches the selected kernel pair on `stream` using the launch
           // parameters returned by tuneLaunch().
           void apply(const cudaStream_t &stream) {
             TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
             QudaReconstructType recon = link.Reconstruct();
             // Low bit of the sum of the four base_idx entries.
             int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
                 + kparam.base_idx[2] + kparam.base_idx[3])&1;

             if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
               CALL_MIDDLE_LINK_KERNEL(1,1);
             }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
               CALL_MIDDLE_LINK_KERNEL(1,0);
             }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
               CALL_MIDDLE_LINK_KERNEL(0,1);
             }else{
               CALL_MIDDLE_LINK_KERNEL(0,0);
             }
           }

 #undef CALL_MIDDLE_LINK_KERNEL
 #undef HISQ_LAUNCH_BY_PRECISION
 #undef HISQ_LAUNCH_PARITY_PAIR
 #undef CALL_ARGUMENTS

           // Tuning hooks: back up / restore the two non-const fields.
           void preTune() {
             P3.backup();
             newOprod.backup();
           }

           void postTune() {
             P3.restore();
             newOprod.restore();
           }

           // 810 flops per site when sig goes forwards, 396 otherwise.  The site
           // count is formed in long long so large lattices cannot overflow int.
           long long flops() const {
             const long long sites = static_cast<long long>(kparam.X[0])
               * kparam.X[1] * kparam.X[2] * kparam.X[3];
             return GOES_FORWARDS(sig) ? 810*sites : 396*sites;
           }
       };


     /**
      * Host-side launcher (a Tunable) for the do_side_link_* kernels.
      *
      * Kernel selection performed by apply():
      *  - sizeof(RealA) == sizeof(float2) picks the "sp" kernels, anything else
      *    the "dp" kernels;
      *  - link.Reconstruct() == QUDA_RECONSTRUCT_NO picks the "18" kernels,
      *    anything else the "12" kernels;
      *  - the direction signs of sig and mu, and the low bit of the summed
      *    kparam.base_idx entries, are passed as template arguments.
      * RealB is not used by this class itself.
      *
      * All fields, both coefficients and kparam are stored as references; the
      * caller must keep them alive for as long as this object is used.
      */
     template<class RealA, class RealB>
       class SideLink : public Tunable {

         private:
           const cudaGaugeField &link;
           const cudaGaugeField &P3;
           const cudaGaugeField &oprod;
           const int sig;
           const int mu;
           const typename RealTypeId<RealA>::Type &coeff;
           const typename RealTypeId<RealA>::Type &accumu_coeff;
           cudaGaugeField &shortP;
           cudaGaugeField &newOprod;
           const hisq_kernel_param_t &kparam;

           // Shared-memory requirements reported to the tuner: none.
           unsigned int sharedBytesPerThread() const { return 0; }
           unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

           // don't tune the grid dimension
           bool tuneGridDim() const { return false; }
           unsigned int minThreads() const { return kparam.threads; }

         public:
           SideLink(const cudaGaugeField &link,
               const cudaGaugeField &P3,
               const cudaGaugeField &oprod,
               int sig, int mu,
               const typename RealTypeId<RealA>::Type &coeff,
               const typename RealTypeId<RealA>::Type &accumu_coeff,
               cudaGaugeField &shortP,
               cudaGaugeField &newOprod,
               const hisq_kernel_param_t &kparam) :
             link(link), P3(P3), oprod(oprod),
             sig(sig), mu(mu), coeff(coeff), accumu_coeff(accumu_coeff),
             shortP(shortP), newOprod(newOprod), kparam(kparam)
         { ; }

           virtual ~SideLink() { ; }

           // Tune-cache key: "D0xD1xD2xD3" volume string, the dynamic type name,
           // and an aux string holding threads, precision, reconstruct, sig, mu.
           TuneKey tuneKey() const {
             std::stringstream vol, aux;
             vol << kparam.D[0] << "x";
             vol << kparam.D[1] << "x";
             vol << kparam.D[2] << "x";
             vol << kparam.D[3];
             aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
             aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
             return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
           }

 // Launch configuration plus kernel argument list.  The launch goes to the
 // stream handed to apply() (stream 0 reproduces the old default-stream launch)
 // and requests no dynamic shared memory, matching sharedBytesPer*() above.
 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block, 0, stream>>>  \
           ((typeA*)P3.Even_p(), (typeA*)P3.Odd_p(),                      \
            (typeA*)oprod.Even_p(),  (typeA*)oprod.Odd_p(),               \
            (typeB*)link.Even_p(), (typeB*)link.Odd_p(),                  \
            sig, mu,                                                      \
            coeff,                                                        \
            (typename RealTypeId<typeA>::Type) accumu_coeff,              \
            (typeA*)shortP.Even_p(), (typeA*)shortP.Odd_p(),              \
            (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(),          \
            kparam)

 // Launches `kernel` twice: fifth template argument 0, then 1 (presumably the
 // site parity -- the kernel definitions are outside this class).
 #define HISQ_LAUNCH_PARITY_PAIR(kernel, typeA, typeB, sig_sign, mu_sign, odd_change) \
           do {                                                                         \
             kernel<typeA, typeB, sig_sign, mu_sign, 0, odd_change> CALL_ARGUMENTS(typeA, typeB); \
             kernel<typeA, typeB, sig_sign, mu_sign, 1, odd_change> CALL_ARGUMENTS(typeA, typeB); \
           } while (0)

 // Picks the sp/dp and 18/12 kernel variant from RealA and `recon`.
 #define HISQ_LAUNCH_BY_PRECISION(sig_sign, mu_sign, odd_change)          \
           do {                                                            \
             if (sizeof(RealA) == sizeof(float2)) {                        \
               if (recon == QUDA_RECONSTRUCT_NO) {                         \
                 HISQ_LAUNCH_PARITY_PAIR(do_side_link_sp_18_kernel, float2, float2, sig_sign, mu_sign, odd_change); \
               } else {                                                    \
                 HISQ_LAUNCH_PARITY_PAIR(do_side_link_sp_12_kernel, float2, float4, sig_sign, mu_sign, odd_change); \
               }                                                           \
             } else {                                                      \
               if (recon == QUDA_RECONSTRUCT_NO) {                         \
                 HISQ_LAUNCH_PARITY_PAIR(do_side_link_dp_18_kernel, double2, double2, sig_sign, mu_sign, odd_change); \
               } else {                                                    \
                 HISQ_LAUNCH_PARITY_PAIR(do_side_link_dp_12_kernel, double2, double2, sig_sign, mu_sign, odd_change); \
               }                                                           \
             }                                                             \
           } while (0)

 // Turns the run-time oddness_change bit into the sixth template argument.
 #define CALL_SIDE_LINK_KERNEL(sig_sign, mu_sign)                          \
           do {                                                            \
             if (oddness_change == 0) {                                    \
               HISQ_LAUNCH_BY_PRECISION(sig_sign, mu_sign, 0);             \
             } else {                                                      \
               HISQ_LAUNCH_BY_PRECISION(sig_sign, mu_sign, 1);             \
             }                                                             \
           } while (0)

           // Launches the selected kernel pair on `stream` using the launch
           // parameters returned by tuneLaunch().
           void apply(const cudaStream_t &stream) {
             TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
             QudaReconstructType recon = link.Reconstruct();
             // Low bit of the sum of the four base_idx entries.
             int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
                 + kparam.base_idx[2] + kparam.base_idx[3])&1;

             if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
               CALL_SIDE_LINK_KERNEL(1,1);
             }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
               CALL_SIDE_LINK_KERNEL(1,0);
             }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
               CALL_SIDE_LINK_KERNEL(0,1);
             }else{
               CALL_SIDE_LINK_KERNEL(0,0);
             }
           }

 #undef CALL_SIDE_LINK_KERNEL
 #undef HISQ_LAUNCH_BY_PRECISION
 #undef HISQ_LAUNCH_PARITY_PAIR
 #undef CALL_ARGUMENTS

           // Tuning hooks: back up / restore the two non-const fields.
           void preTune() {
             shortP.backup();
             newOprod.backup();
           }

           void postTune() {
             shortP.restore();
             newOprod.restore();
           }

           // Reports a flop count of zero for this launcher.
           long long flops() const { return 0; }
       };


     /**
      * Host-side launcher (a Tunable) for the do_side_link_short_* kernels.
      *
      * Kernel selection performed by apply():
      *  - sizeof(RealA) == sizeof(float2) picks the "sp" kernels, anything else
      *    the "dp" kernels;
      *  - link.Reconstruct() == QUDA_RECONSTRUCT_NO picks the "18" kernels,
      *    anything else the "12" kernels;
      *  - the direction signs of sig and mu, and the low bit of the summed
      *    kparam.base_idx entries, are passed as template arguments.
      * RealB is not used by this class itself.
      *
      * All fields, the coefficient and kparam are stored as references; the
      * caller must keep them alive for as long as this object is used.
      */
     template<class RealA, class RealB>
       class SideLinkShort : public Tunable {

         private:
           const cudaGaugeField &link;
           const cudaGaugeField &P3;
           const int sig;
           const int mu;
           const typename RealTypeId<RealA>::Type &coeff;
           cudaGaugeField &newOprod;
           const hisq_kernel_param_t &kparam;

           // Shared-memory requirements reported to the tuner: none.
           unsigned int sharedBytesPerThread() const { return 0; }
           unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

           // don't tune the grid dimension
           bool tuneGridDim() const { return false; }
           unsigned int minThreads() const { return kparam.threads; }

         public:
           SideLinkShort(const cudaGaugeField &link, const cudaGaugeField &P3, int sig, int mu,
               const typename RealTypeId<RealA>::Type &coeff, cudaGaugeField &newOprod,
               const hisq_kernel_param_t &kparam) :
             link(link), P3(P3), sig(sig), mu(mu), coeff(coeff), newOprod(newOprod), kparam(kparam)
         { ; }

           virtual ~SideLinkShort() { ; }

           // Tune-cache key: "D0xD1xD2xD3" volume string, the dynamic type name,
           // and an aux string holding threads, precision, reconstruct, sig, mu.
           TuneKey tuneKey() const {
             std::stringstream vol, aux;
             vol << kparam.D[0] << "x";
             vol << kparam.D[1] << "x";
             vol << kparam.D[2] << "x";
             vol << kparam.D[3];
             aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
             aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
             return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
           }

 // Launch configuration plus kernel argument list.  The launch goes to the
 // stream handed to apply() (stream 0 reproduces the old default-stream launch)
 // and requests no dynamic shared memory, matching sharedBytesPer*() above.
 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block, 0, stream>>>  \
           ((typeA*)P3.Even_p(), (typeA*)P3.Odd_p(),                      \
            (typeB*)link.Even_p(), (typeB*)link.Odd_p(),                  \
            sig, mu,     (typename RealTypeId<typeA>::Type) coeff,        \
            (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)

 // Launches `kernel` twice: fifth template argument 0, then 1 (presumably the
 // site parity -- the kernel definitions are outside this class).
 #define HISQ_LAUNCH_PARITY_PAIR(kernel, typeA, typeB, sig_sign, mu_sign, odd_change) \
           do {                                                                         \
             kernel<typeA, typeB, sig_sign, mu_sign, 0, odd_change> CALL_ARGUMENTS(typeA, typeB); \
             kernel<typeA, typeB, sig_sign, mu_sign, 1, odd_change> CALL_ARGUMENTS(typeA, typeB); \
           } while (0)

 // Picks the sp/dp and 18/12 kernel variant from RealA and `recon`.
 #define HISQ_LAUNCH_BY_PRECISION(sig_sign, mu_sign, odd_change)          \
           do {                                                            \
             if (sizeof(RealA) == sizeof(float2)) {                        \
               if (recon == QUDA_RECONSTRUCT_NO) {                         \
                 HISQ_LAUNCH_PARITY_PAIR(do_side_link_short_sp_18_kernel, float2, float2, sig_sign, mu_sign, odd_change); \
               } else {                                                    \
                 HISQ_LAUNCH_PARITY_PAIR(do_side_link_short_sp_12_kernel, float2, float4, sig_sign, mu_sign, odd_change); \
               }                                                           \
             } else {                                                      \
               if (recon == QUDA_RECONSTRUCT_NO) {                         \
                 HISQ_LAUNCH_PARITY_PAIR(do_side_link_short_dp_18_kernel, double2, double2, sig_sign, mu_sign, odd_change); \
               } else {                                                    \
                 HISQ_LAUNCH_PARITY_PAIR(do_side_link_short_dp_12_kernel, double2, double2, sig_sign, mu_sign, odd_change); \
               }                                                           \
             }                                                             \
           } while (0)

 // Turns the run-time oddness_change bit into the sixth template argument.
 #define CALL_SIDE_LINK_KERNEL(sig_sign, mu_sign)                          \
           do {                                                            \
             if (oddness_change == 0) {                                    \
               HISQ_LAUNCH_BY_PRECISION(sig_sign, mu_sign, 0);             \
             } else {                                                      \
               HISQ_LAUNCH_BY_PRECISION(sig_sign, mu_sign, 1);             \
             }                                                             \
           } while (0)

           // Launches the selected kernel pair on `stream` using the launch
           // parameters returned by tuneLaunch().
           void apply(const cudaStream_t &stream) {
             TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
             QudaReconstructType recon = link.Reconstruct();
             // Low bit of the sum of the four base_idx entries.
             int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
                 + kparam.base_idx[2] + kparam.base_idx[3])&1;

             if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
               CALL_SIDE_LINK_KERNEL(1,1);
             }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
               CALL_SIDE_LINK_KERNEL(1,0);
             }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
               CALL_SIDE_LINK_KERNEL(0,1);
             }else{
               CALL_SIDE_LINK_KERNEL(0,0);
             }
           }

 #undef CALL_SIDE_LINK_KERNEL
 #undef HISQ_LAUNCH_BY_PRECISION
 #undef HISQ_LAUNCH_PARITY_PAIR
 #undef CALL_ARGUMENTS

           // Tuning hooks: back up / restore the only non-const field.
           void preTune() {
             newOprod.backup();
           }

           void postTune() {
             newOprod.restore();
           }

           // Reports a flop count of zero for this launcher.
           long long flops() const { return 0; }
       };


     /**
      * Host-side launcher (a Tunable) for the do_all_link_* kernels.
      *
      * Kernel selection performed by apply():
      *  - sizeof(RealA) == sizeof(float2) picks the "sp" kernels, anything else
      *    the "dp" kernels;
      *  - link.Reconstruct() == QUDA_RECONSTRUCT_NO picks the "18" kernels,
      *    anything else the "12" kernels;
      *  - the direction signs of sig and mu, and the low bit of the summed
      *    kparam.base_idx entries, are passed as template arguments.
      * RealB is not used by this class itself.
      *
      * All fields, both coefficients and kparam are stored as references; the
      * caller must keep them alive for as long as this object is used.
      */
     template<class RealA, class RealB>
       class AllLink : public Tunable {

         private:
           const cudaGaugeField &link;
           const cudaGaugeField &oprod;
           const cudaGaugeField &Qprev;
           const int sig;
           const int mu;
           const typename RealTypeId<RealA>::Type &coeff;
           const typename RealTypeId<RealA>::Type &accumu_coeff;
           cudaGaugeField &shortP;
           cudaGaugeField &newOprod;
           const hisq_kernel_param_t &kparam;

           // Shared-memory requirements reported to the tuner: none.
           unsigned int sharedBytesPerThread() const { return 0; }
           unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

           // don't tune the grid dimension
           bool tuneGridDim() const { return false; }
           unsigned int minThreads() const { return kparam.threads; }

         public:
           AllLink(const cudaGaugeField &link,
               const cudaGaugeField &oprod,
               const cudaGaugeField &Qprev,
               int sig, int mu,
               const typename RealTypeId<RealA>::Type &coeff,
               const typename RealTypeId<RealA>::Type &accumu_coeff,
               cudaGaugeField &shortP, cudaGaugeField &newOprod,
               const hisq_kernel_param_t &kparam) :
             link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
             coeff(coeff), accumu_coeff(accumu_coeff), shortP(shortP),
             newOprod(newOprod), kparam(kparam)
         { ; }

           virtual ~AllLink() { ; }

           // Tune-cache key: "D0xD1xD2xD3" volume string, the dynamic type name,
           // and an aux string holding threads, precision, reconstruct, sig, mu.
           TuneKey tuneKey() const {
             std::stringstream vol, aux;
             vol << kparam.D[0] << "x";
             vol << kparam.D[1] << "x";
             vol << kparam.D[2] << "x";
             vol << kparam.D[3];
             aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
             aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
             return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
           }

 // Launch configuration plus kernel argument list.  The launch goes to the
 // stream handed to apply() (stream 0 reproduces the old default-stream launch)
 // and requests no dynamic shared memory, matching sharedBytesPer*() above.
 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block, 0, stream>>>  \
           ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(),                \
            (typeA*)Qprev.Even_p(), (typeA*)Qprev.Odd_p(),                \
            (typeB*)link.Even_p(), (typeB*)link.Odd_p(), sig,  mu,        \
            (typename RealTypeId<typeA>::Type)coeff,                      \
            (typename RealTypeId<typeA>::Type)accumu_coeff,               \
            (typeA*)shortP.Even_p(),(typeA*)shortP.Odd_p(),               \
            (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)

 // Launches `kernel` twice: fifth template argument 0, then 1 (presumably the
 // site parity -- the kernel definitions are outside this class).
 #define HISQ_LAUNCH_PARITY_PAIR(kernel, typeA, typeB, sig_sign, mu_sign, odd_change) \
           do {                                                                         \
             kernel<typeA, typeB, sig_sign, mu_sign, 0, odd_change> CALL_ARGUMENTS(typeA, typeB); \
             kernel<typeA, typeB, sig_sign, mu_sign, 1, odd_change> CALL_ARGUMENTS(typeA, typeB); \
           } while (0)

 // Picks the sp/dp and 18/12 kernel variant from RealA and `recon`.
 #define HISQ_LAUNCH_BY_PRECISION(sig_sign, mu_sign, odd_change)          \
           do {                                                            \
             if (sizeof(RealA) == sizeof(float2)) {                        \
               if (recon == QUDA_RECONSTRUCT_NO) {                         \
                 HISQ_LAUNCH_PARITY_PAIR(do_all_link_sp_18_kernel, float2, float2, sig_sign, mu_sign, odd_change); \
               } else {                                                    \
                 HISQ_LAUNCH_PARITY_PAIR(do_all_link_sp_12_kernel, float2, float4, sig_sign, mu_sign, odd_change); \
               }                                                           \
             } else {                                                      \
               if (recon == QUDA_RECONSTRUCT_NO) {                         \
                 HISQ_LAUNCH_PARITY_PAIR(do_all_link_dp_18_kernel, double2, double2, sig_sign, mu_sign, odd_change); \
               } else {                                                    \
                 HISQ_LAUNCH_PARITY_PAIR(do_all_link_dp_12_kernel, double2, double2, sig_sign, mu_sign, odd_change); \
               }                                                           \
             }                                                             \
           } while (0)

 // Turns the run-time oddness_change bit into the sixth template argument.
 #define CALL_ALL_LINK_KERNEL(sig_sign, mu_sign)                           \
           do {                                                            \
             if (oddness_change == 0) {                                    \
               HISQ_LAUNCH_BY_PRECISION(sig_sign, mu_sign, 0);             \
             } else {                                                      \
               HISQ_LAUNCH_BY_PRECISION(sig_sign, mu_sign, 1);             \
             }                                                             \
           } while (0)

           // Launches the selected kernel pair on `stream` using the launch
           // parameters returned by tuneLaunch().
           void apply(const cudaStream_t &stream) {
             TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
             QudaReconstructType recon = link.Reconstruct();
             // Low bit of the sum of the four base_idx entries.
             int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
                 + kparam.base_idx[2] + kparam.base_idx[3])&1;

             if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
               CALL_ALL_LINK_KERNEL(1, 1);
             }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
               CALL_ALL_LINK_KERNEL(1, 0);
             }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
               CALL_ALL_LINK_KERNEL(0, 1);
             }else{
               CALL_ALL_LINK_KERNEL(0, 0);
             }
           }

 #undef CALL_ALL_LINK_KERNEL
 #undef HISQ_LAUNCH_BY_PRECISION
 #undef HISQ_LAUNCH_PARITY_PAIR
 #undef CALL_ARGUMENTS

           // Tuning hooks: back up / restore the two non-const fields.
           void preTune() {
             shortP.backup();
             newOprod.backup();
           }

           void postTune() {
             shortP.restore();
             newOprod.restore();
           }

           // After the base-class setup, size a 1-D grid so that
           // grid.x * block.x covers kparam.threads (ceiling division).
           virtual void initTuneParam(TuneParam &param) const
           {
             Tunable::initTuneParam(param);
             param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
           }

           // Same grid sizing as initTuneParam(), applied to the default parameters.
           void defaultTuneParam(TuneParam &param) const
           {
             Tunable::defaultTuneParam(param);
             param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
           }

           // 1242 flops per site when sig goes forwards, 828 otherwise.  The site
           // count is formed in long long so large lattices cannot overflow int.
           long long flops() const {
             const long long sites = static_cast<long long>(kparam.X[0])
               * kparam.X[1] * kparam.X[2] * kparam.X[3];
             return GOES_FORWARDS(sig) ? sites*1242 : sites*828;
           }
       };


     template<class RealA, class RealB>

       class OneLinkTerm : public Tunable {


         private:

           const cudaGaugeField &oprod;

           const typename RealTypeId<RealA>::Type &coeff;

           cudaGaugeField &ForceMatrix;

           int X[4];

           hisq_kernel_param_t kparam;


           unsigned int sharedBytesPerThread() const { return 0; }

           unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }


           // don't tune the grid dimension

           bool tuneGridDim() const { return false; }

           unsigned int minThreads() const { return X[0]*X[1]*X[2]*X[3]/2; }


         public:

           OneLinkTerm(const cudaGaugeField &oprod,

               const typename RealTypeId<RealA>::Type &coeff,

               cudaGaugeField &ForceMatrix, const QudaGaugeParam& param) :

             oprod(oprod), coeff(coeff), ForceMatrix(ForceMatrix)

         {

           for(int dir=0; dir<4; ++dir) X[dir] = param.X[dir];


           kparam.threads = X[0]*X[1]*X[2]*X[3]/2;

           for(int dir=0; dir<4; ++dir){

             kparam.X[dir] = X[dir];

           }

           kparam.setStride(param);

         }


           virtual ~OneLinkTerm() { ; }


           TuneKey tuneKey() const {

             std::stringstream vol, aux;

             vol << X[0] << "x";

             vol << X[1] << "x";

             vol << X[2] << "x";

             vol << X[3];

             int threads = X[0]*X[1]*X[2]*X[3]/2;

             aux << "threads=" << threads << ",prec=" << oprod.Precision();

             aux << ",coeff=" << coeff;

             return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());

           }


           /**
            * Launches do_one_link_term_kernel twice, first with the second
            * template argument 0 and then with 1 (presumably the two
            * checkerboard parities -- TODO confirm against the kernel).
            *
            * The launch geometry comes from tuneLaunch(). Both kernels are
            * issued on @p stream; previously the argument was ignored and the
            * launches always went to the implicit default stream. Callers that
            * pass 0 see no change.
            *
            * @param stream  CUDA stream on which the two kernels are enqueued
            */
           void apply(const cudaStream_t &stream) {
             TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

             do_one_link_term_kernel<RealA,0><<<tp.grid, tp.block, 0, stream>>>(
                 static_cast<const RealA*>(oprod.Even_p()),
                 static_cast<const RealA*>(oprod.Odd_p()),
                 coeff,
                 static_cast<RealA*>(ForceMatrix.Even_p()),
                 static_cast<RealA*>(ForceMatrix.Odd_p()),
                 kparam);

             do_one_link_term_kernel<RealA,1><<<tp.grid, tp.block, 0, stream>>>(
                 static_cast<const RealA*>(oprod.Even_p()),
                 static_cast<const RealA*>(oprod.Odd_p()),
                 coeff,
                 static_cast<RealA*>(ForceMatrix.Even_p()),
                 static_cast<RealA*>(ForceMatrix.Odd_p()),
                 kparam);
           }


           /**
            * Calls ForceMatrix.backup(). Presumably invoked by the autotuner
            * before its trial launches -- TODO confirm in tune_quda.h.
            */
           void preTune() { ForceMatrix.backup(); }


           /**
            * Calls ForceMatrix.restore(). Presumably invoked by the autotuner
            * after its trial launches -- TODO confirm in tune_quda.h.
            */
           void postTune() { ForceMatrix.restore(); }


           /**
            * Returns 72 times the product of the four extents in kparam.X.
            *
            * The 72LL literal makes the whole product 64-bit. Evaluating it in
            * int, as before, overflows once 72*volume exceeds INT_MAX, before
            * the result is widened to the long long return type.
            */
           long long flops() const {
             return 72LL*kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3];
           }

       };


     /**
      * Tunable launcher for the do_longlink_* kernels.
      *
      * apply() selects the kernel from sizeof(RealA) (float2 vs. anything else)
      * and from link.Reconstruct() (QUDA_RECONSTRUCT_NO vs. anything else), and
      * launches it twice with the last template argument 0 and then 1
      * (presumably the two checkerboard parities -- TODO confirm).
      *
      * The object keeps references to the fields and to the kernel parameter
      * block, so all of them must outlive it.
      */
     template<class RealA, class RealB>
       class LongLinkTerm : public Tunable {

         private:
           const cudaGaugeField &link;
           const cudaGaugeField &naikOprod;
           const typename RealTypeId<RealA>::Type naik_coeff;
           cudaGaugeField &output;
           int X[4];
           const hisq_kernel_param_t &kparam;

           unsigned int sharedBytesPerThread() const { return 0; }
           unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

           // don't tune the grid dimension
           bool tuneGridDim() const { return false; }
           unsigned int minThreads() const { return X[0]*X[1]*X[2]*X[3]/2; }

         public:
           /**
            * @param link        gauge links read by the kernel
            * @param naikOprod   input field read by the kernel
            * @param naik_coeff  coefficient forwarded to the kernel (stored by value)
            * @param output      field the kernel writes to
            * @param kparam      kernel parameters; kparam.X supplies the extents
            */
           LongLinkTerm(const cudaGaugeField &link, const cudaGaugeField &naikOprod,
               const typename RealTypeId<RealA>::Type &naik_coeff,
               cudaGaugeField &output, const hisq_kernel_param_t &kparam) :
             link(link), naikOprod(naikOprod),  naik_coeff(naik_coeff), output(output),
             kparam(kparam)
         { for(int dir=0; dir<4; ++dir) X[dir] = kparam.X[dir]; }

           virtual ~LongLinkTerm() { ; }

           /**
            * Key for the tuning cache: volume string, dynamic type name, and an
            * auxiliary string with thread count, precision and reconstruct
            * type. The reconstruct type is included because apply() launches a
            * different kernel for each value, and they should not share a
            * cached launch configuration.
            */
           TuneKey tuneKey() const {
             std::stringstream vol, aux;
             vol << X[0] << "x";
             vol << X[1] << "x";
             vol << X[2] << "x";
             vol << X[3];
             int threads = X[0]*X[1]*X[2]*X[3]/2;
             aux << "threads=" << threads << ",prec=" << link.Precision();
             aux << ",recon=" << link.Reconstruct();
             return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
           }

           // Launch configuration plus argument list shared by every kernel variant.
           // The launch goes to the stream handed to apply().
 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block, 0, stream>>>  \
           ((typeB*)link.Even_p(), (typeB*)link.Odd_p(),                 \
            (typeA*)naikOprod.Even_p(), (typeA*)naikOprod.Odd_p(),       \
            naik_coeff,                                                  \
            (typeA*)output.Even_p(), (typeA*)output.Odd_p(),             \
            kparam)

           /**
            * Launches the kernel pair matching the precision and reconstruct
            * type on @p stream. Previously the stream argument was ignored;
            * callers passing 0 see no change.
            */
           void apply(const cudaStream_t &stream) {
             TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
             QudaReconstructType recon = link.Reconstruct();

             if(sizeof(RealA) == sizeof(float2)){
               if(recon == QUDA_RECONSTRUCT_NO){
                 do_longlink_sp_18_kernel<float2,float2, 0> CALL_ARGUMENTS(float2, float2);
                 do_longlink_sp_18_kernel<float2,float2, 1> CALL_ARGUMENTS(float2, float2);
               }else{
                 do_longlink_sp_12_kernel<float2,float4, 0> CALL_ARGUMENTS(float2, float4);
                 do_longlink_sp_12_kernel<float2,float4, 1> CALL_ARGUMENTS(float2, float4);
               }
             }else{
               if(recon == QUDA_RECONSTRUCT_NO){
                 do_longlink_dp_18_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
                 do_longlink_dp_18_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
               }else{
                 do_longlink_dp_12_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
                 do_longlink_dp_12_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
               }
             }
           }

 #undef CALL_ARGUMENTS

           // Calls output.backup(); presumably run by the autotuner before trial launches (TODO confirm).
           void preTune() {
             output.backup();
           }

           // Calls output.restore(); presumably run by the autotuner after trial launches (TODO confirm).
           void postTune() {
             output.restore();
           }

           /**
            * Returns 4968 times the product of the extents in kparam.X. The
            * 4968LL literal keeps the product in 64-bit arithmetic; in int it
            * overflows once the volume exceeds roughly 432k sites.
            */
           long long flops() const { return 4968LL*kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3]; }
       };


     /**
      * Tunable launcher for the do_complete_force_* kernels.
      *
      * apply() selects the kernel from sizeof(RealA) (float2 vs. anything else)
      * and from link.Reconstruct() (QUDA_RECONSTRUCT_NO vs. anything else), and
      * launches it twice with the last template argument 0 and then 1
      * (presumably the two checkerboard parities -- TODO confirm).
      *
      * The object keeps references to link, oprod and mom, so they must
      * outlive it. The kernel parameter block is owned by value.
      */
     template<class RealA, class RealB>
       class CompleteForce : public Tunable {

         private:
           const cudaGaugeField &link;
           const cudaGaugeField &oprod;
           cudaGaugeField &mom;
           int X[4];
           hisq_kernel_param_t kparam;

           unsigned int sharedBytesPerThread() const { return 0; }
           unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

           // don't tune the grid dimension
           bool tuneGridDim() const { return false; }
           unsigned int minThreads() const { return X[0]*X[1]*X[2]*X[3]/2; }

         public:
           /**
            * @param link   gauge links read by the kernel
            * @param oprod  input field read by the kernel
            * @param mom    field the kernel writes to
            * @param param  gauge parameters; param.X supplies the extents and
            *               the struct is handed to kparam.setStride()
            */
           CompleteForce(const cudaGaugeField &link, const cudaGaugeField &oprod,
              cudaGaugeField &mom, const QudaGaugeParam &param) :
             link(link), oprod(oprod), mom(mom)
         {
           for(int dir=0; dir<4; ++dir){
             X[dir] = param.X[dir];
             kparam.X[dir] = X[dir];
           }
           kparam.threads = X[0]*X[1]*X[2]*X[3]/2;
           kparam.setStride(param);
         }

           virtual ~CompleteForce() { ; }

           /**
            * Key for the tuning cache: volume string, dynamic type name, and an
            * auxiliary string with thread count, precision and reconstruct
            * type. The reconstruct type is included because apply() launches a
            * different kernel for each value, and they should not share a
            * cached launch configuration.
            */
           TuneKey tuneKey() const {
             std::stringstream vol, aux;
             vol << X[0] << "x";
             vol << X[1] << "x";
             vol << X[2] << "x";
             vol << X[3];
             int threads = X[0]*X[1]*X[2]*X[3]/2;
             aux << "threads=" << threads << ",prec=" << link.Precision();
             aux << ",recon=" << link.Reconstruct();
             return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
           }

           // Launch configuration plus argument list shared by every kernel variant.
           // The launch goes to the stream handed to apply().
 #define CALL_ARGUMENTS(typeA, typeB)  <<<tp.grid, tp.block, 0, stream>>> \
           ((typeB*)link.Even_p(), (typeB*)link.Odd_p(),                 \
            (typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(),               \
            (typeA*)mom.Even_p(), (typeA*)mom.Odd_p(),                   \
            kparam)

           /**
            * Launches the kernel pair matching the precision and reconstruct
            * type on @p stream. Previously the stream argument was ignored;
            * callers passing 0 see no change.
            */
           void apply(const cudaStream_t &stream) {
             TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
             QudaReconstructType recon = link.Reconstruct();

             if(sizeof(RealA) == sizeof(float2)){
               if(recon == QUDA_RECONSTRUCT_NO){
                 do_complete_force_sp_18_kernel<float2,float2, 0> CALL_ARGUMENTS(float2, float2);
                 do_complete_force_sp_18_kernel<float2,float2, 1> CALL_ARGUMENTS(float2, float2);
               }else{
                 do_complete_force_sp_12_kernel<float2,float4, 0> CALL_ARGUMENTS(float2, float4);
                 do_complete_force_sp_12_kernel<float2,float4, 1> CALL_ARGUMENTS(float2, float4);
               }
             }else{
               if(recon == QUDA_RECONSTRUCT_NO){
                 do_complete_force_dp_18_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
                 do_complete_force_dp_18_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
               }else{
                 do_complete_force_dp_12_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
                 do_complete_force_dp_12_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
               }
             }
           }

 #undef CALL_ARGUMENTS

           // Calls mom.backup(); presumably run by the autotuner before trial launches (TODO confirm).
           void preTune() {
             mom.backup();
           }

           // Calls mom.restore(); presumably run by the autotuner after trial launches (TODO confirm).
           void postTune() {
             mom.restore();
           }

           /**
            * Returns 792 times the product of the extents in kparam.X. The
            * 792LL literal keeps the product in 64-bit arithmetic; in int it
            * overflows for volumes above roughly 2.7M sites.
            */
           long long flops() const {
             return 792LL*kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3];
           }
       };


     /**
      * Binds the even and odd halves of link and newOprod to the file's texture
      * references. Each half is bound with a size of Bytes()/2.
      *
      * The texture set is chosen from link.Precision() and, in the non-double
      * case, from link.Reconstruct(). The newOprod textures follow the
      * precision of link, not of newOprod.
      */
     static void
       bind_tex_link(const cudaGaugeField& link, const cudaGaugeField& newOprod)
       {
         const size_t linkHalf  = link.Bytes()/2;
         const size_t oprodHalf = newOprod.Bytes()/2;

         if (link.Precision() == QUDA_DOUBLE_PRECISION) {
           cudaBindTexture(0, thinLink0TexDouble, link.Even_p(), linkHalf);
           cudaBindTexture(0, thinLink1TexDouble, link.Odd_p(),  linkHalf);
           cudaBindTexture(0, newOprod0TexDouble, newOprod.Even_p(), oprodHalf);
           cudaBindTexture(0, newOprod1TexDouble, newOprod.Odd_p(),  oprodHalf);
           return;
         }

         // Non-double path: the link texture depends on the reconstruct type.
         if (link.Reconstruct() == QUDA_RECONSTRUCT_NO) {
           cudaBindTexture(0, thinLink0TexSingle, link.Even_p(), linkHalf);
           cudaBindTexture(0, thinLink1TexSingle, link.Odd_p(),  linkHalf);
         } else {
           cudaBindTexture(0, thinLink0TexSingle_recon, link.Even_p(), linkHalf);
           cudaBindTexture(0, thinLink1TexSingle_recon, link.Odd_p(),  linkHalf);
         }
         cudaBindTexture(0, newOprod0TexSingle, newOprod.Even_p(), oprodHalf);
         cudaBindTexture(0, newOprod1TexSingle, newOprod.Odd_p(),  oprodHalf);
       }


     /**
      * Unbinds the textures that bind_tex_link() binds for the same link field.
      * The selection uses link.Precision() and link.Reconstruct() only;
      * newOprod is accepted for symmetry and is not read.
      */
     static void
       unbind_tex_link(const cudaGaugeField& link, const cudaGaugeField& newOprod)
       {
         if (link.Precision() == QUDA_DOUBLE_PRECISION) {
           cudaUnbindTexture(thinLink0TexDouble);
           cudaUnbindTexture(thinLink1TexDouble);
           cudaUnbindTexture(newOprod0TexDouble);
           cudaUnbindTexture(newOprod1TexDouble);
           return;
         }

         // Non-double path: the link texture depends on the reconstruct type.
         if (link.Reconstruct() == QUDA_RECONSTRUCT_NO) {
           cudaUnbindTexture(thinLink0TexSingle);
           cudaUnbindTexture(thinLink1TexSingle);
         } else {
           cudaUnbindTexture(thinLink0TexSingle_recon);
           cudaUnbindTexture(thinLink1TexSingle_recon);
         }
         cudaUnbindTexture(newOprod0TexSingle);
         cudaUnbindTexture(newOprod1TexSingle);
       }


     /**
      * Runs the one-link term and then, for every ordered set of distinct
      * directions (sig, mu, nu, rho) out of the eight signed directions, the
      * middle-link / all-link / side-link kernels in a fixed order.
      *
      * Pmu, P3, P5, Pnumu, Qmu and Qnumu are scratch fields passed between the
      * kernels; newOprod receives the result. Every launch is followed by
      * checkCudaError().
      *
      * Two kernel parameter blocks are used. In MULTI_GPU builds kparam_1g
      * enlarges each partitioned extent by 2 and kparam_2g by 4; otherwise
      * both describe the plain local lattice.
      */
     template<class Real, class RealA, class RealB>
       static void
       do_hisq_staples_force_cuda( PathCoefficients<Real> act_path_coeff,
           const QudaGaugeParam& param,
           const cudaGaugeField &oprod,
           const cudaGaugeField &link,
           cudaGaugeField &Pmu,
           cudaGaugeField &P3,
           cudaGaugeField &P5,
           cudaGaugeField &Pnumu,
           cudaGaugeField &Qmu,
           cudaGaugeField &Qnumu,
           cudaGaugeField &newOprod)
       {
         // Path coefficients and their negated counterparts.
         Real coeffOne    = act_path_coeff.one;
         Real coeffThree  = act_path_coeff.three;
         Real coeffFive   = act_path_coeff.five;
         Real coeffSeven  = act_path_coeff.seven;
         Real coeffLepage = act_path_coeff.lepage;
         Real negThree    = -coeffThree;
         Real negFive     = -coeffFive;
         Real negLepage   = -coeffLepage;

         // Accumulation ratios; each is zero when its denominator vanishes.
         Real sevenOverFive = 0;
         if (coeffFive != 0) sevenOverFive = coeffSeven/coeffFive;
         Real fiveOverThree = 0;
         if (coeffThree != 0) fiveOverThree = coeffFive/coeffThree;
         Real lepageOverThree = 0;
         if (coeffThree != 0) lepageOverThree = coeffLepage/coeffThree;

         OneLinkTerm<RealA, RealB> oneLinkTerm(oprod, coeffOne, newOprod, param);
         oneLinkTerm.apply(0);
         checkCudaError();

         int ghostDim[4]={
           commDimPartitioned(0),
           commDimPartitioned(1),
           commDimPartitioned(2),
           commDimPartitioned(3)
         };

         hisq_kernel_param_t kparam_1g, kparam_2g;
         for (int d = 0; d < 4; ++d) {
           kparam_1g.X[d] = param.X[d];
           kparam_2g.X[d] = param.X[d];
         }
         kparam_1g.setStride(param);
         kparam_2g.setStride(param);

 #ifdef MULTI_GPU
         for (int d = 0; d < 4; ++d) {
           // Partitioned dimensions get a border of 1 (1g) or 2 (2g) sites per side.
           kparam_1g.D[d]        = ghostDim[d] ? (param.X[d]+2) : (param.X[d]);
           kparam_1g.base_idx[d] = ghostDim[d] ? 1 : 2;
           kparam_2g.D[d]        = ghostDim[d] ? (param.X[d]+4) : (param.X[d]);
           kparam_2g.base_idx[d] = ghostDim[d] ? 0 : 2;
           kparam_1g.ghostDim[d] = ghostDim[d];
           kparam_2g.ghostDim[d] = ghostDim[d];
         }
         kparam_1g.D1h = kparam_1g.D[0]/2;
         kparam_2g.D1h = kparam_2g.D[0]/2;
         kparam_1g.threads = kparam_1g.D[0]*kparam_1g.D[1]*kparam_1g.D[2]*kparam_1g.D[3]/2;
         kparam_2g.threads = kparam_2g.D[0]*kparam_2g.D[1]*kparam_2g.D[2]*kparam_2g.D[3]/2;
 #else
         for (int d = 0; d < 4; ++d) {
           kparam_1g.D[d]        = kparam_2g.D[d]        = param.X[d];
           kparam_1g.base_idx[d] = kparam_2g.base_idx[d] = 0;
           kparam_1g.ghostDim[d] = kparam_2g.ghostDim[d] = 0;
         }
         kparam_1g.D1h     = kparam_2g.D1h     = param.X[0]/2;
         kparam_1g.threads = kparam_2g.threads = param.X[0]*param.X[1]*param.X[2]*param.X[3]/2;
 #endif

         for (int sig = 0; sig < 8; sig++) {
           for (int mu = 0; mu < 8; mu++) {
             const bool muOnSigAxis = (mu == sig) || (mu == OPP_DIR(sig));
             if (muOnSigAxis) continue;

             // 3-link, kernel A: middle link
             MiddleLink<RealA,RealB> threeLinkMiddle( link, oprod,  // read only
                 sig, mu, negThree,
                 Pmu, P3, Qmu, // write only
                 newOprod, kparam_2g);
             threeLinkMiddle.apply(0);
             checkCudaError();

             for (int nu = 0; nu < 8; nu++) {
               const bool nuTaken = (nu == sig) || (nu == OPP_DIR(sig))
                                 || (nu == mu)  || (nu == OPP_DIR(mu));
               if (nuTaken) continue;

               // 5-link, kernel B: middle link
               MiddleLink<RealA,RealB> fiveLinkMiddle( link, Pmu, Qmu, // read only
                   sig, nu, coeffFive,
                   Pnumu, P5, Qnumu, // write only
                   newOprod, kparam_1g);
               fiveLinkMiddle.apply(0);
               checkCudaError();

               for (int rho = 0; rho < 8; rho++) {
                 const bool rhoTaken = (rho == sig) || (rho == OPP_DIR(sig))
                                    || (rho == mu)  || (rho == OPP_DIR(mu))
                                    || (rho == nu)  || (rho == OPP_DIR(nu));
                 if (rhoTaken) continue;

                 // 7-link: middle link and side link
                 AllLink<RealA,RealB> sevenLinkAll(link, Pnumu, Qnumu, sig, rho, coeffSeven, sevenOverFive,
                     P5, newOprod, kparam_1g);
                 sevenLinkAll.apply(0);
                 checkCudaError();
               } // rho

               // 5-link: side link
               SideLink<RealA,RealB> fiveLinkSide(link, P5, Qmu, // read only
                   sig, nu, negFive, fiveOverThree,
                   P3, // write only
                   newOprod, kparam_1g);
               fiveLinkSide.apply(0);
               checkCudaError();
             } // nu

             // Lepage term, skipped entirely when its coefficient is zero
             if (coeffLepage != 0.) {
               LepageMiddleLink<RealA,RealB>
                 lepageMiddle( link, Pmu, Qmu, // read only
                     sig, mu, coeffLepage,
                     P5, // write only
                     newOprod, kparam_2g);
               lepageMiddle.apply(0);
               checkCudaError();

               SideLink<RealA, RealB> lepageSide(link, P5, Qmu, // read only
                   sig, mu, negLepage, lepageOverThree,
                   P3, // write only
                   newOprod, kparam_2g);
               lepageSide.apply(0);
               checkCudaError();
             } // Lepage != 0.0

             // 3-link: side link
             SideLinkShort<RealA,RealB> threeLinkSide(link, P3, // read only
                 sig, mu, coeffThree,
                 newOprod, kparam_1g);
             threeLinkSide.apply(0);
             checkCudaError();
           } // mu
         } // sig
       } // do_hisq_staples_force_cuda


 #undef Pmu

 #undef Pnumu

 #undef P3

 #undef P5

 #undef Qmu

 #undef Qnumu


     /**
      * Binds the link/oprod textures, runs CompleteForce in the precision
      * selected by param.cuda_prec, then unbinds the textures.
      *
      * @param param  gauge parameters; cuda_prec selects double or single
      * @param oprod  input field
      * @param link   gauge links
      * @param force  output field, dereferenced unconditionally
      * @param flops  if non-null, receives CompleteForce::flops()
      */
     void hisqCompleteForceCuda(const QudaGaugeParam &param,
         const cudaGaugeField &oprod,
         const cudaGaugeField &link,
         cudaGaugeField* force,
         long long* flops)
     {
       bind_tex_link(link, oprod);

       switch (param.cuda_prec) {
         case QUDA_DOUBLE_PRECISION:
           {
             CompleteForce<double2,double2> forceKernel(link, oprod, *force, param);
             forceKernel.apply(0);
             if (flops) *flops = forceKernel.flops();
             checkCudaError();
           }
           break;

         case QUDA_SINGLE_PRECISION:
           {
             CompleteForce<float2,float2> forceKernel(link, oprod, *force, param);
             forceKernel.apply(0);
             if (flops) *flops = forceKernel.flops();
             checkCudaError();
           }
           break;

         default:
           errorQuda("Unsupported precision");
           break;
       }

       unbind_tex_link(link, oprod);
     }


     /**
      * Binds the link/newOprod textures, runs LongLinkTerm in the precision
      * selected by param.cuda_prec, then unbinds the textures.
      *
      * @param coeff     coefficient forwarded to the kernel (narrowed to float
      *                  for single precision)
      * @param param     gauge parameters; X supplies the extents, cuda_prec
      *                  selects the precision
      * @param oldOprod  input field
      * @param link      gauge links
      * @param newOprod  output field, dereferenced unconditionally
      * @param flops     if non-null, receives LongLinkTerm::flops()
      */
     void hisqLongLinkForceCuda(double coeff,
         const QudaGaugeParam &param,
         const cudaGaugeField &oldOprod,
         const cudaGaugeField &link,
         cudaGaugeField  *newOprod,
         long long* flops)
     {
       bind_tex_link(link, *newOprod);

       // Only X, ghostDim, threads and the strides are filled in here.
       hisq_kernel_param_t kparam;
       for (int d = 0; d < 4; ++d) {
         kparam.X[d]        = param.X[d];
         kparam.ghostDim[d] = commDimPartitioned(d);
       }
       const int volume = param.X[0]*param.X[1]*param.X[2]*param.X[3];
       kparam.threads = volume/2;
       kparam.setStride(param);

       switch (param.cuda_prec) {
         case QUDA_DOUBLE_PRECISION:
           {
             LongLinkTerm<double2,double2> term(link, oldOprod, coeff, *newOprod, kparam);
             term.apply(0);
             if (flops) (*flops) = term.flops();
             checkCudaError();
           }
           break;

         case QUDA_SINGLE_PRECISION:
           {
             const float coeffSingle = static_cast<float>(coeff);
             LongLinkTerm<float2,float2> term(link, oldOprod, coeffSingle, *newOprod, kparam);
             term.apply(0);
             if (flops) (*flops) = term.flops();
             checkCudaError();
           }
           break;

         default:
           errorQuda("Unsupported precision");
           break;
       }

       unbind_tex_link(link, *newOprod);
     }


     /**
      * Host entry point for the staple part of the force.
      *
      * Allocates six scratch gauge fields (scalar geometry, no reconstruction,
      * zero padding, FLOAT2 order), binds the textures, and calls
      * do_hisq_staples_force_cuda in the precision given by param.cuda_prec.
      * In MULTI_GPU builds the scratch fields are 4 sites larger than param.X
      * in every dimension.
      *
      * @param path_coeff_array  {one, naik, three, five, seven, lepage}
      * @param param             gauge parameters (extents and precision)
      * @param oprod             input field
      * @param link              gauge links
      * @param newOprod          output field, dereferenced unconditionally
      * @param flops             if non-null, receives a per-site operation
      *                          count times the local volume; the Lepage
      *                          share is added only when path_coeff_array[5]
      *                          is non-zero
      */
     void
       hisqStaplesForceCuda(const double path_coeff_array[6],
           const QudaGaugeParam &param,
           const cudaGaugeField &oprod,
           const cudaGaugeField &link,
           cudaGaugeField* newOprod,
           long long* flops)
       {
 #ifdef MULTI_GPU
         const int border = 4;
 #else
         const int border = 0;
 #endif
         int X[4];
         for (int d = 0; d < 4; ++d) X[d] = param.X[d] + border;

         // create color matrix fields with zero padding
         int pad = 0;
         GaugeFieldParam gauge_param(X, param.cuda_prec, QUDA_RECONSTRUCT_NO, pad, QUDA_SCALAR_GEOMETRY);
         gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
         gauge_param.siteSubset = QUDA_FULL_SITE_SUBSET;
         gauge_param.order = QUDA_FLOAT2_GAUGE_ORDER;

         cudaGaugeField Pmu(gauge_param);
         cudaGaugeField P3(gauge_param);
         cudaGaugeField P5(gauge_param);
         cudaGaugeField Pnumu(gauge_param);
         cudaGaugeField Qmu(gauge_param);
         cudaGaugeField Qnumu(gauge_param);

         bind_tex_link(link, *newOprod);

         // The events bracket the kernel sequence; synchronizing on the second
         // one blocks until the recorded work has finished.
         cudaEvent_t startEvent, stopEvent;
         cudaEventCreate(&startEvent);
         cudaEventCreate(&stopEvent);
         cudaEventRecord(startEvent);

         switch (param.cuda_prec) {
           case QUDA_DOUBLE_PRECISION:
             {
               PathCoefficients<double> coeffs;
               coeffs.one    = path_coeff_array[0];
               coeffs.naik   = path_coeff_array[1];
               coeffs.three  = path_coeff_array[2];
               coeffs.five   = path_coeff_array[3];
               coeffs.seven  = path_coeff_array[4];
               coeffs.lepage = path_coeff_array[5];
               do_hisq_staples_force_cuda<double,double2,double2>(coeffs, param, oprod, link,
                   Pmu, P3, P5, Pnumu, Qmu, Qnumu, *newOprod);
             }
             break;

           case QUDA_SINGLE_PRECISION:
             {
               PathCoefficients<float> coeffs;
               coeffs.one    = path_coeff_array[0];
               coeffs.naik   = path_coeff_array[1];
               coeffs.three  = path_coeff_array[2];
               coeffs.five   = path_coeff_array[3];
               coeffs.seven  = path_coeff_array[4];
               coeffs.lepage = path_coeff_array[5];
               do_hisq_staples_force_cuda<float,float2,float2>(coeffs, param, oprod, link,
                   Pmu, P3, P5, Pnumu, Qmu, Qnumu, *newOprod);
             }
             break;

           default:
             errorQuda("Unsupported precision");
             break;
         }

         cudaEventRecord(stopEvent);
         cudaEventSynchronize(stopEvent);
         float runtime;
         cudaEventElapsedTime(&runtime, startEvent, stopEvent);

         if (flops) {
           int volume = param.X[0]*param.X[1]*param.X[2]*param.X[3];
           // Middle Link, side link, short side link, AllLink, OneLink
           long long perSite = (134784 + 24192 + 103680 + 864 + 397440 + 72);
           if (path_coeff_array[5] != 0.) perSite += 28944; // Lepage contribution
           *flops = perSite * volume;
         }

         unbind_tex_link(link, *newOprod);

         cudaEventDestroy(startEvent);
         cudaEventDestroy(stopEvent);
       }


   } // namespace fermion_force

 } // namespace quda


 #endif // GPU_HISQ_FORCE

gauge_param
QudaGaugeParam gauge_param
Definition: dslash_test.cpp:37

QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:55

quda::linkIndex
__device__ __host__ int linkIndex(int x[], int dx[], const int X[4])
Definition: ks_force_quda.cu:40

hisq_paths_force_core.h

commDimPartitioned
int commDimPartitioned(int dir)
Definition: face_buffer.cpp:539

lattice_field.h

y
int y[4]
Definition: staggered_dslash_core.h:356

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20

errorQuda
#define errorQuda(...)
Definition: util_quda.h:73

color_spinor_field.h

hisq_force_macros.h

mu
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
Definition: hisq_paths_force_core.h:82

HISQ_LOAD_MATRIX_12_SINGLE_TEX
#define HISQ_LOAD_MATRIX_12_SINGLE_TEX(gauge, dir, idx, var, stride)
Definition: hisq_force_macros.h:432

QUDA_FLOAT2_GAUGE_ORDER
Definition: enum_quda.h:30

addMatrixToField
addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit, kparam.color_matrix_stride)

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cu:816

threads
__global__ void const FloatN FloatM FloatM Float Float int threads
Definition: llfat_core.h:1099

QUDA_FULL_SITE_SUBSET
Definition: enum_quda.h:277

mat
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
Definition: staggered_dslash_reference.cpp:136

PathCoefficients
Definition: hisq_force_reference2.cpp:1247

PathCoefficients::three
Real three
Definition: hisq_force_reference2.cpp:1250

HISQ_LOAD_MATRIX_18_SINGLE_TEX
#define HISQ_LOAD_MATRIX_18_SINGLE_TEX(gauge, dir, idx, var, stride)
Definition: hisq_force_macros.h:420

HISQ_LOAD_MATRIX_12_DOUBLE_TEX
#define HISQ_LOAD_MATRIX_12_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride)
Definition: hisq_force_macros.h:457

PathCoefficients::five
Real five
Definition: hisq_force_reference2.cpp:1251

quda::Tunable::initTuneParam
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:175

PathCoefficients::seven
Real seven
Definition: hisq_force_reference2.cpp:1252

sid
int sid
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:400

Qmu
#define Qmu
Definition: hisq_force_reference.cpp:721

fetch_double
__inline__ __device__ double fetch_double(texture< int2, 1 > t, int i)
Definition: texture.h:88

OPP_DIR
#define OPP_DIR(dir)
Definition: force_common.h:16

param
QudaGaugeParam param
Definition: pack_test.cpp:17

PathCoefficients::lepage
Real lepage
Definition: hisq_force_reference2.cpp:1254

E
int E[4]
Definition: hisq_paths_force_core.h:133

PathCoefficients::naik
Real naik
Definition: hisq_force_reference2.cpp:1253

quda::Conj
__device__ __host__ Cmplx Conj(const Cmplx &a)
Definition: quda_matrix.h:267

force_common.h

oprodEven
__global__ void const RealB *const const RealA *const oprodEven
Definition: hisq_paths_force_core.h:885

tmp
cudaColorSpinorField * tmp
Definition: staggered_dslash_test.cpp:48

PathCoefficients::one
Real one
Definition: hisq_force_reference2.cpp:1249

addMatrixToNewOprod
addMatrixToNewOprod(Ow.data, OPP_DIR(mu), new_sid, mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride)

QudaGaugeParam_s::site_ga_pad
int site_ga_pad
Definition: quda.h:55

QudaGaugeParam_s
Definition: quda.h:25

linkOdd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const linkOdd
Definition: hisq_paths_force_core.h:82

outputEven
__global__ void const RealB *const const RealA *const const RealA *const RealTypeId< RealA >::Type RealA *const outputEven
Definition: hisq_paths_force_core.h:803

quda::fermion_force::hisqCompleteForceCuda
void hisqCompleteForceCuda(const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *force, long long *flops=NULL)

face_quda.h

quda::tuneLaunch
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271

coeff
__constant__ double coeff
Definition: dslash_constants.h:180

QUDA_GHOST_EXCHANGE_NO
Definition: enum_quda.h:392

outputOdd
__global__ void const RealB *const const RealA *const const RealA *const RealTypeId< RealA >::Type RealA *const RealA *const outputOdd
Definition: hisq_paths_force_core.h:803

new_sid
int new_sid
Definition: hisq_paths_force_core.h:134

updateCoords
updateCoords(y, mymu,(mu_positive?-1:1), kparam.X, kparam.ghostDim[mymu])

P5
#define P5
Definition: fermion_force_reference.cpp:358

accumu_coeff
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type accumu_coeff
Definition: hisq_paths_force_core.h:435

fetch_double2
__inline__ __device__ double2 fetch_double2(texture< int4, 1 > t, int i)
Definition: texture.h:90

QudaGaugeParam_s::cuda_prec
QudaPrecision cuda_prec
Definition: quda.h:42

QudaGaugeParam_s::X
int X[4]
Definition: quda.h:29

READ_DOUBLE2_TEXTURE
#define READ_DOUBLE2_TEXTURE(x_tex, x, i)
Definition: force_common.h:143

x
int x[4]
Definition: hisq_paths_force_core.h:99

tune_quda.h

quda::Tunable::defaultTuneParam
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:199

Sign
Definition: hisq_force_reference2.cpp:23

Pnumu
#define Pnumu
Definition: fermion_force_reference.cpp:353

X
int X
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:394

quda::fermion_force::hisqLongLinkForceCuda
void hisqLongLinkForceCuda(double coeff, const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)

P3
#define P3
Definition: fermion_force_reference.cpp:359

dx
int dx[4]
Definition: hisq_paths_force_core.h:98

kparam
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const hisq_kernel_param_t kparam
Definition: hisq_paths_force_core.h:92

quda::fermion_force::hisqStaplesForceCuda
void hisqStaplesForceCuda(const double path_coeff[6], const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)

quda_matrix.h

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:50

idx
int idx
Definition: staggered_fused_exterior_dslash_core.h:342

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:49

storeMatrixToField
storeMatrixToField(Oy.data, new_sid, P3Even, P3Odd, oddBit, kparam.color_matrix_stride)

GOES_BACKWARDS
#define GOES_BACKWARDS(dir)
Definition: force_common.h:18

quda::operator*
__host__ __device__ complex< ValueType > operator*(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
Definition: complex_quda.h:692

QudaReconstructType
enum QudaReconstructType_s QudaReconstructType

QudaGaugeParam_s::mom_ga_pad
int mom_ga_pad
Definition: quda.h:59

hw_quda.h

Qnumu
#define Qnumu
Definition: hisq_force_reference.cpp:722

quda::operator+=
__host__ __device__ float4 operator+=(float4 &x, const float4 y)
Definition: float_vector.h:83

Pmu
#define Pmu
Definition: fermion_force_reference.cpp:352

D1h
#define D1h
Definition: llfat_core.h:16

GOES_FORWARDS
#define GOES_FORWARDS(dir)
Definition: force_common.h:17

HISQ_LOAD_MATRIX_18_DOUBLE_TEX
#define HISQ_LOAD_MATRIX_18_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride)
Definition: hisq_force_macros.h:445

checkCudaError
#define checkCudaError()
Definition: util_quda.h:110

sig
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int sig
Definition: hisq_paths_force_core.h:82

QUDA_SCALAR_GEOMETRY
Definition: enum_quda.h:385

read_gauge.h

linkEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const linkEven
Definition: hisq_paths_force_core.h:82

getTuning
QudaTune getTuning()
Definition: util_quda.cpp:32

loadMatrixFromField
loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride)

parity
const QudaParity parity
Definition: dslash_test.cpp:29

gauge_field.h

oddBit
int oddBit
Definition: hisq_paths_force_core.h:263

end
void end()
Definition: dslash_test.cpp:428

TEX1DFETCH
#define TEX1DFETCH(type, tex, idx)
Definition: dslash_textures.h:7

oprodOdd
__global__ void const RealA *const oprodOdd
Definition: hisq_paths_force_core.h:82

ks_improved_force.h

quda::getCoords
__device__ __host__ void getCoords(int x[4], int cb_index, const int X[4], int parity)
Definition: ks_force_quda.cu:48

quda_internal.h