3 #if (PRECISION == 0 && RECON == 18)
5 #elif (PRECISION == 0 && RECON == 12)
7 #elif (PRECISION == 1 && RECON == 18)
// Debug helper: prints a 3x3 complex matrix, one row per printf call.
// `mul` is a name prefix; token pasting forms the element identifiers
// mul##RC_re / mul##RC_im (row R, column C, each in 0..2), so those names
// must be visible at the point of expansion. Each element is printed as
// "(re im)" using %f.
// NOTE(review): the expansion is three separate printf statements, each
// already terminated by ';'. It is therefore not a single statement: in an
// unbraced `if (cond) print_matrix(m);` only the first row is guarded by the
// condition. Always call it inside braces.
14 #define print_matrix(mul) \
15 printf(" (%f %f) (%f %f) (%f %f)\n", mul##00_re, mul##00_im, mul##01_re, mul##01_im, mul##02_re, mul##02_im); \
16 printf(" (%f %f) (%f %f) (%f %f)\n", mul##10_re, mul##10_im, mul##11_re, mul##11_im, mul##12_re, mul##12_im); \
17 printf(" (%f %f) (%f %f) (%f %f)\n", mul##20_re, mul##20_im, mul##21_re, mul##21_im, mul##22_re, mul##22_im);
80 template<
class RealA,
class RealB,
int sig_positive,
int mu_positive,
int _oddBit,
int oddness_change>
86 typename RealTypeId<RealA>::Type
coeff,
91 hisq_kernel_param_t
kparam)
96 int sid = blockIdx.x * blockDim.x + threadIdx.x;
97 if(sid >=
kparam.threads)
return;
98 int dx[4] = {0,0,0,0};
125 x[0] = x[0] +
kparam.base_idx[0];
126 x[1] = x[1] +
kparam.base_idx[1];
127 x[2] = x[2] +
kparam.base_idx[2];
128 x[3] = x[3] +
kparam.base_idx[3];
130 oddBit = _oddBit ^ oddness_change;
137 int y[4] = {x[0], x[1], x[2], x[3]};
160 for(
int dir=0; dir<4; ++dir) y[dir] = x[dir];
188 if(!sig_positive) Oy =
conj(Oy);
212 if(!mu_positive) Uad =
conj(Uad);
249 template<
class RealA,
class RealB,
int sig_positive,
int mu_positive,
int _oddBit,
int oddness_change>
255 typename RealTypeId<RealA>::Type
coeff,
258 hisq_kernel_param_t
kparam)
261 int sid = blockIdx.x * blockDim.x + threadIdx.x;
262 if(sid >=
kparam.threads)
return;
263 int oddBit = _oddBit;
285 int dx[4] = {0,0,0,0};
290 x[0] = x[0] +
kparam.base_idx[0];
291 x[1] = x[1] +
kparam.base_idx[1];
292 x[2] = x[2] +
kparam.base_idx[2];
293 x[3] = x[3] +
kparam.base_idx[3];
295 oddBit = _oddBit ^ oddness_change;
303 int y[4] = {x[0], x[1], x[2], x[3]};
315 int mysig = posDir(sig);
326 for(
int dir=0; dir<4; ++dir) y[dir] = x[dir];
369 if(!mu_positive) Uad =
conj(Uad);
433 template<
class RealA,
class RealB,
int sig_positive,
int mu_positive,
int _oddBit,
int oddness_change>
439 typename RealTypeId<RealA>::Type
coeff,
443 hisq_kernel_param_t
kparam)
445 int oddBit = _oddBit;
446 int sid = blockIdx.x * blockDim.x + threadIdx.x;
447 if(sid >=
kparam.threads)
return;
451 int dx[4] = {0,0,0,0};
460 x[0] = x[0] +
kparam.base_idx[0];
461 x[1] = x[1] +
kparam.base_idx[1];
462 x[2] = x[2] +
kparam.base_idx[2];
463 x[3] = x[3] +
kparam.base_idx[3];
465 oddBit = _oddBit ^ oddness_change;
494 int y[4] = {x[0], x[1], x[2], x[3]};
499 int mymu = posDir(mu);
522 addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit,
kparam.color_matrix_stride);
523 mycoeff = CoeffSign<sig_positive,_oddBit ^ oddness_change>::result*
coeff;
529 if(!oddBit){ mycoeff = -
mycoeff; }
544 template<
class RealA,
class RealB,
int sig_positive,
int mu_positive,
int _oddBit,
int oddness_change>
549 typename RealTypeId<RealA>::Type
coeff,
551 hisq_kernel_param_t
kparam)
553 int oddBit = _oddBit;
554 int sid = blockIdx.x * blockDim.x + threadIdx.x;
555 if(sid >=
kparam.threads)
return;
559 int dx[4] = {0,0,0,0};
564 x[0] = x[0] +
kparam.base_idx[0];
565 x[1] = x[1] +
kparam.base_idx[1];
566 x[2] = x[2] +
kparam.base_idx[2];
567 x[3] = x[3] +
kparam.base_idx[3];
569 oddBit = _oddBit ^ oddness_change;
591 typename RealTypeId<RealA>::Type
mycoeff;
593 int mymu = posDir(mu);
594 int y[4] = {x[0], x[1], x[2], x[3]};
598 mycoeff = CoeffSign<sig_positive,_oddBit ^ oddness_change>::result*
coeff;
601 if(!oddBit){ mycoeff = -
mycoeff;}
604 if(oddBit){ mycoeff = -
mycoeff; }
649 template<
class RealA,
class RealB,
int sig_positive,
int mu_positive,
int _oddBit,
int oddness_change>
655 typename RealTypeId<RealA>::Type
coeff,
659 hisq_kernel_param_t
kparam)
661 int oddBit = _oddBit;
662 int sid = blockIdx.x * blockDim.x + threadIdx.x;
663 if(sid >=
kparam.threads)
return;
667 int dx[4] = {0,0,0,0};
689 x[0] = x[0] +
kparam.base_idx[0];
690 x[1] = x[1] +
kparam.base_idx[1];
691 x[2] = x[2] +
kparam.base_idx[2];
692 x[3] = x[3] +
kparam.base_idx[3];
696 oddBit = _oddBit ^ oddness_change;
702 int y[4] = {x[0], x[1], x[2], x[3]};
703 int mysig = posDir(sig);
707 ab_link_nbr_idx = (sig_positive) ? new_sid : point_b;
709 for(
int dir=0; dir<4; ++dir) y[dir] = x[dir];
712 const typename RealTypeId<RealA>::Type & mycoeff = CoeffSign<sig_positive,_oddBit ^ oddness_change>::result*
coeff;
748 addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit,
kparam.color_matrix_stride);
790 addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit,
kparam.color_matrix_stride);
801 template<
class RealA,
class RealB,
int oddBit>
805 typename RealTypeId<RealA>::Type
coeff,
807 hisq_kernel_param_t
kparam)
809 int sid = blockIdx.x * blockDim.x + threadIdx.x;
810 if (sid >=
kparam.threads)
return;
814 int dx[4] = {0,0,0,0};
819 for(
int i=0; i<4; ++i) x[i] += 2;
848 for(
int sig=0; sig<4; ++
sig){
875 addMatrixToField(temp.data, sig, new_sid, coeff, outputEven, outputOdd, oddBit,
kparam.color_matrix_stride);
883 template<
class RealA,
class RealB,
int oddBit>
888 hisq_kernel_param_t
kparam)
890 int sid = blockIdx.x * blockDim.x + threadIdx.x;
891 if (sid >=
kparam.threads)
return;
895 int dx[4] = {0,0,0,0};
908 for(
int sig=0; sig<4; ++
sig){
915 typename RealTypeId<RealA>::Type coeff = (oddBit==1) ? -1 : 1;
918 storeMatrixToMomentumField(Ow.data, sig, sid, coeff, forceEven, forceOdd, oddBit,
kparam.momentum_stride);
__device__ __host__ int linkIndex(int x[], int dx[], const int X[4])
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
__global__ void const RealA *const const RealA *const const RealA *const QprevOdd
__global__ void const RealA *const const RealA *const const RealA *const QprodOdd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit, kparam.color_matrix_stride)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const P3Odd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const newOprodEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const QmuEven
__global__ void const RealB *const const RealA *const const RealA *const naikOprodOdd
__global__ void const RealB *const const RealA *const const RealA *const RealA *const RealA *const forceOdd
__global__ void HISQ_KERNEL_NAME(do_middle_link, EXT)(const RealA *const oprodEven
loadLink< 18 >(linkEven, linkOdd, mysig, ab_link_nbr_idx, Uab.data, sig_positive^(1-oddBit), kparam.thin_link_stride)
__global__ void const RealB *const const RealA *const oprodEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const QmuOdd
addMatrixToNewOprod(Ow.data, OPP_DIR(mu), new_sid, mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride)
__global__ void const RealB *const const RealA *const const RealA *const RealA *const forceEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const linkOdd
__global__ void const RealA *const const RealA *const QprevEven
__global__ void const RealB *const const RealA *const const RealA *const RealTypeId< RealA >::Type RealA *const outputEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type coeff
__global__ void const RealB *const const RealA *const const RealA *const RealTypeId< RealA >::Type RealA *const RealA *const outputOdd
updateCoords(y, mymu,(mu_positive?-1:1), kparam.X, kparam.ghostDim[mymu])
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type accumu_coeff
__global__ void const RealA *const const RealA *const QprodEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type RealA *const RealA *const shortPOdd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const PmuEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const hisq_kernel_param_t kparam
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const P3Even
__global__ void const RealB *const const RealA *const naikOprodEven
storeMatrixToField(Oy.data, new_sid, P3Even, P3Odd, oddBit, kparam.color_matrix_stride)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type RealA *const shortPEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int sig
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const newOprodOdd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const linkEven
RealTypeId< RealA >::Type mycoeff
getCoords(x, sid, kparam.D, oddBit)
loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride)
__global__ void const RealA *const oprodOdd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const PmuOdd