3 #if (PRECISION == 0 && RECON == 18)
5 #ifdef COMPILE_HISQ_DP_18
8 #elif (PRECISION == 0 && RECON == 12)
10 #ifdef COMPILE_HISQ_DP_12
11 #define KERNEL_ENABLED
13 #elif (PRECISION == 1 && RECON == 18)
15 #ifdef COMPILE_HISQ_SP_18
16 #define KERNEL_ENABLED
20 #ifdef COMPILE_HISQ_SP_12
21 #define KERNEL_ENABLED
37 #define D1h kparam.D1h
41 #define xcomm kparam.ghostDim[0]
42 #define ycomm kparam.ghostDim[1]
43 #define zcomm kparam.ghostDim[2]
44 #define tcomm kparam.ghostDim[3]
47 #define print_matrix(mul) \
48 printf(" (%f %f) (%f %f) (%f %f)\n", mul##00_re, mul##00_im, mul##01_re, mul##01_im, mul##02_re, mul##02_im); \
49 printf(" (%f %f) (%f %f) (%f %f)\n", mul##10_re, mul##10_im, mul##11_re, mul##11_im, mul##12_re, mul##12_im); \
50 printf(" (%f %f) (%f %f) (%f %f)\n", mul##20_re, mul##20_im, mul##21_re, mul##21_im, mul##22_re, mul##22_im);
94 template<
class RealA,
class RealB,
int sig_positive,
int mu_positive,
int _oddBit,
int oddness_change>
108 #ifdef KERNEL_ENABLED
109 int oddBit = _oddBit;
110 int sid = blockIdx.x * blockDim.x + threadIdx.x;
111 if(sid >=
kparam.threads)
return;
119 int x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
120 x[0] = 2*x1h +
x1odd;
130 RealA
ab_link[ArrayLength<RealA>::result];
131 RealA
bc_link[ArrayLength<RealA>::result];
132 RealA
ad_link[ArrayLength<RealA>::result];
151 x[0] = x[0] +
kparam.base_idx[0];
152 x[1] = x[1] +
kparam.base_idx[1];
153 x[2] = x[2] +
kparam.base_idx[2];
154 x[3] = x[3] +
kparam.base_idx[3];
159 new_mem_idx = new_x[3]*
E3E2E1 + new_x[2]*
E2E1 + new_x[1]*
E1 + new_x[0];
160 int new_sid=(new_mem_idx >> 1);
161 oddBit = _oddBit ^ oddness_change;
180 point_d = (new_mem_idx >> 1);
197 point_c = (new_mem_idx >> 1);
208 new_mem_idx = new_x[3]*
E3E2E1 + new_x[2]*
E2E1 + new_x[1]*
E1 + new_x[0];
222 point_b = (new_mem_idx >> 1);
306 MAT_MUL_MAT(COLOR_MAT_W, COLOR_MAT_X, COLOR_MAT_Y);
318 template<
class RealA,
class RealB,
int sig_positive,
int mu_positive,
int _oddBit,
int oddness_change>
330 #ifdef KERNEL_ENABLED
331 int oddBit = _oddBit;
332 int sid = blockIdx.x * blockDim.x + threadIdx.x;
333 if(sid >=
kparam.threads)
return;
341 int x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
342 x[0] = 2*x1h +
x1odd;
352 RealA
ab_link[ArrayLength<RealA>::result];
353 RealA
bc_link[ArrayLength<RealA>::result];
354 RealA
ad_link[ArrayLength<RealA>::result];
373 x[0] = x[0] +
kparam.base_idx[0];
374 x[1] = x[1] +
kparam.base_idx[1];
375 x[2] = x[2] +
kparam.base_idx[2];
376 x[3] = x[3] +
kparam.base_idx[3];
381 new_mem_idx = new_x[3]*
E3E2E1 + new_x[2]*
E2E1 + new_x[1]*
E1 + new_x[0];
382 int new_sid=(new_mem_idx >> 1);
383 oddBit = _oddBit ^ oddness_change;
400 point_d = (new_mem_idx >> 1);
417 point_c = (new_mem_idx >> 1);
428 new_mem_idx = new_x[3]*
E3E2E1 + new_x[2]*
E2E1 + new_x[1]*
E1 + new_x[0];
443 point_b = (new_mem_idx >> 1);
495 MAT_MUL_MAT(COLOR_MAT_W, COLOR_MAT_X, COLOR_MAT_Y);
536 template<
class RealA,
class RealB,
int sig_positive,
int mu_positive,
int _oddBit,
int oddness_change>
548 #ifdef KERNEL_ENABLED
549 int oddBit = _oddBit;
550 int sid = blockIdx.x * blockDim.x + threadIdx.x;
551 if(sid >=
kparam.threads)
return;
555 int x1h = sid - z1*
D1h;
560 int x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
561 x[0] = 2*x1h +
x1odd;
571 x[0] = x[0] +
kparam.base_idx[0];
572 x[1] = x[1] +
kparam.base_idx[1];
573 x[2] = x[2] +
kparam.base_idx[2];
574 x[3] = x[3] +
kparam.base_idx[3];
581 new_mem_idx = new_x[3]*
E3E2E1 + new_x[2]*
E2E1 + new_x[1]*
E1 + new_x[0];
582 int new_sid=(new_mem_idx >> 1);
583 oddBit = _oddBit ^ oddness_change;
585 int X = 2*sid +
x1odd;
595 RealA ad_link[ArrayLength<RealA>::result];
597 RealA COLOR_MAT_W[ArrayLength<RealA>::result];
598 RealA COLOR_MAT_X[ArrayLength<RealA>::result];
599 RealA COLOR_MAT_Y[ArrayLength<RealA>::result];
626 point_d = (new_mem_idx >> 1);
638 HISQ_LOAD_LINK(linkEven, linkOdd, mymu, ad_link_nbr_idx, ad_link, 1-oddBit);
640 HISQ_LOAD_LINK(linkEven, linkOdd, mymu, ad_link_nbr_idx, ad_link, oddBit);
645 addMatrixToField(COLOR_MAT_W, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit);
646 mycoeff = CoeffSign<sig_positive,_oddBit ^ oddness_change>::result*
coeff;
650 MAT_MUL_MAT(COLOR_MAT_Y, COLOR_MAT_X, COLOR_MAT_W);
651 if(!oddBit){ mycoeff = -
mycoeff; }
652 addMatrixToNewOprod(COLOR_MAT_W, mu, point_d, mycoeff, newOprodEven, newOprodOdd, 1-oddBit);
655 if(oddBit){ mycoeff = -
mycoeff; }
665 template<
class RealA,
class RealB,
int sig_positive,
int mu_positive,
int _oddBit,
int oddness_change>
674 #ifdef KERNEL_ENABLED
675 int oddBit = _oddBit;
676 int sid = blockIdx.x * blockDim.x + threadIdx.x;
677 if(sid >=
kparam.threads)
return;
681 int x1h = sid - z1*
D1h;
686 int x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
687 x[0] = 2*x1h +
x1odd;
693 x[0] = x[0] +
kparam.base_idx[0];
694 x[1] = x[1] +
kparam.base_idx[1];
695 x[2] = x[2] +
kparam.base_idx[2];
696 x[3] = x[3] +
kparam.base_idx[3];
703 new_mem_idx = new_x[3]*
E3E2E1 + new_x[2]*
E2E1 + new_x[1]*
E1 + new_x[0];
704 int new_sid=(new_mem_idx >> 1);
705 oddBit = _oddBit ^ oddness_change;
707 int X = 2*sid +
x1odd;
727 RealA COLOR_MAT_W[ArrayLength<RealA>::result];
728 RealA COLOR_MAT_Y[ArrayLength<RealA>::result];
742 point_d = (new_mem_idx >> 1);
743 mycoeff = CoeffSign<sig_positive,_oddBit ^ oddness_change>::result*
coeff;
746 if(!oddBit){ mycoeff = -
mycoeff;}
747 addMatrixToNewOprod(COLOR_MAT_Y, mu, point_d, mycoeff, newOprodEven, newOprodOdd, 1-oddBit);
749 if(oddBit){ mycoeff = -
mycoeff; }
750 ADJ_MAT(COLOR_MAT_Y, COLOR_MAT_W);
753 #endif // KERNEL_ENABLED
788 template<
class RealA,
class RealB, SHORT sig_positive, SHORT mu_positive, SHORT _oddBit,
int oddness_change>
800 #ifdef KERNEL_ENABLED
801 SHORT oddBit = _oddBit;
802 int sid = blockIdx.x * blockDim.x + threadIdx.x;
803 if(sid >=
kparam.threads)
return;
812 SHORT x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
813 x[0] = 2*x1h +
x1odd;
823 RealA ab_link[ArrayLength<RealA>::result];
824 RealA bc_link[ArrayLength<RealA>::result];
825 RealA ad_link[ArrayLength<RealA>::result];
827 RealA COLOR_MAT_X[ArrayLength<RealA>::result];
828 RealA COLOR_MAT_Y[ArrayLength<RealA>::result];
830 RealA COLOR_MAT_W[ArrayLength<RealA>::result];
847 x[0] = x[0] +
kparam.base_idx[0];
848 x[1] = x[1] +
kparam.base_idx[1];
849 x[2] = x[2] +
kparam.base_idx[2];
850 x[3] = x[3] +
kparam.base_idx[3];
858 new_mem_idx = new_x[3]*
E3E2E1 + new_x[2]*
E2E1 + new_x[1]*
E1 + new_x[0];
859 int new_sid=(new_mem_idx >> 1);
860 oddBit = _oddBit ^ oddness_change;
862 int X = 2*sid +
x1odd;
876 point_b = (new_mem_idx >> 1);
877 ab_link_nbr_idx = (sig_positive) ? new_sid : point_b;
892 new_mem_idx = new_x[3]*
E3E2E1 + new_x[2]*
E2E1 + new_x[1]*
E1 + new_x[0];
904 point_d = (new_mem_idx >> 1);
913 point_c = (new_mem_idx >> 1);
916 HISQ_LOAD_LINK(linkEven, linkOdd, mu, point_d, ad_link, 1-oddBit);
930 MAT_MUL_MAT(COLOR_MAT_Z, COLOR_MAT_Y, COLOR_MAT_W);
936 HISQ_LOAD_LINK(linkEven, linkOdd, sig, ab_link_nbr_idx, ab_link, oddBit);
944 MAT_MUL_MAT(COLOR_MAT_Y, COLOR_MAT_X, COLOR_MAT_W);
946 addMatrixToNewOprod(COLOR_MAT_W, mu, point_d, -
Sign<_oddBit ^ oddness_change>::result*mycoeff, newOprodEven, newOprodOdd, 1-oddBit);
949 addMatrixToField(COLOR_MAT_W, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit);
953 point_d = (new_mem_idx >> 1);
961 point_c = (new_mem_idx >> 1);
969 HISQ_LOAD_LINK(linkEven, linkOdd, mu, point_b, bc_link, 1-oddBit);
977 MAT_MUL_MAT(COLOR_MAT_Z, COLOR_MAT_W, COLOR_MAT_Y);
983 HISQ_LOAD_LINK(linkEven, linkOdd, sig, ab_link_nbr_idx, ab_link, oddBit);
992 addMatrixToNewOprod(COLOR_MAT_W, mu, new_sid,
Sign<_oddBit ^ oddness_change>::result*mycoeff, newOprodEven, newOprodOdd, oddBit);
995 addMatrixToField(COLOR_MAT_W, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit);
1005 template<
class RealA,
class RealB,
int oddBit>
1013 #ifdef KERNEL_ENABLED
1014 int sid = blockIdx.x * blockDim.x + threadIdx.x;
1015 if (sid >=
kparam.threads)
return;
1019 int x1h = sid - z1*
X1h;
1023 x[2] = z2 - x[3]*
X3;
1024 int x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
1025 x[0] = 2*x1h +
x1odd;
1036 int new_sid = (X>> 1);
1039 int X = 2*sid +
x1odd;
1046 int new_mem_idx =
X;
1049 RealA ab_link[ArrayLength<RealA>::result];
1050 RealA bc_link[ArrayLength<RealA>::result];
1051 RealA
de_link[ArrayLength<RealA>::result];
1052 RealA
ef_link[ArrayLength<RealA>::result];
1055 int ab_link_sign =1;
1056 int bc_link_sign =1;
1063 RealA COLOR_MAT_W[ArrayLength<RealA>::result];
1064 RealA COLOR_MAT_X[ArrayLength<RealA>::result];
1065 RealA COLOR_MAT_Y[ArrayLength<RealA>::result];
1066 RealA COLOR_MAT_Z[ArrayLength<RealA>::result];
1069 const int & point_c =
new_sid;
1089 point_d = (new_mem_idx >> 1);
1094 point_e = (new_mem_idx >> 1);
1104 point_b = (new_mem_idx >> 1);
1109 point_a = (new_mem_idx >> 1);
1112 HISQ_LOAD_LINK(linkEven, linkOdd, sig, point_a, ab_link, oddBit);
1113 HISQ_LOAD_LINK(linkEven, linkOdd, sig, point_b, bc_link, 1-oddBit);
1114 HISQ_LOAD_LINK(linkEven, linkOdd, sig, point_d, de_link, 1-oddBit);
1115 HISQ_LOAD_LINK(linkEven, linkOdd, sig, point_e, ef_link, oddBit);
1137 addMatrixToField(COLOR_MAT_V, sig, new_sid, coeff, outputEven, outputOdd, oddBit);
1144 template<
class RealA,
class RealB,
int oddBit>
1152 #ifdef KERNEL_ENABLED
1153 int sid = blockIdx.x * blockDim.x + threadIdx.x;
1158 int x1h = sid - z1*
X1h;
1162 x[2] = z2 - x[3]*
X3;
1163 int x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
1164 x[0] = 2*x1h +
x1odd;
1177 new_sid = ( x[3]*
E3E2E1 + x[2]*
E2E1+x[1]*
E1 + x[0])>>1;
1180 RealA
LINK_W[ArrayLength<RealA>::result];
1181 RealA COLOR_MAT_W[ArrayLength<RealA>::result];
1182 RealA COLOR_MAT_X[ArrayLength<RealA>::result];
1200 #undef KERNEL_ENABLED