// Shorthand for the per-dimension ghost-zone (halo) face volumes.
1 #define Vsh_x ghostFace[0]
2 #define Vsh_y ghostFace[1]
3 #define Vsh_z ghostFace[2]
4 #define Vsh_t ghostFace[3]
// 0/1 flags: whether inter-process communication (and hence ghost-zone
// indexing) is active in each dimension. Used below to select between
// ghost-buffer offsets and purely local index arithmetic.
5 #define xcomm kparam.ghostDim[0]
6 #define ycomm kparam.ghostDim[1]
7 #define zcomm kparam.ghostDim[2]
8 #define tcomm kparam.ghostDim[3]
9 #define dimcomm kparam.ghostDim
// Half the first dimension extent used for even/odd (checkerboard) site
// indexing — assumed D1/2; TODO confirm against kparam's declaration.
16 #define D1h kparam.D1h
18 #if (RECONSTRUCT == 18)
19 #define DECLARE_VAR_SIGN
22 #define DECLARE_X_ARRAY int x[4] = {x1,x2,x3, x4};
24 #define DECLARE_X_ARRAY
26 #else //RECONSTRUCT == 12
27 #define DECLARE_VAR_SIGN short sign=1
28 #define DECLARE_NEW_X short new_x1=x1; short new_x2=x2; \
29 short new_x3=x3; short new_x4=x4;
30 #define DECLARE_X_ARRAY int x[4] = {x1,x2,x3, x4};
34 #if (PRECISION == 1 && RECONSTRUCT == 12)
112 #define WRITE_LONG_MATRIX WRITE_GAUGE_MATRIX_FLOAT4
191 #define WRITE_LONG_MATRIX WRITE_GAUGE_MATRIX_FLOAT2
// Accessors mapping named complex 3x3 matrix elements bb{row}{col}_{re,im}
// onto the .x/.y components of the packed two-component registers BB0..BB8,
// stored row-major (element (r,c) lives in BB[3*r+c]).
196 #define bb00_re BB0.x
197 #define bb00_im BB0.y
198 #define bb01_re BB1.x
199 #define bb01_im BB1.y
200 #define bb02_re BB2.x
201 #define bb02_im BB2.y
202 #define bb10_re BB3.x
203 #define bb10_im BB3.y
204 #define bb11_re BB4.x
205 #define bb11_im BB4.y
206 #define bb12_re BB5.x
207 #define bb12_im BB5.y
208 #define bb20_re BB6.x
209 #define bb20_im BB6.y
210 #define bb21_re BB7.x
211 #define bb21_im BB7.y
212 #define bb22_re BB8.x
213 #define bb22_im BB8.y
// Conjugate-transpose (dagger) accessors for matrix a: aT(i,j) = conj(a(j,i)),
// i.e. element (j,i) with the imaginary part negated. These let staple
// products read a^dagger without materializing a transposed copy.
217 #define aT00_re (+a00_re)
218 #define aT00_im (-a00_im)
219 #define aT01_re (+a10_re)
220 #define aT01_im (-a10_im)
221 #define aT02_re (+a20_re)
222 #define aT02_im (-a20_im)
223 #define aT10_re (+a01_re)
224 #define aT10_im (-a01_im)
225 #define aT11_re (+a11_re)
226 #define aT11_im (-a11_im)
227 #define aT12_re (+a21_re)
228 #define aT12_im (-a21_im)
229 #define aT20_re (+a02_re)
230 #define aT20_im (-a02_im)
231 #define aT21_re (+a12_re)
232 #define aT21_im (-a12_im)
233 #define aT22_re (+a22_re)
234 #define aT22_im (-a22_im)
// Same dagger mapping for matrix b: bT(i,j) = conj(b(j,i)).
236 #define bT00_re (+b00_re)
237 #define bT00_im (-b00_im)
238 #define bT01_re (+b10_re)
239 #define bT01_im (-b10_im)
240 #define bT02_re (+b20_re)
241 #define bT02_im (-b20_im)
242 #define bT10_re (+b01_re)
243 #define bT10_im (-b01_im)
244 #define bT11_re (+b11_re)
245 #define bT11_im (-b11_im)
246 #define bT12_re (+b21_re)
247 #define bT12_im (-b21_im)
248 #define bT20_re (+b02_re)
249 #define bT20_im (-b02_im)
250 #define bT21_re (+b12_re)
251 #define bT21_im (-b12_im)
252 #define bT22_re (+b22_re)
253 #define bT22_im (-b22_im)
// Same dagger mapping for matrix c: cT(i,j) = conj(c(j,i)).
255 #define cT00_re (+c00_re)
256 #define cT00_im (-c00_im)
257 #define cT01_re (+c10_re)
258 #define cT01_im (-c10_im)
259 #define cT02_re (+c20_re)
260 #define cT02_im (-c20_im)
261 #define cT10_re (+c01_re)
262 #define cT10_im (-c01_im)
263 #define cT11_re (+c11_re)
264 #define cT11_im (-c11_im)
265 #define cT12_re (+c21_re)
266 #define cT12_im (-c21_im)
267 #define cT20_re (+c02_re)
268 #define cT20_im (-c02_im)
269 #define cT21_re (+c12_re)
270 #define cT21_im (-c12_im)
271 #define cT22_re (+c22_re)
272 #define cT22_im (-c22_im)
// Element accessors for the intermediate staple matrix "tempa": row-major
// mapping of complex element (r,c) onto the two-component variable
// TEMPA[3*r+c] (.x = real, .y = imaginary).
275 #define tempa00_re TEMPA0.x
276 #define tempa00_im TEMPA0.y
277 #define tempa01_re TEMPA1.x
278 #define tempa01_im TEMPA1.y
279 #define tempa02_re TEMPA2.x
280 #define tempa02_im TEMPA2.y
281 #define tempa10_re TEMPA3.x
282 #define tempa10_im TEMPA3.y
283 #define tempa11_re TEMPA4.x
284 #define tempa11_im TEMPA4.y
285 #define tempa12_re TEMPA5.x
286 #define tempa12_im TEMPA5.y
287 #define tempa20_re TEMPA6.x
288 #define tempa20_im TEMPA6.y
289 #define tempa21_re TEMPA7.x
290 #define tempa21_im TEMPA7.y
291 #define tempa22_re TEMPA8.x
292 #define tempa22_im TEMPA8.y
// Same row-major mapping for the second intermediate matrix "tempb".
294 #define tempb00_re TEMPB0.x
295 #define tempb00_im TEMPB0.y
296 #define tempb01_re TEMPB1.x
297 #define tempb01_im TEMPB1.y
298 #define tempb02_re TEMPB2.x
299 #define tempb02_im TEMPB2.y
300 #define tempb10_re TEMPB3.x
301 #define tempb10_im TEMPB3.y
302 #define tempb11_re TEMPB4.x
303 #define tempb11_im TEMPB4.y
304 #define tempb12_re TEMPB5.x
305 #define tempb12_im TEMPB5.y
306 #define tempb20_re TEMPB6.x
307 #define tempb20_im TEMPB6.y
308 #define tempb21_re TEMPB7.x
309 #define tempb21_im TEMPB7.y
310 #define tempb22_re TEMPB8.x
311 #define tempb22_im TEMPB8.y
// Same row-major mapping for the accumulated fat-link matrix "fat".
313 #define fat00_re FAT0.x
314 #define fat00_im FAT0.y
315 #define fat01_re FAT1.x
316 #define fat01_im FAT1.y
317 #define fat02_re FAT2.x
318 #define fat02_im FAT2.y
319 #define fat10_re FAT3.x
320 #define fat10_im FAT3.y
321 #define fat11_re FAT4.x
322 #define fat11_im FAT4.y
323 #define fat12_re FAT5.x
324 #define fat12_im FAT5.y
325 #define fat20_re FAT6.x
326 #define fat20_im FAT6.y
327 #define fat21_re FAT7.x
328 #define fat21_im FAT7.y
329 #define fat22_re FAT8.x
330 #define fat22_im FAT8.y
// Back TEMPA0..TEMPA4 with dynamically sized shared memory (sd_data, declared
// extern __shared__ in the kernels below). The layout is strided by
// blockDim.x, so element k for a given thread lives at
// sd_data[threadIdx.x + k*blockDim.x] and consecutive threads touch
// consecutive addresses.
// NOTE(review): only TEMPA0..TEMPA4 are visible in this view, while the
// tempa1x/tempa2x accessors above also reference TEMPA5..TEMPA8 — presumably
// defined on lines not shown here; confirm against the full source.
333 #define TEMPA0 sd_data[threadIdx.x + 0*blockDim.x]
334 #define TEMPA1 sd_data[threadIdx.x + 1*blockDim.x ]
335 #define TEMPA2 sd_data[threadIdx.x + 2*blockDim.x ]
336 #define TEMPA3 sd_data[threadIdx.x + 3*blockDim.x ]
337 #define TEMPA4 sd_data[threadIdx.x + 4*blockDim.x ]
// Drop any previous definitions of the coordinate-update helpers: they are
// redefined immediately below with bodies that depend on the current value of
// RECONSTRUCT, so the #undef avoids redefinition warnings when this header is
// expanded more than once.
340 #undef UPDATE_COOR_PLUS
341 #undef UPDATE_COOR_MINUS
342 #undef UPDATE_COOR_LOWER_STAPLE
343 #undef UPDATE_COOR_LOWER_STAPLE_DIAG
344 #undef UPDATE_COOR_LOWER_STAPLE_EX
345 #undef COMPUTE_RECONSTRUCT_SIGN
346 #if (RECONSTRUCT != 18)
347 #define UPDATE_COOR_PLUS(mydir, n, idx) do { \
348 new_x1 = x1; new_x2 = x2; new_x3=x3; new_x4 = x4; \
365 #define UPDATE_COOR_MINUS(mydir, idx) do { \
366 new_x1 = x1; new_x2 = x2; new_x4 = x4; \
382 #define UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2) do { \
383 new_x1 = x1; new_x2 = x2; new_x4 = x4; \
384 if(dimcomm[mydir1] == 0 || x[mydir1] > 0){ \
444 #define UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2) do { \
446 new_x[3] = x4; new_x[1] = x2; new_x[0] = x1; \
454 #define UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2) do { \
455 new_x1 = x1; new_x2 = x2; new_x4 = x4; \
484 #define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) do { \
488 if ( (i4 & 1) != 0){ \
493 if ( ((i4+i1) & 1) != 0){ \
498 if ( ((i4+i1+i2) & 1) != 0){ \
503 if (i4 == X4m1 && PtNm1){ \
505 }else if(i4 == -1 && Pt0){ \
515 #define UPDATE_COOR_PLUS(mydir, n, idx)
516 #define UPDATE_COOR_MINUS(mydir, idx)
517 #define UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2)
518 #define UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2)
519 #define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4)
520 #define UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2)
525 #define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, n, idx) do { \
528 new_mem_idx = (x1>=(X1-n))? ((Vh+Vsh_x+ spacecon_x)*xcomm+(idx-(X1-n))/2*(1-xcomm)):((idx+n)>>1); \
531 new_mem_idx = (x2>=(X2-n))? ((Vh+2*(Vsh_x)+Vsh_y+ spacecon_y)*ycomm+(idx-(X2-n)*X1)/2*(1-ycomm)):((idx+n*X1)>>1); \
534 new_mem_idx = (x3>=(X3-n))? ((Vh+2*(Vsh_x+Vsh_y)+Vsh_z+ spacecon_z))*zcomm+(idx-(X3-n)*X2X1)/2*(1-zcomm):((idx+n*X2X1)>>1); \
537 new_mem_idx = ( (x4>=(X4-n))? ((Vh+2*(Vsh_x+Vsh_y+Vsh_z)+Vsh_t+spacecon_t))*tcomm+(idx-(X4-n)*X3X2X1)/2*(1-tcomm): (idx+n*X3X2X1)>>1); \
540 UPDATE_COOR_PLUS(mydir, n, idx); \
545 #define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx) do { \
548 new_mem_idx = (x1==0)?( (Vh+spacecon_x)*xcomm+(idx+X1m1)/2*(1-xcomm)):((idx-1) >> 1); \
551 new_mem_idx = (x2==0)?( (Vh+2*Vsh_x+spacecon_y)*ycomm+(idx+X2X1mX1)/2*(1-ycomm)):((idx-X1) >> 1); \
554 new_mem_idx = (x3==0)?((Vh+2*(Vsh_x+Vsh_y)+spacecon_z)*zcomm+(idx+X3X2X1mX2X1)/2*(1-zcomm)):((idx-X2X1) >> 1); \
557 new_mem_idx = (x4==0)?((Vh+2*(Vsh_x+Vsh_y+Vsh_z)+ spacecon_t)*tcomm + (idx+X4X3X2X1mX3X2X1)/2*(1-tcomm)):((idx-X3X2X1) >> 1); \
560 UPDATE_COOR_MINUS(mydir, idx); \
564 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2) do { \
565 int local_new_x1=x1; \
566 int local_new_x2=x2; \
567 int local_new_x3=x3; \
568 int local_new_x4=x4; \
570 if(dimcomm[mydir1] == 0 || x[mydir1] > 0){ \
573 new_mem_idx = (x1==0)?(new_mem_idx+X1m1):(new_mem_idx-1); \
574 local_new_x1 = (x1==0)?X1m1:(x1 - 1); \
577 new_mem_idx = (x2==0)?(new_mem_idx+X2X1mX1):(new_mem_idx-X1); \
578 local_new_x2 = (x2==0)?X2m1:(x2 - 1); \
581 new_mem_idx = (x3==0)?(new_mem_idx+X3X2X1mX2X1):(new_mem_idx-X2X1); \
582 local_new_x3 = (x3==0)?X3m1:(x3 -1); \
585 new_mem_idx = (x4==0)?(new_mem_idx+X4X3X2X1mX3X2X1):(new_mem_idx-X3X2X1); \
586 local_new_x4 = (x4==0)?X4m1:(x4 - 1); \
591 new_mem_idx = (x1==X1m1)?(2*(Vh+Vsh_x)+((local_new_x4*X3X2+local_new_x3*X2+local_new_x2)))*xcomm+(new_mem_idx-X1m1)*(1-xcomm):(new_mem_idx+1); \
594 new_mem_idx = (x2==X2m1)?(2*(Vh+2*(Vsh_x)+Vsh_y)+((local_new_x4*X3X1+local_new_x3*X1+local_new_x1)))*ycomm+(new_mem_idx-X2X1mX1)*(1-ycomm):(new_mem_idx+X1); \
597 new_mem_idx = (x3==X3m1)?(2*(Vh+2*(Vsh_x+Vsh_y)+Vsh_z)+((local_new_x4*X2X1+local_new_x2*X1+local_new_x1)))*zcomm+(new_mem_idx-X3X2X1mX2X1)*(1-zcomm):(new_mem_idx+X2X1); \
600 new_mem_idx = (x4==X4m1)?(2*(Vh+2*(Vsh_x+Vsh_y+Vsh_z)+Vsh_t)+((local_new_x3*X2X1+local_new_x2*X1+local_new_x1)))*tcomm+(new_mem_idx-X4X3X2X1mX3X2X1)*(1-tcomm):(new_mem_idx+X3X2X1); \
607 new_mem_idx = (x1==X1m1)?(new_mem_idx-X1m1):(new_mem_idx+1); \
608 local_new_x1 = (x1==X1m1)?0:(x1+1); \
611 new_mem_idx = (x2==X2m1)?(new_mem_idx-X2X1mX1):(new_mem_idx+X1); \
612 local_new_x2 = (x2==X2m1)?0:(x2+1); \
615 new_mem_idx = (x3==X3m1)?(new_mem_idx-X3X2X1mX2X1):(new_mem_idx+X2X1); \
616 local_new_x3 = (x3==X3m1)?0:(x3+1); \
619 new_mem_idx = (x4==X4m1)?(new_mem_idx-X4X3X2X1mX3X2X1):(new_mem_idx+X3X2X1); \
620 local_new_x4 = (x4==X4m1)?0:(x4+1); \
625 new_mem_idx = (x1==0)?(2*(Vh)+(local_new_x4*X3X2+local_new_x3*X2+local_new_x2))*xcomm+(new_mem_idx+X1m1)*(1-xcomm):(new_mem_idx -1); \
628 new_mem_idx = (x2==0)?(2*(Vh+2*Vsh_x)+(local_new_x4*X3X1+local_new_x3*X1+local_new_x1))*ycomm+(new_mem_idx+X2X1mX1)*(1-ycomm):(new_mem_idx-X1); \
631 new_mem_idx = (x3==0)?(2*(Vh+2*(Vsh_x+Vsh_y))+(local_new_x4*X2X1+local_new_x2*X1+local_new_x1))*zcomm+(new_mem_idx+X3X2X1mX2X1)*(1-zcomm):(new_mem_idx-X2X1); \
634 new_mem_idx = (x4==0)?(2*(Vh+2*(Vsh_x+Vsh_y+Vsh_z))+(local_new_x3*X2X1+local_new_x2*X1+local_new_x1))*tcomm+(new_mem_idx+X4X3X2X1mX3X2X1)*(1-tcomm):(new_mem_idx-X3X2X1); \
638 new_mem_idx = new_mem_idx >> 1; \
639 UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2); \
645 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2) do { \
646 new_mem_idx = Vh+2*(Vsh_x+Vsh_y+Vsh_z+Vsh_t) + mu*Vh_2d_max + ((x[dir2]*Z[dir1] + x[dir1])>>1); \
647 UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2); \
653 #define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, n, idx) do { \
656 new_mem_idx = ( (x1>=(X1-n))?idx-(X1-n):idx+n)>>1; \
659 new_mem_idx = ( (x2>=(X2-n))?idx-(X2-n)*X1:idx+n*X1)>>1; \
662 new_mem_idx = ( (x3>=(X3-n))?idx-(X3-n)*X2X1:idx+n*X2X1)>>1; \
665 new_mem_idx = ( (x4>=(X4-n))?idx-(X4-n)*X3X2X1 : idx+n*X3X2X1)>>1; \
668 UPDATE_COOR_PLUS(mydir, n, idx); \
672 #define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx) do { \
675 new_mem_idx = ( (x1==0)?idx+X1m1:idx-1) >> 1; \
678 new_mem_idx = ( (x2==0)?idx+X2X1mX1:idx-X1) >> 1; \
681 new_mem_idx = ( (x3==0)?idx+X3X2X1mX2X1:idx-X2X1) >> 1; \
684 new_mem_idx = ( (x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1) >> 1; \
687 UPDATE_COOR_MINUS(mydir, idx); \
691 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2) do { \
694 new_mem_idx = ( (x1==0)?X+X1m1:X-1); \
697 new_mem_idx = ( (x2==0)?X+X2X1mX1:X-X1); \
700 new_mem_idx = ( (x3==0)?X+X3X2X1mX2X1:X-X2X1); \
703 new_mem_idx = ((x4==0)?X+X4X3X2X1mX3X2X1:X-X3X2X1); \
708 new_mem_idx = ( (x1==X1m1)?new_mem_idx-X1m1:new_mem_idx+1)>> 1; \
711 new_mem_idx = ( (x2==X2m1)?new_mem_idx-X2X1mX1:new_mem_idx+X1) >> 1; \
714 new_mem_idx = ( (x3==X3m1)?new_mem_idx-X3X2X1mX2X1:new_mem_idx+X2X1) >> 1; \
717 new_mem_idx = ( (x4==X4m1)?new_mem_idx-X4X3X2X1mX3X2X1:new_mem_idx+X3X2X1) >> 1; \
720 UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2); \
726 #define LLFAT_COMPUTE_NEW_IDX_PLUS_EX(mydir, n, idx) do { \
729 new_mem_idx = (idx+n)>>1; \
732 new_mem_idx = (idx+n*E1)>>1; \
735 new_mem_idx = (idx+n*E2E1)>>1; \
738 new_mem_idx = (idx+n*E3E2E1)>>1; \
741 UPDATE_COOR_PLUS(mydir, n, idx); \
744 #define LLFAT_COMPUTE_NEW_IDX_MINUS_EX(mydir, idx) do { \
747 new_mem_idx = (idx-1) >> 1; \
750 new_mem_idx = (idx-E1) >> 1; \
753 new_mem_idx = (idx-E2E1) >> 1; \
756 new_mem_idx = (idx-E3E2E1) >> 1; \
759 UPDATE_COOR_MINUS(mydir, idx); \
763 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_EX(mydir1, mydir2) do { \
769 new_mem_idx = X-E1; \
772 new_mem_idx = X-E2E1; \
775 new_mem_idx = X-E3E2E1; \
780 new_mem_idx = (new_mem_idx+1)>> 1; \
783 new_mem_idx = (new_mem_idx+E1) >> 1; \
786 new_mem_idx = (new_mem_idx+E2E1) >> 1; \
789 new_mem_idx = (new_mem_idx+E3E2E1) >> 1; \
792 UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2); \
798 template<
int mu,
int nu,
int odd_bit>
812 int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
842 int spacecon_x = (x4*
X3X2+x3*X2+
x2)>>1;
843 int spacecon_y = (x4*
X3X1+x3*
X1+
x1)>>1;
844 int spacecon_z = (x4*
X2X1+x2*
X1+
x1)>>1;
845 int spacecon_t = (x3*
X2X1+x2*
X1+
x1)>>1;
913 LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1_array[idx], dir2_array[idx]);
941 template<
int mu,
int nu,
int odd_bit,
int save_staple>
956 int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
958 int z1 = mem_idx /
X1h;
959 int x1h = mem_idx - z1*
X1h;
965 int x1odd = (x2 + x3 + x4 +
odd_bit) & 1;
966 int x1 = 2*x1h +
x1odd;
967 int X = 2*mem_idx +
x1odd;
981 int spacecon_x = (x4*
X3X2+x3*X2+
x2)>>1;
982 int spacecon_y = (x4*
X3X1+x3*
X1+
x1)>>1;
983 int spacecon_z = (x4*
X2X1+x2*
X1+
x1)>>1;
984 int spacecon_t = (x3*
X2X1+x2*
X1+
x1)>>1;
1054 LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1_array[idx], dir2_array[idx]);
1090 WRITE_FAT_MATRIX(fatlink_even,
mu, mem_idx);
1102 int sid = blockIdx.x*blockDim.x + threadIdx.x;
1108 #if (RECONSTRUCT != 18)
1115 #if (RECONSTRUCT != 18)
1118 mem_idx = mem_idx -
Vh;
1123 #if (RECONSTRUCT != 18)
1124 int z1 = mem_idx /
X1h;
1125 int x1h = mem_idx - z1*
X1h;
1127 int x2 = z1 - z2*
X2;
1129 int x3 = z2 - x4*
X3;
1130 int x1odd = (x2 + x3 + x4 +
odd_bit) & 1;
1131 int x1 = 2*x1h +
x1odd;
1135 for(
int dir=0;dir < 4; dir++){
1136 LOAD_SITE_MATRIX(my_sitelink, dir, mem_idx, A);
1140 LOAD_FAT_MATRIX(my_fatlink, dir, mem_idx);
1142 SCALAR_MULT_SU3_MATRIX((coeff0 - 6.0*coeff5), a, fat);
1144 WRITE_FAT_MATRIX(my_fatlink,dir, mem_idx);
1153 template<
int mu,
int nu,
int odd_bit>
1161 extern __shared__ FloatM sd_data[];
1168 int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
1169 if(mem_idx >=
kparam.threads)
return;
1171 int z1 = mem_idx/
D1h;
1172 short x1h = mem_idx - z1*
D1h;
1174 short x2 = z1 - z2*
D2;
1176 short x3 = z2 - x4*
D3;
1178 short x1odd = (x2 + x3 + x4 +
odd_bit) & 1;
1179 short x1 = 2*x1h +
x1odd;
1270 if( !(x1 == 1 || x1 ==
X1 + 2 || x2 == 1 || x2 == X2 + 2
1271 || x3 == 1 || x3 == X3 + 2 || x4 == 1 || x4 ==
X4 + 2)){
1272 int orig_idx = ((x4-2)*
X3X2X1 + (x3-2)*
X2X1 + (x2-2)*
X1 + (x1-2))>>1;
1276 WRITE_FAT_MATRIX(fatlink_even,
mu, orig_idx);
1285 template<
int mu,
int nu,
int odd_bit,
int save_staple>
1295 extern __shared__ FloatM sd_data[];
1300 int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
1301 if(mem_idx >=
kparam.threads)
return;
1303 int z1 = mem_idx/
D1h;
1304 short x1h = mem_idx - z1*
D1h;
1306 short x2 = z1 - z2*
D2;
1308 short x3 = z2 - x4*
D3;
1310 short x1odd = (x2 + x3 + x4 +
odd_bit) & 1;
1311 short x1 = 2*x1h +
x1odd;
1391 if( !(x1 == 1 || x1 ==
X1 + 2 || x2 == 1 || x2 == X2 + 2
1392 || x3 == 1 || x3 == X3 + 2 || x4 == 1 || x4 ==
X4 + 2)){
1393 int orig_idx = ((x4-2)*
X3X2X1 + (x3-2)*
X2X1 + (x2-2)*
X1 + (x1-2))>>1;
1396 WRITE_FAT_MATRIX(fatlink_even,
mu, orig_idx);
1417 int sid = blockIdx.x*blockDim.x + threadIdx.x;
1420 if(sid >= 2*
kparam.threads)
return;
1428 idx = idx -
kparam.threads;
1434 short x1h = idx - z1*
D1h;
1436 short x2 = z1 - z2*
D2;
1438 short x3 = z2 - x4*
D3;
1439 short x1odd = (x2 + x3 + x4 +
odd_bit) & 1;
1440 short x1 = 2*x1h +
x1odd;
1450 for(
int dir=0;dir < 4; dir++){
1451 LOAD_SITE_MATRIX(my_sitelink, dir, mem_idx, A);
1455 LOAD_FAT_MATRIX(my_fatlink, dir, idx);
1457 SCALAR_MULT_SU3_MATRIX((coeff0 - 6.0*coeff5), a, fat);
1459 WRITE_FAT_MATRIX(my_fatlink,dir, idx);
1467 template<
int odd_bit>
1468 __global__
void LLFAT_KERNEL(computeLongLinkParity,RECONSTRUCT)
1469 (FloatN*
const outField,
1474 int idx = blockIdx.x*blockDim.x + threadIdx.x;
1476 if(mem_idx >=
kparam.threads)
return;
1478 int z1 = mem_idx/
D1h;
1479 short x1h = mem_idx - z1*
D1h;
1481 short x2 = z1 - z2*
D2;
1483 short x3 = z2 - x4*
D3;
1485 short x1odd = (x2 + x3 + x4 +
odd_bit) & 1;
1486 short x1 = 2*x1h +
x1odd;
1505 for(
int dir=0; dir<4; ++dir){
1536 SCALAR_MULT_SU3_MATRIX(coeff, a, f);
// Clean up the file-local helper macros defined near the top of this header
// so they do not leak into subsequent inclusions or translation-unit content.
1552 #undef DECLARE_VAR_SIGN
1553 #undef DECLARE_NEW_X
1554 #undef DECLARE_X_ARRAY
1769 #undef WRITE_LONG_MATRIX
__global__ void FloatM * staple_odd
#define LLFAT_EXTERIOR_KERNEL_BACK_X
__global__ void FloatM const FloatN const FloatN FloatM * fatlink_even
__global__ void const FloatN *const const FloatN *const Float coeff
__global__ void const FloatN FloatM FloatM Float coeff0
__global__ void FloatM const FloatN const FloatN * sitelink_odd
__global__ void const FloatN FloatM FloatM Float Float coeff5
struct quda::llfat_kernel_param_s llfat_kernel_param_t
#define LLFAT_EXTERIOR_KERNEL_FWD_X
__global__ void FloatM const FloatN const FloatN FloatM FloatM * fatlink_odd
__global__ void LLFAT_KERNEL(do_siteComputeGenStapleParity, RECONSTRUCT)(FloatM *staple_even
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
LOAD_ODD_SITE_MATRIX(mu, new_mem_idx, B)
#define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx)
__global__ void const FloatN FloatM FloatM Float Float int threads
#define LLFAT_EXTERIOR_KERNEL_FWD_Z
MULT_SU3_NA(tempa, c, staple)
__global__ void FloatM const FloatN const FloatN FloatM FloatM const FloatM * mulink_even
__global__ void FloatM const FloatN const FloatN FloatM FloatM Float mycoeff
#define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1, i2, i3, i4)
#define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2)
#define LLFAT_EXTERIOR_KERNEL_BACK_T
#define LLFAT_COMPUTE_NEW_IDX_MINUS_EX(mydir, idx)
RECONSTRUCT_SITE_LINK(sign, a)
#define LLFAT_EXTERIOR_KERNEL_FWD_Y
__shared__ spinorFloat sd_data[]
#define LLFAT_INTERIOR_KERNEL
#define LLFAT_COMPUTE_NEW_IDX_PLUS_EX(mydir, n, idx)
__global__ void LLFAT_KERNEL_EX(do_siteComputeGenStapleParity, RECONSTRUCT)(FloatM *staple_even
LOAD_EVEN_FAT_MATRIX(mu, mem_idx)
FloatingPoint< float > Float
#define LLFAT_EXTERIOR_KERNEL_BACK_Y
#define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, n, idx)
LLFAT_ADD_SU3_MATRIX(b, staple, staple)
#define LLFAT_EXTERIOR_KERNEL_BACK_Z
__constant__ fat_force_const_t fl
#define WRITE_LONG_MATRIX
#define SCALAR_MULT_ADD_SU3_MATRIX(ma, mb, s, mc)
#define LLFAT_EXTERIOR_KERNEL_FWD_T
__global__ void FloatM const FloatN const FloatN FloatM FloatM Float llfat_kernel_param_t kparam
WRITE_STAPLE_MATRIX(staple_even, mem_idx)
LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB)
__global__ void FloatM const FloatN * sitelink_even
#define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_EX(mydir1, mydir2)
__global__ void FloatM const FloatN const FloatN FloatM FloatM const FloatM const FloatM * mulink_odd
LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C)