1 #define Vsh_x ghostFace[0]
2 #define Vsh_y ghostFace[1]
3 #define Vsh_z ghostFace[2]
4 #define Vsh_t ghostFace[3]
6 #define xcomm kparam.ghostDim[0]
7 #define ycomm kparam.ghostDim[1]
8 #define zcomm kparam.ghostDim[2]
9 #define tcomm kparam.ghostDim[3]
10 #define dimcomm kparam.ghostDim
17 #define D1h kparam.D1h
19 #if (RECONSTRUCT == 18)
20 #define DECLARE_VAR_SIGN
23 #define DECLARE_X_ARRAY int x[4] = {x1,x2,x3, x4};
25 #define DECLARE_X_ARRAY
27 #else //RECONSTRUCT == 12
28 #define DECLARE_VAR_SIGN short sign=1
29 #define DECLARE_NEW_X short new_x1=x1; short new_x2=x2; \
31 #define DECLARE_X_ARRAY int x[4] = {x1,x2,x3, x4};
35 #if (PRECISION == 1 && RECONSTRUCT == 12)
155 #define bb00_re BB0.x
156 #define bb00_im BB0.y
157 #define bb01_re BB1.x
158 #define bb01_im BB1.y
159 #define bb02_re BB2.x
160 #define bb02_im BB2.y
161 #define bb10_re BB3.x
162 #define bb10_im BB3.y
163 #define bb11_re BB4.x
164 #define bb11_im BB4.y
165 #define bb12_re BB5.x
166 #define bb12_im BB5.y
167 #define bb20_re BB6.x
168 #define bb20_im BB6.y
169 #define bb21_re BB7.x
170 #define bb21_im BB7.y
171 #define bb22_re BB8.x
172 #define bb22_im BB8.y
176 #define aT00_re (+a00_re)
177 #define aT00_im (-a00_im)
178 #define aT01_re (+a10_re)
179 #define aT01_im (-a10_im)
180 #define aT02_re (+a20_re)
181 #define aT02_im (-a20_im)
182 #define aT10_re (+a01_re)
183 #define aT10_im (-a01_im)
184 #define aT11_re (+a11_re)
185 #define aT11_im (-a11_im)
186 #define aT12_re (+a21_re)
187 #define aT12_im (-a21_im)
188 #define aT20_re (+a02_re)
189 #define aT20_im (-a02_im)
190 #define aT21_re (+a12_re)
191 #define aT21_im (-a12_im)
192 #define aT22_re (+a22_re)
193 #define aT22_im (-a22_im)
195 #define bT00_re (+b00_re)
196 #define bT00_im (-b00_im)
197 #define bT01_re (+b10_re)
198 #define bT01_im (-b10_im)
199 #define bT02_re (+b20_re)
200 #define bT02_im (-b20_im)
201 #define bT10_re (+b01_re)
202 #define bT10_im (-b01_im)
203 #define bT11_re (+b11_re)
204 #define bT11_im (-b11_im)
205 #define bT12_re (+b21_re)
206 #define bT12_im (-b21_im)
207 #define bT20_re (+b02_re)
208 #define bT20_im (-b02_im)
209 #define bT21_re (+b12_re)
210 #define bT21_im (-b12_im)
211 #define bT22_re (+b22_re)
212 #define bT22_im (-b22_im)
214 #define cT00_re (+c00_re)
215 #define cT00_im (-c00_im)
216 #define cT01_re (+c10_re)
217 #define cT01_im (-c10_im)
218 #define cT02_re (+c20_re)
219 #define cT02_im (-c20_im)
220 #define cT10_re (+c01_re)
221 #define cT10_im (-c01_im)
222 #define cT11_re (+c11_re)
223 #define cT11_im (-c11_im)
224 #define cT12_re (+c21_re)
225 #define cT12_im (-c21_im)
226 #define cT20_re (+c02_re)
227 #define cT20_im (-c02_im)
228 #define cT21_re (+c12_re)
229 #define cT21_im (-c12_im)
230 #define cT22_re (+c22_re)
231 #define cT22_im (-c22_im)
234 #define tempa00_re TEMPA0.x
235 #define tempa00_im TEMPA0.y
236 #define tempa01_re TEMPA1.x
237 #define tempa01_im TEMPA1.y
238 #define tempa02_re TEMPA2.x
239 #define tempa02_im TEMPA2.y
240 #define tempa10_re TEMPA3.x
241 #define tempa10_im TEMPA3.y
242 #define tempa11_re TEMPA4.x
243 #define tempa11_im TEMPA4.y
244 #define tempa12_re TEMPA5.x
245 #define tempa12_im TEMPA5.y
246 #define tempa20_re TEMPA6.x
247 #define tempa20_im TEMPA6.y
248 #define tempa21_re TEMPA7.x
249 #define tempa21_im TEMPA7.y
250 #define tempa22_re TEMPA8.x
251 #define tempa22_im TEMPA8.y
253 #define tempb00_re TEMPB0.x
254 #define tempb00_im TEMPB0.y
255 #define tempb01_re TEMPB1.x
256 #define tempb01_im TEMPB1.y
257 #define tempb02_re TEMPB2.x
258 #define tempb02_im TEMPB2.y
259 #define tempb10_re TEMPB3.x
260 #define tempb10_im TEMPB3.y
261 #define tempb11_re TEMPB4.x
262 #define tempb11_im TEMPB4.y
263 #define tempb12_re TEMPB5.x
264 #define tempb12_im TEMPB5.y
265 #define tempb20_re TEMPB6.x
266 #define tempb20_im TEMPB6.y
267 #define tempb21_re TEMPB7.x
268 #define tempb21_im TEMPB7.y
269 #define tempb22_re TEMPB8.x
270 #define tempb22_im TEMPB8.y
272 #define fat00_re FAT0.x
273 #define fat00_im FAT0.y
274 #define fat01_re FAT1.x
275 #define fat01_im FAT1.y
276 #define fat02_re FAT2.x
277 #define fat02_im FAT2.y
278 #define fat10_re FAT3.x
279 #define fat10_im FAT3.y
280 #define fat11_re FAT4.x
281 #define fat11_im FAT4.y
282 #define fat12_re FAT5.x
283 #define fat12_im FAT5.y
284 #define fat20_re FAT6.x
285 #define fat20_im FAT6.y
286 #define fat21_re FAT7.x
287 #define fat21_im FAT7.y
288 #define fat22_re FAT8.x
289 #define fat22_im FAT8.y
292 #define TEMPA0 sd_data[threadIdx.x + 0*blockDim.x]
293 #define TEMPA1 sd_data[threadIdx.x + 1*blockDim.x ]
294 #define TEMPA2 sd_data[threadIdx.x + 2*blockDim.x ]
295 #define TEMPA3 sd_data[threadIdx.x + 3*blockDim.x ]
296 #define TEMPA4 sd_data[threadIdx.x + 4*blockDim.x ]
299 #undef UPDATE_COOR_PLUS
300 #undef UPDATE_COOR_MINUS
301 #undef UPDATE_COOR_LOWER_STAPLE
302 #undef UPDATE_COOR_LOWER_STAPLE_DIAG
303 #undef UPDATE_COOR_LOWER_STAPLE_EX
304 #undef COMPUTE_RECONSTRUCT_SIGN
305 #if (RECONSTRUCT != 18)
306 #define UPDATE_COOR_PLUS(mydir, idx) do { \
307 new_x1 = x1; new_x2 = x2; new_x4 = x4; \
323 #define UPDATE_COOR_MINUS(mydir, idx) do { \
324 new_x1 = x1; new_x2 = x2; new_x4 = x4; \
340 #define UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2) do { \
341 new_x1 = x1; new_x2 = x2; new_x4 = x4; \
342 if(dimcomm[mydir1] == 0 || x[mydir1] > 0){ \
402 #define UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2) do { \
404 new_x[3] = x4; new_x[1] = x2; new_x[0] = x1; \
412 #define UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2) do { \
413 new_x1 = x1; new_x2 = x2; new_x4 = x4; \
442 #define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) do { \
446 if ( (i4 & 1) != 0){ \
451 if ( ((i4+i1) & 1) != 0){ \
456 if ( ((i4+i1+i2) & 1) != 0){ \
461 if (i4 == X4m1 && PtNm1){ \
463 }else if(i4 == -1 && Pt0){ \
473 #define UPDATE_COOR_PLUS(mydir, idx)
474 #define UPDATE_COOR_MINUS(mydir, idx)
475 #define UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2)
476 #define UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2)
477 #define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4)
478 #define UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2)
483 #define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, idx) do { \
486 new_mem_idx = (x1==X1m1)? ((Vh+Vsh_x+ spacecon_x)*xcomm+(idx - X1m1)/2*(1-xcomm)):((idx+1)>>1); \
489 new_mem_idx = (x2==X2m1)? ((Vh+2*(Vsh_x)+Vsh_y+ spacecon_y)*ycomm+(idx-X2X1mX1)/2*(1-ycomm)):((idx+X1)>>1); \
492 new_mem_idx = (x3==X3m1)? ((Vh+2*(Vsh_x+Vsh_y)+Vsh_z+ spacecon_z))*zcomm+(idx-X3X2X1mX2X1)/2*(1-zcomm):((idx+X2X1)>>1); \
495 new_mem_idx = ( (x4==X4m1)? ((Vh+2*(Vsh_x+Vsh_y+Vsh_z)+Vsh_t+spacecon_t))*tcomm+(idx-X4X3X2X1mX3X2X1)/2*(1-tcomm): (idx+X3X2X1)>>1); \
498 UPDATE_COOR_PLUS(mydir, idx); \
503 #define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx) do { \
506 new_mem_idx = (x1==0)?( (Vh+spacecon_x)*xcomm+(idx+X1m1)/2*(1-xcomm)):((idx-1) >> 1); \
509 new_mem_idx = (x2==0)?( (Vh+2*Vsh_x+spacecon_y)*ycomm+(idx+X2X1mX1)/2*(1-ycomm)):((idx-X1) >> 1); \
512 new_mem_idx = (x3==0)?((Vh+2*(Vsh_x+Vsh_y)+spacecon_z)*zcomm+(idx+X3X2X1mX2X1)/2*(1-zcomm)):((idx-X2X1) >> 1); \
515 new_mem_idx = (x4==0)?((Vh+2*(Vsh_x+Vsh_y+Vsh_z)+ spacecon_t)*tcomm + (idx+X4X3X2X1mX3X2X1)/2*(1-tcomm)):((idx-X3X2X1) >> 1); \
518 UPDATE_COOR_MINUS(mydir, idx); \
522 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2) do { \
523 int local_new_x1=x1; \
524 int local_new_x2=x2; \
525 int local_new_x3=x3; \
526 int local_new_x4=x4; \
528 if(dimcomm[mydir1] == 0 || x[mydir1] > 0){ \
531 new_mem_idx = (x1==0)?(new_mem_idx+X1m1):(new_mem_idx-1); \
532 local_new_x1 = (x1==0)?X1m1:(x1 - 1); \
535 new_mem_idx = (x2==0)?(new_mem_idx+X2X1mX1):(new_mem_idx-X1); \
536 local_new_x2 = (x2==0)?X2m1:(x2 - 1); \
539 new_mem_idx = (x3==0)?(new_mem_idx+X3X2X1mX2X1):(new_mem_idx-X2X1); \
540 local_new_x3 = (x3==0)?X3m1:(x3 -1); \
543 new_mem_idx = (x4==0)?(new_mem_idx+X4X3X2X1mX3X2X1):(new_mem_idx-X3X2X1); \
544 local_new_x4 = (x4==0)?X4m1:(x4 - 1); \
549 new_mem_idx = (x1==X1m1)?(2*(Vh+Vsh_x)+((local_new_x4*X3X2+local_new_x3*X2+local_new_x2)))*xcomm+(new_mem_idx-X1m1)*(1-xcomm):(new_mem_idx+1); \
552 new_mem_idx = (x2==X2m1)?(2*(Vh+2*(Vsh_x)+Vsh_y)+((local_new_x4*X3X1+local_new_x3*X1+local_new_x1)))*ycomm+(new_mem_idx-X2X1mX1)*(1-ycomm):(new_mem_idx+X1); \
555 new_mem_idx = (x3==X3m1)?(2*(Vh+2*(Vsh_x+Vsh_y)+Vsh_z)+((local_new_x4*X2X1+local_new_x2*X1+local_new_x1)))*zcomm+(new_mem_idx-X3X2X1mX2X1)*(1-zcomm):(new_mem_idx+X2X1); \
558 new_mem_idx = (x4==X4m1)?(2*(Vh+2*(Vsh_x+Vsh_y+Vsh_z)+Vsh_t)+((local_new_x3*X2X1+local_new_x2*X1+local_new_x1)))*tcomm+(new_mem_idx-X4X3X2X1mX3X2X1)*(1-tcomm):(new_mem_idx+X3X2X1); \
565 new_mem_idx = (x1==X1m1)?(new_mem_idx-X1m1):(new_mem_idx+1); \
566 local_new_x1 = (x1==X1m1)?0:(x1+1); \
569 new_mem_idx = (x2==X2m1)?(new_mem_idx-X2X1mX1):(new_mem_idx+X1); \
570 local_new_x2 = (x2==X2m1)?0:(x2+1); \
573 new_mem_idx = (x3==X3m1)?(new_mem_idx-X3X2X1mX2X1):(new_mem_idx+X2X1); \
574 local_new_x3 = (x3==X3m1)?0:(x3+1); \
577 new_mem_idx = (x4==X4m1)?(new_mem_idx-X4X3X2X1mX3X2X1):(new_mem_idx+X3X2X1); \
578 local_new_x4 = (x4==X4m1)?0:(x4+1); \
583 new_mem_idx = (x1==0)?(2*(Vh)+(local_new_x4*X3X2+local_new_x3*X2+local_new_x2))*xcomm+(new_mem_idx+X1m1)*(1-xcomm):(new_mem_idx -1); \
586 new_mem_idx = (x2==0)?(2*(Vh+2*Vsh_x)+(local_new_x4*X3X1+local_new_x3*X1+local_new_x1))*ycomm+(new_mem_idx+X2X1mX1)*(1-ycomm):(new_mem_idx-X1); \
589 new_mem_idx = (x3==0)?(2*(Vh+2*(Vsh_x+Vsh_y))+(local_new_x4*X2X1+local_new_x2*X1+local_new_x1))*zcomm+(new_mem_idx+X3X2X1mX2X1)*(1-zcomm):(new_mem_idx-X2X1); \
592 new_mem_idx = (x4==0)?(2*(Vh+2*(Vsh_x+Vsh_y+Vsh_z))+(local_new_x3*X2X1+local_new_x2*X1+local_new_x1))*tcomm+(new_mem_idx+X4X3X2X1mX3X2X1)*(1-tcomm):(new_mem_idx-X3X2X1); \
596 new_mem_idx = new_mem_idx >> 1; \
597 UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2); \
603 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2) do { \
604 new_mem_idx = Vh+2*(Vsh_x+Vsh_y+Vsh_z+Vsh_t) + mu*Vh_2d_max + ((x[dir2]*Z[dir1] + x[dir1])>>1); \
605 UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2); \
611 #define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, idx) do { \
614 new_mem_idx = ( (x1==X1m1)?idx-X1m1:idx+1)>>1; \
617 new_mem_idx = ( (x2==X2m1)?idx-X2X1mX1:idx+X1)>>1; \
620 new_mem_idx = ( (x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1)>>1; \
623 new_mem_idx = ( (x4==X4m1)?idx-X4X3X2X1mX3X2X1: idx+X3X2X1)>>1; \
626 UPDATE_COOR_PLUS(mydir, idx); \
630 #define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx) do { \
633 new_mem_idx = ( (x1==0)?idx+X1m1:idx-1) >> 1; \
636 new_mem_idx = ( (x2==0)?idx+X2X1mX1:idx-X1) >> 1; \
639 new_mem_idx = ( (x3==0)?idx+X3X2X1mX2X1:idx-X2X1) >> 1; \
642 new_mem_idx = ( (x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1) >> 1; \
645 UPDATE_COOR_MINUS(mydir, idx); \
649 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2) do { \
652 new_mem_idx = ( (x1==0)?X+X1m1:X-1); \
655 new_mem_idx = ( (x2==0)?X+X2X1mX1:X-X1); \
658 new_mem_idx = ( (x3==0)?X+X3X2X1mX2X1:X-X2X1); \
661 new_mem_idx = ((x4==0)?X+X4X3X2X1mX3X2X1:X-X3X2X1); \
666 new_mem_idx = ( (x1==X1m1)?new_mem_idx-X1m1:new_mem_idx+1)>> 1; \
669 new_mem_idx = ( (x2==X2m1)?new_mem_idx-X2X1mX1:new_mem_idx+X1) >> 1; \
672 new_mem_idx = ( (x3==X3m1)?new_mem_idx-X3X2X1mX2X1:new_mem_idx+X2X1) >> 1; \
675 new_mem_idx = ( (x4==X4m1)?new_mem_idx-X4X3X2X1mX3X2X1:new_mem_idx+X3X2X1) >> 1; \
678 UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2); \
684 #define LLFAT_COMPUTE_NEW_IDX_PLUS_EX(mydir, idx) do { \
687 new_mem_idx = (idx+1)>>1; \
690 new_mem_idx = (idx+E1)>>1; \
693 new_mem_idx = (idx+E2E1)>>1; \
696 new_mem_idx = (idx+E3E2E1)>>1; \
699 UPDATE_COOR_PLUS(mydir, idx); \
702 #define LLFAT_COMPUTE_NEW_IDX_MINUS_EX(mydir, idx) do { \
705 new_mem_idx = (idx-1) >> 1; \
708 new_mem_idx = (idx-E1) >> 1; \
711 new_mem_idx = (idx-E2E1) >> 1; \
714 new_mem_idx = (idx-E3E2E1) >> 1; \
717 UPDATE_COOR_MINUS(mydir, idx); \
721 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_EX(mydir1, mydir2) do { \
727 new_mem_idx = X-E1; \
730 new_mem_idx = X-E2E1; \
733 new_mem_idx = X-E3E2E1; \
738 new_mem_idx = (new_mem_idx+1)>> 1; \
741 new_mem_idx = (new_mem_idx+E1) >> 1; \
744 new_mem_idx = (new_mem_idx+E2E1) >> 1; \
747 new_mem_idx = (new_mem_idx+E3E2E1) >> 1; \
750 UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2); \
756 template<
int mu,
int nu,
int odd_bit>
770 int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
800 int spacecon_x = (x4*
X3X2+x3*X2+
x2)>>1;
801 int spacecon_y = (x4*
X3X1+x3*
X1+
x1)>>1;
802 int spacecon_z = (x4*
X2X1+x2*
X1+
x1)>>1;
803 int spacecon_t = (x3*
X2X1+x2*
X1+
x1)>>1;
899 template<
int mu,
int nu,
int odd_bit,
int save_staple>
914 int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
916 int z1 = mem_idx /
X1h;
917 int x1h = mem_idx - z1*
X1h;
923 int x1odd = (x2 + x3 + x4 +
odd_bit) & 1;
924 int x1 = 2*x1h +
x1odd;
925 int X = 2*mem_idx +
x1odd;
939 int spacecon_x = (x4*
X3X2+x3*X2+
x2)>>1;
940 int spacecon_y = (x4*
X3X1+x3*
X1+
x1)>>1;
941 int spacecon_z = (x4*
X2X1+x2*
X1+
x1)>>1;
942 int spacecon_t = (x3*
X2X1+x2*
X1+
x1)>>1;
1060 int sid = blockIdx.x*blockDim.x + threadIdx.x;
1063 #if (RECONSTRUCT != 18)
1070 #if (RECONSTRUCT != 18)
1073 mem_idx = mem_idx -
Vh;
1078 #if (RECONSTRUCT != 18)
1079 int z1 = mem_idx /
X1h;
1080 int x1h = mem_idx - z1*
X1h;
1082 int x2 = z1 - z2*
X2;
1084 int x3 = z2 - x4*
X3;
1085 int x1odd = (x2 + x3 + x4 +
odd_bit) & 1;
1086 int x1 = 2*x1h +
x1odd;
1108 template<
int mu,
int nu,
int odd_bit>
1116 extern __shared__
FloatM sd_data[];
1123 int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
1124 if(mem_idx >=
kparam.threads)
return;
1126 int z1 = mem_idx/
D1h;
1127 short x1h = mem_idx - z1*
D1h;
1129 short x2 = z1 - z2*
D2;
1131 short x3 = z2 - x4*
D3;
1133 short x1odd = (x2 + x3 + x4 +
odd_bit) & 1;
1134 short x1 = 2*x1h +
x1odd;
1225 if( !(x1 == 1 || x1 ==
X1 + 2 || x2 == 1 || x2 == X2 + 2
1226 || x3 == 1 || x3 == X3 + 2 || x4 == 1 || x4 ==
X4 + 2)){
1227 int orig_idx = ((x4-2)*
X3X2X1 + (x3-2)*
X2X1 + (x2-2)*
X1 + (x1-2))>>1;
1240 template<
int mu,
int nu,
int odd_bit,
int save_staple>
1250 extern __shared__
FloatM sd_data[];
1255 int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
1256 if(mem_idx >=
kparam.threads)
return;
1258 int z1 = mem_idx/
D1h;
1259 short x1h = mem_idx - z1*
D1h;
1261 short x2 = z1 - z2*
D2;
1263 short x3 = z2 - x4*
D3;
1265 short x1odd = (x2 + x3 + x4 +
odd_bit) & 1;
1266 short x1 = 2*x1h +
x1odd;
1346 if( !(x1 == 1 || x1 ==
X1 + 2 || x2 == 1 || x2 == X2 + 2
1347 || x3 == 1 || x3 == X3 + 2 || x4 == 1 || x4 ==
X4 + 2)){
1348 int orig_idx = ((x4-2)*
X3X2X1 + (x3-2)*
X2X1 + (x2-2)*
X1 + (x1-2))>>1;
1372 int sid = blockIdx.x*blockDim.x + threadIdx.x;
1375 if(sid >= 2*
kparam.threads)
return;
1383 idx = idx -
kparam.threads;
1389 short x1h = idx - z1*
D1h;
1391 short x2 = z1 - z2*
D2;
1393 short x3 = z2 - x4*
D3;
1394 short x1odd = (x2 + x3 + x4 +
odd_bit) & 1;
1395 short x1 = 2*x1h +
x1odd;
1427 #undef DECLARE_VAR_SIGN
1428 #undef DECLARE_NEW_X
1429 #undef DECLARE_X_ARRAY