QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
llfat_core.h
Go to the documentation of this file.
1 #define Vsh_x ghostFace[0]
2 #define Vsh_y ghostFace[1]
3 #define Vsh_z ghostFace[2]
4 #define Vsh_t ghostFace[3]
5 
6 #define xcomm kparam.ghostDim[0]
7 #define ycomm kparam.ghostDim[1]
8 #define zcomm kparam.ghostDim[2]
9 #define tcomm kparam.ghostDim[3]
10 #define dimcomm kparam.ghostDim
11 
12 
13 #define D1 kparam.D1
14 #define D2 kparam.D2
15 #define D3 kparam.D3
16 #define D4 kparam.D4
17 #define D1h kparam.D1h
18 
19 #if (RECONSTRUCT == 18)
20 #define DECLARE_VAR_SIGN
21 #define DECLARE_NEW_X
22 #ifdef MULTI_GPU
23 #define DECLARE_X_ARRAY int x[4] = {x1,x2,x3, x4};
24 #else
25 #define DECLARE_X_ARRAY
26 #endif
27 #else //RECONSTRUCT == 12
28 #define DECLARE_VAR_SIGN short sign=1
29 #define DECLARE_NEW_X short new_x1=x1; short new_x2=x2; \
30  /*short new_x3=x3; */short new_x4=x4;
31 #define DECLARE_X_ARRAY int x[4] = {x1,x2,x3, x4};
32 
33 #endif
34 
35 #if (PRECISION == 1 && RECONSTRUCT == 12)
36 
37 #define a00_re A0.x
38 #define a00_im A0.y
39 #define a01_re A0.z
40 #define a01_im A0.w
41 #define a02_re A1.x
42 #define a02_im A1.y
43 #define a10_re A1.z
44 #define a10_im A1.w
45 #define a11_re A2.x
46 #define a11_im A2.y
47 #define a12_re A2.z
48 #define a12_im A2.w
49 #define a20_re A3.x
50 #define a20_im A3.y
51 #define a21_re A3.z
52 #define a21_im A3.w
53 #define a22_re A4.x
54 #define a22_im A4.y
55 
56 #define b00_re B0.x
57 #define b00_im B0.y
58 #define b01_re B0.z
59 #define b01_im B0.w
60 #define b02_re B1.x
61 #define b02_im B1.y
62 #define b10_re B1.z
63 #define b10_im B1.w
64 #define b11_re B2.x
65 #define b11_im B2.y
66 #define b12_re B2.z
67 #define b12_im B2.w
68 #define b20_re B3.x
69 #define b20_im B3.y
70 #define b21_re B3.z
71 #define b21_im B3.w
72 #define b22_re B4.x
73 #define b22_im B4.y
74 
75 #define c00_re C0.x
76 #define c00_im C0.y
77 #define c01_re C0.z
78 #define c01_im C0.w
79 #define c02_re C1.x
80 #define c02_im C1.y
81 #define c10_re C1.z
82 #define c10_im C1.w
83 #define c11_re C2.x
84 #define c11_im C2.y
85 #define c12_re C2.z
86 #define c12_im C2.w
87 #define c20_re C3.x
88 #define c20_im C3.y
89 #define c21_re C3.z
90 #define c21_im C3.w
91 #define c22_re C4.x
92 #define c22_im C4.y
93 
94 #else
95 #define a00_re A0.x
96 #define a00_im A0.y
97 #define a01_re A1.x
98 #define a01_im A1.y
99 #define a02_re A2.x
100 #define a02_im A2.y
101 #define a10_re A3.x
102 #define a10_im A3.y
103 #define a11_re A4.x
104 #define a11_im A4.y
105 #define a12_re A5.x
106 #define a12_im A5.y
107 
108 #define a20_re A6.x
109 #define a20_im A6.y
110 #define a21_re A7.x
111 #define a21_im A7.y
112 #define a22_re A8.x
113 #define a22_im A8.y
114 
115 #define b00_re B0.x
116 #define b00_im B0.y
117 #define b01_re B1.x
118 #define b01_im B1.y
119 #define b02_re B2.x
120 #define b02_im B2.y
121 #define b10_re B3.x
122 #define b10_im B3.y
123 #define b11_re B4.x
124 #define b11_im B4.y
125 #define b12_re B5.x
126 #define b12_im B5.y
127 #define b20_re B6.x
128 #define b20_im B6.y
129 #define b21_re B7.x
130 #define b21_im B7.y
131 #define b22_re B8.x
132 #define b22_im B8.y
133 
134 #define c00_re C0.x
135 #define c00_im C0.y
136 #define c01_re C1.x
137 #define c01_im C1.y
138 #define c02_re C2.x
139 #define c02_im C2.y
140 #define c10_re C3.x
141 #define c10_im C3.y
142 #define c11_re C4.x
143 #define c11_im C4.y
144 #define c12_re C5.x
145 #define c12_im C5.y
146 #define c20_re C6.x
147 #define c20_im C6.y
148 #define c21_re C7.x
149 #define c21_im C7.y
150 #define c22_re C8.x
151 #define c22_im C8.y
152 
153 #endif
154 
155 #define bb00_re BB0.x
156 #define bb00_im BB0.y
157 #define bb01_re BB1.x
158 #define bb01_im BB1.y
159 #define bb02_re BB2.x
160 #define bb02_im BB2.y
161 #define bb10_re BB3.x
162 #define bb10_im BB3.y
163 #define bb11_re BB4.x
164 #define bb11_im BB4.y
165 #define bb12_re BB5.x
166 #define bb12_im BB5.y
167 #define bb20_re BB6.x
168 #define bb20_im BB6.y
169 #define bb21_re BB7.x
170 #define bb21_im BB7.y
171 #define bb22_re BB8.x
172 #define bb22_im BB8.y
173 
174 
175 
176 #define aT00_re (+a00_re)
177 #define aT00_im (-a00_im)
178 #define aT01_re (+a10_re)
179 #define aT01_im (-a10_im)
180 #define aT02_re (+a20_re)
181 #define aT02_im (-a20_im)
182 #define aT10_re (+a01_re)
183 #define aT10_im (-a01_im)
184 #define aT11_re (+a11_re)
185 #define aT11_im (-a11_im)
186 #define aT12_re (+a21_re)
187 #define aT12_im (-a21_im)
188 #define aT20_re (+a02_re)
189 #define aT20_im (-a02_im)
190 #define aT21_re (+a12_re)
191 #define aT21_im (-a12_im)
192 #define aT22_re (+a22_re)
193 #define aT22_im (-a22_im)
194 
195 #define bT00_re (+b00_re)
196 #define bT00_im (-b00_im)
197 #define bT01_re (+b10_re)
198 #define bT01_im (-b10_im)
199 #define bT02_re (+b20_re)
200 #define bT02_im (-b20_im)
201 #define bT10_re (+b01_re)
202 #define bT10_im (-b01_im)
203 #define bT11_re (+b11_re)
204 #define bT11_im (-b11_im)
205 #define bT12_re (+b21_re)
206 #define bT12_im (-b21_im)
207 #define bT20_re (+b02_re)
208 #define bT20_im (-b02_im)
209 #define bT21_re (+b12_re)
210 #define bT21_im (-b12_im)
211 #define bT22_re (+b22_re)
212 #define bT22_im (-b22_im)
213 
214 #define cT00_re (+c00_re)
215 #define cT00_im (-c00_im)
216 #define cT01_re (+c10_re)
217 #define cT01_im (-c10_im)
218 #define cT02_re (+c20_re)
219 #define cT02_im (-c20_im)
220 #define cT10_re (+c01_re)
221 #define cT10_im (-c01_im)
222 #define cT11_re (+c11_re)
223 #define cT11_im (-c11_im)
224 #define cT12_re (+c21_re)
225 #define cT12_im (-c21_im)
226 #define cT20_re (+c02_re)
227 #define cT20_im (-c02_im)
228 #define cT21_re (+c12_re)
229 #define cT21_im (-c12_im)
230 #define cT22_re (+c22_re)
231 #define cT22_im (-c22_im)
232 
233 
234 #define tempa00_re TEMPA0.x
235 #define tempa00_im TEMPA0.y
236 #define tempa01_re TEMPA1.x
237 #define tempa01_im TEMPA1.y
238 #define tempa02_re TEMPA2.x
239 #define tempa02_im TEMPA2.y
240 #define tempa10_re TEMPA3.x
241 #define tempa10_im TEMPA3.y
242 #define tempa11_re TEMPA4.x
243 #define tempa11_im TEMPA4.y
244 #define tempa12_re TEMPA5.x
245 #define tempa12_im TEMPA5.y
246 #define tempa20_re TEMPA6.x
247 #define tempa20_im TEMPA6.y
248 #define tempa21_re TEMPA7.x
249 #define tempa21_im TEMPA7.y
250 #define tempa22_re TEMPA8.x
251 #define tempa22_im TEMPA8.y
252 
253 #define tempb00_re TEMPB0.x
254 #define tempb00_im TEMPB0.y
255 #define tempb01_re TEMPB1.x
256 #define tempb01_im TEMPB1.y
257 #define tempb02_re TEMPB2.x
258 #define tempb02_im TEMPB2.y
259 #define tempb10_re TEMPB3.x
260 #define tempb10_im TEMPB3.y
261 #define tempb11_re TEMPB4.x
262 #define tempb11_im TEMPB4.y
263 #define tempb12_re TEMPB5.x
264 #define tempb12_im TEMPB5.y
265 #define tempb20_re TEMPB6.x
266 #define tempb20_im TEMPB6.y
267 #define tempb21_re TEMPB7.x
268 #define tempb21_im TEMPB7.y
269 #define tempb22_re TEMPB8.x
270 #define tempb22_im TEMPB8.y
271 
272 #define fat00_re FAT0.x
273 #define fat00_im FAT0.y
274 #define fat01_re FAT1.x
275 #define fat01_im FAT1.y
276 #define fat02_re FAT2.x
277 #define fat02_im FAT2.y
278 #define fat10_re FAT3.x
279 #define fat10_im FAT3.y
280 #define fat11_re FAT4.x
281 #define fat11_im FAT4.y
282 #define fat12_re FAT5.x
283 #define fat12_im FAT5.y
284 #define fat20_re FAT6.x
285 #define fat20_im FAT6.y
286 #define fat21_re FAT7.x
287 #define fat21_im FAT7.y
288 #define fat22_re FAT8.x
289 #define fat22_im FAT8.y
290 
291 #define NUM_FLOATS 5
292 #define TEMPA0 sd_data[threadIdx.x + 0*blockDim.x]
293 #define TEMPA1 sd_data[threadIdx.x + 1*blockDim.x ]
294 #define TEMPA2 sd_data[threadIdx.x + 2*blockDim.x ]
295 #define TEMPA3 sd_data[threadIdx.x + 3*blockDim.x ]
296 #define TEMPA4 sd_data[threadIdx.x + 4*blockDim.x ]
297 
298 
299 #undef UPDATE_COOR_PLUS
300 #undef UPDATE_COOR_MINUS
301 #undef UPDATE_COOR_LOWER_STAPLE
302 #undef UPDATE_COOR_LOWER_STAPLE_DIAG
303 #undef UPDATE_COOR_LOWER_STAPLE_EX
304 #undef COMPUTE_RECONSTRUCT_SIGN
305 #if (RECONSTRUCT != 18)
306 #define UPDATE_COOR_PLUS(mydir, idx) do { \
307  new_x1 = x1; new_x2 = x2; new_x4 = x4; \
308  switch(mydir){ \
309  case 0: \
310  new_x1 = x1+1; \
311  break; \
312  case 1: \
313  new_x2 = x2+1; \
314  break; \
315  case 2: \
316  break; \
317  case 3: \
318  new_x4 = x4+1; \
319  break; \
320  } \
321  }while(0)
322 
323 #define UPDATE_COOR_MINUS(mydir, idx) do { \
324  new_x1 = x1; new_x2 = x2; new_x4 = x4; \
325  switch(mydir){ \
326  case 0: \
327  new_x1 = x1-1; \
328  break; \
329  case 1: \
330  new_x2 = x2-1; \
331  break; \
332  case 2: \
333  break; \
334  case 3: \
335  new_x4 = x4-1; \
336  break; \
337  } \
338  }while(0)
339 
340 #define UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2) do { \
341  new_x1 = x1; new_x2 = x2; new_x4 = x4; \
342  if(dimcomm[mydir1] == 0 || x[mydir1] > 0){ \
343  switch(mydir1){ \
344  case 0: \
345  new_x1 = x1 - 1; \
346  break; \
347  case 1: \
348  new_x2 = x2 - 1; \
349  break; \
350  case 2: \
351  break; \
352  case 3: \
353  new_x4 = x4 - 1; \
354  break; \
355  } \
356  switch(mydir2){ \
357  case 0: \
358  new_x1 = x1+1; \
359  break; \
360  case 1: \
361  new_x2 = x2+1; \
362  break; \
363  case 2: \
364  break; \
365  case 3: \
366  new_x4 = x4+1; \
367  break; \
368  } \
369  }else{ \
370  /*the case where both dir1/dir2 are out of boundary are dealed with a different macro (_DIAG)*/ \
371  switch(mydir2){ \
372  case 0: \
373  new_x1 = x1+1; \
374  break; \
375  case 1: \
376  new_x2 = x2+1; \
377  break; \
378  case 2: \
379  break; \
380  case 3: \
381  new_x4 = x4+1; \
382  break; \
383  } \
384  switch(mydir1){/*mydir1 is 0 here */ \
385  case 0: \
386  new_x1 = x1-1; \
387  break; \
388  case 1: \
389  new_x2 = x2-1; \
390  break; \
391  case 2: \
392  break; \
393  case 3: \
394  new_x4 = x4-1; \
395  break; \
396  } \
397  } \
398  }while(0)
399 
400 
401 
402 #define UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2) do { \
403  int new_x[4]; \
404  new_x[3] = x4; new_x[1] = x2; new_x[0] = x1; \
405  new_x[nu] = -1; \
406  new_x[mu] = 0; \
407  new_x1 = new_x[0]; \
408  new_x2 = new_x[1]; \
409  new_x4 = new_x[3]; \
410  }while(0)
411 
412 #define UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2) do { \
413  new_x1 = x1; new_x2 = x2; new_x4 = x4; \
414  switch(mydir1){ \
415  case 0: \
416  new_x1 = x1 - 1; \
417  break; \
418  case 1: \
419  new_x2 = x2 - 1; \
420  break; \
421  case 2: \
422  break; \
423  case 3: \
424  new_x4 = x4 - 1; \
425  break; \
426  } \
427  switch(mydir2){ \
428  case 0: \
429  new_x1 = x1+1; \
430  break; \
431  case 1: \
432  new_x2 = x2+1; \
433  break; \
434  case 2: \
435  break; \
436  case 3: \
437  new_x4 = x4+1; \
438  break; \
439  } \
440  }while(0)
441 
442 #define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) do { \
443  sign =1; \
444  switch(dir){ \
445  case XUP: \
446  if ( (i4 & 1) != 0){ \
447  sign = -1; \
448  } \
449  break; \
450  case YUP: \
451  if ( ((i4+i1) & 1) != 0){ \
452  sign = -1; \
453  } \
454  break; \
455  case ZUP: \
456  if ( ((i4+i1+i2) & 1) != 0){ \
457  sign = -1; \
458  } \
459  break; \
460  case TUP: \
461  if (i4 == X4m1 && PtNm1){ \
462  sign = -1; \
463  }else if(i4 == -1 && Pt0){ \
464  sign = -1; \
465  } \
466  break; \
467  } \
468  }while (0)
469 
470 
471 #else
472 
473 #define UPDATE_COOR_PLUS(mydir, idx)
474 #define UPDATE_COOR_MINUS(mydir, idx)
475 #define UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2)
476 #define UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2)
477 #define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4)
478 #define UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2)
479 #endif
480 
481 #ifdef MULTI_GPU
482 
483 #define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, idx) do { \
484  switch(mydir){ \
485  case 0: \
486  new_mem_idx = (x1==X1m1)? ((Vh+Vsh_x+ spacecon_x)*xcomm+(idx - X1m1)/2*(1-xcomm)):((idx+1)>>1); \
487  break; \
488  case 1: \
489  new_mem_idx = (x2==X2m1)? ((Vh+2*(Vsh_x)+Vsh_y+ spacecon_y)*ycomm+(idx-X2X1mX1)/2*(1-ycomm)):((idx+X1)>>1); \
490  break; \
491  case 2: \
492  new_mem_idx = (x3==X3m1)? ((Vh+2*(Vsh_x+Vsh_y)+Vsh_z+ spacecon_z))*zcomm+(idx-X3X2X1mX2X1)/2*(1-zcomm):((idx+X2X1)>>1); \
493  break; \
494  case 3: \
495  new_mem_idx = ( (x4==X4m1)? ((Vh+2*(Vsh_x+Vsh_y+Vsh_z)+Vsh_t+spacecon_t))*tcomm+(idx-X4X3X2X1mX3X2X1)/2*(1-tcomm): (idx+X3X2X1)>>1); \
496  break; \
497  } \
498  UPDATE_COOR_PLUS(mydir, idx); \
499  }while(0)
500 
501 
502 
503 #define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx) do { \
504  switch(mydir){ \
505  case 0: \
506  new_mem_idx = (x1==0)?( (Vh+spacecon_x)*xcomm+(idx+X1m1)/2*(1-xcomm)):((idx-1) >> 1); \
507  break; \
508  case 1: \
509  new_mem_idx = (x2==0)?( (Vh+2*Vsh_x+spacecon_y)*ycomm+(idx+X2X1mX1)/2*(1-ycomm)):((idx-X1) >> 1); \
510  break; \
511  case 2: \
512  new_mem_idx = (x3==0)?((Vh+2*(Vsh_x+Vsh_y)+spacecon_z)*zcomm+(idx+X3X2X1mX2X1)/2*(1-zcomm)):((idx-X2X1) >> 1); \
513  break; \
514  case 3: \
515  new_mem_idx = (x4==0)?((Vh+2*(Vsh_x+Vsh_y+Vsh_z)+ spacecon_t)*tcomm + (idx+X4X3X2X1mX3X2X1)/2*(1-tcomm)):((idx-X3X2X1) >> 1); \
516  break; \
517  } \
518  UPDATE_COOR_MINUS(mydir, idx); \
519  }while(0)
520 
521 
522 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2) do { \
523  int local_new_x1=x1; \
524  int local_new_x2=x2; \
525  int local_new_x3=x3; \
526  int local_new_x4=x4; \
527  new_mem_idx=X; \
528  if(dimcomm[mydir1] == 0 || x[mydir1] > 0){ \
529  switch(mydir1){/*mydir1 is not partitioned or x[mydir1]!= 0*/ \
530  case 0: \
531  new_mem_idx = (x1==0)?(new_mem_idx+X1m1):(new_mem_idx-1); \
532  local_new_x1 = (x1==0)?X1m1:(x1 - 1); \
533  break; \
534  case 1: \
535  new_mem_idx = (x2==0)?(new_mem_idx+X2X1mX1):(new_mem_idx-X1); \
536  local_new_x2 = (x2==0)?X2m1:(x2 - 1); \
537  break; \
538  case 2: \
539  new_mem_idx = (x3==0)?(new_mem_idx+X3X2X1mX2X1):(new_mem_idx-X2X1); \
540  local_new_x3 = (x3==0)?X3m1:(x3 -1); \
541  break; \
542  case 3: \
543  new_mem_idx = (x4==0)?(new_mem_idx+X4X3X2X1mX3X2X1):(new_mem_idx-X3X2X1); \
544  local_new_x4 = (x4==0)?X4m1:(x4 - 1); \
545  break; \
546  } \
547  switch(mydir2){ \
548  case 0: \
549  new_mem_idx = (x1==X1m1)?(2*(Vh+Vsh_x)+((local_new_x4*X3X2+local_new_x3*X2+local_new_x2)))*xcomm+(new_mem_idx-X1m1)*(1-xcomm):(new_mem_idx+1); \
550  break; \
551  case 1: \
552  new_mem_idx = (x2==X2m1)?(2*(Vh+2*(Vsh_x)+Vsh_y)+((local_new_x4*X3X1+local_new_x3*X1+local_new_x1)))*ycomm+(new_mem_idx-X2X1mX1)*(1-ycomm):(new_mem_idx+X1); \
553  break; \
554  case 2: \
555  new_mem_idx = (x3==X3m1)?(2*(Vh+2*(Vsh_x+Vsh_y)+Vsh_z)+((local_new_x4*X2X1+local_new_x2*X1+local_new_x1)))*zcomm+(new_mem_idx-X3X2X1mX2X1)*(1-zcomm):(new_mem_idx+X2X1); \
556  break; \
557  case 3: \
558  new_mem_idx = (x4==X4m1)?(2*(Vh+2*(Vsh_x+Vsh_y+Vsh_z)+Vsh_t)+((local_new_x3*X2X1+local_new_x2*X1+local_new_x1)))*tcomm+(new_mem_idx-X4X3X2X1mX3X2X1)*(1-tcomm):(new_mem_idx+X3X2X1); \
559  break; \
560  } \
561  }else{ \
562  /*the case where both dir1/dir2 are out of boundary are dealed with a different macro (_DIAG)*/ \
563  switch(mydir2){ /*mydir2 is not partitioned or x[mydir2]!= 0*/ \
564  case 0: \
565  new_mem_idx = (x1==X1m1)?(new_mem_idx-X1m1):(new_mem_idx+1); \
566  local_new_x1 = (x1==X1m1)?0:(x1+1); \
567  break; \
568  case 1: \
569  new_mem_idx = (x2==X2m1)?(new_mem_idx-X2X1mX1):(new_mem_idx+X1); \
570  local_new_x2 = (x2==X2m1)?0:(x2+1); \
571  break; \
572  case 2: \
573  new_mem_idx = (x3==X3m1)?(new_mem_idx-X3X2X1mX2X1):(new_mem_idx+X2X1); \
574  local_new_x3 = (x3==X3m1)?0:(x3+1); \
575  break; \
576  case 3: \
577  new_mem_idx = (x4==X4m1)?(new_mem_idx-X4X3X2X1mX3X2X1):(new_mem_idx+X3X2X1); \
578  local_new_x4 = (x4==X4m1)?0:(x4+1); \
579  break; \
580  } \
581  switch(mydir1){/*mydir1 is 0 here */ \
582  case 0: \
583  new_mem_idx = (x1==0)?(2*(Vh)+(local_new_x4*X3X2+local_new_x3*X2+local_new_x2))*xcomm+(new_mem_idx+X1m1)*(1-xcomm):(new_mem_idx -1); \
584  break; \
585  case 1: \
586  new_mem_idx = (x2==0)?(2*(Vh+2*Vsh_x)+(local_new_x4*X3X1+local_new_x3*X1+local_new_x1))*ycomm+(new_mem_idx+X2X1mX1)*(1-ycomm):(new_mem_idx-X1); \
587  break; \
588  case 2: \
589  new_mem_idx = (x3==0)?(2*(Vh+2*(Vsh_x+Vsh_y))+(local_new_x4*X2X1+local_new_x2*X1+local_new_x1))*zcomm+(new_mem_idx+X3X2X1mX2X1)*(1-zcomm):(new_mem_idx-X2X1); \
590  break; \
591  case 3: \
592  new_mem_idx = (x4==0)?(2*(Vh+2*(Vsh_x+Vsh_y+Vsh_z))+(local_new_x3*X2X1+local_new_x2*X1+local_new_x1))*tcomm+(new_mem_idx+X4X3X2X1mX3X2X1)*(1-tcomm):(new_mem_idx-X3X2X1); \
593  break; \
594  } \
595  } \
596  new_mem_idx = new_mem_idx >> 1; \
597  UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2); \
598  }while(0)
599 
600 
601 
602 
603 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2) do { \
604  new_mem_idx = Vh+2*(Vsh_x+Vsh_y+Vsh_z+Vsh_t) + mu*Vh_2d_max + ((x[dir2]*Z[dir1] + x[dir1])>>1); \
605  UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2); \
606  }while(0)
607 
608 
609 #else
610 
611 #define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, idx) do { \
612  switch(mydir){ \
613  case 0: \
614  new_mem_idx = ( (x1==X1m1)?idx-X1m1:idx+1)>>1; \
615  break; \
616  case 1: \
617  new_mem_idx = ( (x2==X2m1)?idx-X2X1mX1:idx+X1)>>1; \
618  break; \
619  case 2: \
620  new_mem_idx = ( (x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1)>>1; \
621  break; \
622  case 3: \
623  new_mem_idx = ( (x4==X4m1)?idx-X4X3X2X1mX3X2X1: idx+X3X2X1)>>1; \
624  break; \
625  } \
626  UPDATE_COOR_PLUS(mydir, idx); \
627  }while(0)
628 
629 
630 #define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx) do { \
631  switch(mydir){ \
632  case 0: \
633  new_mem_idx = ( (x1==0)?idx+X1m1:idx-1) >> 1; \
634  break; \
635  case 1: \
636  new_mem_idx = ( (x2==0)?idx+X2X1mX1:idx-X1) >> 1; \
637  break; \
638  case 2: \
639  new_mem_idx = ( (x3==0)?idx+X3X2X1mX2X1:idx-X2X1) >> 1; \
640  break; \
641  case 3: \
642  new_mem_idx = ( (x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1) >> 1; \
643  break; \
644  } \
645  UPDATE_COOR_MINUS(mydir, idx); \
646  }while(0)
647 
648 
649 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2) do { \
650  switch(mydir1){ \
651  case 0: \
652  new_mem_idx = ( (x1==0)?X+X1m1:X-1); \
653  break; \
654  case 1: \
655  new_mem_idx = ( (x2==0)?X+X2X1mX1:X-X1); \
656  break; \
657  case 2: \
658  new_mem_idx = ( (x3==0)?X+X3X2X1mX2X1:X-X2X1); \
659  break; \
660  case 3: \
661  new_mem_idx = ((x4==0)?X+X4X3X2X1mX3X2X1:X-X3X2X1); \
662  break; \
663  } \
664  switch(mydir2){ \
665  case 0: \
666  new_mem_idx = ( (x1==X1m1)?new_mem_idx-X1m1:new_mem_idx+1)>> 1; \
667  break; \
668  case 1: \
669  new_mem_idx = ( (x2==X2m1)?new_mem_idx-X2X1mX1:new_mem_idx+X1) >> 1; \
670  break; \
671  case 2: \
672  new_mem_idx = ( (x3==X3m1)?new_mem_idx-X3X2X1mX2X1:new_mem_idx+X2X1) >> 1; \
673  break; \
674  case 3: \
675  new_mem_idx = ( (x4==X4m1)?new_mem_idx-X4X3X2X1mX3X2X1:new_mem_idx+X3X2X1) >> 1; \
676  break; \
677  } \
678  UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2); \
679  }while(0)
680 
681 #endif
682 
683 
684 #define LLFAT_COMPUTE_NEW_IDX_PLUS_EX(mydir, idx) do { \
685  switch(mydir){ \
686  case 0: \
687  new_mem_idx = (idx+1)>>1; \
688  break; \
689  case 1: \
690  new_mem_idx = (idx+E1)>>1; \
691  break; \
692  case 2: \
693  new_mem_idx = (idx+E2E1)>>1; \
694  break; \
695  case 3: \
696  new_mem_idx = (idx+E3E2E1)>>1; \
697  break; \
698  } \
699  UPDATE_COOR_PLUS(mydir, idx); \
700  }while(0)
701 
702 #define LLFAT_COMPUTE_NEW_IDX_MINUS_EX(mydir, idx) do { \
703  switch(mydir){ \
704  case 0: \
705  new_mem_idx = (idx-1) >> 1; \
706  break; \
707  case 1: \
708  new_mem_idx = (idx-E1) >> 1; \
709  break; \
710  case 2: \
711  new_mem_idx = (idx-E2E1) >> 1; \
712  break; \
713  case 3: \
714  new_mem_idx = (idx-E3E2E1) >> 1; \
715  break; \
716  } \
717  UPDATE_COOR_MINUS(mydir, idx); \
718  }while(0)
719 
720 
721 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_EX(mydir1, mydir2) do { \
722  switch(mydir1){ \
723  case 0: \
724  new_mem_idx = X-1; \
725  break; \
726  case 1: \
727  new_mem_idx = X-E1; \
728  break; \
729  case 2: \
730  new_mem_idx = X-E2E1; \
731  break; \
732  case 3: \
733  new_mem_idx = X-E3E2E1; \
734  break; \
735  } \
736  switch(mydir2){ \
737  case 0: \
738  new_mem_idx = (new_mem_idx+1)>> 1; \
739  break; \
740  case 1: \
741  new_mem_idx = (new_mem_idx+E1) >> 1; \
742  break; \
743  case 2: \
744  new_mem_idx = (new_mem_idx+E2E1) >> 1; \
745  break; \
746  case 3: \
747  new_mem_idx = (new_mem_idx+E3E2E1) >> 1; \
748  break; \
749  } \
750  UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2); \
751  }while(0)
752 
753 
754 
755 
756 template<int mu, int nu, int odd_bit>
757  __global__ void
758  LLFAT_KERNEL(do_siteComputeGenStapleParity, RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd,
759  const FloatN* sitelink_even, const FloatN* sitelink_odd,
762 {
763  __shared__ FloatM sd_data[NUM_FLOATS*64];
764 
765  //FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
768  //FloatM STAPLE6, STAPLE7, STAPLE8;
769 
770  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
771 
772  int z1 = mem_idx / X1h;
773  short x1h = mem_idx - z1*X1h;
774  int z2 = z1 / X2;
775  short x2 = z1 - z2*X2;
776  short x4 = z2 / X3;
777  short x3 = z2 - x4*X3;
778 
779  short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
780  short x1 = 2*x1h + x1odd;
781  int X = 2*mem_idx + x1odd;
782 
783  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_X && x1 != X1m1) return;
784  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_X && x1 != 0) return;
785  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Y && x2 != X2m1) return;
786  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Y && x2 != 0) return;
787  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Z && x3 != X3m1) return;
788  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Z && x3 != 0) return;
789  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_T && x4 != X4m1) return;
790  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_T && x4 != 0) return;
791 
796 
797  //int x[4] = {x1,x2,x3, x4};
798 #ifdef MULTI_GPU
799  int Z[4] ={X1,X2,X3,X4};
800  int spacecon_x = (x4*X3X2+x3*X2+x2)>>1;
801  int spacecon_y = (x4*X3X1+x3*X1+x1)>>1;
802  int spacecon_z = (x4*X2X1+x2*X1+x1)>>1;
803  int spacecon_t = (x3*X2X1+x2*X1+x1)>>1;
804 #endif
805 
806  /* Upper staple */
807  /* Computes the staple :
808  * mu (B)
809  * +-------+
810  * nu | |
811  * (A) | |(C)
812  * X X
813  *
814  */
815 
816  {
817  /* load matrix A*/
818  LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
819  COMPUTE_RECONSTRUCT_SIGN(sign, nu, x1, x2, x3, x4);
820  RECONSTRUCT_SITE_LINK(sign, a);
821 
822 
823  /* load matrix B*/
825  LOAD_ODD_SITE_MATRIX(mu, new_mem_idx, B);
826  COMPUTE_RECONSTRUCT_SIGN(sign, mu, new_x1, new_x2, new_x3, new_x4);
827  RECONSTRUCT_SITE_LINK(sign, b);
828 
829 
830  MULT_SU3_NN(a, b, tempa);
831 
832  /* load matrix C*/
833 
835  LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
836  COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
837  RECONSTRUCT_SITE_LINK(sign, c);
838 
839  MULT_SU3_NA(tempa, c, staple);
840  }
841 
842  /***************lower staple****************
843  *
844  * X X
845  * nu | |
846  * (A) | | (C)
847  * +-------+
848  * mu (B)
849  *
850  *********************************************/
851  {
852  /* load matrix A*/
854 
855  LOAD_ODD_SITE_MATRIX(nu, (new_mem_idx), A);
856  COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
857  RECONSTRUCT_SITE_LINK(sign, a);
858 
859  /* load matrix B*/
860  LOAD_ODD_SITE_MATRIX(mu, (new_mem_idx), B);
861  COMPUTE_RECONSTRUCT_SIGN(sign, mu, new_x1, new_x2, new_x3, new_x4);
862  RECONSTRUCT_SITE_LINK(sign, b);
863 
864  MULT_SU3_AN(a, b, tempa);
865 
866  /* load matrix C*/
867  //if(x[nu] == 0 && x[mu] == Z[mu] - 1){
868 #ifdef MULTI_GPU
869  if(dimcomm[nu] && dimcomm[mu] && x[nu] == 0 && x[mu] == Z[mu] - 1){
870  int idx = nu*4+mu;
871  LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1_array[idx], dir2_array[idx]);
872  }else{
874  }
875 #else
877 #endif
878 
879  LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
880 
881  COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
882  RECONSTRUCT_SITE_LINK(sign, c);
883 
884 
885  MULT_SU3_NN(tempa, c, b);
886  LLFAT_ADD_SU3_MATRIX(b, staple, staple);
887  }
888 
889  if(kparam.kernel_type == LLFAT_INTERIOR_KERNEL){
890  LOAD_EVEN_FAT_MATRIX(mu, mem_idx);
891  SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
892  WRITE_FAT_MATRIX(fatlink_even,mu, mem_idx);
893  }
894  WRITE_STAPLE_MATRIX(staple_even, mem_idx);
895 
896  return;
897 }
898 
899 template<int mu, int nu, int odd_bit, int save_staple>
900  __global__ void
901  LLFAT_KERNEL(do_computeGenStapleFieldParity,RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd,
902  const FloatN* sitelink_even, const FloatN* sitelink_odd,
904  const FloatM* mulink_even, const FloatM* mulink_odd,
906 {
907  __shared__ FloatM sd_data[NUM_FLOATS*64];
908  //FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
912  //FloatM STAPLE6, STAPLE7, STAPLE8;
913 
914  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
915 
916  int z1 = mem_idx / X1h;
917  int x1h = mem_idx - z1*X1h;
918  int z2 = z1 / X2;
919  int x2 = z1 - z2*X2;
920  int x4 = z2 / X3;
921  int x3 = z2 - x4*X3;
922 
923  int x1odd = (x2 + x3 + x4 + odd_bit) & 1;
924  int x1 = 2*x1h + x1odd;
925  int X = 2*mem_idx + x1odd;
926 
927  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_X && x1 != X1m1) return;
928  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_X && x1 != 0) return;
929  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Y && x2 != X2m1) return;
930  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Y && x2 != 0) return;
931  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Z && x3 != X3m1) return;
932  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Z && x3 != 0) return;
933  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_T && x4 != X4m1) return;
934  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_T && x4 != 0) return;
935 
937 #ifdef MULTI_GPU
938  int Z[4] ={X1,X2,X3,X4};
939  int spacecon_x = (x4*X3X2+x3*X2+x2)>>1;
940  int spacecon_y = (x4*X3X1+x3*X1+x1)>>1;
941  int spacecon_z = (x4*X2X1+x2*X1+x1)>>1;
942  int spacecon_t = (x3*X2X1+x2*X1+x1)>>1;
943 #endif
944 
945  int new_mem_idx;
948 
949  /* Upper staple */
950  /* Computes the staple :
951  * mu (BB)
952  * +-------+
953  * nu | |
954  * (A) | |(C)
955  * X X
956  *
957  */
958  {
959  /* load matrix A*/
960  LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
961  COMPUTE_RECONSTRUCT_SIGN(sign, nu, x1, x2, x3, x4);
962  RECONSTRUCT_SITE_LINK(sign, a);
963 
964  /* load matrix BB*/
966  LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);
967 
968  MULT_SU3_NN(a, bb, tempa);
969 
970  /* load matrix C*/
972  LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
973  COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
974  RECONSTRUCT_SITE_LINK(sign, c);
975  if (save_staple){
976  MULT_SU3_NA(tempa, c, staple);
977  }else{
978  MULT_SU3_NA(tempa, c, tempb);
979  }
980  }
981 
982  /***************lower staple****************
983  *
984  * X X
985  * nu | |
986  * (A) | | (C)
987  * +-------+
988  * mu (B)
989  *
990  *********************************************/
991 
992 
993  {
994  /* load matrix A*/
996 
997  LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, A);
998  COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
999  RECONSTRUCT_SITE_LINK(sign, a);
1000 
1001  /* load matrix B*/
1003  LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);
1004 
1005  MULT_SU3_AN(a, bb, tempa);
1006 
1007  /* load matrix C*/
1008  //if(x[nu] == 0 && x[mu] == Z[mu] - 1){
1009 #ifdef MULTI_GPU
1010  if(dimcomm[nu] && dimcomm[mu] && x[nu] == 0 && x[mu] == Z[mu] - 1){
1011  int idx = nu*4+mu;
1012  LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1_array[idx], dir2_array[idx]);
1013  }else{
1015  }
1016 #else
1018 #endif
1019 
1020  LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
1021  COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
1022  RECONSTRUCT_SITE_LINK(sign, c);
1023 
1024  MULT_SU3_NN(tempa, c, a);
1025  if(save_staple){
1026  LLFAT_ADD_SU3_MATRIX(staple, a, staple);
1027  }else{
1028  LLFAT_ADD_SU3_MATRIX(a, tempb, tempb);
1029  }
1030  }
1031 
1032  LOAD_EVEN_FAT_MATRIX(mu, mem_idx);
1033  if(save_staple){
1034  if(kparam.kernel_type == LLFAT_INTERIOR_KERNEL){
1035  SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
1036  }
1037  WRITE_STAPLE_MATRIX(staple_even, mem_idx);
1038  }else{
1039  if(kparam.kernel_type == LLFAT_INTERIOR_KERNEL){
1040  SCALAR_MULT_ADD_SU3_MATRIX(fat, tempb, mycoeff, fat);
1041  }else{
1042  //The code should never be here
1043  //because it makes no sense to split kernels when no staple is stored
1044  //print error?
1045  }
1046  }
1047 
1048  WRITE_FAT_MATRIX(fatlink_even, mu, mem_idx);
1049 
1050  return;
1051 }
1052 
1053 __global__ void
1058  const FloatN* my_sitelink;
1060  int sid = blockIdx.x*blockDim.x + threadIdx.x;
1061  int mem_idx = sid;
1062 
1063 #if (RECONSTRUCT != 18)
1064  int odd_bit= 0;
1065 #endif
1066 
1067  my_sitelink = sitelink_even;
1068  my_fatlink = fatlink_even;
1069  if (mem_idx >= Vh){
1070 #if (RECONSTRUCT != 18)
1071  odd_bit=1;
1072 #endif
1073  mem_idx = mem_idx - Vh;
1074  my_sitelink = sitelink_odd;
1075  my_fatlink = fatlink_odd;
1076  }
1077 
1078 #if (RECONSTRUCT != 18)
1079  int z1 = mem_idx / X1h;
1080  int x1h = mem_idx - z1*X1h;
1081  int z2 = z1 / X2;
1082  int x2 = z1 - z2*X2;
1083  int x4 = z2 / X3;
1084  int x3 = z2 - x4*X3;
1085  int x1odd = (x2 + x3 + x4 + odd_bit) & 1;
1086  int x1 = 2*x1h + x1odd;
1088 #endif
1089 
1090  for(int dir=0;dir < 4; dir++){
1091  LOAD_SITE_MATRIX(my_sitelink, dir, mem_idx, A);
1092  COMPUTE_RECONSTRUCT_SIGN(sign, dir, x1, x2, x3, x4);
1093  RECONSTRUCT_SITE_LINK(sign, a);
1094 
1095  LOAD_FAT_MATRIX(my_fatlink, dir, mem_idx);
1096 
1097  SCALAR_MULT_SU3_MATRIX((coeff0 - 6.0*coeff5), a, fat);
1098 
1099  WRITE_FAT_MATRIX(my_fatlink,dir, mem_idx);
1100  }
1101 
1102  return;
1103 }
1104 
1105 
1106 
1107 
1108 template<int mu, int nu, int odd_bit>
1109  __global__ void
1110  LLFAT_KERNEL_EX(do_siteComputeGenStapleParity, RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd,
1111  const FloatN* sitelink_even, const FloatN* sitelink_odd,
1114 {
1115 #if 1
1116  extern __shared__ FloatM sd_data[]; //sd_data is a macro name defined in llfat_quda.cu
1117 
1118 
1119  //FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
1122 
1123  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
1124  if(mem_idx >= kparam.threads) return;
1125 
1126  int z1 = mem_idx/D1h;
1127  short x1h = mem_idx - z1*D1h;
1128  int z2 = z1/D2;
1129  short x2 = z1 - z2*D2;
1130  short x4 = z2/D3;
1131  short x3 = z2 - x4*D3;
1132 
1133  short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
1134  short x1 = 2*x1h + x1odd;
1135 
1136  x1 += kparam.base_idx;
1137  x2 += kparam.base_idx;
1138  x3 += kparam.base_idx;
1139  x4 += kparam.base_idx;
1140  int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1;
1141  mem_idx = X/2;
1142 
1143  int new_mem_idx;
1145  DECLARE_NEW_X;
1146 
1147  /* Upper staple */
1148  /* Computes the staple :
1149  * mu (B)
1150  * +-------+
1151  * nu | |
1152  * (A) | |(C)
1153  * X X
1154  *
1155  */
1156 
1157  {
1158  /* load matrix A*/
1159  LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
1160  COMPUTE_RECONSTRUCT_SIGN(sign, nu, (x1-2), (x2-2), (x3-2), (x4-2));
1161  RECONSTRUCT_SITE_LINK(sign, a);
1162 
1163 
1164 
1165  /* load matrix B*/
1167  LOAD_ODD_SITE_MATRIX(mu, new_mem_idx, B);
1168  COMPUTE_RECONSTRUCT_SIGN(sign, mu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
1169  RECONSTRUCT_SITE_LINK(sign, b);
1170 
1171 
1172  MULT_SU3_NN(a, b, tempa);
1173 
1174  /* load matrix C*/
1175 
1177  LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
1178  COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
1179  RECONSTRUCT_SITE_LINK(sign, c);
1180 
1181  MULT_SU3_NA(tempa, c, staple);
1182 
1183  }
1184 
1185  /***************lower staple****************
1186  *
1187  * X X
1188  * nu | |
1189  * (A) | | (C)
1190  * +-------+
1191  * mu (B)
1192  *
1193  *********************************************/
1194  {
1195  /* load matrix A*/
1197 
1198  LOAD_ODD_SITE_MATRIX(nu, (new_mem_idx), A);
1199  COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
1200  RECONSTRUCT_SITE_LINK(sign, a);
1201 
1202  /* load matrix B*/
1203  LOAD_ODD_SITE_MATRIX(mu, (new_mem_idx), B);
1204  COMPUTE_RECONSTRUCT_SIGN(sign, mu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
1205  RECONSTRUCT_SITE_LINK(sign, b);
1206 
1207  MULT_SU3_AN(a, b, tempa);
1208 
1209 
1210 
1211  /* load matrix C*/
1213  LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
1214 
1215  COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
1216  RECONSTRUCT_SITE_LINK(sign, c);
1217 
1218 
1219  MULT_SU3_NN(tempa, c, b);
1220  LLFAT_ADD_SU3_MATRIX(b, staple, staple);
1221 
1222  }
1223 
1224 
1225  if( !(x1 == 1 || x1 == X1 + 2 || x2 == 1 || x2 == X2 + 2
1226  || x3 == 1 || x3 == X3 + 2 || x4 == 1 || x4 == X4 + 2)){
1227  int orig_idx = ((x4-2)* X3X2X1 + (x3-2)*X2X1 + (x2-2)*X1 + (x1-2))>>1;
1228 
1229  LOAD_EVEN_FAT_MATRIX(mu, orig_idx);
1230  SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
1231  WRITE_FAT_MATRIX(fatlink_even,mu, orig_idx);
1232  }
1233  WRITE_STAPLE_MATRIX(staple_even, mem_idx);
1234 
1235 #endif
1236 
1237  return;
1238 }
1239 
1240 template<int mu, int nu, int odd_bit, int save_staple>
1241  __global__ void
1242  LLFAT_KERNEL_EX(do_computeGenStapleFieldParity,RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd,
1243  const FloatN* sitelink_even, const FloatN* sitelink_odd,
1245  const FloatM* mulink_even, const FloatM* mulink_odd,
1247 {
1248 #if 1
1249  //__shared__ FloatM sd_data[NUM_FLOATS*64];
1250  extern __shared__ FloatM sd_data[]; //sd_data is a macro name defined in llfat_quda.cu
1251  //FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
1254 
1255  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
1256  if(mem_idx >= kparam.threads) return;
1257 
1258  int z1 = mem_idx/D1h;
1259  short x1h = mem_idx - z1*D1h;
1260  int z2 = z1/D2;
1261  short x2 = z1 - z2*D2;
1262  short x4 = z2/D3;
1263  short x3 = z2 - x4*D3;
1264 
1265  short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
1266  short x1 = 2*x1h + x1odd;
1267 
1268  x1 += kparam.base_idx;
1269  x2 += kparam.base_idx;
1270  x3 += kparam.base_idx;
1271  x4 += kparam.base_idx;
1272  int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1;
1273  mem_idx = X/2;
1274 
1275  int new_mem_idx;
1277  DECLARE_NEW_X;
1278 
1279  /* Upper staple */
1280  /* Computes the staple :
1281  * mu (BB)
1282  * +-------+
1283  * nu | |
1284  * (A) | |(C)
1285  * X X
1286  *
1287  */
1288  {
1289  /* load matrix A*/
1290  LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
1291  COMPUTE_RECONSTRUCT_SIGN(sign, nu, (x1-2), (x2-2), (x3-2), (x4-2));
1292  RECONSTRUCT_SITE_LINK(sign, a);
1293 
1294  /* load matrix BB*/
1296  LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);
1297  MULT_SU3_NN(a, bb, tempa);
1298 
1299  /* load matrix C*/
1301  LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
1302  COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
1303  RECONSTRUCT_SITE_LINK(sign, c);
1304 
1305  MULT_SU3_NA(tempa, c, staple);
1306  }
1307 
1308  /***************lower staple****************
1309  *
1310  * X X
1311  * nu | |
1312  * (A) | | (C)
1313  * +-------+
1314  * mu (B)
1315  *
1316  *********************************************/
1317 
1318  {
1319  /* load matrix A*/
1321 
1322  LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, A);
1323  COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
1324  RECONSTRUCT_SITE_LINK(sign, a);
1325 
1326  /* load matrix B*/
1328  LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);
1329 
1330  MULT_SU3_AN(a, bb, tempa);
1331 
1332  /* load matrix C*/
1333 
1335 
1336  LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
1337  COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
1338  RECONSTRUCT_SITE_LINK(sign, c);
1339 
1340  MULT_SU3_NN(tempa, c, a);
1341 
1342  LLFAT_ADD_SU3_MATRIX(a, staple, staple);
1343  }
1344 
1345 
1346  if( !(x1 == 1 || x1 == X1 + 2 || x2 == 1 || x2 == X2 + 2
1347  || x3 == 1 || x3 == X3 + 2 || x4 == 1 || x4 == X4 + 2)){
1348  int orig_idx = ((x4-2)* X3X2X1 + (x3-2)*X2X1 + (x2-2)*X1 + (x1-2))>>1;
1349  LOAD_EVEN_FAT_MATRIX(mu, orig_idx);
1350  SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
1351  WRITE_FAT_MATRIX(fatlink_even, mu, orig_idx);
1352  }
1353 
1354  if(save_staple){
1355  WRITE_STAPLE_MATRIX(staple_even, mem_idx);
1356  }
1357 #endif
1358 
1359  return;
1360 }
1361 
1362 
1363 __global__ void
1364 LLFAT_KERNEL_EX(llfatOneLink, RECONSTRUCT)(const FloatN* sitelink_even, const FloatN* sitelink_odd,
1367 {
1368 #if 1
1369 
1370  const FloatN* my_sitelink;
1371  FloatM* my_fatlink;
1372  int sid = blockIdx.x*blockDim.x + threadIdx.x;
1373  int idx = sid;
1374 
1375  if(sid >= 2*kparam.threads) return;
1376 
1377  short odd_bit= 0;
1378 
1379  my_sitelink = sitelink_even;
1380  my_fatlink = fatlink_even;
1381  if (idx >= kparam.threads){
1382  odd_bit=1;
1383  idx = idx - kparam.threads;
1384  my_sitelink = sitelink_odd;
1385  my_fatlink = fatlink_odd;
1386  }
1387 
1388  int z1 = idx/D1h;
1389  short x1h = idx - z1*D1h;
1390  int z2 = z1/D2;
1391  short x2 = z1 - z2*D2;
1392  int x4 = z2/D3;
1393  short x3 = z2 - x4*D3;
1394  short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
1395  short x1 = 2*x1h + x1odd;
1397 
1398  x1 += kparam.base_idx;
1399  x2 += kparam.base_idx;
1400  x3 += kparam.base_idx;
1401  x4 += kparam.base_idx;
1402  int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1;
1403  int mem_idx = X/2;
1404 
1405  for(int dir=0;dir < 4; dir++){
1406  LOAD_SITE_MATRIX(my_sitelink, dir, mem_idx, A);
1407  COMPUTE_RECONSTRUCT_SIGN(sign, dir, (x1-2), (x2-2), (x3-2), (x4-2));
1408  RECONSTRUCT_SITE_LINK(sign, a);
1409 
1410  LOAD_FAT_MATRIX(my_fatlink, dir, idx);
1411 
1412  SCALAR_MULT_SU3_MATRIX((coeff0 - 6.0*coeff5), a, fat);
1413 
1414  WRITE_FAT_MATRIX(my_fatlink,dir, idx);
1415  }
1416 #endif
1417 
1418  return;
1419 }
1420 
1421 #undef D1
1422 #undef D2
1423 #undef D3
1424 #undef D4
1425 #undef D1h
1426 
1427 #undef DECLARE_VAR_SIGN
1428 #undef DECLARE_NEW_X
1429 #undef DECLARE_X_ARRAY
1430 
1431 #undef a00_re
1432 #undef a00_im
1433 #undef a01_re
1434 #undef a01_im
1435 #undef a02_re
1436 #undef a02_im
1437 #undef a10_re
1438 #undef a10_im
1439 #undef a11_re
1440 #undef a11_im
1441 #undef a12_re
1442 #undef a12_im
1443 #undef a20_re
1444 #undef a20_im
1445 #undef a21_re
1446 #undef a21_im
1447 #undef a22_re
1448 #undef a22_im
1449 
1450 #undef b00_re
1451 #undef b00_im
1452 #undef b01_re
1453 #undef b01_im
1454 #undef b02_re
1455 #undef b02_im
1456 #undef b10_re
1457 #undef b10_im
1458 #undef b11_re
1459 #undef b11_im
1460 #undef b12_re
1461 #undef b12_im
1462 #undef b20_re
1463 #undef b20_im
1464 #undef b21_re
1465 #undef b21_im
1466 #undef b22_re
1467 #undef b22_im
1468 
1469 #undef bb00_re
1470 #undef bb00_im
1471 #undef bb01_re
1472 #undef bb01_im
1473 #undef bb02_re
1474 #undef bb02_im
1475 #undef bb10_re
1476 #undef bb10_im
1477 #undef bb11_re
1478 #undef bb11_im
1479 #undef bb12_re
1480 #undef bb12_im
1481 #undef bb20_re
1482 #undef bb20_im
1483 #undef bb21_re
1484 #undef bb21_im
1485 #undef bb22_re
1486 #undef bb22_im
1487 
1488 #undef c00_re
1489 #undef c00_im
1490 #undef c01_re
1491 #undef c01_im
1492 #undef c02_re
1493 #undef c02_im
1494 #undef c10_re
1495 #undef c10_im
1496 #undef c11_re
1497 #undef c11_im
1498 #undef c12_re
1499 #undef c12_im
1500 #undef c20_re
1501 #undef c20_im
1502 #undef c21_re
1503 #undef c21_im
1504 #undef c22_re
1505 #undef c22_im
1506 
1507 #undef aT00_re
1508 #undef aT00_im
1509 #undef aT01_re
1510 #undef aT01_im
1511 #undef aT02_re
1512 #undef aT02_im
1513 #undef aT10_re
1514 #undef aT10_im
1515 #undef aT11_re
1516 #undef aT11_im
1517 #undef aT12_re
1518 #undef aT12_im
1519 #undef aT20_re
1520 #undef aT20_im
1521 #undef aT21_re
1522 #undef aT21_im
1523 #undef aT22_re
1524 #undef aT22_im
1525 
1526 #undef bT00_re
1527 #undef bT00_im
1528 #undef bT01_re
1529 #undef bT01_im
1530 #undef bT02_re
1531 #undef bT02_im
1532 #undef bT10_re
1533 #undef bT10_im
1534 #undef bT11_re
1535 #undef bT11_im
1536 #undef bT12_re
1537 #undef bT12_im
1538 #undef bT20_re
1539 #undef bT20_im
1540 #undef bT21_re
1541 #undef bT21_im
1542 #undef bT22_re
1543 #undef bT22_im
1544 
1545 #undef cT00_re
1546 #undef cT00_im
1547 #undef cT01_re
1548 #undef cT01_im
1549 #undef cT02_re
1550 #undef cT02_im
1551 #undef cT10_re
1552 #undef cT10_im
1553 #undef cT11_re
1554 #undef cT11_im
1555 #undef cT12_re
1556 #undef cT12_im
1557 #undef cT20_re
1558 #undef cT20_im
1559 #undef cT21_re
1560 #undef cT21_im
1561 #undef cT22_re
1562 #undef cT22_im
1563 
1564 
1565 #undef tempa00_re
1566 #undef tempa00_im
1567 #undef tempa01_re
1568 #undef tempa01_im
1569 #undef tempa02_re
1570 #undef tempa02_im
1571 #undef tempa10_re
1572 #undef tempa10_im
1573 #undef tempa11_re
1574 #undef tempa11_im
1575 #undef tempa12_re
1576 #undef tempa12_im
1577 #undef tempa20_re
1578 #undef tempa20_im
1579 #undef tempa21_re
1580 #undef tempa21_im
1581 #undef tempa22_re
1582 #undef tempa22_im
1583 
1584 #undef tempb00_re
1585 #undef tempb00_im
1586 #undef tempb01_re
1587 #undef tempb01_im
1588 #undef tempb02_re
1589 #undef tempb02_im
1590 #undef tempb10_re
1591 #undef tempb10_im
1592 #undef tempb11_re
1593 #undef tempb11_im
1594 #undef tempb12_re
1595 #undef tempb12_im
1596 #undef tempb20_re
1597 #undef tempb20_im
1598 #undef tempb21_re
1599 #undef tempb21_im
1600 #undef tempb22_re
1601 #undef tempb22_im
1602 
1603 #undef fat00_re
1604 #undef fat00_im
1605 #undef fat01_re
1606 #undef fat01_im
1607 #undef fat02_re
1608 #undef fat02_im
1609 #undef fat10_re
1610 #undef fat10_im
1611 #undef fat11_re
1612 #undef fat11_im
1613 #undef fat12_re
1614 #undef fat12_im
1615 #undef fat20_re
1616 #undef fat20_im
1617 #undef fat21_re
1618 #undef fat21_im
1619 #undef fat22_re
1620 #undef fat22_im