QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
llfat_core.h
Go to the documentation of this file.
/* llfat_core.h — macro-generated CUDA kernels for the fat-link (llfat)
 * computation in QUDA.  NOTE(review): this is a rendered listing; the leading
 * integer on each line is the original source line number, and some original
 * lines are omitted by the rendering.
 *
 * Short aliases for the per-dimension ghost-zone face sizes (Vsh_*),
 * the per-dimension partitioning/communication flags (presumably 1 when that
 * dimension is split across GPUs — ghostDim), and the sub-volume dimensions
 * (D1..D4, D1h) carried in the kernel-parameter struct `kparam`. */
1 #define Vsh_x ghostFace[0]
2 #define Vsh_y ghostFace[1]
3 #define Vsh_z ghostFace[2]
4 #define Vsh_t ghostFace[3]
5 #define xcomm kparam.ghostDim[0]
6 #define ycomm kparam.ghostDim[1]
7 #define zcomm kparam.ghostDim[2]
8 #define tcomm kparam.ghostDim[3]
9 #define dimcomm kparam.ghostDim
10 
11 
12 #define D1 kparam.D1
13 #define D2 kparam.D2
14 #define D3 kparam.D3
15 #define D4 kparam.D4
16 #define D1h kparam.D1h
17 
/* Per-RECONSTRUCT variable declarations.  With full 18-real-number storage
 * (RECONSTRUCT == 18) no sign tracking or updated-coordinate bookkeeping is
 * needed, so the macros expand to nothing (except the x[] array, still needed
 * for multi-GPU boundary tests).  With compressed 12-number storage the
 * staggered-phase sign and the neighbour coordinates (new_x1..new_x4) must be
 * declared so RECONSTRUCT_SITE_LINK / COMPUTE_RECONSTRUCT_SIGN can use them. */
18 #if (RECONSTRUCT == 18)
19 #define DECLARE_VAR_SIGN
20 #define DECLARE_NEW_X
21 #ifdef MULTI_GPU
22 #define DECLARE_X_ARRAY int x[4] = {x1,x2,x3, x4};
23 #else
24 #define DECLARE_X_ARRAY
25 #endif
26 #else //RECONSTRUCT == 12
27 #define DECLARE_VAR_SIGN short sign=1
28 #define DECLARE_NEW_X short new_x1=x1; short new_x2=x2; \
29  short new_x3=x3; short new_x4=x4;
30 #define DECLARE_X_ARRAY int x[4] = {x1,x2,x3, x4};
31 
32 #endif
33 
/* Element-access macros mapping named 3x3 complex matrix entries (row/col,
 * _re/_im) onto the register vectors loaded by the LOAD_*_MATRIX macros.
 * Two layouts:
 *   - single precision + 12-reconstruct: float4 registers A0..A4 (two complex
 *     numbers per register: .x/.y and .z/.w), written back via FLOAT4;
 *   - otherwise: float2/double2 registers A0..A8 (one complex per register),
 *     written back via FLOAT2.
 * The a/b/c sets name the three link matrices of a staple; f names the
 * fat-link accumulator used by WRITE_LONG_MATRIX consumers. */
34 #if (PRECISION == 1 && RECONSTRUCT == 12)
35 
36 #define a00_re A0.x
37 #define a00_im A0.y
38 #define a01_re A0.z
39 #define a01_im A0.w
40 #define a02_re A1.x
41 #define a02_im A1.y
42 #define a10_re A1.z
43 #define a10_im A1.w
44 #define a11_re A2.x
45 #define a11_im A2.y
46 #define a12_re A2.z
47 #define a12_im A2.w
48 #define a20_re A3.x
49 #define a20_im A3.y
50 #define a21_re A3.z
51 #define a21_im A3.w
52 #define a22_re A4.x
53 #define a22_im A4.y
54 
55 #define b00_re B0.x
56 #define b00_im B0.y
57 #define b01_re B0.z
58 #define b01_im B0.w
59 #define b02_re B1.x
60 #define b02_im B1.y
61 #define b10_re B1.z
62 #define b10_im B1.w
63 #define b11_re B2.x
64 #define b11_im B2.y
65 #define b12_re B2.z
66 #define b12_im B2.w
67 #define b20_re B3.x
68 #define b20_im B3.y
69 #define b21_re B3.z
70 #define b21_im B3.w
71 #define b22_re B4.x
72 #define b22_im B4.y
73 
74 #define c00_re C0.x
75 #define c00_im C0.y
76 #define c01_re C0.z
77 #define c01_im C0.w
78 #define c02_re C1.x
79 #define c02_im C1.y
80 #define c10_re C1.z
81 #define c10_im C1.w
82 #define c11_re C2.x
83 #define c11_im C2.y
84 #define c12_re C2.z
85 #define c12_im C2.w
86 #define c20_re C3.x
87 #define c20_im C3.y
88 #define c21_re C3.z
89 #define c21_im C3.w
90 #define c22_re C4.x
91 #define c22_im C4.y
92 
93 #define f00_re F0.x
94 #define f00_im F0.y
95 #define f01_re F0.z
96 #define f01_im F0.w
97 #define f02_re F1.x
98 #define f02_im F1.y
99 #define f10_re F1.z
100 #define f10_im F1.w
101 #define f11_re F2.x
102 #define f11_im F2.y
103 #define f12_re F2.z
104 #define f12_im F2.w
105 #define f20_re F3.x
106 #define f20_im F3.y
107 #define f21_re F3.z
108 #define f21_im F3.w
109 #define f22_re F4.x
110 #define f22_im F4.y
111 
112 #define WRITE_LONG_MATRIX WRITE_GAUGE_MATRIX_FLOAT4
113 
114 #else
/* float2/double2 layout: one complex entry per two-component register. */
115 #define a00_re A0.x
116 #define a00_im A0.y
117 #define a01_re A1.x
118 #define a01_im A1.y
119 #define a02_re A2.x
120 #define a02_im A2.y
121 #define a10_re A3.x
122 #define a10_im A3.y
123 #define a11_re A4.x
124 #define a11_im A4.y
125 #define a12_re A5.x
126 #define a12_im A5.y
127 #define a20_re A6.x
128 #define a20_im A6.y
129 #define a21_re A7.x
130 #define a21_im A7.y
131 #define a22_re A8.x
132 #define a22_im A8.y
133 
134 #define b00_re B0.x
135 #define b00_im B0.y
136 #define b01_re B1.x
137 #define b01_im B1.y
138 #define b02_re B2.x
139 #define b02_im B2.y
140 #define b10_re B3.x
141 #define b10_im B3.y
142 #define b11_re B4.x
143 #define b11_im B4.y
144 #define b12_re B5.x
145 #define b12_im B5.y
146 #define b20_re B6.x
147 #define b20_im B6.y
148 #define b21_re B7.x
149 #define b21_im B7.y
150 #define b22_re B8.x
151 #define b22_im B8.y
152 
153 #define c00_re C0.x
154 #define c00_im C0.y
155 #define c01_re C1.x
156 #define c01_im C1.y
157 #define c02_re C2.x
158 #define c02_im C2.y
159 #define c10_re C3.x
160 #define c10_im C3.y
161 #define c11_re C4.x
162 #define c11_im C4.y
163 #define c12_re C5.x
164 #define c12_im C5.y
165 #define c20_re C6.x
166 #define c20_im C6.y
167 #define c21_re C7.x
168 #define c21_im C7.y
169 #define c22_re C8.x
170 #define c22_im C8.y
171 
172 #define f00_re F0.x
173 #define f00_im F0.y
174 #define f01_re F1.x
175 #define f01_im F1.y
176 #define f02_re F2.x
177 #define f02_im F2.y
178 #define f10_re F3.x
179 #define f10_im F3.y
180 #define f11_re F4.x
181 #define f11_im F4.y
182 #define f12_re F5.x
183 #define f12_im F5.y
184 #define f20_re F6.x
185 #define f20_im F6.y
186 #define f21_re F7.x
187 #define f21_im F7.y
188 #define f22_re F8.x
189 #define f22_im F8.y
190 
191 #define WRITE_LONG_MATRIX WRITE_GAUGE_MATRIX_FLOAT2
192 
193 #endif
194 
195 
/* Element access for the BB matrix (the "mulink" operand loaded by
 * LOAD_ODD_MULINK_MATRIX); always the float2-style one-complex-per-register
 * layout BB0..BB8, independent of the RECONSTRUCT/PRECISION branch above. */
196 #define bb00_re BB0.x
197 #define bb00_im BB0.y
198 #define bb01_re BB1.x
199 #define bb01_im BB1.y
200 #define bb02_re BB2.x
201 #define bb02_im BB2.y
202 #define bb10_re BB3.x
203 #define bb10_im BB3.y
204 #define bb11_re BB4.x
205 #define bb11_im BB4.y
206 #define bb12_re BB5.x
207 #define bb12_im BB5.y
208 #define bb20_re BB6.x
209 #define bb20_im BB6.y
210 #define bb21_re BB7.x
211 #define bb21_im BB7.y
212 #define bb22_re BB8.x
213 #define bb22_im BB8.y
214 
215 
216 
/* Hermitian-conjugate (dagger) views of a, b and c: element (i,j) of xT is
 * the complex conjugate of element (j,i) of x.  Pure renaming — no extra
 * registers are used; MULT_SU3_AN/NA-style products read through these. */
217 #define aT00_re (+a00_re)
218 #define aT00_im (-a00_im)
219 #define aT01_re (+a10_re)
220 #define aT01_im (-a10_im)
221 #define aT02_re (+a20_re)
222 #define aT02_im (-a20_im)
223 #define aT10_re (+a01_re)
224 #define aT10_im (-a01_im)
225 #define aT11_re (+a11_re)
226 #define aT11_im (-a11_im)
227 #define aT12_re (+a21_re)
228 #define aT12_im (-a21_im)
229 #define aT20_re (+a02_re)
230 #define aT20_im (-a02_im)
231 #define aT21_re (+a12_re)
232 #define aT21_im (-a12_im)
233 #define aT22_re (+a22_re)
234 #define aT22_im (-a22_im)
235 
236 #define bT00_re (+b00_re)
237 #define bT00_im (-b00_im)
238 #define bT01_re (+b10_re)
239 #define bT01_im (-b10_im)
240 #define bT02_re (+b20_re)
241 #define bT02_im (-b20_im)
242 #define bT10_re (+b01_re)
243 #define bT10_im (-b01_im)
244 #define bT11_re (+b11_re)
245 #define bT11_im (-b11_im)
246 #define bT12_re (+b21_re)
247 #define bT12_im (-b21_im)
248 #define bT20_re (+b02_re)
249 #define bT20_im (-b02_im)
250 #define bT21_re (+b12_re)
251 #define bT21_im (-b12_im)
252 #define bT22_re (+b22_re)
253 #define bT22_im (-b22_im)
254 
255 #define cT00_re (+c00_re)
256 #define cT00_im (-c00_im)
257 #define cT01_re (+c10_re)
258 #define cT01_im (-c10_im)
259 #define cT02_re (+c20_re)
260 #define cT02_im (-c20_im)
261 #define cT10_re (+c01_re)
262 #define cT10_im (-c01_im)
263 #define cT11_re (+c11_re)
264 #define cT11_im (-c11_im)
265 #define cT12_re (+c21_re)
266 #define cT12_im (-c21_im)
267 #define cT20_re (+c02_re)
268 #define cT20_im (-c02_im)
269 #define cT21_re (+c12_re)
270 #define cT21_im (-c12_im)
271 #define cT22_re (+c22_re)
272 #define cT22_im (-c22_im)
273 
274 
/* Element access for the two staple temporaries (tempa, tempb) and the
 * fat-link accumulator (fat).  TEMPA0..TEMPA4 live in shared memory (defined
 * below); TEMPA5..TEMPA8 and the TEMPB/FAT registers are declared per-kernel. */
275 #define tempa00_re TEMPA0.x
276 #define tempa00_im TEMPA0.y
277 #define tempa01_re TEMPA1.x
278 #define tempa01_im TEMPA1.y
279 #define tempa02_re TEMPA2.x
280 #define tempa02_im TEMPA2.y
281 #define tempa10_re TEMPA3.x
282 #define tempa10_im TEMPA3.y
283 #define tempa11_re TEMPA4.x
284 #define tempa11_im TEMPA4.y
285 #define tempa12_re TEMPA5.x
286 #define tempa12_im TEMPA5.y
287 #define tempa20_re TEMPA6.x
288 #define tempa20_im TEMPA6.y
289 #define tempa21_re TEMPA7.x
290 #define tempa21_im TEMPA7.y
291 #define tempa22_re TEMPA8.x
292 #define tempa22_im TEMPA8.y
293 
294 #define tempb00_re TEMPB0.x
295 #define tempb00_im TEMPB0.y
296 #define tempb01_re TEMPB1.x
297 #define tempb01_im TEMPB1.y
298 #define tempb02_re TEMPB2.x
299 #define tempb02_im TEMPB2.y
300 #define tempb10_re TEMPB3.x
301 #define tempb10_im TEMPB3.y
302 #define tempb11_re TEMPB4.x
303 #define tempb11_im TEMPB4.y
304 #define tempb12_re TEMPB5.x
305 #define tempb12_im TEMPB5.y
306 #define tempb20_re TEMPB6.x
307 #define tempb20_im TEMPB6.y
308 #define tempb21_re TEMPB7.x
309 #define tempb21_im TEMPB7.y
310 #define tempb22_re TEMPB8.x
311 #define tempb22_im TEMPB8.y
312 
313 #define fat00_re FAT0.x
314 #define fat00_im FAT0.y
315 #define fat01_re FAT1.x
316 #define fat01_im FAT1.y
317 #define fat02_re FAT2.x
318 #define fat02_im FAT2.y
319 #define fat10_re FAT3.x
320 #define fat10_im FAT3.y
321 #define fat11_re FAT4.x
322 #define fat11_im FAT4.y
323 #define fat12_re FAT5.x
324 #define fat12_im FAT5.y
325 #define fat20_re FAT6.x
326 #define fat20_im FAT6.y
327 #define fat21_re FAT7.x
328 #define fat21_im FAT7.y
329 #define fat22_re FAT8.x
330 #define fat22_im FAT8.y
331 
/* First five TEMPA components are placed in the per-block shared-memory
 * array sd_data (one slot per thread, strided by blockDim.x) to relieve
 * register pressure; NUM_FLOATS is the number of such slots per thread and
 * sizes the sd_data allocation in the kernels.  TEMPA5..TEMPA8 remain
 * ordinary per-thread registers declared in each kernel body. */
332 #define NUM_FLOATS 5
333 #define TEMPA0 sd_data[threadIdx.x + 0*blockDim.x]
334 #define TEMPA1 sd_data[threadIdx.x + 1*blockDim.x ]
335 #define TEMPA2 sd_data[threadIdx.x + 2*blockDim.x ]
336 #define TEMPA3 sd_data[threadIdx.x + 3*blockDim.x ]
337 #define TEMPA4 sd_data[threadIdx.x + 4*blockDim.x ]
338 
339 
/* Coordinate bookkeeping for staggered-phase sign reconstruction.  These
 * macros track the neighbour coordinates (new_x1..new_x4) that correspond to
 * the index computed by the matching LLFAT_COMPUTE_NEW_IDX_* macro below.
 * When RECONSTRUCT == 18 no sign is needed and all expand to nothing.
 * NOTE(review): the z coordinate (case 2) is deliberately left untracked in
 * the MINUS/LOWER_STAPLE variants, and COMPUTE_RECONSTRUCT_SIGN below never
 * reads its i3 argument — consistent with the standard staggered phase
 * convention that does not depend on z; confirm against the original file. */
340 #undef UPDATE_COOR_PLUS
341 #undef UPDATE_COOR_MINUS
342 #undef UPDATE_COOR_LOWER_STAPLE
343 #undef UPDATE_COOR_LOWER_STAPLE_DIAG
344 #undef UPDATE_COOR_LOWER_STAPLE_EX
345 #undef COMPUTE_RECONSTRUCT_SIGN
346 #if (RECONSTRUCT != 18)
/* Neighbour at +n in direction mydir. */
347 #define UPDATE_COOR_PLUS(mydir, n, idx) do { \
348  new_x1 = x1; new_x2 = x2; new_x3=x3; new_x4 = x4; \
349  switch(mydir){ \
350  case 0: \
351  new_x1 = x1+n; \
352  break; \
353  case 1: \
354  new_x2 = x2+n; \
355  break; \
356  case 2: \
357  new_x3 = x3+n; \
358  break; \
359  case 3: \
360  new_x4 = x4+n; \
361  break; \
362  } \
363  }while(0)
364 
/* Neighbour at -1 in direction mydir (z not tracked; see note above). */
365 #define UPDATE_COOR_MINUS(mydir, idx) do { \
366  new_x1 = x1; new_x2 = x2; new_x4 = x4; \
367  switch(mydir){ \
368  case 0: \
369  new_x1 = x1-1; \
370  break; \
371  case 1: \
372  new_x2 = x2-1; \
373  break; \
374  case 2: \
375  break; \
376  case 3: \
377  new_x4 = x4-1; \
378  break; \
379  } \
380  }while(0)
381 
/* Coordinates of the lower-staple corner: -1 in mydir1 then +1 in mydir2.
 * The two switch orders mirror the index computation in
 * LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE; the doubly-out-of-boundary case is
 * handled by the _DIAG variant instead. */
382 #define UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2) do { \
383  new_x1 = x1; new_x2 = x2; new_x4 = x4; \
384  if(dimcomm[mydir1] == 0 || x[mydir1] > 0){ \
385  switch(mydir1){ \
386  case 0: \
387  new_x1 = x1 - 1; \
388  break; \
389  case 1: \
390  new_x2 = x2 - 1; \
391  break; \
392  case 2: \
393  break; \
394  case 3: \
395  new_x4 = x4 - 1; \
396  break; \
397  } \
398  switch(mydir2){ \
399  case 0: \
400  new_x1 = x1+1; \
401  break; \
402  case 1: \
403  new_x2 = x2+1; \
404  break; \
405  case 2: \
406  break; \
407  case 3: \
408  new_x4 = x4+1; \
409  break; \
410  } \
411  }else{ \
412  /*the case where both dir1/dir2 are out of boundary are dealed with a different macro (_DIAG)*/ \
413  switch(mydir2){ \
414  case 0: \
415  new_x1 = x1+1; \
416  break; \
417  case 1: \
418  new_x2 = x2+1; \
419  break; \
420  case 2: \
421  break; \
422  case 3: \
423  new_x4 = x4+1; \
424  break; \
425  } \
426  switch(mydir1){/*mydir1 is 0 here */ \
427  case 0: \
428  new_x1 = x1-1; \
429  break; \
430  case 1: \
431  new_x2 = x2-1; \
432  break; \
433  case 2: \
434  break; \
435  case 3: \
436  new_x4 = x4-1; \
437  break; \
438  } \
439  } \
440  }while(0)
441 
442 
443 
/* Corner site that is out of boundary in BOTH nu and mu: coordinate becomes
 * -1 in nu, 0 in mu (data lives in the diagonal ghost region).
 * NOTE(review): new_x[2] is never written and never read back — z is ignored
 * here too; dir1/dir2 parameters are unused by this macro. */
444 #define UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2) do { \
445  int new_x[4]; \
446  new_x[3] = x4; new_x[1] = x2; new_x[0] = x1; \
447  new_x[nu] = -1; \
448  new_x[mu] = 0; \
449  new_x1 = new_x[0]; \
450  new_x2 = new_x[1]; \
451  new_x4 = new_x[3]; \
452  }while(0)
453 
/* Extended-volume (no-wrap) variant of UPDATE_COOR_LOWER_STAPLE. */
454 #define UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2) do { \
455  new_x1 = x1; new_x2 = x2; new_x4 = x4; \
456  switch(mydir1){ \
457  case 0: \
458  new_x1 = x1 - 1; \
459  break; \
460  case 1: \
461  new_x2 = x2 - 1; \
462  break; \
463  case 2: \
464  break; \
465  case 3: \
466  new_x4 = x4 - 1; \
467  break; \
468  } \
469  switch(mydir2){ \
470  case 0: \
471  new_x1 = x1+1; \
472  break; \
473  case 1: \
474  new_x2 = x2+1; \
475  break; \
476  case 2: \
477  break; \
478  case 3: \
479  new_x4 = x4+1; \
480  break; \
481  } \
482  }while(0)
483 
/* Staggered phase sign for direction dir at site (i1,i2,i3,i4); i3 is
 * accepted for uniformity but not used.  The TUP case also accounts for the
 * antiperiodic temporal boundary at the global lattice ends (Pt0 / PtNm1). */
484 #define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) do { \
485  sign =1; \
486  switch(dir){ \
487  case XUP: \
488  if ( (i4 & 1) != 0){ \
489  sign = -1; \
490  } \
491  break; \
492  case YUP: \
493  if ( ((i4+i1) & 1) != 0){ \
494  sign = -1; \
495  } \
496  break; \
497  case ZUP: \
498  if ( ((i4+i1+i2) & 1) != 0){ \
499  sign = -1; \
500  } \
501  break; \
502  case TUP: \
503  if (i4 == X4m1 && PtNm1){ \
504  sign = -1; \
505  }else if(i4 == -1 && Pt0){ \
506  sign = -1; \
507  } \
508  break; \
509  } \
510  }while (0)
511 
512 
513 #else
514 
/* Full (18-number) storage: no sign or coordinate tracking required. */
515 #define UPDATE_COOR_PLUS(mydir, n, idx)
516 #define UPDATE_COOR_MINUS(mydir, idx)
517 #define UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2)
518 #define UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2)
519 #define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4)
520 #define UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2)
521 #endif
522 
/* Even/odd (checkerboard) memory-index computation for neighbour sites.
 * Input `idx` is the full lexicographic site index X; the result
 * new_mem_idx is a half-volume index (hence the >>1 / /2).  In the
 * MULTI_GPU build, a neighbour that crosses a partitioned boundary maps
 * into the ghost region appended after the Vh body sites; the
 * (expr)*dcomm + (expr)*(1-dcomm) pattern selects ghost vs. wrapped-local
 * index branchlessly per dimension.  Each macro ends by updating the
 * tracked neighbour coordinates via the matching UPDATE_COOR_* macro. */
523 #ifdef MULTI_GPU
524 
/* Neighbour at +n in mydir (forward face → ghost zone when partitioned). */
525 #define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, n, idx) do { \
526  switch(mydir){ \
527  case 0: \
528  new_mem_idx = (x1>=(X1-n))? ((Vh+Vsh_x+ spacecon_x)*xcomm+(idx-(X1-n))/2*(1-xcomm)):((idx+n)>>1); \
529  break; \
530  case 1: \
531  new_mem_idx = (x2>=(X2-n))? ((Vh+2*(Vsh_x)+Vsh_y+ spacecon_y)*ycomm+(idx-(X2-n)*X1)/2*(1-ycomm)):((idx+n*X1)>>1); \
532  break; \
533  case 2: \
534  new_mem_idx = (x3>=(X3-n))? ((Vh+2*(Vsh_x+Vsh_y)+Vsh_z+ spacecon_z))*zcomm+(idx-(X3-n)*X2X1)/2*(1-zcomm):((idx+n*X2X1)>>1); \
535  break; \
536  case 3: \
537  new_mem_idx = ( (x4>=(X4-n))? ((Vh+2*(Vsh_x+Vsh_y+Vsh_z)+Vsh_t+spacecon_t))*tcomm+(idx-(X4-n)*X3X2X1)/2*(1-tcomm): (idx+n*X3X2X1)>>1); \
538  break; \
539  } \
540  UPDATE_COOR_PLUS(mydir, n, idx); \
541  }while(0)
542 
543 
544 
/* Neighbour at -1 in mydir (backward face → ghost zone when partitioned). */
545 #define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx) do { \
546  switch(mydir){ \
547  case 0: \
548  new_mem_idx = (x1==0)?( (Vh+spacecon_x)*xcomm+(idx+X1m1)/2*(1-xcomm)):((idx-1) >> 1); \
549  break; \
550  case 1: \
551  new_mem_idx = (x2==0)?( (Vh+2*Vsh_x+spacecon_y)*ycomm+(idx+X2X1mX1)/2*(1-ycomm)):((idx-X1) >> 1); \
552  break; \
553  case 2: \
554  new_mem_idx = (x3==0)?((Vh+2*(Vsh_x+Vsh_y)+spacecon_z)*zcomm+(idx+X3X2X1mX2X1)/2*(1-zcomm)):((idx-X2X1) >> 1); \
555  break; \
556  case 3: \
557  new_mem_idx = (x4==0)?((Vh+2*(Vsh_x+Vsh_y+Vsh_z)+ spacecon_t)*tcomm + (idx+X4X3X2X1mX3X2X1)/2*(1-tcomm)):((idx-X3X2X1) >> 1); \
558  break; \
559  } \
560  UPDATE_COOR_MINUS(mydir, idx); \
561  }while(0)
562 
563 
/* Lower-staple corner: step -1 in mydir1 then +1 in mydir2, with at most one
 * of the two steps crossing a partitioned boundary (both crossing is handled
 * by the _DIAG macro).  Works on the full index until the final >>1. */
564 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2) do { \
565  int local_new_x1=x1; \
566  int local_new_x2=x2; \
567  int local_new_x3=x3; \
568  int local_new_x4=x4; \
569  new_mem_idx=X; \
570  if(dimcomm[mydir1] == 0 || x[mydir1] > 0){ \
571  switch(mydir1){/*mydir1 is not partitioned or x[mydir1]!= 0*/ \
572  case 0: \
573  new_mem_idx = (x1==0)?(new_mem_idx+X1m1):(new_mem_idx-1); \
574  local_new_x1 = (x1==0)?X1m1:(x1 - 1); \
575  break; \
576  case 1: \
577  new_mem_idx = (x2==0)?(new_mem_idx+X2X1mX1):(new_mem_idx-X1); \
578  local_new_x2 = (x2==0)?X2m1:(x2 - 1); \
579  break; \
580  case 2: \
581  new_mem_idx = (x3==0)?(new_mem_idx+X3X2X1mX2X1):(new_mem_idx-X2X1); \
582  local_new_x3 = (x3==0)?X3m1:(x3 -1); \
583  break; \
584  case 3: \
585  new_mem_idx = (x4==0)?(new_mem_idx+X4X3X2X1mX3X2X1):(new_mem_idx-X3X2X1); \
586  local_new_x4 = (x4==0)?X4m1:(x4 - 1); \
587  break; \
588  } \
589  switch(mydir2){ \
590  case 0: \
591  new_mem_idx = (x1==X1m1)?(2*(Vh+Vsh_x)+((local_new_x4*X3X2+local_new_x3*X2+local_new_x2)))*xcomm+(new_mem_idx-X1m1)*(1-xcomm):(new_mem_idx+1); \
592  break; \
593  case 1: \
594  new_mem_idx = (x2==X2m1)?(2*(Vh+2*(Vsh_x)+Vsh_y)+((local_new_x4*X3X1+local_new_x3*X1+local_new_x1)))*ycomm+(new_mem_idx-X2X1mX1)*(1-ycomm):(new_mem_idx+X1); \
595  break; \
596  case 2: \
597  new_mem_idx = (x3==X3m1)?(2*(Vh+2*(Vsh_x+Vsh_y)+Vsh_z)+((local_new_x4*X2X1+local_new_x2*X1+local_new_x1)))*zcomm+(new_mem_idx-X3X2X1mX2X1)*(1-zcomm):(new_mem_idx+X2X1); \
598  break; \
599  case 3: \
600  new_mem_idx = (x4==X4m1)?(2*(Vh+2*(Vsh_x+Vsh_y+Vsh_z)+Vsh_t)+((local_new_x3*X2X1+local_new_x2*X1+local_new_x1)))*tcomm+(new_mem_idx-X4X3X2X1mX3X2X1)*(1-tcomm):(new_mem_idx+X3X2X1); \
601  break; \
602  } \
603  }else{ \
604  /*the case where both dir1/dir2 are out of boundary are dealed with a different macro (_DIAG)*/ \
605  switch(mydir2){ /*mydir2 is not partitioned or x[mydir2]!= 0*/ \
606  case 0: \
607  new_mem_idx = (x1==X1m1)?(new_mem_idx-X1m1):(new_mem_idx+1); \
608  local_new_x1 = (x1==X1m1)?0:(x1+1); \
609  break; \
610  case 1: \
611  new_mem_idx = (x2==X2m1)?(new_mem_idx-X2X1mX1):(new_mem_idx+X1); \
612  local_new_x2 = (x2==X2m1)?0:(x2+1); \
613  break; \
614  case 2: \
615  new_mem_idx = (x3==X3m1)?(new_mem_idx-X3X2X1mX2X1):(new_mem_idx+X2X1); \
616  local_new_x3 = (x3==X3m1)?0:(x3+1); \
617  break; \
618  case 3: \
619  new_mem_idx = (x4==X4m1)?(new_mem_idx-X4X3X2X1mX3X2X1):(new_mem_idx+X3X2X1); \
620  local_new_x4 = (x4==X4m1)?0:(x4+1); \
621  break; \
622  } \
623  switch(mydir1){/*mydir1 is 0 here */ \
624  case 0: \
625  new_mem_idx = (x1==0)?(2*(Vh)+(local_new_x4*X3X2+local_new_x3*X2+local_new_x2))*xcomm+(new_mem_idx+X1m1)*(1-xcomm):(new_mem_idx -1); \
626  break; \
627  case 1: \
628  new_mem_idx = (x2==0)?(2*(Vh+2*Vsh_x)+(local_new_x4*X3X1+local_new_x3*X1+local_new_x1))*ycomm+(new_mem_idx+X2X1mX1)*(1-ycomm):(new_mem_idx-X1); \
629  break; \
630  case 2: \
631  new_mem_idx = (x3==0)?(2*(Vh+2*(Vsh_x+Vsh_y))+(local_new_x4*X2X1+local_new_x2*X1+local_new_x1))*zcomm+(new_mem_idx+X3X2X1mX2X1)*(1-zcomm):(new_mem_idx-X2X1); \
632  break; \
633  case 3: \
634  new_mem_idx = (x4==0)?(2*(Vh+2*(Vsh_x+Vsh_y+Vsh_z))+(local_new_x3*X2X1+local_new_x2*X1+local_new_x1))*tcomm+(new_mem_idx+X4X3X2X1mX3X2X1)*(1-tcomm):(new_mem_idx-X3X2X1); \
635  break; \
636  } \
637  } \
638  new_mem_idx = new_mem_idx >> 1; \
639  UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2); \
640  }while(0)
641 
642 
643 
644 
/* Corner out of boundary in both nu and mu: index into the per-mu diagonal
 * ghost buffer appended after all face ghost zones. */
645 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2) do { \
646  new_mem_idx = Vh+2*(Vsh_x+Vsh_y+Vsh_z+Vsh_t) + mu*Vh_2d_max + ((x[dir2]*Z[dir1] + x[dir1])>>1); \
647  UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2); \
648  }while(0)
649 
650 
651 #else
652 
/* Single-GPU variants: plain periodic wrap-around, no ghost zones. */
653 #define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, n, idx) do { \
654  switch(mydir){ \
655  case 0: \
656  new_mem_idx = ( (x1>=(X1-n))?idx-(X1-n):idx+n)>>1; \
657  break; \
658  case 1: \
659  new_mem_idx = ( (x2>=(X2-n))?idx-(X2-n)*X1:idx+n*X1)>>1; \
660  break; \
661  case 2: \
662  new_mem_idx = ( (x3>=(X3-n))?idx-(X3-n)*X2X1:idx+n*X2X1)>>1; \
663  break; \
664  case 3: \
665  new_mem_idx = ( (x4>=(X4-n))?idx-(X4-n)*X3X2X1 : idx+n*X3X2X1)>>1; \
666  break; \
667  } \
668  UPDATE_COOR_PLUS(mydir, n, idx); \
669  }while(0)
670 
671 
672 #define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx) do { \
673  switch(mydir){ \
674  case 0: \
675  new_mem_idx = ( (x1==0)?idx+X1m1:idx-1) >> 1; \
676  break; \
677  case 1: \
678  new_mem_idx = ( (x2==0)?idx+X2X1mX1:idx-X1) >> 1; \
679  break; \
680  case 2: \
681  new_mem_idx = ( (x3==0)?idx+X3X2X1mX2X1:idx-X2X1) >> 1; \
682  break; \
683  case 3: \
684  new_mem_idx = ( (x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1) >> 1; \
685  break; \
686  } \
687  UPDATE_COOR_MINUS(mydir, idx); \
688  }while(0)
689 
690 
691 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2) do { \
692  switch(mydir1){ \
693  case 0: \
694  new_mem_idx = ( (x1==0)?X+X1m1:X-1); \
695  break; \
696  case 1: \
697  new_mem_idx = ( (x2==0)?X+X2X1mX1:X-X1); \
698  break; \
699  case 2: \
700  new_mem_idx = ( (x3==0)?X+X3X2X1mX2X1:X-X2X1); \
701  break; \
702  case 3: \
703  new_mem_idx = ((x4==0)?X+X4X3X2X1mX3X2X1:X-X3X2X1); \
704  break; \
705  } \
706  switch(mydir2){ \
707  case 0: \
708  new_mem_idx = ( (x1==X1m1)?new_mem_idx-X1m1:new_mem_idx+1)>> 1; \
709  break; \
710  case 1: \
711  new_mem_idx = ( (x2==X2m1)?new_mem_idx-X2X1mX1:new_mem_idx+X1) >> 1; \
712  break; \
713  case 2: \
714  new_mem_idx = ( (x3==X3m1)?new_mem_idx-X3X2X1mX2X1:new_mem_idx+X2X1) >> 1; \
715  break; \
716  case 3: \
717  new_mem_idx = ( (x4==X4m1)?new_mem_idx-X4X3X2X1mX3X2X1:new_mem_idx+X3X2X1) >> 1; \
718  break; \
719  } \
720  UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2); \
721  }while(0)
722 
723 #endif
724 
725 
/* Extended-volume (_EX) index macros: the lattice is padded with a border
 * (dimensions E1..E3E2E1), so neighbour indexing never wraps and needs no
 * boundary tests — a straight stride add/subtract followed by >>1 to get the
 * checkerboard index. */
726 #define LLFAT_COMPUTE_NEW_IDX_PLUS_EX(mydir, n, idx) do { \
727  switch(mydir){ \
728  case 0: \
729  new_mem_idx = (idx+n)>>1; \
730  break; \
731  case 1: \
732  new_mem_idx = (idx+n*E1)>>1; \
733  break; \
734  case 2: \
735  new_mem_idx = (idx+n*E2E1)>>1; \
736  break; \
737  case 3: \
738  new_mem_idx = (idx+n*E3E2E1)>>1; \
739  break; \
740  } \
741  UPDATE_COOR_PLUS(mydir, n, idx); \
742  }while(0)
743 
744 #define LLFAT_COMPUTE_NEW_IDX_MINUS_EX(mydir, idx) do { \
745  switch(mydir){ \
746  case 0: \
747  new_mem_idx = (idx-1) >> 1; \
748  break; \
749  case 1: \
750  new_mem_idx = (idx-E1) >> 1; \
751  break; \
752  case 2: \
753  new_mem_idx = (idx-E2E1) >> 1; \
754  break; \
755  case 3: \
756  new_mem_idx = (idx-E3E2E1) >> 1; \
757  break; \
758  } \
759  UPDATE_COOR_MINUS(mydir, idx); \
760  }while(0)
761 
762 
/* Lower-staple corner in the extended volume: -1 in mydir1, +1 in mydir2. */
763 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_EX(mydir1, mydir2) do { \
764  switch(mydir1){ \
765  case 0: \
766  new_mem_idx = X-1; \
767  break; \
768  case 1: \
769  new_mem_idx = X-E1; \
770  break; \
771  case 2: \
772  new_mem_idx = X-E2E1; \
773  break; \
774  case 3: \
775  new_mem_idx = X-E3E2E1; \
776  break; \
777  } \
778  switch(mydir2){ \
779  case 0: \
780  new_mem_idx = (new_mem_idx+1)>> 1; \
781  break; \
782  case 1: \
783  new_mem_idx = (new_mem_idx+E1) >> 1; \
784  break; \
785  case 2: \
786  new_mem_idx = (new_mem_idx+E2E1) >> 1; \
787  break; \
788  case 3: \
789  new_mem_idx = (new_mem_idx+E3E2E1) >> 1; \
790  break; \
791  } \
792  UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2); \
793  }while(0)
794 
795 
796 
797 
/* Kernel: compute the (mu,nu) staple from site links for one parity, add it
 * (scaled by mycoeff) into the fat link, and store the staple for reuse.
 * One thread per even-parity site; exterior-kernel variants keep only the
 * threads on the corresponding boundary slice.
 * NOTE(review): this rendered listing omits several original lines (the
 * number jumps 802->804, 807->810, 833->838, 874->877, 894->896, 914->916,
 * 917->919 indicate missing signature tail, declarations and macro
 * invocations) — consult the original header before editing. */
798 template<int mu, int nu, int odd_bit>
799  __global__ void
800  LLFAT_KERNEL(do_siteComputeGenStapleParity, RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd,
801  const FloatN* sitelink_even, const FloatN* sitelink_odd,
802  FloatM* fatlink_even, FloatM* fatlink_odd,
804 {
/* Per-thread slots TEMPA0..TEMPA4 live here; see NUM_FLOATS above. */
805  __shared__ FloatM sd_data[NUM_FLOATS*BLOCK_DIM];
806 
807  //FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
810  //FloatM STAPLE6, STAPLE7, STAPLE8;
811 
812  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
813 
/* Decompose the half-volume index into lattice coordinates (x1..x4). */
814  int z1 = mem_idx / X1h;
815  short x1h = mem_idx - z1*X1h;
816  int z2 = z1 / X2;
817  short x2 = z1 - z2*X2;
818  short x4 = z2 / X3;
819  short x3 = z2 - x4*X3;
820 
821  short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
822  short x1 = 2*x1h + x1odd;
/* X is the full lexicographic site index. */
823  int X = 2*mem_idx + x1odd;
824 
/* Exterior kernels process only their boundary slice; all other threads exit. */
825  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_X && x1 != X1m1) return;
826  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_X && x1 != 0) return;
827  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Y && x2 != X2m1) return;
828  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Y && x2 != 0) return;
829  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Z && x3 != X3m1) return;
830  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Z && x3 != 0) return;
831  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_T && x4 != X4m1) return;
832  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_T && x4 != 0) return;
833 
838 
839  //int x[4] = {x1,x2,x3, x4};
840 #ifdef MULTI_GPU
/* Space-like 3D half-indices into the ghost faces for each dimension. */
841  int Z[4] ={X1,X2,X3,X4};
842  int spacecon_x = (x4*X3X2+x3*X2+x2)>>1;
843  int spacecon_y = (x4*X3X1+x3*X1+x1)>>1;
844  int spacecon_z = (x4*X2X1+x2*X1+x1)>>1;
845  int spacecon_t = (x3*X2X1+x2*X1+x1)>>1;
846 #endif
847 
848  /* Upper staple */
849  /* Computes the staple :
850  * mu (B)
851  * +-------+
852  * nu | |
853  * (A) | |(C)
854  * X X
855  *
856  */
857 
858  {
859  /* load matrix A*/
860  LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
861  COMPUTE_RECONSTRUCT_SIGN(sign, nu, x1, x2, x3, x4);
862  RECONSTRUCT_SITE_LINK(sign, a);
863 
864 
865  /* load matrix B*/
866  LLFAT_COMPUTE_NEW_IDX_PLUS(nu, 1, X);
867  LOAD_ODD_SITE_MATRIX(mu, new_mem_idx, B);
868  COMPUTE_RECONSTRUCT_SIGN(sign, mu, new_x1, new_x2, new_x3, new_x4);
869  RECONSTRUCT_SITE_LINK(sign, b);
870 
871 
872  MULT_SU3_NN(a, b, tempa);
873 
874  /* load matrix C*/
875 
/* NOTE(review): the index computation for C (original line 876) is omitted
 * from this rendering; new_mem_idx here is set by the missing line. */
877  LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
878  COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
879  RECONSTRUCT_SITE_LINK(sign, c);
880 
881  MULT_SU3_NA(tempa, c, staple);
882  }
883 
884  /***************lower staple****************
885  *
886  * X X
887  * nu | |
888  * (A) | | (C)
889  * +-------+
890  * mu (B)
891  *
892  *********************************************/
893  {
894  /* load matrix A*/
896 
897  LOAD_ODD_SITE_MATRIX(nu, (new_mem_idx), A);
898  COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
899  RECONSTRUCT_SITE_LINK(sign, a);
900 
901  /* load matrix B*/
902  LOAD_ODD_SITE_MATRIX(mu, (new_mem_idx), B);
903  COMPUTE_RECONSTRUCT_SIGN(sign, mu, new_x1, new_x2, new_x3, new_x4);
904  RECONSTRUCT_SITE_LINK(sign, b);
905 
906  MULT_SU3_AN(a, b, tempa);
907 
908  /* load matrix C*/
909  //if(x[nu] == 0 && x[mu] == Z[mu] - 1){
910 #ifdef MULTI_GPU
/* Doubly-out-of-boundary corner goes through the diagonal ghost buffer. */
911  if(dimcomm[nu] && dimcomm[mu] && x[nu] == 0 && x[mu] == Z[mu] - 1){
912  int idx = nu*4+mu;
913  LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1_array[idx], dir2_array[idx]);
914  }else{
916  }
917 #else
919 #endif
920 
921  LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
922 
923  COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
924  RECONSTRUCT_SITE_LINK(sign, c);
925 
926 
927  MULT_SU3_NN(tempa, c, b);
928  LLFAT_ADD_SU3_MATRIX(b, staple, staple);
929  }
930 
/* Interior kernel accumulates the staple into the fat link; the staple
 * itself is always written out for reuse by later levels. */
931  if(kparam.kernel_type == LLFAT_INTERIOR_KERNEL){
932  LOAD_EVEN_FAT_MATRIX(mu, mem_idx);
933  SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
934  WRITE_FAT_MATRIX(fatlink_even,mu, mem_idx);
935  }
936  WRITE_STAPLE_MATRIX(staple_even, mem_idx);
937 
938  return;
939 }
940 
/* Kernel: same staple construction as do_siteComputeGenStapleParity, but the
 * mu-direction links come from a previously computed staple field ("mulink",
 * loaded as BB) instead of the site links.  save_staple selects whether the
 * new staple is stored (for further smearing levels) or only accumulated
 * into the fat link via the tempb temporary.
 * NOTE(review): this rendering omits several original lines (number jumps
 * 946->948, 951->954, 1012->1014, 1036->1038, 1043->1045, 1055->1057,
 * 1058->1060) — signature tail and some index computations are missing. */
941 template<int mu, int nu, int odd_bit, int save_staple>
942  __global__ void
943  LLFAT_KERNEL(do_computeGenStapleFieldParity,RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd,
944  const FloatN* sitelink_even, const FloatN* sitelink_odd,
945  FloatM* fatlink_even, FloatM* fatlink_odd,
946  const FloatM* mulink_even, const FloatM* mulink_odd,
948 {
949  __shared__ FloatM sd_data[NUM_FLOATS*BLOCK_DIM];
950  //FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
/* TEMPA0..TEMPA4 live in shared memory; only the tail stays in registers. */
951  FloatM TEMPA5, TEMPA6, TEMPA7, TEMPA8;
954  //FloatM STAPLE6, STAPLE7, STAPLE8;
955 
956  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
957 
/* Decompose the half-volume index into lattice coordinates (x1..x4). */
958  int z1 = mem_idx / X1h;
959  int x1h = mem_idx - z1*X1h;
960  int z2 = z1 / X2;
961  int x2 = z1 - z2*X2;
962  int x4 = z2 / X3;
963  int x3 = z2 - x4*X3;
964 
965  int x1odd = (x2 + x3 + x4 + odd_bit) & 1;
966  int x1 = 2*x1h + x1odd;
967  int X = 2*mem_idx + x1odd;
968 
/* Exterior kernels process only their boundary slice; others exit early. */
969  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_X && x1 != X1m1) return;
970  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_X && x1 != 0) return;
971  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Y && x2 != X2m1) return;
972  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Y && x2 != 0) return;
973  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Z && x3 != X3m1) return;
974  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Z && x3 != 0) return;
975  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_T && x4 != X4m1) return;
976  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_T && x4 != 0) return;
977 
979 #ifdef MULTI_GPU
980  int Z[4] ={X1,X2,X3,X4};
981  int spacecon_x = (x4*X3X2+x3*X2+x2)>>1;
982  int spacecon_y = (x4*X3X1+x3*X1+x1)>>1;
983  int spacecon_z = (x4*X2X1+x2*X1+x1)>>1;
984  int spacecon_t = (x3*X2X1+x2*X1+x1)>>1;
985 #endif
986 
987  int new_mem_idx;
990 
991  /* Upper staple */
992  /* Computes the staple :
993  * mu (BB)
994  * +-------+
995  * nu | |
996  * (A) | |(C)
997  * X X
998  *
999  */
1000  {
1001  /* load matrix A*/
1002  LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
1003  COMPUTE_RECONSTRUCT_SIGN(sign, nu, x1, x2, x3, x4);
1004  RECONSTRUCT_SITE_LINK(sign, a);
1005 
1006  /* load matrix BB*/
1007  LLFAT_COMPUTE_NEW_IDX_PLUS(nu, 1, X);
/* mulink matrices are always stored in full; no reconstruction needed. */
1008  LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);
1009 
1010  MULT_SU3_NN(a, bb, tempa);
1011 
1012  /* load matrix C*/
1014  LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
1015  COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
1016  RECONSTRUCT_SITE_LINK(sign, c);
1017  if (save_staple){
1018  MULT_SU3_NA(tempa, c, staple);
1019  }else{
1020  MULT_SU3_NA(tempa, c, tempb);
1021  }
1022  }
1023 
1024  /***************lower staple****************
1025  *
1026  * X X
1027  * nu | |
1028  * (A) | | (C)
1029  * +-------+
1030  * mu (B)
1031  *
1032  *********************************************/
1033 
1034 
1035  {
1036  /* load matrix A*/
1038 
1039  LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, A);
1040  COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
1041  RECONSTRUCT_SITE_LINK(sign, a);
1042 
1043  /* load matrix B*/
1045  LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);
1046 
1047  MULT_SU3_AN(a, bb, tempa);
1048 
1049  /* load matrix C*/
1050  //if(x[nu] == 0 && x[mu] == Z[mu] - 1){
1051 #ifdef MULTI_GPU
/* Doubly-out-of-boundary corner goes through the diagonal ghost buffer. */
1052  if(dimcomm[nu] && dimcomm[mu] && x[nu] == 0 && x[mu] == Z[mu] - 1){
1053  int idx = nu*4+mu;
1054  LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1_array[idx], dir2_array[idx]);
1055  }else{
1057  }
1058 #else
1060 #endif
1061 
1062  LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
1063  COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
1064  RECONSTRUCT_SITE_LINK(sign, c);
1065 
/* Reuse the a registers for the lower-staple product. */
1066  MULT_SU3_NN(tempa, c, a);
1067  if(save_staple){
1068  LLFAT_ADD_SU3_MATRIX(staple, a, staple);
1069  }else{
1070  LLFAT_ADD_SU3_MATRIX(a, tempb, tempb);
1071  }
1072  }
1073 
1074  LOAD_EVEN_FAT_MATRIX(mu, mem_idx);
1075  if(save_staple){
1076  if(kparam.kernel_type == LLFAT_INTERIOR_KERNEL){
1077  SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
1078  }
1079  WRITE_STAPLE_MATRIX(staple_even, mem_idx);
1080  }else{
1081  if(kparam.kernel_type == LLFAT_INTERIOR_KERNEL){
1082  SCALAR_MULT_ADD_SU3_MATRIX(fat, tempb, mycoeff, fat);
1083  }else{
1084  //The code should never be here
1085  //because it makes no sense to split kernels when no staple is stored
1086  //print error?
1087  }
1088  }
1089 
1090  WRITE_FAT_MATRIX(fatlink_even, mu, mem_idx);
1091 
1092  return;
1093 }
1094 
/* Kernel: one-link contribution to the fat link.  Each thread handles one
 * site (both parities covered by a single grid: indices >= Vh map to the odd
 * sublattice) and, for all four directions, scales the site link by
 * (coeff0 - 6*coeff5) into the fat-link field.
 * NOTE(review): this rendering omits original lines 1098-1099 (signature
 * tail and opening brace) and 1132 — consult the original header. */
1095 __global__ void
1096 LLFAT_KERNEL(llfatOneLink, RECONSTRUCT)(const FloatN* sitelink_even, const FloatN* sitelink_odd,
1097  FloatM* fatlink_even, FloatM* fatlink_odd,
1100  const FloatN* my_sitelink;
1101  FloatM* my_fatlink;
1102  int sid = blockIdx.x*blockDim.x + threadIdx.x;
1103 
1104  if(sid >= threads) return;
1105 
1106  int mem_idx = sid;
1107 
1108 #if (RECONSTRUCT != 18)
1109  int odd_bit= 0;
1110 #endif
1111 
/* Default to the even sublattice; switch to odd for the upper half-grid. */
1112  my_sitelink = sitelink_even;
1113  my_fatlink = fatlink_even;
1114  if (mem_idx >= Vh){
1115 #if (RECONSTRUCT != 18)
1116  odd_bit=1;
1117 #endif
1118  mem_idx = mem_idx - Vh;
1119  my_sitelink = sitelink_odd;
1120  my_fatlink = fatlink_odd;
1121  }
1122 
/* Coordinates are only needed for the staggered-phase sign when the links
 * are stored compressed (RECONSTRUCT != 18). */
1123 #if (RECONSTRUCT != 18)
1124  int z1 = mem_idx / X1h;
1125  int x1h = mem_idx - z1*X1h;
1126  int z2 = z1 / X2;
1127  int x2 = z1 - z2*X2;
1128  int x4 = z2 / X3;
1129  int x3 = z2 - x4*X3;
1130  int x1odd = (x2 + x3 + x4 + odd_bit) & 1;
1131  int x1 = 2*x1h + x1odd;
1133 #endif
1134 
1135  for(int dir=0;dir < 4; dir++){
1136  LOAD_SITE_MATRIX(my_sitelink, dir, mem_idx, A);
1137  COMPUTE_RECONSTRUCT_SIGN(sign, dir, x1, x2, x3, x4);
1138  RECONSTRUCT_SITE_LINK(sign, a);
1139 
1140  LOAD_FAT_MATRIX(my_fatlink, dir, mem_idx);
1141 
1142  SCALAR_MULT_SU3_MATRIX((coeff0 - 6.0*coeff5), a, fat);
1143 
1144  WRITE_FAT_MATRIX(my_fatlink,dir, mem_idx);
1145  }
1146 
1147  return;
1148 }
1149 
1150 
1151 
1152 
/* do_siteComputeGenStapleParity (extended-volume variant):
 * for one parity, builds the (mu,nu) staple directly from the site
 * links -- upper staple U_nu(x) * U_mu(x+nu) * U_nu(x+mu)^dagger plus
 * the corresponding lower staple (see the ASCII diagrams below) --
 * accumulates mycoeff * staple into the fat link for interior sites,
 * and stores the staple for reuse by later kernels.
 * Template parameters: mu, nu are the staple directions; odd_bit is
 * the parity this instantiation handles.
 * NOTE(review): the doc-page extraction dropped original lines in this
 * kernel: 1158 (trailing parameters; per the page's cross references
 * presumably "Float mycoeff, llfat_kernel_param_t kparam)") and
 * 1189/1221/1242/1257 (presumably DECLARE_VAR_SIGN and the
 * LLFAT_COMPUTE_NEW_IDX_* invocations that set new_mem_idx / new_x*
 * before their uses below).  Restore from upstream QUDA before use. */
template<int mu, int nu, int odd_bit>
  __global__ void
  LLFAT_KERNEL_EX(do_siteComputeGenStapleParity, RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd,
    const FloatN* sitelink_even, const FloatN* sitelink_odd,
    FloatM* fatlink_even, FloatM* fatlink_odd,
{
#if 1
  extern __shared__ FloatM sd_data[]; //sd_data is a macro name defined in llfat_quda.cu


  //FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
  FloatM TEMPA5, TEMPA6, TEMPA7, TEMPA8;

  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
  if(mem_idx >= kparam.threads) return;

  /* Decompose the checkerboard thread index into local coordinates
   * (D1..D3 are the local dimensions from kparam, D1h = D1/2). */
  int z1 = mem_idx/D1h;
  short x1h = mem_idx - z1*D1h;
  int z2 = z1/D2;
  short x2 = z1 - z2*D2;
  short x4 = z2/D3;
  short x3 = z2 - x4*D3;

  short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
  short x1 = 2*x1h + x1odd;

  /* Shift into the extended (ghost-padded) volume and linearize;
   * E1, E2E1, E3E2E1 are the extended-volume strides. */
  x1 += kparam.base_idx;
  x2 += kparam.base_idx;
  x3 += kparam.base_idx;
  x4 += kparam.base_idx;
  int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1;
  mem_idx = X/2;

  int new_mem_idx;
  DECLARE_NEW_X;

  /* Upper staple */
  /* Computes the staple :
   *                 mu (B)
   *               +-------+
   *       nu      |       |
   *         (A)   |       |(C)
   *               X       X
   *
   */

  {
    /* load matrix A*/
    LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, (x1-2), (x2-2), (x3-2), (x4-2));
    RECONSTRUCT_SITE_LINK(sign, a);



    /* load matrix B*/
    LLFAT_COMPUTE_NEW_IDX_PLUS_EX(nu, 1, X);
    LOAD_ODD_SITE_MATRIX(mu, new_mem_idx, B);
    COMPUTE_RECONSTRUCT_SIGN(sign, mu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
    RECONSTRUCT_SITE_LINK(sign, b);


    /* tempa = A * B */
    MULT_SU3_NN(a, b, tempa);

    /* load matrix C*/

    LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
    RECONSTRUCT_SITE_LINK(sign, c);

    /* staple = tempa * C^dagger */
    MULT_SU3_NA(tempa, c, staple);

  }

  /***************lower staple****************
   *
   *               X       X
   *       nu      |       |
   *         (A)   |       | (C)
   *               +-------+
   *                  mu (B)
   *
   *********************************************/
  {
    /* load matrix A*/

    LOAD_ODD_SITE_MATRIX(nu, (new_mem_idx), A);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
    RECONSTRUCT_SITE_LINK(sign, a);

    /* load matrix B*/
    LOAD_ODD_SITE_MATRIX(mu, (new_mem_idx), B);
    COMPUTE_RECONSTRUCT_SIGN(sign, mu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
    RECONSTRUCT_SITE_LINK(sign, b);

    /* tempa = A^dagger * B */
    MULT_SU3_AN(a, b, tempa);



    /* load matrix C*/
    LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);

    COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
    RECONSTRUCT_SITE_LINK(sign, c);


    /* add the lower staple to the upper one */
    MULT_SU3_NN(tempa, c, b);
    LLFAT_ADD_SU3_MATRIX(b, staple, staple);

  }


  /* Accumulate into the fat link only for sites strictly inside the
   * local volume (coordinates on the edge of the extended region are
   * skipped); orig_idx re-linearizes into the unpadded local volume. */
  if( !(x1 == 1 || x1 == X1 + 2 || x2 == 1 || x2 == X2 + 2
        || x3 == 1 || x3 == X3 + 2 || x4 == 1 || x4 == X4 + 2)){
    int orig_idx = ((x4-2)* X3X2X1 + (x3-2)*X2X1 + (x2-2)*X1 + (x1-2))>>1;

    LOAD_EVEN_FAT_MATRIX(mu, orig_idx);
    SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
    WRITE_FAT_MATRIX(fatlink_even,mu, orig_idx);
  }
  /* Always store the staple itself for later reuse. */
  WRITE_STAPLE_MATRIX(staple_even, mem_idx);

#endif

  return;
}
1284 
/* do_computeGenStapleFieldParity (extended-volume variant):
 * same staple geometry as do_siteComputeGenStapleParity, but the
 * "rungs" of the staple (BB) come from a previously computed mulink
 * field rather than from the raw site links -- this is how higher
 * (5-link, 7-link, ...) terms are built from lower ones.  Accumulates
 * mycoeff * staple into the fat link for interior sites; the staple is
 * additionally stored when the save_staple template flag is set.
 * NOTE(review): the doc-page extraction dropped original lines here:
 * 1291 (trailing parameters; per the page's cross references presumably
 * "Float mycoeff, llfat_kernel_param_t kparam)") and
 * 1321/1345/1365/1372/1379 (presumably DECLARE_VAR_SIGN and the
 * LLFAT_COMPUTE_NEW_IDX_* invocations that set new_mem_idx / new_x*
 * before their uses below).  Restore from upstream QUDA before use. */
template<int mu, int nu, int odd_bit, int save_staple>
  __global__ void
  LLFAT_KERNEL_EX(do_computeGenStapleFieldParity,RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd,
    const FloatN* sitelink_even, const FloatN* sitelink_odd,
    FloatM* fatlink_even, FloatM* fatlink_odd,
    const FloatM* mulink_even, const FloatM* mulink_odd,
{
#if 1
  //__shared__ FloatM sd_data[NUM_FLOATS*64];
  extern __shared__ FloatM sd_data[]; //sd_data is a macro name defined in llfat_quda.cu
  //FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
  FloatM TEMPA5, TEMPA6, TEMPA7, TEMPA8;

  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
  if(mem_idx >= kparam.threads) return;

  /* Decompose the checkerboard thread index into local coordinates. */
  int z1 = mem_idx/D1h;
  short x1h = mem_idx - z1*D1h;
  int z2 = z1/D2;
  short x2 = z1 - z2*D2;
  short x4 = z2/D3;
  short x3 = z2 - x4*D3;

  short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
  short x1 = 2*x1h + x1odd;

  /* Shift into the extended (ghost-padded) volume and linearize. */
  x1 += kparam.base_idx;
  x2 += kparam.base_idx;
  x3 += kparam.base_idx;
  x4 += kparam.base_idx;
  int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1;
  mem_idx = X/2;

  int new_mem_idx;
  DECLARE_NEW_X;

  /* Upper staple */
  /* Computes the staple :
   *                 mu (BB)
   *               +-------+
   *       nu      |       |
   *         (A)   |       |(C)
   *               X       X
   *
   */
  {
    /* load matrix A*/
    LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, (x1-2), (x2-2), (x3-2), (x4-2));
    RECONSTRUCT_SITE_LINK(sign, a);

    /* load matrix BB -- from the precomputed mulink field, so no
     * reconstruction step is needed for it */
    LLFAT_COMPUTE_NEW_IDX_PLUS_EX(nu, 1, X);
    LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);
    MULT_SU3_NN(a, bb, tempa);

    /* load matrix C*/
    LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
    RECONSTRUCT_SITE_LINK(sign, c);

    /* staple = A * BB * C^dagger */
    MULT_SU3_NA(tempa, c, staple);
  }

  /***************lower staple****************
   *
   *               X       X
   *       nu      |       |
   *         (A)   |       | (C)
   *               +-------+
   *                  mu (B)
   *
   *********************************************/

  {
    /* load matrix A*/

    LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, A);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
    RECONSTRUCT_SITE_LINK(sign, a);

    /* load matrix B -- again from the mulink field */
    LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);

    MULT_SU3_AN(a, bb, tempa);

    /* load matrix C*/


    LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
    RECONSTRUCT_SITE_LINK(sign, c);

    /* add the lower staple to the upper one */
    MULT_SU3_NN(tempa, c, a);

    LLFAT_ADD_SU3_MATRIX(a, staple, staple);
  }


  /* Accumulate into the fat link only for sites strictly inside the
   * local volume; edge sites of the extended region are skipped. */
  if( !(x1 == 1 || x1 == X1 + 2 || x2 == 1 || x2 == X2 + 2
        || x3 == 1 || x3 == X3 + 2 || x4 == 1 || x4 == X4 + 2)){
    int orig_idx = ((x4-2)* X3X2X1 + (x3-2)*X2X1 + (x2-2)*X1 + (x1-2))>>1;
    LOAD_EVEN_FAT_MATRIX(mu, orig_idx);
    SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
    WRITE_FAT_MATRIX(fatlink_even, mu, orig_idx);
  }

  /* Optionally store the staple for reuse by a later, longer-path pass. */
  if(save_staple){
    WRITE_STAPLE_MATRIX(staple_even, mem_idx);
  }
#endif

  return;
}
1406 
1407 
/* llfatOneLink (extended-volume variant): one-link term of the fat
 * links for multi-GPU runs.  Both parities are covered by a single
 * grid (sid < kparam.threads is even, otherwise odd).  Site links are
 * read at the extended-volume index mem_idx; fat links are written at
 * the unpadded local index idx.
 * NOTE(review): the doc-page extraction dropped original line 1411
 * here -- presumably the trailing kernel parameters
 * "Float coeff0, Float coeff5, llfat_kernel_param_t kparam)".
 * Restore from upstream QUDA before compiling. */
__global__ void
LLFAT_KERNEL_EX(llfatOneLink, RECONSTRUCT)(const FloatN* sitelink_even, const FloatN* sitelink_odd,
  FloatM* fatlink_even, FloatM* fatlink_odd,
{
#if 1

  const FloatN* my_sitelink;   /* site-link field for this thread's parity */
  FloatM* my_fatlink;          /* fat-link field for this thread's parity */
  int sid = blockIdx.x*blockDim.x + threadIdx.x;
  int idx = sid;

  if(sid >= 2*kparam.threads) return;  /* one grid covers both parities */

  short odd_bit= 0;

  my_sitelink = sitelink_even;
  my_fatlink = fatlink_even;
  if (idx >= kparam.threads){
    odd_bit=1;
    idx = idx - kparam.threads;  /* rebase into the odd half-lattice */
    my_sitelink = sitelink_odd;
    my_fatlink = fatlink_odd;
  }

  /* Recover local 4-d coordinates from the checkerboard index. */
  int z1 = idx/D1h;
  short x1h = idx - z1*D1h;
  int z2 = z1/D2;
  short x2 = z1 - z2*D2;
  int x4 = z2/D3;
  short x3 = z2 - x4*D3;
  short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
  short x1 = 2*x1h + x1odd;

  /* Shift into the extended (ghost-padded) volume for the link reads. */
  x1 += kparam.base_idx;
  x2 += kparam.base_idx;
  x3 += kparam.base_idx;
  x4 += kparam.base_idx;
  int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1;
  int mem_idx = X/2;

  for(int dir=0;dir < 4; dir++){
    LOAD_SITE_MATRIX(my_sitelink, dir, mem_idx, A);
    /* sign is computed from the unshifted (interior) coordinates */
    COMPUTE_RECONSTRUCT_SIGN(sign, dir, (x1-2), (x2-2), (x3-2), (x4-2));
    RECONSTRUCT_SITE_LINK(sign, a);

    LOAD_FAT_MATRIX(my_fatlink, dir, idx);

    /* fat = (coeff0 - 6*coeff5) * U_dir(x): the one-link contribution */
    SCALAR_MULT_SU3_MATRIX((coeff0 - 6.0*coeff5), a, fat);

    WRITE_FAT_MATRIX(my_fatlink,dir, idx);
  }
#endif

  return;
}
1465 
1466 
/* computeLongLinkParity: for each direction dir, forms the product of
 * three consecutive links  U_dir(x) U_dir(x+dir) U_dir(x+2*dir)
 * scaled by coeff, and writes it to outField (presumably the long /
 * Naik-term link of the HISQ/asqtad action -- TODO confirm).  One
 * kernel instantiation per parity (odd_bit template parameter).
 * NOTE(review): the doc-page extraction dropped original lines 1472
 * (trailing parameter; per the cross references presumably
 * "llfat_kernel_param_t kparam)") and 1500 (presumably
 * DECLARE_VAR_SIGN).  Restore from upstream QUDA before compiling. */
template<int odd_bit>
__global__ void LLFAT_KERNEL(computeLongLinkParity,RECONSTRUCT)
  (FloatN* const outField,
   const FloatN* const sitelink_even, const FloatN* const sitelink_odd,
   Float coeff,
{
  int idx = blockIdx.x*blockDim.x + threadIdx.x;
  int mem_idx = idx;
  if(mem_idx >= kparam.threads) return;

  /* Recover local 4-d coordinates from the checkerboard index. */
  int z1 = mem_idx/D1h;
  short x1h = mem_idx - z1*D1h;
  int z2 = z1/D2;
  short x2 = z1 - z2*D2;
  short x4 = z2/D3;
  short x3 = z2 - x4*D3;

  short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
  short x1 = 2*x1h + x1odd;

#ifdef MULTI_GPU
  /* Multi-GPU: shift into the 2-deep ghost-padded (extended) volume. */
  x1 += 2;
  x2 += 2;
  x3 += 2;
  x4 += 2;
  int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1;
#else
  int X = x4*X3X2X1 + x3*X2X1 + x2*X1 + x1;
#endif
  mem_idx = X/2;

  int new_mem_idx;
  DECLARE_NEW_X;

  FloatN F0, F1, F2, F3, F4, F5, F6, F7, F8;

  for(int dir=0; dir<4; ++dir){
    /* A = U_dir(x) */
    LOAD_EVEN_SITE_MATRIX(dir, mem_idx, A);
#ifdef MULTI_GPU
    COMPUTE_RECONSTRUCT_SIGN(sign, dir, x1-2, x2-2, x3-2, x4-2);
#else
    COMPUTE_RECONSTRUCT_SIGN(sign, dir, x1, x2, x3, x4);
#endif
    RECONSTRUCT_SITE_LINK(sign, a);

    /* B = U_dir(x + dir), on the opposite parity */
#ifdef MULTI_GPU
    LLFAT_COMPUTE_NEW_IDX_PLUS_EX(dir, 1, X);
    LOAD_ODD_SITE_MATRIX(dir, new_mem_idx, B);
    COMPUTE_RECONSTRUCT_SIGN(sign, dir, new_x1-2, new_x2-2, new_x3-2, new_x4-2);
#else
    LLFAT_COMPUTE_NEW_IDX_PLUS(dir, 1, X);
    LOAD_ODD_SITE_MATRIX(dir, new_mem_idx, B);
    COMPUTE_RECONSTRUCT_SIGN(sign, dir, new_x1, new_x2, new_x3, new_x4);
#endif
    RECONSTRUCT_SITE_LINK(sign, b);

    /* C = U_dir(x + 2*dir), back on this parity */
#ifdef MULTI_GPU
    LLFAT_COMPUTE_NEW_IDX_PLUS_EX(dir, 2, X);
    LOAD_EVEN_SITE_MATRIX(dir, new_mem_idx, C);
    COMPUTE_RECONSTRUCT_SIGN(sign, dir, new_x1-2, new_x2-2, new_x3-2, new_x4-2);
#else
    LLFAT_COMPUTE_NEW_IDX_PLUS(dir, 2, X);
    LOAD_EVEN_SITE_MATRIX(dir, new_mem_idx, C);
    COMPUTE_RECONSTRUCT_SIGN(sign, dir, new_x1, new_x2, new_x3, new_x4);
#endif
    RECONSTRUCT_SITE_LINK(sign, c);

    /* f = coeff * A * B * C (computed as ((coeff*A)*B)*C) */
    SCALAR_MULT_SU3_MATRIX(coeff, a, f);
    MULT_SU3_NN(f,b,a);
    MULT_SU3_NN(a,c,f);


    WRITE_LONG_MATRIX(outField, F, dir, idx, fl.fat_ga_stride);
  }
  return;
}
1545 
1546 #undef D1
1547 #undef D2
1548 #undef D3
1549 #undef D4
1550 #undef D1h
1551 
1552 #undef DECLARE_VAR_SIGN
1553 #undef DECLARE_NEW_X
1554 #undef DECLARE_X_ARRAY
1555 
1556 #undef a00_re
1557 #undef a00_im
1558 #undef a01_re
1559 #undef a01_im
1560 #undef a02_re
1561 #undef a02_im
1562 #undef a10_re
1563 #undef a10_im
1564 #undef a11_re
1565 #undef a11_im
1566 #undef a12_re
1567 #undef a12_im
1568 #undef a20_re
1569 #undef a20_im
1570 #undef a21_re
1571 #undef a21_im
1572 #undef a22_re
1573 #undef a22_im
1574 
1575 #undef b00_re
1576 #undef b00_im
1577 #undef b01_re
1578 #undef b01_im
1579 #undef b02_re
1580 #undef b02_im
1581 #undef b10_re
1582 #undef b10_im
1583 #undef b11_re
1584 #undef b11_im
1585 #undef b12_re
1586 #undef b12_im
1587 #undef b20_re
1588 #undef b20_im
1589 #undef b21_re
1590 #undef b21_im
1591 #undef b22_re
1592 #undef b22_im
1593 
1594 #undef bb00_re
1595 #undef bb00_im
1596 #undef bb01_re
1597 #undef bb01_im
1598 #undef bb02_re
1599 #undef bb02_im
1600 #undef bb10_re
1601 #undef bb10_im
1602 #undef bb11_re
1603 #undef bb11_im
1604 #undef bb12_re
1605 #undef bb12_im
1606 #undef bb20_re
1607 #undef bb20_im
1608 #undef bb21_re
1609 #undef bb21_im
1610 #undef bb22_re
1611 #undef bb22_im
1612 
1613 #undef c00_re
1614 #undef c00_im
1615 #undef c01_re
1616 #undef c01_im
1617 #undef c02_re
1618 #undef c02_im
1619 #undef c10_re
1620 #undef c10_im
1621 #undef c11_re
1622 #undef c11_im
1623 #undef c12_re
1624 #undef c12_im
1625 #undef c20_re
1626 #undef c20_im
1627 #undef c21_re
1628 #undef c21_im
1629 #undef c22_re
1630 #undef c22_im
1631 
1632 #undef f00_re
1633 #undef f00_im
1634 #undef f01_re
1635 #undef f01_im
1636 #undef f02_re
1637 #undef f02_im
1638 #undef f10_re
1639 #undef f10_im
1640 #undef f11_re
1641 #undef f11_im
1642 #undef f12_re
1643 #undef f12_im
1644 #undef f20_re
1645 #undef f20_im
1646 #undef f21_re
1647 #undef f21_im
1648 #undef f22_re
1649 #undef f22_im
1650 
/* Clean up the aT (conjugate-transpose of a) element aliases.
 * The original list repeated #undef aT00_re / aT00_im / aT01_re twice;
 * #undef of an already-undefined name is harmless, but the duplicates
 * were clearly an editing slip and are removed here. */
#undef aT00_re
#undef aT00_im
#undef aT01_re
#undef aT01_im
#undef aT02_re
#undef aT02_im
#undef aT10_re
#undef aT10_im
#undef aT11_re
#undef aT11_im
#undef aT12_re
#undef aT12_im
#undef aT20_re
#undef aT20_im
#undef aT21_re
#undef aT21_im
#undef aT22_re
#undef aT22_im
1672 
1673 #undef bT00_re
1674 #undef bT00_im
1675 #undef bT01_re
1676 #undef bT01_im
1677 #undef bT02_re
1678 #undef bT02_im
1679 #undef bT10_re
1680 #undef bT10_im
1681 #undef bT11_re
1682 #undef bT11_im
1683 #undef bT12_re
1684 #undef bT12_im
1685 #undef bT20_re
1686 #undef bT20_im
1687 #undef bT21_re
1688 #undef bT21_im
1689 #undef bT22_re
1690 #undef bT22_im
1691 
1692 #undef cT00_re
1693 #undef cT00_im
1694 #undef cT01_re
1695 #undef cT01_im
1696 #undef cT02_re
1697 #undef cT02_im
1698 #undef cT10_re
1699 #undef cT10_im
1700 #undef cT11_re
1701 #undef cT11_im
1702 #undef cT12_re
1703 #undef cT12_im
1704 #undef cT20_re
1705 #undef cT20_im
1706 #undef cT21_re
1707 #undef cT21_im
1708 #undef cT22_re
1709 #undef cT22_im
1710 
1711 
1712 #undef tempa00_re
1713 #undef tempa00_im
1714 #undef tempa01_re
1715 #undef tempa01_im
1716 #undef tempa02_re
1717 #undef tempa02_im
1718 #undef tempa10_re
1719 #undef tempa10_im
1720 #undef tempa11_re
1721 #undef tempa11_im
1722 #undef tempa12_re
1723 #undef tempa12_im
1724 #undef tempa20_re
1725 #undef tempa20_im
1726 #undef tempa21_re
1727 #undef tempa21_im
1728 #undef tempa22_re
1729 #undef tempa22_im
1730 
1731 #undef tempb00_re
1732 #undef tempb00_im
1733 #undef tempb01_re
1734 #undef tempb01_im
1735 #undef tempb02_re
1736 #undef tempb02_im
1737 #undef tempb10_re
1738 #undef tempb10_im
1739 #undef tempb11_re
1740 #undef tempb11_im
1741 #undef tempb12_re
1742 #undef tempb12_im
1743 #undef tempb20_re
1744 #undef tempb20_im
1745 #undef tempb21_re
1746 #undef tempb21_im
1747 #undef tempb22_re
1748 #undef tempb22_im
1749 
1750 #undef fat00_re
1751 #undef fat00_im
1752 #undef fat01_re
1753 #undef fat01_im
1754 #undef fat02_re
1755 #undef fat02_im
1756 #undef fat10_re
1757 #undef fat10_im
1758 #undef fat11_re
1759 #undef fat11_im
1760 #undef fat12_re
1761 #undef fat12_im
1762 #undef fat20_re
1763 #undef fat20_im
1764 #undef fat21_re
1765 #undef fat21_im
1766 #undef fat22_re
1767 #undef fat22_im
1768 
1769 #undef WRITE_LONG_MATRIX
1770 
__global__ void FloatM * staple_odd
Definition: llfat_core.h:800
#define LLFAT_EXTERIOR_KERNEL_BACK_X
Definition: llfat_quda.h:9
__global__ void FloatM const FloatN const FloatN FloatM * fatlink_even
Definition: llfat_core.h:800
#define D3
Definition: llfat_core.h:14
__global__ void const FloatN *const const FloatN *const Float coeff
Definition: llfat_core.h:1470
__constant__ int Vh
int X
Definition: llfat_core.h:823
__global__ void const FloatN FloatM FloatM Float coeff0
Definition: llfat_core.h:1096
__global__ void FloatM const FloatN const FloatN * sitelink_odd
Definition: llfat_core.h:800
__constant__ int X1h
FloatM TEMPB6
Definition: llfat_core.h:952
__global__ void const FloatN FloatM FloatM Float Float coeff5
Definition: llfat_core.h:1096
short x1
Definition: llfat_core.h:822
__constant__ int X2
FloatN F0
Definition: llfat_core.h:1503
struct quda::llfat_kernel_param_s llfat_kernel_param_t
FloatM STAPLE1
Definition: llfat_core.h:809
#define LLFAT_EXTERIOR_KERNEL_FWD_X
Definition: llfat_quda.h:8
__global__ void FloatM const FloatN const FloatN FloatM FloatM * fatlink_odd
Definition: llfat_core.h:800
MULT_SU3_NN(a, b, tempa)
__global__ void LLFAT_KERNEL(do_siteComputeGenStapleParity, RECONSTRUCT)(FloatM *staple_even
__constant__ int X1
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
FloatN F8
Definition: llfat_core.h:1503
FloatM STAPLE2
Definition: llfat_core.h:809
LOAD_ODD_SITE_MATRIX(mu, new_mem_idx, B)
#define DECLARE_NEW_X
Definition: llfat_core.h:28
#define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx)
Definition: llfat_core.h:672
__global__ void const FloatN FloatM FloatM Float Float int threads
Definition: llfat_core.h:1099
#define D2
Definition: llfat_core.h:13
#define LLFAT_EXTERIOR_KERNEL_FWD_Z
Definition: llfat_quda.h:12
MULT_SU3_NA(tempa, c, staple)
__global__ void FloatM const FloatN const FloatN FloatM FloatM const FloatM * mulink_even
Definition: llfat_core.h:943
int sid
Definition: llfat_core.h:1102
__constant__ int X3X2X1
__constant__ int X3X2
__global__ void FloatM const FloatN const FloatN FloatM FloatM Float mycoeff
Definition: llfat_core.h:800
#define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1, i2, i3, i4)
Definition: llfat_core.h:484
#define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2)
Definition: llfat_core.h:691
__constant__ int E3E2E1
#define LLFAT_EXTERIOR_KERNEL_BACK_T
Definition: llfat_quda.h:15
FloatM TEMPB8
Definition: llfat_core.h:952
FloatM STAPLE8
Definition: llfat_core.h:809
__constant__ int E2E1
#define LLFAT_COMPUTE_NEW_IDX_MINUS_EX(mydir, idx)
Definition: llfat_core.h:744
FloatM STAPLE0
Definition: llfat_core.h:809
FloatN F7
Definition: llfat_core.h:1503
RECONSTRUCT_SITE_LINK(sign, a)
#define LLFAT_EXTERIOR_KERNEL_FWD_Y
Definition: llfat_quda.h:10
FloatM TEMPB7
Definition: llfat_core.h:952
__shared__ spinorFloat sd_data[]
FloatM TEMPA6
Definition: llfat_core.h:808
#define LLFAT_INTERIOR_KERNEL
Definition: llfat_quda.h:7
FloatM TEMPB3
Definition: llfat_core.h:952
#define DECLARE_X_ARRAY
Definition: llfat_core.h:30
#define LLFAT_COMPUTE_NEW_IDX_PLUS_EX(mydir, n, idx)
Definition: llfat_core.h:726
short x4
Definition: llfat_core.h:818
my_sitelink
Definition: llfat_core.h:1112
__global__ void LLFAT_KERNEL_EX(do_siteComputeGenStapleParity, RECONSTRUCT)(FloatM *staple_even
LOAD_EVEN_FAT_MATRIX(mu, mem_idx)
FloatingPoint< float > Float
Definition: gtest.h:7350
FloatN F3
Definition: llfat_core.h:1503
#define LLFAT_EXTERIOR_KERNEL_BACK_Y
Definition: llfat_quda.h:11
int odd_bit
Definition: llfat_core.h:1109
int z1
Definition: llfat_core.h:814
FloatM TEMPB0
Definition: llfat_core.h:952
int new_mem_idx
Definition: llfat_core.h:834
FloatM TEMPB4
Definition: llfat_core.h:952
#define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, n, idx)
Definition: llfat_core.h:653
__constant__ int X2m1
FloatM TEMPA7
Definition: llfat_core.h:808
FloatM TEMPA8
Definition: llfat_core.h:808
int mem_idx
Definition: llfat_core.h:812
FloatM TEMPB5
Definition: llfat_core.h:952
int x[4]
LLFAT_ADD_SU3_MATRIX(b, staple, staple)
short x1h
Definition: llfat_core.h:815
FloatM STAPLE6
Definition: llfat_core.h:809
#define LLFAT_EXTERIOR_KERNEL_BACK_Z
Definition: llfat_quda.h:13
MULT_SU3_AN(a, b, tempa)
__constant__ fat_force_const_t fl
#define dimcomm
Definition: llfat_core.h:9
#define DECLARE_VAR_SIGN
Definition: llfat_core.h:27
#define WRITE_LONG_MATRIX
Definition: llfat_core.h:191
#define SCALAR_MULT_ADD_SU3_MATRIX(ma, mb, s, mc)
Definition: force_common.h:459
short x2
Definition: llfat_core.h:817
FloatM STAPLE7
Definition: llfat_core.h:809
#define LLFAT_EXTERIOR_KERNEL_FWD_T
Definition: llfat_quda.h:14
int Z[4]
Definition: test_util.cpp:28
__constant__ int X1m1
__global__ void FloatM const FloatN const FloatN FloatM FloatM Float llfat_kernel_param_t kparam
Definition: llfat_core.h:804
__constant__ int X3
int z2
Definition: llfat_core.h:816
FloatM TEMPB1
Definition: llfat_core.h:952
FloatM TEMPA5
Definition: llfat_core.h:808
FloatM STAPLE4
Definition: llfat_core.h:809
short x1odd
Definition: llfat_core.h:821
WRITE_STAPLE_MATRIX(staple_even, mem_idx)
FloatN F1
Definition: llfat_core.h:1503
FloatM * my_fatlink
Definition: llfat_core.h:1101
LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB)
FloatM STAPLE5
Definition: llfat_core.h:809
__constant__ int X4m1
#define D1h
Definition: llfat_core.h:16
__global__ void FloatM const FloatN * sitelink_even
Definition: llfat_core.h:800
__constant__ int E1
FloatN F5
Definition: llfat_core.h:1503
#define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_EX(mydir1, mydir2)
Definition: llfat_core.h:763
int idx
Definition: llfat_core.h:1418
__global__ void FloatM const FloatN const FloatN FloatM FloatM const FloatM const FloatM * mulink_odd
Definition: llfat_core.h:943
FloatN F4
Definition: llfat_core.h:1503
FloatM STAPLE3
Definition: llfat_core.h:809
#define NUM_FLOATS
Definition: llfat_core.h:332
__constant__ int X3X1
LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C)
#define BLOCK_DIM
FloatM TEMPB2
Definition: llfat_core.h:952
__constant__ int X4
__constant__ int X3m1
FloatN F2
Definition: llfat_core.h:1503
short x3
Definition: llfat_core.h:819
FloatN F6
Definition: llfat_core.h:1503
__constant__ int X2X1