2 #define xcomm kparam.ghostDim[0]
3 #define ycomm kparam.ghostDim[1]
4 #define zcomm kparam.ghostDim[2]
5 #define tcomm kparam.ghostDim[3]
8 #define linka00_re LINKA0.x
9 #define linka00_im LINKA0.y
10 #define linka01_re LINKA0.z
11 #define linka01_im LINKA0.w
12 #define linka02_re LINKA1.x
13 #define linka02_im LINKA1.y
14 #define linka10_re LINKA1.z
15 #define linka10_im LINKA1.w
16 #define linka11_re LINKA2.x
17 #define linka11_im LINKA2.y
18 #define linka12_re LINKA2.z
19 #define linka12_im LINKA2.w
20 #define linka20_re LINKA3.x
21 #define linka20_im LINKA3.y
22 #define linka21_re LINKA3.z
23 #define linka21_im LINKA3.w
24 #define linka22_re LINKA4.x
25 #define linka22_im LINKA4.y
28 #define linkb00_re LINKB0.x
29 #define linkb00_im LINKB0.y
30 #define linkb01_re LINKB0.z
31 #define linkb01_im LINKB0.w
32 #define linkb02_re LINKB1.x
33 #define linkb02_im LINKB1.y
34 #define linkb10_re LINKB1.z
35 #define linkb10_im LINKB1.w
36 #define linkb11_re LINKB2.x
37 #define linkb11_im LINKB2.y
38 #define linkb12_re LINKB2.z
39 #define linkb12_im LINKB2.w
40 #define linkb20_re LINKB3.x
41 #define linkb20_im LINKB3.y
42 #define linkb21_re LINKB3.z
43 #define linkb21_im LINKB3.w
44 #define linkb22_re LINKB4.x
45 #define linkb22_im LINKB4.y
48 #define linka00_re LINKA0.x
49 #define linka00_im LINKA0.y
50 #define linka01_re LINKA1.x
51 #define linka01_im LINKA1.y
52 #define linka02_re LINKA2.x
53 #define linka02_im LINKA2.y
54 #define linka10_re LINKA3.x
55 #define linka10_im LINKA3.y
56 #define linka11_re LINKA4.x
57 #define linka11_im LINKA4.y
58 #define linka12_re LINKA5.x
59 #define linka12_im LINKA5.y
60 #define linka20_re LINKA6.x
61 #define linka20_im LINKA6.y
62 #define linka21_re LINKA7.x
63 #define linka21_im LINKA7.y
64 #define linka22_re LINKA8.x
65 #define linka22_im LINKA8.y
67 #define linkb00_re LINKB0.x
68 #define linkb00_im LINKB0.y
69 #define linkb01_re LINKB1.x
70 #define linkb01_im LINKB1.y
71 #define linkb02_re LINKB2.x
72 #define linkb02_im LINKB2.y
73 #define linkb10_re LINKB3.x
74 #define linkb10_im LINKB3.y
75 #define linkb11_re LINKB4.x
76 #define linkb11_im LINKB4.y
77 #define linkb12_re LINKB5.x
78 #define linkb12_im LINKB5.y
79 #define linkb20_re LINKB6.x
80 #define linkb20_im LINKB6.y
81 #define linkb21_re LINKB7.x
82 #define linkb21_im LINKB7.y
83 #define linkb22_re LINKB8.x
84 #define linkb22_im LINKB8.y
91 #define COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx) do { \
94 new_mem_idx = ((!xcomm) && (new_x1 == (X1+1)))?(idx - X1m1): idx+1; \
95 new_x1 = ((!xcomm)&& (new_x1 == (X1+1)))? (new_x1 - X1m1):(new_x1+1); \
98 new_mem_idx = ((!ycomm) && (new_x2 == (X2+1)))?(idx - X2m1*E1): idx+E1; \
99 new_x2 = ((!ycomm)&& (new_x2 == (X2+1)))? (new_x2 - X2m1):(new_x2+1); \
102 new_mem_idx = ((!zcomm) && (new_x3 == (X3+1)))?(idx - X3m1*E2E1): idx+E2E1; \
103 new_x3 = ((!zcomm)&& (new_x3 == (X3+1)))? (new_x3 - X3m1):(new_x3+1); \
106 new_mem_idx = ((!tcomm) && (new_x4 == (X4+1)))?(idx - X4m1*E3E2E1): idx+E3E2E1; \
107 new_x4 = ((!tcomm)&& (new_x4 == (X4+1)))? (new_x4 - X4m1):(new_x4+1); \
112 #define COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx) do { \
115 new_mem_idx = ((!xcomm) && new_x1 == 2)?(idx+X1m1):(idx-1); \
116 new_x1 = ((!xcomm) && new_x1 == 2)? (new_x1+X1m1): (new_x1-1); \
119 new_mem_idx = ((!ycomm) && new_x2 == 2)?(idx+X2m1*E1):(idx-E1); \
120 new_x2 = ((!ycomm) && new_x2 == 2)? (new_x2+X2m1): (new_x2-1); \
123 new_mem_idx = ((!zcomm) && new_x3 == 2)?(idx+X3m1*E2E1):(idx-E2E1); \
124 new_x3 = ((!zcomm) && new_x3 == 2)? (new_x3+X3m1): (new_x3-1); \
127 new_mem_idx = ((!tcomm) && new_x4 == 2)?(idx+X4m1*E3E2E1):(idx-E3E2E1); \
128 new_x4 = ((!tcomm) && new_x4 == 2)? (new_x4+X4m1): (new_x4-1); \
134 #define COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx) do { \
137 new_mem_idx = ( (new_x1==X1m1)?idx-X1m1:idx+1); \
138 new_x1 = (new_x1==X1m1)?0:new_x1+1; \
141 new_mem_idx = ( (new_x2==X2m1)?idx-X2X1mX1:idx+X1); \
142 new_x2 = (new_x2==X2m1)?0:new_x2+1; \
145 new_mem_idx = ( (new_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1); \
146 new_x3 = (new_x3==X3m1)?0:new_x3+1; \
149 new_mem_idx = ( (new_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \
150 new_x4 = (new_x4==X4m1)?0:new_x4+1; \
155 #define COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx) do { \
158 new_mem_idx = ( (new_x1==0)?idx+X1m1:idx-1); \
159 new_x1 = (new_x1==0)?X1m1:new_x1 - 1; \
162 new_mem_idx = ( (new_x2==0)?idx+X2X1mX1:idx-X1); \
163 new_x2 = (new_x2==0)?X2m1:new_x2 - 1; \
166 new_mem_idx = ( (new_x3==0)?idx+X3X2X1mX2X1:idx-X2X1); \
167 new_x3 = (new_x3==0)?X3m1:new_x3 - 1; \
170 new_mem_idx = ( (new_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \
171 new_x4 = (new_x4==0)?X4m1:new_x4 - 1; \
179 #define MULT_SU3_NN_TEST(ma, mb) do{ \
180 Float fa_re,fa_im, fb_re, fb_im, fc_re, fc_im; \
182 ma##00_re * mb##00_re - ma##00_im * mb##00_im + \
183 ma##01_re * mb##10_re - ma##01_im * mb##10_im + \
184 ma##02_re * mb##20_re - ma##02_im * mb##20_im; \
186 ma##00_re * mb##00_im + ma##00_im * mb##00_re + \
187 ma##01_re * mb##10_im + ma##01_im * mb##10_re + \
188 ma##02_re * mb##20_im + ma##02_im * mb##20_re; \
190 ma##00_re * mb##01_re - ma##00_im * mb##01_im + \
191 ma##01_re * mb##11_re - ma##01_im * mb##11_im + \
192 ma##02_re * mb##21_re - ma##02_im * mb##21_im; \
194 ma##00_re * mb##01_im + ma##00_im * mb##01_re + \
195 ma##01_re * mb##11_im + ma##01_im * mb##11_re + \
196 ma##02_re * mb##21_im + ma##02_im * mb##21_re; \
198 ma##00_re * mb##02_re - ma##00_im * mb##02_im + \
199 ma##01_re * mb##12_re - ma##01_im * mb##12_im + \
200 ma##02_re * mb##22_re - ma##02_im * mb##22_im; \
202 ma##00_re * mb##02_im + ma##00_im * mb##02_re + \
203 ma##01_re * mb##12_im + ma##01_im * mb##12_re + \
204 ma##02_re * mb##22_im + ma##02_im * mb##22_re; \
212 ma##10_re * mb##00_re - ma##10_im * mb##00_im + \
213 ma##11_re * mb##10_re - ma##11_im * mb##10_im + \
214 ma##12_re * mb##20_re - ma##12_im * mb##20_im; \
216 ma##10_re * mb##00_im + ma##10_im * mb##00_re + \
217 ma##11_re * mb##10_im + ma##11_im * mb##10_re + \
218 ma##12_re * mb##20_im + ma##12_im * mb##20_re; \
220 ma##10_re * mb##01_re - ma##10_im * mb##01_im + \
221 ma##11_re * mb##11_re - ma##11_im * mb##11_im + \
222 ma##12_re * mb##21_re - ma##12_im * mb##21_im; \
224 ma##10_re * mb##01_im + ma##10_im * mb##01_re + \
225 ma##11_re * mb##11_im + ma##11_im * mb##11_re + \
226 ma##12_re * mb##21_im + ma##12_im * mb##21_re; \
228 ma##10_re * mb##02_re - ma##10_im * mb##02_im + \
229 ma##11_re * mb##12_re - ma##11_im * mb##12_im + \
230 ma##12_re * mb##22_re - ma##12_im * mb##22_im; \
232 ma##10_re * mb##02_im + ma##10_im * mb##02_re + \
233 ma##11_re * mb##12_im + ma##11_im * mb##12_re + \
234 ma##12_re * mb##22_im + ma##12_im * mb##22_re; \
242 ma##20_re * mb##00_re - ma##20_im * mb##00_im + \
243 ma##21_re * mb##10_re - ma##21_im * mb##10_im + \
244 ma##22_re * mb##20_re - ma##22_im * mb##20_im; \
246 ma##20_re * mb##00_im + ma##20_im * mb##00_re + \
247 ma##21_re * mb##10_im + ma##21_im * mb##10_re + \
248 ma##22_re * mb##20_im + ma##22_im * mb##20_re; \
250 ma##20_re * mb##01_re - ma##20_im * mb##01_im + \
251 ma##21_re * mb##11_re - ma##21_im * mb##11_im + \
252 ma##22_re * mb##21_re - ma##22_im * mb##21_im; \
254 ma##20_re * mb##01_im + ma##20_im * mb##01_re + \
255 ma##21_re * mb##11_im + ma##21_im * mb##11_re + \
256 ma##22_re * mb##21_im + ma##22_im * mb##21_re; \
258 ma##20_re * mb##02_re - ma##20_im * mb##02_im + \
259 ma##21_re * mb##12_re - ma##21_im * mb##12_im + \
260 ma##22_re * mb##22_re - ma##22_im * mb##22_im; \
262 ma##20_re * mb##02_im + ma##20_im * mb##02_re + \
263 ma##21_re * mb##12_im + ma##21_im * mb##12_re + \
264 ma##22_re * mb##22_im + ma##22_im * mb##22_re; \
274 #define MULT_SU3_NA_TEST(ma, mb) do{ \
275 Float fa_re, fa_im, fb_re, fb_im, fc_re, fc_im; \
277 ma##00_re * mb##T00_re - ma##00_im * mb##T00_im + \
278 ma##01_re * mb##T10_re - ma##01_im * mb##T10_im + \
279 ma##02_re * mb##T20_re - ma##02_im * mb##T20_im; \
281 ma##00_re * mb##T00_im + ma##00_im * mb##T00_re + \
282 ma##01_re * mb##T10_im + ma##01_im * mb##T10_re + \
283 ma##02_re * mb##T20_im + ma##02_im * mb##T20_re; \
285 ma##00_re * mb##T01_re - ma##00_im * mb##T01_im + \
286 ma##01_re * mb##T11_re - ma##01_im * mb##T11_im + \
287 ma##02_re * mb##T21_re - ma##02_im * mb##T21_im; \
289 ma##00_re * mb##T01_im + ma##00_im * mb##T01_re + \
290 ma##01_re * mb##T11_im + ma##01_im * mb##T11_re + \
291 ma##02_re * mb##T21_im + ma##02_im * mb##T21_re; \
293 ma##00_re * mb##T02_re - ma##00_im * mb##T02_im + \
294 ma##01_re * mb##T12_re - ma##01_im * mb##T12_im + \
295 ma##02_re * mb##T22_re - ma##02_im * mb##T22_im; \
297 ma##00_re * mb##T02_im + ma##00_im * mb##T02_re + \
298 ma##01_re * mb##T12_im + ma##01_im * mb##T12_re + \
299 ma##02_re * mb##T22_im + ma##02_im * mb##T22_re; \
307 ma##10_re * mb##T00_re - ma##10_im * mb##T00_im + \
308 ma##11_re * mb##T10_re - ma##11_im * mb##T10_im + \
309 ma##12_re * mb##T20_re - ma##12_im * mb##T20_im; \
311 ma##10_re * mb##T00_im + ma##10_im * mb##T00_re + \
312 ma##11_re * mb##T10_im + ma##11_im * mb##T10_re + \
313 ma##12_re * mb##T20_im + ma##12_im * mb##T20_re; \
315 ma##10_re * mb##T01_re - ma##10_im * mb##T01_im + \
316 ma##11_re * mb##T11_re - ma##11_im * mb##T11_im + \
317 ma##12_re * mb##T21_re - ma##12_im * mb##T21_im; \
319 ma##10_re * mb##T01_im + ma##10_im * mb##T01_re + \
320 ma##11_re * mb##T11_im + ma##11_im * mb##T11_re + \
321 ma##12_re * mb##T21_im + ma##12_im * mb##T21_re; \
323 ma##10_re * mb##T02_re - ma##10_im * mb##T02_im + \
324 ma##11_re * mb##T12_re - ma##11_im * mb##T12_im + \
325 ma##12_re * mb##T22_re - ma##12_im * mb##T22_im; \
327 ma##10_re * mb##T02_im + ma##10_im * mb##T02_re + \
328 ma##11_re * mb##T12_im + ma##11_im * mb##T12_re + \
329 ma##12_re * mb##T22_im + ma##12_im * mb##T22_re; \
337 ma##20_re * mb##T00_re - ma##20_im * mb##T00_im + \
338 ma##21_re * mb##T10_re - ma##21_im * mb##T10_im + \
339 ma##22_re * mb##T20_re - ma##22_im * mb##T20_im; \
341 ma##20_re * mb##T00_im + ma##20_im * mb##T00_re + \
342 ma##21_re * mb##T10_im + ma##21_im * mb##T10_re + \
343 ma##22_re * mb##T20_im + ma##22_im * mb##T20_re; \
345 ma##20_re * mb##T01_re - ma##20_im * mb##T01_im + \
346 ma##21_re * mb##T11_re - ma##21_im * mb##T11_im + \
347 ma##22_re * mb##T21_re - ma##22_im * mb##T21_im; \
349 ma##20_re * mb##T01_im + ma##20_im * mb##T01_re + \
350 ma##21_re * mb##T11_im + ma##21_im * mb##T11_re + \
351 ma##22_re * mb##T21_im + ma##22_im * mb##T21_re; \
353 ma##20_re * mb##T02_re - ma##20_im * mb##T02_im + \
354 ma##21_re * mb##T12_re - ma##21_im * mb##T12_im + \
355 ma##22_re * mb##T22_re - ma##22_im * mb##T22_im; \
357 ma##20_re * mb##T02_im + ma##20_im * mb##T02_re + \
358 ma##21_re * mb##T12_im + ma##21_im * mb##T12_re + \
359 ma##22_re * mb##T22_im + ma##22_im * mb##T22_re; \
370 #define MULT_SU3_AN_TEST(ma, mb) do{ \
371 Float fa_re, fa_im, fb_re, fb_im, fc_re, fc_im; \
373 ma##T00_re * mb##00_re - ma##T00_im * mb##00_im + \
374 ma##T01_re * mb##10_re - ma##T01_im * mb##10_im + \
375 ma##T02_re * mb##20_re - ma##T02_im * mb##20_im; \
377 ma##T00_re * mb##00_im + ma##T00_im * mb##00_re + \
378 ma##T01_re * mb##10_im + ma##T01_im * mb##10_re + \
379 ma##T02_re * mb##20_im + ma##T02_im * mb##20_re; \
381 ma##T10_re * mb##00_re - ma##T10_im * mb##00_im + \
382 ma##T11_re * mb##10_re - ma##T11_im * mb##10_im + \
383 ma##T12_re * mb##20_re - ma##T12_im * mb##20_im; \
385 ma##T10_re * mb##00_im + ma##T10_im * mb##00_re + \
386 ma##T11_re * mb##10_im + ma##T11_im * mb##10_re + \
387 ma##T12_re * mb##20_im + ma##T12_im * mb##20_re; \
389 ma##T20_re * mb##00_re - ma##T20_im * mb##00_im + \
390 ma##T21_re * mb##10_re - ma##T21_im * mb##10_im + \
391 ma##T22_re * mb##20_re - ma##T22_im * mb##20_im; \
393 ma##T20_re * mb##00_im + ma##T20_im * mb##00_re + \
394 ma##T21_re * mb##10_im + ma##T21_im * mb##10_re + \
395 ma##T22_re * mb##20_im + ma##T22_im * mb##20_re; \
403 ma##T00_re * mb##01_re - ma##T00_im * mb##01_im + \
404 ma##T01_re * mb##11_re - ma##T01_im * mb##11_im + \
405 ma##T02_re * mb##21_re - ma##T02_im * mb##21_im; \
407 ma##T00_re * mb##01_im + ma##T00_im * mb##01_re + \
408 ma##T01_re * mb##11_im + ma##T01_im * mb##11_re + \
409 ma##T02_re * mb##21_im + ma##T02_im * mb##21_re; \
411 ma##T10_re * mb##01_re - ma##T10_im * mb##01_im + \
412 ma##T11_re * mb##11_re - ma##T11_im * mb##11_im + \
413 ma##T12_re * mb##21_re - ma##T12_im * mb##21_im; \
415 ma##T10_re * mb##01_im + ma##T10_im * mb##01_re + \
416 ma##T11_re * mb##11_im + ma##T11_im * mb##11_re + \
417 ma##T12_re * mb##21_im + ma##T12_im * mb##21_re; \
419 ma##T20_re * mb##01_re - ma##T20_im * mb##01_im + \
420 ma##T21_re * mb##11_re - ma##T21_im * mb##11_im + \
421 ma##T22_re * mb##21_re - ma##T22_im * mb##21_im; \
423 ma##T20_re * mb##01_im + ma##T20_im * mb##01_re + \
424 ma##T21_re * mb##11_im + ma##T21_im * mb##11_re + \
425 ma##T22_re * mb##21_im + ma##T22_im * mb##21_re; \
433 ma##T00_re * mb##02_re - ma##T00_im * mb##02_im + \
434 ma##T01_re * mb##12_re - ma##T01_im * mb##12_im + \
435 ma##T02_re * mb##22_re - ma##T02_im * mb##22_im; \
437 ma##T00_re * mb##02_im + ma##T00_im * mb##02_re + \
438 ma##T01_re * mb##12_im + ma##T01_im * mb##12_re + \
439 ma##T02_re * mb##22_im + ma##T02_im * mb##22_re; \
441 ma##T10_re * mb##02_re - ma##T10_im * mb##02_im + \
442 ma##T11_re * mb##12_re - ma##T11_im * mb##12_im + \
443 ma##T12_re * mb##22_re - ma##T12_im * mb##22_im; \
445 ma##T10_re * mb##02_im + ma##T10_im * mb##02_re + \
446 ma##T11_re * mb##12_im + ma##T11_im * mb##12_re + \
447 ma##T12_re * mb##22_im + ma##T12_im * mb##22_re; \
449 ma##T20_re * mb##02_re - ma##T20_im * mb##02_im + \
450 ma##T21_re * mb##12_re - ma##T21_im * mb##12_im + \
451 ma##T22_re * mb##22_re - ma##T22_im * mb##22_im; \
453 ma##T20_re * mb##02_im + ma##T20_im * mb##02_re + \
454 ma##T21_re * mb##12_im + ma##T21_im * mb##12_re + \
455 ma##T22_re * mb##22_im + ma##T22_im * mb##22_re; \
466 #define print_matrix(mul) \
467 printf(" (%f %f) (%f %f) (%f %f)\n", mul##00_re, mul##00_im, mul##01_re, mul##01_im, mul##02_re, mul##02_im); \
468 printf(" (%f %f) (%f %f) (%f %f)\n", mul##10_re, mul##10_im, mul##11_re, mul##11_im, mul##12_re, mul##12_im); \
469 printf(" (%f %f) (%f %f) (%f %f)\n", mul##20_re, mul##20_im, mul##21_re, mul##21_im, mul##22_re, mul##22_im);
474 template<
int oddBit,
typename Float,
typename Float2,
typename FloatN>
477 const int dir,
const double eb3,
479 const int* input_path,
483 int sid = blockIdx.x * blockDim.x + threadIdx.x;
484 if (sid >= kparam.threads)
return;
496 x4 += 2; x3 += 2; x2 += 2; x1 += 2;
499 int X = 2*sid +
x1odd;
502 Float2* mymom=momEven;
510 Float2 AH0, AH1, AH2, AH3, AH4;
517 for(i=0;i < num_paths; i++){
519 if(coeff == 0)
continue;
521 int nbr_oddbit = (
oddBit^1 );
541 nbr_oddbit = nbr_oddbit^1;
545 int nbr_idx = new_mem_idx >>1;
556 nbr_oddbit = nbr_oddbit^1;
561 for(j=1; j < length[i]; j++){
570 nbr_oddbit = nbr_oddbit^1;
574 int nbr_idx = new_mem_idx >>1;
585 nbr_oddbit = nbr_oddbit^1;
617 #undef COMPUTE_NEW_FULL_IDX_PLUS_UPDATE
618 #undef COMPUTE_NEW_FULL_IDX_MINUS_UPDATE
619 #undef MULT_SU3_NN_TEST
620 #undef MULT_SU3_NA_TEST
621 #undef MULT_SU3_AN_TEST
#define UNCOMPRESS_ANTI_HERMITIAN(ah, m)
#define COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx)
__constant__ fat_force_const_t gf
#define COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx)
#define LOAD_EVEN_MATRIX(dir, idx, var)
#define LOAD_ANTI_HERMITIAN(src, dir, idx, var)
#define SCALAR_MULT_SUB_SU3_MATRIX(ma, mb, s, mc)
#define MULT_SU3_NN_TEST(ma, mb)
struct quda::kernel_param_s kernel_param_t
FloatingPoint< float > Float
#define MULT_SU3_NA_TEST(ma, mb)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const linkOdd
__constant__ double coeff
#define WRITE_ANTI_HERMITIAN(mem, dir, idx, var, stride)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const hisq_kernel_param_t kparam
#define SU3_ADJOINT(a, b)
#define SCALAR_MULT_ADD_SU3_MATRIX(ma, mb, s, mc)
#define RECONSTRUCT_MATRIX(sign, var)
#define LOAD_ODD_MATRIX(dir, idx, var)
#define DECLARE_LINK_VARS(var)
#define SET_UNIT_SU3_MATRIX(a)
#define GOES_FORWARDS(dir)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const linkEven
#define COPY_SU3_MATRIX(a, b)
#define SET_SU3_MATRIX(a, value)
#define MAKE_ANTI_HERMITIAN(m, ah)
__global__ void GAUGE_FORCE_KERN_NAME(Float2 *momEven, Float2 *momOdd, const int dir, const double eb3, const FloatN *linkEven, const FloatN *linkOdd, const int *input_path, const int *length, const double *path_coeff, const int num_paths, const kernel_param_t kparam)