QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
gauge_force_core.h
Go to the documentation of this file.
1 
2 #define xcomm kparam.ghostDim[0]
3 #define ycomm kparam.ghostDim[1]
4 #define zcomm kparam.ghostDim[2]
5 #define tcomm kparam.ghostDim[3]
6 
7 #if (N_IN_FLOATN == 4)
8 #define linka00_re LINKA0.x
9 #define linka00_im LINKA0.y
10 #define linka01_re LINKA0.z
11 #define linka01_im LINKA0.w
12 #define linka02_re LINKA1.x
13 #define linka02_im LINKA1.y
14 #define linka10_re LINKA1.z
15 #define linka10_im LINKA1.w
16 #define linka11_re LINKA2.x
17 #define linka11_im LINKA2.y
18 #define linka12_re LINKA2.z
19 #define linka12_im LINKA2.w
20 #define linka20_re LINKA3.x
21 #define linka20_im LINKA3.y
22 #define linka21_re LINKA3.z
23 #define linka21_im LINKA3.w
24 #define linka22_re LINKA4.x
25 #define linka22_im LINKA4.y
26 
27 
28 #define linkb00_re LINKB0.x
29 #define linkb00_im LINKB0.y
30 #define linkb01_re LINKB0.z
31 #define linkb01_im LINKB0.w
32 #define linkb02_re LINKB1.x
33 #define linkb02_im LINKB1.y
34 #define linkb10_re LINKB1.z
35 #define linkb10_im LINKB1.w
36 #define linkb11_re LINKB2.x
37 #define linkb11_im LINKB2.y
38 #define linkb12_re LINKB2.z
39 #define linkb12_im LINKB2.w
40 #define linkb20_re LINKB3.x
41 #define linkb20_im LINKB3.y
42 #define linkb21_re LINKB3.z
43 #define linkb21_im LINKB3.w
44 #define linkb22_re LINKB4.x
45 #define linkb22_im LINKB4.y
46 
47 #else
48 #define linka00_re LINKA0.x
49 #define linka00_im LINKA0.y
50 #define linka01_re LINKA1.x
51 #define linka01_im LINKA1.y
52 #define linka02_re LINKA2.x
53 #define linka02_im LINKA2.y
54 #define linka10_re LINKA3.x
55 #define linka10_im LINKA3.y
56 #define linka11_re LINKA4.x
57 #define linka11_im LINKA4.y
58 #define linka12_re LINKA5.x
59 #define linka12_im LINKA5.y
60 #define linka20_re LINKA6.x
61 #define linka20_im LINKA6.y
62 #define linka21_re LINKA7.x
63 #define linka21_im LINKA7.y
64 #define linka22_re LINKA8.x
65 #define linka22_im LINKA8.y
66 
67 #define linkb00_re LINKB0.x
68 #define linkb00_im LINKB0.y
69 #define linkb01_re LINKB1.x
70 #define linkb01_im LINKB1.y
71 #define linkb02_re LINKB2.x
72 #define linkb02_im LINKB2.y
73 #define linkb10_re LINKB3.x
74 #define linkb10_im LINKB3.y
75 #define linkb11_re LINKB4.x
76 #define linkb11_im LINKB4.y
77 #define linkb12_re LINKB5.x
78 #define linkb12_im LINKB5.y
79 #define linkb20_re LINKB6.x
80 #define linkb20_im LINKB6.y
81 #define linkb21_re LINKB7.x
82 #define linkb21_im LINKB7.y
83 #define linkb22_re LINKB8.x
84 #define linkb22_im LINKB8.y
85 
86 #endif
87 
88 
89 #ifdef MULTI_GPU
90 
91 #define COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx) do { \
92  switch(mydir){ \
93  case 0: \
94  new_mem_idx = ((!xcomm) && (new_x1 == (X1+1)))?(idx - X1m1): idx+1; \
95  new_x1 = ((!xcomm)&& (new_x1 == (X1+1)))? (new_x1 - X1m1):(new_x1+1); \
96  break; \
97  case 1: \
98  new_mem_idx = ((!ycomm) && (new_x2 == (X2+1)))?(idx - X2m1*E1): idx+E1; \
99  new_x2 = ((!ycomm)&& (new_x2 == (X2+1)))? (new_x2 - X2m1):(new_x2+1); \
100  break; \
101  case 2: \
102  new_mem_idx = ((!zcomm) && (new_x3 == (X3+1)))?(idx - X3m1*E2E1): idx+E2E1; \
103  new_x3 = ((!zcomm)&& (new_x3 == (X3+1)))? (new_x3 - X3m1):(new_x3+1); \
104  break; \
105  case 3: \
106  new_mem_idx = ((!tcomm) && (new_x4 == (X4+1)))?(idx - X4m1*E3E2E1): idx+E3E2E1; \
107  new_x4 = ((!tcomm)&& (new_x4 == (X4+1)))? (new_x4 - X4m1):(new_x4+1); \
108  break; \
109  } \
110  }while(0)
111 
112 #define COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx) do { \
113  switch(mydir){ \
114  case 0: \
115  new_mem_idx = ((!xcomm) && new_x1 == 2)?(idx+X1m1):(idx-1); \
116  new_x1 = ((!xcomm) && new_x1 == 2)? (new_x1+X1m1): (new_x1-1); \
117  break; \
118  case 1: \
119  new_mem_idx = ((!ycomm) && new_x2 == 2)?(idx+X2m1*E1):(idx-E1); \
120  new_x2 = ((!ycomm) && new_x2 == 2)? (new_x2+X2m1): (new_x2-1); \
121  break; \
122  case 2: \
123  new_mem_idx = ((!zcomm) && new_x3 == 2)?(idx+X3m1*E2E1):(idx-E2E1); \
124  new_x3 = ((!zcomm) && new_x3 == 2)? (new_x3+X3m1): (new_x3-1); \
125  break; \
126  case 3: \
127  new_mem_idx = ((!tcomm) && new_x4 == 2)?(idx+X4m1*E3E2E1):(idx-E3E2E1); \
128  new_x4 = ((!tcomm) && new_x4 == 2)? (new_x4+X4m1): (new_x4-1); \
129  break; \
130  } \
131  }while(0)
132 
133 #else
134 #define COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx) do { \
135  switch(mydir){ \
136  case 0: \
137  new_mem_idx = ( (new_x1==X1m1)?idx-X1m1:idx+1); \
138  new_x1 = (new_x1==X1m1)?0:new_x1+1; \
139  break; \
140  case 1: \
141  new_mem_idx = ( (new_x2==X2m1)?idx-X2X1mX1:idx+X1); \
142  new_x2 = (new_x2==X2m1)?0:new_x2+1; \
143  break; \
144  case 2: \
145  new_mem_idx = ( (new_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1); \
146  new_x3 = (new_x3==X3m1)?0:new_x3+1; \
147  break; \
148  case 3: \
149  new_mem_idx = ( (new_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \
150  new_x4 = (new_x4==X4m1)?0:new_x4+1; \
151  break; \
152  } \
153  }while(0)
154 
155 #define COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx) do { \
156  switch(mydir){ \
157  case 0: \
158  new_mem_idx = ( (new_x1==0)?idx+X1m1:idx-1); \
159  new_x1 = (new_x1==0)?X1m1:new_x1 - 1; \
160  break; \
161  case 1: \
162  new_mem_idx = ( (new_x2==0)?idx+X2X1mX1:idx-X1); \
163  new_x2 = (new_x2==0)?X2m1:new_x2 - 1; \
164  break; \
165  case 2: \
166  new_mem_idx = ( (new_x3==0)?idx+X3X2X1mX2X1:idx-X2X1); \
167  new_x3 = (new_x3==0)?X3m1:new_x3 - 1; \
168  break; \
169  case 3: \
170  new_mem_idx = ( (new_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \
171  new_x4 = (new_x4==0)?X4m1:new_x4 - 1; \
172  break; \
173  } \
174  }while(0)
175 
176 #endif
177 
178 
179 #define MULT_SU3_NN_TEST(ma, mb) do{ \
180  Float fa_re,fa_im, fb_re, fb_im, fc_re, fc_im; \
181  fa_re = \
182  ma##00_re * mb##00_re - ma##00_im * mb##00_im + \
183  ma##01_re * mb##10_re - ma##01_im * mb##10_im + \
184  ma##02_re * mb##20_re - ma##02_im * mb##20_im; \
185  fa_im = \
186  ma##00_re * mb##00_im + ma##00_im * mb##00_re + \
187  ma##01_re * mb##10_im + ma##01_im * mb##10_re + \
188  ma##02_re * mb##20_im + ma##02_im * mb##20_re; \
189  fb_re = \
190  ma##00_re * mb##01_re - ma##00_im * mb##01_im + \
191  ma##01_re * mb##11_re - ma##01_im * mb##11_im + \
192  ma##02_re * mb##21_re - ma##02_im * mb##21_im; \
193  fb_im = \
194  ma##00_re * mb##01_im + ma##00_im * mb##01_re + \
195  ma##01_re * mb##11_im + ma##01_im * mb##11_re + \
196  ma##02_re * mb##21_im + ma##02_im * mb##21_re; \
197  fc_re = \
198  ma##00_re * mb##02_re - ma##00_im * mb##02_im + \
199  ma##01_re * mb##12_re - ma##01_im * mb##12_im + \
200  ma##02_re * mb##22_re - ma##02_im * mb##22_im; \
201  fc_im = \
202  ma##00_re * mb##02_im + ma##00_im * mb##02_re + \
203  ma##01_re * mb##12_im + ma##01_im * mb##12_re + \
204  ma##02_re * mb##22_im + ma##02_im * mb##22_re; \
205  ma##00_re = fa_re; \
206  ma##00_im = fa_im; \
207  ma##01_re = fb_re; \
208  ma##01_im = fb_im; \
209  ma##02_re = fc_re; \
210  ma##02_im = fc_im; \
211  fa_re = \
212  ma##10_re * mb##00_re - ma##10_im * mb##00_im + \
213  ma##11_re * mb##10_re - ma##11_im * mb##10_im + \
214  ma##12_re * mb##20_re - ma##12_im * mb##20_im; \
215  fa_im = \
216  ma##10_re * mb##00_im + ma##10_im * mb##00_re + \
217  ma##11_re * mb##10_im + ma##11_im * mb##10_re + \
218  ma##12_re * mb##20_im + ma##12_im * mb##20_re; \
219  fb_re = \
220  ma##10_re * mb##01_re - ma##10_im * mb##01_im + \
221  ma##11_re * mb##11_re - ma##11_im * mb##11_im + \
222  ma##12_re * mb##21_re - ma##12_im * mb##21_im; \
223  fb_im = \
224  ma##10_re * mb##01_im + ma##10_im * mb##01_re + \
225  ma##11_re * mb##11_im + ma##11_im * mb##11_re + \
226  ma##12_re * mb##21_im + ma##12_im * mb##21_re; \
227  fc_re = \
228  ma##10_re * mb##02_re - ma##10_im * mb##02_im + \
229  ma##11_re * mb##12_re - ma##11_im * mb##12_im + \
230  ma##12_re * mb##22_re - ma##12_im * mb##22_im; \
231  fc_im = \
232  ma##10_re * mb##02_im + ma##10_im * mb##02_re + \
233  ma##11_re * mb##12_im + ma##11_im * mb##12_re + \
234  ma##12_re * mb##22_im + ma##12_im * mb##22_re; \
235  ma##10_re = fa_re; \
236  ma##10_im = fa_im; \
237  ma##11_re = fb_re; \
238  ma##11_im = fb_im; \
239  ma##12_re = fc_re; \
240  ma##12_im = fc_im; \
241  fa_re = \
242  ma##20_re * mb##00_re - ma##20_im * mb##00_im + \
243  ma##21_re * mb##10_re - ma##21_im * mb##10_im + \
244  ma##22_re * mb##20_re - ma##22_im * mb##20_im; \
245  fa_im = \
246  ma##20_re * mb##00_im + ma##20_im * mb##00_re + \
247  ma##21_re * mb##10_im + ma##21_im * mb##10_re + \
248  ma##22_re * mb##20_im + ma##22_im * mb##20_re; \
249  fb_re = \
250  ma##20_re * mb##01_re - ma##20_im * mb##01_im + \
251  ma##21_re * mb##11_re - ma##21_im * mb##11_im + \
252  ma##22_re * mb##21_re - ma##22_im * mb##21_im; \
253  fb_im = \
254  ma##20_re * mb##01_im + ma##20_im * mb##01_re + \
255  ma##21_re * mb##11_im + ma##21_im * mb##11_re + \
256  ma##22_re * mb##21_im + ma##22_im * mb##21_re; \
257  fc_re = \
258  ma##20_re * mb##02_re - ma##20_im * mb##02_im + \
259  ma##21_re * mb##12_re - ma##21_im * mb##12_im + \
260  ma##22_re * mb##22_re - ma##22_im * mb##22_im; \
261  fc_im = \
262  ma##20_re * mb##02_im + ma##20_im * mb##02_re + \
263  ma##21_re * mb##12_im + ma##21_im * mb##12_re + \
264  ma##22_re * mb##22_im + ma##22_im * mb##22_re; \
265  ma##20_re = fa_re; \
266  ma##20_im = fa_im; \
267  ma##21_re = fb_re; \
268  ma##21_im = fb_im; \
269  ma##22_re = fc_re; \
270  ma##22_im = fc_im; \
271  }while(0)
272 
273 
274 #define MULT_SU3_NA_TEST(ma, mb) do{ \
275  Float fa_re, fa_im, fb_re, fb_im, fc_re, fc_im; \
276  fa_re = \
277  ma##00_re * mb##T00_re - ma##00_im * mb##T00_im + \
278  ma##01_re * mb##T10_re - ma##01_im * mb##T10_im + \
279  ma##02_re * mb##T20_re - ma##02_im * mb##T20_im; \
280  fa_im = \
281  ma##00_re * mb##T00_im + ma##00_im * mb##T00_re + \
282  ma##01_re * mb##T10_im + ma##01_im * mb##T10_re + \
283  ma##02_re * mb##T20_im + ma##02_im * mb##T20_re; \
284  fb_re = \
285  ma##00_re * mb##T01_re - ma##00_im * mb##T01_im + \
286  ma##01_re * mb##T11_re - ma##01_im * mb##T11_im + \
287  ma##02_re * mb##T21_re - ma##02_im * mb##T21_im; \
288  fb_im = \
289  ma##00_re * mb##T01_im + ma##00_im * mb##T01_re + \
290  ma##01_re * mb##T11_im + ma##01_im * mb##T11_re + \
291  ma##02_re * mb##T21_im + ma##02_im * mb##T21_re; \
292  fc_re = \
293  ma##00_re * mb##T02_re - ma##00_im * mb##T02_im + \
294  ma##01_re * mb##T12_re - ma##01_im * mb##T12_im + \
295  ma##02_re * mb##T22_re - ma##02_im * mb##T22_im; \
296  fc_im = \
297  ma##00_re * mb##T02_im + ma##00_im * mb##T02_re + \
298  ma##01_re * mb##T12_im + ma##01_im * mb##T12_re + \
299  ma##02_re * mb##T22_im + ma##02_im * mb##T22_re; \
300  ma##00_re = fa_re; \
301  ma##00_im = fa_im; \
302  ma##01_re = fb_re; \
303  ma##01_im = fb_im; \
304  ma##02_re = fc_re; \
305  ma##02_im = fc_im; \
306  fa_re = \
307  ma##10_re * mb##T00_re - ma##10_im * mb##T00_im + \
308  ma##11_re * mb##T10_re - ma##11_im * mb##T10_im + \
309  ma##12_re * mb##T20_re - ma##12_im * mb##T20_im; \
310  fa_im = \
311  ma##10_re * mb##T00_im + ma##10_im * mb##T00_re + \
312  ma##11_re * mb##T10_im + ma##11_im * mb##T10_re + \
313  ma##12_re * mb##T20_im + ma##12_im * mb##T20_re; \
314  fb_re = \
315  ma##10_re * mb##T01_re - ma##10_im * mb##T01_im + \
316  ma##11_re * mb##T11_re - ma##11_im * mb##T11_im + \
317  ma##12_re * mb##T21_re - ma##12_im * mb##T21_im; \
318  fb_im = \
319  ma##10_re * mb##T01_im + ma##10_im * mb##T01_re + \
320  ma##11_re * mb##T11_im + ma##11_im * mb##T11_re + \
321  ma##12_re * mb##T21_im + ma##12_im * mb##T21_re; \
322  fc_re = \
323  ma##10_re * mb##T02_re - ma##10_im * mb##T02_im + \
324  ma##11_re * mb##T12_re - ma##11_im * mb##T12_im + \
325  ma##12_re * mb##T22_re - ma##12_im * mb##T22_im; \
326  fc_im = \
327  ma##10_re * mb##T02_im + ma##10_im * mb##T02_re + \
328  ma##11_re * mb##T12_im + ma##11_im * mb##T12_re + \
329  ma##12_re * mb##T22_im + ma##12_im * mb##T22_re; \
330  ma##10_re = fa_re; \
331  ma##10_im = fa_im; \
332  ma##11_re = fb_re; \
333  ma##11_im = fb_im; \
334  ma##12_re = fc_re; \
335  ma##12_im = fc_im; \
336  fa_re = \
337  ma##20_re * mb##T00_re - ma##20_im * mb##T00_im + \
338  ma##21_re * mb##T10_re - ma##21_im * mb##T10_im + \
339  ma##22_re * mb##T20_re - ma##22_im * mb##T20_im; \
340  fa_im = \
341  ma##20_re * mb##T00_im + ma##20_im * mb##T00_re + \
342  ma##21_re * mb##T10_im + ma##21_im * mb##T10_re + \
343  ma##22_re * mb##T20_im + ma##22_im * mb##T20_re; \
344  fb_re = \
345  ma##20_re * mb##T01_re - ma##20_im * mb##T01_im + \
346  ma##21_re * mb##T11_re - ma##21_im * mb##T11_im + \
347  ma##22_re * mb##T21_re - ma##22_im * mb##T21_im; \
348  fb_im = \
349  ma##20_re * mb##T01_im + ma##20_im * mb##T01_re + \
350  ma##21_re * mb##T11_im + ma##21_im * mb##T11_re + \
351  ma##22_re * mb##T21_im + ma##22_im * mb##T21_re; \
352  fc_re = \
353  ma##20_re * mb##T02_re - ma##20_im * mb##T02_im + \
354  ma##21_re * mb##T12_re - ma##21_im * mb##T12_im + \
355  ma##22_re * mb##T22_re - ma##22_im * mb##T22_im; \
356  fc_im = \
357  ma##20_re * mb##T02_im + ma##20_im * mb##T02_re + \
358  ma##21_re * mb##T12_im + ma##21_im * mb##T12_re + \
359  ma##22_re * mb##T22_im + ma##22_im * mb##T22_re; \
360  ma##20_re = fa_re; \
361  ma##20_im = fa_im; \
362  ma##21_re = fb_re; \
363  ma##21_im = fb_im; \
364  ma##22_re = fc_re; \
365  ma##22_im = fc_im; \
366  }while(0)
367 
368 
369 
370 #define MULT_SU3_AN_TEST(ma, mb) do{ \
371  Float fa_re, fa_im, fb_re, fb_im, fc_re, fc_im; \
372  fa_re = \
373  ma##T00_re * mb##00_re - ma##T00_im * mb##00_im + \
374  ma##T01_re * mb##10_re - ma##T01_im * mb##10_im + \
375  ma##T02_re * mb##20_re - ma##T02_im * mb##20_im; \
376  fa_im = \
377  ma##T00_re * mb##00_im + ma##T00_im * mb##00_re + \
378  ma##T01_re * mb##10_im + ma##T01_im * mb##10_re + \
379  ma##T02_re * mb##20_im + ma##T02_im * mb##20_re; \
380  fb_re = \
381  ma##T10_re * mb##00_re - ma##T10_im * mb##00_im + \
382  ma##T11_re * mb##10_re - ma##T11_im * mb##10_im + \
383  ma##T12_re * mb##20_re - ma##T12_im * mb##20_im; \
384  fb_im = \
385  ma##T10_re * mb##00_im + ma##T10_im * mb##00_re + \
386  ma##T11_re * mb##10_im + ma##T11_im * mb##10_re + \
387  ma##T12_re * mb##20_im + ma##T12_im * mb##20_re; \
388  fc_re = \
389  ma##T20_re * mb##00_re - ma##T20_im * mb##00_im + \
390  ma##T21_re * mb##10_re - ma##T21_im * mb##10_im + \
391  ma##T22_re * mb##20_re - ma##T22_im * mb##20_im; \
392  fc_im = \
393  ma##T20_re * mb##00_im + ma##T20_im * mb##00_re + \
394  ma##T21_re * mb##10_im + ma##T21_im * mb##10_re + \
395  ma##T22_re * mb##20_im + ma##T22_im * mb##20_re; \
396  mb##00_re = fa_re; \
397  mb##00_im = fa_im; \
398  mb##10_re = fb_re; \
399  mb##10_im = fb_im; \
400  mb##20_re = fc_re; \
401  mb##20_im = fc_im; \
402  fa_re = \
403  ma##T00_re * mb##01_re - ma##T00_im * mb##01_im + \
404  ma##T01_re * mb##11_re - ma##T01_im * mb##11_im + \
405  ma##T02_re * mb##21_re - ma##T02_im * mb##21_im; \
406  fa_im = \
407  ma##T00_re * mb##01_im + ma##T00_im * mb##01_re + \
408  ma##T01_re * mb##11_im + ma##T01_im * mb##11_re + \
409  ma##T02_re * mb##21_im + ma##T02_im * mb##21_re; \
410  fb_re = \
411  ma##T10_re * mb##01_re - ma##T10_im * mb##01_im + \
412  ma##T11_re * mb##11_re - ma##T11_im * mb##11_im + \
413  ma##T12_re * mb##21_re - ma##T12_im * mb##21_im; \
414  fb_im = \
415  ma##T10_re * mb##01_im + ma##T10_im * mb##01_re + \
416  ma##T11_re * mb##11_im + ma##T11_im * mb##11_re + \
417  ma##T12_re * mb##21_im + ma##T12_im * mb##21_re; \
418  fc_re = \
419  ma##T20_re * mb##01_re - ma##T20_im * mb##01_im + \
420  ma##T21_re * mb##11_re - ma##T21_im * mb##11_im + \
421  ma##T22_re * mb##21_re - ma##T22_im * mb##21_im; \
422  fc_im = \
423  ma##T20_re * mb##01_im + ma##T20_im * mb##01_re + \
424  ma##T21_re * mb##11_im + ma##T21_im * mb##11_re + \
425  ma##T22_re * mb##21_im + ma##T22_im * mb##21_re; \
426  mb##01_re = fa_re; \
427  mb##01_im = fa_im; \
428  mb##11_re = fb_re; \
429  mb##11_im = fb_im; \
430  mb##21_re = fc_re; \
431  mb##21_im = fc_im; \
432  fa_re = \
433  ma##T00_re * mb##02_re - ma##T00_im * mb##02_im + \
434  ma##T01_re * mb##12_re - ma##T01_im * mb##12_im + \
435  ma##T02_re * mb##22_re - ma##T02_im * mb##22_im; \
436  fa_im = \
437  ma##T00_re * mb##02_im + ma##T00_im * mb##02_re + \
438  ma##T01_re * mb##12_im + ma##T01_im * mb##12_re + \
439  ma##T02_re * mb##22_im + ma##T02_im * mb##22_re; \
440  fb_re = \
441  ma##T10_re * mb##02_re - ma##T10_im * mb##02_im + \
442  ma##T11_re * mb##12_re - ma##T11_im * mb##12_im + \
443  ma##T12_re * mb##22_re - ma##T12_im * mb##22_im; \
444  fb_im = \
445  ma##T10_re * mb##02_im + ma##T10_im * mb##02_re + \
446  ma##T11_re * mb##12_im + ma##T11_im * mb##12_re + \
447  ma##T12_re * mb##22_im + ma##T12_im * mb##22_re; \
448  fc_re = \
449  ma##T20_re * mb##02_re - ma##T20_im * mb##02_im + \
450  ma##T21_re * mb##12_re - ma##T21_im * mb##12_im + \
451  ma##T22_re * mb##22_re - ma##T22_im * mb##22_im; \
452  fc_im = \
453  ma##T20_re * mb##02_im + ma##T20_im * mb##02_re + \
454  ma##T21_re * mb##12_im + ma##T21_im * mb##12_re + \
455  ma##T22_re * mb##22_im + ma##T22_im * mb##22_re; \
456  mb##02_re = fa_re; \
457  mb##02_im = fa_im; \
458  mb##12_re = fb_re; \
459  mb##12_im = fb_im; \
460  mb##22_re = fc_re; \
461  mb##22_im = fc_im; \
462  }while(0)
463 
464 
465 
466 #define print_matrix(mul) \
467  printf(" (%f %f) (%f %f) (%f %f)\n", mul##00_re, mul##00_im, mul##01_re, mul##01_im, mul##02_re, mul##02_im); \
468  printf(" (%f %f) (%f %f) (%f %f)\n", mul##10_re, mul##10_im, mul##11_re, mul##11_im, mul##12_re, mul##12_im); \
469  printf(" (%f %f) (%f %f) (%f %f)\n", mul##20_re, mul##20_im, mul##21_re, mul##21_im, mul##22_re, mul##22_im);
470 
471 
472 //FloatN can be float2/float4/double2
473 //Float2 can be float2/double2
474 template<int oddBit, typename Float, typename Float2, typename FloatN>
475  __global__ void
476  GAUGE_FORCE_KERN_NAME(Float2* momEven, Float2* momOdd,
477  const int dir, const double eb3,
478  const FloatN* linkEven, const FloatN* linkOdd,
479  const int* input_path,
480  const int* length, const double* path_coeff, const int num_paths, const kernel_param_t kparam)
481 {
482  int i,j=0;
483  int sid = blockIdx.x * blockDim.x + threadIdx.x;
484  if (sid >= kparam.threads) return;
485 
486  int z1 = sid / X1h;
487  int x1h = sid - z1*X1h;
488  int z2 = z1 / X2;
489  int x2 = z1 - z2*X2;
490  int x4 = z2 / X3;
491  int x3 = z2 - x4*X3;
492  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
493  int x1 = 2*x1h + x1odd;
494 
495 #ifdef MULTI_GPU
496  x4 += 2; x3 += 2; x2 += 2; x1 += 2;
497  int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1;
498 #else
499  int X = 2*sid + x1odd;
500 #endif
501 
502  Float2* mymom=momEven;
503  if (oddBit){
504  mymom = momOdd;
505  }
506 
507  DECLARE_LINK_VARS(LINKA);
508  DECLARE_LINK_VARS(LINKB);
510  Float2 AH0, AH1, AH2, AH3, AH4;
511 
512  int new_mem_idx;
513 
514 
515  SET_SU3_MATRIX(staple, 0);
516 
517  for(i=0;i < num_paths; i++){
518  Float coeff = path_coeff[i];
519  if(coeff == 0) continue;
520 
521  int nbr_oddbit = (oddBit^1 );
522 
523  int new_x1 =x1;
524  int new_x2 =x2;
525  int new_x3 =x3;
526  int new_x4 =x4;
528 
529  //linka: current matrix
530  //linkb: the loaded matrix in this round
531  SET_UNIT_SU3_MATRIX(linka);
532  const int* path = input_path + i*gf.path_max_length;
533 
534  int lnkdir;
535  int path0 = path[0];
536  if (GOES_FORWARDS(path0)){
537  lnkdir=path0;
538  }else{
539  lnkdir=OPP_DIR(path0);
540  COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(path0), new_mem_idx);
541  nbr_oddbit = nbr_oddbit^1;
542 
543  }
544 
545  int nbr_idx = new_mem_idx >>1;
546  if (nbr_oddbit){
547  LOAD_ODD_MATRIX( lnkdir, nbr_idx, LINKB);
548  }else{
549  LOAD_EVEN_MATRIX( lnkdir, nbr_idx, LINKB);
550  }
551  RECONSTRUCT_MATRIX(1, linkb);
552 
553  if (GOES_FORWARDS(path0)){
554  COPY_SU3_MATRIX(linkb, linka);
555  COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(path0, new_mem_idx);
556  nbr_oddbit = nbr_oddbit^1;
557  }else{
558  SU3_ADJOINT(linkb, linka);
559  }
560 
561  for(j=1; j < length[i]; j++){
562 
563  int lnkdir;
564  int pathj = path[j];
565  if (GOES_FORWARDS(pathj)){
566  lnkdir=pathj;
567  }else{
568  lnkdir=OPP_DIR(pathj);
569  COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(pathj), new_mem_idx);
570  nbr_oddbit = nbr_oddbit^1;
571 
572  }
573 
574  int nbr_idx = new_mem_idx >>1;
575  if (nbr_oddbit){
576  LOAD_ODD_MATRIX(lnkdir, nbr_idx, LINKB);
577  }else{
578  LOAD_EVEN_MATRIX(lnkdir, nbr_idx, LINKB);
579  }
580  RECONSTRUCT_MATRIX(1, linkb);
581  if (GOES_FORWARDS(pathj)){
582  MULT_SU3_NN_TEST(linka, linkb);
583 
584  COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(pathj, new_mem_idx);
585  nbr_oddbit = nbr_oddbit^1;
586 
587 
588  }else{
589  MULT_SU3_NA_TEST(linka, linkb);
590  }
591 
592  }//j
593  SCALAR_MULT_ADD_SU3_MATRIX(staple, linka, coeff, staple);
594  }//i
595 
596 
597  //update mom
598  if (oddBit){
599  LOAD_ODD_MATRIX(dir, (X>>1), LINKA);
600  }else{
601  LOAD_EVEN_MATRIX(dir, (X>>1), LINKA);
602  }
603  RECONSTRUCT_MATRIX(1, linka);
604  MULT_SU3_NN_TEST(linka, staple);
605  LOAD_ANTI_HERMITIAN(mymom, dir, sid, AH);
606  UNCOMPRESS_ANTI_HERMITIAN(ah, linkb);
607  SCALAR_MULT_SUB_SU3_MATRIX(linkb, linka, eb3, linka);
608  MAKE_ANTI_HERMITIAN(linka, ah);
609 
610  WRITE_ANTI_HERMITIAN(mymom, dir, sid, AH, gf.mom_ga_stride);
611 
612  return;
613 }
614 
615 
616 
617 #undef COMPUTE_NEW_FULL_IDX_PLUS_UPDATE
618 #undef COMPUTE_NEW_FULL_IDX_MINUS_UPDATE
619 #undef MULT_SU3_NN_TEST
620 #undef MULT_SU3_NA_TEST
621 #undef MULT_SU3_AN_TEST
622 
623 #undef linka00_re
624 #undef linka00_im
625 #undef linka01_re
626 #undef linka01_im
627 #undef linka02_re
628 #undef linka02_im
629 #undef linka10_re
630 #undef linka10_im
631 #undef linka11_re
632 #undef linka11_im
633 #undef linka12_re
634 #undef linka12_im
635 #undef linka20_re
636 #undef linka20_im
637 #undef linka21_re
638 #undef linka21_im
639 #undef linka22_re
640 #undef linka22_im
641 
642 #undef linkb00_re
643 #undef linkb00_im
644 #undef linkb01_re
645 #undef linkb01_im
646 #undef linkb02_re
647 #undef linkb02_im
648 #undef linkb10_re
649 #undef linkb10_im
650 #undef linkb11_re
651 #undef linkb11_im
652 #undef linkb12_re
653 #undef linkb12_im
654 #undef linkb20_re
655 #undef linkb20_im
656 #undef linkb21_re
657 #undef linkb21_im
658 #undef linkb22_re
659 #undef linkb22_im
#define UNCOMPRESS_ANTI_HERMITIAN(ah, m)
Definition: force_common.h:511
__constant__ int X1h
__constant__ int X2
#define COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx)
FloatM STAPLE1
Definition: llfat_core.h:809
__constant__ fat_force_const_t gf
#define COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx)
FloatM STAPLE2
Definition: llfat_core.h:809
#define LOAD_EVEN_MATRIX(dir, idx, var)
__constant__ int E3E2E1
int length[]
FloatM STAPLE8
Definition: llfat_core.h:809
__constant__ int E2E1
#define LOAD_ANTI_HERMITIAN(src, dir, idx, var)
FloatM STAPLE0
Definition: llfat_core.h:809
#define OPP_DIR(dir)
Definition: force_common.h:16
#define SCALAR_MULT_SUB_SU3_MATRIX(ma, mb, s, mc)
Definition: force_common.h:479
#define MULT_SU3_NN_TEST(ma, mb)
struct quda::kernel_param_s kernel_param_t
FloatingPoint< float > Float
Definition: gtest.h:7350
#define MULT_SU3_NA_TEST(ma, mb)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const linkOdd
int z1
Definition: llfat_core.h:814
int new_mem_idx
Definition: llfat_core.h:834
__constant__ double coeff
short x1h
Definition: llfat_core.h:815
FloatM STAPLE6
Definition: llfat_core.h:809
#define WRITE_ANTI_HERMITIAN(mem, dir, idx, var, stride)
Definition: force_common.h:566
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const hisq_kernel_param_t kparam
#define SU3_ADJOINT(a, b)
Definition: force_common.h:595
#define SCALAR_MULT_ADD_SU3_MATRIX(ma, mb, s, mc)
Definition: force_common.h:459
FloatM STAPLE7
Definition: llfat_core.h:809
__constant__ int X3
int z2
Definition: llfat_core.h:816
#define RECONSTRUCT_MATRIX(sign, var)
FloatM STAPLE4
Definition: llfat_core.h:809
#define LOAD_ODD_MATRIX(dir, idx, var)
short x1odd
Definition: llfat_core.h:821
#define DECLARE_LINK_VARS(var)
#define SET_UNIT_SU3_MATRIX(a)
Definition: force_common.h:615
FloatM STAPLE5
Definition: llfat_core.h:809
#define GOES_FORWARDS(dir)
Definition: force_common.h:17
__constant__ int E1
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const linkEven
FloatM STAPLE3
Definition: llfat_core.h:809
#define COPY_SU3_MATRIX(a, b)
Definition: force_common.h:575
#define SET_SU3_MATRIX(a, value)
Definition: force_common.h:439
#define MAKE_ANTI_HERMITIAN(m, ah)
Definition: force_common.h:532
int oddBit
__global__ void GAUGE_FORCE_KERN_NAME(Float2 *momEven, Float2 *momOdd, const int dir, const double eb3, const FloatN *linkEven, const FloatN *linkOdd, const int *input_path, const int *length, const double *path_coeff, const int num_paths, const kernel_param_t kparam)