/*
 * QUDA v0.7.0 - A library for QCD on GPUs
 * File: force_common.h
 * (Source listing extracted from the generated API documentation.)
 */
1 #ifndef _FORCE_COMMON_H
2 #define _FORCE_COMMON_H
3 
/*
 * Labels for the eight link directions.
 * The numbering is laid out so that a direction and its opposite always
 * add up to 7: XUP/XDOWN = 0/7, YUP/YDOWN = 1/6, ZUP/ZDOWN = 2/5,
 * TUP/TDOWN = 3/4.
 */
enum {
  XUP   = 0, YUP   = 1, ZUP   = 2, TUP   = 3,
  TDOWN = 4, ZDOWN = 5, YDOWN = 6, XDOWN = 7
};
14 
15 
/* Direction opposite to dir (opposite directions sum to 7). */
#define OPP_DIR(dir) (7 - (dir))
/* Non-zero when dir is at most 3, i.e. one of the *UP directions. */
#define GOES_FORWARDS(dir) ((dir) <= 3)
/* Non-zero when dir is greater than 3, i.e. one of the *DOWN directions. */
#define GOES_BACKWARDS(dir) ((dir) > 3)
19 
20 
/*
 * Adjoint (conjugate transpose) view of the "linka" matrix:
 * element (i,j) of linkaT is the complex conjugate of element (j,i) of
 * linka, i.e. the real part is taken as-is and the imaginary part negated.
 */
/* row 0 */
#define linkaT00_re   (+linka00_re)
#define linkaT00_im   (-linka00_im)
#define linkaT01_re   (+linka10_re)
#define linkaT01_im   (-linka10_im)
#define linkaT02_re   (+linka20_re)
#define linkaT02_im   (-linka20_im)
/* row 1 */
#define linkaT10_re   (+linka01_re)
#define linkaT10_im   (-linka01_im)
#define linkaT11_re   (+linka11_re)
#define linkaT11_im   (-linka11_im)
#define linkaT12_re   (+linka21_re)
#define linkaT12_im   (-linka21_im)
/* row 2 */
#define linkaT20_re   (+linka02_re)
#define linkaT20_im   (-linka02_im)
#define linkaT21_re   (+linka12_re)
#define linkaT21_im   (-linka12_im)
#define linkaT22_re   (+linka22_re)
#define linkaT22_im   (-linka22_im)
39 
40 
/*
 * Adjoint (conjugate transpose) view of the "linkb" matrix:
 * element (i,j) of linkbT is the complex conjugate of element (j,i) of
 * linkb, i.e. the real part is taken as-is and the imaginary part negated.
 */
/* row 0 */
#define linkbT00_re   (+linkb00_re)
#define linkbT00_im   (-linkb00_im)
#define linkbT01_re   (+linkb10_re)
#define linkbT01_im   (-linkb10_im)
#define linkbT02_re   (+linkb20_re)
#define linkbT02_im   (-linkb20_im)
/* row 1 */
#define linkbT10_re   (+linkb01_re)
#define linkbT10_im   (-linkb01_im)
#define linkbT11_re   (+linkb11_re)
#define linkbT11_im   (-linkb11_im)
#define linkbT12_re   (+linkb21_re)
#define linkbT12_im   (-linkb21_im)
/* row 2 */
#define linkbT20_re   (+linkb02_re)
#define linkbT20_im   (-linkb02_im)
#define linkbT21_re   (+linkb12_re)
#define linkbT21_im   (-linkb12_im)
#define linkbT22_re   (+linkb22_re)
#define linkbT22_im   (-linkb22_im)
59 
60 
61 
62 
/*
 * Element names for the "linkc" matrix, mapped onto the members of five
 * objects LINKC0..LINKC4 that must be in scope at the point of use.
 * LINKC0..LINKC3 supply four members each (x, y, z, w = two complex
 * numbers); LINKC4 supplies only x and y (the last complex number).
 */
#define linkc00_re   LINKC0.x
#define linkc00_im   LINKC0.y
#define linkc01_re   LINKC0.z
#define linkc01_im   LINKC0.w

#define linkc02_re   LINKC1.x
#define linkc02_im   LINKC1.y
#define linkc10_re   LINKC1.z
#define linkc10_im   LINKC1.w

#define linkc11_re   LINKC2.x
#define linkc11_im   LINKC2.y
#define linkc12_re   LINKC2.z
#define linkc12_im   LINKC2.w

#define linkc20_re   LINKC3.x
#define linkc20_im   LINKC3.y
#define linkc21_re   LINKC3.z
#define linkc21_im   LINKC3.w

#define linkc22_re   LINKC4.x
#define linkc22_im   LINKC4.y
81 
/*
 * Adjoint (conjugate transpose) view of the "linkc" matrix:
 * element (i,j) of linkcT is the complex conjugate of element (j,i) of
 * linkc, i.e. the real part is taken as-is and the imaginary part negated.
 */
/* row 0 */
#define linkcT00_re   (+linkc00_re)
#define linkcT00_im   (-linkc00_im)
#define linkcT01_re   (+linkc10_re)
#define linkcT01_im   (-linkc10_im)
#define linkcT02_re   (+linkc20_re)
#define linkcT02_im   (-linkc20_im)
/* row 1 */
#define linkcT10_re   (+linkc01_re)
#define linkcT10_im   (-linkc01_im)
#define linkcT11_re   (+linkc11_re)
#define linkcT11_im   (-linkc11_im)
#define linkcT12_re   (+linkc21_re)
#define linkcT12_im   (-linkc21_im)
/* row 2 */
#define linkcT20_re   (+linkc02_re)
#define linkcT20_im   (-linkc02_im)
#define linkcT21_re   (+linkc12_re)
#define linkcT21_im   (-linkc12_im)
#define linkcT22_re   (+linkc22_re)
#define linkcT22_im   (-linkc22_im)
100 
101 
/*
 * Element names for the "staple" matrix, mapped onto the x (real) and
 * y (imaginary) members of nine objects STAPLE0..STAPLE8 that must be in
 * scope at the point of use. STAPLEk holds element (k / 3, k % 3).
 */
#define staple00_re   STAPLE0.x
#define staple00_im   STAPLE0.y
#define staple01_re   STAPLE1.x
#define staple01_im   STAPLE1.y
#define staple02_re   STAPLE2.x
#define staple02_im   STAPLE2.y

#define staple10_re   STAPLE3.x
#define staple10_im   STAPLE3.y
#define staple11_re   STAPLE4.x
#define staple11_im   STAPLE4.y
#define staple12_re   STAPLE5.x
#define staple12_im   STAPLE5.y

#define staple20_re   STAPLE6.x
#define staple20_im   STAPLE6.y
#define staple21_re   STAPLE7.x
#define staple21_im   STAPLE7.y
#define staple22_re   STAPLE8.x
#define staple22_im   STAPLE8.y
120 
/*
 * Adjoint (conjugate transpose) view of the "staple" matrix:
 * element (i,j) of stapleT is the complex conjugate of element (j,i) of
 * staple, i.e. the real part is taken as-is and the imaginary part negated.
 */
/* row 0 */
#define stapleT00_re   (+staple00_re)
#define stapleT00_im   (-staple00_im)
#define stapleT01_re   (+staple10_re)
#define stapleT01_im   (-staple10_im)
#define stapleT02_re   (+staple20_re)
#define stapleT02_im   (-staple20_im)
/* row 1 */
#define stapleT10_re   (+staple01_re)
#define stapleT10_im   (-staple01_im)
#define stapleT11_re   (+staple11_re)
#define stapleT11_im   (-staple11_im)
#define stapleT12_re   (+staple21_re)
#define stapleT12_im   (-staple21_im)
/* row 2 */
#define stapleT20_re   (+staple02_re)
#define stapleT20_im   (-staple02_im)
#define stapleT21_re   (+staple12_re)
#define stapleT21_im   (-staple12_im)
#define stapleT22_re   (+staple22_re)
#define stapleT22_im   (-staple22_im)
139 
/*
 * READ_DOUBLE2_TEXTURE(x_tex, x, i): fetch element i.
 * By default the read goes through fetch_double2_old(x_tex, i); when
 * FERMI_NO_DBLE_TEX is defined the element is read directly as (x)[i]
 * and x_tex is ignored.
 */
#ifndef FERMI_NO_DBLE_TEX
#define READ_DOUBLE2_TEXTURE(x_tex, x, i) fetch_double2_old(x_tex, i)
#else
#define READ_DOUBLE2_TEXTURE(x_tex, x, i) (x)[i]
#endif
145 
146 
/*
 * LOAD_MATRIX_12_SINGLE(gauge, dir, idx, var, stride)
 * Reads three consecutive stride-separated elements of gauge into
 * var0, var1, var2. Element k comes from
 *   gauge[idx + dir*stride*3 + k*stride].
 * Every argument is parenthesized so expression arguments (e.g. "d - 1")
 * index correctly. Arguments are evaluated more than once: do not pass
 * expressions with side effects.
 */
#define LOAD_MATRIX_12_SINGLE(gauge, dir, idx, var, stride) do { \
    var##0 = (gauge)[(idx) + (dir)*(stride)*3]; \
    var##1 = (gauge)[(idx) + (dir)*(stride)*3 + (stride)]; \
    var##2 = (gauge)[(idx) + (dir)*(stride)*3 + (stride)*2]; \
  } while (0)
152 
/*
 * LOAD_MATRIX_12_SINGLE_TEX(gauge, dir, idx, var, stride)
 * Same access pattern as LOAD_MATRIX_12_SINGLE, but each element is read
 * with tex1Dfetch(gauge, index): var0..var2 come from indices
 *   idx + dir*stride*3 + k*stride,  k = 0..2.
 * Index arguments are parenthesized so expression arguments work; they are
 * evaluated more than once, so avoid side effects.
 */
#define LOAD_MATRIX_12_SINGLE_TEX(gauge, dir, idx, var, stride) do { \
    var##0 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*3); \
    var##1 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*3 + (stride)); \
    var##2 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*3 + (stride)*2); \
  } while (0)
158 
/*
 * LOAD_MATRIX_12_DOUBLE(gauge, dir, idx, var, stride)
 * Reads six stride-separated elements of gauge into var0..var5.
 * Element k comes from gauge[idx + dir*stride*6 + k*stride].
 * Every argument is parenthesized so expression arguments index correctly;
 * arguments are evaluated more than once, so avoid side effects.
 */
#define LOAD_MATRIX_12_DOUBLE(gauge, dir, idx, var, stride) do { \
    var##0 = (gauge)[(idx) + (dir)*(stride)*6]; \
    var##1 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)]; \
    var##2 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*2]; \
    var##3 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*3]; \
    var##4 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*4]; \
    var##5 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*5]; \
  } while (0)
167 
/*
 * LOAD_MATRIX_12_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride)
 * Reads six stride-separated elements into var0..var5 through
 * READ_DOUBLE2_TEXTURE(gauge_tex, gauge, index), with
 *   index = idx + dir*stride*6 + k*stride,  k = 0..5.
 * Index arguments are parenthesized so expression arguments work; they are
 * evaluated more than once, so avoid side effects.
 */
#define LOAD_MATRIX_12_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride) do { \
    var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6); \
    var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)); \
    var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*2); \
    var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*3); \
    var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*4); \
    var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*5); \
  } while (0)
176 
/*
 * LOAD_MATRIX_18(gauge, dir, idx, var, stride)
 * Reads nine stride-separated elements of gauge into var0..var8.
 * Element k comes from gauge[idx + dir*stride*9 + k*stride].
 * Every argument is parenthesized so expression arguments index correctly;
 * arguments are evaluated more than once, so avoid side effects.
 */
#define LOAD_MATRIX_18(gauge, dir, idx, var, stride) do { \
    var##0 = (gauge)[(idx) + (dir)*(stride)*9]; \
    var##1 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)]; \
    var##2 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*2]; \
    var##3 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*3]; \
    var##4 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*4]; \
    var##5 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*5]; \
    var##6 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*6]; \
    var##7 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*7]; \
    var##8 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*8]; \
  } while (0)
188 
/*
 * LOAD_MATRIX_18_SINGLE_TEX(gauge, dir, idx, var, stride)
 * Reads nine stride-separated elements into var0..var8 with
 * tex1Dfetch(gauge, index), where
 *   index = idx + dir*stride*9 + k*stride,  k = 0..8.
 * Index arguments are parenthesized so expression arguments work; they are
 * evaluated more than once, so avoid side effects.
 */
#define LOAD_MATRIX_18_SINGLE_TEX(gauge, dir, idx, var, stride) do { \
    var##0 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9); \
    var##1 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)); \
    var##2 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*2); \
    var##3 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*3); \
    var##4 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*4); \
    var##5 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*5); \
    var##6 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*6); \
    var##7 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*7); \
    var##8 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*8); \
  } while (0)
200 
/*
 * LOAD_MATRIX_18_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride)
 * Reads nine stride-separated elements into var0..var8 through
 * READ_DOUBLE2_TEXTURE(gauge_tex, gauge, index), with
 *   index = idx + dir*stride*9 + k*stride,  k = 0..8.
 * Index arguments are parenthesized so expression arguments work; they are
 * evaluated more than once, so avoid side effects.
 */
#define LOAD_MATRIX_18_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride) do { \
    var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9); \
    var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)); \
    var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*2); \
    var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*3); \
    var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*4); \
    var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*5); \
    var##6 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*6); \
    var##7 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*7); \
    var##8 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*8); \
  } while (0)
212 
/*
 * MULT_SU3_NN(ma, mb, mc): 3x3 complex matrix product  mc = ma * mb.
 *
 * Each matrix is addressed by token pasting: element (i,j) of matrix "m" is
 * the pair m##ij_re / m##ij_im. For every output element
 *   mc(i,j) = sum over k of ma(i,k) * mb(k,j)
 * is expanded into real and imaginary parts. Output elements are produced
 * column by column (00, 10, 20, 01, ...).
 *
 * mc must not name the same matrix as ma or mb: elements of mc are written
 * while ma and mb are still being read. The expansion is a plain statement
 * sequence (not wrapped in do/while), so brace it when used under an if.
 */
#define MULT_SU3_NN(ma, mb, mc) \
  mc##00_re = ma##00_re * mb##00_re - ma##00_im * mb##00_im \
            + ma##01_re * mb##10_re - ma##01_im * mb##10_im \
            + ma##02_re * mb##20_re - ma##02_im * mb##20_im; \
  mc##00_im = ma##00_re * mb##00_im + ma##00_im * mb##00_re \
            + ma##01_re * mb##10_im + ma##01_im * mb##10_re \
            + ma##02_re * mb##20_im + ma##02_im * mb##20_re; \
  mc##10_re = ma##10_re * mb##00_re - ma##10_im * mb##00_im \
            + ma##11_re * mb##10_re - ma##11_im * mb##10_im \
            + ma##12_re * mb##20_re - ma##12_im * mb##20_im; \
  mc##10_im = ma##10_re * mb##00_im + ma##10_im * mb##00_re \
            + ma##11_re * mb##10_im + ma##11_im * mb##10_re \
            + ma##12_re * mb##20_im + ma##12_im * mb##20_re; \
  mc##20_re = ma##20_re * mb##00_re - ma##20_im * mb##00_im \
            + ma##21_re * mb##10_re - ma##21_im * mb##10_im \
            + ma##22_re * mb##20_re - ma##22_im * mb##20_im; \
  mc##20_im = ma##20_re * mb##00_im + ma##20_im * mb##00_re \
            + ma##21_re * mb##10_im + ma##21_im * mb##10_re \
            + ma##22_re * mb##20_im + ma##22_im * mb##20_re; \
  mc##01_re = ma##00_re * mb##01_re - ma##00_im * mb##01_im \
            + ma##01_re * mb##11_re - ma##01_im * mb##11_im \
            + ma##02_re * mb##21_re - ma##02_im * mb##21_im; \
  mc##01_im = ma##00_re * mb##01_im + ma##00_im * mb##01_re \
            + ma##01_re * mb##11_im + ma##01_im * mb##11_re \
            + ma##02_re * mb##21_im + ma##02_im * mb##21_re; \
  mc##11_re = ma##10_re * mb##01_re - ma##10_im * mb##01_im \
            + ma##11_re * mb##11_re - ma##11_im * mb##11_im \
            + ma##12_re * mb##21_re - ma##12_im * mb##21_im; \
  mc##11_im = ma##10_re * mb##01_im + ma##10_im * mb##01_re \
            + ma##11_re * mb##11_im + ma##11_im * mb##11_re \
            + ma##12_re * mb##21_im + ma##12_im * mb##21_re; \
  mc##21_re = ma##20_re * mb##01_re - ma##20_im * mb##01_im \
            + ma##21_re * mb##11_re - ma##21_im * mb##11_im \
            + ma##22_re * mb##21_re - ma##22_im * mb##21_im; \
  mc##21_im = ma##20_re * mb##01_im + ma##20_im * mb##01_re \
            + ma##21_re * mb##11_im + ma##21_im * mb##11_re \
            + ma##22_re * mb##21_im + ma##22_im * mb##21_re; \
  mc##02_re = ma##00_re * mb##02_re - ma##00_im * mb##02_im \
            + ma##01_re * mb##12_re - ma##01_im * mb##12_im \
            + ma##02_re * mb##22_re - ma##02_im * mb##22_im; \
  mc##02_im = ma##00_re * mb##02_im + ma##00_im * mb##02_re \
            + ma##01_re * mb##12_im + ma##01_im * mb##12_re \
            + ma##02_re * mb##22_im + ma##02_im * mb##22_re; \
  mc##12_re = ma##10_re * mb##02_re - ma##10_im * mb##02_im \
            + ma##11_re * mb##12_re - ma##11_im * mb##12_im \
            + ma##12_re * mb##22_re - ma##12_im * mb##22_im; \
  mc##12_im = ma##10_re * mb##02_im + ma##10_im * mb##02_re \
            + ma##11_re * mb##12_im + ma##11_im * mb##12_re \
            + ma##12_re * mb##22_im + ma##12_im * mb##22_re; \
  mc##22_re = ma##20_re * mb##02_re - ma##20_im * mb##02_im \
            + ma##21_re * mb##12_re - ma##21_im * mb##12_im \
            + ma##22_re * mb##22_re - ma##22_im * mb##22_im; \
  mc##22_im = ma##20_re * mb##02_im + ma##20_im * mb##02_re \
            + ma##21_re * mb##12_im + ma##21_im * mb##12_re \
            + ma##22_re * mb##22_im + ma##22_im * mb##22_re;
286 
287 
288 
/*
 * MULT_SU3_NA(ma, mb, mc): 3x3 complex matrix product  mc = ma * mbT,
 * where the second operand is addressed through the names mb##Tij_re /
 * mb##Tij_im. With the "...T" alias tables defined in this header those
 * names denote the adjoint (conjugate transpose) of mb, so the result is
 * ma times the adjoint of mb.
 *
 * Output elements are produced column by column (00, 10, 20, 01, ...).
 * mc must not name the same matrix as ma or mb: elements of mc are written
 * while the operands are still being read. The expansion is a plain
 * statement sequence (not wrapped in do/while), so brace it under an if.
 */
#define MULT_SU3_NA(ma, mb, mc) \
  mc##00_re = ma##00_re * mb##T00_re - ma##00_im * mb##T00_im \
            + ma##01_re * mb##T10_re - ma##01_im * mb##T10_im \
            + ma##02_re * mb##T20_re - ma##02_im * mb##T20_im; \
  mc##00_im = ma##00_re * mb##T00_im + ma##00_im * mb##T00_re \
            + ma##01_re * mb##T10_im + ma##01_im * mb##T10_re \
            + ma##02_re * mb##T20_im + ma##02_im * mb##T20_re; \
  mc##10_re = ma##10_re * mb##T00_re - ma##10_im * mb##T00_im \
            + ma##11_re * mb##T10_re - ma##11_im * mb##T10_im \
            + ma##12_re * mb##T20_re - ma##12_im * mb##T20_im; \
  mc##10_im = ma##10_re * mb##T00_im + ma##10_im * mb##T00_re \
            + ma##11_re * mb##T10_im + ma##11_im * mb##T10_re \
            + ma##12_re * mb##T20_im + ma##12_im * mb##T20_re; \
  mc##20_re = ma##20_re * mb##T00_re - ma##20_im * mb##T00_im \
            + ma##21_re * mb##T10_re - ma##21_im * mb##T10_im \
            + ma##22_re * mb##T20_re - ma##22_im * mb##T20_im; \
  mc##20_im = ma##20_re * mb##T00_im + ma##20_im * mb##T00_re \
            + ma##21_re * mb##T10_im + ma##21_im * mb##T10_re \
            + ma##22_re * mb##T20_im + ma##22_im * mb##T20_re; \
  mc##01_re = ma##00_re * mb##T01_re - ma##00_im * mb##T01_im \
            + ma##01_re * mb##T11_re - ma##01_im * mb##T11_im \
            + ma##02_re * mb##T21_re - ma##02_im * mb##T21_im; \
  mc##01_im = ma##00_re * mb##T01_im + ma##00_im * mb##T01_re \
            + ma##01_re * mb##T11_im + ma##01_im * mb##T11_re \
            + ma##02_re * mb##T21_im + ma##02_im * mb##T21_re; \
  mc##11_re = ma##10_re * mb##T01_re - ma##10_im * mb##T01_im \
            + ma##11_re * mb##T11_re - ma##11_im * mb##T11_im \
            + ma##12_re * mb##T21_re - ma##12_im * mb##T21_im; \
  mc##11_im = ma##10_re * mb##T01_im + ma##10_im * mb##T01_re \
            + ma##11_re * mb##T11_im + ma##11_im * mb##T11_re \
            + ma##12_re * mb##T21_im + ma##12_im * mb##T21_re; \
  mc##21_re = ma##20_re * mb##T01_re - ma##20_im * mb##T01_im \
            + ma##21_re * mb##T11_re - ma##21_im * mb##T11_im \
            + ma##22_re * mb##T21_re - ma##22_im * mb##T21_im; \
  mc##21_im = ma##20_re * mb##T01_im + ma##20_im * mb##T01_re \
            + ma##21_re * mb##T11_im + ma##21_im * mb##T11_re \
            + ma##22_re * mb##T21_im + ma##22_im * mb##T21_re; \
  mc##02_re = ma##00_re * mb##T02_re - ma##00_im * mb##T02_im \
            + ma##01_re * mb##T12_re - ma##01_im * mb##T12_im \
            + ma##02_re * mb##T22_re - ma##02_im * mb##T22_im; \
  mc##02_im = ma##00_re * mb##T02_im + ma##00_im * mb##T02_re \
            + ma##01_re * mb##T12_im + ma##01_im * mb##T12_re \
            + ma##02_re * mb##T22_im + ma##02_im * mb##T22_re; \
  mc##12_re = ma##10_re * mb##T02_re - ma##10_im * mb##T02_im \
            + ma##11_re * mb##T12_re - ma##11_im * mb##T12_im \
            + ma##12_re * mb##T22_re - ma##12_im * mb##T22_im; \
  mc##12_im = ma##10_re * mb##T02_im + ma##10_im * mb##T02_re \
            + ma##11_re * mb##T12_im + ma##11_im * mb##T12_re \
            + ma##12_re * mb##T22_im + ma##12_im * mb##T22_re; \
  mc##22_re = ma##20_re * mb##T02_re - ma##20_im * mb##T02_im \
            + ma##21_re * mb##T12_re - ma##21_im * mb##T12_im \
            + ma##22_re * mb##T22_re - ma##22_im * mb##T22_im; \
  mc##22_im = ma##20_re * mb##T02_im + ma##20_im * mb##T02_re \
            + ma##21_re * mb##T12_im + ma##21_im * mb##T12_re \
            + ma##22_re * mb##T22_im + ma##22_im * mb##T22_re;
362 
363 
364 
/*
 * MULT_SU3_AN(ma, mb, mc): 3x3 complex matrix product  mc = maT * mb,
 * where the first operand is addressed through the names ma##Tij_re /
 * ma##Tij_im. With the "...T" alias tables defined in this header those
 * names denote the adjoint (conjugate transpose) of ma, so the result is
 * the adjoint of ma times mb.
 *
 * Output elements are produced column by column (00, 10, 20, 01, ...).
 * mc must not name the same matrix as ma or mb: elements of mc are written
 * while the operands are still being read. The expansion is a plain
 * statement sequence (not wrapped in do/while), so brace it under an if.
 */
#define MULT_SU3_AN(ma, mb, mc) \
  mc##00_re = ma##T00_re * mb##00_re - ma##T00_im * mb##00_im \
            + ma##T01_re * mb##10_re - ma##T01_im * mb##10_im \
            + ma##T02_re * mb##20_re - ma##T02_im * mb##20_im; \
  mc##00_im = ma##T00_re * mb##00_im + ma##T00_im * mb##00_re \
            + ma##T01_re * mb##10_im + ma##T01_im * mb##10_re \
            + ma##T02_re * mb##20_im + ma##T02_im * mb##20_re; \
  mc##10_re = ma##T10_re * mb##00_re - ma##T10_im * mb##00_im \
            + ma##T11_re * mb##10_re - ma##T11_im * mb##10_im \
            + ma##T12_re * mb##20_re - ma##T12_im * mb##20_im; \
  mc##10_im = ma##T10_re * mb##00_im + ma##T10_im * mb##00_re \
            + ma##T11_re * mb##10_im + ma##T11_im * mb##10_re \
            + ma##T12_re * mb##20_im + ma##T12_im * mb##20_re; \
  mc##20_re = ma##T20_re * mb##00_re - ma##T20_im * mb##00_im \
            + ma##T21_re * mb##10_re - ma##T21_im * mb##10_im \
            + ma##T22_re * mb##20_re - ma##T22_im * mb##20_im; \
  mc##20_im = ma##T20_re * mb##00_im + ma##T20_im * mb##00_re \
            + ma##T21_re * mb##10_im + ma##T21_im * mb##10_re \
            + ma##T22_re * mb##20_im + ma##T22_im * mb##20_re; \
  mc##01_re = ma##T00_re * mb##01_re - ma##T00_im * mb##01_im \
            + ma##T01_re * mb##11_re - ma##T01_im * mb##11_im \
            + ma##T02_re * mb##21_re - ma##T02_im * mb##21_im; \
  mc##01_im = ma##T00_re * mb##01_im + ma##T00_im * mb##01_re \
            + ma##T01_re * mb##11_im + ma##T01_im * mb##11_re \
            + ma##T02_re * mb##21_im + ma##T02_im * mb##21_re; \
  mc##11_re = ma##T10_re * mb##01_re - ma##T10_im * mb##01_im \
            + ma##T11_re * mb##11_re - ma##T11_im * mb##11_im \
            + ma##T12_re * mb##21_re - ma##T12_im * mb##21_im; \
  mc##11_im = ma##T10_re * mb##01_im + ma##T10_im * mb##01_re \
            + ma##T11_re * mb##11_im + ma##T11_im * mb##11_re \
            + ma##T12_re * mb##21_im + ma##T12_im * mb##21_re; \
  mc##21_re = ma##T20_re * mb##01_re - ma##T20_im * mb##01_im \
            + ma##T21_re * mb##11_re - ma##T21_im * mb##11_im \
            + ma##T22_re * mb##21_re - ma##T22_im * mb##21_im; \
  mc##21_im = ma##T20_re * mb##01_im + ma##T20_im * mb##01_re \
            + ma##T21_re * mb##11_im + ma##T21_im * mb##11_re \
            + ma##T22_re * mb##21_im + ma##T22_im * mb##21_re; \
  mc##02_re = ma##T00_re * mb##02_re - ma##T00_im * mb##02_im \
            + ma##T01_re * mb##12_re - ma##T01_im * mb##12_im \
            + ma##T02_re * mb##22_re - ma##T02_im * mb##22_im; \
  mc##02_im = ma##T00_re * mb##02_im + ma##T00_im * mb##02_re \
            + ma##T01_re * mb##12_im + ma##T01_im * mb##12_re \
            + ma##T02_re * mb##22_im + ma##T02_im * mb##22_re; \
  mc##12_re = ma##T10_re * mb##02_re - ma##T10_im * mb##02_im \
            + ma##T11_re * mb##12_re - ma##T11_im * mb##12_im \
            + ma##T12_re * mb##22_re - ma##T12_im * mb##22_im; \
  mc##12_im = ma##T10_re * mb##02_im + ma##T10_im * mb##02_re \
            + ma##T11_re * mb##12_im + ma##T11_im * mb##12_re \
            + ma##T12_re * mb##22_im + ma##T12_im * mb##22_re; \
  mc##22_re = ma##T20_re * mb##02_re - ma##T20_im * mb##02_im \
            + ma##T21_re * mb##12_re - ma##T21_im * mb##12_im \
            + ma##T22_re * mb##22_re - ma##T22_im * mb##22_im; \
  mc##22_im = ma##T20_re * mb##02_im + ma##T20_im * mb##02_re \
            + ma##T21_re * mb##12_im + ma##T21_im * mb##12_re \
            + ma##T22_re * mb##22_im + ma##T22_im * mb##22_re;
438 
/*
 * SET_SU3_MATRIX(a, value): assign value to the real and imaginary part of
 * all nine elements of matrix "a" (a##ij_re / a##ij_im).
 *
 * value is parenthesized and is evaluated once per assignment (18 times),
 * so do not pass an expression with side effects. The macro body no longer
 * ends in a line continuation, so the line following the definition can
 * never be spliced into it. The expansion is a plain statement sequence
 * (not wrapped in do/while), so brace it when used under an if.
 */
#define SET_SU3_MATRIX(a, value) \
  a##00_re = (value); \
  a##00_im = (value); \
  a##01_re = (value); \
  a##01_im = (value); \
  a##02_re = (value); \
  a##02_im = (value); \
  a##10_re = (value); \
  a##10_im = (value); \
  a##11_re = (value); \
  a##11_im = (value); \
  a##12_re = (value); \
  a##12_im = (value); \
  a##20_re = (value); \
  a##20_im = (value); \
  a##21_re = (value); \
  a##21_im = (value); \
  a##22_re = (value); \
  a##22_im = (value);
458 
/*
 * SCALAR_MULT_ADD_SU3_MATRIX(ma, mb, s, mc): element-wise  mc = ma + mb * s
 * over the real and imaginary parts of all nine matrix elements.
 *
 * s is parenthesized so that an expression such as "c1 + c2" scales mb as a
 * whole; it is evaluated 18 times, so avoid side effects. Each output
 * component depends only on the same component of ma and mb. The expansion
 * is a plain statement sequence (not wrapped in do/while), so brace it
 * when used under an if.
 */
#define SCALAR_MULT_ADD_SU3_MATRIX(ma, mb, s, mc) \
  mc##00_re = ma##00_re + mb##00_re * (s); \
  mc##00_im = ma##00_im + mb##00_im * (s); \
  mc##01_re = ma##01_re + mb##01_re * (s); \
  mc##01_im = ma##01_im + mb##01_im * (s); \
  mc##02_re = ma##02_re + mb##02_re * (s); \
  mc##02_im = ma##02_im + mb##02_im * (s); \
  mc##10_re = ma##10_re + mb##10_re * (s); \
  mc##10_im = ma##10_im + mb##10_im * (s); \
  mc##11_re = ma##11_re + mb##11_re * (s); \
  mc##11_im = ma##11_im + mb##11_im * (s); \
  mc##12_re = ma##12_re + mb##12_re * (s); \
  mc##12_im = ma##12_im + mb##12_im * (s); \
  mc##20_re = ma##20_re + mb##20_re * (s); \
  mc##20_im = ma##20_im + mb##20_im * (s); \
  mc##21_re = ma##21_re + mb##21_re * (s); \
  mc##21_im = ma##21_im + mb##21_im * (s); \
  mc##22_re = ma##22_re + mb##22_re * (s); \
  mc##22_im = ma##22_im + mb##22_im * (s);
478 
/*
 * SCALAR_MULT_SUB_SU3_MATRIX(ma, mb, s, mc): element-wise  mc = ma - mb * s
 * over the real and imaginary parts of all nine matrix elements.
 *
 * s is parenthesized so that an expression such as "c1 + c2" scales mb as a
 * whole; it is evaluated 18 times, so avoid side effects. Each output
 * component depends only on the same component of ma and mb. The expansion
 * is a plain statement sequence (not wrapped in do/while), so brace it
 * when used under an if.
 */
#define SCALAR_MULT_SUB_SU3_MATRIX(ma, mb, s, mc) \
  mc##00_re = ma##00_re - mb##00_re * (s); \
  mc##00_im = ma##00_im - mb##00_im * (s); \
  mc##01_re = ma##01_re - mb##01_re * (s); \
  mc##01_im = ma##01_im - mb##01_im * (s); \
  mc##02_re = ma##02_re - mb##02_re * (s); \
  mc##02_im = ma##02_im - mb##02_im * (s); \
  mc##10_re = ma##10_re - mb##10_re * (s); \
  mc##10_im = ma##10_im - mb##10_im * (s); \
  mc##11_re = ma##11_re - mb##11_re * (s); \
  mc##11_im = ma##11_im - mb##11_im * (s); \
  mc##12_re = ma##12_re - mb##12_re * (s); \
  mc##12_im = ma##12_im - mb##12_im * (s); \
  mc##20_re = ma##20_re - mb##20_re * (s); \
  mc##20_im = ma##20_im - mb##20_im * (s); \
  mc##21_re = ma##21_re - mb##21_re * (s); \
  mc##21_im = ma##21_im - mb##21_im * (s); \
  mc##22_re = ma##22_re - mb##22_re * (s); \
  mc##22_im = ma##22_im - mb##22_im * (s);
498 
499 
/*
 * Element names for the compressed anti-Hermitian matrix "ah", mapped onto
 * the x / y members of five objects AH0..AH4 that must be in scope:
 *   AH0, AH1, AH2 : off-diagonal elements (0,1), (0,2), (1,2) as (re, im)
 *   AH3, AH4.x    : imaginary parts of the three diagonal elements
 *   AH4.y         : spare slot ("ahspace")
 */
#define ah01_re   AH0.x
#define ah01_im   AH0.y

#define ah02_re   AH1.x
#define ah02_im   AH1.y

#define ah12_re   AH2.x
#define ah12_im   AH2.y

#define ah00_im   AH3.x
#define ah11_im   AH3.y

#define ah22_im   AH4.x
#define ahspace   AH4.y
510 
/*
 * UNCOMPRESS_ANTI_HERMITIAN(ah, m): expand the compressed anti-Hermitian
 * matrix "ah" into the full 3x3 matrix "m".
 *   - diagonal elements get real part 0 and imaginary part ah##ii_im;
 *   - the upper-triangle elements (0,1), (0,2), (1,2) are copied from ah;
 *   - each lower-triangle element (j,i) gets the negated real part and the
 *     same imaginary part as its mirror (i,j).
 * The expansion is a plain statement sequence (not wrapped in do/while),
 * so brace it when used under an if.
 */
#define UNCOMPRESS_ANTI_HERMITIAN(ah, m) \
  m##00_re = 0;           m##00_im = ah##00_im; \
  m##11_re = 0;           m##11_im = ah##11_im; \
  m##22_re = 0;           m##22_im = ah##22_im; \
  m##01_re = ah##01_re;   m##01_im = ah##01_im; \
  m##10_re = -ah##01_re;  m##10_im = ah##01_im; \
  m##02_re = ah##02_re;   m##02_im = ah##02_im; \
  m##20_re = -ah##02_re;  m##20_im = ah##02_im; \
  m##12_re = ah##12_re;   m##12_im = ah##12_im; \
  m##21_re = -ah##12_re;  m##21_im = ah##12_im;
530 
531 
/*
 * MAKE_ANTI_HERMITIAN(m, ah): build the compressed anti-Hermitian matrix
 * "ah" from the full 3x3 matrix "m".
 *   - diagonal: the imaginary parts of m's diagonal, with one third of
 *     their sum subtracted from each (so the three stored values sum to
 *     zero up to rounding);
 *   - off-diagonal (i,j), i < j:
 *       re = (m(i,j)_re - m(j,i)_re) / 2,  im = (m(i,j)_im + m(j,i)_im) / 2;
 *   - the spare slot ah##space is cleared.
 *
 * The temporary takes the type of ah##space. __typeof__ is used instead of
 * the bare GNU keyword typeof so the macro also compiles in strict ISO
 * language modes, and the temporary has a macro-specific name to keep it
 * from shadowing identifiers used in the arguments.
 */
#define MAKE_ANTI_HERMITIAN(m, ah) do { \
    __typeof__(ah##space) make_ah_third_trace_; \
    make_ah_third_trace_ = (m##00_im + m##11_im + m##22_im)*0.33333333333333333; \
    ah##00_im = (m##00_im - make_ah_third_trace_); \
    ah##11_im = (m##11_im - make_ah_third_trace_); \
    ah##22_im = (m##22_im - make_ah_third_trace_); \
    ah##01_re = (m##01_re - m##10_re)*0.5; \
    ah##02_re = (m##02_re - m##20_re)*0.5; \
    ah##12_re = (m##12_re - m##21_re)*0.5; \
    ah##01_im = (m##01_im + m##10_im)*0.5; \
    ah##02_im = (m##02_im + m##20_im)*0.5; \
    ah##12_im = (m##12_im + m##21_im)*0.5; \
    ah##space = 0; \
  } while (0)
546 
547 
/*
 * LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, stride)
 * Reads five stride-separated elements of src into var0..var4.
 * Element k comes from src[idx + dir*stride*5 + k*stride].
 * Arguments are parenthesized so expression arguments index correctly;
 * stride is evaluated several times, so avoid side effects. The start
 * offset is held in a local int named start_pos, so do not pass an
 * argument expression that itself refers to a variable of that name.
 */
#define LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, stride) do { \
    int start_pos = (idx) + (dir)*(stride)*5; \
    var##0 = (src)[start_pos]; \
    var##1 = (src)[start_pos + (stride)]; \
    var##2 = (src)[start_pos + (stride)*2]; \
    var##3 = (src)[start_pos + (stride)*3]; \
    var##4 = (src)[start_pos + (stride)*4]; \
  } while (0)
556 
/*
 * LOAD_ANTI_HERMITIAN_SINGLE_TEX(src, dir, idx, var)
 * Reads five elements into var0..var4 with tex1Dfetch(src, index), where
 *   index = idx + dir*Vh*5 + k*Vh,  k = 0..4.
 * Unlike LOAD_ANTI_HERMITIAN_DIRECT the stride is not a parameter: the
 * identifier Vh must be visible where the macro is expanded.
 * idx and dir are parenthesized so expression arguments index correctly.
 * The start offset is held in a local int named start_pos, so do not pass
 * an argument expression that itself refers to a variable of that name.
 */
#define LOAD_ANTI_HERMITIAN_SINGLE_TEX(src, dir, idx, var) do { \
    int start_pos = (idx) + (dir)*Vh*5; \
    var##0 = tex1Dfetch(src, start_pos); \
    var##1 = tex1Dfetch(src, start_pos + Vh); \
    var##2 = tex1Dfetch(src, start_pos + Vh*2); \
    var##3 = tex1Dfetch(src, start_pos + Vh*3); \
    var##4 = tex1Dfetch(src, start_pos + Vh*4); \
  } while (0)
565 
/*
 * WRITE_ANTI_HERMITIAN(mem, dir, idx, var, stride)
 * Stores var0..var4 into five stride-separated elements of mem.
 * Value k goes to mem[idx + dir*stride*5 + k*stride], the same layout that
 * LOAD_ANTI_HERMITIAN_DIRECT reads.
 * Arguments are parenthesized so expression arguments index correctly;
 * stride is evaluated several times, so avoid side effects. The start
 * offset is held in a local int named start_ps, so do not pass an
 * argument expression that itself refers to a variable of that name.
 */
#define WRITE_ANTI_HERMITIAN(mem, dir, idx, var, stride) do { \
    int start_ps = (idx) + (dir)*(stride)*5; \
    (mem)[start_ps] = var##0; \
    (mem)[start_ps + (stride)] = var##1; \
    (mem)[start_ps + (stride)*2] = var##2; \
    (mem)[start_ps + (stride)*3] = var##3; \
    (mem)[start_ps + (stride)*4] = var##4; \
  } while (0)
574 
/*
 * COPY_SU3_MATRIX(a, b): copy matrix "a" into matrix "b" (note the order:
 * source first, destination second), one real/imaginary component at a
 * time for all nine elements.
 * The expansion is a plain statement sequence (not wrapped in do/while),
 * so brace it when used under an if.
 */
#define COPY_SU3_MATRIX(a, b) \
  b##00_re = a##00_re;  b##00_im = a##00_im; \
  b##01_re = a##01_re;  b##01_im = a##01_im; \
  b##02_re = a##02_re;  b##02_im = a##02_im; \
  b##10_re = a##10_re;  b##10_im = a##10_im; \
  b##11_re = a##11_re;  b##11_im = a##11_im; \
  b##12_re = a##12_re;  b##12_im = a##12_im; \
  b##20_re = a##20_re;  b##20_im = a##20_im; \
  b##21_re = a##21_re;  b##21_im = a##21_im; \
  b##22_re = a##22_re;  b##22_im = a##22_im;
594 
/*
 * SU3_ADJOINT(a, b): store the adjoint (conjugate transpose) of matrix "a"
 * in matrix "b": b(i,j) = conj(a(j,i)).
 * a and b must be different matrices; with a == b the off-diagonal
 * elements would be overwritten before their mirror is read.
 * The expansion is a plain statement sequence (not wrapped in do/while),
 * so brace it when used under an if.
 */
#define SU3_ADJOINT(a, b) \
  b##00_re = a##00_re;  b##00_im = - a##00_im; \
  b##01_re = a##10_re;  b##01_im = - a##10_im; \
  b##02_re = a##20_re;  b##02_im = - a##20_im; \
  b##10_re = a##01_re;  b##10_im = - a##01_im; \
  b##11_re = a##11_re;  b##11_im = - a##11_im; \
  b##12_re = a##21_re;  b##12_im = - a##21_im; \
  b##20_re = a##02_re;  b##20_im = - a##02_im; \
  b##21_re = a##12_re;  b##21_im = - a##12_im; \
  b##22_re = a##22_re;  b##22_im = - a##22_im;
614 
/*
 * SET_UNIT_SU3_MATRIX(a): set matrix "a" to the 3x3 identity: real parts
 * of the diagonal become 1.0, every other component becomes 0.
 * The expansion is a plain statement sequence (not wrapped in do/while),
 * so brace it when used under an if.
 */
#define SET_UNIT_SU3_MATRIX(a) \
  a##00_re = 1.0;  a##00_im = 0; \
  a##01_re = 0;    a##01_im = 0; \
  a##02_re = 0;    a##02_im = 0; \
  a##10_re = 0;    a##10_im = 0; \
  a##11_re = 1.0;  a##11_im = 0; \
  a##12_re = 0;    a##12_im = 0; \
  a##20_re = 0;    a##20_im = 0; \
  a##21_re = 0;    a##21_im = 0; \
  a##22_re = 1.0;  a##22_im = 0;
634 
// Assigns (does not accumulate) the conjugated product: a = conj(b) * conj(c),
// i.e.  a_re = b_re*c_re - b_im*c_im,  a_im = -(b_re*c_im + b_im*c_re).
// b and c may carry a leading sign (e.g. +x, -x); the _re/_im suffix is
// pasted onto the final token of the argument. The four statements are
// wrapped in do { } while (0) so the macro behaves as a single statement
// (safe under an unbraced if/else); callers still supply the trailing ';'.
#define ACC_CONJ_PROD_ASSIGN(a, b, c) do { \
    a##_re = b##_re * c##_re; \
    a##_re -= b##_im * c##_im; \
    a##_im = - b##_re * c##_im; \
    a##_im -= b##_im * c##_re; \
  } while (0)
641 
642 
/*
 * RECONSTRUCT_LINK_12(sign, var): fill in the third row (elements 20, 21,
 * 22) of matrix "var" from its first two rows, then scale that row by sign.
 * Each third-row element is built from two conjugated products of first-
 * and second-row elements: the first via ACC_CONJ_PROD_ASSIGN, the second
 * via ACC_CONJ_PROD (which is not defined in this header and must be
 * provided by the including code; presumably the accumulating counterpart
 * of ACC_CONJ_PROD_ASSIGN - confirm against its definition).
 * The expansion is a plain statement sequence (not wrapped in do/while),
 * so brace it when used under an if.
 */
#define RECONSTRUCT_LINK_12(sign, var) \
  ACC_CONJ_PROD_ASSIGN(var##20, +var##01, +var##12); \
  ACC_CONJ_PROD(var##20, -var##02, +var##11); \
  ACC_CONJ_PROD_ASSIGN(var##21, +var##02, +var##10); \
  ACC_CONJ_PROD(var##21, -var##00, +var##12); \
  ACC_CONJ_PROD_ASSIGN(var##22, +var##00, +var##11); \
  ACC_CONJ_PROD(var##22, -var##01, +var##10); \
  var##20_re *=sign; \
  var##20_im *=sign; \
  var##21_re *=sign; \
  var##21_im *=sign; \
  var##22_re *=sign; \
  var##22_im *=sign;
652 
/*
 * COMPUTE_NEW_IDX_PLUS(mydir, idx): set new_mem_idx to half (>> 1) of the
 * index obtained by stepping idx one site forward in direction mydir
 * (0..3), wrapping around when the coordinate is on the upper boundary.
 *
 * Uses identifiers from the expansion site: new_mem_idx (written), the
 * coordinates x1..x4, the boundary values X1m1..X4m1, the forward steps
 * 1, X1, X2X1, X3X2X1 and the wrap-around offsets X1m1, X2X1mX1,
 * X3X2X1mX2X1, X4X3X2X1mX3X2X1. For any other mydir new_mem_idx is left
 * untouched. idx is parenthesized so expression arguments are stepped as a
 * whole.
 */
#define COMPUTE_NEW_IDX_PLUS(mydir, idx) do { \
    switch (mydir) { \
    case 0: \
      new_mem_idx = ((x1 == X1m1) ? (idx) - X1m1 : (idx) + 1) >> 1; \
      break; \
    case 1: \
      new_mem_idx = ((x2 == X2m1) ? (idx) - X2X1mX1 : (idx) + X1) >> 1; \
      break; \
    case 2: \
      new_mem_idx = ((x3 == X3m1) ? (idx) - X3X2X1mX2X1 : (idx) + X2X1) >> 1; \
      break; \
    case 3: \
      new_mem_idx = ((x4 == X4m1) ? (idx) - X4X3X2X1mX3X2X1 : (idx) + X3X2X1) >> 1; \
      break; \
    default: \
      break; \
    } \
  } while (0)
669 
/*
 * COMPUTE_NEW_IDX_MINUS(mydir, idx): set new_mem_idx to the index obtained
 * by stepping idx one site backward in direction mydir (0..3), wrapping
 * around when the coordinate is 0.
 *
 * The interior (non-wrapping) branch now steps from the idx argument; it
 * previously stepped from an identifier X taken from the expansion site,
 * ignoring idx, which gave a different result whenever idx was not X.
 *
 * Uses identifiers from the expansion site: new_mem_idx (written), the
 * coordinates x1..x4, the backward steps 1, X1, X2X1, X3X2X1 and the
 * wrap-around offsets X1m1, X2X1mX1, X3X2X1mX2X1, X4X3X2X1mX3X2X1. For any
 * other mydir new_mem_idx is left untouched.
 *
 * NOTE(review): unlike COMPUTE_NEW_IDX_PLUS this macro does not halve the
 * result (no ">> 1"); that is kept as in the original - confirm against
 * the callers whether the halving is intended to happen here.
 */
#define COMPUTE_NEW_IDX_MINUS(mydir, idx) do { \
    switch (mydir) { \
    case 0: \
      new_mem_idx = ((x1 == 0) ? (idx) + X1m1 : (idx) - 1); \
      break; \
    case 1: \
      new_mem_idx = ((x2 == 0) ? (idx) + X2X1mX1 : (idx) - X1); \
      break; \
    case 2: \
      new_mem_idx = ((x3 == 0) ? (idx) + X3X2X1mX2X1 : (idx) - X2X1); \
      break; \
    case 3: \
      new_mem_idx = ((x4 == 0) ? (idx) + X4X3X2X1mX3X2X1 : (idx) - X3X2X1); \
      break; \
    default: \
      break; \
    } \
  } while (0)
686 
687 
/*
 * COMPUTE_NEW_FULL_IDX_PLUS(mydir, idx): set new_mem_idx to the index
 * obtained by stepping idx one site forward in direction mydir (0..3),
 * wrapping around when the coordinate is on the upper boundary. Same as
 * COMPUTE_NEW_IDX_PLUS but without the final ">> 1".
 *
 * Uses identifiers from the expansion site: new_mem_idx (written), the
 * coordinates x1..x4, the boundary values X1m1..X4m1, the forward steps
 * 1, X1, X2X1, X3X2X1 and the wrap-around offsets X1m1, X2X1mX1,
 * X3X2X1mX2X1, X4X3X2X1mX3X2X1. For any other mydir new_mem_idx is left
 * untouched. idx is parenthesized so expression arguments are stepped as a
 * whole.
 */
#define COMPUTE_NEW_FULL_IDX_PLUS(mydir, idx) do { \
    switch (mydir) { \
    case 0: \
      new_mem_idx = ((x1 == X1m1) ? (idx) - X1m1 : (idx) + 1); \
      break; \
    case 1: \
      new_mem_idx = ((x2 == X2m1) ? (idx) - X2X1mX1 : (idx) + X1); \
      break; \
    case 2: \
      new_mem_idx = ((x3 == X3m1) ? (idx) - X3X2X1mX2X1 : (idx) + X2X1); \
      break; \
    case 3: \
      new_mem_idx = ((x4 == X4m1) ? (idx) - X4X3X2X1mX3X2X1 : (idx) + X3X2X1); \
      break; \
    default: \
      break; \
    } \
  } while (0)
704 
/*
 * COMPUTE_NEW_FULL_IDX_MINUS(mydir, idx): set new_mem_idx to the index
 * obtained by stepping idx one site backward in direction mydir (0..3),
 * wrapping around when the coordinate is 0.
 *
 * The interior (non-wrapping) branch now steps from the idx argument; it
 * previously stepped from an identifier X taken from the expansion site,
 * ignoring idx, which gave a different result whenever idx was not X.
 *
 * Uses identifiers from the expansion site: new_mem_idx (written), the
 * coordinates x1..x4, the backward steps 1, X1, X2X1, X3X2X1 and the
 * wrap-around offsets X1m1, X2X1mX1, X3X2X1mX2X1, X4X3X2X1mX3X2X1. For any
 * other mydir new_mem_idx is left untouched.
 */
#define COMPUTE_NEW_FULL_IDX_MINUS(mydir, idx) do { \
    switch (mydir) { \
    case 0: \
      new_mem_idx = ((x1 == 0) ? (idx) + X1m1 : (idx) - 1); \
      break; \
    case 1: \
      new_mem_idx = ((x2 == 0) ? (idx) + X2X1mX1 : (idx) - X1); \
      break; \
    case 2: \
      new_mem_idx = ((x3 == 0) ? (idx) + X3X2X1mX2X1 : (idx) - X2X1); \
      break; \
    case 3: \
      new_mem_idx = ((x4 == 0) ? (idx) + X4X3X2X1mX3X2X1 : (idx) - X3X2X1); \
      break; \
    default: \
      break; \
    } \
  } while (0)
721 
722 
723 #endif // _FORCE_COMMON_H