QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
read_gauge.h
Go to the documentation of this file.
1 //#include <half_quda.h>
2 
3 // Performs complex addition
4 #define COMPLEX_ADD_TO(a, b) \
5  a##_re += b##_re, \
6  a##_im += b##_im
7 
8 #define COMPLEX_PRODUCT(a, b, c) \
9  a##_re = b##_re*c##_re; \
10  a##_re -= b##_im*c##_im; \
11  a##_im = b##_re*c##_im; \
12  a##_im += b##_im*c##_re
13 
14 #define COMPLEX_CONJUGATE_PRODUCT(a, b, c) \
15  a##_re = b##_re*c##_re; \
16  a##_re -= b##_im*c##_im; \
17  a##_im = -b##_re*c##_im; \
18  a##_im -= b##_im*c##_re
19 
20 // Performs a complex dot product
21 #define COMPLEX_DOT_PRODUCT(a, b, c) \
22  a##_re = b##_re*c##_re; \
23  a##_re += b##_im*c##_im; \
24  a##_im = b##_re*c##_im; \
25  a##_im -= b##_im*c##_re
26 
27 // Performs a complex norm
28 #define COMPLEX_NORM(a, b) \
29  a = b##_re*b##_re; \
30  a += b##_im*b##_im
31 
32 #define ACC_COMPLEX_PROD(a, b, c) \
33  a##_re += b##_re*c##_re; \
34  a##_re -= b##_im*c##_im; \
35  a##_im += b##_re*c##_im; \
36  a##_im += b##_im*c##_re
37 
38 // Performs the complex conjugated accumulation: a += b* c*
39 #define ACC_CONJ_PROD(a, b, c) \
40  a##_re += b##_re * c##_re; \
41  a##_re -= b##_im * c##_im; \
42  a##_im -= b##_re * c##_im; \
43  a##_im -= b##_im * c##_re
44 
45 #define READ_GAUGE_MATRIX_18_FLOAT2_TEX(G, gauge, dir, idx, stride) \
46  float2 G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
47  float2 G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
48  float2 G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
49  float2 G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
50  float2 G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
51  float2 G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
52  float2 G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
53  float2 G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
54  float2 G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
55 
56 #define READ_GAUGE_MATRIX_18_SHORT2_TEX(G, gauge, dir, idx, stride) \
57  float2 G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
58  float2 G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
59  float2 G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
60  float2 G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
61  float2 G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
62  float2 G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
63  float2 G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
64  float2 G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
65  float2 G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
66 
67 #define READ_GAUGE_MATRIX_12_FLOAT4_TEX(G, gauge, dir, idx, stride) \
68  float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
69  float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
70  float4 G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
71  float4 G##3 = make_float4(0,0,0,0); \
72  float4 G##4 = make_float4(0,0,0,0);
73 
74 #define READ_GAUGE_MATRIX_12_SHORT4_TEX(G, gauge, dir, idx, stride) \
75  float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
76  float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
77  float4 G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
78  float4 G##3 = make_float4(0,0,0,0); \
79  float4 G##4 = make_float4(0,0,0,0);
80 
81 // set A to be last components of G4 (otherwise unused)
82 #define READ_GAUGE_MATRIX_8_FLOAT4_TEX(G, gauge, dir, idx, stride) \
83  float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
84  float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
85  float4 G##2 = make_float4(0,0,0,0); \
86  float4 G##3 = make_float4(0,0,0,0); \
87  float4 G##4 = make_float4(0,0,0,0); \
88  (G##3).z = (G##0).x; \
89  (G##3).w = (G##0).y;
90 
91 #define READ_GAUGE_MATRIX_8_SHORT4_TEX(G, gauge, dir, idx, stride) \
92  float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
93  float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
94  float4 G##2 = make_float4(0,0,0,0); \
95  float4 G##3 = make_float4(0,0,0,0); \
96  float4 G##4 = make_float4(0,0,0,0); \
97  (G##3).z = (G##0).x = pi_f*(G##0).x; \
98  (G##3).w = (G##0).y = pi_f*(G##0).y;
99 
100 #define READ_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride) \
101  double2 G##0 = gauge[idx + ((dir/2)*9+0)*stride]; \
102  double2 G##1 = gauge[idx + ((dir/2)*9+1)*stride]; \
103  double2 G##2 = gauge[idx + ((dir/2)*9+2)*stride]; \
104  double2 G##3 = gauge[idx + ((dir/2)*9+3)*stride]; \
105  double2 G##4 = gauge[idx + ((dir/2)*9+4)*stride]; \
106  double2 G##5 = gauge[idx + ((dir/2)*9+5)*stride]; \
107  double2 G##6 = gauge[idx + ((dir/2)*9+6)*stride]; \
108  double2 G##7 = gauge[idx + ((dir/2)*9+7)*stride]; \
109  double2 G##8 = gauge[idx + ((dir/2)*9+8)*stride]; \
110 
111 #define READ_GAUGE_MATRIX_18_FLOAT2(G, gauge, dir, idx, stride) \
112  float2 G##0 = ((float2*)gauge)[idx + ((dir/2)*9+0)*stride]; \
113  float2 G##1 = ((float2*)gauge)[idx + ((dir/2)*9+1)*stride]; \
114  float2 G##2 = ((float2*)gauge)[idx + ((dir/2)*9+2)*stride]; \
115  float2 G##3 = ((float2*)gauge)[idx + ((dir/2)*9+3)*stride]; \
116  float2 G##4 = ((float2*)gauge)[idx + ((dir/2)*9+4)*stride]; \
117  float2 G##5 = ((float2*)gauge)[idx + ((dir/2)*9+5)*stride]; \
118  float2 G##6 = ((float2*)gauge)[idx + ((dir/2)*9+6)*stride]; \
119  float2 G##7 = ((float2*)gauge)[idx + ((dir/2)*9+7)*stride]; \
120  float2 G##8 = ((float2*)gauge)[idx + ((dir/2)*9+8)*stride]; \
121 
122 #define READ_GAUGE_MATRIX_18_SHORT2(G, gauge, dir, idx, stride) \
123  float2 G##0 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+0)*stride]); \
124  float2 G##1 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+1)*stride]); \
125  float2 G##2 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+2)*stride]); \
126  float2 G##3 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+3)*stride]); \
127  float2 G##4 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+4)*stride]); \
128  float2 G##5 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+5)*stride]); \
129  float2 G##6 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+6)*stride]); \
130  float2 G##7 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+7)*stride]); \
131  float2 G##8 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+8)*stride]); \
132 
133 #define READ_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride) \
134  double2 G##0 = gauge[idx + ((dir/2)*6+0)*stride]; \
135  double2 G##1 = gauge[idx + ((dir/2)*6+1)*stride]; \
136  double2 G##2 = gauge[idx + ((dir/2)*6+2)*stride]; \
137  double2 G##3 = gauge[idx + ((dir/2)*6+3)*stride]; \
138  double2 G##4 = gauge[idx + ((dir/2)*6+4)*stride]; \
139  double2 G##5 = gauge[idx + ((dir/2)*6+5)*stride]; \
140  double2 G##6 = make_double2(0,0); \
141  double2 G##7 = make_double2(0,0); \
142  double2 G##8 = make_double2(0,0);
143 
144 #define READ_GAUGE_MATRIX_12_FLOAT4(G, gauge, dir, idx, stride) \
145  float4 G##0 = gauge[idx + ((dir/2)*3+0)*stride]; \
146  float4 G##1 = gauge[idx + ((dir/2)*3+1)*stride]; \
147  float4 G##2 = gauge[idx + ((dir/2)*3+2)*stride]; \
148  float4 G##3 = make_float4(0,0,0,0); \
149  float4 G##4 = make_float4(0,0,0,0);
150 
151 #define READ_GAUGE_MATRIX_12_SHORT4(G, gauge, dir, idx, stride) \
152  float4 G##0 = short42float4(gauge[idx + ((dir/2)*3+0)*stride]); \
153  float4 G##1 = short42float4(gauge[idx + ((dir/2)*3+1)*stride]); \
154  float4 G##2 = short42float4(gauge[idx + ((dir/2)*3+2)*stride]); \
155  float4 G##3 = make_float4(0,0,0,0); \
156  float4 G##4 = make_float4(0,0,0,0);
157 
158 // set A to be last components of G4 (otherwise unused)
159 #define READ_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride) \
160  double2 G##0 = gauge[idx + ((dir/2)*4+0)*stride]; \
161  double2 G##1 = gauge[idx + ((dir/2)*4+1)*stride]; \
162  double2 G##2 = gauge[idx + ((dir/2)*4+2)*stride]; \
163  double2 G##3 = gauge[idx + ((dir/2)*4+3)*stride]; \
164  double2 G##4 = make_double2(0,0); \
165  double2 G##5 = make_double2(0,0); \
166  double2 G##6 = make_double2(0,0); \
167  double2 G##7 = make_double2(0,0); \
168  double2 G##8 = make_double2(0,0); \
169  (G##7).x = (G##0).x; \
170  (G##7).y = (G##0).y;
171 
172 // set A to be last components of G4 (otherwise unused)
173 #define READ_GAUGE_MATRIX_8_FLOAT4(G, gauge, dir, idx, stride) \
174  float4 G##0 = gauge[idx + ((dir/2)*2+0)*stride]; \
175  float4 G##1 = gauge[idx + ((dir/2)*2+1)*stride]; \
176  float4 G##2 = make_float4(0,0,0,0); \
177  float4 G##3 = make_float4(0,0,0,0); \
178  float4 G##4 = make_float4(0,0,0,0); \
179  (G##3).z = (G##0).x; \
180  (G##3).w = (G##0).y;
181 
182 #define READ_GAUGE_MATRIX_8_SHORT4(G, gauge, dir, idx, stride) \
183  float4 G##0 = short42float4(gauge[idx + ((dir/2)*2+0)*stride]); \
184  float4 G##1 = short42float4(gauge[idx + ((dir/2)*2+1)*stride]); \
185  float4 G##2 = make_float4(0,0,0,0); \
186  float4 G##3 = make_float4(0,0,0,0); \
187  float4 G##4 = make_float4(0,0,0,0); \
188  (G##3).z = (G##0).x = pi_f*(G##0).x; \
189  (G##3).w = (G##0).y = pi_f*(G##0).y;
190 
193 #define ASSN_GAUGE_MATRIX_18_FLOAT2_TEX(G, gauge, dir, idx, stride) \
194  G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
195  G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
196  G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
197  G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
198  G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
199  G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
200  G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
201  G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
202  G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
203 
204 #define ASSN_GAUGE_MATRIX_18_SHORT2_TEX(G, gauge, dir, idx, stride) \
205  G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
206  G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
207  G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
208  G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
209  G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
210  G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
211  G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
212  G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
213  G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
214 
215 #define ASSN_GAUGE_MATRIX_12_FLOAT4_TEX(G, gauge, dir, idx, stride) \
216  G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
217  G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
218  G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
219  G##3 = make_float4(0,0,0,0); \
220  G##4 = make_float4(0,0,0,0);
221 
222 #define ASSN_GAUGE_MATRIX_12_SHORT4_TEX(G, gauge, dir, idx, stride) \
223  G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
224  G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
225  G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
226  G##3 = make_float4(0,0,0,0); \
227  G##4 = make_float4(0,0,0,0);
228 
229 // set A to be last components of G4 (otherwise unused)
230 #define ASSN_GAUGE_MATRIX_8_FLOAT4_TEX(G, gauge, dir, idx, stride) \
231  G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
232  G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
233  G##2 = make_float4(0,0,0,0); \
234  G##3 = make_float4(0,0,0,0); \
235  G##4 = make_float4(0,0,0,0); \
236  (G##3).z = (G##0).x; \
237  (G##3).w = (G##0).y;
238 
239 #define ASSN_GAUGE_MATRIX_8_SHORT4_TEX(G, gauge, dir, idx, stride) \
240  G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
241  G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
242  G##2 = make_float4(0,0,0,0); \
243  G##3 = make_float4(0,0,0,0); \
244  G##4 = make_float4(0,0,0,0); \
245  (G##3).z = (G##0).x = pi_f*(G##0).x; \
246  (G##3).w = (G##0).y = pi_f*(G##0).y;
247 
248 #define ASSN_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride) \
249  G##0 = gauge[idx + ((dir/2)*9+0)*stride]; \
250  G##1 = gauge[idx + ((dir/2)*9+1)*stride]; \
251  G##2 = gauge[idx + ((dir/2)*9+2)*stride]; \
252  G##3 = gauge[idx + ((dir/2)*9+3)*stride]; \
253  G##4 = gauge[idx + ((dir/2)*9+4)*stride]; \
254  G##5 = gauge[idx + ((dir/2)*9+5)*stride]; \
255  G##6 = gauge[idx + ((dir/2)*9+6)*stride]; \
256  G##7 = gauge[idx + ((dir/2)*9+7)*stride]; \
257  G##8 = gauge[idx + ((dir/2)*9+8)*stride]; \
258 
259 #define ASSN_GAUGE_MATRIX_18_FLOAT2(G, gauge, dir, idx, stride) \
260  G##0 = ((float2*)gauge)[idx + ((dir/2)*9+0)*stride]; \
261  G##1 = ((float2*)gauge)[idx + ((dir/2)*9+1)*stride]; \
262  G##2 = ((float2*)gauge)[idx + ((dir/2)*9+2)*stride]; \
263  G##3 = ((float2*)gauge)[idx + ((dir/2)*9+3)*stride]; \
264  G##4 = ((float2*)gauge)[idx + ((dir/2)*9+4)*stride]; \
265  G##5 = ((float2*)gauge)[idx + ((dir/2)*9+5)*stride]; \
266  G##6 = ((float2*)gauge)[idx + ((dir/2)*9+6)*stride]; \
267  G##7 = ((float2*)gauge)[idx + ((dir/2)*9+7)*stride]; \
268  G##8 = ((float2*)gauge)[idx + ((dir/2)*9+8)*stride]; \
269 
270 #define ASSN_GAUGE_MATRIX_18_SHORT2(G, gauge, dir, idx, stride) \
271  G##0 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+0)*stride]); \
272  G##1 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+1)*stride]); \
273  G##2 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+2)*stride]); \
274  G##3 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+3)*stride]); \
275  G##4 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+4)*stride]); \
276  G##5 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+5)*stride]); \
277  G##6 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+6)*stride]); \
278  G##7 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+7)*stride]); \
279  G##8 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+8)*stride]); \
280 
281 #define ASSN_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride) \
282  G##0 = gauge[idx + ((dir/2)*6+0)*stride]; \
283  G##1 = gauge[idx + ((dir/2)*6+1)*stride]; \
284  G##2 = gauge[idx + ((dir/2)*6+2)*stride]; \
285  G##3 = gauge[idx + ((dir/2)*6+3)*stride]; \
286  G##4 = gauge[idx + ((dir/2)*6+4)*stride]; \
287  G##5 = gauge[idx + ((dir/2)*6+5)*stride]; \
288  G##6 = make_double2(0,0); \
289  G##7 = make_double2(0,0); \
290  G##8 = make_double2(0,0);
291 
292 #define ASSN_GAUGE_MATRIX_12_FLOAT4(G, gauge, dir, idx, stride) \
293  G##0 = gauge[idx + ((dir/2)*3+0)*stride]; \
294  G##1 = gauge[idx + ((dir/2)*3+1)*stride]; \
295  G##2 = gauge[idx + ((dir/2)*3+2)*stride]; \
296  G##3 = make_float4(0,0,0,0); \
297  G##4 = make_float4(0,0,0,0);
298 
299 #define ASSN_GAUGE_MATRIX_12_SHORT4(G, gauge, dir, idx, stride) \
300  G##0 = short42float4(gauge[idx + ((dir/2)*3+0)*stride]); \
301  G##1 = short42float4(gauge[idx + ((dir/2)*3+1)*stride]); \
302  G##2 = short42float4(gauge[idx + ((dir/2)*3+2)*stride]); \
303  G##3 = make_float4(0,0,0,0); \
304  G##4 = make_float4(0,0,0,0);
305 
306 // set A to be last components of G4 (otherwise unused)
307 #define ASSN_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride) \
308  G##0 = gauge[idx + ((dir/2)*4+0)*stride]; \
309  G##1 = gauge[idx + ((dir/2)*4+1)*stride]; \
310  G##2 = gauge[idx + ((dir/2)*4+2)*stride]; \
311  G##3 = gauge[idx + ((dir/2)*4+3)*stride]; \
312  G##4 = make_double2(0,0); \
313  G##5 = make_double2(0,0); \
314  G##6 = make_double2(0,0); \
315  G##7 = make_double2(0,0); \
316  G##8 = make_double2(0,0); \
317  (G##7).x = (G##0).x; \
318  (G##7).y = (G##0).y;
319 
320 // set A to be last components of G4 (otherwise unused)
321 #define ASSN_GAUGE_MATRIX_8_FLOAT4(G, gauge, dir, idx, stride) \
322  G##0 = gauge[idx + ((dir/2)*2+0)*stride]; \
323  G##1 = gauge[idx + ((dir/2)*2+1)*stride]; \
324  G##2 = make_float4(0,0,0,0); \
325  G##3 = make_float4(0,0,0,0); \
326  G##4 = make_float4(0,0,0,0); \
327  (G##3).z = (G##0).x; \
328  (G##3).w = (G##0).y;
329 
330 #define ASSN_GAUGE_MATRIX_8_SHORT4(G, gauge, dir, idx, stride) \
331  G##0 = short42float4(gauge[idx + ((dir/2)*2+0)*stride]); \
332  G##1 = short42float4(gauge[idx + ((dir/2)*2+1)*stride]); \
333  G##2 = make_float4(0,0,0,0); \
334  G##3 = make_float4(0,0,0,0); \
335  G##4 = make_float4(0,0,0,0); \
336  (G##3).z = (G##0).x = pi_f*(G##0).x; \
337  (G##3).w = (G##0).y = pi_f*(G##0).y;
338 
339 
340 /*----END--*/
341 
342 #define RESCALE2(G, max) \
343  (G##0).x *= max; (G##0).y *= max; (G##1).x *= max; (G##1).y *= max; \
344  (G##2).x *= max; (G##2).y *= max; (G##3).x *= max; (G##3).y *= max; \
345  (G##4).x *= max; (G##4).y *= max; (G##5).x *= max; (G##5).y *= max; \
346  (G##6).x *= max; (G##6).y *= max; (G##7).x *= max; (G##7).y *= max; \
347  (G##8).x *= max; (G##8).y *= max;
348 
349 // FIXME: merge staggered and Wilson reconstruct macros
350 
351 #define RECONSTRUCT_MATRIX_18_DOUBLE(dir)
352 #define RECONSTRUCT_MATRIX_18_SINGLE(dir)
353 
354 #ifndef MULTI_GPU
355 #define do_boundary ga_idx >= X4X3X2X1hmX3X2X1h
356 #else
357 #define do_boundary ( (Pt0 && (ga_idx >= Vh)) || ( PtNm1 && (ga_idx >= X4X3X2X1hmX3X2X1h) && (ga_idx < Vh) ) )
358 #endif
359 
360 #define RECONSTRUCT_MATRIX_12_DOUBLE(dir) \
361  ACC_CONJ_PROD(g20, +g01, +g12); \
362  ACC_CONJ_PROD(g20, -g02, +g11); \
363  ACC_CONJ_PROD(g21, +g02, +g10); \
364  ACC_CONJ_PROD(g21, -g00, +g12); \
365  ACC_CONJ_PROD(g22, +g00, +g11); \
366  ACC_CONJ_PROD(g22, -g01, +g10); \
367  double u0 = (dir < 6 ? anisotropy : (do_boundary ? t_boundary : 1)); \
368  G6.x*=u0; G6.y*=u0; G7.x*=u0; G7.y*=u0; G8.x*=u0; G8.y*=u0;
369 
370 #define RECONSTRUCT_MATRIX_12_SINGLE(dir) \
371  ACC_CONJ_PROD(g20, +g01, +g12); \
372  ACC_CONJ_PROD(g20, -g02, +g11); \
373  ACC_CONJ_PROD(g21, +g02, +g10); \
374  ACC_CONJ_PROD(g21, -g00, +g12); \
375  ACC_CONJ_PROD(g22, +g00, +g11); \
376  ACC_CONJ_PROD(g22, -g01, +g10); \
377  float u0 = (dir < 6 ? anisotropy_f : (do_boundary ? t_boundary_f : 1)); \
378  G3.x*=u0; G3.y*=u0; G3.z*=u0; G3.w*=u0; G4.x*=u0; G4.y*=u0;
379 
380 #define RECONSTRUCT_MATRIX_8_DOUBLE(dir) \
381  double row_sum = g01_re*g01_re; \
382  row_sum += g01_im*g01_im; \
383  row_sum += g02_re*g02_re; \
384  row_sum += g02_im*g02_im; \
385  double u0 = (dir < 6 ? anisotropy : (do_boundary ? t_boundary : 1)); \
386  double u02_inv = 1.0 / (u0*u0); \
387  double column_sum = u02_inv - row_sum; \
388  double U00_mag = sqrt((column_sum > 0 ? column_sum : 0)); \
389  sincos(g21_re, &g00_im, &g00_re); \
390  g00_re *= U00_mag; \
391  g00_im *= U00_mag; \
392  column_sum += g10_re*g10_re; \
393  column_sum += g10_im*g10_im; \
394  sincos(g21_im, &g20_im, &g20_re); \
395  double U20_mag = sqrt(((u02_inv - column_sum) > 0 ? (u02_inv-column_sum) : 0)); \
396  g20_re *= U20_mag; \
397  g20_im *= U20_mag; \
398  double r_inv2 = 1.0 / (u0*row_sum); \
399  double A_re, A_im; \
400  COMPLEX_DOT_PRODUCT(A, g00, g10); \
401  A_re *= u0; A_im *= u0; \
402  COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \
403  ACC_COMPLEX_PROD(g11, A, g01); \
404  g11_re *= -r_inv2; \
405  g11_im *= -r_inv2; \
406  COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \
407  ACC_COMPLEX_PROD(g12, -A, g02); \
408  g12_re *= r_inv2; \
409  g12_im *= r_inv2; \
410  COMPLEX_DOT_PRODUCT(A, g00, g20); \
411  A_re *= u0; A_im *= u0; \
412  COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \
413  ACC_COMPLEX_PROD(g21, -A, g01); \
414  g21_re *= r_inv2; \
415  g21_im *= r_inv2; \
416  COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \
417  ACC_COMPLEX_PROD(g22, A, g02); \
418  g22_re *= -r_inv2; \
419  g22_im *= -r_inv2;
420 
421  #define RECONSTRUCT_MATRIX_8_SINGLE(dir) \
422  float row_sum = g01_re*g01_re; \
423  row_sum += g01_im*g01_im; \
424  row_sum += g02_re*g02_re; \
425  row_sum += g02_im*g02_im; \
426  __sincosf(g21_re, &g00_im, &g00_re); \
427  __sincosf(g21_im, &g20_im, &g20_re); \
428  float2 u0_2 = (dir < 6 ? An2 : (do_boundary ? TB2 : No2)); \
429  float column_sum = u0_2.y - row_sum; \
430  float U00_mag = column_sum * rsqrtf((column_sum > 0 ? column_sum : 1e14)); \
431  g00_re *= U00_mag; \
432  g00_im *= U00_mag; \
433  column_sum += g10_re*g10_re; \
434  column_sum += g10_im*g10_im; \
435  column_sum = u0_2.y - column_sum; \
436  float U20_mag = column_sum * rsqrtf((column_sum > 0 ? column_sum : 1e14)); \
437  g20_re *= U20_mag; \
438  g20_im *= U20_mag; \
439  float r_inv2 = __fdividef(1.0f, u0_2.x*row_sum); \
440  float A_re, A_im; \
441  COMPLEX_DOT_PRODUCT(A, g00, g10); \
442  A_re *= u0_2.x; A_im *= u0_2.x; \
443  COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \
444  ACC_COMPLEX_PROD(g11, A, g01); \
445  g11_re *= -r_inv2; \
446  g11_im *= -r_inv2; \
447  COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \
448  ACC_COMPLEX_PROD(g12, -A, g02); \
449  g12_re *= r_inv2; \
450  g12_im *= r_inv2; \
451  COMPLEX_DOT_PRODUCT(A, g00, g20); \
452  A_re *= u0_2.x; A_im *= u0_2.x; \
453  COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \
454  ACC_COMPLEX_PROD(g21, -A, g01); \
455  g21_re *= r_inv2; \
456  g21_im *= r_inv2; \
457  COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \
458  ACC_COMPLEX_PROD(g22, A, g02); \
459  g22_re *= -r_inv2; \
460  g22_im *= -r_inv2;
461 
462 
463 
464 /************* the following is added for staggered *********/
465 
466 #define RECONSTRUCT_GAUGE_MATRIX_12_SINGLE(dir, gauge, idx, sign) \
467  ACC_CONJ_PROD(gauge##20, +gauge##01, +gauge##12); \
468  ACC_CONJ_PROD(gauge##20, -gauge##02, +gauge##11); \
469  ACC_CONJ_PROD(gauge##21, +gauge##02, +gauge##10); \
470  ACC_CONJ_PROD(gauge##21, -gauge##00, +gauge##12); \
471  ACC_CONJ_PROD(gauge##22, +gauge##00, +gauge##11); \
472  ACC_CONJ_PROD(gauge##22, -gauge##01, +gauge##10); \
473  {float u0 = coeff_f*sign; \
474  gauge##20_re *=u0;gauge##20_im *=u0; gauge##21_re *=u0; gauge##21_im *=u0; \
475  gauge##22_re *=u0;gauge##22_im *=u0;}
476 
477 #define RECONSTRUCT_GAUGE_MATRIX_12_DOUBLE(dir, gauge, idx, sign) \
478  ACC_CONJ_PROD(gauge##20, +gauge##01, +gauge##12); \
479  ACC_CONJ_PROD(gauge##20, -gauge##02, +gauge##11); \
480  ACC_CONJ_PROD(gauge##21, +gauge##02, +gauge##10); \
481  ACC_CONJ_PROD(gauge##21, -gauge##00, +gauge##12); \
482  ACC_CONJ_PROD(gauge##22, +gauge##00, +gauge##11); \
483  ACC_CONJ_PROD(gauge##22, -gauge##01, +gauge##10); \
484  {double u0 = coeff* sign; \
485  gauge##20_re *=u0;gauge##20_im *=u0; gauge##21_re *=u0; gauge##21_im *=u0; \
486  gauge##22_re *=u0;gauge##22_im *=u0;}
487 
488 #define RECONSTRUCT_GAUGE_MATRIX_8_DOUBLE(dir, gauge, idx, sign) \
489  double row_sum = gauge##01_re*gauge##01_re + gauge##01_im*gauge##01_im; \
490  row_sum += gauge##02_re*gauge##02_re + gauge##02_im*gauge##02_im; \
491  double u0 = coeff*sign; \
492  double u02_inv = 1.0 / (u0*u0); \
493  double column_sum = u02_inv - row_sum; \
494  double U00_mag = sqrt(column_sum); \
495  sincos(gauge##21_re, &gauge##00_im, &gauge##00_re); \
496  gauge##00_re *= U00_mag; \
497  gauge##00_im *= U00_mag; \
498  column_sum += gauge##10_re*gauge##10_re; \
499  column_sum += gauge##10_im*gauge##10_im; \
500  sincos(gauge##21_im, &gauge##20_im, &gauge##20_re); \
501  double U20_mag = sqrt(u02_inv - column_sum); \
502  gauge##20_re *= U20_mag; \
503  gauge##20_im *= U20_mag; \
504  double r_inv2 = 1.0 / (u0*row_sum); \
505  double A_re, A_im; \
506  COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##10); \
507  A_re *= u0; A_im *= u0; \
508  COMPLEX_CONJUGATE_PRODUCT(gauge##11, gauge##20, gauge##02); \
509  ACC_COMPLEX_PROD(gauge##11, A, gauge##01); \
510  gauge##11_re *= -r_inv2; \
511  gauge##11_im *= -r_inv2; \
512  COMPLEX_CONJUGATE_PRODUCT(gauge##12, gauge##20, gauge##01); \
513  ACC_COMPLEX_PROD(gauge##12, -A, gauge##02); \
514  gauge##12_re *= r_inv2; \
515  gauge##12_im *= r_inv2; \
516  COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##20); \
517  A_re *= u0; A_im *= u0; \
518  COMPLEX_CONJUGATE_PRODUCT(gauge##21, gauge##10, gauge##02); \
519  ACC_COMPLEX_PROD(gauge##21, -A, gauge##01); \
520  gauge##21_re *= r_inv2; \
521  gauge##21_im *= r_inv2; \
522  COMPLEX_CONJUGATE_PRODUCT(gauge##22, gauge##10, gauge##01); \
523  ACC_COMPLEX_PROD(gauge##22, A, gauge##02); \
524  gauge##22_re *= -r_inv2; \
525  gauge##22_im *= -r_inv2;
526 
527 #define RECONSTRUCT_GAUGE_MATRIX_8_SINGLE(dir, gauge, idx, sign) { \
528  float row_sum = gauge##01_re*gauge##01_re + gauge##01_im*gauge##01_im; \
529  row_sum += gauge##02_re*gauge##02_re + gauge##02_im*gauge##02_im; \
530  float u0 = coeff_f*sign; \
531  float u02_inv = __fdividef(1.f, u0*u0); \
532  float column_sum = u02_inv - row_sum; \
533  float U00_mag = sqrtf(column_sum > 0 ?column_sum:0); \
534  __sincosf(gauge##21_re, &gauge##00_im, &gauge##00_re); \
535  gauge##00_re *= U00_mag; \
536  gauge##00_im *= U00_mag; \
537  column_sum += gauge##10_re*gauge##10_re; \
538  column_sum += gauge##10_im*gauge##10_im; \
539  __sincosf(gauge##21_im, &gauge##20_im, &gauge##20_re); \
540  float U20_mag = sqrtf( (u02_inv - column_sum)>0? (u02_inv - column_sum): 0); \
541  gauge##20_re *= U20_mag; \
542  gauge##20_im *= U20_mag; \
543  float r_inv2 = __fdividef(1.0f, u0*row_sum); \
544  float A_re, A_im; \
545  COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##10); \
546  A_re *= u0; A_im *= u0; \
547  COMPLEX_CONJUGATE_PRODUCT(gauge##11, gauge##20, gauge##02); \
548  ACC_COMPLEX_PROD(gauge##11, A, gauge##01); \
549  gauge##11_re *= -r_inv2; \
550  gauge##11_im *= -r_inv2; \
551  COMPLEX_CONJUGATE_PRODUCT(gauge##12, gauge##20, gauge##01); \
552  ACC_COMPLEX_PROD(gauge##12, -A, gauge##02); \
553  gauge##12_re *= r_inv2; \
554  gauge##12_im *= r_inv2; \
555  COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##20); \
556  A_re *= u0; A_im *= u0; \
557  COMPLEX_CONJUGATE_PRODUCT(gauge##21, gauge##10, gauge##02); \
558  ACC_COMPLEX_PROD(gauge##21, -A, gauge##01); \
559  gauge##21_re *= r_inv2; \
560  gauge##21_im *= r_inv2; \
561  COMPLEX_CONJUGATE_PRODUCT(gauge##22, gauge##10, gauge##01); \
562  ACC_COMPLEX_PROD(gauge##22, A, gauge##02); \
563  gauge##22_re *= -r_inv2; \
564  gauge##22_im *= -r_inv2;}
565 
566 // Fermi patch to disable double-precision texture reads
567 #ifdef FERMI_NO_DBLE_TEX
568 #define READ_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
569  READ_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride)
570 #define READ_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
571  READ_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride)
572 #define READ_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
573  READ_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride)
574 
577 #define ASSN_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
578  ASSN_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride)
579 #define ASSN_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
580  ASSN_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride)
581 #define ASSN_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
582  ASSN_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride)
583 
584 #else
585 
586 #define READ_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
587  double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*9+0)*stride); \
588  double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*9+1)*stride); \
589  double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*9+2)*stride); \
590  double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*9+3)*stride); \
591  double2 G##4 = fetch_double2((gauge), idx + ((dir/2)*9+4)*stride); \
592  double2 G##5 = fetch_double2((gauge), idx + ((dir/2)*9+5)*stride); \
593  double2 G##6 = fetch_double2((gauge), idx + ((dir/2)*9+6)*stride); \
594  double2 G##7 = fetch_double2((gauge), idx + ((dir/2)*9+7)*stride); \
595  double2 G##8 = fetch_double2((gauge), idx + ((dir/2)*9+8)*stride); \
596 
597 #define READ_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
598  double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*6+0)*stride); \
599  double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*6+1)*stride); \
600  double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*6+2)*stride); \
601  double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*6+3)*stride); \
602  double2 G##4 = fetch_double2((gauge), idx + ((dir/2)*6+4)*stride); \
603  double2 G##5 = fetch_double2((gauge), idx + ((dir/2)*6+5)*stride); \
604  double2 G##6 = make_double2(0,0); \
605  double2 G##7 = make_double2(0,0); \
606  double2 G##8 = make_double2(0,0); \
607 
608 // set A to be last components of G4 (otherwise unused)
609 #define READ_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
610  double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*4+0)*stride); \
611  double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*4+1)*stride); \
612  double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*4+2)*stride); \
613  double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*4+3)*stride); \
614  double2 G##4 = make_double2(0,0); \
615  double2 G##5 = make_double2(0,0); \
616  double2 G##6 = make_double2(0,0); \
617  double2 G##7 = make_double2(0,0); \
618  double2 G##8 = make_double2(0,0); \
619  (G##7).x = (G##0).x; \
620  (G##7).y = (G##0).y;
621 
624 #define ASSN_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
625  G##0 = fetch_double2((gauge), idx + ((dir/2)*9+0)*stride); \
626  G##1 = fetch_double2((gauge), idx + ((dir/2)*9+1)*stride); \
627  G##2 = fetch_double2((gauge), idx + ((dir/2)*9+2)*stride); \
628  G##3 = fetch_double2((gauge), idx + ((dir/2)*9+3)*stride); \
629  G##4 = fetch_double2((gauge), idx + ((dir/2)*9+4)*stride); \
630  G##5 = fetch_double2((gauge), idx + ((dir/2)*9+5)*stride); \
631  G##6 = fetch_double2((gauge), idx + ((dir/2)*9+6)*stride); \
632  G##7 = fetch_double2((gauge), idx + ((dir/2)*9+7)*stride); \
633  G##8 = fetch_double2((gauge), idx + ((dir/2)*9+8)*stride); \
634 
635 #define ASSN_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
636  G##0 = fetch_double2((gauge), idx + ((dir/2)*6+0)*stride); \
637  G##1 = fetch_double2((gauge), idx + ((dir/2)*6+1)*stride); \
638  G##2 = fetch_double2((gauge), idx + ((dir/2)*6+2)*stride); \
639  G##3 = fetch_double2((gauge), idx + ((dir/2)*6+3)*stride); \
640  G##4 = fetch_double2((gauge), idx + ((dir/2)*6+4)*stride); \
641  G##5 = fetch_double2((gauge), idx + ((dir/2)*6+5)*stride); \
642  G##6 = make_double2(0,0); \
643  G##7 = make_double2(0,0); \
644  G##8 = make_double2(0,0); \
645 
646 // set A to be last components of G4 (otherwise unused)
647 #define ASSN_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
648  G##0 = fetch_double2((gauge), idx + ((dir/2)*4+0)*stride); \
649  G##1 = fetch_double2((gauge), idx + ((dir/2)*4+1)*stride); \
650  G##2 = fetch_double2((gauge), idx + ((dir/2)*4+2)*stride); \
651  G##3 = fetch_double2((gauge), idx + ((dir/2)*4+3)*stride); \
652  G##4 = make_double2(0,0); \
653  G##5 = make_double2(0,0); \
654  G##6 = make_double2(0,0); \
655  G##7 = make_double2(0,0); \
656  G##8 = make_double2(0,0); \
657  (G##7).x = (G##0).x; \
658  (G##7).y = (G##0).y;
659 
660 #endif
661