QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
read_gauge.h
Go to the documentation of this file.
1 //#include <half_quda.h>
2 
3 // Performs complex addition
4 #define COMPLEX_ADD_TO(a, b) \
5  a##_re += b##_re, \
6  a##_im += b##_im
7 
8 #define COMPLEX_PRODUCT(a, b, c) \
9  a##_re = b##_re*c##_re; \
10  a##_re -= b##_im*c##_im; \
11  a##_im = b##_re*c##_im; \
12  a##_im += b##_im*c##_re
13 
14 #define COMPLEX_CONJUGATE_PRODUCT(a, b, c) \
15  a##_re = b##_re*c##_re; \
16  a##_re -= b##_im*c##_im; \
17  a##_im = -b##_re*c##_im; \
18  a##_im -= b##_im*c##_re
19 
20 // Performs a complex dot product
21 #define COMPLEX_DOT_PRODUCT(a, b, c) \
22  a##_re = b##_re*c##_re; \
23  a##_re += b##_im*c##_im; \
24  a##_im = b##_re*c##_im; \
25  a##_im -= b##_im*c##_re
26 
27 // Performs a complex norm
28 #define COMPLEX_NORM(a, b) \
29  a = b##_re*b##_re; \
30  a += b##_im*b##_im
31 
32 #define ACC_COMPLEX_PROD(a, b, c) \
33  a##_re += b##_re*c##_re; \
34  a##_re -= b##_im*c##_im; \
35  a##_im += b##_re*c##_im; \
36  a##_im += b##_im*c##_re
37 
38 // Performs the complex conjugated accumulation: a += b* c*
39 #define ACC_CONJ_PROD(a, b, c) \
40  a##_re += b##_re * c##_re; \
41  a##_re -= b##_im * c##_im; \
42  a##_im -= b##_re * c##_im; \
43  a##_im -= b##_im * c##_re
44 
45 #define READ_GAUGE_MATRIX_18_FLOAT2_TEX(G, gauge, dir, idx, stride) \
46  float2 G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
47  float2 G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
48  float2 G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
49  float2 G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
50  float2 G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
51  float2 G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
52  float2 G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
53  float2 G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
54  float2 G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
55 
56 #define READ_GAUGE_MATRIX_18_SHORT2_TEX(G, gauge, dir, idx, stride) \
57  float2 G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
58  float2 G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
59  float2 G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
60  float2 G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
61  float2 G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
62  float2 G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
63  float2 G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
64  float2 G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
65  float2 G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
66 
67 #define READ_GAUGE_MATRIX_12_FLOAT4_TEX(G, gauge, dir, idx, stride) \
68  float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
69  float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
70  float4 G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
71  float4 G##3 = make_float4(0,0,0,0); \
72  float4 G##4 = make_float4(0,0,0,0);
73 
74 #define READ_GAUGE_MATRIX_12_SHORT4_TEX(G, gauge, dir, idx, stride) \
75  float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
76  float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
77  float4 G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
78  float4 G##3 = make_float4(0,0,0,0); \
79  float4 G##4 = make_float4(0,0,0,0);
80 
81 // set A to be last components of G4 (otherwise unused)
82 #define READ_GAUGE_MATRIX_8_FLOAT4_TEX(G, gauge, dir, idx, stride) \
83  float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
84  float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
85  float4 G##2 = make_float4(0,0,0,0); \
86  float4 G##3 = make_float4(0,0,0,0); \
87  float4 G##4 = make_float4(0,0,0,0); \
88  (G##3).z = (G##0).x; \
89  (G##3).w = (G##0).y;
90 
91 #define READ_GAUGE_MATRIX_8_SHORT4_TEX(G, gauge, dir, idx, stride) \
92  float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
93  float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
94  float4 G##2 = make_float4(0,0,0,0); \
95  float4 G##3 = make_float4(0,0,0,0); \
96  float4 G##4 = make_float4(0,0,0,0); \
97  (G##3).z = (G##0).x = pi_f*(G##0).x; \
98  (G##3).w = (G##0).y = pi_f*(G##0).y;
99 
100 #define READ_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride) \
101  double2 G##0 = gauge[idx + ((dir/2)*9+0)*stride]; \
102  double2 G##1 = gauge[idx + ((dir/2)*9+1)*stride]; \
103  double2 G##2 = gauge[idx + ((dir/2)*9+2)*stride]; \
104  double2 G##3 = gauge[idx + ((dir/2)*9+3)*stride]; \
105  double2 G##4 = gauge[idx + ((dir/2)*9+4)*stride]; \
106  double2 G##5 = gauge[idx + ((dir/2)*9+5)*stride]; \
107  double2 G##6 = gauge[idx + ((dir/2)*9+6)*stride]; \
108  double2 G##7 = gauge[idx + ((dir/2)*9+7)*stride]; \
109  double2 G##8 = gauge[idx + ((dir/2)*9+8)*stride]; \
110 
111 #define READ_GAUGE_MATRIX_18_FLOAT2(G, gauge, dir, idx, stride) \
112  float2 G##0 = ((float2*)gauge)[idx + ((dir/2)*9+0)*stride]; \
113  float2 G##1 = ((float2*)gauge)[idx + ((dir/2)*9+1)*stride]; \
114  float2 G##2 = ((float2*)gauge)[idx + ((dir/2)*9+2)*stride]; \
115  float2 G##3 = ((float2*)gauge)[idx + ((dir/2)*9+3)*stride]; \
116  float2 G##4 = ((float2*)gauge)[idx + ((dir/2)*9+4)*stride]; \
117  float2 G##5 = ((float2*)gauge)[idx + ((dir/2)*9+5)*stride]; \
118  float2 G##6 = ((float2*)gauge)[idx + ((dir/2)*9+6)*stride]; \
119  float2 G##7 = ((float2*)gauge)[idx + ((dir/2)*9+7)*stride]; \
120  float2 G##8 = ((float2*)gauge)[idx + ((dir/2)*9+8)*stride]; \
121 
122 #define READ_GAUGE_MATRIX_18_SHORT2(G, gauge, dir, idx, stride) \
123  float2 G##0 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+0)*stride]); \
124  float2 G##1 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+1)*stride]); \
125  float2 G##2 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+2)*stride]); \
126  float2 G##3 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+3)*stride]); \
127  float2 G##4 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+4)*stride]); \
128  float2 G##5 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+5)*stride]); \
129  float2 G##6 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+6)*stride]); \
130  float2 G##7 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+7)*stride]); \
131  float2 G##8 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+8)*stride]); \
132 
133 #define READ_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride) \
134  double2 G##0 = gauge[idx + ((dir/2)*6+0)*stride]; \
135  double2 G##1 = gauge[idx + ((dir/2)*6+1)*stride]; \
136  double2 G##2 = gauge[idx + ((dir/2)*6+2)*stride]; \
137  double2 G##3 = gauge[idx + ((dir/2)*6+3)*stride]; \
138  double2 G##4 = gauge[idx + ((dir/2)*6+4)*stride]; \
139  double2 G##5 = gauge[idx + ((dir/2)*6+5)*stride]; \
140  double2 G##6 = make_double2(0,0); \
141  double2 G##7 = make_double2(0,0); \
142  double2 G##8 = make_double2(0,0);
143 
144 #define READ_GAUGE_MATRIX_12_FLOAT4(G, gauge, dir, idx, stride) \
145  float4 G##0 = gauge[idx + ((dir/2)*3+0)*stride]; \
146  float4 G##1 = gauge[idx + ((dir/2)*3+1)*stride]; \
147  float4 G##2 = gauge[idx + ((dir/2)*3+2)*stride]; \
148  float4 G##3 = make_float4(0,0,0,0); \
149  float4 G##4 = make_float4(0,0,0,0);
150 
151 #define READ_GAUGE_MATRIX_12_SHORT4(G, gauge, dir, idx, stride) \
152  float4 G##0 = short42float4(gauge[idx + ((dir/2)*3+0)*stride]); \
153  float4 G##1 = short42float4(gauge[idx + ((dir/2)*3+1)*stride]); \
154  float4 G##2 = short42float4(gauge[idx + ((dir/2)*3+2)*stride]); \
155  float4 G##3 = make_float4(0,0,0,0); \
156  float4 G##4 = make_float4(0,0,0,0);
157 
158 // set A to be last components of G4 (otherwise unused)
159 #define READ_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride) \
160  double2 G##0 = gauge[idx + ((dir/2)*4+0)*stride]; \
161  double2 G##1 = gauge[idx + ((dir/2)*4+1)*stride]; \
162  double2 G##2 = gauge[idx + ((dir/2)*4+2)*stride]; \
163  double2 G##3 = gauge[idx + ((dir/2)*4+3)*stride]; \
164  double2 G##4 = make_double2(0,0); \
165  double2 G##5 = make_double2(0,0); \
166  double2 G##6 = make_double2(0,0); \
167  double2 G##7 = make_double2(0,0); \
168  double2 G##8 = make_double2(0,0); \
169  (G##7).x = (G##0).x; \
170  (G##7).y = (G##0).y;
171 
172 // set A to be last components of G4 (otherwise unused)
173 #define READ_GAUGE_MATRIX_8_FLOAT4(G, gauge, dir, idx, stride) \
174  float4 G##0 = gauge[idx + ((dir/2)*2+0)*stride]; \
175  float4 G##1 = gauge[idx + ((dir/2)*2+1)*stride]; \
176  float4 G##2 = make_float4(0,0,0,0); \
177  float4 G##3 = make_float4(0,0,0,0); \
178  float4 G##4 = make_float4(0,0,0,0); \
179  (G##3).z = (G##0).x; \
180  (G##3).w = (G##0).y;
181 
182 #define READ_GAUGE_MATRIX_8_SHORT4(G, gauge, dir, idx, stride) \
183  float4 G##0 = short42float4(gauge[idx + ((dir/2)*2+0)*stride]); \
184  float4 G##1 = short42float4(gauge[idx + ((dir/2)*2+1)*stride]); \
185  float4 G##2 = make_float4(0,0,0,0); \
186  float4 G##3 = make_float4(0,0,0,0); \
187  float4 G##4 = make_float4(0,0,0,0); \
188  (G##3).z = (G##0).x = pi_f*(G##0).x; \
189  (G##3).w = (G##0).y = pi_f*(G##0).y;
190 
191 
192 #define READ_GAUGE_PHASE_DOUBLE(P, phase, dir, idx, stride){ \
193  P = 2.*M_PI*phase[idx + (dir/2)*stride]; \
194 }
195 
196 #define READ_GAUGE_PHASE_FLOAT(P, phase, dir, idx, stride){ \
197  P = 2.f*pi_f*phase[idx + (dir/2)*stride]; \
198 }
199 
200 #define READ_GAUGE_PHASE_SHORT(P, phase, dir, idx, stride){ \
201  P = 2.f*pi_f*short2float(phase[idx + (dir/2)*stride]); \
202 }
203 
206 #define ASSN_GAUGE_MATRIX_18_FLOAT2_TEX(G, gauge, dir, idx, stride) \
207  G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
208  G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
209  G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
210  G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
211  G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
212  G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
213  G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
214  G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
215  G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
216 
217 #define ASSN_GAUGE_MATRIX_18_SHORT2_TEX(G, gauge, dir, idx, stride) \
218  G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
219  G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
220  G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
221  G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
222  G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
223  G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
224  G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
225  G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
226  G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
227 
228 #define ASSN_GAUGE_MATRIX_12_FLOAT4_TEX(G, gauge, dir, idx, stride) \
229  G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
230  G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
231  G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
232  G##3 = make_float4(0,0,0,0); \
233  G##4 = make_float4(0,0,0,0);
234 
235 #define ASSN_GAUGE_MATRIX_12_SHORT4_TEX(G, gauge, dir, idx, stride) \
236  G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
237  G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
238  G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
239  G##3 = make_float4(0,0,0,0); \
240  G##4 = make_float4(0,0,0,0);
241 
242 // set A to be last components of G4 (otherwise unused)
243 #define ASSN_GAUGE_MATRIX_8_FLOAT4_TEX(G, gauge, dir, idx, stride) \
244  G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
245  G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
246  G##2 = make_float4(0,0,0,0); \
247  G##3 = make_float4(0,0,0,0); \
248  G##4 = make_float4(0,0,0,0); \
249  (G##3).z = (G##0).x; \
250  (G##3).w = (G##0).y;
251 
252 #define ASSN_GAUGE_MATRIX_8_SHORT4_TEX(G, gauge, dir, idx, stride) \
253  G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
254  G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
255  G##2 = make_float4(0,0,0,0); \
256  G##3 = make_float4(0,0,0,0); \
257  G##4 = make_float4(0,0,0,0); \
258  (G##3).z = (G##0).x = pi_f*(G##0).x; \
259  (G##3).w = (G##0).y = pi_f*(G##0).y;
260 
261 #define ASSN_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride) \
262  G##0 = gauge[idx + ((dir/2)*9+0)*stride]; \
263  G##1 = gauge[idx + ((dir/2)*9+1)*stride]; \
264  G##2 = gauge[idx + ((dir/2)*9+2)*stride]; \
265  G##3 = gauge[idx + ((dir/2)*9+3)*stride]; \
266  G##4 = gauge[idx + ((dir/2)*9+4)*stride]; \
267  G##5 = gauge[idx + ((dir/2)*9+5)*stride]; \
268  G##6 = gauge[idx + ((dir/2)*9+6)*stride]; \
269  G##7 = gauge[idx + ((dir/2)*9+7)*stride]; \
270  G##8 = gauge[idx + ((dir/2)*9+8)*stride]; \
271 
272 #define ASSN_GAUGE_MATRIX_18_FLOAT2(G, gauge, dir, idx, stride) \
273  G##0 = ((float2*)gauge)[idx + ((dir/2)*9+0)*stride]; \
274  G##1 = ((float2*)gauge)[idx + ((dir/2)*9+1)*stride]; \
275  G##2 = ((float2*)gauge)[idx + ((dir/2)*9+2)*stride]; \
276  G##3 = ((float2*)gauge)[idx + ((dir/2)*9+3)*stride]; \
277  G##4 = ((float2*)gauge)[idx + ((dir/2)*9+4)*stride]; \
278  G##5 = ((float2*)gauge)[idx + ((dir/2)*9+5)*stride]; \
279  G##6 = ((float2*)gauge)[idx + ((dir/2)*9+6)*stride]; \
280  G##7 = ((float2*)gauge)[idx + ((dir/2)*9+7)*stride]; \
281  G##8 = ((float2*)gauge)[idx + ((dir/2)*9+8)*stride]; \
282 
283 #define ASSN_GAUGE_MATRIX_18_SHORT2(G, gauge, dir, idx, stride) \
284  G##0 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+0)*stride]); \
285  G##1 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+1)*stride]); \
286  G##2 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+2)*stride]); \
287  G##3 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+3)*stride]); \
288  G##4 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+4)*stride]); \
289  G##5 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+5)*stride]); \
290  G##6 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+6)*stride]); \
291  G##7 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+7)*stride]); \
292  G##8 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+8)*stride]); \
293 
294 #define ASSN_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride) \
295  G##0 = gauge[idx + ((dir/2)*6+0)*stride]; \
296  G##1 = gauge[idx + ((dir/2)*6+1)*stride]; \
297  G##2 = gauge[idx + ((dir/2)*6+2)*stride]; \
298  G##3 = gauge[idx + ((dir/2)*6+3)*stride]; \
299  G##4 = gauge[idx + ((dir/2)*6+4)*stride]; \
300  G##5 = gauge[idx + ((dir/2)*6+5)*stride]; \
301  G##6 = make_double2(0,0); \
302  G##7 = make_double2(0,0); \
303  G##8 = make_double2(0,0);
304 
305 #define ASSN_GAUGE_MATRIX_12_FLOAT4(G, gauge, dir, idx, stride) \
306  G##0 = gauge[idx + ((dir/2)*3+0)*stride]; \
307  G##1 = gauge[idx + ((dir/2)*3+1)*stride]; \
308  G##2 = gauge[idx + ((dir/2)*3+2)*stride]; \
309  G##3 = make_float4(0,0,0,0); \
310  G##4 = make_float4(0,0,0,0);
311 
312 #define ASSN_GAUGE_MATRIX_12_SHORT4(G, gauge, dir, idx, stride) \
313  G##0 = short42float4(gauge[idx + ((dir/2)*3+0)*stride]); \
314  G##1 = short42float4(gauge[idx + ((dir/2)*3+1)*stride]); \
315  G##2 = short42float4(gauge[idx + ((dir/2)*3+2)*stride]); \
316  G##3 = make_float4(0,0,0,0); \
317  G##4 = make_float4(0,0,0,0);
318 
319 // set A to be last components of G4 (otherwise unused)
320 #define ASSN_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride) \
321  G##0 = gauge[idx + ((dir/2)*4+0)*stride]; \
322  G##1 = gauge[idx + ((dir/2)*4+1)*stride]; \
323  G##2 = gauge[idx + ((dir/2)*4+2)*stride]; \
324  G##3 = gauge[idx + ((dir/2)*4+3)*stride]; \
325  G##4 = make_double2(0,0); \
326  G##5 = make_double2(0,0); \
327  G##6 = make_double2(0,0); \
328  G##7 = make_double2(0,0); \
329  G##8 = make_double2(0,0); \
330  (G##7).x = (G##0).x; \
331  (G##7).y = (G##0).y;
332 
333 // set A to be last components of G4 (otherwise unused)
334 #define ASSN_GAUGE_MATRIX_8_FLOAT4(G, gauge, dir, idx, stride) \
335  G##0 = gauge[idx + ((dir/2)*2+0)*stride]; \
336  G##1 = gauge[idx + ((dir/2)*2+1)*stride]; \
337  G##2 = make_float4(0,0,0,0); \
338  G##3 = make_float4(0,0,0,0); \
339  G##4 = make_float4(0,0,0,0); \
340  (G##3).z = (G##0).x; \
341  (G##3).w = (G##0).y;
342 
343 #define ASSN_GAUGE_MATRIX_8_SHORT4(G, gauge, dir, idx, stride) \
344  G##0 = short42float4(gauge[idx + ((dir/2)*2+0)*stride]); \
345  G##1 = short42float4(gauge[idx + ((dir/2)*2+1)*stride]); \
346  G##2 = make_float4(0,0,0,0); \
347  G##3 = make_float4(0,0,0,0); \
348  G##4 = make_float4(0,0,0,0); \
349  (G##3).z = (G##0).x = pi_f*(G##0).x; \
350  (G##3).w = (G##0).y = pi_f*(G##0).y;
351 
352 
353 /*----END--*/
354 
355 #define RESCALE2(G, max) \
356  (G##0).x *= max; (G##0).y *= max; (G##1).x *= max; (G##1).y *= max; \
357  (G##2).x *= max; (G##2).y *= max; (G##3).x *= max; (G##3).y *= max; \
358  (G##4).x *= max; (G##4).y *= max; (G##5).x *= max; (G##5).y *= max; \
359  (G##6).x *= max; (G##6).y *= max; (G##7).x *= max; (G##7).y *= max; \
360  (G##8).x *= max; (G##8).y *= max;
361 
362 // FIXME: merge staggered and Wilson reconstruct macros
363 
364 #define RECONSTRUCT_MATRIX_18_DOUBLE(dir)
365 #define RECONSTRUCT_MATRIX_18_SINGLE(dir)
366 
367 #ifndef MULTI_GPU
368 #define do_boundary ga_idx >= X4X3X2X1hmX3X2X1h
369 #else
370 #define do_boundary ( (Pt0 && (ga_idx >= Vh)) || ( PtNm1 && (ga_idx >= X4X3X2X1hmX3X2X1h) && (ga_idx < Vh) ) )
371 #endif
372 
373 #define RECONSTRUCT_MATRIX_12_DOUBLE(dir) \
374  ACC_CONJ_PROD(g20, +g01, +g12); \
375  ACC_CONJ_PROD(g20, -g02, +g11); \
376  ACC_CONJ_PROD(g21, +g02, +g10); \
377  ACC_CONJ_PROD(g21, -g00, +g12); \
378  ACC_CONJ_PROD(g22, +g00, +g11); \
379  ACC_CONJ_PROD(g22, -g01, +g10); \
380  double u0 = (dir < 6 ? anisotropy : (do_boundary ? t_boundary : 1)); \
381  G6.x*=u0; G6.y*=u0; G7.x*=u0; G7.y*=u0; G8.x*=u0; G8.y*=u0;
382 
383 #define RECONSTRUCT_MATRIX_12_SINGLE(dir) \
384  ACC_CONJ_PROD(g20, +g01, +g12); \
385  ACC_CONJ_PROD(g20, -g02, +g11); \
386  ACC_CONJ_PROD(g21, +g02, +g10); \
387  ACC_CONJ_PROD(g21, -g00, +g12); \
388  ACC_CONJ_PROD(g22, +g00, +g11); \
389  ACC_CONJ_PROD(g22, -g01, +g10); \
390  float u0 = (dir < 6 ? anisotropy_f : (do_boundary ? t_boundary_f : 1)); \
391  G3.x*=u0; G3.y*=u0; G3.z*=u0; G3.w*=u0; G4.x*=u0; G4.y*=u0;
392 
393 #define RECONSTRUCT_MATRIX_8_DOUBLE(dir) \
394  double row_sum = g01_re*g01_re; \
395  row_sum += g01_im*g01_im; \
396  row_sum += g02_re*g02_re; \
397  row_sum += g02_im*g02_im; \
398  double u0 = (dir < 6 ? anisotropy : (do_boundary ? t_boundary : 1)); \
399  double u02_inv = 1.0 / (u0*u0); \
400  double column_sum = u02_inv - row_sum; \
401  double U00_mag = sqrt((column_sum > 0 ? column_sum : 0)); \
402  sincos(g21_re, &g00_im, &g00_re); \
403  g00_re *= U00_mag; \
404  g00_im *= U00_mag; \
405  column_sum += g10_re*g10_re; \
406  column_sum += g10_im*g10_im; \
407  sincos(g21_im, &g20_im, &g20_re); \
408  double U20_mag = sqrt(((u02_inv - column_sum) > 0 ? (u02_inv-column_sum) : 0)); \
409  g20_re *= U20_mag; \
410  g20_im *= U20_mag; \
411  double r_inv2 = 1.0 / (u0*row_sum); \
412  double A_re, A_im; \
413  COMPLEX_DOT_PRODUCT(A, g00, g10); \
414  A_re *= u0; A_im *= u0; \
415  COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \
416  ACC_COMPLEX_PROD(g11, A, g01); \
417  g11_re *= -r_inv2; \
418  g11_im *= -r_inv2; \
419  COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \
420  ACC_COMPLEX_PROD(g12, -A, g02); \
421  g12_re *= r_inv2; \
422  g12_im *= r_inv2; \
423  COMPLEX_DOT_PRODUCT(A, g00, g20); \
424  A_re *= u0; A_im *= u0; \
425  COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \
426  ACC_COMPLEX_PROD(g21, -A, g01); \
427  g21_re *= r_inv2; \
428  g21_im *= r_inv2; \
429  COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \
430  ACC_COMPLEX_PROD(g22, A, g02); \
431  g22_re *= -r_inv2; \
432  g22_im *= -r_inv2;
433 
434  #define RECONSTRUCT_MATRIX_8_SINGLE(dir) \
435  float row_sum = g01_re*g01_re; \
436  row_sum += g01_im*g01_im; \
437  row_sum += g02_re*g02_re; \
438  row_sum += g02_im*g02_im; \
439  __sincosf(g21_re, &g00_im, &g00_re); \
440  __sincosf(g21_im, &g20_im, &g20_re); \
441  float2 u0_2 = (dir < 6 ? An2 : (do_boundary ? TB2 : No2)); \
442  float column_sum = u0_2.y - row_sum; \
443  float U00_mag = column_sum * rsqrtf((column_sum > 0 ? column_sum : 1e14)); \
444  g00_re *= U00_mag; \
445  g00_im *= U00_mag; \
446  column_sum += g10_re*g10_re; \
447  column_sum += g10_im*g10_im; \
448  column_sum = u0_2.y - column_sum; \
449  float U20_mag = column_sum * rsqrtf((column_sum > 0 ? column_sum : 1e14)); \
450  g20_re *= U20_mag; \
451  g20_im *= U20_mag; \
452  float r_inv2 = __fdividef(1.0f, u0_2.x*row_sum); \
453  float A_re, A_im; \
454  COMPLEX_DOT_PRODUCT(A, g00, g10); \
455  A_re *= u0_2.x; A_im *= u0_2.x; \
456  COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \
457  ACC_COMPLEX_PROD(g11, A, g01); \
458  g11_re *= -r_inv2; \
459  g11_im *= -r_inv2; \
460  COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \
461  ACC_COMPLEX_PROD(g12, -A, g02); \
462  g12_re *= r_inv2; \
463  g12_im *= r_inv2; \
464  COMPLEX_DOT_PRODUCT(A, g00, g20); \
465  A_re *= u0_2.x; A_im *= u0_2.x; \
466  COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \
467  ACC_COMPLEX_PROD(g21, -A, g01); \
468  g21_re *= r_inv2; \
469  g21_im *= r_inv2; \
470  COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \
471  ACC_COMPLEX_PROD(g22, A, g02); \
472  g22_re *= -r_inv2; \
473  g22_im *= -r_inv2;
474 
475 
476 
477 /************* the following is added for staggered *********/
478 
479 #define RECONSTRUCT_GAUGE_MATRIX_12_SINGLE(dir, gauge, idx, sign) \
480  ACC_CONJ_PROD(gauge##20, +gauge##01, +gauge##12); \
481  ACC_CONJ_PROD(gauge##20, -gauge##02, +gauge##11); \
482  ACC_CONJ_PROD(gauge##21, +gauge##02, +gauge##10); \
483  ACC_CONJ_PROD(gauge##21, -gauge##00, +gauge##12); \
484  ACC_CONJ_PROD(gauge##22, +gauge##00, +gauge##11); \
485  ACC_CONJ_PROD(gauge##22, -gauge##01, +gauge##10); \
486  {float u0 = coeff_f*sign; \
487  gauge##20_re *=u0;gauge##20_im *=u0; gauge##21_re *=u0; gauge##21_im *=u0; \
488  gauge##22_re *=u0;gauge##22_im *=u0;}
489 
490 #define RECONSTRUCT_GAUGE_MATRIX_12_DOUBLE(dir, gauge, idx, sign) \
491  ACC_CONJ_PROD(gauge##20, +gauge##01, +gauge##12); \
492  ACC_CONJ_PROD(gauge##20, -gauge##02, +gauge##11); \
493  ACC_CONJ_PROD(gauge##21, +gauge##02, +gauge##10); \
494  ACC_CONJ_PROD(gauge##21, -gauge##00, +gauge##12); \
495  ACC_CONJ_PROD(gauge##22, +gauge##00, +gauge##11); \
496  ACC_CONJ_PROD(gauge##22, -gauge##01, +gauge##10); \
497  {double u0 = coeff* sign; \
498  gauge##20_re *=u0;gauge##20_im *=u0; gauge##21_re *=u0; gauge##21_im *=u0; \
499  gauge##22_re *=u0;gauge##22_im *=u0;}
500 
501 
502 #define RECONSTRUCT_GAUGE_MATRIX_13_SINGLE(dir, gauge, idx, sign) { \
503  RECONSTRUCT_GAUGE_MATRIX_12_SINGLE(dir, gauge, idx, sign) \
504  float exp_i3phase_re, exp_i3phase_im; \
505  sincosf(3.f*PHASE, &exp_i3phase_im, &exp_i3phase_re); \
506  float A_re, A_im; \
507  COMPLEX_PRODUCT(A, exp_i3phase, gauge##20); \
508  gauge##20_re = A_re; \
509  gauge##20_im = A_im; \
510  COMPLEX_PRODUCT(A, exp_i3phase, gauge##21); \
511  gauge##21_re = A_re; \
512  gauge##21_im = A_im; \
513  COMPLEX_PRODUCT(A, exp_i3phase, gauge##22); \
514  gauge##22_re = A_re; \
515  gauge##22_im = A_im; \
516 }
517 
518 
519 #define RECONSTRUCT_GAUGE_MATRIX_13_DOUBLE(dir, gauge, idx, sign) { \
520  RECONSTRUCT_GAUGE_MATRIX_12_DOUBLE(dir, gauge, idx, sign) \
521  double exp_i3phase_re, exp_i3phase_im; \
522  sincos(3.*PHASE, &exp_i3phase_im, &exp_i3phase_re); \
523  double A_re, A_im; \
524  COMPLEX_PRODUCT(A, exp_i3phase, gauge##20); \
525  gauge##20_re = A_re; \
526  gauge##20_im = A_im; \
527  COMPLEX_PRODUCT(A, exp_i3phase, gauge##21); \
528  gauge##21_re = A_re; \
529  gauge##21_im = A_im; \
530  COMPLEX_PRODUCT(A, exp_i3phase, gauge##22); \
531  gauge##22_re = A_re; \
532  gauge##22_im = A_im; \
533 }
534 
535 
536 
537 
538 #define RECONSTRUCT_GAUGE_MATRIX_8_DOUBLE(dir, gauge, idx, sign) \
539  double row_sum = gauge##01_re*gauge##01_re + gauge##01_im*gauge##01_im; \
540  row_sum += gauge##02_re*gauge##02_re + gauge##02_im*gauge##02_im; \
541  double u0 = coeff*sign; \
542  double u02_inv = 1.0 / (u0*u0); \
543  double column_sum = u02_inv - row_sum; \
544  double U00_mag = sqrt(column_sum); \
545  sincos(gauge##21_re, &gauge##00_im, &gauge##00_re); \
546  gauge##00_re *= U00_mag; \
547  gauge##00_im *= U00_mag; \
548  column_sum += gauge##10_re*gauge##10_re; \
549  column_sum += gauge##10_im*gauge##10_im; \
550  sincos(gauge##21_im, &gauge##20_im, &gauge##20_re); \
551  double U20_mag = sqrt(u02_inv - column_sum); \
552  gauge##20_re *= U20_mag; \
553  gauge##20_im *= U20_mag; \
554  double r_inv2 = 1.0 / (u0*row_sum); \
555  double A_re, A_im; \
556  COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##10); \
557  A_re *= u0; A_im *= u0; \
558  COMPLEX_CONJUGATE_PRODUCT(gauge##11, gauge##20, gauge##02); \
559  ACC_COMPLEX_PROD(gauge##11, A, gauge##01); \
560  gauge##11_re *= -r_inv2; \
561  gauge##11_im *= -r_inv2; \
562  COMPLEX_CONJUGATE_PRODUCT(gauge##12, gauge##20, gauge##01); \
563  ACC_COMPLEX_PROD(gauge##12, -A, gauge##02); \
564  gauge##12_re *= r_inv2; \
565  gauge##12_im *= r_inv2; \
566  COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##20); \
567  A_re *= u0; A_im *= u0; \
568  COMPLEX_CONJUGATE_PRODUCT(gauge##21, gauge##10, gauge##02); \
569  ACC_COMPLEX_PROD(gauge##21, -A, gauge##01); \
570  gauge##21_re *= r_inv2; \
571  gauge##21_im *= r_inv2; \
572  COMPLEX_CONJUGATE_PRODUCT(gauge##22, gauge##10, gauge##01); \
573  ACC_COMPLEX_PROD(gauge##22, A, gauge##02); \
574  gauge##22_re *= -r_inv2; \
575  gauge##22_im *= -r_inv2;
576 
577 #define RECONSTRUCT_GAUGE_MATRIX_8_SINGLE(dir, gauge, idx, sign) { \
578  float row_sum = gauge##01_re*gauge##01_re + gauge##01_im*gauge##01_im; \
579  row_sum += gauge##02_re*gauge##02_re + gauge##02_im*gauge##02_im; \
580  float u0 = coeff_f*sign; \
581  float u02_inv = __fdividef(1.f, u0*u0); \
582  float column_sum = u02_inv - row_sum; \
583  float U00_mag = sqrtf(column_sum > 0 ?column_sum:0); \
584  __sincosf(gauge##21_re, &gauge##00_im, &gauge##00_re); \
585  gauge##00_re *= U00_mag; \
586  gauge##00_im *= U00_mag; \
587  column_sum += gauge##10_re*gauge##10_re; \
588  column_sum += gauge##10_im*gauge##10_im; \
589  __sincosf(gauge##21_im, &gauge##20_im, &gauge##20_re); \
590  float U20_mag = sqrtf( (u02_inv - column_sum)>0? (u02_inv - column_sum): 0); \
591  gauge##20_re *= U20_mag; \
592  gauge##20_im *= U20_mag; \
593  float r_inv2 = __fdividef(1.0f, u0*row_sum); \
594  float A_re, A_im; \
595  COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##10); \
596  A_re *= u0; A_im *= u0; \
597  COMPLEX_CONJUGATE_PRODUCT(gauge##11, gauge##20, gauge##02); \
598  ACC_COMPLEX_PROD(gauge##11, A, gauge##01); \
599  gauge##11_re *= -r_inv2; \
600  gauge##11_im *= -r_inv2; \
601  COMPLEX_CONJUGATE_PRODUCT(gauge##12, gauge##20, gauge##01); \
602  ACC_COMPLEX_PROD(gauge##12, -A, gauge##02); \
603  gauge##12_re *= r_inv2; \
604  gauge##12_im *= r_inv2; \
605  COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##20); \
606  A_re *= u0; A_im *= u0; \
607  COMPLEX_CONJUGATE_PRODUCT(gauge##21, gauge##10, gauge##02); \
608  ACC_COMPLEX_PROD(gauge##21, -A, gauge##01); \
609  gauge##21_re *= r_inv2; \
610  gauge##21_im *= r_inv2; \
611  COMPLEX_CONJUGATE_PRODUCT(gauge##22, gauge##10, gauge##01); \
612  ACC_COMPLEX_PROD(gauge##22, A, gauge##02); \
613  gauge##22_re *= -r_inv2; \
614  gauge##22_im *= -r_inv2;}
615 
616 
617 
618 #define RECONSTRUCT_GAUGE_MATRIX_9_SINGLE(dir, gauge, idx, sign) { \
619  RECONSTRUCT_GAUGE_MATRIX_8_SINGLE(dir, gauge, idx, sign) \
620  float exp_iphase_re, exp_iphase_im; \
621  __sincosf(PHASE, &exp_iphase_im, &exp_iphase_re); \
622  float B_re, B_im; \
623  COMPLEX_PRODUCT(B, exp_iphase, gauge##00); \
624  gauge##00_re = B_re; \
625  gauge##00_im = B_im; \
626  COMPLEX_PRODUCT(B, exp_iphase, gauge##01); \
627  gauge##01_re = B_re; \
628  gauge##01_im = B_im; \
629  COMPLEX_PRODUCT(B, exp_iphase, gauge##02); \
630  gauge##02_re = B_re; \
631  gauge##02_im = B_im; \
632  COMPLEX_PRODUCT(B, exp_iphase, gauge##10); \
633  gauge##10_re = B_re; \
634  gauge##10_im = B_im; \
635  COMPLEX_PRODUCT(B, exp_iphase, gauge##11); \
636  gauge##11_re = B_re; \
637  gauge##11_im = B_im; \
638  COMPLEX_PRODUCT(B, exp_iphase, gauge##12); \
639  gauge##12_re = B_re; \
640  gauge##12_im = B_im; \
641  COMPLEX_PRODUCT(B, exp_iphase, gauge##20); \
642  gauge##20_re = B_re; \
643  gauge##20_im = B_im; \
644  COMPLEX_PRODUCT(B, exp_iphase, gauge##21); \
645  gauge##21_re = B_re; \
646  gauge##21_im = B_im; \
647  COMPLEX_PRODUCT(B, exp_iphase, gauge##22); \
648  gauge##22_re = B_re; \
649  gauge##22_im = B_im; \
650 }
651 
652 
653 #define RECONSTRUCT_GAUGE_MATRIX_9_DOUBLE(dir, gauge, idx, sign) { \
654  RECONSTRUCT_GAUGE_MATRIX_8_DOUBLE(dir, gauge, idx, sign) \
655  double exp_iphase_re, exp_iphase_im; \
656  sincos(PHASE, &exp_iphase_im, &exp_iphase_re); \
657  double B_re, B_im; \
658  COMPLEX_PRODUCT(B, exp_iphase, gauge##00); \
659  gauge##00_re = B_re; \
660  gauge##00_im = B_im; \
661  COMPLEX_PRODUCT(B, exp_iphase, gauge##01); \
662  gauge##01_re = B_re; \
663  gauge##01_im = B_im; \
664  COMPLEX_PRODUCT(B, exp_iphase, gauge##02); \
665  gauge##02_re = B_re; \
666  gauge##02_im = B_im; \
667  COMPLEX_PRODUCT(B, exp_iphase, gauge##10); \
668  gauge##10_re = B_re; \
669  gauge##10_im = B_im; \
670  COMPLEX_PRODUCT(B, exp_iphase, gauge##11); \
671  gauge##11_re = B_re; \
672  gauge##11_im = B_im; \
673  COMPLEX_PRODUCT(B, exp_iphase, gauge##12); \
674  gauge##12_re = B_re; \
675  gauge##12_im = B_im; \
676  COMPLEX_PRODUCT(B, exp_iphase, gauge##20); \
677  gauge##20_re = B_re; \
678  gauge##20_im = B_im; \
679  COMPLEX_PRODUCT(B, exp_iphase, gauge##21); \
680  gauge##21_re = B_re; \
681  gauge##21_im = B_im; \
682  COMPLEX_PRODUCT(B, exp_iphase, gauge##22); \
683  gauge##22_re = B_re; \
684  gauge##22_im = B_im; \
685 }
686 
687 
688 
689 
690 // Fermi patch to disable double-precision texture reads
691 #ifdef FERMI_NO_DBLE_TEX
692 #define READ_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
693  READ_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride)
694 #define READ_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
695  READ_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride)
696 #define READ_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
697  READ_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride)
698 #define READ_GAUGE_PHASE_DOUBLE_TEX(P, phase, dir, idx, stride) \
699  READ_GAUGE_PHASE_DOUBLE(P, phase, dir, idx, stride)
700 
703 #define ASSN_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
704  ASSN_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride)
705 #define ASSN_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
706  ASSN_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride)
707 #define ASSN_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
708  ASSN_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride)
709 
710 #else
711 
712 #define READ_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
713  double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*9+0)*stride); \
714  double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*9+1)*stride); \
715  double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*9+2)*stride); \
716  double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*9+3)*stride); \
717  double2 G##4 = fetch_double2((gauge), idx + ((dir/2)*9+4)*stride); \
718  double2 G##5 = fetch_double2((gauge), idx + ((dir/2)*9+5)*stride); \
719  double2 G##6 = fetch_double2((gauge), idx + ((dir/2)*9+6)*stride); \
720  double2 G##7 = fetch_double2((gauge), idx + ((dir/2)*9+7)*stride); \
721  double2 G##8 = fetch_double2((gauge), idx + ((dir/2)*9+8)*stride); \
722 
723 #define READ_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
724  double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*6+0)*stride); \
725  double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*6+1)*stride); \
726  double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*6+2)*stride); \
727  double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*6+3)*stride); \
728  double2 G##4 = fetch_double2((gauge), idx + ((dir/2)*6+4)*stride); \
729  double2 G##5 = fetch_double2((gauge), idx + ((dir/2)*6+5)*stride); \
730  double2 G##6 = make_double2(0,0); \
731  double2 G##7 = make_double2(0,0); \
732  double2 G##8 = make_double2(0,0); \
733 
734 // set A to be last components of G4 (otherwise unused)
735 #define READ_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
736  double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*4+0)*stride); \
737  double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*4+1)*stride); \
738  double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*4+2)*stride); \
739  double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*4+3)*stride); \
740  double2 G##4 = make_double2(0,0); \
741  double2 G##5 = make_double2(0,0); \
742  double2 G##6 = make_double2(0,0); \
743  double2 G##7 = make_double2(0,0); \
744  double2 G##8 = make_double2(0,0); \
745  (G##7).x = (G##0).x; \
746  (G##7).y = (G##0).y;
747 
748 
749 
750 
751 #define READ_GAUGE_PHASE_FLOAT_TEX(P, phase, dir, idx, stride) { \
752  P = 2.f*pi_f*TEX1DFETCH(float, (phase), idx + (dir/2)*stride); \
753 }
754 
755 #define READ_GAUGE_PHASE_SHORT_TEX(P, phase, dir, idx, stride) READ_GAUGE_PHASE_FLOAT_TEX(P, phase, dir, idx, stride)
756 
757 #define READ_GAUGE_PHASE_DOUBLE_TEX(P, phase, dir, idx, stride) { \
758  P = 2*M_PI*fetch_double((phase), idx + (dir/2)*stride); \
759 }
760 
763 #define ASSN_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
764  G##0 = fetch_double2((gauge), idx + ((dir/2)*9+0)*stride); \
765  G##1 = fetch_double2((gauge), idx + ((dir/2)*9+1)*stride); \
766  G##2 = fetch_double2((gauge), idx + ((dir/2)*9+2)*stride); \
767  G##3 = fetch_double2((gauge), idx + ((dir/2)*9+3)*stride); \
768  G##4 = fetch_double2((gauge), idx + ((dir/2)*9+4)*stride); \
769  G##5 = fetch_double2((gauge), idx + ((dir/2)*9+5)*stride); \
770  G##6 = fetch_double2((gauge), idx + ((dir/2)*9+6)*stride); \
771  G##7 = fetch_double2((gauge), idx + ((dir/2)*9+7)*stride); \
772  G##8 = fetch_double2((gauge), idx + ((dir/2)*9+8)*stride); \
773 
774 #define ASSN_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
775  G##0 = fetch_double2((gauge), idx + ((dir/2)*6+0)*stride); \
776  G##1 = fetch_double2((gauge), idx + ((dir/2)*6+1)*stride); \
777  G##2 = fetch_double2((gauge), idx + ((dir/2)*6+2)*stride); \
778  G##3 = fetch_double2((gauge), idx + ((dir/2)*6+3)*stride); \
779  G##4 = fetch_double2((gauge), idx + ((dir/2)*6+4)*stride); \
780  G##5 = fetch_double2((gauge), idx + ((dir/2)*6+5)*stride); \
781  G##6 = make_double2(0,0); \
782  G##7 = make_double2(0,0); \
783  G##8 = make_double2(0,0); \
784 
785 // set A to be last components of G4 (otherwise unused)
786 #define ASSN_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
787  G##0 = fetch_double2((gauge), idx + ((dir/2)*4+0)*stride); \
788  G##1 = fetch_double2((gauge), idx + ((dir/2)*4+1)*stride); \
789  G##2 = fetch_double2((gauge), idx + ((dir/2)*4+2)*stride); \
790  G##3 = fetch_double2((gauge), idx + ((dir/2)*4+3)*stride); \
791  G##4 = make_double2(0,0); \
792  G##5 = make_double2(0,0); \
793  G##6 = make_double2(0,0); \
794  G##7 = make_double2(0,0); \
795  G##8 = make_double2(0,0); \
796  (G##7).x = (G##0).x; \
797  (G##7).y = (G##0).y;
798 
799 
800 
801 #endif
802