QUDA  0.9.0
read_gauge.h
Go to the documentation of this file.
1 //#include <half_quda.h>
2 
3 // Performs complex addition
4 #define COMPLEX_ADD_TO(a, b) \
5  a##_re += b##_re, \
6  a##_im += b##_im
7 
8 #define COMPLEX_PRODUCT(a, b, c) \
9  a##_re = b##_re*c##_re; \
10  a##_re -= b##_im*c##_im; \
11  a##_im = b##_re*c##_im; \
12  a##_im += b##_im*c##_re
13 
14 #define COMPLEX_CONJUGATE_PRODUCT(a, b, c) \
15  a##_re = b##_re*c##_re; \
16  a##_re -= b##_im*c##_im; \
17  a##_im = -b##_re*c##_im; \
18  a##_im -= b##_im*c##_re
19 
20 // Performs a complex dot product
21 #define COMPLEX_DOT_PRODUCT(a, b, c) \
22  a##_re = b##_re*c##_re; \
23  a##_re += b##_im*c##_im; \
24  a##_im = b##_re*c##_im; \
25  a##_im -= b##_im*c##_re
26 
27 // Performs a complex norm
28 #define COMPLEX_NORM(a, b) \
29  a = b##_re*b##_re; \
30  a += b##_im*b##_im
31 
32 #define ACC_COMPLEX_PROD(a, b, c) \
33  a##_re += b##_re*c##_re; \
34  a##_re -= b##_im*c##_im; \
35  a##_im += b##_re*c##_im; \
36  a##_im += b##_im*c##_re
37 
38 // Performs the complex conjugated accumulation: a += b* c*
39 #define ACC_CONJ_PROD(a, b, c) \
40  a##_re += b##_re * c##_re; \
41  a##_re -= b##_im * c##_im; \
42  a##_im -= b##_re * c##_im; \
43  a##_im -= b##_im * c##_re
44 
45 #define READ_GAUGE_MATRIX_18_FLOAT2_TEX(G, gauge, dir, idx, stride) \
46  float2 G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
47  float2 G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
48  float2 G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
49  float2 G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
50  float2 G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
51  float2 G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
52  float2 G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
53  float2 G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
54  float2 G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
55 
56 #define READ_GAUGE_MATRIX_18_SHORT2_TEX(G, gauge, dir, idx, stride) \
57  float2 G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
58  float2 G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
59  float2 G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
60  float2 G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
61  float2 G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
62  float2 G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
63  float2 G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
64  float2 G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
65  float2 G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
66 
67 #define READ_GAUGE_MATRIX_12_FLOAT4_TEX(G, gauge, dir, idx, stride) \
68  float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
69  float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
70  float4 G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
71  float4 G##3 = make_float4(0,0,0,0); \
72  float4 G##4 = make_float4(0,0,0,0);
73 
74 #define READ_GAUGE_MATRIX_12_SHORT4_TEX(G, gauge, dir, idx, stride) \
75  float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
76  float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
77  float4 G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
78  float4 G##3 = make_float4(0,0,0,0); \
79  float4 G##4 = make_float4(0,0,0,0);
80 
81 // set A to be last components of G4 (otherwise unused)
82 #define READ_GAUGE_MATRIX_8_FLOAT4_TEX(G, gauge, dir, idx, stride) \
83  float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
84  float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
85  float4 G##2 = make_float4(0,0,0,0); \
86  float4 G##3 = make_float4(0,0,0,0); \
87  float4 G##4 = make_float4(0,0,0,0); \
88  (G##3).z = (G##0).x; \
89  (G##3).w = (G##0).y;
90 
91 #define READ_GAUGE_MATRIX_8_SHORT4_TEX(G, gauge, dir, idx, stride) \
92  float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
93  float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
94  float4 G##2 = make_float4(0,0,0,0); \
95  float4 G##3 = make_float4(0,0,0,0); \
96  float4 G##4 = make_float4(0,0,0,0); \
97  (G##3).z = (G##0).x = M_PI*(G##0).x; \
98  (G##3).w = (G##0).y = M_PI*(G##0).y;
99 
100 #define READ_GAUGE_MATRIX_18_DOUBLE2(G, gauge_, dir, idx, stride) \
101  double2 *gauge = (double2*)gauge_; \
102  double2 G##0 = gauge[idx + ((dir/2)*9+0)*stride]; \
103  double2 G##1 = gauge[idx + ((dir/2)*9+1)*stride]; \
104  double2 G##2 = gauge[idx + ((dir/2)*9+2)*stride]; \
105  double2 G##3 = gauge[idx + ((dir/2)*9+3)*stride]; \
106  double2 G##4 = gauge[idx + ((dir/2)*9+4)*stride]; \
107  double2 G##5 = gauge[idx + ((dir/2)*9+5)*stride]; \
108  double2 G##6 = gauge[idx + ((dir/2)*9+6)*stride]; \
109  double2 G##7 = gauge[idx + ((dir/2)*9+7)*stride]; \
110  double2 G##8 = gauge[idx + ((dir/2)*9+8)*stride]; \
111 
112 #define READ_GAUGE_MATRIX_18_FLOAT2(G, gauge_, dir, idx, stride) \
113  float2 *gauge = (float2*)gauge_; \
114  float2 G##0 = ((float2*)gauge)[idx + ((dir/2)*9+0)*stride]; \
115  float2 G##1 = ((float2*)gauge)[idx + ((dir/2)*9+1)*stride]; \
116  float2 G##2 = ((float2*)gauge)[idx + ((dir/2)*9+2)*stride]; \
117  float2 G##3 = ((float2*)gauge)[idx + ((dir/2)*9+3)*stride]; \
118  float2 G##4 = ((float2*)gauge)[idx + ((dir/2)*9+4)*stride]; \
119  float2 G##5 = ((float2*)gauge)[idx + ((dir/2)*9+5)*stride]; \
120  float2 G##6 = ((float2*)gauge)[idx + ((dir/2)*9+6)*stride]; \
121  float2 G##7 = ((float2*)gauge)[idx + ((dir/2)*9+7)*stride]; \
122  float2 G##8 = ((float2*)gauge)[idx + ((dir/2)*9+8)*stride]; \
123 
124 #define READ_GAUGE_MATRIX_18_SHORT2(G, gauge_, dir, idx, stride) \
125  short2 *gauge = (short2*)gauge_; \
126  float2 G##0 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+0)*stride]); \
127  float2 G##1 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+1)*stride]); \
128  float2 G##2 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+2)*stride]); \
129  float2 G##3 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+3)*stride]); \
130  float2 G##4 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+4)*stride]); \
131  float2 G##5 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+5)*stride]); \
132  float2 G##6 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+6)*stride]); \
133  float2 G##7 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+7)*stride]); \
134  float2 G##8 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+8)*stride]); \
135 
136 #define READ_GAUGE_MATRIX_12_DOUBLE2(G, gauge_, dir, idx, stride) \
137  double2 *gauge = (double2*)gauge_; \
138  double2 G##0 = gauge[idx + ((dir/2)*6+0)*stride]; \
139  double2 G##1 = gauge[idx + ((dir/2)*6+1)*stride]; \
140  double2 G##2 = gauge[idx + ((dir/2)*6+2)*stride]; \
141  double2 G##3 = gauge[idx + ((dir/2)*6+3)*stride]; \
142  double2 G##4 = gauge[idx + ((dir/2)*6+4)*stride]; \
143  double2 G##5 = gauge[idx + ((dir/2)*6+5)*stride]; \
144  double2 G##6 = make_double2(0,0); \
145  double2 G##7 = make_double2(0,0); \
146  double2 G##8 = make_double2(0,0);
147 
148 #define READ_GAUGE_MATRIX_12_FLOAT4(G, gauge_, dir, idx, stride)\
149  float4 *gauge = (float4*)gauge_; \
150  float4 G##0 = gauge[idx + ((dir/2)*3+0)*stride]; \
151  float4 G##1 = gauge[idx + ((dir/2)*3+1)*stride]; \
152  float4 G##2 = gauge[idx + ((dir/2)*3+2)*stride]; \
153  float4 G##3 = make_float4(0,0,0,0); \
154  float4 G##4 = make_float4(0,0,0,0);
155 
156 #define READ_GAUGE_MATRIX_12_SHORT4(G, gauge_, dir, idx, stride) \
157  short4 *gauge = (short4*)gauge_; \
158  float4 G##0 = short42float4(gauge[idx + ((dir/2)*3+0)*stride]); \
159  float4 G##1 = short42float4(gauge[idx + ((dir/2)*3+1)*stride]); \
160  float4 G##2 = short42float4(gauge[idx + ((dir/2)*3+2)*stride]); \
161  float4 G##3 = make_float4(0,0,0,0); \
162  float4 G##4 = make_float4(0,0,0,0);
163 
164 // set A to be last components of G4 (otherwise unused)
165 #define READ_GAUGE_MATRIX_8_DOUBLE2(G, gauge_, dir, idx, stride) \
166  double2 *gauge = (double2*)gauge_; \
167  double2 G##0 = gauge[idx + ((dir/2)*4+0)*stride]; \
168  double2 G##1 = gauge[idx + ((dir/2)*4+1)*stride]; \
169  double2 G##2 = gauge[idx + ((dir/2)*4+2)*stride]; \
170  double2 G##3 = gauge[idx + ((dir/2)*4+3)*stride]; \
171  double2 G##4 = make_double2(0,0); \
172  double2 G##5 = make_double2(0,0); \
173  double2 G##6 = make_double2(0,0); \
174  double2 G##7 = make_double2(0,0); \
175  double2 G##8 = make_double2(0,0); \
176  (G##7).x = (G##0).x; \
177  (G##7).y = (G##0).y;
178 
179 // set A to be last components of G4 (otherwise unused)
180 #define READ_GAUGE_MATRIX_8_FLOAT4(G, gauge_, dir, idx, stride) \
181  float4 *gauge = (float4*)gauge_; \
182  float4 G##0 = gauge[idx + ((dir/2)*2+0)*stride]; \
183  float4 G##1 = gauge[idx + ((dir/2)*2+1)*stride]; \
184  float4 G##2 = make_float4(0,0,0,0); \
185  float4 G##3 = make_float4(0,0,0,0); \
186  float4 G##4 = make_float4(0,0,0,0); \
187  (G##3).z = (G##0).x; \
188  (G##3).w = (G##0).y;
189 
190 #define READ_GAUGE_MATRIX_8_SHORT4(G, gauge_, dir, idx, stride) \
191  short4 *gauge = (short4*)gauge_; \
192  float4 G##0 = short42float4(gauge[idx + ((dir/2)*2+0)*stride]); \
193  float4 G##1 = short42float4(gauge[idx + ((dir/2)*2+1)*stride]); \
194  float4 G##2 = make_float4(0,0,0,0); \
195  float4 G##3 = make_float4(0,0,0,0); \
196  float4 G##4 = make_float4(0,0,0,0); \
197  (G##3).z = (G##0).x = M_PI*(G##0).x; \
198  (G##3).w = (G##0).y = M_PI*(G##0).y;
199 
200 
201 #define READ_GAUGE_PHASE_DOUBLE(P, phase, dir, idx, stride){ \
202  P = 2.*M_PI*((double*)phase)[idx + (dir/2)*stride]; \
203 }
204 
205 #define READ_GAUGE_PHASE_FLOAT(P, phase, dir, idx, stride){ \
206  P = 2.f*M_PI*((float*)phase)[idx + (dir/2)*stride]; \
207 }
208 
209 #define READ_GAUGE_PHASE_SHORT(P, phase, dir, idx, stride){ \
210  P = 2.f*M_PI*short2float(((short*)phase)[idx + (dir/2)*stride]); \
211 }
212 
215 #define ASSN_GAUGE_MATRIX_18_FLOAT2_TEX(G, gauge, dir, idx, stride) \
216  G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
217  G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
218  G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
219  G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
220  G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
221  G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
222  G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
223  G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
224  G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
225 
226 #define ASSN_GAUGE_MATRIX_18_SHORT2_TEX(G, gauge, dir, idx, stride) \
227  G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
228  G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
229  G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
230  G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
231  G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
232  G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
233  G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
234  G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
235  G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
236 
237 #define ASSN_GAUGE_MATRIX_12_FLOAT4_TEX(G, gauge, dir, idx, stride) \
238  G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
239  G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
240  G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
241  G##3 = make_float4(0,0,0,0); \
242  G##4 = make_float4(0,0,0,0);
243 
244 #define ASSN_GAUGE_MATRIX_12_SHORT4_TEX(G, gauge, dir, idx, stride) \
245  G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
246  G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
247  G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
248  G##3 = make_float4(0,0,0,0); \
249  G##4 = make_float4(0,0,0,0);
250 
251 // set A to be last components of G4 (otherwise unused)
252 #define ASSN_GAUGE_MATRIX_8_FLOAT4_TEX(G, gauge, dir, idx, stride) \
253  G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
254  G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
255  G##2 = make_float4(0,0,0,0); \
256  G##3 = make_float4(0,0,0,0); \
257  G##4 = make_float4(0,0,0,0); \
258  (G##3).z = (G##0).x; \
259  (G##3).w = (G##0).y;
260 
261 #define ASSN_GAUGE_MATRIX_8_SHORT4_TEX(G, gauge, dir, idx, stride) \
262  G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
263  G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
264  G##2 = make_float4(0,0,0,0); \
265  G##3 = make_float4(0,0,0,0); \
266  G##4 = make_float4(0,0,0,0); \
267  (G##3).z = (G##0).x = M_PI*(G##0).x; \
268  (G##3).w = (G##0).y = M_PI*(G##0).y;
269 
270 #define ASSN_GAUGE_MATRIX_18_DOUBLE2(G, gauge_, dir, idx, stride) \
271  double2 *gauge = (double2*)gauge_; \
272  G##0 = gauge[idx + ((dir/2)*9+0)*stride]; \
273  G##1 = gauge[idx + ((dir/2)*9+1)*stride]; \
274  G##2 = gauge[idx + ((dir/2)*9+2)*stride]; \
275  G##3 = gauge[idx + ((dir/2)*9+3)*stride]; \
276  G##4 = gauge[idx + ((dir/2)*9+4)*stride]; \
277  G##5 = gauge[idx + ((dir/2)*9+5)*stride]; \
278  G##6 = gauge[idx + ((dir/2)*9+6)*stride]; \
279  G##7 = gauge[idx + ((dir/2)*9+7)*stride]; \
280  G##8 = gauge[idx + ((dir/2)*9+8)*stride]; \
281 
282 #define ASSN_GAUGE_MATRIX_18_FLOAT2(G, gauge_, dir, idx, stride) \
283  float2 *gauge = (float2*)gauge_; \
284  G##0 = ((float2*)gauge)[idx + ((dir/2)*9+0)*stride]; \
285  G##1 = ((float2*)gauge)[idx + ((dir/2)*9+1)*stride]; \
286  G##2 = ((float2*)gauge)[idx + ((dir/2)*9+2)*stride]; \
287  G##3 = ((float2*)gauge)[idx + ((dir/2)*9+3)*stride]; \
288  G##4 = ((float2*)gauge)[idx + ((dir/2)*9+4)*stride]; \
289  G##5 = ((float2*)gauge)[idx + ((dir/2)*9+5)*stride]; \
290  G##6 = ((float2*)gauge)[idx + ((dir/2)*9+6)*stride]; \
291  G##7 = ((float2*)gauge)[idx + ((dir/2)*9+7)*stride]; \
292  G##8 = ((float2*)gauge)[idx + ((dir/2)*9+8)*stride]; \
293 
294 #define ASSN_GAUGE_MATRIX_18_SHORT2(G, gauge_, dir, idx, stride) \
295  short2 *gauge = (short2*)gauge_; \
296  G##0 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+0)*stride]); \
297  G##1 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+1)*stride]); \
298  G##2 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+2)*stride]); \
299  G##3 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+3)*stride]); \
300  G##4 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+4)*stride]); \
301  G##5 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+5)*stride]); \
302  G##6 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+6)*stride]); \
303  G##7 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+7)*stride]); \
304  G##8 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+8)*stride]); \
305 
306 #define ASSN_GAUGE_MATRIX_12_DOUBLE2(G, gauge_, dir, idx, stride) \
307  double2 *gauge = (double2*)gauge_; \
308  G##0 = gauge[idx + ((dir/2)*6+0)*stride]; \
309  G##1 = gauge[idx + ((dir/2)*6+1)*stride]; \
310  G##2 = gauge[idx + ((dir/2)*6+2)*stride]; \
311  G##3 = gauge[idx + ((dir/2)*6+3)*stride]; \
312  G##4 = gauge[idx + ((dir/2)*6+4)*stride]; \
313  G##5 = gauge[idx + ((dir/2)*6+5)*stride]; \
314  G##6 = make_double2(0,0); \
315  G##7 = make_double2(0,0); \
316  G##8 = make_double2(0,0);
317 
318 #define ASSN_GAUGE_MATRIX_12_FLOAT4(G, gauge_, dir, idx, stride) \
319  float4 *gauge = (float4*)gauge_; \
320  G##0 = gauge[idx + ((dir/2)*3+0)*stride]; \
321  G##1 = gauge[idx + ((dir/2)*3+1)*stride]; \
322  G##2 = gauge[idx + ((dir/2)*3+2)*stride]; \
323  G##3 = make_float4(0,0,0,0); \
324  G##4 = make_float4(0,0,0,0);
325 
326 #define ASSN_GAUGE_MATRIX_12_SHORT4(G, gauge_, dir, idx, stride) \
327  short4 *gauge = (short4*)gauge_; \
328  G##0 = short42float4(gauge[idx + ((dir/2)*3+0)*stride]); \
329  G##1 = short42float4(gauge[idx + ((dir/2)*3+1)*stride]); \
330  G##2 = short42float4(gauge[idx + ((dir/2)*3+2)*stride]); \
331  G##3 = make_float4(0,0,0,0); \
332  G##4 = make_float4(0,0,0,0);
333 
334 // set A to be last components of G4 (otherwise unused)
335 #define ASSN_GAUGE_MATRIX_8_DOUBLE2(G, gauge_, dir, idx, stride) \
336  double2 *gauge = (double2*)gauge_; \
337  G##0 = gauge[idx + ((dir/2)*4+0)*stride]; \
338  G##1 = gauge[idx + ((dir/2)*4+1)*stride]; \
339  G##2 = gauge[idx + ((dir/2)*4+2)*stride]; \
340  G##3 = gauge[idx + ((dir/2)*4+3)*stride]; \
341  G##4 = make_double2(0,0); \
342  G##5 = make_double2(0,0); \
343  G##6 = make_double2(0,0); \
344  G##7 = make_double2(0,0); \
345  G##8 = make_double2(0,0); \
346  (G##7).x = (G##0).x; \
347  (G##7).y = (G##0).y;
348 
349 // set A to be last components of G4 (otherwise unused)
350 #define ASSN_GAUGE_MATRIX_8_FLOAT4(G, gauge_, dir, idx, stride) \
351  float4 *gauge = (float4*)gauge_; \
352  G##0 = gauge[idx + ((dir/2)*2+0)*stride]; \
353  G##1 = gauge[idx + ((dir/2)*2+1)*stride]; \
354  G##2 = make_float4(0,0,0,0); \
355  G##3 = make_float4(0,0,0,0); \
356  G##4 = make_float4(0,0,0,0); \
357  (G##3).z = (G##0).x; \
358  (G##3).w = (G##0).y;
359 
360 #define ASSN_GAUGE_MATRIX_8_SHORT4(G, gauge_, dir, idx, stride) \
361  short4 *gauge = (short4*)gauge_; \
362  G##0 = short42float4(gauge[idx + ((dir/2)*2+0)*stride]); \
363  G##1 = short42float4(gauge[idx + ((dir/2)*2+1)*stride]); \
364  G##2 = make_float4(0,0,0,0); \
365  G##3 = make_float4(0,0,0,0); \
366  G##4 = make_float4(0,0,0,0); \
367  (G##3).z = (G##0).x = M_PI*(G##0).x; \
368  (G##3).w = (G##0).y = M_PI*(G##0).y;
369 
370 
371 /*----END--*/
372 
373 #define RESCALE2(G, max) \
374  (G##0).x *= max; (G##0).y *= max; (G##1).x *= max; (G##1).y *= max; \
375  (G##2).x *= max; (G##2).y *= max; (G##3).x *= max; (G##3).y *= max; \
376  (G##4).x *= max; (G##4).y *= max; (G##5).x *= max; (G##5).y *= max; \
377  (G##6).x *= max; (G##6).y *= max; (G##7).x *= max; (G##7).y *= max; \
378  (G##8).x *= max; (G##8).y *= max;
379 
380 #define RESCALE4(G, max) \
381  (G##0).x *= max; (G##0).y *= max; (G##0).z *= max; (G##0).w *= max; \
382  (G##1).x *= max; (G##1).y *= max; (G##1).z *= max; (G##1).w *= max; \
383  (G##2).x *= max; (G##2).y *= max; (G##2).z *= max; (G##2).w *= max; \
384  (G##3).x *= max; (G##3).y *= max; (G##3).z *= max; (G##3).w *= max; \
385  (G##4).x *= max; (G##4).y *= max;
386 
387 
388 
389 
390 // FIXME: merge staggered and Wilson reconstruct macros
391 
392 #define RECONSTRUCT_MATRIX_18_DOUBLE(dir)
393 #define RECONSTRUCT_MATRIX_18_SINGLE(dir)
394 
395 #ifndef MULTI_GPU
396 #define do_boundary ga_idx >= param.dc.X4X3X2X1hmX3X2X1h
397 #else
398 #define do_boundary ( (param.Pt0 && (ga_idx >= param.dc.Vh)) || ( param.PtNm1 && (ga_idx >= param.dc.X4X3X2X1hmX3X2X1h) && (ga_idx < param.dc.Vh) ) )
399 #endif
400 
401 #define RECONSTRUCT_MATRIX_12_DOUBLE(dir) \
402  ACC_CONJ_PROD(g20, +g01, +g12); \
403  ACC_CONJ_PROD(g20, -g02, +g11); \
404  ACC_CONJ_PROD(g21, +g02, +g10); \
405  ACC_CONJ_PROD(g21, -g00, +g12); \
406  ACC_CONJ_PROD(g22, +g00, +g11); \
407  ACC_CONJ_PROD(g22, -g01, +g10); \
408  double u0 = (dir < 6 ? param.anisotropy : (do_boundary ? param.t_boundary : 1)); \
409  G6.x*=u0; G6.y*=u0; G7.x*=u0; G7.y*=u0; G8.x*=u0; G8.y*=u0;
410 
411 #define RECONSTRUCT_MATRIX_12_SINGLE(dir) \
412  ACC_CONJ_PROD(g20, +g01, +g12); \
413  ACC_CONJ_PROD(g20, -g02, +g11); \
414  ACC_CONJ_PROD(g21, +g02, +g10); \
415  ACC_CONJ_PROD(g21, -g00, +g12); \
416  ACC_CONJ_PROD(g22, +g00, +g11); \
417  ACC_CONJ_PROD(g22, -g01, +g10); \
418  float u0 = (dir < 6 ? param.anisotropy_f : (do_boundary ? param.t_boundary_f : 1)); \
419  G3.x*=u0; G3.y*=u0; G3.z*=u0; G3.w*=u0; G4.x*=u0; G4.y*=u0;
420 
421 #define RECONSTRUCT_MATRIX_8_DOUBLE(dir) \
422  double row_sum = g01_re*g01_re; \
423  row_sum += g01_im*g01_im; \
424  row_sum += g02_re*g02_re; \
425  row_sum += g02_im*g02_im; \
426  double u0 = (dir < 6 ? param.anisotropy : (do_boundary ? param.t_boundary : 1)); \
427  double u02_inv = 1.0 / (u0*u0); \
428  double column_sum = u02_inv - row_sum; \
429  double U00_mag = sqrt((column_sum > 0 ? column_sum : 0)); \
430  sincos(g21_re, &g00_im, &g00_re); \
431  g00_re *= U00_mag; \
432  g00_im *= U00_mag; \
433  column_sum += g10_re*g10_re; \
434  column_sum += g10_im*g10_im; \
435  sincos(g21_im, &g20_im, &g20_re); \
436  double U20_mag = sqrt(((u02_inv - column_sum) > 0 ? (u02_inv-column_sum) : 0)); \
437  g20_re *= U20_mag; \
438  g20_im *= U20_mag; \
439  double r_inv2 = 1.0 / (u0*row_sum); \
440  double A_re, A_im; \
441  COMPLEX_DOT_PRODUCT(A, g00, g10); \
442  A_re *= u0; A_im *= u0; \
443  COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \
444  ACC_COMPLEX_PROD(g11, A, g01); \
445  g11_re *= -r_inv2; \
446  g11_im *= -r_inv2; \
447  COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \
448  ACC_COMPLEX_PROD(g12, -A, g02); \
449  g12_re *= r_inv2; \
450  g12_im *= r_inv2; \
451  COMPLEX_DOT_PRODUCT(A, g00, g20); \
452  A_re *= u0; A_im *= u0; \
453  COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \
454  ACC_COMPLEX_PROD(g21, -A, g01); \
455  g21_re *= r_inv2; \
456  g21_im *= r_inv2; \
457  COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \
458  ACC_COMPLEX_PROD(g22, A, g02); \
459  g22_re *= -r_inv2; \
460  g22_im *= -r_inv2;
461 
462  #define RECONSTRUCT_MATRIX_8_SINGLE(dir) \
463  float row_sum = g01_re*g01_re; \
464  row_sum += g01_im*g01_im; \
465  row_sum += g02_re*g02_re; \
466  row_sum += g02_im*g02_im; \
467  __sincosf(g21_re, &g00_im, &g00_re); \
468  __sincosf(g21_im, &g20_im, &g20_re); \
469  float2 u0_2 = (dir < 6 ? param.An2 : (do_boundary ? param.TB2 : param.No2)); \
470  float column_sum = u0_2.y - row_sum; \
471  float U00_mag = column_sum * rsqrtf((column_sum > 0 ? column_sum : 1e14)); \
472  g00_re *= U00_mag; \
473  g00_im *= U00_mag; \
474  column_sum += g10_re*g10_re; \
475  column_sum += g10_im*g10_im; \
476  column_sum = u0_2.y - column_sum; \
477  float U20_mag = column_sum * rsqrtf((column_sum > 0 ? column_sum : 1e14)); \
478  g20_re *= U20_mag; \
479  g20_im *= U20_mag; \
480  float r_inv2 = __fdividef(1.0f, u0_2.x*row_sum); \
481  float A_re, A_im; \
482  COMPLEX_DOT_PRODUCT(A, g00, g10); \
483  A_re *= u0_2.x; A_im *= u0_2.x; \
484  COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \
485  ACC_COMPLEX_PROD(g11, A, g01); \
486  g11_re *= -r_inv2; \
487  g11_im *= -r_inv2; \
488  COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \
489  ACC_COMPLEX_PROD(g12, -A, g02); \
490  g12_re *= r_inv2; \
491  g12_im *= r_inv2; \
492  COMPLEX_DOT_PRODUCT(A, g00, g20); \
493  A_re *= u0_2.x; A_im *= u0_2.x; \
494  COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \
495  ACC_COMPLEX_PROD(g21, -A, g01); \
496  g21_re *= r_inv2; \
497  g21_im *= r_inv2; \
498  COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \
499  ACC_COMPLEX_PROD(g22, A, g02); \
500  g22_re *= -r_inv2; \
501  g22_im *= -r_inv2;
502 
503 
504 
505 /************* the following is added for staggered *********/
506 
507 #define RECONSTRUCT_GAUGE_MATRIX_12_SINGLE(dir, gauge, idx, sign) \
508  ACC_CONJ_PROD(gauge##20, +gauge##01, +gauge##12); \
509  ACC_CONJ_PROD(gauge##20, -gauge##02, +gauge##11); \
510  ACC_CONJ_PROD(gauge##21, +gauge##02, +gauge##10); \
511  ACC_CONJ_PROD(gauge##21, -gauge##00, +gauge##12); \
512  ACC_CONJ_PROD(gauge##22, +gauge##00, +gauge##11); \
513  ACC_CONJ_PROD(gauge##22, -gauge##01, +gauge##10); \
514  {float u0 = param.coeff_f*sign; \
515  gauge##20_re *=u0;gauge##20_im *=u0; gauge##21_re *=u0; gauge##21_im *=u0; \
516  gauge##22_re *=u0;gauge##22_im *=u0;}
517 
518 #define RECONSTRUCT_GAUGE_MATRIX_12_DOUBLE(dir, gauge, idx, sign) \
519  ACC_CONJ_PROD(gauge##20, +gauge##01, +gauge##12); \
520  ACC_CONJ_PROD(gauge##20, -gauge##02, +gauge##11); \
521  ACC_CONJ_PROD(gauge##21, +gauge##02, +gauge##10); \
522  ACC_CONJ_PROD(gauge##21, -gauge##00, +gauge##12); \
523  ACC_CONJ_PROD(gauge##22, +gauge##00, +gauge##11); \
524  ACC_CONJ_PROD(gauge##22, -gauge##01, +gauge##10); \
525  {double u0 = param.coeff* sign; \
526  gauge##20_re *=u0;gauge##20_im *=u0; gauge##21_re *=u0; gauge##21_im *=u0; \
527  gauge##22_re *=u0;gauge##22_im *=u0;}
528 
529 
530 #define RECONSTRUCT_GAUGE_MATRIX_13_SINGLE(dir, gauge, idx, sign) { \
531  RECONSTRUCT_GAUGE_MATRIX_12_SINGLE(dir, gauge, idx, sign) \
532  float exp_i3phase_re, exp_i3phase_im; \
533  sincosf(3.f*PHASE, &exp_i3phase_im, &exp_i3phase_re); \
534  float A_re, A_im; \
535  COMPLEX_PRODUCT(A, exp_i3phase, gauge##20); \
536  gauge##20_re = A_re; \
537  gauge##20_im = A_im; \
538  COMPLEX_PRODUCT(A, exp_i3phase, gauge##21); \
539  gauge##21_re = A_re; \
540  gauge##21_im = A_im; \
541  COMPLEX_PRODUCT(A, exp_i3phase, gauge##22); \
542  gauge##22_re = A_re; \
543  gauge##22_im = A_im; \
544 }
545 
546 
547 #define RECONSTRUCT_GAUGE_MATRIX_13_DOUBLE(dir, gauge, idx, sign) { \
548  RECONSTRUCT_GAUGE_MATRIX_12_DOUBLE(dir, gauge, idx, sign) \
549  double exp_i3phase_re, exp_i3phase_im; \
550  sincos(3.*PHASE, &exp_i3phase_im, &exp_i3phase_re); \
551  double A_re, A_im; \
552  COMPLEX_PRODUCT(A, exp_i3phase, gauge##20); \
553  gauge##20_re = A_re; \
554  gauge##20_im = A_im; \
555  COMPLEX_PRODUCT(A, exp_i3phase, gauge##21); \
556  gauge##21_re = A_re; \
557  gauge##21_im = A_im; \
558  COMPLEX_PRODUCT(A, exp_i3phase, gauge##22); \
559  gauge##22_re = A_re; \
560  gauge##22_im = A_im; \
561 }
562 
563 
564 
565 
566 #define RECONSTRUCT_GAUGE_MATRIX_8_DOUBLE(dir, gauge, idx, sign) \
567  double row_sum = gauge##01_re*gauge##01_re + gauge##01_im*gauge##01_im; \
568  row_sum += gauge##02_re*gauge##02_re + gauge##02_im*gauge##02_im; \
569  double u0 = param.coeff*sign; \
570  double u02_inv = 1.0 / (u0*u0); \
571  double column_sum = u02_inv - row_sum; \
572  double U00_mag = sqrt(column_sum); \
573  sincos(gauge##21_re, &gauge##00_im, &gauge##00_re); \
574  gauge##00_re *= U00_mag; \
575  gauge##00_im *= U00_mag; \
576  column_sum += gauge##10_re*gauge##10_re; \
577  column_sum += gauge##10_im*gauge##10_im; \
578  sincos(gauge##21_im, &gauge##20_im, &gauge##20_re); \
579  double U20_mag = sqrt(u02_inv - column_sum); \
580  gauge##20_re *= U20_mag; \
581  gauge##20_im *= U20_mag; \
582  double r_inv2 = 1.0 / (u0*row_sum); \
583  double A_re, A_im; \
584  COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##10); \
585  A_re *= u0; A_im *= u0; \
586  COMPLEX_CONJUGATE_PRODUCT(gauge##11, gauge##20, gauge##02); \
587  ACC_COMPLEX_PROD(gauge##11, A, gauge##01); \
588  gauge##11_re *= -r_inv2; \
589  gauge##11_im *= -r_inv2; \
590  COMPLEX_CONJUGATE_PRODUCT(gauge##12, gauge##20, gauge##01); \
591  ACC_COMPLEX_PROD(gauge##12, -A, gauge##02); \
592  gauge##12_re *= r_inv2; \
593  gauge##12_im *= r_inv2; \
594  COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##20); \
595  A_re *= u0; A_im *= u0; \
596  COMPLEX_CONJUGATE_PRODUCT(gauge##21, gauge##10, gauge##02); \
597  ACC_COMPLEX_PROD(gauge##21, -A, gauge##01); \
598  gauge##21_re *= r_inv2; \
599  gauge##21_im *= r_inv2; \
600  COMPLEX_CONJUGATE_PRODUCT(gauge##22, gauge##10, gauge##01); \
601  ACC_COMPLEX_PROD(gauge##22, A, gauge##02); \
602  gauge##22_re *= -r_inv2; \
603  gauge##22_im *= -r_inv2;
604 
605 #define RECONSTRUCT_GAUGE_MATRIX_8_SINGLE(dir, gauge, idx, sign) { \
606  float row_sum = gauge##01_re*gauge##01_re + gauge##01_im*gauge##01_im; \
607  row_sum += gauge##02_re*gauge##02_re + gauge##02_im*gauge##02_im; \
608  float u0 = param.coeff_f*sign; \
609  float u02_inv = __fdividef(1.f, u0*u0); \
610  float column_sum = u02_inv - row_sum; \
611  float U00_mag = sqrtf(column_sum > 0 ?column_sum:0); \
612  __sincosf(gauge##21_re, &gauge##00_im, &gauge##00_re); \
613  gauge##00_re *= U00_mag; \
614  gauge##00_im *= U00_mag; \
615  column_sum += gauge##10_re*gauge##10_re; \
616  column_sum += gauge##10_im*gauge##10_im; \
617  __sincosf(gauge##21_im, &gauge##20_im, &gauge##20_re); \
618  float U20_mag = sqrtf( (u02_inv - column_sum)>0? (u02_inv - column_sum): 0); \
619  gauge##20_re *= U20_mag; \
620  gauge##20_im *= U20_mag; \
621  float r_inv2 = __fdividef(1.0f, u0*row_sum); \
622  float A_re, A_im; \
623  COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##10); \
624  A_re *= u0; A_im *= u0; \
625  COMPLEX_CONJUGATE_PRODUCT(gauge##11, gauge##20, gauge##02); \
626  ACC_COMPLEX_PROD(gauge##11, A, gauge##01); \
627  gauge##11_re *= -r_inv2; \
628  gauge##11_im *= -r_inv2; \
629  COMPLEX_CONJUGATE_PRODUCT(gauge##12, gauge##20, gauge##01); \
630  ACC_COMPLEX_PROD(gauge##12, -A, gauge##02); \
631  gauge##12_re *= r_inv2; \
632  gauge##12_im *= r_inv2; \
633  COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##20); \
634  A_re *= u0; A_im *= u0; \
635  COMPLEX_CONJUGATE_PRODUCT(gauge##21, gauge##10, gauge##02); \
636  ACC_COMPLEX_PROD(gauge##21, -A, gauge##01); \
637  gauge##21_re *= r_inv2; \
638  gauge##21_im *= r_inv2; \
639  COMPLEX_CONJUGATE_PRODUCT(gauge##22, gauge##10, gauge##01); \
640  ACC_COMPLEX_PROD(gauge##22, A, gauge##02); \
641  gauge##22_re *= -r_inv2; \
642  gauge##22_im *= -r_inv2;}
643 
644 
645 
646 #define RECONSTRUCT_GAUGE_MATRIX_9_SINGLE(dir, gauge, idx, sign) { \
647  RECONSTRUCT_GAUGE_MATRIX_8_SINGLE(dir, gauge, idx, sign) \
648  float exp_iphase_re, exp_iphase_im; \
649  __sincosf(PHASE, &exp_iphase_im, &exp_iphase_re); \
650  float B_re, B_im; \
651  COMPLEX_PRODUCT(B, exp_iphase, gauge##00); \
652  gauge##00_re = B_re; \
653  gauge##00_im = B_im; \
654  COMPLEX_PRODUCT(B, exp_iphase, gauge##01); \
655  gauge##01_re = B_re; \
656  gauge##01_im = B_im; \
657  COMPLEX_PRODUCT(B, exp_iphase, gauge##02); \
658  gauge##02_re = B_re; \
659  gauge##02_im = B_im; \
660  COMPLEX_PRODUCT(B, exp_iphase, gauge##10); \
661  gauge##10_re = B_re; \
662  gauge##10_im = B_im; \
663  COMPLEX_PRODUCT(B, exp_iphase, gauge##11); \
664  gauge##11_re = B_re; \
665  gauge##11_im = B_im; \
666  COMPLEX_PRODUCT(B, exp_iphase, gauge##12); \
667  gauge##12_re = B_re; \
668  gauge##12_im = B_im; \
669  COMPLEX_PRODUCT(B, exp_iphase, gauge##20); \
670  gauge##20_re = B_re; \
671  gauge##20_im = B_im; \
672  COMPLEX_PRODUCT(B, exp_iphase, gauge##21); \
673  gauge##21_re = B_re; \
674  gauge##21_im = B_im; \
675  COMPLEX_PRODUCT(B, exp_iphase, gauge##22); \
676  gauge##22_re = B_re; \
677  gauge##22_im = B_im; \
678 }
679 
680 
681 #define RECONSTRUCT_GAUGE_MATRIX_9_DOUBLE(dir, gauge, idx, sign) { \
682  RECONSTRUCT_GAUGE_MATRIX_8_DOUBLE(dir, gauge, idx, sign) \
683  double exp_iphase_re, exp_iphase_im; \
684  sincos(PHASE, &exp_iphase_im, &exp_iphase_re); \
685  double B_re, B_im; \
686  COMPLEX_PRODUCT(B, exp_iphase, gauge##00); \
687  gauge##00_re = B_re; \
688  gauge##00_im = B_im; \
689  COMPLEX_PRODUCT(B, exp_iphase, gauge##01); \
690  gauge##01_re = B_re; \
691  gauge##01_im = B_im; \
692  COMPLEX_PRODUCT(B, exp_iphase, gauge##02); \
693  gauge##02_re = B_re; \
694  gauge##02_im = B_im; \
695  COMPLEX_PRODUCT(B, exp_iphase, gauge##10); \
696  gauge##10_re = B_re; \
697  gauge##10_im = B_im; \
698  COMPLEX_PRODUCT(B, exp_iphase, gauge##11); \
699  gauge##11_re = B_re; \
700  gauge##11_im = B_im; \
701  COMPLEX_PRODUCT(B, exp_iphase, gauge##12); \
702  gauge##12_re = B_re; \
703  gauge##12_im = B_im; \
704  COMPLEX_PRODUCT(B, exp_iphase, gauge##20); \
705  gauge##20_re = B_re; \
706  gauge##20_im = B_im; \
707  COMPLEX_PRODUCT(B, exp_iphase, gauge##21); \
708  gauge##21_re = B_re; \
709  gauge##21_im = B_im; \
710  COMPLEX_PRODUCT(B, exp_iphase, gauge##22); \
711  gauge##22_re = B_re; \
712  gauge##22_im = B_im; \
713 }
714 
715 
716 
717 
718 // Fermi patch to disable double-precision texture reads
719 #ifdef FERMI_NO_DBLE_TEX
720 #define READ_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
721  READ_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride)
722 #define READ_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
723  READ_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride)
724 #define READ_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
725  READ_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride)
726 #define READ_GAUGE_PHASE_DOUBLE_TEX(P, phase, dir, idx, stride) \
727  READ_GAUGE_PHASE_DOUBLE(P, phase, dir, idx, stride)
728 
731 #define ASSN_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
732  ASSN_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride)
733 #define ASSN_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
734  ASSN_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride)
735 #define ASSN_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
736  ASSN_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride)
737 
738 #else
739 
740 #define READ_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
741  double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*9+0)*stride); \
742  double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*9+1)*stride); \
743  double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*9+2)*stride); \
744  double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*9+3)*stride); \
745  double2 G##4 = fetch_double2((gauge), idx + ((dir/2)*9+4)*stride); \
746  double2 G##5 = fetch_double2((gauge), idx + ((dir/2)*9+5)*stride); \
747  double2 G##6 = fetch_double2((gauge), idx + ((dir/2)*9+6)*stride); \
748  double2 G##7 = fetch_double2((gauge), idx + ((dir/2)*9+7)*stride); \
749  double2 G##8 = fetch_double2((gauge), idx + ((dir/2)*9+8)*stride); \
750 
751 #define READ_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
752  double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*6+0)*stride); \
753  double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*6+1)*stride); \
754  double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*6+2)*stride); \
755  double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*6+3)*stride); \
756  double2 G##4 = fetch_double2((gauge), idx + ((dir/2)*6+4)*stride); \
757  double2 G##5 = fetch_double2((gauge), idx + ((dir/2)*6+5)*stride); \
758  double2 G##6 = make_double2(0,0); \
759  double2 G##7 = make_double2(0,0); \
760  double2 G##8 = make_double2(0,0); \
761 
762 // set A to be last components of G4 (otherwise unused)
763 #define READ_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
764  double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*4+0)*stride); \
765  double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*4+1)*stride); \
766  double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*4+2)*stride); \
767  double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*4+3)*stride); \
768  double2 G##4 = make_double2(0,0); \
769  double2 G##5 = make_double2(0,0); \
770  double2 G##6 = make_double2(0,0); \
771  double2 G##7 = make_double2(0,0); \
772  double2 G##8 = make_double2(0,0); \
773  (G##7).x = (G##0).x; \
774  (G##7).y = (G##0).y;
775 
776 
777 
778 
779 #define READ_GAUGE_PHASE_FLOAT_TEX(P, phase, dir, idx, stride) { \
780  P = 2.f*M_PI*TEX1DFETCH(float, (phase), idx + (dir/2)*stride); \
781 }
782 
783 #define READ_GAUGE_PHASE_SHORT_TEX(P, phase, dir, idx, stride) READ_GAUGE_PHASE_FLOAT_TEX(P, phase, dir, idx, stride)
784 
785 #define READ_GAUGE_PHASE_DOUBLE_TEX(P, phase, dir, idx, stride) { \
786  P = 2*M_PI*fetch_double((phase), idx + (dir/2)*stride); \
787 }
788 
791 #define ASSN_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
792  G##0 = fetch_double2((gauge), idx + ((dir/2)*9+0)*stride); \
793  G##1 = fetch_double2((gauge), idx + ((dir/2)*9+1)*stride); \
794  G##2 = fetch_double2((gauge), idx + ((dir/2)*9+2)*stride); \
795  G##3 = fetch_double2((gauge), idx + ((dir/2)*9+3)*stride); \
796  G##4 = fetch_double2((gauge), idx + ((dir/2)*9+4)*stride); \
797  G##5 = fetch_double2((gauge), idx + ((dir/2)*9+5)*stride); \
798  G##6 = fetch_double2((gauge), idx + ((dir/2)*9+6)*stride); \
799  G##7 = fetch_double2((gauge), idx + ((dir/2)*9+7)*stride); \
800  G##8 = fetch_double2((gauge), idx + ((dir/2)*9+8)*stride); \
801 
802 #define ASSN_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
803  G##0 = fetch_double2((gauge), idx + ((dir/2)*6+0)*stride); \
804  G##1 = fetch_double2((gauge), idx + ((dir/2)*6+1)*stride); \
805  G##2 = fetch_double2((gauge), idx + ((dir/2)*6+2)*stride); \
806  G##3 = fetch_double2((gauge), idx + ((dir/2)*6+3)*stride); \
807  G##4 = fetch_double2((gauge), idx + ((dir/2)*6+4)*stride); \
808  G##5 = fetch_double2((gauge), idx + ((dir/2)*6+5)*stride); \
809  G##6 = make_double2(0,0); \
810  G##7 = make_double2(0,0); \
811  G##8 = make_double2(0,0); \
812 
813 // set A to be last components of G4 (otherwise unused)
814 #define ASSN_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
815  G##0 = fetch_double2((gauge), idx + ((dir/2)*4+0)*stride); \
816  G##1 = fetch_double2((gauge), idx + ((dir/2)*4+1)*stride); \
817  G##2 = fetch_double2((gauge), idx + ((dir/2)*4+2)*stride); \
818  G##3 = fetch_double2((gauge), idx + ((dir/2)*4+3)*stride); \
819  G##4 = make_double2(0,0); \
820  G##5 = make_double2(0,0); \
821  G##6 = make_double2(0,0); \
822  G##7 = make_double2(0,0); \
823  G##8 = make_double2(0,0); \
824  (G##7).x = (G##0).x; \
825  (G##7).y = (G##0).y;
826 
827 
828 
829 #endif
830