QUDA
v0.7.0
A library for QCD on GPUs
Main Page
Namespaces
Classes
Files
File List
File Members
•
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Pages
quda
lib
read_gauge.h
Go to the documentation of this file.
1
//#include <half_quda.h>
2
3
// Performs complex addition
4
#define COMPLEX_ADD_TO(a, b) \
5
a##_re += b##_re, \
6
a##_im += b##_im
7
8
#define COMPLEX_PRODUCT(a, b, c) \
9
a##_re = b##_re*c##_re; \
10
a##_re -= b##_im*c##_im; \
11
a##_im = b##_re*c##_im; \
12
a##_im += b##_im*c##_re
13
14
#define COMPLEX_CONJUGATE_PRODUCT(a, b, c) \
15
a##_re = b##_re*c##_re; \
16
a##_re -= b##_im*c##_im; \
17
a##_im = -b##_re*c##_im; \
18
a##_im -= b##_im*c##_re
19
20
// Performs a complex dot product
21
#define COMPLEX_DOT_PRODUCT(a, b, c) \
22
a##_re = b##_re*c##_re; \
23
a##_re += b##_im*c##_im; \
24
a##_im = b##_re*c##_im; \
25
a##_im -= b##_im*c##_re
26
27
// Performs a complex norm
28
#define COMPLEX_NORM(a, b) \
29
a = b##_re*b##_re; \
30
a += b##_im*b##_im
31
32
#define ACC_COMPLEX_PROD(a, b, c) \
33
a##_re += b##_re*c##_re; \
34
a##_re -= b##_im*c##_im; \
35
a##_im += b##_re*c##_im; \
36
a##_im += b##_im*c##_re
37
38
// Performs the complex conjugated accumulation: a += b* c*
39
#define ACC_CONJ_PROD(a, b, c) \
40
a##_re += b##_re * c##_re; \
41
a##_re -= b##_im * c##_im; \
42
a##_im -= b##_re * c##_im; \
43
a##_im -= b##_im * c##_re
44
45
#define READ_GAUGE_MATRIX_18_FLOAT2_TEX(G, gauge, dir, idx, stride) \
46
float2 G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
47
float2 G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
48
float2 G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
49
float2 G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
50
float2 G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
51
float2 G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
52
float2 G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
53
float2 G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
54
float2 G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
55
56
#define READ_GAUGE_MATRIX_18_SHORT2_TEX(G, gauge, dir, idx, stride) \
57
float2 G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
58
float2 G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
59
float2 G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
60
float2 G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
61
float2 G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
62
float2 G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
63
float2 G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
64
float2 G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
65
float2 G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
66
67
#define READ_GAUGE_MATRIX_12_FLOAT4_TEX(G, gauge, dir, idx, stride) \
68
float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
69
float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
70
float4 G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
71
float4 G##3 = make_float4(0,0,0,0); \
72
float4 G##4 = make_float4(0,0,0,0);
73
74
#define READ_GAUGE_MATRIX_12_SHORT4_TEX(G, gauge, dir, idx, stride) \
75
float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
76
float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
77
float4 G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
78
float4 G##3 = make_float4(0,0,0,0); \
79
float4 G##4 = make_float4(0,0,0,0);
80
81
// set A to be last components of G4 (otherwise unused)
82
#define READ_GAUGE_MATRIX_8_FLOAT4_TEX(G, gauge, dir, idx, stride) \
83
float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
84
float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
85
float4 G##2 = make_float4(0,0,0,0); \
86
float4 G##3 = make_float4(0,0,0,0); \
87
float4 G##4 = make_float4(0,0,0,0); \
88
(G##3).z = (G##0).x; \
89
(G##3).w = (G##0).y;
90
91
#define READ_GAUGE_MATRIX_8_SHORT4_TEX(G, gauge, dir, idx, stride) \
92
float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
93
float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
94
float4 G##2 = make_float4(0,0,0,0); \
95
float4 G##3 = make_float4(0,0,0,0); \
96
float4 G##4 = make_float4(0,0,0,0); \
97
(G##3).z = (G##0).x = pi_f*(G##0).x; \
98
(G##3).w = (G##0).y = pi_f*(G##0).y;
99
100
#define READ_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride) \
101
double2 G##0 = gauge[idx + ((dir/2)*9+0)*stride]; \
102
double2 G##1 = gauge[idx + ((dir/2)*9+1)*stride]; \
103
double2 G##2 = gauge[idx + ((dir/2)*9+2)*stride]; \
104
double2 G##3 = gauge[idx + ((dir/2)*9+3)*stride]; \
105
double2 G##4 = gauge[idx + ((dir/2)*9+4)*stride]; \
106
double2 G##5 = gauge[idx + ((dir/2)*9+5)*stride]; \
107
double2 G##6 = gauge[idx + ((dir/2)*9+6)*stride]; \
108
double2 G##7 = gauge[idx + ((dir/2)*9+7)*stride]; \
109
double2 G##8 = gauge[idx + ((dir/2)*9+8)*stride]; \
110
111
#define READ_GAUGE_MATRIX_18_FLOAT2(G, gauge, dir, idx, stride) \
112
float2 G##0 = ((float2*)gauge)[idx + ((dir/2)*9+0)*stride]; \
113
float2 G##1 = ((float2*)gauge)[idx + ((dir/2)*9+1)*stride]; \
114
float2 G##2 = ((float2*)gauge)[idx + ((dir/2)*9+2)*stride]; \
115
float2 G##3 = ((float2*)gauge)[idx + ((dir/2)*9+3)*stride]; \
116
float2 G##4 = ((float2*)gauge)[idx + ((dir/2)*9+4)*stride]; \
117
float2 G##5 = ((float2*)gauge)[idx + ((dir/2)*9+5)*stride]; \
118
float2 G##6 = ((float2*)gauge)[idx + ((dir/2)*9+6)*stride]; \
119
float2 G##7 = ((float2*)gauge)[idx + ((dir/2)*9+7)*stride]; \
120
float2 G##8 = ((float2*)gauge)[idx + ((dir/2)*9+8)*stride]; \
121
122
#define READ_GAUGE_MATRIX_18_SHORT2(G, gauge, dir, idx, stride) \
123
float2 G##0 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+0)*stride]); \
124
float2 G##1 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+1)*stride]); \
125
float2 G##2 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+2)*stride]); \
126
float2 G##3 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+3)*stride]); \
127
float2 G##4 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+4)*stride]); \
128
float2 G##5 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+5)*stride]); \
129
float2 G##6 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+6)*stride]); \
130
float2 G##7 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+7)*stride]); \
131
float2 G##8 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+8)*stride]); \
132
133
#define READ_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride) \
134
double2 G##0 = gauge[idx + ((dir/2)*6+0)*stride]; \
135
double2 G##1 = gauge[idx + ((dir/2)*6+1)*stride]; \
136
double2 G##2 = gauge[idx + ((dir/2)*6+2)*stride]; \
137
double2 G##3 = gauge[idx + ((dir/2)*6+3)*stride]; \
138
double2 G##4 = gauge[idx + ((dir/2)*6+4)*stride]; \
139
double2 G##5 = gauge[idx + ((dir/2)*6+5)*stride]; \
140
double2 G##6 = make_double2(0,0); \
141
double2 G##7 = make_double2(0,0); \
142
double2 G##8 = make_double2(0,0);
143
144
#define READ_GAUGE_MATRIX_12_FLOAT4(G, gauge, dir, idx, stride) \
145
float4 G##0 = gauge[idx + ((dir/2)*3+0)*stride]; \
146
float4 G##1 = gauge[idx + ((dir/2)*3+1)*stride]; \
147
float4 G##2 = gauge[idx + ((dir/2)*3+2)*stride]; \
148
float4 G##3 = make_float4(0,0,0,0); \
149
float4 G##4 = make_float4(0,0,0,0);
150
151
#define READ_GAUGE_MATRIX_12_SHORT4(G, gauge, dir, idx, stride) \
152
float4 G##0 = short42float4(gauge[idx + ((dir/2)*3+0)*stride]); \
153
float4 G##1 = short42float4(gauge[idx + ((dir/2)*3+1)*stride]); \
154
float4 G##2 = short42float4(gauge[idx + ((dir/2)*3+2)*stride]); \
155
float4 G##3 = make_float4(0,0,0,0); \
156
float4 G##4 = make_float4(0,0,0,0);
157
158
// set A to be last components of G4 (otherwise unused)
159
#define READ_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride) \
160
double2 G##0 = gauge[idx + ((dir/2)*4+0)*stride]; \
161
double2 G##1 = gauge[idx + ((dir/2)*4+1)*stride]; \
162
double2 G##2 = gauge[idx + ((dir/2)*4+2)*stride]; \
163
double2 G##3 = gauge[idx + ((dir/2)*4+3)*stride]; \
164
double2 G##4 = make_double2(0,0); \
165
double2 G##5 = make_double2(0,0); \
166
double2 G##6 = make_double2(0,0); \
167
double2 G##7 = make_double2(0,0); \
168
double2 G##8 = make_double2(0,0); \
169
(G##7).x = (G##0).x; \
170
(G##7).y = (G##0).y;
171
172
// set A to be last components of G4 (otherwise unused)
173
#define READ_GAUGE_MATRIX_8_FLOAT4(G, gauge, dir, idx, stride) \
174
float4 G##0 = gauge[idx + ((dir/2)*2+0)*stride]; \
175
float4 G##1 = gauge[idx + ((dir/2)*2+1)*stride]; \
176
float4 G##2 = make_float4(0,0,0,0); \
177
float4 G##3 = make_float4(0,0,0,0); \
178
float4 G##4 = make_float4(0,0,0,0); \
179
(G##3).z = (G##0).x; \
180
(G##3).w = (G##0).y;
181
182
#define READ_GAUGE_MATRIX_8_SHORT4(G, gauge, dir, idx, stride) \
183
float4 G##0 = short42float4(gauge[idx + ((dir/2)*2+0)*stride]); \
184
float4 G##1 = short42float4(gauge[idx + ((dir/2)*2+1)*stride]); \
185
float4 G##2 = make_float4(0,0,0,0); \
186
float4 G##3 = make_float4(0,0,0,0); \
187
float4 G##4 = make_float4(0,0,0,0); \
188
(G##3).z = (G##0).x = pi_f*(G##0).x; \
189
(G##3).w = (G##0).y = pi_f*(G##0).y;
190
191
192
#define READ_GAUGE_PHASE_DOUBLE(P, phase, dir, idx, stride){ \
193
P = 2.*M_PI*phase[idx + (dir/2)*stride]; \
194
}
195
196
#define READ_GAUGE_PHASE_FLOAT(P, phase, dir, idx, stride){ \
197
P = 2.f*pi_f*phase[idx + (dir/2)*stride]; \
198
}
199
200
#define READ_GAUGE_PHASE_SHORT(P, phase, dir, idx, stride){ \
201
P = 2.f*pi_f*short2float(phase[idx + (dir/2)*stride]); \
202
}
203
206
#define ASSN_GAUGE_MATRIX_18_FLOAT2_TEX(G, gauge, dir, idx, stride) \
207
G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
208
G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
209
G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
210
G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
211
G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
212
G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
213
G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
214
G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
215
G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
216
217
#define ASSN_GAUGE_MATRIX_18_SHORT2_TEX(G, gauge, dir, idx, stride) \
218
G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
219
G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
220
G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
221
G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
222
G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
223
G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
224
G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
225
G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
226
G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
227
228
#define ASSN_GAUGE_MATRIX_12_FLOAT4_TEX(G, gauge, dir, idx, stride) \
229
G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
230
G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
231
G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
232
G##3 = make_float4(0,0,0,0); \
233
G##4 = make_float4(0,0,0,0);
234
235
#define ASSN_GAUGE_MATRIX_12_SHORT4_TEX(G, gauge, dir, idx, stride) \
236
G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
237
G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
238
G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
239
G##3 = make_float4(0,0,0,0); \
240
G##4 = make_float4(0,0,0,0);
241
242
// set A to be last components of G4 (otherwise unused)
243
#define ASSN_GAUGE_MATRIX_8_FLOAT4_TEX(G, gauge, dir, idx, stride) \
244
G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
245
G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
246
G##2 = make_float4(0,0,0,0); \
247
G##3 = make_float4(0,0,0,0); \
248
G##4 = make_float4(0,0,0,0); \
249
(G##3).z = (G##0).x; \
250
(G##3).w = (G##0).y;
251
252
#define ASSN_GAUGE_MATRIX_8_SHORT4_TEX(G, gauge, dir, idx, stride) \
253
G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
254
G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
255
G##2 = make_float4(0,0,0,0); \
256
G##3 = make_float4(0,0,0,0); \
257
G##4 = make_float4(0,0,0,0); \
258
(G##3).z = (G##0).x = pi_f*(G##0).x; \
259
(G##3).w = (G##0).y = pi_f*(G##0).y;
260
261
#define ASSN_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride) \
262
G##0 = gauge[idx + ((dir/2)*9+0)*stride]; \
263
G##1 = gauge[idx + ((dir/2)*9+1)*stride]; \
264
G##2 = gauge[idx + ((dir/2)*9+2)*stride]; \
265
G##3 = gauge[idx + ((dir/2)*9+3)*stride]; \
266
G##4 = gauge[idx + ((dir/2)*9+4)*stride]; \
267
G##5 = gauge[idx + ((dir/2)*9+5)*stride]; \
268
G##6 = gauge[idx + ((dir/2)*9+6)*stride]; \
269
G##7 = gauge[idx + ((dir/2)*9+7)*stride]; \
270
G##8 = gauge[idx + ((dir/2)*9+8)*stride]; \
271
272
#define ASSN_GAUGE_MATRIX_18_FLOAT2(G, gauge, dir, idx, stride) \
273
G##0 = ((float2*)gauge)[idx + ((dir/2)*9+0)*stride]; \
274
G##1 = ((float2*)gauge)[idx + ((dir/2)*9+1)*stride]; \
275
G##2 = ((float2*)gauge)[idx + ((dir/2)*9+2)*stride]; \
276
G##3 = ((float2*)gauge)[idx + ((dir/2)*9+3)*stride]; \
277
G##4 = ((float2*)gauge)[idx + ((dir/2)*9+4)*stride]; \
278
G##5 = ((float2*)gauge)[idx + ((dir/2)*9+5)*stride]; \
279
G##6 = ((float2*)gauge)[idx + ((dir/2)*9+6)*stride]; \
280
G##7 = ((float2*)gauge)[idx + ((dir/2)*9+7)*stride]; \
281
G##8 = ((float2*)gauge)[idx + ((dir/2)*9+8)*stride]; \
282
283
#define ASSN_GAUGE_MATRIX_18_SHORT2(G, gauge, dir, idx, stride) \
284
G##0 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+0)*stride]); \
285
G##1 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+1)*stride]); \
286
G##2 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+2)*stride]); \
287
G##3 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+3)*stride]); \
288
G##4 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+4)*stride]); \
289
G##5 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+5)*stride]); \
290
G##6 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+6)*stride]); \
291
G##7 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+7)*stride]); \
292
G##8 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+8)*stride]); \
293
294
#define ASSN_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride) \
295
G##0 = gauge[idx + ((dir/2)*6+0)*stride]; \
296
G##1 = gauge[idx + ((dir/2)*6+1)*stride]; \
297
G##2 = gauge[idx + ((dir/2)*6+2)*stride]; \
298
G##3 = gauge[idx + ((dir/2)*6+3)*stride]; \
299
G##4 = gauge[idx + ((dir/2)*6+4)*stride]; \
300
G##5 = gauge[idx + ((dir/2)*6+5)*stride]; \
301
G##6 = make_double2(0,0); \
302
G##7 = make_double2(0,0); \
303
G##8 = make_double2(0,0);
304
305
#define ASSN_GAUGE_MATRIX_12_FLOAT4(G, gauge, dir, idx, stride) \
306
G##0 = gauge[idx + ((dir/2)*3+0)*stride]; \
307
G##1 = gauge[idx + ((dir/2)*3+1)*stride]; \
308
G##2 = gauge[idx + ((dir/2)*3+2)*stride]; \
309
G##3 = make_float4(0,0,0,0); \
310
G##4 = make_float4(0,0,0,0);
311
312
#define ASSN_GAUGE_MATRIX_12_SHORT4(G, gauge, dir, idx, stride) \
313
G##0 = short42float4(gauge[idx + ((dir/2)*3+0)*stride]); \
314
G##1 = short42float4(gauge[idx + ((dir/2)*3+1)*stride]); \
315
G##2 = short42float4(gauge[idx + ((dir/2)*3+2)*stride]); \
316
G##3 = make_float4(0,0,0,0); \
317
G##4 = make_float4(0,0,0,0);
318
319
// set A to be last components of G4 (otherwise unused)
320
#define ASSN_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride) \
321
G##0 = gauge[idx + ((dir/2)*4+0)*stride]; \
322
G##1 = gauge[idx + ((dir/2)*4+1)*stride]; \
323
G##2 = gauge[idx + ((dir/2)*4+2)*stride]; \
324
G##3 = gauge[idx + ((dir/2)*4+3)*stride]; \
325
G##4 = make_double2(0,0); \
326
G##5 = make_double2(0,0); \
327
G##6 = make_double2(0,0); \
328
G##7 = make_double2(0,0); \
329
G##8 = make_double2(0,0); \
330
(G##7).x = (G##0).x; \
331
(G##7).y = (G##0).y;
332
333
// set A to be last components of G4 (otherwise unused)
334
#define ASSN_GAUGE_MATRIX_8_FLOAT4(G, gauge, dir, idx, stride) \
335
G##0 = gauge[idx + ((dir/2)*2+0)*stride]; \
336
G##1 = gauge[idx + ((dir/2)*2+1)*stride]; \
337
G##2 = make_float4(0,0,0,0); \
338
G##3 = make_float4(0,0,0,0); \
339
G##4 = make_float4(0,0,0,0); \
340
(G##3).z = (G##0).x; \
341
(G##3).w = (G##0).y;
342
343
#define ASSN_GAUGE_MATRIX_8_SHORT4(G, gauge, dir, idx, stride) \
344
G##0 = short42float4(gauge[idx + ((dir/2)*2+0)*stride]); \
345
G##1 = short42float4(gauge[idx + ((dir/2)*2+1)*stride]); \
346
G##2 = make_float4(0,0,0,0); \
347
G##3 = make_float4(0,0,0,0); \
348
G##4 = make_float4(0,0,0,0); \
349
(G##3).z = (G##0).x = pi_f*(G##0).x; \
350
(G##3).w = (G##0).y = pi_f*(G##0).y;
351
352
353
/*----END--*/
354
355
#define RESCALE2(G, max) \
356
(G##0).x *= max; (G##0).y *= max; (G##1).x *= max; (G##1).y *= max; \
357
(G##2).x *= max; (G##2).y *= max; (G##3).x *= max; (G##3).y *= max; \
358
(G##4).x *= max; (G##4).y *= max; (G##5).x *= max; (G##5).y *= max; \
359
(G##6).x *= max; (G##6).y *= max; (G##7).x *= max; (G##7).y *= max; \
360
(G##8).x *= max; (G##8).y *= max;
361
362
// FIXME: merge staggered and Wilson reconstruct macros
363
364
#define RECONSTRUCT_MATRIX_18_DOUBLE(dir)
365
#define RECONSTRUCT_MATRIX_18_SINGLE(dir)
366
367
#ifndef MULTI_GPU
368
#define do_boundary ga_idx >= X4X3X2X1hmX3X2X1h
369
#else
370
#define do_boundary ( (Pt0 && (ga_idx >= Vh)) || ( PtNm1 && (ga_idx >= X4X3X2X1hmX3X2X1h) && (ga_idx < Vh) ) )
371
#endif
372
373
#define RECONSTRUCT_MATRIX_12_DOUBLE(dir) \
374
ACC_CONJ_PROD(g20, +g01, +g12); \
375
ACC_CONJ_PROD(g20, -g02, +g11); \
376
ACC_CONJ_PROD(g21, +g02, +g10); \
377
ACC_CONJ_PROD(g21, -g00, +g12); \
378
ACC_CONJ_PROD(g22, +g00, +g11); \
379
ACC_CONJ_PROD(g22, -g01, +g10); \
380
double u0 = (dir < 6 ? anisotropy : (do_boundary ? t_boundary : 1)); \
381
G6.x*=u0; G6.y*=u0; G7.x*=u0; G7.y*=u0; G8.x*=u0; G8.y*=u0;
382
383
#define RECONSTRUCT_MATRIX_12_SINGLE(dir) \
384
ACC_CONJ_PROD(g20, +g01, +g12); \
385
ACC_CONJ_PROD(g20, -g02, +g11); \
386
ACC_CONJ_PROD(g21, +g02, +g10); \
387
ACC_CONJ_PROD(g21, -g00, +g12); \
388
ACC_CONJ_PROD(g22, +g00, +g11); \
389
ACC_CONJ_PROD(g22, -g01, +g10); \
390
float u0 = (dir < 6 ? anisotropy_f : (do_boundary ? t_boundary_f : 1)); \
391
G3.x*=u0; G3.y*=u0; G3.z*=u0; G3.w*=u0; G4.x*=u0; G4.y*=u0;
392
393
#define RECONSTRUCT_MATRIX_8_DOUBLE(dir) \
394
double row_sum = g01_re*g01_re; \
395
row_sum += g01_im*g01_im; \
396
row_sum += g02_re*g02_re; \
397
row_sum += g02_im*g02_im; \
398
double u0 = (dir < 6 ? anisotropy : (do_boundary ? t_boundary : 1)); \
399
double u02_inv = 1.0 / (u0*u0); \
400
double column_sum = u02_inv - row_sum; \
401
double U00_mag = sqrt((column_sum > 0 ? column_sum : 0)); \
402
sincos(g21_re, &g00_im, &g00_re); \
403
g00_re *= U00_mag; \
404
g00_im *= U00_mag; \
405
column_sum += g10_re*g10_re; \
406
column_sum += g10_im*g10_im; \
407
sincos(g21_im, &g20_im, &g20_re); \
408
double U20_mag = sqrt(((u02_inv - column_sum) > 0 ? (u02_inv-column_sum) : 0)); \
409
g20_re *= U20_mag; \
410
g20_im *= U20_mag; \
411
double r_inv2 = 1.0 / (u0*row_sum); \
412
double A_re, A_im; \
413
COMPLEX_DOT_PRODUCT(A, g00, g10); \
414
A_re *= u0; A_im *= u0; \
415
COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \
416
ACC_COMPLEX_PROD(g11, A, g01); \
417
g11_re *= -r_inv2; \
418
g11_im *= -r_inv2; \
419
COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \
420
ACC_COMPLEX_PROD(g12, -A, g02); \
421
g12_re *= r_inv2; \
422
g12_im *= r_inv2; \
423
COMPLEX_DOT_PRODUCT(A, g00, g20); \
424
A_re *= u0; A_im *= u0; \
425
COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \
426
ACC_COMPLEX_PROD(g21, -A, g01); \
427
g21_re *= r_inv2; \
428
g21_im *= r_inv2; \
429
COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \
430
ACC_COMPLEX_PROD(g22, A, g02); \
431
g22_re *= -r_inv2; \
432
g22_im *= -r_inv2;
433
434
#define RECONSTRUCT_MATRIX_8_SINGLE(dir) \
435
float row_sum = g01_re*g01_re; \
436
row_sum += g01_im*g01_im; \
437
row_sum += g02_re*g02_re; \
438
row_sum += g02_im*g02_im; \
439
__sincosf(g21_re, &g00_im, &g00_re); \
440
__sincosf(g21_im, &g20_im, &g20_re); \
441
float2 u0_2 = (dir < 6 ? An2 : (do_boundary ? TB2 : No2)); \
442
float column_sum = u0_2.y - row_sum; \
443
float U00_mag = column_sum * rsqrtf((column_sum > 0 ? column_sum : 1e14)); \
444
g00_re *= U00_mag; \
445
g00_im *= U00_mag; \
446
column_sum += g10_re*g10_re; \
447
column_sum += g10_im*g10_im; \
448
column_sum = u0_2.y - column_sum; \
449
float U20_mag = column_sum * rsqrtf((column_sum > 0 ? column_sum : 1e14)); \
450
g20_re *= U20_mag; \
451
g20_im *= U20_mag; \
452
float r_inv2 = __fdividef(1.0f, u0_2.x*row_sum); \
453
float A_re, A_im; \
454
COMPLEX_DOT_PRODUCT(A, g00, g10); \
455
A_re *= u0_2.x; A_im *= u0_2.x; \
456
COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \
457
ACC_COMPLEX_PROD(g11, A, g01); \
458
g11_re *= -r_inv2; \
459
g11_im *= -r_inv2; \
460
COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \
461
ACC_COMPLEX_PROD(g12, -A, g02); \
462
g12_re *= r_inv2; \
463
g12_im *= r_inv2; \
464
COMPLEX_DOT_PRODUCT(A, g00, g20); \
465
A_re *= u0_2.x; A_im *= u0_2.x; \
466
COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \
467
ACC_COMPLEX_PROD(g21, -A, g01); \
468
g21_re *= r_inv2; \
469
g21_im *= r_inv2; \
470
COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \
471
ACC_COMPLEX_PROD(g22, A, g02); \
472
g22_re *= -r_inv2; \
473
g22_im *= -r_inv2;
474
475
476
477
/************* the following is added for staggered *********/
478
479
#define RECONSTRUCT_GAUGE_MATRIX_12_SINGLE(dir, gauge, idx, sign) \
480
ACC_CONJ_PROD(gauge##20, +gauge##01, +gauge##12); \
481
ACC_CONJ_PROD(gauge##20, -gauge##02, +gauge##11); \
482
ACC_CONJ_PROD(gauge##21, +gauge##02, +gauge##10); \
483
ACC_CONJ_PROD(gauge##21, -gauge##00, +gauge##12); \
484
ACC_CONJ_PROD(gauge##22, +gauge##00, +gauge##11); \
485
ACC_CONJ_PROD(gauge##22, -gauge##01, +gauge##10); \
486
{float u0 = coeff_f*sign; \
487
gauge##20_re *=u0;gauge##20_im *=u0; gauge##21_re *=u0; gauge##21_im *=u0; \
488
gauge##22_re *=u0;gauge##22_im *=u0;}
489
490
#define RECONSTRUCT_GAUGE_MATRIX_12_DOUBLE(dir, gauge, idx, sign) \
491
ACC_CONJ_PROD(gauge##20, +gauge##01, +gauge##12); \
492
ACC_CONJ_PROD(gauge##20, -gauge##02, +gauge##11); \
493
ACC_CONJ_PROD(gauge##21, +gauge##02, +gauge##10); \
494
ACC_CONJ_PROD(gauge##21, -gauge##00, +gauge##12); \
495
ACC_CONJ_PROD(gauge##22, +gauge##00, +gauge##11); \
496
ACC_CONJ_PROD(gauge##22, -gauge##01, +gauge##10); \
497
{double u0 = coeff* sign; \
498
gauge##20_re *=u0;gauge##20_im *=u0; gauge##21_re *=u0; gauge##21_im *=u0; \
499
gauge##22_re *=u0;gauge##22_im *=u0;}
500
501
502
#define RECONSTRUCT_GAUGE_MATRIX_13_SINGLE(dir, gauge, idx, sign) { \
503
RECONSTRUCT_GAUGE_MATRIX_12_SINGLE(dir, gauge, idx, sign) \
504
float exp_i3phase_re, exp_i3phase_im; \
505
sincosf(3.f*PHASE, &exp_i3phase_im, &exp_i3phase_re); \
506
float A_re, A_im; \
507
COMPLEX_PRODUCT(A, exp_i3phase, gauge##20); \
508
gauge##20_re = A_re; \
509
gauge##20_im = A_im; \
510
COMPLEX_PRODUCT(A, exp_i3phase, gauge##21); \
511
gauge##21_re = A_re; \
512
gauge##21_im = A_im; \
513
COMPLEX_PRODUCT(A, exp_i3phase, gauge##22); \
514
gauge##22_re = A_re; \
515
gauge##22_im = A_im; \
516
}
517
518
519
#define RECONSTRUCT_GAUGE_MATRIX_13_DOUBLE(dir, gauge, idx, sign) { \
520
RECONSTRUCT_GAUGE_MATRIX_12_DOUBLE(dir, gauge, idx, sign) \
521
double exp_i3phase_re, exp_i3phase_im; \
522
sincos(3.*PHASE, &exp_i3phase_im, &exp_i3phase_re); \
523
double A_re, A_im; \
524
COMPLEX_PRODUCT(A, exp_i3phase, gauge##20); \
525
gauge##20_re = A_re; \
526
gauge##20_im = A_im; \
527
COMPLEX_PRODUCT(A, exp_i3phase, gauge##21); \
528
gauge##21_re = A_re; \
529
gauge##21_im = A_im; \
530
COMPLEX_PRODUCT(A, exp_i3phase, gauge##22); \
531
gauge##22_re = A_re; \
532
gauge##22_im = A_im; \
533
}
534
535
536
537
538
#define RECONSTRUCT_GAUGE_MATRIX_8_DOUBLE(dir, gauge, idx, sign) \
539
double row_sum = gauge##01_re*gauge##01_re + gauge##01_im*gauge##01_im; \
540
row_sum += gauge##02_re*gauge##02_re + gauge##02_im*gauge##02_im; \
541
double u0 = coeff*sign; \
542
double u02_inv = 1.0 / (u0*u0); \
543
double column_sum = u02_inv - row_sum; \
544
double U00_mag = sqrt(column_sum); \
545
sincos(gauge##21_re, &gauge##00_im, &gauge##00_re); \
546
gauge##00_re *= U00_mag; \
547
gauge##00_im *= U00_mag; \
548
column_sum += gauge##10_re*gauge##10_re; \
549
column_sum += gauge##10_im*gauge##10_im; \
550
sincos(gauge##21_im, &gauge##20_im, &gauge##20_re); \
551
double U20_mag = sqrt(u02_inv - column_sum); \
552
gauge##20_re *= U20_mag; \
553
gauge##20_im *= U20_mag; \
554
double r_inv2 = 1.0 / (u0*row_sum); \
555
double A_re, A_im; \
556
COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##10); \
557
A_re *= u0; A_im *= u0; \
558
COMPLEX_CONJUGATE_PRODUCT(gauge##11, gauge##20, gauge##02); \
559
ACC_COMPLEX_PROD(gauge##11, A, gauge##01); \
560
gauge##11_re *= -r_inv2; \
561
gauge##11_im *= -r_inv2; \
562
COMPLEX_CONJUGATE_PRODUCT(gauge##12, gauge##20, gauge##01); \
563
ACC_COMPLEX_PROD(gauge##12, -A, gauge##02); \
564
gauge##12_re *= r_inv2; \
565
gauge##12_im *= r_inv2; \
566
COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##20); \
567
A_re *= u0; A_im *= u0; \
568
COMPLEX_CONJUGATE_PRODUCT(gauge##21, gauge##10, gauge##02); \
569
ACC_COMPLEX_PROD(gauge##21, -A, gauge##01); \
570
gauge##21_re *= r_inv2; \
571
gauge##21_im *= r_inv2; \
572
COMPLEX_CONJUGATE_PRODUCT(gauge##22, gauge##10, gauge##01); \
573
ACC_COMPLEX_PROD(gauge##22, A, gauge##02); \
574
gauge##22_re *= -r_inv2; \
575
gauge##22_im *= -r_inv2;
576
577
#define RECONSTRUCT_GAUGE_MATRIX_8_SINGLE(dir, gauge, idx, sign) { \
578
float row_sum = gauge##01_re*gauge##01_re + gauge##01_im*gauge##01_im; \
579
row_sum += gauge##02_re*gauge##02_re + gauge##02_im*gauge##02_im; \
580
float u0 = coeff_f*sign; \
581
float u02_inv = __fdividef(1.f, u0*u0); \
582
float column_sum = u02_inv - row_sum; \
583
float U00_mag = sqrtf(column_sum > 0 ?column_sum:0); \
584
__sincosf(gauge##21_re, &gauge##00_im, &gauge##00_re); \
585
gauge##00_re *= U00_mag; \
586
gauge##00_im *= U00_mag; \
587
column_sum += gauge##10_re*gauge##10_re; \
588
column_sum += gauge##10_im*gauge##10_im; \
589
__sincosf(gauge##21_im, &gauge##20_im, &gauge##20_re); \
590
float U20_mag = sqrtf( (u02_inv - column_sum)>0? (u02_inv - column_sum): 0); \
591
gauge##20_re *= U20_mag; \
592
gauge##20_im *= U20_mag; \
593
float r_inv2 = __fdividef(1.0f, u0*row_sum); \
594
float A_re, A_im; \
595
COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##10); \
596
A_re *= u0; A_im *= u0; \
597
COMPLEX_CONJUGATE_PRODUCT(gauge##11, gauge##20, gauge##02); \
598
ACC_COMPLEX_PROD(gauge##11, A, gauge##01); \
599
gauge##11_re *= -r_inv2; \
600
gauge##11_im *= -r_inv2; \
601
COMPLEX_CONJUGATE_PRODUCT(gauge##12, gauge##20, gauge##01); \
602
ACC_COMPLEX_PROD(gauge##12, -A, gauge##02); \
603
gauge##12_re *= r_inv2; \
604
gauge##12_im *= r_inv2; \
605
COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##20); \
606
A_re *= u0; A_im *= u0; \
607
COMPLEX_CONJUGATE_PRODUCT(gauge##21, gauge##10, gauge##02); \
608
ACC_COMPLEX_PROD(gauge##21, -A, gauge##01); \
609
gauge##21_re *= r_inv2; \
610
gauge##21_im *= r_inv2; \
611
COMPLEX_CONJUGATE_PRODUCT(gauge##22, gauge##10, gauge##01); \
612
ACC_COMPLEX_PROD(gauge##22, A, gauge##02); \
613
gauge##22_re *= -r_inv2; \
614
gauge##22_im *= -r_inv2;}
615
616
617
618
#define RECONSTRUCT_GAUGE_MATRIX_9_SINGLE(dir, gauge, idx, sign) { \
619
RECONSTRUCT_GAUGE_MATRIX_8_SINGLE(dir, gauge, idx, sign) \
620
float exp_iphase_re, exp_iphase_im; \
621
__sincosf(PHASE, &exp_iphase_im, &exp_iphase_re); \
622
float B_re, B_im; \
623
COMPLEX_PRODUCT(B, exp_iphase, gauge##00); \
624
gauge##00_re = B_re; \
625
gauge##00_im = B_im; \
626
COMPLEX_PRODUCT(B, exp_iphase, gauge##01); \
627
gauge##01_re = B_re; \
628
gauge##01_im = B_im; \
629
COMPLEX_PRODUCT(B, exp_iphase, gauge##02); \
630
gauge##02_re = B_re; \
631
gauge##02_im = B_im; \
632
COMPLEX_PRODUCT(B, exp_iphase, gauge##10); \
633
gauge##10_re = B_re; \
634
gauge##10_im = B_im; \
635
COMPLEX_PRODUCT(B, exp_iphase, gauge##11); \
636
gauge##11_re = B_re; \
637
gauge##11_im = B_im; \
638
COMPLEX_PRODUCT(B, exp_iphase, gauge##12); \
639
gauge##12_re = B_re; \
640
gauge##12_im = B_im; \
641
COMPLEX_PRODUCT(B, exp_iphase, gauge##20); \
642
gauge##20_re = B_re; \
643
gauge##20_im = B_im; \
644
COMPLEX_PRODUCT(B, exp_iphase, gauge##21); \
645
gauge##21_re = B_re; \
646
gauge##21_im = B_im; \
647
COMPLEX_PRODUCT(B, exp_iphase, gauge##22); \
648
gauge##22_re = B_re; \
649
gauge##22_im = B_im; \
650
}
651
652
653
#define RECONSTRUCT_GAUGE_MATRIX_9_DOUBLE(dir, gauge, idx, sign) { \
654
RECONSTRUCT_GAUGE_MATRIX_8_DOUBLE(dir, gauge, idx, sign) \
655
double exp_iphase_re, exp_iphase_im; \
656
sincos(PHASE, &exp_iphase_im, &exp_iphase_re); \
657
double B_re, B_im; \
658
COMPLEX_PRODUCT(B, exp_iphase, gauge##00); \
659
gauge##00_re = B_re; \
660
gauge##00_im = B_im; \
661
COMPLEX_PRODUCT(B, exp_iphase, gauge##01); \
662
gauge##01_re = B_re; \
663
gauge##01_im = B_im; \
664
COMPLEX_PRODUCT(B, exp_iphase, gauge##02); \
665
gauge##02_re = B_re; \
666
gauge##02_im = B_im; \
667
COMPLEX_PRODUCT(B, exp_iphase, gauge##10); \
668
gauge##10_re = B_re; \
669
gauge##10_im = B_im; \
670
COMPLEX_PRODUCT(B, exp_iphase, gauge##11); \
671
gauge##11_re = B_re; \
672
gauge##11_im = B_im; \
673
COMPLEX_PRODUCT(B, exp_iphase, gauge##12); \
674
gauge##12_re = B_re; \
675
gauge##12_im = B_im; \
676
COMPLEX_PRODUCT(B, exp_iphase, gauge##20); \
677
gauge##20_re = B_re; \
678
gauge##20_im = B_im; \
679
COMPLEX_PRODUCT(B, exp_iphase, gauge##21); \
680
gauge##21_re = B_re; \
681
gauge##21_im = B_im; \
682
COMPLEX_PRODUCT(B, exp_iphase, gauge##22); \
683
gauge##22_re = B_re; \
684
gauge##22_im = B_im; \
685
}
686
687
688
689
690
// Fermi patch to disable double-precision texture reads
691
#ifdef FERMI_NO_DBLE_TEX
692
#define READ_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
693
READ_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride)
694
#define READ_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
695
READ_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride)
696
#define READ_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
697
READ_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride)
698
#define READ_GAUGE_PHASE_DOUBLE_TEX(P, phase, dir, idx, stride) \
699
READ_GAUGE_PHASE_DOUBLE(P, phase, dir, idx, stride)
700
703
#define ASSN_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
704
ASSN_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride)
705
#define ASSN_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
706
ASSN_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride)
707
#define ASSN_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
708
ASSN_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride)
709
710
#else
711
712
#define READ_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
713
double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*9+0)*stride); \
714
double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*9+1)*stride); \
715
double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*9+2)*stride); \
716
double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*9+3)*stride); \
717
double2 G##4 = fetch_double2((gauge), idx + ((dir/2)*9+4)*stride); \
718
double2 G##5 = fetch_double2((gauge), idx + ((dir/2)*9+5)*stride); \
719
double2 G##6 = fetch_double2((gauge), idx + ((dir/2)*9+6)*stride); \
720
double2 G##7 = fetch_double2((gauge), idx + ((dir/2)*9+7)*stride); \
721
double2 G##8 = fetch_double2((gauge), idx + ((dir/2)*9+8)*stride); \
722
723
#define READ_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
724
double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*6+0)*stride); \
725
double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*6+1)*stride); \
726
double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*6+2)*stride); \
727
double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*6+3)*stride); \
728
double2 G##4 = fetch_double2((gauge), idx + ((dir/2)*6+4)*stride); \
729
double2 G##5 = fetch_double2((gauge), idx + ((dir/2)*6+5)*stride); \
730
double2 G##6 = make_double2(0,0); \
731
double2 G##7 = make_double2(0,0); \
732
double2 G##8 = make_double2(0,0); \
733
734
// set A to be last components of G4 (otherwise unused)
735
#define READ_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
736
double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*4+0)*stride); \
737
double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*4+1)*stride); \
738
double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*4+2)*stride); \
739
double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*4+3)*stride); \
740
double2 G##4 = make_double2(0,0); \
741
double2 G##5 = make_double2(0,0); \
742
double2 G##6 = make_double2(0,0); \
743
double2 G##7 = make_double2(0,0); \
744
double2 G##8 = make_double2(0,0); \
745
(G##7).x = (G##0).x; \
746
(G##7).y = (G##0).y;
747
748
749
750
751
#define READ_GAUGE_PHASE_FLOAT_TEX(P, phase, dir, idx, stride) { \
752
P = 2.f*pi_f*TEX1DFETCH(float, (phase), idx + (dir/2)*stride); \
753
}
754
755
#define READ_GAUGE_PHASE_SHORT_TEX(P, phase, dir, idx, stride) READ_GAUGE_PHASE_FLOAT_TEX(P, phase, dir, idx, stride)
756
757
#define READ_GAUGE_PHASE_DOUBLE_TEX(P, phase, dir, idx, stride) { \
758
P = 2*M_PI*fetch_double((phase), idx + (dir/2)*stride); \
759
}
760
763
#define ASSN_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
764
G##0 = fetch_double2((gauge), idx + ((dir/2)*9+0)*stride); \
765
G##1 = fetch_double2((gauge), idx + ((dir/2)*9+1)*stride); \
766
G##2 = fetch_double2((gauge), idx + ((dir/2)*9+2)*stride); \
767
G##3 = fetch_double2((gauge), idx + ((dir/2)*9+3)*stride); \
768
G##4 = fetch_double2((gauge), idx + ((dir/2)*9+4)*stride); \
769
G##5 = fetch_double2((gauge), idx + ((dir/2)*9+5)*stride); \
770
G##6 = fetch_double2((gauge), idx + ((dir/2)*9+6)*stride); \
771
G##7 = fetch_double2((gauge), idx + ((dir/2)*9+7)*stride); \
772
G##8 = fetch_double2((gauge), idx + ((dir/2)*9+8)*stride); \
773
774
#define ASSN_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
775
G##0 = fetch_double2((gauge), idx + ((dir/2)*6+0)*stride); \
776
G##1 = fetch_double2((gauge), idx + ((dir/2)*6+1)*stride); \
777
G##2 = fetch_double2((gauge), idx + ((dir/2)*6+2)*stride); \
778
G##3 = fetch_double2((gauge), idx + ((dir/2)*6+3)*stride); \
779
G##4 = fetch_double2((gauge), idx + ((dir/2)*6+4)*stride); \
780
G##5 = fetch_double2((gauge), idx + ((dir/2)*6+5)*stride); \
781
G##6 = make_double2(0,0); \
782
G##7 = make_double2(0,0); \
783
G##8 = make_double2(0,0); \
784
785
// set A to be last components of G4 (otherwise unused)
786
#define ASSN_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
787
G##0 = fetch_double2((gauge), idx + ((dir/2)*4+0)*stride); \
788
G##1 = fetch_double2((gauge), idx + ((dir/2)*4+1)*stride); \
789
G##2 = fetch_double2((gauge), idx + ((dir/2)*4+2)*stride); \
790
G##3 = fetch_double2((gauge), idx + ((dir/2)*4+3)*stride); \
791
G##4 = make_double2(0,0); \
792
G##5 = make_double2(0,0); \
793
G##6 = make_double2(0,0); \
794
G##7 = make_double2(0,0); \
795
G##8 = make_double2(0,0); \
796
(G##7).x = (G##0).x; \
797
(G##7).y = (G##0).y;
798
799
800
801
#endif
802
Generated on Wed Feb 4 2015 17:00:12 for QUDA by
1.8.6