QUDA
v0.5.0
A library for QCD on GPUs
Main Page
Namespaces
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Pages
quda
lib
read_gauge.h
Go to the documentation of this file.
1
//#include <half_quda.h>
2
3
// Performs complex addition
4
#define COMPLEX_ADD_TO(a, b) \
5
a##_re += b##_re, \
6
a##_im += b##_im
7
8
#define COMPLEX_PRODUCT(a, b, c) \
9
a##_re = b##_re*c##_re; \
10
a##_re -= b##_im*c##_im; \
11
a##_im = b##_re*c##_im; \
12
a##_im += b##_im*c##_re
13
14
#define COMPLEX_CONJUGATE_PRODUCT(a, b, c) \
15
a##_re = b##_re*c##_re; \
16
a##_re -= b##_im*c##_im; \
17
a##_im = -b##_re*c##_im; \
18
a##_im -= b##_im*c##_re
19
20
// Performs a complex dot product
21
#define COMPLEX_DOT_PRODUCT(a, b, c) \
22
a##_re = b##_re*c##_re; \
23
a##_re += b##_im*c##_im; \
24
a##_im = b##_re*c##_im; \
25
a##_im -= b##_im*c##_re
26
27
// Performs a complex norm
28
#define COMPLEX_NORM(a, b) \
29
a = b##_re*b##_re; \
30
a += b##_im*b##_im
31
32
#define ACC_COMPLEX_PROD(a, b, c) \
33
a##_re += b##_re*c##_re; \
34
a##_re -= b##_im*c##_im; \
35
a##_im += b##_re*c##_im; \
36
a##_im += b##_im*c##_re
37
38
// Performs the complex conjugated accumulation: a += b* c*
39
#define ACC_CONJ_PROD(a, b, c) \
40
a##_re += b##_re * c##_re; \
41
a##_re -= b##_im * c##_im; \
42
a##_im -= b##_re * c##_im; \
43
a##_im -= b##_im * c##_re
44
45
#define READ_GAUGE_MATRIX_18_FLOAT2_TEX(G, gauge, dir, idx, stride) \
46
float2 G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
47
float2 G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
48
float2 G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
49
float2 G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
50
float2 G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
51
float2 G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
52
float2 G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
53
float2 G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
54
float2 G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
55
56
#define READ_GAUGE_MATRIX_18_SHORT2_TEX(G, gauge, dir, idx, stride) \
57
float2 G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
58
float2 G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
59
float2 G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
60
float2 G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
61
float2 G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
62
float2 G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
63
float2 G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
64
float2 G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
65
float2 G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
66
67
#define READ_GAUGE_MATRIX_12_FLOAT4_TEX(G, gauge, dir, idx, stride) \
68
float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
69
float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
70
float4 G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
71
float4 G##3 = make_float4(0,0,0,0); \
72
float4 G##4 = make_float4(0,0,0,0);
73
74
#define READ_GAUGE_MATRIX_12_SHORT4_TEX(G, gauge, dir, idx, stride) \
75
float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
76
float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
77
float4 G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
78
float4 G##3 = make_float4(0,0,0,0); \
79
float4 G##4 = make_float4(0,0,0,0);
80
81
// set A to be last components of G4 (otherwise unused)
82
#define READ_GAUGE_MATRIX_8_FLOAT4_TEX(G, gauge, dir, idx, stride) \
83
float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
84
float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
85
float4 G##2 = make_float4(0,0,0,0); \
86
float4 G##3 = make_float4(0,0,0,0); \
87
float4 G##4 = make_float4(0,0,0,0); \
88
(G##3).z = (G##0).x; \
89
(G##3).w = (G##0).y;
90
91
#define READ_GAUGE_MATRIX_8_SHORT4_TEX(G, gauge, dir, idx, stride) \
92
float4 G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
93
float4 G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
94
float4 G##2 = make_float4(0,0,0,0); \
95
float4 G##3 = make_float4(0,0,0,0); \
96
float4 G##4 = make_float4(0,0,0,0); \
97
(G##3).z = (G##0).x = pi_f*(G##0).x; \
98
(G##3).w = (G##0).y = pi_f*(G##0).y;
99
100
#define READ_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride) \
101
double2 G##0 = gauge[idx + ((dir/2)*9+0)*stride]; \
102
double2 G##1 = gauge[idx + ((dir/2)*9+1)*stride]; \
103
double2 G##2 = gauge[idx + ((dir/2)*9+2)*stride]; \
104
double2 G##3 = gauge[idx + ((dir/2)*9+3)*stride]; \
105
double2 G##4 = gauge[idx + ((dir/2)*9+4)*stride]; \
106
double2 G##5 = gauge[idx + ((dir/2)*9+5)*stride]; \
107
double2 G##6 = gauge[idx + ((dir/2)*9+6)*stride]; \
108
double2 G##7 = gauge[idx + ((dir/2)*9+7)*stride]; \
109
double2 G##8 = gauge[idx + ((dir/2)*9+8)*stride]; \
110
111
#define READ_GAUGE_MATRIX_18_FLOAT2(G, gauge, dir, idx, stride) \
112
float2 G##0 = ((float2*)gauge)[idx + ((dir/2)*9+0)*stride]; \
113
float2 G##1 = ((float2*)gauge)[idx + ((dir/2)*9+1)*stride]; \
114
float2 G##2 = ((float2*)gauge)[idx + ((dir/2)*9+2)*stride]; \
115
float2 G##3 = ((float2*)gauge)[idx + ((dir/2)*9+3)*stride]; \
116
float2 G##4 = ((float2*)gauge)[idx + ((dir/2)*9+4)*stride]; \
117
float2 G##5 = ((float2*)gauge)[idx + ((dir/2)*9+5)*stride]; \
118
float2 G##6 = ((float2*)gauge)[idx + ((dir/2)*9+6)*stride]; \
119
float2 G##7 = ((float2*)gauge)[idx + ((dir/2)*9+7)*stride]; \
120
float2 G##8 = ((float2*)gauge)[idx + ((dir/2)*9+8)*stride]; \
121
122
#define READ_GAUGE_MATRIX_18_SHORT2(G, gauge, dir, idx, stride) \
123
float2 G##0 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+0)*stride]); \
124
float2 G##1 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+1)*stride]); \
125
float2 G##2 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+2)*stride]); \
126
float2 G##3 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+3)*stride]); \
127
float2 G##4 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+4)*stride]); \
128
float2 G##5 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+5)*stride]); \
129
float2 G##6 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+6)*stride]); \
130
float2 G##7 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+7)*stride]); \
131
float2 G##8 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+8)*stride]); \
132
133
#define READ_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride) \
134
double2 G##0 = gauge[idx + ((dir/2)*6+0)*stride]; \
135
double2 G##1 = gauge[idx + ((dir/2)*6+1)*stride]; \
136
double2 G##2 = gauge[idx + ((dir/2)*6+2)*stride]; \
137
double2 G##3 = gauge[idx + ((dir/2)*6+3)*stride]; \
138
double2 G##4 = gauge[idx + ((dir/2)*6+4)*stride]; \
139
double2 G##5 = gauge[idx + ((dir/2)*6+5)*stride]; \
140
double2 G##6 = make_double2(0,0); \
141
double2 G##7 = make_double2(0,0); \
142
double2 G##8 = make_double2(0,0);
143
144
#define READ_GAUGE_MATRIX_12_FLOAT4(G, gauge, dir, idx, stride) \
145
float4 G##0 = gauge[idx + ((dir/2)*3+0)*stride]; \
146
float4 G##1 = gauge[idx + ((dir/2)*3+1)*stride]; \
147
float4 G##2 = gauge[idx + ((dir/2)*3+2)*stride]; \
148
float4 G##3 = make_float4(0,0,0,0); \
149
float4 G##4 = make_float4(0,0,0,0);
150
151
#define READ_GAUGE_MATRIX_12_SHORT4(G, gauge, dir, idx, stride) \
152
float4 G##0 = short42float4(gauge[idx + ((dir/2)*3+0)*stride]); \
153
float4 G##1 = short42float4(gauge[idx + ((dir/2)*3+1)*stride]); \
154
float4 G##2 = short42float4(gauge[idx + ((dir/2)*3+2)*stride]); \
155
float4 G##3 = make_float4(0,0,0,0); \
156
float4 G##4 = make_float4(0,0,0,0);
157
158
// set A to be last components of G4 (otherwise unused)
159
#define READ_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride) \
160
double2 G##0 = gauge[idx + ((dir/2)*4+0)*stride]; \
161
double2 G##1 = gauge[idx + ((dir/2)*4+1)*stride]; \
162
double2 G##2 = gauge[idx + ((dir/2)*4+2)*stride]; \
163
double2 G##3 = gauge[idx + ((dir/2)*4+3)*stride]; \
164
double2 G##4 = make_double2(0,0); \
165
double2 G##5 = make_double2(0,0); \
166
double2 G##6 = make_double2(0,0); \
167
double2 G##7 = make_double2(0,0); \
168
double2 G##8 = make_double2(0,0); \
169
(G##7).x = (G##0).x; \
170
(G##7).y = (G##0).y;
171
172
// set A to be last components of G4 (otherwise unused)
173
#define READ_GAUGE_MATRIX_8_FLOAT4(G, gauge, dir, idx, stride) \
174
float4 G##0 = gauge[idx + ((dir/2)*2+0)*stride]; \
175
float4 G##1 = gauge[idx + ((dir/2)*2+1)*stride]; \
176
float4 G##2 = make_float4(0,0,0,0); \
177
float4 G##3 = make_float4(0,0,0,0); \
178
float4 G##4 = make_float4(0,0,0,0); \
179
(G##3).z = (G##0).x; \
180
(G##3).w = (G##0).y;
181
182
#define READ_GAUGE_MATRIX_8_SHORT4(G, gauge, dir, idx, stride) \
183
float4 G##0 = short42float4(gauge[idx + ((dir/2)*2+0)*stride]); \
184
float4 G##1 = short42float4(gauge[idx + ((dir/2)*2+1)*stride]); \
185
float4 G##2 = make_float4(0,0,0,0); \
186
float4 G##3 = make_float4(0,0,0,0); \
187
float4 G##4 = make_float4(0,0,0,0); \
188
(G##3).z = (G##0).x = pi_f*(G##0).x; \
189
(G##3).w = (G##0).y = pi_f*(G##0).y;
190
193
#define ASSN_GAUGE_MATRIX_18_FLOAT2_TEX(G, gauge, dir, idx, stride) \
194
G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
195
G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
196
G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
197
G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
198
G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
199
G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
200
G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
201
G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
202
G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
203
204
#define ASSN_GAUGE_MATRIX_18_SHORT2_TEX(G, gauge, dir, idx, stride) \
205
G##0 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+0)*stride); \
206
G##1 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+1)*stride); \
207
G##2 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+2)*stride); \
208
G##3 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+3)*stride); \
209
G##4 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+4)*stride); \
210
G##5 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+5)*stride); \
211
G##6 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+6)*stride); \
212
G##7 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+7)*stride); \
213
G##8 = TEX1DFETCH(float2, (gauge), idx + ((dir/2)*9+8)*stride); \
214
215
#define ASSN_GAUGE_MATRIX_12_FLOAT4_TEX(G, gauge, dir, idx, stride) \
216
G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
217
G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
218
G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
219
G##3 = make_float4(0,0,0,0); \
220
G##4 = make_float4(0,0,0,0);
221
222
#define ASSN_GAUGE_MATRIX_12_SHORT4_TEX(G, gauge, dir, idx, stride) \
223
G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+0)*stride); \
224
G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+1)*stride); \
225
G##2 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*3+2)*stride); \
226
G##3 = make_float4(0,0,0,0); \
227
G##4 = make_float4(0,0,0,0);
228
229
// set A to be last components of G4 (otherwise unused)
230
#define ASSN_GAUGE_MATRIX_8_FLOAT4_TEX(G, gauge, dir, idx, stride) \
231
G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
232
G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
233
G##2 = make_float4(0,0,0,0); \
234
G##3 = make_float4(0,0,0,0); \
235
G##4 = make_float4(0,0,0,0); \
236
(G##3).z = (G##0).x; \
237
(G##3).w = (G##0).y;
238
239
#define ASSN_GAUGE_MATRIX_8_SHORT4_TEX(G, gauge, dir, idx, stride) \
240
G##0 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+0)*stride); \
241
G##1 = TEX1DFETCH(float4, (gauge), idx + ((dir/2)*2+1)*stride); \
242
G##2 = make_float4(0,0,0,0); \
243
G##3 = make_float4(0,0,0,0); \
244
G##4 = make_float4(0,0,0,0); \
245
(G##3).z = (G##0).x = pi_f*(G##0).x; \
246
(G##3).w = (G##0).y = pi_f*(G##0).y;
247
248
#define ASSN_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride) \
249
G##0 = gauge[idx + ((dir/2)*9+0)*stride]; \
250
G##1 = gauge[idx + ((dir/2)*9+1)*stride]; \
251
G##2 = gauge[idx + ((dir/2)*9+2)*stride]; \
252
G##3 = gauge[idx + ((dir/2)*9+3)*stride]; \
253
G##4 = gauge[idx + ((dir/2)*9+4)*stride]; \
254
G##5 = gauge[idx + ((dir/2)*9+5)*stride]; \
255
G##6 = gauge[idx + ((dir/2)*9+6)*stride]; \
256
G##7 = gauge[idx + ((dir/2)*9+7)*stride]; \
257
G##8 = gauge[idx + ((dir/2)*9+8)*stride]; \
258
259
#define ASSN_GAUGE_MATRIX_18_FLOAT2(G, gauge, dir, idx, stride) \
260
G##0 = ((float2*)gauge)[idx + ((dir/2)*9+0)*stride]; \
261
G##1 = ((float2*)gauge)[idx + ((dir/2)*9+1)*stride]; \
262
G##2 = ((float2*)gauge)[idx + ((dir/2)*9+2)*stride]; \
263
G##3 = ((float2*)gauge)[idx + ((dir/2)*9+3)*stride]; \
264
G##4 = ((float2*)gauge)[idx + ((dir/2)*9+4)*stride]; \
265
G##5 = ((float2*)gauge)[idx + ((dir/2)*9+5)*stride]; \
266
G##6 = ((float2*)gauge)[idx + ((dir/2)*9+6)*stride]; \
267
G##7 = ((float2*)gauge)[idx + ((dir/2)*9+7)*stride]; \
268
G##8 = ((float2*)gauge)[idx + ((dir/2)*9+8)*stride]; \
269
270
#define ASSN_GAUGE_MATRIX_18_SHORT2(G, gauge, dir, idx, stride) \
271
G##0 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+0)*stride]); \
272
G##1 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+1)*stride]); \
273
G##2 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+2)*stride]); \
274
G##3 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+3)*stride]); \
275
G##4 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+4)*stride]); \
276
G##5 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+5)*stride]); \
277
G##6 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+6)*stride]); \
278
G##7 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+7)*stride]); \
279
G##8 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+8)*stride]); \
280
281
#define ASSN_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride) \
282
G##0 = gauge[idx + ((dir/2)*6+0)*stride]; \
283
G##1 = gauge[idx + ((dir/2)*6+1)*stride]; \
284
G##2 = gauge[idx + ((dir/2)*6+2)*stride]; \
285
G##3 = gauge[idx + ((dir/2)*6+3)*stride]; \
286
G##4 = gauge[idx + ((dir/2)*6+4)*stride]; \
287
G##5 = gauge[idx + ((dir/2)*6+5)*stride]; \
288
G##6 = make_double2(0,0); \
289
G##7 = make_double2(0,0); \
290
G##8 = make_double2(0,0);
291
292
#define ASSN_GAUGE_MATRIX_12_FLOAT4(G, gauge, dir, idx, stride) \
293
G##0 = gauge[idx + ((dir/2)*3+0)*stride]; \
294
G##1 = gauge[idx + ((dir/2)*3+1)*stride]; \
295
G##2 = gauge[idx + ((dir/2)*3+2)*stride]; \
296
G##3 = make_float4(0,0,0,0); \
297
G##4 = make_float4(0,0,0,0);
298
299
#define ASSN_GAUGE_MATRIX_12_SHORT4(G, gauge, dir, idx, stride) \
300
G##0 = short42float4(gauge[idx + ((dir/2)*3+0)*stride]); \
301
G##1 = short42float4(gauge[idx + ((dir/2)*3+1)*stride]); \
302
G##2 = short42float4(gauge[idx + ((dir/2)*3+2)*stride]); \
303
G##3 = make_float4(0,0,0,0); \
304
G##4 = make_float4(0,0,0,0);
305
306
// set A to be last components of G4 (otherwise unused)
307
#define ASSN_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride) \
308
G##0 = gauge[idx + ((dir/2)*4+0)*stride]; \
309
G##1 = gauge[idx + ((dir/2)*4+1)*stride]; \
310
G##2 = gauge[idx + ((dir/2)*4+2)*stride]; \
311
G##3 = gauge[idx + ((dir/2)*4+3)*stride]; \
312
G##4 = make_double2(0,0); \
313
G##5 = make_double2(0,0); \
314
G##6 = make_double2(0,0); \
315
G##7 = make_double2(0,0); \
316
G##8 = make_double2(0,0); \
317
(G##7).x = (G##0).x; \
318
(G##7).y = (G##0).y;
319
320
// set A to be last components of G4 (otherwise unused)
321
#define ASSN_GAUGE_MATRIX_8_FLOAT4(G, gauge, dir, idx, stride) \
322
G##0 = gauge[idx + ((dir/2)*2+0)*stride]; \
323
G##1 = gauge[idx + ((dir/2)*2+1)*stride]; \
324
G##2 = make_float4(0,0,0,0); \
325
G##3 = make_float4(0,0,0,0); \
326
G##4 = make_float4(0,0,0,0); \
327
(G##3).z = (G##0).x; \
328
(G##3).w = (G##0).y;
329
330
#define ASSN_GAUGE_MATRIX_8_SHORT4(G, gauge, dir, idx, stride) \
331
G##0 = short42float4(gauge[idx + ((dir/2)*2+0)*stride]); \
332
G##1 = short42float4(gauge[idx + ((dir/2)*2+1)*stride]); \
333
G##2 = make_float4(0,0,0,0); \
334
G##3 = make_float4(0,0,0,0); \
335
G##4 = make_float4(0,0,0,0); \
336
(G##3).z = (G##0).x = pi_f*(G##0).x; \
337
(G##3).w = (G##0).y = pi_f*(G##0).y;
338
339
340
/*----END--*/
341
342
#define RESCALE2(G, max) \
343
(G##0).x *= max; (G##0).y *= max; (G##1).x *= max; (G##1).y *= max; \
344
(G##2).x *= max; (G##2).y *= max; (G##3).x *= max; (G##3).y *= max; \
345
(G##4).x *= max; (G##4).y *= max; (G##5).x *= max; (G##5).y *= max; \
346
(G##6).x *= max; (G##6).y *= max; (G##7).x *= max; (G##7).y *= max; \
347
(G##8).x *= max; (G##8).y *= max;
348
349
// FIXME: merge staggered and Wilson reconstruct macros
350
351
#define RECONSTRUCT_MATRIX_18_DOUBLE(dir)
352
#define RECONSTRUCT_MATRIX_18_SINGLE(dir)
353
354
#ifndef MULTI_GPU
355
#define do_boundary ga_idx >= X4X3X2X1hmX3X2X1h
356
#else
357
#define do_boundary ( (Pt0 && (ga_idx >= Vh)) || ( PtNm1 && (ga_idx >= X4X3X2X1hmX3X2X1h) && (ga_idx < Vh) ) )
358
#endif
359
360
#define RECONSTRUCT_MATRIX_12_DOUBLE(dir) \
361
ACC_CONJ_PROD(g20, +g01, +g12); \
362
ACC_CONJ_PROD(g20, -g02, +g11); \
363
ACC_CONJ_PROD(g21, +g02, +g10); \
364
ACC_CONJ_PROD(g21, -g00, +g12); \
365
ACC_CONJ_PROD(g22, +g00, +g11); \
366
ACC_CONJ_PROD(g22, -g01, +g10); \
367
double u0 = (dir < 6 ? anisotropy : (do_boundary ? t_boundary : 1)); \
368
G6.x*=u0; G6.y*=u0; G7.x*=u0; G7.y*=u0; G8.x*=u0; G8.y*=u0;
369
370
#define RECONSTRUCT_MATRIX_12_SINGLE(dir) \
371
ACC_CONJ_PROD(g20, +g01, +g12); \
372
ACC_CONJ_PROD(g20, -g02, +g11); \
373
ACC_CONJ_PROD(g21, +g02, +g10); \
374
ACC_CONJ_PROD(g21, -g00, +g12); \
375
ACC_CONJ_PROD(g22, +g00, +g11); \
376
ACC_CONJ_PROD(g22, -g01, +g10); \
377
float u0 = (dir < 6 ? anisotropy_f : (do_boundary ? t_boundary_f : 1)); \
378
G3.x*=u0; G3.y*=u0; G3.z*=u0; G3.w*=u0; G4.x*=u0; G4.y*=u0;
379
380
#define RECONSTRUCT_MATRIX_8_DOUBLE(dir) \
381
double row_sum = g01_re*g01_re; \
382
row_sum += g01_im*g01_im; \
383
row_sum += g02_re*g02_re; \
384
row_sum += g02_im*g02_im; \
385
double u0 = (dir < 6 ? anisotropy : (do_boundary ? t_boundary : 1)); \
386
double u02_inv = 1.0 / (u0*u0); \
387
double column_sum = u02_inv - row_sum; \
388
double U00_mag = sqrt((column_sum > 0 ? column_sum : 0)); \
389
sincos(g21_re, &g00_im, &g00_re); \
390
g00_re *= U00_mag; \
391
g00_im *= U00_mag; \
392
column_sum += g10_re*g10_re; \
393
column_sum += g10_im*g10_im; \
394
sincos(g21_im, &g20_im, &g20_re); \
395
double U20_mag = sqrt(((u02_inv - column_sum) > 0 ? (u02_inv-column_sum) : 0)); \
396
g20_re *= U20_mag; \
397
g20_im *= U20_mag; \
398
double r_inv2 = 1.0 / (u0*row_sum); \
399
double A_re, A_im; \
400
COMPLEX_DOT_PRODUCT(A, g00, g10); \
401
A_re *= u0; A_im *= u0; \
402
COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \
403
ACC_COMPLEX_PROD(g11, A, g01); \
404
g11_re *= -r_inv2; \
405
g11_im *= -r_inv2; \
406
COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \
407
ACC_COMPLEX_PROD(g12, -A, g02); \
408
g12_re *= r_inv2; \
409
g12_im *= r_inv2; \
410
COMPLEX_DOT_PRODUCT(A, g00, g20); \
411
A_re *= u0; A_im *= u0; \
412
COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \
413
ACC_COMPLEX_PROD(g21, -A, g01); \
414
g21_re *= r_inv2; \
415
g21_im *= r_inv2; \
416
COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \
417
ACC_COMPLEX_PROD(g22, A, g02); \
418
g22_re *= -r_inv2; \
419
g22_im *= -r_inv2;
420
421
#define RECONSTRUCT_MATRIX_8_SINGLE(dir) \
422
float row_sum = g01_re*g01_re; \
423
row_sum += g01_im*g01_im; \
424
row_sum += g02_re*g02_re; \
425
row_sum += g02_im*g02_im; \
426
__sincosf(g21_re, &g00_im, &g00_re); \
427
__sincosf(g21_im, &g20_im, &g20_re); \
428
float2 u0_2 = (dir < 6 ? An2 : (do_boundary ? TB2 : No2)); \
429
float column_sum = u0_2.y - row_sum; \
430
float U00_mag = column_sum * rsqrtf((column_sum > 0 ? column_sum : 1e14)); \
431
g00_re *= U00_mag; \
432
g00_im *= U00_mag; \
433
column_sum += g10_re*g10_re; \
434
column_sum += g10_im*g10_im; \
435
column_sum = u0_2.y - column_sum; \
436
float U20_mag = column_sum * rsqrtf((column_sum > 0 ? column_sum : 1e14)); \
437
g20_re *= U20_mag; \
438
g20_im *= U20_mag; \
439
float r_inv2 = __fdividef(1.0f, u0_2.x*row_sum); \
440
float A_re, A_im; \
441
COMPLEX_DOT_PRODUCT(A, g00, g10); \
442
A_re *= u0_2.x; A_im *= u0_2.x; \
443
COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \
444
ACC_COMPLEX_PROD(g11, A, g01); \
445
g11_re *= -r_inv2; \
446
g11_im *= -r_inv2; \
447
COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \
448
ACC_COMPLEX_PROD(g12, -A, g02); \
449
g12_re *= r_inv2; \
450
g12_im *= r_inv2; \
451
COMPLEX_DOT_PRODUCT(A, g00, g20); \
452
A_re *= u0_2.x; A_im *= u0_2.x; \
453
COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \
454
ACC_COMPLEX_PROD(g21, -A, g01); \
455
g21_re *= r_inv2; \
456
g21_im *= r_inv2; \
457
COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \
458
ACC_COMPLEX_PROD(g22, A, g02); \
459
g22_re *= -r_inv2; \
460
g22_im *= -r_inv2;
461
462
463
464
/************* the following is added for staggered *********/
465
466
#define RECONSTRUCT_GAUGE_MATRIX_12_SINGLE(dir, gauge, idx, sign) \
467
ACC_CONJ_PROD(gauge##20, +gauge##01, +gauge##12); \
468
ACC_CONJ_PROD(gauge##20, -gauge##02, +gauge##11); \
469
ACC_CONJ_PROD(gauge##21, +gauge##02, +gauge##10); \
470
ACC_CONJ_PROD(gauge##21, -gauge##00, +gauge##12); \
471
ACC_CONJ_PROD(gauge##22, +gauge##00, +gauge##11); \
472
ACC_CONJ_PROD(gauge##22, -gauge##01, +gauge##10); \
473
{float u0 = coeff_f*sign; \
474
gauge##20_re *=u0;gauge##20_im *=u0; gauge##21_re *=u0; gauge##21_im *=u0; \
475
gauge##22_re *=u0;gauge##22_im *=u0;}
476
477
#define RECONSTRUCT_GAUGE_MATRIX_12_DOUBLE(dir, gauge, idx, sign) \
478
ACC_CONJ_PROD(gauge##20, +gauge##01, +gauge##12); \
479
ACC_CONJ_PROD(gauge##20, -gauge##02, +gauge##11); \
480
ACC_CONJ_PROD(gauge##21, +gauge##02, +gauge##10); \
481
ACC_CONJ_PROD(gauge##21, -gauge##00, +gauge##12); \
482
ACC_CONJ_PROD(gauge##22, +gauge##00, +gauge##11); \
483
ACC_CONJ_PROD(gauge##22, -gauge##01, +gauge##10); \
484
{double u0 = coeff* sign; \
485
gauge##20_re *=u0;gauge##20_im *=u0; gauge##21_re *=u0; gauge##21_im *=u0; \
486
gauge##22_re *=u0;gauge##22_im *=u0;}
487
488
#define RECONSTRUCT_GAUGE_MATRIX_8_DOUBLE(dir, gauge, idx, sign) \
489
double row_sum = gauge##01_re*gauge##01_re + gauge##01_im*gauge##01_im; \
490
row_sum += gauge##02_re*gauge##02_re + gauge##02_im*gauge##02_im; \
491
double u0 = coeff*sign; \
492
double u02_inv = 1.0 / (u0*u0); \
493
double column_sum = u02_inv - row_sum; \
494
double U00_mag = sqrt(column_sum); \
495
sincos(gauge##21_re, &gauge##00_im, &gauge##00_re); \
496
gauge##00_re *= U00_mag; \
497
gauge##00_im *= U00_mag; \
498
column_sum += gauge##10_re*gauge##10_re; \
499
column_sum += gauge##10_im*gauge##10_im; \
500
sincos(gauge##21_im, &gauge##20_im, &gauge##20_re); \
501
double U20_mag = sqrt(u02_inv - column_sum); \
502
gauge##20_re *= U20_mag; \
503
gauge##20_im *= U20_mag; \
504
double r_inv2 = 1.0 / (u0*row_sum); \
505
double A_re, A_im; \
506
COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##10); \
507
A_re *= u0; A_im *= u0; \
508
COMPLEX_CONJUGATE_PRODUCT(gauge##11, gauge##20, gauge##02); \
509
ACC_COMPLEX_PROD(gauge##11, A, gauge##01); \
510
gauge##11_re *= -r_inv2; \
511
gauge##11_im *= -r_inv2; \
512
COMPLEX_CONJUGATE_PRODUCT(gauge##12, gauge##20, gauge##01); \
513
ACC_COMPLEX_PROD(gauge##12, -A, gauge##02); \
514
gauge##12_re *= r_inv2; \
515
gauge##12_im *= r_inv2; \
516
COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##20); \
517
A_re *= u0; A_im *= u0; \
518
COMPLEX_CONJUGATE_PRODUCT(gauge##21, gauge##10, gauge##02); \
519
ACC_COMPLEX_PROD(gauge##21, -A, gauge##01); \
520
gauge##21_re *= r_inv2; \
521
gauge##21_im *= r_inv2; \
522
COMPLEX_CONJUGATE_PRODUCT(gauge##22, gauge##10, gauge##01); \
523
ACC_COMPLEX_PROD(gauge##22, A, gauge##02); \
524
gauge##22_re *= -r_inv2; \
525
gauge##22_im *= -r_inv2;
526
527
#define RECONSTRUCT_GAUGE_MATRIX_8_SINGLE(dir, gauge, idx, sign) { \
528
float row_sum = gauge##01_re*gauge##01_re + gauge##01_im*gauge##01_im; \
529
row_sum += gauge##02_re*gauge##02_re + gauge##02_im*gauge##02_im; \
530
float u0 = coeff_f*sign; \
531
float u02_inv = __fdividef(1.f, u0*u0); \
532
float column_sum = u02_inv - row_sum; \
533
float U00_mag = sqrtf(column_sum > 0 ?column_sum:0); \
534
__sincosf(gauge##21_re, &gauge##00_im, &gauge##00_re); \
535
gauge##00_re *= U00_mag; \
536
gauge##00_im *= U00_mag; \
537
column_sum += gauge##10_re*gauge##10_re; \
538
column_sum += gauge##10_im*gauge##10_im; \
539
__sincosf(gauge##21_im, &gauge##20_im, &gauge##20_re); \
540
float U20_mag = sqrtf( (u02_inv - column_sum)>0? (u02_inv - column_sum): 0); \
541
gauge##20_re *= U20_mag; \
542
gauge##20_im *= U20_mag; \
543
float r_inv2 = __fdividef(1.0f, u0*row_sum); \
544
float A_re, A_im; \
545
COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##10); \
546
A_re *= u0; A_im *= u0; \
547
COMPLEX_CONJUGATE_PRODUCT(gauge##11, gauge##20, gauge##02); \
548
ACC_COMPLEX_PROD(gauge##11, A, gauge##01); \
549
gauge##11_re *= -r_inv2; \
550
gauge##11_im *= -r_inv2; \
551
COMPLEX_CONJUGATE_PRODUCT(gauge##12, gauge##20, gauge##01); \
552
ACC_COMPLEX_PROD(gauge##12, -A, gauge##02); \
553
gauge##12_re *= r_inv2; \
554
gauge##12_im *= r_inv2; \
555
COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##20); \
556
A_re *= u0; A_im *= u0; \
557
COMPLEX_CONJUGATE_PRODUCT(gauge##21, gauge##10, gauge##02); \
558
ACC_COMPLEX_PROD(gauge##21, -A, gauge##01); \
559
gauge##21_re *= r_inv2; \
560
gauge##21_im *= r_inv2; \
561
COMPLEX_CONJUGATE_PRODUCT(gauge##22, gauge##10, gauge##01); \
562
ACC_COMPLEX_PROD(gauge##22, A, gauge##02); \
563
gauge##22_re *= -r_inv2; \
564
gauge##22_im *= -r_inv2;}
565
566
// Fermi patch to disable double-precision texture reads
567
#ifdef FERMI_NO_DBLE_TEX
568
#define READ_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
569
READ_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride)
570
#define READ_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
571
READ_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride)
572
#define READ_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
573
READ_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride)
574
577
#define ASSN_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
578
ASSN_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride)
579
#define ASSN_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
580
ASSN_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride)
581
#define ASSN_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
582
ASSN_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride)
583
584
#else
585
586
#define READ_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
587
double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*9+0)*stride); \
588
double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*9+1)*stride); \
589
double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*9+2)*stride); \
590
double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*9+3)*stride); \
591
double2 G##4 = fetch_double2((gauge), idx + ((dir/2)*9+4)*stride); \
592
double2 G##5 = fetch_double2((gauge), idx + ((dir/2)*9+5)*stride); \
593
double2 G##6 = fetch_double2((gauge), idx + ((dir/2)*9+6)*stride); \
594
double2 G##7 = fetch_double2((gauge), idx + ((dir/2)*9+7)*stride); \
595
double2 G##8 = fetch_double2((gauge), idx + ((dir/2)*9+8)*stride); \
596
597
#define READ_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
598
double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*6+0)*stride); \
599
double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*6+1)*stride); \
600
double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*6+2)*stride); \
601
double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*6+3)*stride); \
602
double2 G##4 = fetch_double2((gauge), idx + ((dir/2)*6+4)*stride); \
603
double2 G##5 = fetch_double2((gauge), idx + ((dir/2)*6+5)*stride); \
604
double2 G##6 = make_double2(0,0); \
605
double2 G##7 = make_double2(0,0); \
606
double2 G##8 = make_double2(0,0); \
607
608
// set A to be last components of G4 (otherwise unused)
609
#define READ_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
610
double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*4+0)*stride); \
611
double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*4+1)*stride); \
612
double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*4+2)*stride); \
613
double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*4+3)*stride); \
614
double2 G##4 = make_double2(0,0); \
615
double2 G##5 = make_double2(0,0); \
616
double2 G##6 = make_double2(0,0); \
617
double2 G##7 = make_double2(0,0); \
618
double2 G##8 = make_double2(0,0); \
619
(G##7).x = (G##0).x; \
620
(G##7).y = (G##0).y;
621
624
#define ASSN_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
625
G##0 = fetch_double2((gauge), idx + ((dir/2)*9+0)*stride); \
626
G##1 = fetch_double2((gauge), idx + ((dir/2)*9+1)*stride); \
627
G##2 = fetch_double2((gauge), idx + ((dir/2)*9+2)*stride); \
628
G##3 = fetch_double2((gauge), idx + ((dir/2)*9+3)*stride); \
629
G##4 = fetch_double2((gauge), idx + ((dir/2)*9+4)*stride); \
630
G##5 = fetch_double2((gauge), idx + ((dir/2)*9+5)*stride); \
631
G##6 = fetch_double2((gauge), idx + ((dir/2)*9+6)*stride); \
632
G##7 = fetch_double2((gauge), idx + ((dir/2)*9+7)*stride); \
633
G##8 = fetch_double2((gauge), idx + ((dir/2)*9+8)*stride); \
634
635
#define ASSN_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
636
G##0 = fetch_double2((gauge), idx + ((dir/2)*6+0)*stride); \
637
G##1 = fetch_double2((gauge), idx + ((dir/2)*6+1)*stride); \
638
G##2 = fetch_double2((gauge), idx + ((dir/2)*6+2)*stride); \
639
G##3 = fetch_double2((gauge), idx + ((dir/2)*6+3)*stride); \
640
G##4 = fetch_double2((gauge), idx + ((dir/2)*6+4)*stride); \
641
G##5 = fetch_double2((gauge), idx + ((dir/2)*6+5)*stride); \
642
G##6 = make_double2(0,0); \
643
G##7 = make_double2(0,0); \
644
G##8 = make_double2(0,0); \
645
646
// set A to be last components of G4 (otherwise unused)
647
#define ASSN_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \
648
G##0 = fetch_double2((gauge), idx + ((dir/2)*4+0)*stride); \
649
G##1 = fetch_double2((gauge), idx + ((dir/2)*4+1)*stride); \
650
G##2 = fetch_double2((gauge), idx + ((dir/2)*4+2)*stride); \
651
G##3 = fetch_double2((gauge), idx + ((dir/2)*4+3)*stride); \
652
G##4 = make_double2(0,0); \
653
G##5 = make_double2(0,0); \
654
G##6 = make_double2(0,0); \
655
G##7 = make_double2(0,0); \
656
G##8 = make_double2(0,0); \
657
(G##7).x = (G##0).x; \
658
(G##7).y = (G##0).y;
659
660
#endif
661
Generated on Wed Mar 20 2013 12:52:17 for QUDA by
1.8.2