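/*
 * QUDA v0.7.0 - A library for QCD on GPUs
 *
 * Source listing of quda/lib/force_common.h, recovered from the generated
 * HTML documentation page ("Go to the documentation of this file.").
 *
 * Page navigation: Main Page | Namespaces | Classes | Files | File List |
 *                  File Members
 * Search filter:   All | Classes | Namespaces | Files | Functions |
 *                  Variables | Typedefs | Enumerations | Enumerator |
 *                  Friends | Macros | Pages
 */
#ifndef _FORCE_COMMON_H
#define _FORCE_COMMON_H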
/*
 * Direction indices. The four "UP" directions take the values 0..3 and the
 * four "DOWN" directions 4..7, arranged so that a direction and its opposite
 * always sum to 7 (see OPP_DIR).
 */
enum {
  XUP   = 0,
  YUP   = 1,
  ZUP   = 2,
  TUP   = 3,
  TDOWN = 4,
  ZDOWN = 5,
  YDOWN = 6,
  XDOWN = 7
};
/*
 * Direction helpers for the XUP..XDOWN direction indices (0..7).
 *
 * OPP_DIR(dir)        - index of the opposite direction (7 - dir).
 * GOES_FORWARDS(dir)  - non-zero when dir is one of the values 0..3.
 * GOES_BACKWARDS(dir) - non-zero when dir is greater than 3.
 *
 * The argument is parenthesized in every expansion so that expressions
 * containing low-precedence operators (e.g. `d & 4`, `c ? a : b`) are
 * evaluated as a unit before the comparison.
 */
#define OPP_DIR(dir)        (7 - (dir))
#define GOES_FORWARDS(dir)  ((dir) <= 3)
#define GOES_BACKWARDS(dir) ((dir) > 3)
/*
 * Element aliases for the conjugate transpose of the 3x3 complex matrix
 * "linka": linkaT<i><j> is the complex conjugate of linka<j><i>
 * (real part unchanged, imaginary part negated).
 */
#define linkaT00_re (+linka00_re)
#define linkaT00_im (-linka00_im)
#define linkaT01_re (+linka10_re)
#define linkaT01_im (-linka10_im)
#define linkaT02_re (+linka20_re)
#define linkaT02_im (-linka20_im)
#define linkaT10_re (+linka01_re)
#define linkaT10_im (-linka01_im)
#define linkaT11_re (+linka11_re)
#define linkaT11_im (-linka11_im)
#define linkaT12_re (+linka21_re)
#define linkaT12_im (-linka21_im)
#define linkaT20_re (+linka02_re)
#define linkaT20_im (-linka02_im)
#define linkaT21_re (+linka12_re)
#define linkaT21_im (-linka12_im)
#define linkaT22_re (+linka22_re)
#define linkaT22_im (-linka22_im)
/*
 * Element aliases for the conjugate transpose of the 3x3 complex matrix
 * "linkb": linkbT<i><j> is the complex conjugate of linkb<j><i>
 * (real part unchanged, imaginary part negated).
 */
#define linkbT00_re (+linkb00_re)
#define linkbT00_im (-linkb00_im)
#define linkbT01_re (+linkb10_re)
#define linkbT01_im (-linkb10_im)
#define linkbT02_re (+linkb20_re)
#define linkbT02_im (-linkb20_im)
#define linkbT10_re (+linkb01_re)
#define linkbT10_im (-linkb01_im)
#define linkbT11_re (+linkb11_re)
#define linkbT11_im (-linkb11_im)
#define linkbT12_re (+linkb21_re)
#define linkbT12_im (-linkb21_im)
#define linkbT20_re (+linkb02_re)
#define linkbT20_im (-linkb02_im)
#define linkbT21_re (+linkb12_re)
#define linkbT21_im (-linkb12_im)
#define linkbT22_re (+linkb22_re)
#define linkbT22_im (-linkb22_im)
/*
 * Element aliases for the 3x3 complex matrix "linkc". The 18 real numbers
 * are read from the .x/.y/.z/.w members of LINKC0..LINKC4 in row-major
 * order (re, im interleaved); LINKC4 supplies only .x and .y.
 * LINKC0..LINKC4 themselves are not defined in this header.
 */
#define linkc00_re LINKC0.x
#define linkc00_im LINKC0.y
#define linkc01_re LINKC0.z
#define linkc01_im LINKC0.w
#define linkc02_re LINKC1.x
#define linkc02_im LINKC1.y
#define linkc10_re LINKC1.z
#define linkc10_im LINKC1.w
#define linkc11_re LINKC2.x
#define linkc11_im LINKC2.y
#define linkc12_re LINKC2.z
#define linkc12_im LINKC2.w
#define linkc20_re LINKC3.x
#define linkc20_im LINKC3.y
#define linkc21_re LINKC3.z
#define linkc21_im LINKC3.w
#define linkc22_re LINKC4.x
#define linkc22_im LINKC4.y
/*
 * Element aliases for the conjugate transpose of the 3x3 complex matrix
 * "linkc": linkcT<i><j> is the complex conjugate of linkc<j><i>
 * (real part unchanged, imaginary part negated).
 */
#define linkcT00_re (+linkc00_re)
#define linkcT00_im (-linkc00_im)
#define linkcT01_re (+linkc10_re)
#define linkcT01_im (-linkc10_im)
#define linkcT02_re (+linkc20_re)
#define linkcT02_im (-linkc20_im)
#define linkcT10_re (+linkc01_re)
#define linkcT10_im (-linkc01_im)
#define linkcT11_re (+linkc11_re)
#define linkcT11_im (-linkc11_im)
#define linkcT12_re (+linkc21_re)
#define linkcT12_im (-linkc21_im)
#define linkcT20_re (+linkc02_re)
#define linkcT20_im (-linkc02_im)
#define linkcT21_re (+linkc12_re)
#define linkcT21_im (-linkc12_im)
#define linkcT22_re (+linkc22_re)
#define linkcT22_im (-linkc22_im)
/*
 * Element aliases for the 3x3 complex matrix "staple". Each complex element
 * is read from one of STAPLE0..STAPLE8 (row-major order), with .x holding
 * the real part and .y the imaginary part. STAPLE0..STAPLE8 themselves are
 * not defined in this header.
 */
#define staple00_re STAPLE0.x
#define staple00_im STAPLE0.y
#define staple01_re STAPLE1.x
#define staple01_im STAPLE1.y
#define staple02_re STAPLE2.x
#define staple02_im STAPLE2.y
#define staple10_re STAPLE3.x
#define staple10_im STAPLE3.y
#define staple11_re STAPLE4.x
#define staple11_im STAPLE4.y
#define staple12_re STAPLE5.x
#define staple12_im STAPLE5.y
#define staple20_re STAPLE6.x
#define staple20_im STAPLE6.y
#define staple21_re STAPLE7.x
#define staple21_im STAPLE7.y
#define staple22_re STAPLE8.x
#define staple22_im STAPLE8.y
/*
 * Element aliases for the conjugate transpose of the 3x3 complex matrix
 * "staple": stapleT<i><j> is the complex conjugate of staple<j><i>
 * (real part unchanged, imaginary part negated).
 */
#define stapleT00_re (+staple00_re)
#define stapleT00_im (-staple00_im)
#define stapleT01_re (+staple10_re)
#define stapleT01_im (-staple10_im)
#define stapleT02_re (+staple20_re)
#define stapleT02_im (-staple20_im)
#define stapleT10_re (+staple01_re)
#define stapleT10_im (-staple01_im)
#define stapleT11_re (+staple11_re)
#define stapleT11_im (-staple11_im)
#define stapleT12_re (+staple21_re)
#define stapleT12_im (-staple21_im)
#define stapleT20_re (+staple02_re)
#define stapleT20_im (-staple02_im)
#define stapleT21_re (+staple12_re)
#define stapleT21_im (-staple12_im)
#define stapleT22_re (+staple22_re)
#define stapleT22_im (-staple22_im)
/*
 * READ_DOUBLE2_TEXTURE(x_tex, x, i): read element i of a field.
 *
 * When FERMI_NO_DBLE_TEX is defined the element is read directly from the
 * array/pointer `x` and `x_tex` is ignored; otherwise it is fetched through
 * fetch_double2_old(x_tex, i) and `x` is ignored. fetch_double2_old is not
 * defined in this header and must be provided by the including code.
 */
#ifdef FERMI_NO_DBLE_TEX
#define READ_DOUBLE2_TEXTURE(x_tex, x, i) (x)[i]
#else
#define READ_DOUBLE2_TEXTURE(x_tex, x, i) fetch_double2_old(x_tex, i)
#endif
/*
 * LOAD_MATRIX_12_SINGLE(gauge, dir, idx, var, stride)
 *
 * Loads three strided elements of `gauge` into var0, var1, var2. Element k
 * is read from gauge[idx + dir*stride*3 + k*stride].
 *
 * All index arguments are parenthesized so that expression arguments
 * (e.g. `d + 1`) produce the intended offset. Arguments are evaluated more
 * than once; avoid side effects.
 */
#define LOAD_MATRIX_12_SINGLE(gauge, dir, idx, var, stride) do { \
    var##0 = (gauge)[(idx) + (dir)*(stride)*3]; \
    var##1 = (gauge)[(idx) + (dir)*(stride)*3 + (stride)]; \
    var##2 = (gauge)[(idx) + (dir)*(stride)*3 + (stride)*2]; \
  } while (0)
/*
 * LOAD_MATRIX_12_SINGLE_TEX(gauge, dir, idx, var, stride)
 *
 * Same addressing as LOAD_MATRIX_12_SINGLE, but each of the three elements
 * is obtained with tex1Dfetch(gauge, offset), where
 * offset = idx + dir*stride*3 + k*stride for k = 0..2.
 * tex1Dfetch is not defined in this header.
 *
 * Index arguments are parenthesized so expression arguments expand safely.
 */
#define LOAD_MATRIX_12_SINGLE_TEX(gauge, dir, idx, var, stride) do { \
    var##0 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*3); \
    var##1 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*3 + (stride)); \
    var##2 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*3 + (stride)*2); \
  } while (0)
/*
 * LOAD_MATRIX_12_DOUBLE(gauge, dir, idx, var, stride)
 *
 * Loads six strided elements of `gauge` into var0..var5. Element k is read
 * from gauge[idx + dir*stride*6 + k*stride].
 *
 * All index arguments are parenthesized so that expression arguments
 * produce the intended offset. Arguments are evaluated more than once.
 */
#define LOAD_MATRIX_12_DOUBLE(gauge, dir, idx, var, stride) do { \
    var##0 = (gauge)[(idx) + (dir)*(stride)*6]; \
    var##1 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)]; \
    var##2 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*2]; \
    var##3 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*3]; \
    var##4 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*4]; \
    var##5 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*5]; \
  } while (0)
/*
 * LOAD_MATRIX_12_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride)
 *
 * Same addressing as LOAD_MATRIX_12_DOUBLE (offset = idx + dir*stride*6 +
 * k*stride for k = 0..5), but each element is obtained through
 * READ_DOUBLE2_TEXTURE(gauge_tex, gauge, offset).
 *
 * Index arguments are parenthesized so expression arguments expand safely.
 */
#define LOAD_MATRIX_12_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride) do { \
    var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6); \
    var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)); \
    var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*2); \
    var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*3); \
    var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*4); \
    var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*5); \
  } while (0)
/*
 * LOAD_MATRIX_18(gauge, dir, idx, var, stride)
 *
 * Loads nine strided elements of `gauge` into var0..var8. Element k is read
 * from gauge[idx + dir*stride*9 + k*stride].
 *
 * All index arguments are parenthesized so that expression arguments
 * produce the intended offset. Arguments are evaluated more than once.
 */
#define LOAD_MATRIX_18(gauge, dir, idx, var, stride) do { \
    var##0 = (gauge)[(idx) + (dir)*(stride)*9]; \
    var##1 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)]; \
    var##2 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*2]; \
    var##3 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*3]; \
    var##4 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*4]; \
    var##5 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*5]; \
    var##6 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*6]; \
    var##7 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*7]; \
    var##8 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*8]; \
  } while (0)
/*
 * LOAD_MATRIX_18_SINGLE_TEX(gauge, dir, idx, var, stride)
 *
 * Same addressing as LOAD_MATRIX_18 (offset = idx + dir*stride*9 + k*stride
 * for k = 0..8), but each element is obtained with
 * tex1Dfetch(gauge, offset). tex1Dfetch is not defined in this header.
 *
 * Index arguments are parenthesized so expression arguments expand safely.
 */
#define LOAD_MATRIX_18_SINGLE_TEX(gauge, dir, idx, var, stride) do { \
    var##0 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9); \
    var##1 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)); \
    var##2 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*2); \
    var##3 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*3); \
    var##4 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*4); \
    var##5 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*5); \
    var##6 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*6); \
    var##7 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*7); \
    var##8 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*8); \
  } while (0)
/*
 * LOAD_MATRIX_18_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride)
 *
 * Same addressing as LOAD_MATRIX_18 (offset = idx + dir*stride*9 + k*stride
 * for k = 0..8), but each element is obtained through
 * READ_DOUBLE2_TEXTURE(gauge_tex, gauge, offset).
 *
 * Index arguments are parenthesized so expression arguments expand safely.
 */
#define LOAD_MATRIX_18_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride) do { \
    var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9); \
    var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)); \
    var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*2); \
    var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*3); \
    var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*4); \
    var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*5); \
    var##6 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*6); \
    var##7 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*7); \
    var##8 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*8); \
  } while (0)
/*
 * MULT_SU3_NN(ma, mb, mc): 3x3 complex matrix product  mc = ma * mb.
 *
 * ma, mb and mc are identifier prefixes; element (i,j) is accessed as
 * <prefix><i><j>_re / <prefix><i><j>_im. Results are written column by
 * column (00, 10, 20, 01, ...).
 *
 * NOTE: the macro expands to a bare sequence of 18 assignment statements
 * (not wrapped in do { } while (0)), so it must not be used as the unbraced
 * body of an if/for/while. mc must not share a prefix with ma or mb, since
 * outputs are written while inputs are still being read.
 */
#define MULT_SU3_NN(ma, mb, mc) \
  mc##00_re = \
    ma##00_re * mb##00_re - ma##00_im * mb##00_im + \
    ma##01_re * mb##10_re - ma##01_im * mb##10_im + \
    ma##02_re * mb##20_re - ma##02_im * mb##20_im; \
  mc##00_im = \
    ma##00_re * mb##00_im + ma##00_im * mb##00_re + \
    ma##01_re * mb##10_im + ma##01_im * mb##10_re + \
    ma##02_re * mb##20_im + ma##02_im * mb##20_re; \
  mc##10_re = \
    ma##10_re * mb##00_re - ma##10_im * mb##00_im + \
    ma##11_re * mb##10_re - ma##11_im * mb##10_im + \
    ma##12_re * mb##20_re - ma##12_im * mb##20_im; \
  mc##10_im = \
    ma##10_re * mb##00_im + ma##10_im * mb##00_re + \
    ma##11_re * mb##10_im + ma##11_im * mb##10_re + \
    ma##12_re * mb##20_im + ma##12_im * mb##20_re; \
  mc##20_re = \
    ma##20_re * mb##00_re - ma##20_im * mb##00_im + \
    ma##21_re * mb##10_re - ma##21_im * mb##10_im + \
    ma##22_re * mb##20_re - ma##22_im * mb##20_im; \
  mc##20_im = \
    ma##20_re * mb##00_im + ma##20_im * mb##00_re + \
    ma##21_re * mb##10_im + ma##21_im * mb##10_re + \
    ma##22_re * mb##20_im + ma##22_im * mb##20_re; \
  mc##01_re = \
    ma##00_re * mb##01_re - ma##00_im * mb##01_im + \
    ma##01_re * mb##11_re - ma##01_im * mb##11_im + \
    ma##02_re * mb##21_re - ma##02_im * mb##21_im; \
  mc##01_im = \
    ma##00_re * mb##01_im + ma##00_im * mb##01_re + \
    ma##01_re * mb##11_im + ma##01_im * mb##11_re + \
    ma##02_re * mb##21_im + ma##02_im * mb##21_re; \
  mc##11_re = \
    ma##10_re * mb##01_re - ma##10_im * mb##01_im + \
    ma##11_re * mb##11_re - ma##11_im * mb##11_im + \
    ma##12_re * mb##21_re - ma##12_im * mb##21_im; \
  mc##11_im = \
    ma##10_re * mb##01_im + ma##10_im * mb##01_re + \
    ma##11_re * mb##11_im + ma##11_im * mb##11_re + \
    ma##12_re * mb##21_im + ma##12_im * mb##21_re; \
  mc##21_re = \
    ma##20_re * mb##01_re - ma##20_im * mb##01_im + \
    ma##21_re * mb##11_re - ma##21_im * mb##11_im + \
    ma##22_re * mb##21_re - ma##22_im * mb##21_im; \
  mc##21_im = \
    ma##20_re * mb##01_im + ma##20_im * mb##01_re + \
    ma##21_re * mb##11_im + ma##21_im * mb##11_re + \
    ma##22_re * mb##21_im + ma##22_im * mb##21_re; \
  mc##02_re = \
    ma##00_re * mb##02_re - ma##00_im * mb##02_im + \
    ma##01_re * mb##12_re - ma##01_im * mb##12_im + \
    ma##02_re * mb##22_re - ma##02_im * mb##22_im; \
  mc##02_im = \
    ma##00_re * mb##02_im + ma##00_im * mb##02_re + \
    ma##01_re * mb##12_im + ma##01_im * mb##12_re + \
    ma##02_re * mb##22_im + ma##02_im * mb##22_re; \
  mc##12_re = \
    ma##10_re * mb##02_re - ma##10_im * mb##02_im + \
    ma##11_re * mb##12_re - ma##11_im * mb##12_im + \
    ma##12_re * mb##22_re - ma##12_im * mb##22_im; \
  mc##12_im = \
    ma##10_re * mb##02_im + ma##10_im * mb##02_re + \
    ma##11_re * mb##12_im + ma##11_im * mb##12_re + \
    ma##12_re * mb##22_im + ma##12_im * mb##22_re; \
  mc##22_re = \
    ma##20_re * mb##02_re - ma##20_im * mb##02_im + \
    ma##21_re * mb##12_re - ma##21_im * mb##12_im + \
    ma##22_re * mb##22_re - ma##22_im * mb##22_im; \
  mc##22_im = \
    ma##20_re * mb##02_im + ma##20_im * mb##02_re + \
    ma##21_re * mb##12_im + ma##21_im * mb##12_re + \
    ma##22_re * mb##22_im + ma##22_im * mb##22_re;
/*
 * MULT_SU3_NA(ma, mb, mc): 3x3 complex matrix product  mc = ma * mbT.
 *
 * The second operand is read through the <mb>T<i><j>_re/_im names, so a
 * matching set of "T" element names must exist for the mb prefix (this
 * header provides conjugate-transpose aliases for linka, linkb, linkc and
 * staple). Element (i,j) of ma/mc is <prefix><i><j>_re / _im.
 *
 * NOTE: the macro expands to a bare sequence of 18 assignment statements
 * (not wrapped in do { } while (0)), so it must not be used as the unbraced
 * body of an if/for/while. mc must not share a prefix with ma or mb.
 */
#define MULT_SU3_NA(ma, mb, mc) \
  mc##00_re = \
    ma##00_re * mb##T00_re - ma##00_im * mb##T00_im + \
    ma##01_re * mb##T10_re - ma##01_im * mb##T10_im + \
    ma##02_re * mb##T20_re - ma##02_im * mb##T20_im; \
  mc##00_im = \
    ma##00_re * mb##T00_im + ma##00_im * mb##T00_re + \
    ma##01_re * mb##T10_im + ma##01_im * mb##T10_re + \
    ma##02_re * mb##T20_im + ma##02_im * mb##T20_re; \
  mc##10_re = \
    ma##10_re * mb##T00_re - ma##10_im * mb##T00_im + \
    ma##11_re * mb##T10_re - ma##11_im * mb##T10_im + \
    ma##12_re * mb##T20_re - ma##12_im * mb##T20_im; \
  mc##10_im = \
    ma##10_re * mb##T00_im + ma##10_im * mb##T00_re + \
    ma##11_re * mb##T10_im + ma##11_im * mb##T10_re + \
    ma##12_re * mb##T20_im + ma##12_im * mb##T20_re; \
  mc##20_re = \
    ma##20_re * mb##T00_re - ma##20_im * mb##T00_im + \
    ma##21_re * mb##T10_re - ma##21_im * mb##T10_im + \
    ma##22_re * mb##T20_re - ma##22_im * mb##T20_im; \
  mc##20_im = \
    ma##20_re * mb##T00_im + ma##20_im * mb##T00_re + \
    ma##21_re * mb##T10_im + ma##21_im * mb##T10_re + \
    ma##22_re * mb##T20_im + ma##22_im * mb##T20_re; \
  mc##01_re = \
    ma##00_re * mb##T01_re - ma##00_im * mb##T01_im + \
    ma##01_re * mb##T11_re - ma##01_im * mb##T11_im + \
    ma##02_re * mb##T21_re - ma##02_im * mb##T21_im; \
  mc##01_im = \
    ma##00_re * mb##T01_im + ma##00_im * mb##T01_re + \
    ma##01_re * mb##T11_im + ma##01_im * mb##T11_re + \
    ma##02_re * mb##T21_im + ma##02_im * mb##T21_re; \
  mc##11_re = \
    ma##10_re * mb##T01_re - ma##10_im * mb##T01_im + \
    ma##11_re * mb##T11_re - ma##11_im * mb##T11_im + \
    ma##12_re * mb##T21_re - ma##12_im * mb##T21_im; \
  mc##11_im = \
    ma##10_re * mb##T01_im + ma##10_im * mb##T01_re + \
    ma##11_re * mb##T11_im + ma##11_im * mb##T11_re + \
    ma##12_re * mb##T21_im + ma##12_im * mb##T21_re; \
  mc##21_re = \
    ma##20_re * mb##T01_re - ma##20_im * mb##T01_im + \
    ma##21_re * mb##T11_re - ma##21_im * mb##T11_im + \
    ma##22_re * mb##T21_re - ma##22_im * mb##T21_im; \
  mc##21_im = \
    ma##20_re * mb##T01_im + ma##20_im * mb##T01_re + \
    ma##21_re * mb##T11_im + ma##21_im * mb##T11_re + \
    ma##22_re * mb##T21_im + ma##22_im * mb##T21_re; \
  mc##02_re = \
    ma##00_re * mb##T02_re - ma##00_im * mb##T02_im + \
    ma##01_re * mb##T12_re - ma##01_im * mb##T12_im + \
    ma##02_re * mb##T22_re - ma##02_im * mb##T22_im; \
  mc##02_im = \
    ma##00_re * mb##T02_im + ma##00_im * mb##T02_re + \
    ma##01_re * mb##T12_im + ma##01_im * mb##T12_re + \
    ma##02_re * mb##T22_im + ma##02_im * mb##T22_re; \
  mc##12_re = \
    ma##10_re * mb##T02_re - ma##10_im * mb##T02_im + \
    ma##11_re * mb##T12_re - ma##11_im * mb##T12_im + \
    ma##12_re * mb##T22_re - ma##12_im * mb##T22_im; \
  mc##12_im = \
    ma##10_re * mb##T02_im + ma##10_im * mb##T02_re + \
    ma##11_re * mb##T12_im + ma##11_im * mb##T12_re + \
    ma##12_re * mb##T22_im + ma##12_im * mb##T22_re; \
  mc##22_re = \
    ma##20_re * mb##T02_re - ma##20_im * mb##T02_im + \
    ma##21_re * mb##T12_re - ma##21_im * mb##T12_im + \
    ma##22_re * mb##T22_re - ma##22_im * mb##T22_im; \
  mc##22_im = \
    ma##20_re * mb##T02_im + ma##20_im * mb##T02_re + \
    ma##21_re * mb##T12_im + ma##21_im * mb##T12_re + \
    ma##22_re * mb##T22_im + ma##22_im * mb##T22_re;
/*
 * MULT_SU3_AN(ma, mb, mc): 3x3 complex matrix product  mc = maT * mb.
 *
 * The first operand is read through the <ma>T<i><j>_re/_im names, so a
 * matching set of "T" element names must exist for the ma prefix (this
 * header provides conjugate-transpose aliases for linka, linkb, linkc and
 * staple). Element (i,j) of mb/mc is <prefix><i><j>_re / _im.
 *
 * NOTE: the macro expands to a bare sequence of 18 assignment statements
 * (not wrapped in do { } while (0)), so it must not be used as the unbraced
 * body of an if/for/while. mc must not share a prefix with ma or mb.
 */
#define MULT_SU3_AN(ma, mb, mc) \
  mc##00_re = \
    ma##T00_re * mb##00_re - ma##T00_im * mb##00_im + \
    ma##T01_re * mb##10_re - ma##T01_im * mb##10_im + \
    ma##T02_re * mb##20_re - ma##T02_im * mb##20_im; \
  mc##00_im = \
    ma##T00_re * mb##00_im + ma##T00_im * mb##00_re + \
    ma##T01_re * mb##10_im + ma##T01_im * mb##10_re + \
    ma##T02_re * mb##20_im + ma##T02_im * mb##20_re; \
  mc##10_re = \
    ma##T10_re * mb##00_re - ma##T10_im * mb##00_im + \
    ma##T11_re * mb##10_re - ma##T11_im * mb##10_im + \
    ma##T12_re * mb##20_re - ma##T12_im * mb##20_im; \
  mc##10_im = \
    ma##T10_re * mb##00_im + ma##T10_im * mb##00_re + \
    ma##T11_re * mb##10_im + ma##T11_im * mb##10_re + \
    ma##T12_re * mb##20_im + ma##T12_im * mb##20_re; \
  mc##20_re = \
    ma##T20_re * mb##00_re - ma##T20_im * mb##00_im + \
    ma##T21_re * mb##10_re - ma##T21_im * mb##10_im + \
    ma##T22_re * mb##20_re - ma##T22_im * mb##20_im; \
  mc##20_im = \
    ma##T20_re * mb##00_im + ma##T20_im * mb##00_re + \
    ma##T21_re * mb##10_im + ma##T21_im * mb##10_re + \
    ma##T22_re * mb##20_im + ma##T22_im * mb##20_re; \
  mc##01_re = \
    ma##T00_re * mb##01_re - ma##T00_im * mb##01_im + \
    ma##T01_re * mb##11_re - ma##T01_im * mb##11_im + \
    ma##T02_re * mb##21_re - ma##T02_im * mb##21_im; \
  mc##01_im = \
    ma##T00_re * mb##01_im + ma##T00_im * mb##01_re + \
    ma##T01_re * mb##11_im + ma##T01_im * mb##11_re + \
    ma##T02_re * mb##21_im + ma##T02_im * mb##21_re; \
  mc##11_re = \
    ma##T10_re * mb##01_re - ma##T10_im * mb##01_im + \
    ma##T11_re * mb##11_re - ma##T11_im * mb##11_im + \
    ma##T12_re * mb##21_re - ma##T12_im * mb##21_im; \
  mc##11_im = \
    ma##T10_re * mb##01_im + ma##T10_im * mb##01_re + \
    ma##T11_re * mb##11_im + ma##T11_im * mb##11_re + \
    ma##T12_re * mb##21_im + ma##T12_im * mb##21_re; \
  mc##21_re = \
    ma##T20_re * mb##01_re - ma##T20_im * mb##01_im + \
    ma##T21_re * mb##11_re - ma##T21_im * mb##11_im + \
    ma##T22_re * mb##21_re - ma##T22_im * mb##21_im; \
  mc##21_im = \
    ma##T20_re * mb##01_im + ma##T20_im * mb##01_re + \
    ma##T21_re * mb##11_im + ma##T21_im * mb##11_re + \
    ma##T22_re * mb##21_im + ma##T22_im * mb##21_re; \
  mc##02_re = \
    ma##T00_re * mb##02_re - ma##T00_im * mb##02_im + \
    ma##T01_re * mb##12_re - ma##T01_im * mb##12_im + \
    ma##T02_re * mb##22_re - ma##T02_im * mb##22_im; \
  mc##02_im = \
    ma##T00_re * mb##02_im + ma##T00_im * mb##02_re + \
    ma##T01_re * mb##12_im + ma##T01_im * mb##12_re + \
    ma##T02_re * mb##22_im + ma##T02_im * mb##22_re; \
  mc##12_re = \
    ma##T10_re * mb##02_re - ma##T10_im * mb##02_im + \
    ma##T11_re * mb##12_re - ma##T11_im * mb##12_im + \
    ma##T12_re * mb##22_re - ma##T12_im * mb##22_im; \
  mc##12_im = \
    ma##T10_re * mb##02_im + ma##T10_im * mb##02_re + \
    ma##T11_re * mb##12_im + ma##T11_im * mb##12_re + \
    ma##T12_re * mb##22_im + ma##T12_im * mb##22_re; \
  mc##22_re = \
    ma##T20_re * mb##02_re - ma##T20_im * mb##02_im + \
    ma##T21_re * mb##12_re - ma##T21_im * mb##12_im + \
    ma##T22_re * mb##22_re - ma##T22_im * mb##22_im; \
  mc##22_im = \
    ma##T20_re * mb##02_im + ma##T20_im * mb##02_re + \
    ma##T21_re * mb##12_im + ma##T21_im * mb##12_re + \
    ma##T22_re * mb##22_im + ma##T22_im * mb##22_re;
/*
 * SET_SU3_MATRIX(a, value): assign `value` to all 18 real components
 * (<a><i><j>_re and <a><i><j>_im) of the 3x3 complex matrix with prefix a.
 *
 * `value` is expanded 18 times, so it should be free of side effects.
 * The macro expands to a bare statement sequence (not do { } while (0));
 * do not use it as the unbraced body of an if/for/while.
 *
 * The final line deliberately has no trailing backslash: the original
 * definition ended with a line continuation, which would splice whatever
 * line followed the macro into its body.
 */
#define SET_SU3_MATRIX(a, value) \
  a##00_re = value; \
  a##00_im = value; \
  a##01_re = value; \
  a##01_im = value; \
  a##02_re = value; \
  a##02_im = value; \
  a##10_re = value; \
  a##10_im = value; \
  a##11_re = value; \
  a##11_im = value; \
  a##12_re = value; \
  a##12_im = value; \
  a##20_re = value; \
  a##20_im = value; \
  a##21_re = value; \
  a##21_im = value; \
  a##22_re = value; \
  a##22_im = value;
/*
 * SCALAR_MULT_ADD_SU3_MATRIX(ma, mb, s, mc): element-wise  mc = ma + mb * s
 * over all 18 real components of the 3x3 complex matrices with prefixes
 * ma, mb, mc. `s` multiplies the real and imaginary parts separately.
 *
 * `s` is parenthesized in the expansion so that an expression argument such
 * as `x + y` scales mb by the whole expression. It is expanded 18 times, so
 * it should be free of side effects.
 * The macro expands to a bare statement sequence (not do { } while (0)).
 */
#define SCALAR_MULT_ADD_SU3_MATRIX(ma, mb, s, mc) \
  mc##00_re = ma##00_re + mb##00_re * (s); \
  mc##00_im = ma##00_im + mb##00_im * (s); \
  mc##01_re = ma##01_re + mb##01_re * (s); \
  mc##01_im = ma##01_im + mb##01_im * (s); \
  mc##02_re = ma##02_re + mb##02_re * (s); \
  mc##02_im = ma##02_im + mb##02_im * (s); \
  mc##10_re = ma##10_re + mb##10_re * (s); \
  mc##10_im = ma##10_im + mb##10_im * (s); \
  mc##11_re = ma##11_re + mb##11_re * (s); \
  mc##11_im = ma##11_im + mb##11_im * (s); \
  mc##12_re = ma##12_re + mb##12_re * (s); \
  mc##12_im = ma##12_im + mb##12_im * (s); \
  mc##20_re = ma##20_re + mb##20_re * (s); \
  mc##20_im = ma##20_im + mb##20_im * (s); \
  mc##21_re = ma##21_re + mb##21_re * (s); \
  mc##21_im = ma##21_im + mb##21_im * (s); \
  mc##22_re = ma##22_re + mb##22_re * (s); \
  mc##22_im = ma##22_im + mb##22_im * (s);
/*
 * SCALAR_MULT_SUB_SU3_MATRIX(ma, mb, s, mc): element-wise  mc = ma - mb * s
 * over all 18 real components of the 3x3 complex matrices with prefixes
 * ma, mb, mc. `s` multiplies the real and imaginary parts separately.
 *
 * `s` is parenthesized in the expansion so that an expression argument such
 * as `x + y` scales mb by the whole expression. It is expanded 18 times, so
 * it should be free of side effects.
 * The macro expands to a bare statement sequence (not do { } while (0)).
 */
#define SCALAR_MULT_SUB_SU3_MATRIX(ma, mb, s, mc) \
  mc##00_re = ma##00_re - mb##00_re * (s); \
  mc##00_im = ma##00_im - mb##00_im * (s); \
  mc##01_re = ma##01_re - mb##01_re * (s); \
  mc##01_im = ma##01_im - mb##01_im * (s); \
  mc##02_re = ma##02_re - mb##02_re * (s); \
  mc##02_im = ma##02_im - mb##02_im * (s); \
  mc##10_re = ma##10_re - mb##10_re * (s); \
  mc##10_im = ma##10_im - mb##10_im * (s); \
  mc##11_re = ma##11_re - mb##11_re * (s); \
  mc##11_im = ma##11_im - mb##11_im * (s); \
  mc##12_re = ma##12_re - mb##12_re * (s); \
  mc##12_im = ma##12_im - mb##12_im * (s); \
  mc##20_re = ma##20_re - mb##20_re * (s); \
  mc##20_im = ma##20_im - mb##20_im * (s); \
  mc##21_re = ma##21_re - mb##21_re * (s); \
  mc##21_im = ma##21_im - mb##21_im * (s); \
  mc##22_re = ma##22_re - mb##22_re * (s); \
  mc##22_im = ma##22_im - mb##22_im * (s);
/*
 * Element aliases for the compressed anti-Hermitian matrix "ah", stored in
 * the .x/.y members of AH0..AH4 (ten real slots):
 *   AH0, AH1, AH2 - complex off-diagonal elements 01, 02, 12 (.x re, .y im)
 *   AH3.x, AH3.y  - imaginary parts of diagonal elements 00 and 11
 *   AH4.x         - imaginary part of diagonal element 22
 *   AH4.y         - "ahspace", the remaining slot (MAKE_ANTI_HERMITIAN
 *                   sets it to 0)
 * AH0..AH4 themselves are not defined in this header.
 */
#define ah01_re AH0.x
#define ah01_im AH0.y
#define ah02_re AH1.x
#define ah02_im AH1.y
#define ah12_re AH2.x
#define ah12_im AH2.y
#define ah00_im AH3.x
#define ah11_im AH3.y
#define ah22_im AH4.x
#define ahspace AH4.y
/*
 * UNCOMPRESS_ANTI_HERMITIAN(ah, m): expand the compressed anti-Hermitian
 * matrix with prefix `ah` into the full 3x3 complex matrix with prefix `m`.
 *
 *  - Diagonal elements get real part 0 and imaginary part <ah><ii>_im.
 *  - Upper-triangle elements 01, 02, 12 are copied from <ah>.
 *  - Lower-triangle elements are the negated conjugate of their mirror:
 *    m<ji>_re = -<ah><ij>_re, m<ji>_im = <ah><ij>_im.
 *
 * The macro expands to a bare statement sequence (not do { } while (0)).
 */
#define UNCOMPRESS_ANTI_HERMITIAN(ah, m) \
  m##00_re = 0; \
  m##00_im = ah##00_im; \
  m##11_re = 0; \
  m##11_im = ah##11_im; \
  m##22_re = 0; \
  m##22_im = ah##22_im; \
  m##01_re = ah##01_re; \
  m##01_im = ah##01_im; \
  m##10_re = -ah##01_re; \
  m##10_im = ah##01_im; \
  m##02_re = ah##02_re; \
  m##02_im = ah##02_im; \
  m##20_re = -ah##02_re; \
  m##20_im = ah##02_im; \
  m##12_re = ah##12_re; \
  m##12_im = ah##12_im; \
  m##21_re = -ah##12_re; \
  m##21_im = ah##12_im;
/*
 * MAKE_ANTI_HERMITIAN(m, ah): compute the compressed, traceless
 * anti-Hermitian form of the 3x3 complex matrix with prefix `m` and store
 * it in the elements with prefix `ah`.
 *
 *  - temp is one third of the sum of the diagonal imaginary parts; it is
 *    subtracted from each diagonal imaginary part, so the stored diagonal
 *    sums to zero (up to rounding).
 *  - Off-diagonal elements: ah<ij>_re = (m<ij>_re - m<ji>_re)/2 and
 *    ah<ij>_im = (m<ij>_im + m<ji>_im)/2 for ij = 01, 02, 12.
 *  - <ah>space is set to 0.
 *
 * The temporary takes the type of <ah>space. `__typeof__` is used instead
 * of `typeof` so the macro also compiles in strict ISO modes (for example
 * -std=c11), where the plain `typeof` keyword is not available.
 */
#define MAKE_ANTI_HERMITIAN(m, ah) do { \
    __typeof__(ah##space) temp; \
    temp = (m##00_im + m##11_im + m##22_im)*0.33333333333333333; \
    ah##00_im = (m##00_im - temp); \
    ah##11_im = (m##11_im - temp); \
    ah##22_im = (m##22_im - temp); \
    ah##01_re = (m##01_re - m##10_re)*0.5; \
    ah##02_re = (m##02_re - m##20_re)*0.5; \
    ah##12_re = (m##12_re - m##21_re)*0.5; \
    ah##01_im = (m##01_im + m##10_im)*0.5; \
    ah##02_im = (m##02_im + m##20_im)*0.5; \
    ah##12_im = (m##12_im + m##21_im)*0.5; \
    ah##space = 0; \
  } while (0)
/*
 * LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, stride)
 *
 * Loads five strided elements of `src` into var0..var4. Element k is read
 * from src[idx + dir*stride*5 + k*stride].
 *
 * All index arguments are parenthesized so that expression arguments
 * produce the intended offset. A block-local `int start_pos` holds the base
 * offset, so the arguments must not themselves refer to a variable named
 * start_pos.
 */
#define LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, stride) do { \
    int start_pos = (idx) + (dir)*(stride)*5; \
    var##0 = (src)[start_pos]; \
    var##1 = (src)[start_pos + (stride)]; \
    var##2 = (src)[start_pos + (stride)*2]; \
    var##3 = (src)[start_pos + (stride)*3]; \
    var##4 = (src)[start_pos + (stride)*4]; \
  } while (0)
/*
 * LOAD_ANTI_HERMITIAN_SINGLE_TEX(src, dir, idx, var)
 *
 * Loads five elements into var0..var4 with tex1Dfetch(src, offset), where
 * offset = idx + dir*Vh*5 + k*Vh for k = 0..4. Unlike
 * LOAD_ANTI_HERMITIAN_DIRECT there is no stride parameter: the stride is
 * the symbol Vh, which (like tex1Dfetch) is not defined in this header and
 * must be visible at the point of use.
 *
 * `idx` and `dir` are parenthesized so expression arguments expand safely.
 */
#define LOAD_ANTI_HERMITIAN_SINGLE_TEX(src, dir, idx, var) do { \
    int start_pos = (idx) + (dir)*Vh*5; \
    var##0 = tex1Dfetch(src, start_pos); \
    var##1 = tex1Dfetch(src, start_pos + Vh); \
    var##2 = tex1Dfetch(src, start_pos + Vh*2); \
    var##3 = tex1Dfetch(src, start_pos + Vh*3); \
    var##4 = tex1Dfetch(src, start_pos + Vh*4); \
  } while (0)
/*
 * WRITE_ANTI_HERMITIAN(mem, dir, idx, var, stride)
 *
 * Stores var0..var4 into `mem` with the same layout that
 * LOAD_ANTI_HERMITIAN_DIRECT reads: element k is written to
 * mem[idx + dir*stride*5 + k*stride].
 *
 * All index arguments are parenthesized so that expression arguments
 * produce the intended offset. A block-local `int start_ps` holds the base
 * offset, so the arguments must not themselves refer to a variable named
 * start_ps.
 */
#define WRITE_ANTI_HERMITIAN(mem, dir, idx, var, stride) do { \
    int start_ps = (idx) + (dir)*(stride)*5; \
    (mem)[start_ps] = var##0; \
    (mem)[start_ps + (stride)] = var##1; \
    (mem)[start_ps + (stride)*2] = var##2; \
    (mem)[start_ps + (stride)*3] = var##3; \
    (mem)[start_ps + (stride)*4] = var##4; \
  } while (0)
/*
 * COPY_SU3_MATRIX(a, b): copy all 18 real components of the 3x3 complex
 * matrix with prefix `a` (source) into the matrix with prefix `b`
 * (destination).
 *
 * The macro expands to a bare statement sequence (not do { } while (0)).
 */
#define COPY_SU3_MATRIX(a, b) \
  b##00_re = a##00_re; \
  b##00_im = a##00_im; \
  b##01_re = a##01_re; \
  b##01_im = a##01_im; \
  b##02_re = a##02_re; \
  b##02_im = a##02_im; \
  b##10_re = a##10_re; \
  b##10_im = a##10_im; \
  b##11_re = a##11_re; \
  b##11_im = a##11_im; \
  b##12_re = a##12_re; \
  b##12_im = a##12_im; \
  b##20_re = a##20_re; \
  b##20_im = a##20_im; \
  b##21_re = a##21_re; \
  b##21_im = a##21_im; \
  b##22_re = a##22_re; \
  b##22_im = a##22_im;
/*
 * SU3_ADJOINT(a, b): store the conjugate transpose of the 3x3 complex
 * matrix with prefix `a` into the matrix with prefix `b`:
 *   b<ij>_re = a<ji>_re,  b<ij>_im = -a<ji>_im.
 *
 * `a` and `b` must be different matrices: off-diagonal elements of b are
 * overwritten before the mirrored elements of a have been read.
 * The macro expands to a bare statement sequence (not do { } while (0)).
 */
#define SU3_ADJOINT(a, b) \
  b##00_re = a##00_re; \
  b##00_im = - a##00_im; \
  b##01_re = a##10_re; \
  b##01_im = - a##10_im; \
  b##02_re = a##20_re; \
  b##02_im = - a##20_im; \
  b##10_re = a##01_re; \
  b##10_im = - a##01_im; \
  b##11_re = a##11_re; \
  b##11_im = - a##11_im; \
  b##12_re = a##21_re; \
  b##12_im = - a##21_im; \
  b##20_re = a##02_re; \
  b##20_im = - a##02_im; \
  b##21_re = a##12_re; \
  b##21_im = - a##12_im; \
  b##22_re = a##22_re; \
  b##22_im = - a##22_im;
/*
 * SET_UNIT_SU3_MATRIX(a): set the 3x3 complex matrix with prefix `a` to the
 * identity: diagonal real parts become 1.0, every other component 0.
 *
 * The macro expands to a bare statement sequence (not do { } while (0)).
 */
#define SET_UNIT_SU3_MATRIX(a) \
  a##00_re = 1.0; \
  a##00_im = 0; \
  a##01_re = 0; \
  a##01_im = 0; \
  a##02_re = 0; \
  a##02_im = 0; \
  a##10_re = 0; \
  a##10_im = 0; \
  a##11_re = 1.0; \
  a##11_im = 0; \
  a##12_re = 0; \
  a##12_im = 0; \
  a##20_re = 0; \
  a##20_im = 0; \
  a##21_re = 0; \
  a##21_im = 0; \
  a##22_re = 1.0; \
  a##22_im = 0;
/*
 * ACC_CONJ_PROD_ASSIGN(a, b, c): complex-conjugated product with plain
 * assignment (nothing is accumulated into the previous value of a):
 *
 *     a = conj(b) * conj(c) = conj(b * c)
 *
 * a, b and c are element prefixes; the macro appends _re / _im. b and c may
 * carry a leading unary sign (e.g. `-x02`, `+x11`), which then applies to
 * every use of that operand.
 *
 * The expansion is four statements and the last one has no terminating
 * semicolon: the caller supplies it. Do not use the macro as the unbraced
 * body of an if/for/while.
 */
#define ACC_CONJ_PROD_ASSIGN(a, b, c) \
  a##_re  =   b##_re * c##_re; \
  a##_re -=   b##_im * c##_im; \
  a##_im  = - b##_re * c##_im; \
  a##_im -=   b##_im * c##_re
/*
 * RECONSTRUCT_LINK_12(sign, var): fill in the third row (elements 20, 21,
 * 22) of the 3x3 complex matrix with prefix `var` from its first two rows,
 * then multiply the six real components of that row by `sign`.
 *
 * Each third-row element is built from two conjugated products of first-
 * and second-row elements: the first is written with ACC_CONJ_PROD_ASSIGN,
 * the second is applied with ACC_CONJ_PROD.
 * NOTE(review): ACC_CONJ_PROD is not defined in this header; it is
 * presumably the accumulating counterpart of ACC_CONJ_PROD_ASSIGN and must
 * be visible at the point of use - confirm.
 *
 * The macro expands to a bare statement sequence (not do { } while (0)).
 */
#define RECONSTRUCT_LINK_12(sign, var) \
  ACC_CONJ_PROD_ASSIGN(var##20, +var##01, +var##12); \
  ACC_CONJ_PROD(var##20, -var##02, +var##11); \
  ACC_CONJ_PROD_ASSIGN(var##21, +var##02, +var##10); \
  ACC_CONJ_PROD(var##21, -var##00, +var##12); \
  ACC_CONJ_PROD_ASSIGN(var##22, +var##00, +var##11); \
  ACC_CONJ_PROD(var##22, -var##01, +var##10); \
  var##20_re *= sign; \
  var##20_im *= sign; \
  var##21_re *= sign; \
  var##21_im *= sign; \
  var##22_re *= sign; \
  var##22_im *= sign;
/*
 * COMPUTE_NEW_IDX_PLUS(mydir, idx): index of the neighbour of site `idx`
 * one step forward in direction mydir (0..3), with wrap-around, halved
 * (right shift by 1) and stored in new_mem_idx.
 *
 * For direction d the step is +1, +X1, +X2X1, +X3X2X1; when the coordinate
 * x<d+1> equals X<d+1>m1 the wrap-around offset is subtracted instead.
 * NOTE(review): the final `>> 1` presumably converts a full-lattice index
 * into a half-lattice index - confirm against the callers.
 *
 * Relies on names from the expansion site: new_mem_idx, x1..x4, X1,
 * X1m1..X4m1, X2X1, X3X2X1, X2X1mX1, X3X2X1mX2X1, X4X3X2X1mX3X2X1.
 * Any other value of mydir leaves new_mem_idx unchanged.
 *
 * `idx` is parenthesized so that expression arguments (e.g. `s << 1`)
 * are evaluated before the offset is applied.
 */
#define COMPUTE_NEW_IDX_PLUS(mydir, idx) do { \
    switch (mydir) { \
    case 0: \
      new_mem_idx = ((x1 == X1m1) ? (idx) - X1m1 : (idx) + 1) >> 1; \
      break; \
    case 1: \
      new_mem_idx = ((x2 == X2m1) ? (idx) - X2X1mX1 : (idx) + X1) >> 1; \
      break; \
    case 2: \
      new_mem_idx = ((x3 == X3m1) ? (idx) - X3X2X1mX2X1 : (idx) + X2X1) >> 1; \
      break; \
    case 3: \
      new_mem_idx = ((x4 == X4m1) ? (idx) - X4X3X2X1mX3X2X1 : (idx) + X3X2X1) >> 1; \
      break; \
    default: \
      break; \
    } \
  } while (0)
/*
 * COMPUTE_NEW_IDX_MINUS(mydir, idx): index of the neighbour of site `idx`
 * one step backward in direction mydir (0..3), with wrap-around, stored in
 * new_mem_idx.
 *
 * For direction d the step is -1, -X1, -X2X1, -X3X2X1; when the coordinate
 * x<d+1> is 0 the wrap-around offset is added instead.
 *
 * The interior (non-boundary) branch is computed from `idx`. The original
 * definition used the caller-scope variable X there (X-1, X-X1, ...), which
 * ignored the macro argument and disagreed with the boundary branch; the
 * two forms are identical whenever the caller passes X as idx.
 * NOTE(review): unlike COMPUTE_NEW_IDX_PLUS the result is not shifted right
 * by 1, which makes this macro identical to COMPUTE_NEW_FULL_IDX_MINUS -
 * confirm whether a `>> 1` is intended here.
 *
 * Relies on names from the expansion site: new_mem_idx, x1..x4, X1, X1m1,
 * X2X1, X3X2X1, X2X1mX1, X3X2X1mX2X1, X4X3X2X1mX3X2X1.
 * Any other value of mydir leaves new_mem_idx unchanged.
 */
#define COMPUTE_NEW_IDX_MINUS(mydir, idx) do { \
    switch (mydir) { \
    case 0: \
      new_mem_idx = ((x1 == 0) ? (idx) + X1m1 : (idx) - 1); \
      break; \
    case 1: \
      new_mem_idx = ((x2 == 0) ? (idx) + X2X1mX1 : (idx) - X1); \
      break; \
    case 2: \
      new_mem_idx = ((x3 == 0) ? (idx) + X3X2X1mX2X1 : (idx) - X2X1); \
      break; \
    case 3: \
      new_mem_idx = ((x4 == 0) ? (idx) + X4X3X2X1mX3X2X1 : (idx) - X3X2X1); \
      break; \
    default: \
      break; \
    } \
  } while (0)
/*
 * COMPUTE_NEW_FULL_IDX_PLUS(mydir, idx): index of the neighbour of site
 * `idx` one step forward in direction mydir (0..3), with wrap-around,
 * stored in new_mem_idx. Same computation as COMPUTE_NEW_IDX_PLUS but
 * without the final right shift by 1.
 *
 * For direction d the step is +1, +X1, +X2X1, +X3X2X1; when the coordinate
 * x<d+1> equals X<d+1>m1 the wrap-around offset is subtracted instead.
 *
 * Relies on names from the expansion site: new_mem_idx, x1..x4, X1,
 * X1m1..X4m1, X2X1, X3X2X1, X2X1mX1, X3X2X1mX2X1, X4X3X2X1mX3X2X1.
 * Any other value of mydir leaves new_mem_idx unchanged.
 *
 * `idx` is parenthesized so that expression arguments are evaluated before
 * the offset is applied.
 */
#define COMPUTE_NEW_FULL_IDX_PLUS(mydir, idx) do { \
    switch (mydir) { \
    case 0: \
      new_mem_idx = ((x1 == X1m1) ? (idx) - X1m1 : (idx) + 1); \
      break; \
    case 1: \
      new_mem_idx = ((x2 == X2m1) ? (idx) - X2X1mX1 : (idx) + X1); \
      break; \
    case 2: \
      new_mem_idx = ((x3 == X3m1) ? (idx) - X3X2X1mX2X1 : (idx) + X2X1); \
      break; \
    case 3: \
      new_mem_idx = ((x4 == X4m1) ? (idx) - X4X3X2X1mX3X2X1 : (idx) + X3X2X1); \
      break; \
    default: \
      break; \
    } \
  } while (0)
/*
 * COMPUTE_NEW_FULL_IDX_MINUS(mydir, idx): index of the neighbour of site
 * `idx` one step backward in direction mydir (0..3), with wrap-around,
 * stored in new_mem_idx.
 *
 * For direction d the step is -1, -X1, -X2X1, -X3X2X1; when the coordinate
 * x<d+1> is 0 the wrap-around offset is added instead.
 *
 * The interior (non-boundary) branch is computed from `idx`. The original
 * definition used the caller-scope variable X there (X-1, X-X1, ...), which
 * ignored the macro argument and disagreed with the boundary branch and
 * with COMPUTE_NEW_FULL_IDX_PLUS; the two forms are identical whenever the
 * caller passes X as idx.
 *
 * Relies on names from the expansion site: new_mem_idx, x1..x4, X1, X1m1,
 * X2X1, X3X2X1, X2X1mX1, X3X2X1mX2X1, X4X3X2X1mX3X2X1.
 * Any other value of mydir leaves new_mem_idx unchanged.
 */
#define COMPUTE_NEW_FULL_IDX_MINUS(mydir, idx) do { \
    switch (mydir) { \
    case 0: \
      new_mem_idx = ((x1 == 0) ? (idx) + X1m1 : (idx) - 1); \
      break; \
    case 1: \
      new_mem_idx = ((x2 == 0) ? (idx) + X2X1mX1 : (idx) - X1); \
      break; \
    case 2: \
      new_mem_idx = ((x3 == 0) ? (idx) + X3X2X1mX2X1 : (idx) - X2X1); \
      break; \
    case 3: \
      new_mem_idx = ((x4 == 0) ? (idx) + X4X3X2X1mX3X2X1 : (idx) - X3X2X1); \
      break; \
    default: \
      break; \
    } \
  } while (0)
#endif // _FORCE_COMMON_H
/*
 * Doxygen cross-reference (enumerator definitions in force_common.h):
 *   XUP    force_common.h:5
 *   YUP    force_common.h:6
 *   ZUP    force_common.h:7
 *   TUP    force_common.h:8
 *   TDOWN  force_common.h:9
 *   ZDOWN  force_common.h:10
 *   YDOWN  force_common.h:11
 *   XDOWN  force_common.h:12
 *
 * Generated on Wed Feb 4 2015 17:00:11 for QUDA by Doxygen 1.8.6
 */