QUDA
v0.7.0
A library for QCD on GPUs
Main Page
Namespaces
Classes
Files
File List
File Members
•
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Pages
quda
lib
io_spinor.h
Go to the documentation of this file.
/* Load a full Wilson spinor (12 complex = 24 real components) stored as
   double2 into registers I0..I11, declared in the expansion scope.
   norm_idx is unused in double precision (no separate norm array). */
#define READ_SPINOR_DOUBLE(spinor, stride, sp_idx, norm_idx) \
  double2 I0 = spinor[sp_idx + 0*(stride)]; \
  double2 I1 = spinor[sp_idx + 1*(stride)]; \
  double2 I2 = spinor[sp_idx + 2*(stride)]; \
  double2 I3 = spinor[sp_idx + 3*(stride)]; \
  double2 I4 = spinor[sp_idx + 4*(stride)]; \
  double2 I5 = spinor[sp_idx + 5*(stride)]; \
  double2 I6 = spinor[sp_idx + 6*(stride)]; \
  double2 I7 = spinor[sp_idx + 7*(stride)]; \
  double2 I8 = spinor[sp_idx + 8*(stride)]; \
  double2 I9 = spinor[sp_idx + 9*(stride)]; \
  double2 I10 = spinor[sp_idx + 10*(stride)]; \
  double2 I11 = spinor[sp_idx + 11*(stride)];

/* Load only the upper-half spin components (I0..I5). */
#define READ_SPINOR_DOUBLE_UP(spinor, stride, sp_idx, norm_idx) \
  double2 I0 = spinor[sp_idx + 0*(stride)]; \
  double2 I1 = spinor[sp_idx + 1*(stride)]; \
  double2 I2 = spinor[sp_idx + 2*(stride)]; \
  double2 I3 = spinor[sp_idx + 3*(stride)]; \
  double2 I4 = spinor[sp_idx + 4*(stride)]; \
  double2 I5 = spinor[sp_idx + 5*(stride)];

/* Load only the lower-half spin components (I6..I11). */
#define READ_SPINOR_DOUBLE_DOWN(spinor, stride, sp_idx, norm_idx) \
  double2 I6 = spinor[sp_idx + 6*(stride)]; \
  double2 I7 = spinor[sp_idx + 7*(stride)]; \
  double2 I8 = spinor[sp_idx + 8*(stride)]; \
  double2 I9 = spinor[sp_idx + 9*(stride)]; \
  double2 I10 = spinor[sp_idx + 10*(stride)]; \
  double2 I11 = spinor[sp_idx + 11*(stride)];
/* Load a full single-precision Wilson spinor (6 x float4 = 24 reals) into
   I0..I5.  norm_idx is unused in single precision. */
#define READ_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = spinor[sp_idx + 0*(stride)]; \
  float4 I1 = spinor[sp_idx + 1*(stride)]; \
  float4 I2 = spinor[sp_idx + 2*(stride)]; \
  float4 I3 = spinor[sp_idx + 3*(stride)]; \
  float4 I4 = spinor[sp_idx + 4*(stride)]; \
  float4 I5 = spinor[sp_idx + 5*(stride)];

/* Upper half only (I0..I2).  Fixed: removed the dangling trailing
   backslash after the last line, which spliced the following (blank)
   line into the macro and would silently swallow the next definition
   if the blank line were ever removed. */
#define READ_SPINOR_SINGLE_UP(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = spinor[sp_idx + 0*(stride)]; \
  float4 I1 = spinor[sp_idx + 1*(stride)]; \
  float4 I2 = spinor[sp_idx + 2*(stride)];

/* Lower half only (I3..I5). */
#define READ_SPINOR_SINGLE_DOWN(spinor, stride, sp_idx, norm_idx) \
  float4 I3 = spinor[sp_idx + 3*(stride)]; \
  float4 I4 = spinor[sp_idx + 4*(stride)]; \
  float4 I5 = spinor[sp_idx + 5*(stride)];
/* Load a half-precision spinor: six short4 loads converted to float4, then
   rescaled by the per-site norm C taken from the companion array
   (spinor ## Norm)[norm_idx]. */
#define READ_SPINOR_HALF_(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]); \
  float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]); \
  float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]); \
  float4 I3 = short42float4(spinor[sp_idx + 3*(stride)]); \
  float4 I4 = short42float4(spinor[sp_idx + 4*(stride)]); \
  float4 I5 = short42float4(spinor[sp_idx + 5*(stride)]); \
  float C = (spinor ## Norm)[norm_idx]; \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

/* Extra expansion level so the spinor argument is macro-expanded before
   the ## Norm token pasting in the worker macro. */
#define READ_SPINOR_HALF(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_(spinor, stride, sp_idx, norm_idx)

/* Upper half (I0..I2) with norm rescale.  Fixed: removed the dangling
   trailing backslash after the final statement. */
#define READ_SPINOR_HALF_UP_(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]); \
  float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]); \
  float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]); \
  float C = (spinor ## Norm)[norm_idx]; \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;

#define READ_SPINOR_HALF_UP(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_UP_(spinor, stride, sp_idx, norm_idx)

/* Lower half (I3..I5) with norm rescale.  Fixed: stride is now
   parenthesized like every sibling macro, so passing a compound
   expression (e.g. a+b) expands correctly. */
#define READ_SPINOR_HALF_DOWN_(spinor, stride, sp_idx, norm_idx) \
  float4 I3 = short42float4(spinor[sp_idx + 3*(stride)]); \
  float4 I4 = short42float4(spinor[sp_idx + 4*(stride)]); \
  float4 I5 = short42float4(spinor[sp_idx + 5*(stride)]); \
  float C = (spinor ## Norm)[norm_idx]; \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

#define READ_SPINOR_HALF_DOWN(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_DOWN_(spinor, stride, sp_idx, norm_idx)
/* Load a full double-precision accumulator spinor at site index `sid`
   (a free identifier expected in scope at the expansion site) into
   accum0..accum11.  Fixed: stride is now parenthesized, consistent with
   READ_ACCUM_SINGLE, so expression arguments expand correctly. */
#define READ_ACCUM_DOUBLE(spinor, stride) \
  double2 accum0 = spinor[sid + 0*(stride)]; \
  double2 accum1 = spinor[sid + 1*(stride)]; \
  double2 accum2 = spinor[sid + 2*(stride)]; \
  double2 accum3 = spinor[sid + 3*(stride)]; \
  double2 accum4 = spinor[sid + 4*(stride)]; \
  double2 accum5 = spinor[sid + 5*(stride)]; \
  double2 accum6 = spinor[sid + 6*(stride)]; \
  double2 accum7 = spinor[sid + 7*(stride)]; \
  double2 accum8 = spinor[sid + 8*(stride)]; \
  double2 accum9 = spinor[sid + 9*(stride)]; \
  double2 accum10 = spinor[sid + 10*(stride)]; \
  double2 accum11 = spinor[sid + 11*(stride)];

/* Single-precision accumulator load (6 x float4). */
#define READ_ACCUM_SINGLE(spinor, stride) \
  float4 accum0 = spinor[sid + 0*(stride)]; \
  float4 accum1 = spinor[sid + 1*(stride)]; \
  float4 accum2 = spinor[sid + 2*(stride)]; \
  float4 accum3 = spinor[sid + 3*(stride)]; \
  float4 accum4 = spinor[sid + 4*(stride)]; \
  float4 accum5 = spinor[sid + 5*(stride)];

/* Half-precision accumulator load: short4 -> float4 plus rescale by the
   per-site norm (spinor ## Norm)[sid].  Fixed: stride parenthesized. */
#define READ_ACCUM_HALF_(spinor, stride) \
  float4 accum0 = short42float4(spinor[sid + 0*(stride)]); \
  float4 accum1 = short42float4(spinor[sid + 1*(stride)]); \
  float4 accum2 = short42float4(spinor[sid + 2*(stride)]); \
  float4 accum3 = short42float4(spinor[sid + 3*(stride)]); \
  float4 accum4 = short42float4(spinor[sid + 4*(stride)]); \
  float4 accum5 = short42float4(spinor[sid + 5*(stride)]); \
  float C = (spinor ## Norm)[sid]; \
  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;

/* Extra expansion level for the ## Norm pasting. */
#define READ_ACCUM_HALF(spinor, stride) READ_ACCUM_HALF_(spinor, stride)
/* Texture-path double-precision loads.  fetch_double2 (declared elsewhere)
   reads a double2 through the texture unit; otherwise these mirror the
   plain-pointer READ_SPINOR_DOUBLE* macros above. */
#define READ_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \
  double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
  double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
  double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
  double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
  double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
  double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride)); \
  double2 I6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
  double2 I7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
  double2 I8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
  double2 I9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
  double2 I10 = fetch_double2((spinor), sp_idx + 10*(stride)); \
  double2 I11 = fetch_double2((spinor), sp_idx + 11*(stride));

/* Upper half (I0..I5), texture path. */
#define READ_SPINOR_DOUBLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
  double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
  double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
  double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
  double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
  double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
  double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride));

/* Lower half (I6..I11), texture path. */
#define READ_SPINOR_DOUBLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
  double2 I6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
  double2 I7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
  double2 I8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
  double2 I9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
  double2 I10 = fetch_double2((spinor), sp_idx + 10*(stride)); \
  double2 I11 = fetch_double2((spinor), sp_idx + 11*(stride));

/* Accumulator load at free index `sid`, texture path. */
#define READ_ACCUM_DOUBLE_TEX(spinor, stride) \
  double2 accum0 = fetch_double2((spinor), sid + 0*(stride)); \
  double2 accum1 = fetch_double2((spinor), sid + 1*(stride)); \
  double2 accum2 = fetch_double2((spinor), sid + 2*(stride)); \
  double2 accum3 = fetch_double2((spinor), sid + 3*(stride)); \
  double2 accum4 = fetch_double2((spinor), sid + 4*(stride)); \
  double2 accum5 = fetch_double2((spinor), sid + 5*(stride)); \
  double2 accum6 = fetch_double2((spinor), sid + 6*(stride)); \
  double2 accum7 = fetch_double2((spinor), sid + 7*(stride)); \
  double2 accum8 = fetch_double2((spinor), sid + 8*(stride)); \
  double2 accum9 = fetch_double2((spinor), sid + 9*(stride)); \
  double2 accum10 = fetch_double2((spinor), sid + 10*(stride)); \
  double2 accum11 = fetch_double2((spinor), sid + 11*(stride));
/* Single-precision texture loads via TEX1DFETCH (declared elsewhere). */
#define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));

/* Upper half (I0..I2).  Fixed: removed the dangling trailing backslash
   that spliced the following blank line into the macro. */
#define READ_SPINOR_SINGLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride));

/* Lower half (I3..I5). */
#define READ_SPINOR_SINGLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
/* Half-precision texture loads: the texture unit returns normalized
   float4 values which are rescaled by the per-site norm fetched from
   the (spinor ## Norm) texture. */
#define READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

/* Extra expansion level for ## Norm pasting.  Fixed: removed the
   dangling trailing backslash after the expansion line. */
#define READ_SPINOR_HALF_TEX(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx)

/* Upper half (I0..I2) with norm rescale.  Fixed: removed the dangling
   trailing backslash after the final statement. */
#define READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;

#define READ_SPINOR_HALF_UP_TEX(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx)

/* Lower half (I3..I5) with norm rescale. */
#define READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx) \
  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

#define READ_SPINOR_HALF_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx)
/* Single-precision accumulator load at free index `sid`, texture path. */
#define READ_ACCUM_SINGLE_TEX(spinor, stride) \
  float4 accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
  float4 accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
  float4 accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
  float4 accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
  float4 accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
  float4 accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride));

/* Half-precision accumulator load, texture path, rescaled by the
   per-site norm fetched at index sid. */
#define READ_ACCUM_HALF_TEX_(spinor, stride) \
  float4 accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
  float4 accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
  float4 accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
  float4 accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
  float4 accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
  float4 accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), sid); \
  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;

/* Extra expansion level for ## Norm pasting. */
#define READ_ACCUM_HALF_TEX(spinor, stride) READ_ACCUM_HALF_TEX_(spinor, stride)
/* Store the 24 output registers o00_re..o32_im (free identifiers expected
   at the expansion site) as 12 double2 values at site index `sid`. */
#define WRITE_SPINOR_DOUBLE2(stride) \
  out[0*(stride)+sid] = make_double2(o00_re, o00_im); \
  out[1*(stride)+sid] = make_double2(o01_re, o01_im); \
  out[2*(stride)+sid] = make_double2(o02_re, o02_im); \
  out[3*(stride)+sid] = make_double2(o10_re, o10_im); \
  out[4*(stride)+sid] = make_double2(o11_re, o11_im); \
  out[5*(stride)+sid] = make_double2(o12_re, o12_im); \
  out[6*(stride)+sid] = make_double2(o20_re, o20_im); \
  out[7*(stride)+sid] = make_double2(o21_re, o21_im); \
  out[8*(stride)+sid] = make_double2(o22_re, o22_im); \
  out[9*(stride)+sid] = make_double2(o30_re, o30_im); \
  out[10*(stride)+sid] = make_double2(o31_re, o31_im); \
  out[11*(stride)+sid] = make_double2(o32_re, o32_im);
/* Store the 24 output registers as 6 float4 values (two complex numbers
   per float4) at site index `sid`. */
#define WRITE_SPINOR_FLOAT4(stride) \
  out[0*(stride)+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
  out[1*(stride)+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
  out[2*(stride)+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \
  out[3*(stride)+sid] = make_float4(o20_re, o20_im, o21_re, o21_im); \
  out[4*(stride)+sid] = make_float4(o22_re, o22_im, o30_re, o30_im); \
  out[5*(stride)+sid] = make_float4(o31_re, o31_im, o32_re, o32_im);
/* Quantize the 24 output components to 16-bit and store as 6 short4.
   The per-site maximum |component| is written to outNorm[sid] and its
   reciprocal (scaled by MAX_SHORT) is used as the quantization scale.
   NOTE(review): c1/c2 pair o01_re with o02_im (and o02_re with o01_im);
   the overall maximum is unaffected by this grouping, but confirm it is
   intentional before normalizing it. */
#define WRITE_SPINOR_SHORT4(stride) \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im)); \
  float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \
  float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \
  float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \
  float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \
  float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \
  float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \
  float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \
  float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \
  float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  outNorm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \
  o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \
  o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
  o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
  o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
  out[sid+0*(stride)] = make_short4((short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
  out[sid+1*(stride)] = make_short4((short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
  out[sid+2*(stride)] = make_short4((short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
  out[sid+3*(stride)] = make_short4((short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
  out[sid+4*(stride)] = make_short4((short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
  out[sid+5*(stride)] = make_short4((short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
#if (__COMPUTE_CAPABILITY__ >= 200)

/* Streaming (cache-bypassing) double-precision store of the 24 output
   components; requires sm_20+.  Fixed: stride is now parenthesized, as
   in the non-streaming WRITE_SPINOR_DOUBLE2 variant. */
#define WRITE_SPINOR_DOUBLE2_STR(stride) \
  store_streaming_double2(&out[0*(stride)+sid], o00_re, o00_im); \
  store_streaming_double2(&out[1*(stride)+sid], o01_re, o01_im); \
  store_streaming_double2(&out[2*(stride)+sid], o02_re, o02_im); \
  store_streaming_double2(&out[3*(stride)+sid], o10_re, o10_im); \
  store_streaming_double2(&out[4*(stride)+sid], o11_re, o11_im); \
  store_streaming_double2(&out[5*(stride)+sid], o12_re, o12_im); \
  store_streaming_double2(&out[6*(stride)+sid], o20_re, o20_im); \
  store_streaming_double2(&out[7*(stride)+sid], o21_re, o21_im); \
  store_streaming_double2(&out[8*(stride)+sid], o22_re, o22_im); \
  store_streaming_double2(&out[9*(stride)+sid], o30_re, o30_im); \
  store_streaming_double2(&out[10*(stride)+sid], o31_re, o31_im); \
  store_streaming_double2(&out[11*(stride)+sid], o32_re, o32_im);
/* Streaming single-precision store of the 24 output components. */
#define WRITE_SPINOR_FLOAT4_STR(stride) \
  store_streaming_float4(&out[0*(stride)+sid], o00_re, o00_im, o01_re, o01_im); \
  store_streaming_float4(&out[1*(stride)+sid], o02_re, o02_im, o10_re, o10_im); \
  store_streaming_float4(&out[2*(stride)+sid], o11_re, o11_im, o12_re, o12_im); \
  store_streaming_float4(&out[3*(stride)+sid], o20_re, o20_im, o21_re, o21_im); \
  store_streaming_float4(&out[4*(stride)+sid], o22_re, o22_im, o30_re, o30_im); \
  store_streaming_float4(&out[5*(stride)+sid], o31_re, o31_im, o32_re, o32_im);
/* Streaming variant of WRITE_SPINOR_SHORT4: identical quantization, but
   the short4 stores bypass the cache.  NOTE(review): same c1/c2 im-part
   pairing as the non-streaming variant; max is unaffected. */
#define WRITE_SPINOR_SHORT4_STR(stride) \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im)); \
  float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \
  float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \
  float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \
  float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \
  float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \
  float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \
  float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \
  float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \
  float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  outNorm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \
  o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \
  o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
  o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
  o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
  store_streaming_short4(&out[0*(stride)+sid], (short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
  store_streaming_short4(&out[1*(stride)+sid], (short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
  store_streaming_short4(&out[2*(stride)+sid], (short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
  store_streaming_short4(&out[3*(stride)+sid], (short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
  store_streaming_short4(&out[4*(stride)+sid], (short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
  store_streaming_short4(&out[5*(stride)+sid], (short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
#else
/* Streaming stores require sm_20+; fall back to the cached variants. */
#define WRITE_SPINOR_DOUBLE2_STR(stride) WRITE_SPINOR_DOUBLE2(stride)
#define WRITE_SPINOR_FLOAT4_STR(stride) WRITE_SPINOR_FLOAT4(stride)
#define WRITE_SPINOR_SHORT4_STR(stride) WRITE_SPINOR_SHORT4(stride)
#endif
// macros used for exterior Wilson Dslash kernels and face packing

/* Alias used by the face-packing code.  READ_SPINOR_UP is presumably
   defined by the including translation unit to the precision-specific
   *_UP reader above — not defined in this header. */
#define READ_HALF_SPINOR READ_SPINOR_UP
391
392
#define WRITE_HALF_SPINOR_DOUBLE2(stride, sid) \
393
out[0*(stride)+sid] = make_double2(a0_re, a0_im); \
394
out[1*(stride)+sid] = make_double2(a1_re, a1_im); \
395
out[2*(stride)+sid] = make_double2(a2_re, a2_im); \
396
out[3*(stride)+sid] = make_double2(b0_re, b0_im); \
397
out[4*(stride)+sid] = make_double2(b1_re, b1_im); \
398
out[5*(stride)+sid] = make_double2(b2_re, b2_im);
399
400
#define WRITE_HALF_SPINOR_FLOAT4(stride, sid) \
401
out[0*(stride)+sid] = make_float4(a0_re, a0_im, a1_re, a1_im); \
402
out[1*(stride)+sid] = make_float4(a2_re, a2_im, b0_re, b0_im); \
403
out[2*(stride)+sid] = make_float4(b1_re, b1_im, b2_re, b2_im);
404
405
#define WRITE_HALF_SPINOR_SHORT4(stride, sid) \
406
float c0 = fmaxf(fabsf(a0_re), fabsf(a0_im)); \
407
float c1 = fmaxf(fabsf(a1_re), fabsf(a1_im)); \
408
float c2 = fmaxf(fabsf(a2_re), fabsf(a2_im)); \
409
float c3 = fmaxf(fabsf(b0_re), fabsf(b0_im)); \
410
float c4 = fmaxf(fabsf(b1_re), fabsf(b1_im)); \
411
float c5 = fmaxf(fabsf(b2_re), fabsf(b2_im)); \
412
c0 = fmaxf(c0, c1); \
413
c1 = fmaxf(c2, c3); \
414
c2 = fmaxf(c4, c5); \
415
c0 = fmaxf(c0, c1); \
416
c0 = fmaxf(c0, c2); \
417
outNorm[sid] = c0; \
418
float scale = __fdividef(MAX_SHORT, c0); \
419
a0_re *= scale; a0_im *= scale; a1_re *= scale; a1_im *= scale; \
420
a2_re *= scale; a2_im *= scale; b0_re *= scale; b0_im *= scale; \
421
b1_re *= scale; b1_im *= scale; b2_re *= scale; b2_im *= scale; \
422
out[sid+0*(stride)] = make_short4((short)a0_re, (short)a0_im, (short)a1_re, (short)a1_im); \
423
out[sid+1*(stride)] = make_short4((short)a2_re, (short)a2_im, (short)b0_re, (short)b0_im); \
424
out[sid+2*(stride)] = make_short4((short)b1_re, (short)b1_im, (short)b2_re, (short)b2_im);
425
427
/******************used by non-degenerate twisted mass**********************/
428
#define WRITE_FLAVOR_SPINOR_DOUBLE2() \
429
out[0*(param.sp_stride)+sid] = make_double2(o1_00_re, o1_00_im); \
430
out[1*(param.sp_stride)+sid] = make_double2(o1_01_re, o1_01_im); \
431
out[2*(param.sp_stride)+sid] = make_double2(o1_02_re, o1_02_im); \
432
out[3*(param.sp_stride)+sid] = make_double2(o1_10_re, o1_10_im); \
433
out[4*(param.sp_stride)+sid] = make_double2(o1_11_re, o1_11_im); \
434
out[5*(param.sp_stride)+sid] = make_double2(o1_12_re, o1_12_im); \
435
out[6*(param.sp_stride)+sid] = make_double2(o1_20_re, o1_20_im); \
436
out[7*(param.sp_stride)+sid] = make_double2(o1_21_re, o1_21_im); \
437
out[8*(param.sp_stride)+sid] = make_double2(o1_22_re, o1_22_im); \
438
out[9*(param.sp_stride)+sid] = make_double2(o1_30_re, o1_30_im); \
439
out[10*(param.sp_stride)+sid] = make_double2(o1_31_re, o1_31_im); \
440
out[11*(param.sp_stride)+sid] = make_double2(o1_32_re, o1_32_im); \
441
out[0*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_00_re, o2_00_im); \
442
out[1*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_01_re, o2_01_im); \
443
out[2*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_02_re, o2_02_im); \
444
out[3*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_10_re, o2_10_im); \
445
out[4*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_11_re, o2_11_im); \
446
out[5*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_12_re, o2_12_im); \
447
out[6*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_20_re, o2_20_im); \
448
out[7*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_21_re, o2_21_im); \
449
out[8*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_22_re, o2_22_im); \
450
out[9*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_30_re, o2_30_im); \
451
out[10*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_31_re, o2_31_im); \
452
out[11*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_32_re, o2_32_im);
/* Two-flavor spinor store in single precision: flavor 1 at sid, flavor 2
   offset by param.fl_stride; two complex components per float4. */
#define WRITE_FLAVOR_SPINOR_FLOAT4() \
  out[0*(param.sp_stride)+sid] = make_float4(o1_00_re, o1_00_im, o1_01_re, o1_01_im); \
  out[1*(param.sp_stride)+sid] = make_float4(o1_02_re, o1_02_im, o1_10_re, o1_10_im); \
  out[2*(param.sp_stride)+sid] = make_float4(o1_11_re, o1_11_im, o1_12_re, o1_12_im); \
  out[3*(param.sp_stride)+sid] = make_float4(o1_20_re, o1_20_im, o1_21_re, o1_21_im); \
  out[4*(param.sp_stride)+sid] = make_float4(o1_22_re, o1_22_im, o1_30_re, o1_30_im); \
  out[5*(param.sp_stride)+sid] = make_float4(o1_31_re, o1_31_im, o1_32_re, o1_32_im); \
  out[0*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_00_re, o2_00_im, o2_01_re, o2_01_im); \
  out[1*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_02_re, o2_02_im, o2_10_re, o2_10_im); \
  out[2*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_11_re, o2_11_im, o2_12_re, o2_12_im); \
  out[3*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_20_re, o2_20_im, o2_21_re, o2_21_im); \
  out[4*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_22_re, o2_22_im, o2_30_re, o2_30_im); \
  out[5*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_31_re, o2_31_im, o2_32_re, o2_32_im);
/* Two-flavor 16-bit quantized store.  Each flavor gets its own norm:
   flavor 1 -> outNorm[sid], flavor 2 -> outNorm[sid+param.fl_stride],
   with components rescaled by MAX_SHORT/norm before conversion.
   NOTE(review): same c1/c2 im-part pairing as WRITE_SPINOR_SHORT4; the
   per-flavor maximum is unaffected. */
#define WRITE_FLAVOR_SPINOR_SHORT4() \
  float c0 = fmaxf(fabsf(o1_00_re), fabsf(o1_00_im)); \
  float c1 = fmaxf(fabsf(o1_01_re), fabsf(o1_02_im)); \
  float c2 = fmaxf(fabsf(o1_02_re), fabsf(o1_01_im)); \
  float c3 = fmaxf(fabsf(o1_10_re), fabsf(o1_10_im)); \
  float c4 = fmaxf(fabsf(o1_11_re), fabsf(o1_11_im)); \
  float c5 = fmaxf(fabsf(o1_12_re), fabsf(o1_12_im)); \
  float c6 = fmaxf(fabsf(o1_20_re), fabsf(o1_20_im)); \
  float c7 = fmaxf(fabsf(o1_21_re), fabsf(o1_21_im)); \
  float c8 = fmaxf(fabsf(o1_22_re), fabsf(o1_22_im)); \
  float c9 = fmaxf(fabsf(o1_30_re), fabsf(o1_30_im)); \
  float c10 = fmaxf(fabsf(o1_31_re), fabsf(o1_31_im)); \
  float c11 = fmaxf(fabsf(o1_32_re), fabsf(o1_32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  outNorm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o1_00_re *= scale; o1_00_im *= scale; o1_01_re *= scale; o1_01_im *= scale; \
  o1_02_re *= scale; o1_02_im *= scale; o1_10_re *= scale; o1_10_im *= scale; \
  o1_11_re *= scale; o1_11_im *= scale; o1_12_re *= scale; o1_12_im *= scale; \
  o1_20_re *= scale; o1_20_im *= scale; o1_21_re *= scale; o1_21_im *= scale; \
  o1_22_re *= scale; o1_22_im *= scale; o1_30_re *= scale; o1_30_im *= scale; \
  o1_31_re *= scale; o1_31_im *= scale; o1_32_re *= scale; o1_32_im *= scale; \
  out[sid+0*(param.sp_stride)] = make_short4((short)o1_00_re, (short)o1_00_im, (short)o1_01_re, (short)o1_01_im); \
  out[sid+1*(param.sp_stride)] = make_short4((short)o1_02_re, (short)o1_02_im, (short)o1_10_re, (short)o1_10_im); \
  out[sid+2*(param.sp_stride)] = make_short4((short)o1_11_re, (short)o1_11_im, (short)o1_12_re, (short)o1_12_im); \
  out[sid+3*(param.sp_stride)] = make_short4((short)o1_20_re, (short)o1_20_im, (short)o1_21_re, (short)o1_21_im); \
  out[sid+4*(param.sp_stride)] = make_short4((short)o1_22_re, (short)o1_22_im, (short)o1_30_re, (short)o1_30_im); \
  out[sid+5*(param.sp_stride)] = make_short4((short)o1_31_re, (short)o1_31_im, (short)o1_32_re, (short)o1_32_im); \
  c0 = fmaxf(fabsf(o2_00_re), fabsf(o2_00_im)); \
  c1 = fmaxf(fabsf(o2_01_re), fabsf(o2_02_im)); \
  c2 = fmaxf(fabsf(o2_02_re), fabsf(o2_01_im)); \
  c3 = fmaxf(fabsf(o2_10_re), fabsf(o2_10_im)); \
  c4 = fmaxf(fabsf(o2_11_re), fabsf(o2_11_im)); \
  c5 = fmaxf(fabsf(o2_12_re), fabsf(o2_12_im)); \
  c6 = fmaxf(fabsf(o2_20_re), fabsf(o2_20_im)); \
  c7 = fmaxf(fabsf(o2_21_re), fabsf(o2_21_im)); \
  c8 = fmaxf(fabsf(o2_22_re), fabsf(o2_22_im)); \
  c9 = fmaxf(fabsf(o2_30_re), fabsf(o2_30_im)); \
  c10 = fmaxf(fabsf(o2_31_re), fabsf(o2_31_im)); \
  c11 = fmaxf(fabsf(o2_32_re), fabsf(o2_32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  outNorm[sid+param.fl_stride] = c0; \
  scale = __fdividef(MAX_SHORT, c0); \
  o2_00_re *= scale; o2_00_im *= scale; o2_01_re *= scale; o2_01_im *= scale; \
  o2_02_re *= scale; o2_02_im *= scale; o2_10_re *= scale; o2_10_im *= scale; \
  o2_11_re *= scale; o2_11_im *= scale; o2_12_re *= scale; o2_12_im *= scale; \
  o2_20_re *= scale; o2_20_im *= scale; o2_21_re *= scale; o2_21_im *= scale; \
  o2_22_re *= scale; o2_22_im *= scale; o2_30_re *= scale; o2_30_im *= scale; \
  o2_31_re *= scale; o2_31_im *= scale; o2_32_re *= scale; o2_32_im *= scale; \
  out[sid+param.fl_stride+0*(param.sp_stride)] = make_short4((short)o2_00_re, (short)o2_00_im, (short)o2_01_re, (short)o2_01_im); \
  out[sid+param.fl_stride+1*(param.sp_stride)] = make_short4((short)o2_02_re, (short)o2_02_im, (short)o2_10_re, (short)o2_10_im); \
  out[sid+param.fl_stride+2*(param.sp_stride)] = make_short4((short)o2_11_re, (short)o2_11_im, (short)o2_12_re, (short)o2_12_im); \
  out[sid+param.fl_stride+3*(param.sp_stride)] = make_short4((short)o2_20_re, (short)o2_20_im, (short)o2_21_re, (short)o2_21_im); \
  out[sid+param.fl_stride+4*(param.sp_stride)] = make_short4((short)o2_22_re, (short)o2_22_im, (short)o2_30_re, (short)o2_30_im); \
  out[sid+param.fl_stride+5*(param.sp_stride)] = make_short4((short)o2_31_re, (short)o2_31_im, (short)o2_32_re, (short)o2_32_im);
/************* the following is used by staggered *****************/

/* Staggered first-neighbor color-vector load (3 complex), texture path.
   Fixed: mystride is now parenthesized so expression arguments (e.g.
   a+b) expand correctly; idx likewise. */
#define READ_1ST_NBR_SPINOR_DOUBLE_TEX(spinor, idx, mystride) \
  double2 I0 = fetch_double2((spinor), (idx) + 0*(mystride)); \
  double2 I1 = fetch_double2((spinor), (idx) + 1*(mystride)); \
  double2 I2 = fetch_double2((spinor), (idx) + 2*(mystride));

/* Third-neighbor (Naik) load into caller-named registers T0..T2. */
#define READ_KS_NBR_SPINOR_DOUBLE_TEX(T, spinor, idx, mystride) \
  T##0 = fetch_double2((spinor), (idx) + 0*(mystride)); \
  T##1 = fetch_double2((spinor), (idx) + 1*(mystride)); \
  T##2 = fetch_double2((spinor), (idx) + 2*(mystride));

/* Single-precision variants of the above. */
#define READ_1ST_NBR_SPINOR_SINGLE_TEX(spinor, idx, mystride) \
  float2 I0 = TEX1DFETCH(float2, (spinor), (idx) + 0*(mystride)); \
  float2 I1 = TEX1DFETCH(float2, (spinor), (idx) + 1*(mystride)); \
  float2 I2 = TEX1DFETCH(float2, (spinor), (idx) + 2*(mystride));

#define READ_KS_NBR_SPINOR_SINGLE_TEX(T, spinor, idx, mystride) \
  T##0 = TEX1DFETCH(float2, (spinor), (idx) + 0*(mystride)); \
  T##1 = TEX1DFETCH(float2, (spinor), (idx) + 1*(mystride)); \
  T##2 = TEX1DFETCH(float2, (spinor), (idx) + 2*(mystride));
/* Half-precision staggered first-neighbor load with norm rescale.
   NOTE(review): reads the free identifier norm_idx1 (and norm_idx3 in
   the KS variant) from the expansion site rather than taking it as a
   parameter — confirm the call sites define them.
   Fixed: idx/mystride now parenthesized. */
#define READ_1ST_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride) \
  float2 I0 = TEX1DFETCH(float2, (spinor), (idx) + 0*(mystride)); \
  float2 I1 = TEX1DFETCH(float2, (spinor), (idx) + 1*(mystride)); \
  float2 I2 = TEX1DFETCH(float2, (spinor), (idx) + 2*(mystride)); \
  { \
    float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx1); \
    I0.x *= C; I0.y *= C; \
    I1.x *= C; I1.y *= C; \
    I2.x *= C; I2.y *= C;}

/* Extra expansion level for ## Norm pasting. */
#define READ_1ST_NBR_SPINOR_HALF_TEX(spinor, idx, mystride) \
  READ_1ST_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride)

/* Third-neighbor (Naik) half-precision load into T0..T2, rescaled by the
   norm at free index norm_idx3. */
#define READ_KS_NBR_SPINOR_HALF_TEX_(T, spinor, idx, mystride) \
  T##0 = TEX1DFETCH(float2, (spinor), (idx) + 0*(mystride)); \
  T##1 = TEX1DFETCH(float2, (spinor), (idx) + 1*(mystride)); \
  T##2 = TEX1DFETCH(float2, (spinor), (idx) + 2*(mystride)); \
  { \
    float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx3); \
    (T##0).x *= C; (T##0).y *= C; \
    (T##1).x *= C; (T##1).y *= C; \
    (T##2).x *= C; (T##2).y *= C;}

#define READ_KS_NBR_SPINOR_HALF_TEX(T, spinor, idx, mystride) \
  READ_KS_NBR_SPINOR_HALF_TEX_(T, spinor, idx, mystride)
597
598
// Direct (non-texture) nearest-neighbor load into new I0..I2 (double).
#define READ_1ST_NBR_SPINOR_DOUBLE(spinor, idx, mystride) \
  double2 I0 = spinor[idx + 0*mystride]; \
  double2 I1 = spinor[idx + 1*mystride]; \
  double2 I2 = spinor[idx + 2*mystride];

// Direct load into caller-declared T0..T2 (double).
#define READ_KS_NBR_SPINOR_DOUBLE(T, spinor, idx, mystride) \
  T##0 = spinor[idx + 0*mystride]; \
  T##1 = spinor[idx + 1*mystride]; \
  T##2 = spinor[idx + 2*mystride];

// Direct nearest-neighbor load into new I0..I2 (single).
#define READ_1ST_NBR_SPINOR_SINGLE(spinor, idx, mystride) \
  float2 I0 = spinor[idx + 0*mystride]; \
  float2 I1 = spinor[idx + 1*mystride]; \
  float2 I2 = spinor[idx + 2*mystride];

// Direct load into caller-declared T0..T2 (single).
#define READ_KS_NBR_SPINOR_SINGLE(T, spinor, idx, mystride) \
  T##0 = spinor[idx + 0*mystride]; \
  T##1 = spinor[idx + 1*mystride]; \
  T##2 = spinor[idx + 2*mystride];
617
618
// Half-precision direct load: read short2 components, convert to float and
// rescale by the per-site norm, producing new I0..I2.
// NOTE(review): the `spinor` parameter is ignored; the body hard-codes the
// global arrays `in` and `inNorm` -- presumably all callers pass `in`, but
// verify before relying on the parameter.
#define READ_1ST_NBR_SPINOR_HALF(spinor, idx, mystride) \
  float2 I0, I1, I2; \
  { \
    short2 S0 = in[idx + 0*mystride]; \
    short2 S1 = in[idx + 1*mystride]; \
    short2 S2 = in[idx + 2*mystride]; \
    float C = inNorm[idx]; \
    I0.x =C*short2float(S0.x); I0.y =C*short2float(S0.y); \
    I1.x =C*short2float(S1.x); I1.y =C*short2float(S1.y); \
    I2.x =C*short2float(S2.x); I2.y =C*short2float(S2.y); \
  }

// Half-precision direct load into caller-declared T0..T2.
// NOTE(review): same hard-coded `in`/`inNorm` caveat as above.
#define READ_KS_NBR_SPINOR_HALF(T, spinor, idx, mystride) \
  { \
    short2 S0 = in[idx + 0*mystride]; \
    short2 S1 = in[idx + 1*mystride]; \
    short2 S2 = in[idx + 2*mystride]; \
    float C = inNorm[idx]; \
    (T##0).x =C*short2float(S0.x); (T##0).y =C*short2float(S0.y); \
    (T##1).x =C*short2float(S1.x); (T##1).y =C*short2float(S1.y); \
    (T##2).x =C*short2float(S2.x); (T##2).y =C*short2float(S2.y); \
  }
640
641
642
// Store the three color components o00..o02 of a staggered spinor (double).
#define WRITE_ST_SPINOR_DOUBLE2(out, sid, mystride) \
  out[0*mystride+sid] = make_double2(o00_re, o00_im); \
  out[1*mystride+sid] = make_double2(o01_re, o01_im); \
  out[2*mystride+sid] = make_double2(o02_re, o02_im);

// Store the three color components (single precision).
#define WRITE_ST_SPINOR_FLOAT2(out, sid, mystride) \
  out[0*mystride+sid] = make_float2(o00_re, o00_im); \
  out[1*mystride+sid] = make_float2(o01_re, o01_im); \
  out[2*mystride+sid] = make_float2(o02_re, o02_im);

// Half-precision store: compute the max absolute value of the six reals,
// write it to the companion <out>Norm array, rescale to short range, and
// store the quantized components.
#define WRITE_ST_SPINOR_SHORT2(out, sid, mystride) \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  out ## Norm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; \
  out[sid+0*mystride] = make_short2((short)o00_re, (short)o00_im); \
  out[sid+1*mystride] = make_short2((short)o01_re, (short)o01_im); \
  out[sid+2*mystride] = make_short2((short)o02_re, (short)o02_im);
665
666
// Non-caching (streaming) stores to minimize cache pollution.
#if (__COMPUTE_CAPABILITY__ >= 200)

// Streaming double-precision store of o00..o02.
#define WRITE_ST_SPINOR_DOUBLE2_STR(out, sid, mystride) \
  store_streaming_double2(&out[0*mystride+sid], o00_re, o00_im); \
  store_streaming_double2(&out[1*mystride+sid], o01_re, o01_im); \
  store_streaming_double2(&out[2*mystride+sid], o02_re, o02_im);

// Streaming single-precision store of o00..o02.
#define WRITE_ST_SPINOR_FLOAT2_STR(out, sid, mystride) \
  store_streaming_float2(&out[0*mystride+sid], o00_re, o00_im); \
  store_streaming_float2(&out[1*mystride+sid], o01_re, o01_im); \
  store_streaming_float2(&out[2*mystride+sid], o02_re, o02_im);

// Streaming half-precision store: norm + quantized components.
// Fixed: the component stores previously went through a hard-coded `g_out`
// instead of the `out` parameter (the norm store already used `out`).
#define WRITE_ST_SPINOR_SHORT2_STR(out, sid, mystride) \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  out ## Norm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; \
  store_streaming_short2(&out[0*mystride+sid], (short)o00_re, (short)o00_im); \
  store_streaming_short2(&out[1*mystride+sid], (short)o01_re, (short)o01_im); \
  store_streaming_short2(&out[2*mystride+sid], (short)o02_re, (short)o02_im);
#else

// Streaming stores are unavailable pre-Fermi: fall back to regular stores.
// Fixed: the fallbacks previously took no parameters (so parameterized calls
// failed) and aliased the wrong Wilson-style FLOAT4/SHORT4 names.
#define WRITE_ST_SPINOR_DOUBLE2_STR(out, sid, mystride) WRITE_ST_SPINOR_DOUBLE2(out, sid, mystride)
#define WRITE_ST_SPINOR_FLOAT2_STR(out, sid, mystride) WRITE_ST_SPINOR_FLOAT2(out, sid, mystride)
#define WRITE_ST_SPINOR_SHORT2_STR(out, sid, mystride) WRITE_ST_SPINOR_SHORT2(out, sid, mystride)

#endif
699
700
// Fetch a staggered spinor from texture and accumulate it into the output
// registers o00..o02 (double precision).
#define READ_AND_SUM_ST_SPINOR_DOUBLE_TEX(spinor,sid) { \
    double2 tmp0 = fetch_double2((spinor), sid + 0*(param.sp_stride)); \
    double2 tmp1 = fetch_double2((spinor), sid + 1*(param.sp_stride)); \
    double2 tmp2 = fetch_double2((spinor), sid + 2*(param.sp_stride)); \
    o00_re += tmp0.x; o00_im += tmp0.y; \
    o01_re += tmp1.x; o01_im += tmp1.y; \
    o02_re += tmp2.x; o02_im += tmp2.y; }

// Single-precision texture read-and-accumulate.
#define READ_AND_SUM_ST_SPINOR_SINGLE_TEX(spinor,sid) { \
    float2 tmp0 = TEX1DFETCH(float2, (spinor), sid + 0*(param.sp_stride)); \
    float2 tmp1 = TEX1DFETCH(float2, (spinor), sid + 1*(param.sp_stride)); \
    float2 tmp2 = TEX1DFETCH(float2, (spinor), sid + 2*(param.sp_stride)); \
    o00_re += tmp0.x; o00_im += tmp0.y; \
    o01_re += tmp1.x; o01_im += tmp1.y; \
    o02_re += tmp2.x; o02_im += tmp2.y; }

// Half-precision texture read-and-accumulate: rescale by the per-site norm
// fetched from the companion <spinor>Norm texture before summing.
#define READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor,sid) { \
    float2 tmp0 = TEX1DFETCH(float2, (spinor), sid + 0*param.sp_stride); \
    float2 tmp1 = TEX1DFETCH(float2, (spinor), sid + 1*param.sp_stride); \
    float2 tmp2 = TEX1DFETCH(float2, (spinor), sid + 2*param.sp_stride); \
    float C = TEX1DFETCH(float, (spinor##Norm), sid); \
    o00_re += C*tmp0.x; o00_im += C*tmp0.y; \
    o01_re += C*tmp1.x; o01_im += C*tmp1.y; \
    o02_re += C*tmp2.x; o02_im += C*tmp2.y; }

// Expansion-level wrapper so a macro argument is expanded before `##Norm`.
#define READ_AND_SUM_ST_SPINOR_HALF_TEX(spinor,sid) \
  READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor,sid)
727
728
// Direct (non-texture) read-and-accumulate of a staggered spinor into
// o00..o02. Fixed: removed a stray trailing backslash that made the macro
// swallow the line following its definition.
#define READ_AND_SUM_ST_SPINOR(spinor,sid) \
  o00_re += spinor[0*param.sp_stride+sid].x; o00_im += spinor[0*param.sp_stride+sid].y; \
  o01_re += spinor[1*param.sp_stride+sid].x; o01_im += spinor[1*param.sp_stride+sid].y; \
  o02_re += spinor[2*param.sp_stride+sid].x; o02_im += spinor[2*param.sp_stride+sid].y;

// Half-precision direct read-and-accumulate: convert shorts to float and
// rescale by the norm from <spinor>Norm. Note: declares `C` in the enclosing
// scope (not braced), so it may only be expanded once per scope.
#define READ_AND_SUM_ST_SPINOR_HALF_(spinor,sid) \
  float C = spinor ## Norm[sid]; \
  o00_re += C*short2float(spinor[0*param.sp_stride + sid].x); \
  o00_im += C*short2float(spinor[0*param.sp_stride + sid].y); \
  o01_re += C*short2float(spinor[1*param.sp_stride + sid].x); \
  o01_im += C*short2float(spinor[1*param.sp_stride + sid].y); \
  o02_re += C*short2float(spinor[2*param.sp_stride + sid].x); \
  o02_im += C*short2float(spinor[2*param.sp_stride + sid].y);

// Expansion-level wrapper so a macro argument is expanded before `## Norm`.
#define READ_AND_SUM_ST_SPINOR_HALF(spinor,sid) \
  READ_AND_SUM_ST_SPINOR_HALF_(spinor,sid)
744
745
// Fetch the accumulator spinor (e.g. for axpy-style kernels) from texture
// into new variables accum0..accum2 (double precision).
#define READ_ST_ACCUM_DOUBLE_TEX(spinor,sid) \
  double2 accum0 = fetch_double2((spinor), sid + 0*(param.sp_stride)); \
  double2 accum1 = fetch_double2((spinor), sid + 1*(param.sp_stride)); \
  double2 accum2 = fetch_double2((spinor), sid + 2*(param.sp_stride));

// Single-precision accumulator fetch.
#define READ_ST_ACCUM_SINGLE_TEX(spinor,sid) \
  float2 accum0 = TEX1DFETCH(float2, (spinor), sid + 0*param.sp_stride); \
  float2 accum1 = TEX1DFETCH(float2, (spinor), sid + 1*param.sp_stride); \
  float2 accum2 = TEX1DFETCH(float2, (spinor), sid + 2*param.sp_stride);

// Half-precision accumulator fetch, rescaled by the per-site norm. Note:
// declares `C` in the enclosing scope, so expand at most once per scope.
#define READ_ST_ACCUM_HALF_TEX_(spinor,sid) \
  float2 accum0 = TEX1DFETCH(float2, (spinor), sid + 0*param.sp_stride); \
  float2 accum1 = TEX1DFETCH(float2, (spinor), sid + 1*param.sp_stride); \
  float2 accum2 = TEX1DFETCH(float2, (spinor), sid + 2*param.sp_stride); \
  float C = TEX1DFETCH(float, (spinor ## Norm), sid); \
  accum0.x *= C; accum0.y *= C; \
  accum1.x *= C; accum1.y *= C; \
  accum2.x *= C; accum2.y *= C;

// Expansion-level wrapper so a macro argument is expanded before `## Norm`.
#define READ_ST_ACCUM_HALF_TEX(spinor,sid) READ_ST_ACCUM_HALF_TEX_(spinor,sid)
765
766
// Direct (non-texture) accumulator load into new accum0..accum2 (double).
#define READ_ST_ACCUM_DOUBLE(spinor,sid) \
  double2 accum0 = spinor[sid + 0*(param.sp_stride)]; \
  double2 accum1 = spinor[sid + 1*(param.sp_stride)]; \
  double2 accum2 = spinor[sid + 2*(param.sp_stride)];

// Direct accumulator load (single precision).
#define READ_ST_ACCUM_SINGLE(spinor,sid) \
  float2 accum0 = spinor[sid + 0*(param.sp_stride)]; \
  float2 accum1 = spinor[sid + 1*(param.sp_stride)]; \
  float2 accum2 = spinor[sid + 2*(param.sp_stride)];

// Half-precision direct accumulator load: convert shorts and rescale by the
// norm read from <spinor>Norm.
// NOTE(review): the component reads hard-code the global array `x` while the
// norm uses the `spinor` parameter -- presumably callers always pass `x`;
// verify before relying on the parameter.
#define READ_ST_ACCUM_HALF(spinor,sid) \
  float2 accum0, accum1, accum2; \
  { \
    short2 S0 = x[sid + 0*param.sp_stride]; \
    short2 S1 = x[sid + 1*param.sp_stride]; \
    short2 S2 = x[sid + 2*param.sp_stride]; \
    float C = spinor##Norm[sid]; \
    accum0.x =C*short2float(S0.x); accum0.y =C*short2float(S0.y); \
    accum1.x =C*short2float(S1.x); accum1.y =C*short2float(S1.y); \
    accum2.x =C*short2float(S2.x); accum2.y =C*short2float(S2.y); \
  }
787
788
// Spill all 24 real components of a Wilson spinor (reg##00_re .. reg##32_im)
// into dynamic shared memory, laid out so consecutive threads within a group
// of SHARED_STRIDE access consecutive words (bank-conflict-friendly).
#define WRITE_SPINOR_SHARED_REAL(tx, ty, tz, reg) \
  extern __shared__ char s_data[]; \
  spinorFloat *sh = (spinorFloat*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
    ((tx+blockDim.x*(ty+blockDim.y*tz))/SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  sh[0*SHARED_STRIDE] = reg##00_re; \
  sh[1*SHARED_STRIDE] = reg##00_im; \
  sh[2*SHARED_STRIDE] = reg##01_re; \
  sh[3*SHARED_STRIDE] = reg##01_im; \
  sh[4*SHARED_STRIDE] = reg##02_re; \
  sh[5*SHARED_STRIDE] = reg##02_im; \
  sh[6*SHARED_STRIDE] = reg##10_re; \
  sh[7*SHARED_STRIDE] = reg##10_im; \
  sh[8*SHARED_STRIDE] = reg##11_re; \
  sh[9*SHARED_STRIDE] = reg##11_im; \
  sh[10*SHARED_STRIDE] = reg##12_re; \
  sh[11*SHARED_STRIDE] = reg##12_im; \
  sh[12*SHARED_STRIDE] = reg##20_re; \
  sh[13*SHARED_STRIDE] = reg##20_im; \
  sh[14*SHARED_STRIDE] = reg##21_re; \
  sh[15*SHARED_STRIDE] = reg##21_im; \
  sh[16*SHARED_STRIDE] = reg##22_re; \
  sh[17*SHARED_STRIDE] = reg##22_im; \
  sh[18*SHARED_STRIDE] = reg##30_re; \
  sh[19*SHARED_STRIDE] = reg##30_im; \
  sh[20*SHARED_STRIDE] = reg##31_re; \
  sh[21*SHARED_STRIDE] = reg##31_im; \
  sh[22*SHARED_STRIDE] = reg##32_re; \
  sh[23*SHARED_STRIDE] = reg##32_im;

// Double precision uses the generic real-component layout directly.
#define WRITE_SPINOR_SHARED_DOUBLE2 WRITE_SPINOR_SHARED_REAL

// Reload a spinor written by WRITE_SPINOR_SHARED_REAL, repacking the 24
// reals into the twelve double2 registers I0..I11.
#define READ_SPINOR_SHARED_DOUBLE2(tx, ty, tz) \
  extern __shared__ char s_data[]; \
  double *sh = (double*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
    ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  double2 I0 = make_double2(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE]); \
  double2 I1 = make_double2(sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \
  double2 I2 = make_double2(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE]); \
  double2 I3 = make_double2(sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \
  double2 I4 = make_double2(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE]); \
  double2 I5 = make_double2(sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \
  double2 I6 = make_double2(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE]); \
  double2 I7 = make_double2(sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \
  double2 I8 = make_double2(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE]); \
  double2 I9 = make_double2(sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \
  double2 I10 = make_double2(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE]); \
  double2 I11 = make_double2(sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]);
835
836
#ifndef SHARED_8_BYTE_WORD_SIZE // 4-byte shared memory access

// With 4-byte shared words, singles reuse the generic real-component layout.
#define WRITE_SPINOR_SHARED_FLOAT4 WRITE_SPINOR_SHARED_REAL

// Repack 24 floats from shared memory into the six float4 registers I0..I5.
#define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz) \
  extern __shared__ char s_data[]; \
  float *sh = (float*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
    ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  float4 I0 = make_float4(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE], sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \
  float4 I1 = make_float4(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE], sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \
  float4 I2 = make_float4(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE], sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \
  float4 I3 = make_float4(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE], sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \
  float4 I4 = make_float4(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE], sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \
  float4 I5 = make_float4(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE], sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]);

#else // 8-byte shared memory words

// With 8-byte shared words, store the spinor as twelve float2 pairs so each
// access moves a full shared-memory word.
#define WRITE_SPINOR_SHARED_FLOAT4(tx, ty, tz, reg) \
  extern __shared__ char s_data[]; \
  float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \
    ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  sh[0*SHARED_STRIDE] = make_float2(reg##00_re, reg##00_im); \
  sh[1*SHARED_STRIDE] = make_float2(reg##01_re, reg##01_im); \
  sh[2*SHARED_STRIDE] = make_float2(reg##02_re, reg##02_im); \
  sh[3*SHARED_STRIDE] = make_float2(reg##10_re, reg##10_im); \
  sh[4*SHARED_STRIDE] = make_float2(reg##11_re, reg##11_im); \
  sh[5*SHARED_STRIDE] = make_float2(reg##12_re, reg##12_im); \
  sh[6*SHARED_STRIDE] = make_float2(reg##20_re, reg##20_im); \
  sh[7*SHARED_STRIDE] = make_float2(reg##21_re, reg##21_im); \
  sh[8*SHARED_STRIDE] = make_float2(reg##22_re, reg##22_im); \
  sh[9*SHARED_STRIDE] = make_float2(reg##30_re, reg##30_im); \
  sh[10*SHARED_STRIDE] = make_float2(reg##31_re, reg##31_im); \
  sh[11*SHARED_STRIDE] = make_float2(reg##32_re, reg##32_im);

// Reload float2 pairs and repack them into the six float4 registers I0..I5.
#define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz) \
  extern __shared__ char s_data[]; \
  float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \
    ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  float2 tmp1, tmp2; \
  tmp1 = sh[0*SHARED_STRIDE]; tmp2 = sh[1*SHARED_STRIDE]; float4 I0 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[2*SHARED_STRIDE]; tmp2 = sh[3*SHARED_STRIDE]; float4 I1 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[4*SHARED_STRIDE]; tmp2 = sh[5*SHARED_STRIDE]; float4 I2 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[6*SHARED_STRIDE]; tmp2 = sh[7*SHARED_STRIDE]; float4 I3 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[8*SHARED_STRIDE]; tmp2 = sh[9*SHARED_STRIDE]; float4 I4 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[10*SHARED_STRIDE]; tmp2 = sh[11*SHARED_STRIDE]; float4 I5 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y);

#endif
883
884
886
// Load both flavors of a twisted-mass flavor-doublet accumulator spinor
// (double precision): 12 double2 per flavor, the second flavor offset by
// fl_stride. Uses `sid` from the expansion site.
#define READ_ACCUM_FLAVOR_DOUBLE(spinor, stride, fl_stride) \
  double2 flv1_accum0 = spinor[sid + 0*stride]; \
  double2 flv1_accum1 = spinor[sid + 1*stride]; \
  double2 flv1_accum2 = spinor[sid + 2*stride]; \
  double2 flv1_accum3 = spinor[sid + 3*stride]; \
  double2 flv1_accum4 = spinor[sid + 4*stride]; \
  double2 flv1_accum5 = spinor[sid + 5*stride]; \
  double2 flv1_accum6 = spinor[sid + 6*stride]; \
  double2 flv1_accum7 = spinor[sid + 7*stride]; \
  double2 flv1_accum8 = spinor[sid + 8*stride]; \
  double2 flv1_accum9 = spinor[sid + 9*stride]; \
  double2 flv1_accum10 = spinor[sid + 10*stride]; \
  double2 flv1_accum11 = spinor[sid + 11*stride]; \
  double2 flv2_accum0 = spinor[sid + fl_stride + 0*stride]; \
  double2 flv2_accum1 = spinor[sid + fl_stride + 1*stride]; \
  double2 flv2_accum2 = spinor[sid + fl_stride + 2*stride]; \
  double2 flv2_accum3 = spinor[sid + fl_stride + 3*stride]; \
  double2 flv2_accum4 = spinor[sid + fl_stride + 4*stride]; \
  double2 flv2_accum5 = spinor[sid + fl_stride + 5*stride]; \
  double2 flv2_accum6 = spinor[sid + fl_stride + 6*stride]; \
  double2 flv2_accum7 = spinor[sid + fl_stride + 7*stride]; \
  double2 flv2_accum8 = spinor[sid + fl_stride + 8*stride]; \
  double2 flv2_accum9 = spinor[sid + fl_stride + 9*stride]; \
  double2 flv2_accum10 = spinor[sid + fl_stride + 10*stride]; \
  double2 flv2_accum11 = spinor[sid + fl_stride + 11*stride];
911
912
913
// Load both flavors of the flavor-doublet accumulator (single precision):
// six float4 per flavor, second flavor offset by flv_stride.
#define READ_ACCUM_FLAVOR_SINGLE(spinor, stride, flv_stride) \
  float4 flv1_accum0 = spinor[sid + 0*(stride)]; \
  float4 flv1_accum1 = spinor[sid + 1*(stride)]; \
  float4 flv1_accum2 = spinor[sid + 2*(stride)]; \
  float4 flv1_accum3 = spinor[sid + 3*(stride)]; \
  float4 flv1_accum4 = spinor[sid + 4*(stride)]; \
  float4 flv1_accum5 = spinor[sid + 5*(stride)]; \
  float4 flv2_accum0 = spinor[sid + flv_stride + 0*(stride)]; \
  float4 flv2_accum1 = spinor[sid + flv_stride + 1*(stride)]; \
  float4 flv2_accum2 = spinor[sid + flv_stride + 2*(stride)]; \
  float4 flv2_accum3 = spinor[sid + flv_stride + 3*(stride)]; \
  float4 flv2_accum4 = spinor[sid + flv_stride + 4*(stride)]; \
  float4 flv2_accum5 = spinor[sid + flv_stride + 5*(stride)];
926
927
928
// Load both flavors of the flavor-doublet accumulator (half precision):
// convert shorts to float4 and rescale each flavor by its norm from
// <spinor>Norm. Fixed: the second flavor's norm index used `fl_stride`,
// which is not a parameter of this macro -- the parameter is `flv_stride`.
#define READ_ACCUM_FLAVOR_HALF_(spinor, stride, flv_stride) \
  float4 flv1_accum0 = short42float4(spinor[sid + 0*stride]); \
  float4 flv1_accum1 = short42float4(spinor[sid + 1*stride]); \
  float4 flv1_accum2 = short42float4(spinor[sid + 2*stride]); \
  float4 flv1_accum3 = short42float4(spinor[sid + 3*stride]); \
  float4 flv1_accum4 = short42float4(spinor[sid + 4*stride]); \
  float4 flv1_accum5 = short42float4(spinor[sid + 5*stride]); \
  float C = (spinor ## Norm)[sid]; \
  flv1_accum0.x *= C; flv1_accum0.y *= C; flv1_accum0.z *= C; flv1_accum0.w *= C; \
  flv1_accum1.x *= C; flv1_accum1.y *= C; flv1_accum1.z *= C; flv1_accum1.w *= C; \
  flv1_accum2.x *= C; flv1_accum2.y *= C; flv1_accum2.z *= C; flv1_accum2.w *= C; \
  flv1_accum3.x *= C; flv1_accum3.y *= C; flv1_accum3.z *= C; flv1_accum3.w *= C; \
  flv1_accum4.x *= C; flv1_accum4.y *= C; flv1_accum4.z *= C; flv1_accum4.w *= C; \
  flv1_accum5.x *= C; flv1_accum5.y *= C; flv1_accum5.z *= C; flv1_accum5.w *= C; \
  float4 flv2_accum0 = short42float4(spinor[sid + flv_stride + 0*stride]); \
  float4 flv2_accum1 = short42float4(spinor[sid + flv_stride + 1*stride]); \
  float4 flv2_accum2 = short42float4(spinor[sid + flv_stride + 2*stride]); \
  float4 flv2_accum3 = short42float4(spinor[sid + flv_stride + 3*stride]); \
  float4 flv2_accum4 = short42float4(spinor[sid + flv_stride + 4*stride]); \
  float4 flv2_accum5 = short42float4(spinor[sid + flv_stride + 5*stride]); \
  C = (spinor ## Norm)[sid + flv_stride]; \
  flv2_accum0.x *= C; flv2_accum0.y *= C; flv2_accum0.z *= C; flv2_accum0.w *= C; \
  flv2_accum1.x *= C; flv2_accum1.y *= C; flv2_accum1.z *= C; flv2_accum1.w *= C; \
  flv2_accum2.x *= C; flv2_accum2.y *= C; flv2_accum2.z *= C; flv2_accum2.w *= C; \
  flv2_accum3.x *= C; flv2_accum3.y *= C; flv2_accum3.z *= C; flv2_accum3.w *= C; \
  flv2_accum4.x *= C; flv2_accum4.y *= C; flv2_accum4.z *= C; flv2_accum4.w *= C; \
  flv2_accum5.x *= C; flv2_accum5.y *= C; flv2_accum5.z *= C; flv2_accum5.w *= C;

// Expansion-level wrapper so a macro argument is expanded before `## Norm`.
#define READ_ACCUM_FLAVOR_HALF(spinor, stride, flv_stride) READ_ACCUM_FLAVOR_HALF_(spinor, stride, flv_stride)
957
958
959
// Texture variant of READ_ACCUM_FLAVOR_DOUBLE: both flavors of the doublet
// accumulator, second flavor offset by flv_stride.
#define READ_ACCUM_FLAVOR_DOUBLE_TEX(spinor, stride, flv_stride) \
  double2 flv1_accum0 = fetch_double2((spinor), sid + 0*(stride)); \
  double2 flv1_accum1 = fetch_double2((spinor), sid + 1*(stride)); \
  double2 flv1_accum2 = fetch_double2((spinor), sid + 2*(stride)); \
  double2 flv1_accum3 = fetch_double2((spinor), sid + 3*(stride)); \
  double2 flv1_accum4 = fetch_double2((spinor), sid + 4*(stride)); \
  double2 flv1_accum5 = fetch_double2((spinor), sid + 5*(stride)); \
  double2 flv1_accum6 = fetch_double2((spinor), sid + 6*(stride)); \
  double2 flv1_accum7 = fetch_double2((spinor), sid + 7*(stride)); \
  double2 flv1_accum8 = fetch_double2((spinor), sid + 8*(stride)); \
  double2 flv1_accum9 = fetch_double2((spinor), sid + 9*(stride)); \
  double2 flv1_accum10 = fetch_double2((spinor), sid + 10*(stride)); \
  double2 flv1_accum11 = fetch_double2((spinor), sid + 11*(stride)); \
  double2 flv2_accum0 = fetch_double2((spinor), sid + flv_stride + 0*(stride)); \
  double2 flv2_accum1 = fetch_double2((spinor), sid + flv_stride + 1*(stride)); \
  double2 flv2_accum2 = fetch_double2((spinor), sid + flv_stride + 2*(stride)); \
  double2 flv2_accum3 = fetch_double2((spinor), sid + flv_stride + 3*(stride)); \
  double2 flv2_accum4 = fetch_double2((spinor), sid + flv_stride + 4*(stride)); \
  double2 flv2_accum5 = fetch_double2((spinor), sid + flv_stride + 5*(stride)); \
  double2 flv2_accum6 = fetch_double2((spinor), sid + flv_stride + 6*(stride)); \
  double2 flv2_accum7 = fetch_double2((spinor), sid + flv_stride + 7*(stride)); \
  double2 flv2_accum8 = fetch_double2((spinor), sid + flv_stride + 8*(stride)); \
  double2 flv2_accum9 = fetch_double2((spinor), sid + flv_stride + 9*(stride)); \
  double2 flv2_accum10 = fetch_double2((spinor), sid + flv_stride + 10*(stride)); \
  double2 flv2_accum11 = fetch_double2((spinor), sid + flv_stride + 11*(stride));
985
986
// Texture variant of READ_ACCUM_FLAVOR_SINGLE.
#define READ_ACCUM_FLAVOR_SINGLE_TEX(spinor, stride, flv_stride) \
  float4 flv1_accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
  float4 flv1_accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
  float4 flv1_accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
  float4 flv1_accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
  float4 flv1_accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
  float4 flv1_accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride)); \
  float4 flv2_accum0 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 0*(stride)); \
  float4 flv2_accum1 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 1*(stride)); \
  float4 flv2_accum2 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 2*(stride)); \
  float4 flv2_accum3 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 3*(stride)); \
  float4 flv2_accum4 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 4*(stride)); \
  float4 flv2_accum5 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 5*(stride));
999
1000
// Half-precision texture variant: fetch both flavors and rescale each by its
// norm from <spinor>Norm. Note: the public wrapper below is named
// READ_ACCUM_FLAVOR_HALF_TEX while this worker is ..._HALF_FLAVOR_TEX_; the
// asymmetric name is kept since callers reference the wrapper.
#define READ_ACCUM_HALF_FLAVOR_TEX_(spinor, stride, flv_stride) \
  float4 flv1_accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
  float4 flv1_accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
  float4 flv1_accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
  float4 flv1_accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
  float4 flv1_accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
  float4 flv1_accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), sid); \
  flv1_accum0.x *= C; flv1_accum0.y *= C; flv1_accum0.z *= C; flv1_accum0.w *= C; \
  flv1_accum1.x *= C; flv1_accum1.y *= C; flv1_accum1.z *= C; flv1_accum1.w *= C; \
  flv1_accum2.x *= C; flv1_accum2.y *= C; flv1_accum2.z *= C; flv1_accum2.w *= C; \
  flv1_accum3.x *= C; flv1_accum3.y *= C; flv1_accum3.z *= C; flv1_accum3.w *= C; \
  flv1_accum4.x *= C; flv1_accum4.y *= C; flv1_accum4.z *= C; flv1_accum4.w *= C; \
  flv1_accum5.x *= C; flv1_accum5.y *= C; flv1_accum5.z *= C; flv1_accum5.w *= C; \
  float4 flv2_accum0 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 0*(stride)); \
  float4 flv2_accum1 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 1*(stride)); \
  float4 flv2_accum2 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 2*(stride)); \
  float4 flv2_accum3 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 3*(stride)); \
  float4 flv2_accum4 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 4*(stride)); \
  float4 flv2_accum5 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 5*(stride)); \
  C = TEX1DFETCH(float, (spinor ## Norm), sid + flv_stride); \
  flv2_accum0.x *= C; flv2_accum0.y *= C; flv2_accum0.z *= C; flv2_accum0.w *= C; \
  flv2_accum1.x *= C; flv2_accum1.y *= C; flv2_accum1.z *= C; flv2_accum1.w *= C; \
  flv2_accum2.x *= C; flv2_accum2.y *= C; flv2_accum2.z *= C; flv2_accum2.w *= C; \
  flv2_accum3.x *= C; flv2_accum3.y *= C; flv2_accum3.z *= C; flv2_accum3.w *= C; \
  flv2_accum4.x *= C; flv2_accum4.y *= C; flv2_accum4.z *= C; flv2_accum4.w *= C; \
  flv2_accum5.x *= C; flv2_accum5.y *= C; flv2_accum5.z *= C; flv2_accum5.w *= C;

// Expansion-level wrapper so a macro argument is expanded before `## Norm`.
#define READ_ACCUM_FLAVOR_HALF_TEX(spinor, stride, flv_stride) READ_ACCUM_HALF_FLAVOR_TEX_(spinor, stride, flv_stride)
1030
1031
//single-flavor macros:

// Assign one flavor's accumulator into pre-declared accum0..accum11,
// offset by fl_stride to select the flavor (double precision).
#define ASSN_ACCUM_DOUBLE(spinor, stride, fl_stride) \
  accum0 = spinor[sid + fl_stride + 0*stride]; \
  accum1 = spinor[sid + fl_stride + 1*stride]; \
  accum2 = spinor[sid + fl_stride + 2*stride]; \
  accum3 = spinor[sid + fl_stride + 3*stride]; \
  accum4 = spinor[sid + fl_stride + 4*stride]; \
  accum5 = spinor[sid + fl_stride + 5*stride]; \
  accum6 = spinor[sid + fl_stride + 6*stride]; \
  accum7 = spinor[sid + fl_stride + 7*stride]; \
  accum8 = spinor[sid + fl_stride + 8*stride]; \
  accum9 = spinor[sid + fl_stride + 9*stride]; \
  accum10 = spinor[sid + fl_stride + 10*stride]; \
  accum11 = spinor[sid + fl_stride + 11*stride];

// Single-precision variant: six float4 registers accum0..accum5.
#define ASSN_ACCUM_SINGLE(spinor, stride, fl_stride) \
  accum0 = spinor[sid + fl_stride + 0*(stride)]; \
  accum1 = spinor[sid + fl_stride + 1*(stride)]; \
  accum2 = spinor[sid + fl_stride + 2*(stride)]; \
  accum3 = spinor[sid + fl_stride + 3*(stride)]; \
  accum4 = spinor[sid + fl_stride + 4*(stride)]; \
  accum5 = spinor[sid + fl_stride + 5*(stride)];

// Half-precision variant: convert shorts and rescale by the norm from
// <spinor>Norm; the norm scale is scoped in a block so `C` does not leak.
#define ASSN_ACCUM_HALF_(spinor, stride, fl_stride) \
  accum0 = short42float4(spinor[sid + fl_stride + 0*stride]); \
  accum1 = short42float4(spinor[sid + fl_stride + 1*stride]); \
  accum2 = short42float4(spinor[sid + fl_stride + 2*stride]); \
  accum3 = short42float4(spinor[sid + fl_stride + 3*stride]); \
  accum4 = short42float4(spinor[sid + fl_stride + 4*stride]); \
  accum5 = short42float4(spinor[sid + fl_stride + 5*stride]); \
  {\
    float C = (spinor ## Norm)[sid + fl_stride]; \
    accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
    accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
    accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
    accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
    accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
    accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C; \
  }

// Expansion-level wrapper so a macro argument is expanded before `## Norm`.
#define ASSN_ACCUM_HALF(spinor, stride, fl_stride) ASSN_ACCUM_HALF_(spinor, stride, fl_stride)
1074
1075
//single-flavor macros:

// Texture variant of ASSN_ACCUM_DOUBLE: one flavor into accum0..accum11.
#define ASSN_ACCUM_DOUBLE_TEX(spinor, stride, fl_stride) \
  accum0 = fetch_double2((spinor), sid + fl_stride + 0*(stride)); \
  accum1 = fetch_double2((spinor), sid + fl_stride + 1*(stride)); \
  accum2 = fetch_double2((spinor), sid + fl_stride + 2*(stride)); \
  accum3 = fetch_double2((spinor), sid + fl_stride + 3*(stride)); \
  accum4 = fetch_double2((spinor), sid + fl_stride + 4*(stride)); \
  accum5 = fetch_double2((spinor), sid + fl_stride + 5*(stride)); \
  accum6 = fetch_double2((spinor), sid + fl_stride + 6*(stride)); \
  accum7 = fetch_double2((spinor), sid + fl_stride + 7*(stride)); \
  accum8 = fetch_double2((spinor), sid + fl_stride + 8*(stride)); \
  accum9 = fetch_double2((spinor), sid + fl_stride + 9*(stride)); \
  accum10 = fetch_double2((spinor), sid + fl_stride + 10*(stride)); \
  accum11 = fetch_double2((spinor), sid + fl_stride + 11*(stride));

// Texture variant of ASSN_ACCUM_SINGLE.
#define ASSN_ACCUM_SINGLE_TEX(spinor, stride, fl_stride) \
  accum0 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 0*(stride)); \
  accum1 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 1*(stride)); \
  accum2 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 2*(stride)); \
  accum3 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 3*(stride)); \
  accum4 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 4*(stride)); \
  accum5 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 5*(stride));

// Texture variant of ASSN_ACCUM_HALF_: rescale by the norm fetched from the
// companion <spinor>Norm texture.
#define ASSN_ACCUM_HALF_TEX_(spinor, stride, fl_stride) \
  accum0 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 0*(stride)); \
  accum1 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 1*(stride)); \
  accum2 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 2*(stride)); \
  accum3 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 3*(stride)); \
  accum4 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 4*(stride)); \
  accum5 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 5*(stride)); \
  {\
    float C = TEX1DFETCH(float, (spinor ## Norm), sid + fl_stride); \
    accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
    accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
    accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
    accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
    accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
    accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C; \
  }

// Expansion-level wrapper so a macro argument is expanded before `## Norm`.
#define ASSN_ACCUM_HALF_TEX(spinor, stride, fl_stride) ASSN_ACCUM_HALF_TEX_(spinor, stride, fl_stride)
1118
1120
1121
//apply b*(1 + i*a*gamma_5) to the input spinor
// The gamma_5 structure couples spin components (0,2) and (1,3) within each
// color: for each pair, top' = b*(top + i*a*bottom), bottom' = b*(bottom +
// i*a*top). `reg` is the register-name prefix (pasted onto <spin><color>_re/im).
#define APPLY_TWIST_INV(a, b, reg)\
{\
   spinorFloat tmp_re, tmp_im;\
   tmp_re = reg##00_re - a * reg##20_im;\
   tmp_im = reg##00_im + a * reg##20_re;\
   reg##20_re -= a * reg##00_im;\
   reg##20_im += a * reg##00_re;\
\
   reg##00_re = b * tmp_re;\
   reg##00_im = b * tmp_im;\
   reg##20_re *= b;\
   reg##20_im *= b;\
\
   tmp_re = reg##10_re - a * reg##30_im;\
   tmp_im = reg##10_im + a * reg##30_re;\
   reg##30_re -= a * reg##10_im;\
   reg##30_im += a * reg##10_re;\
\
   reg##10_re = b * tmp_re;\
   reg##10_im = b * tmp_im;\
   reg##30_re *= b;\
   reg##30_im *= b;\
\
   tmp_re = reg##01_re - a * reg##21_im;\
   tmp_im = reg##01_im + a * reg##21_re;\
   reg##21_re -= a * reg##01_im;\
   reg##21_im += a * reg##01_re;\
\
   reg##01_re = b * tmp_re;\
   reg##01_im = b * tmp_im;\
   reg##21_re *= b;\
   reg##21_im *= b;\
\
   tmp_re = reg##11_re - a * reg##31_im;\
   tmp_im = reg##11_im + a * reg##31_re;\
   reg##31_re -= a * reg##11_im;\
   reg##31_im += a * reg##11_re;\
\
   reg##11_re = b * tmp_re;\
   reg##11_im = b * tmp_im;\
   reg##31_re *= b;\
   reg##31_im *= b;\
\
   tmp_re = reg##02_re - a * reg##22_im;\
   tmp_im = reg##02_im + a * reg##22_re;\
   reg##22_re -= a * reg##02_im;\
   reg##22_im += a * reg##02_re;\
\
   reg##02_re = b * tmp_re;\
   reg##02_im = b * tmp_im;\
   reg##22_re *= b;\
   reg##22_im *= b;\
\
   tmp_re = reg##12_re - a * reg##32_im;\
   tmp_im = reg##12_im + a * reg##32_re;\
   reg##32_re -= a * reg##12_im;\
   reg##32_im += a * reg##12_re;\
\
   reg##12_re = b * tmp_re;\
   reg##12_im = b * tmp_im;\
   reg##32_re *= b;\
   reg##32_im *= b;\
}
1185
1186
1187
// Apply (1 + i*a*gamma_5) to the input spinor (same coupling as
// APPLY_TWIST_INV but without the overall factor b): for each coupled pair,
// top' = top + i*a*bottom and bottom' = bottom + i*a*top.
#define APPLY_TWIST(a, reg)\
{\
   spinorFloat tmp_re, tmp_im;\
   tmp_re = reg##00_re - a * reg##20_im;\
   tmp_im = reg##00_im + a * reg##20_re;\
   reg##20_re -= a * reg##00_im;\
   reg##20_im += a * reg##00_re;\
\
   reg##00_re = tmp_re;\
   reg##00_im = tmp_im;\
\
   tmp_re = reg##10_re - a * reg##30_im;\
   tmp_im = reg##10_im + a * reg##30_re;\
   reg##30_re -= a * reg##10_im;\
   reg##30_im += a * reg##10_re;\
\
   reg##10_re = tmp_re;\
   reg##10_im = tmp_im;\
\
   tmp_re = reg##01_re - a * reg##21_im;\
   tmp_im = reg##01_im + a * reg##21_re;\
   reg##21_re -= a * reg##01_im;\
   reg##21_im += a * reg##01_re;\
\
   reg##01_re = tmp_re;\
   reg##01_im = tmp_im;\
\
   tmp_re = reg##11_re - a * reg##31_im;\
   tmp_im = reg##11_im + a * reg##31_re;\
   reg##31_re -= a * reg##11_im;\
   reg##31_im += a * reg##11_re;\
\
   reg##11_re = tmp_re;\
   reg##11_im = tmp_im;\
\
   tmp_re = reg##02_re - a * reg##22_im;\
   tmp_im = reg##02_im + a * reg##22_re;\
   reg##22_re -= a * reg##02_im;\
   reg##22_im += a * reg##02_re;\
\
   reg##02_re = tmp_re;\
   reg##02_im = tmp_im;\
\
   tmp_re = reg##12_re - a * reg##32_im;\
   tmp_im = reg##12_im + a * reg##32_re;\
   reg##32_re -= a * reg##12_im;\
   reg##32_im += a * reg##12_re;\
\
   reg##12_re = tmp_re;\
   reg##12_im = tmp_im;\
}
1238
1239
1240
Generated on Wed Feb 4 2015 17:00:12 for QUDA by
1.8.6