QUDA
v0.5.0
A library for QCD on GPUs
Main Page
Namespaces
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Pages
quda
lib
io_spinor.h
Go to the documentation of this file.
1
#define READ_SPINOR_DOUBLE(spinor, stride, sp_idx, norm_idx) \
2
double2 I0 = spinor[sp_idx + 0*(stride)]; \
3
double2 I1 = spinor[sp_idx + 1*(stride)]; \
4
double2 I2 = spinor[sp_idx + 2*(stride)]; \
5
double2 I3 = spinor[sp_idx + 3*(stride)]; \
6
double2 I4 = spinor[sp_idx + 4*(stride)]; \
7
double2 I5 = spinor[sp_idx + 5*(stride)]; \
8
double2 I6 = spinor[sp_idx + 6*(stride)]; \
9
double2 I7 = spinor[sp_idx + 7*(stride)]; \
10
double2 I8 = spinor[sp_idx + 8*(stride)]; \
11
double2 I9 = spinor[sp_idx + 9*(stride)]; \
12
double2 I10 = spinor[sp_idx + 10*(stride)]; \
13
double2 I11 = spinor[sp_idx + 11*(stride)];
14
15
#define READ_SPINOR_DOUBLE_UP(spinor, stride, sp_idx, norm_idx) \
16
double2 I0 = spinor[sp_idx + 0*(stride)]; \
17
double2 I1 = spinor[sp_idx + 1*(stride)]; \
18
double2 I2 = spinor[sp_idx + 2*(stride)]; \
19
double2 I3 = spinor[sp_idx + 3*(stride)]; \
20
double2 I4 = spinor[sp_idx + 4*(stride)]; \
21
double2 I5 = spinor[sp_idx + 5*(stride)];
22
23
#define READ_SPINOR_DOUBLE_DOWN(spinor, stride, sp_idx, norm_idx) \
24
double2 I6 = spinor[sp_idx + 6*(stride)]; \
25
double2 I7 = spinor[sp_idx + 7*(stride)]; \
26
double2 I8 = spinor[sp_idx + 8*(stride)]; \
27
double2 I9 = spinor[sp_idx + 9*(stride)]; \
28
double2 I10 = spinor[sp_idx + 10*(stride)]; \
29
double2 I11 = spinor[sp_idx + 11*(stride)];
30
31
#define READ_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
32
float4 I0 = spinor[sp_idx + 0*(stride)]; \
33
float4 I1 = spinor[sp_idx + 1*(stride)]; \
34
float4 I2 = spinor[sp_idx + 2*(stride)]; \
35
float4 I3 = spinor[sp_idx + 3*(stride)]; \
36
float4 I4 = spinor[sp_idx + 4*(stride)]; \
37
float4 I5 = spinor[sp_idx + 5*(stride)];
38
39
#define READ_SPINOR_SINGLE_UP(spinor, stride, sp_idx, norm_idx) \
40
float4 I0 = spinor[sp_idx + 0*(stride)]; \
41
float4 I1 = spinor[sp_idx + 1*(stride)]; \
42
float4 I2 = spinor[sp_idx + 2*(stride)]; \
43
44
#define READ_SPINOR_SINGLE_DOWN(spinor, stride, sp_idx, norm_idx) \
45
float4 I3 = spinor[sp_idx + 3*(stride)]; \
46
float4 I4 = spinor[sp_idx + 4*(stride)]; \
47
float4 I5 = spinor[sp_idx + 5*(stride)];
48
49
#define READ_SPINOR_HALF_(spinor, stride, sp_idx, norm_idx) \
50
float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]); \
51
float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]); \
52
float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]); \
53
float4 I3 = short42float4(spinor[sp_idx + 3*(stride)]); \
54
float4 I4 = short42float4(spinor[sp_idx + 4*(stride)]); \
55
float4 I5 = short42float4(spinor[sp_idx + 5*(stride)]); \
56
float C = (spinor ## Norm)[norm_idx]; \
57
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
58
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
59
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
60
I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
61
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
62
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
63
64
#define READ_SPINOR_HALF(spinor, stride, sp_idx, norm_idx) \
65
READ_SPINOR_HALF_(spinor, stride, sp_idx, norm_idx)
66
67
#define READ_SPINOR_HALF_UP_(spinor, stride, sp_idx, norm_idx) \
68
float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]); \
69
float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]); \
70
float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]); \
71
float C = (spinor ## Norm)[norm_idx]; \
72
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
73
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
74
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
75
76
#define READ_SPINOR_HALF_UP(spinor, stride, sp_idx, norm_idx) \
77
READ_SPINOR_HALF_UP_(spinor, stride, sp_idx, norm_idx)
78
79
#define READ_SPINOR_HALF_DOWN_(spinor, stride, sp_idx, norm_idx) \
80
float4 I3 = short42float4(spinor[sp_idx + 3*stride]); \
81
float4 I4 = short42float4(spinor[sp_idx + 4*stride]); \
82
float4 I5 = short42float4(spinor[sp_idx + 5*stride]); \
83
float C = (spinor ## Norm)[norm_idx]; \
84
I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
85
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
86
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
87
88
#define READ_SPINOR_HALF_DOWN(spinor, stride, sp_idx, norm_idx) \
89
READ_SPINOR_HALF_DOWN_(spinor, stride, sp_idx, norm_idx)
90
91
#define READ_ACCUM_DOUBLE(spinor, stride) \
92
double2 accum0 = spinor[sid + 0*stride]; \
93
double2 accum1 = spinor[sid + 1*stride]; \
94
double2 accum2 = spinor[sid + 2*stride]; \
95
double2 accum3 = spinor[sid + 3*stride]; \
96
double2 accum4 = spinor[sid + 4*stride]; \
97
double2 accum5 = spinor[sid + 5*stride]; \
98
double2 accum6 = spinor[sid + 6*stride]; \
99
double2 accum7 = spinor[sid + 7*stride]; \
100
double2 accum8 = spinor[sid + 8*stride]; \
101
double2 accum9 = spinor[sid + 9*stride]; \
102
double2 accum10 = spinor[sid + 10*stride]; \
103
double2 accum11 = spinor[sid + 11*stride];
104
105
#define READ_ACCUM_SINGLE(spinor, stride) \
106
float4 accum0 = spinor[sid + 0*(stride)]; \
107
float4 accum1 = spinor[sid + 1*(stride)]; \
108
float4 accum2 = spinor[sid + 2*(stride)]; \
109
float4 accum3 = spinor[sid + 3*(stride)]; \
110
float4 accum4 = spinor[sid + 4*(stride)]; \
111
float4 accum5 = spinor[sid + 5*(stride)];
112
113
#define READ_ACCUM_HALF_(spinor, stride) \
114
float4 accum0 = short42float4(spinor[sid + 0*stride]); \
115
float4 accum1 = short42float4(spinor[sid + 1*stride]); \
116
float4 accum2 = short42float4(spinor[sid + 2*stride]); \
117
float4 accum3 = short42float4(spinor[sid + 3*stride]); \
118
float4 accum4 = short42float4(spinor[sid + 4*stride]); \
119
float4 accum5 = short42float4(spinor[sid + 5*stride]); \
120
float C = (spinor ## Norm)[sid]; \
121
accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
122
accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
123
accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
124
accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
125
accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
126
accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;
127
128
#define READ_ACCUM_HALF(spinor, stride) READ_ACCUM_HALF_(spinor, stride)
129
130
#define READ_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \
131
double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
132
double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
133
double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
134
double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
135
double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
136
double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride)); \
137
double2 I6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
138
double2 I7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
139
double2 I8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
140
double2 I9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
141
double2 I10 = fetch_double2((spinor), sp_idx + 10*(stride)); \
142
double2 I11 = fetch_double2((spinor), sp_idx + 11*(stride));
143
144
#define READ_SPINOR_DOUBLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
145
double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
146
double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
147
double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
148
double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
149
double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
150
double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride));
151
152
#define READ_SPINOR_DOUBLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
153
double2 I6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
154
double2 I7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
155
double2 I8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
156
double2 I9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
157
double2 I10 = fetch_double2((spinor), sp_idx + 10*(stride)); \
158
double2 I11 = fetch_double2((spinor), sp_idx + 11*(stride));
159
160
#define READ_ACCUM_DOUBLE_TEX(spinor, stride) \
161
double2 accum0 = fetch_double2((spinor), sid + 0*(stride)); \
162
double2 accum1 = fetch_double2((spinor), sid + 1*(stride)); \
163
double2 accum2 = fetch_double2((spinor), sid + 2*(stride)); \
164
double2 accum3 = fetch_double2((spinor), sid + 3*(stride)); \
165
double2 accum4 = fetch_double2((spinor), sid + 4*(stride)); \
166
double2 accum5 = fetch_double2((spinor), sid + 5*(stride)); \
167
double2 accum6 = fetch_double2((spinor), sid + 6*(stride)); \
168
double2 accum7 = fetch_double2((spinor), sid + 7*(stride)); \
169
double2 accum8 = fetch_double2((spinor), sid + 8*(stride)); \
170
double2 accum9 = fetch_double2((spinor), sid + 9*(stride)); \
171
double2 accum10 = fetch_double2((spinor), sid + 10*(stride)); \
172
double2 accum11 = fetch_double2((spinor), sid + 11*(stride));
173
174
#define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
175
float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
176
float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
177
float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
178
float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
179
float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
180
float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
181
182
#define READ_SPINOR_SINGLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
183
float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
184
float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
185
float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
186
187
#define READ_SPINOR_SINGLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
188
float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
189
float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
190
float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
191
192
#define READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx) \
193
float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
194
float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
195
float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
196
float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
197
float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
198
float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride)); \
199
float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
200
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
201
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
202
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
203
I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
204
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
205
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
206
207
#define READ_SPINOR_HALF_TEX(spinor, stride, sp_idx, norm_idx) \
208
READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx) \
209
210
#define READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx) \
211
float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
212
float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
213
float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
214
float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
215
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
216
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
217
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
218
219
#define READ_SPINOR_HALF_UP_TEX(spinor, stride, sp_idx, norm_idx) \
220
READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx) \
221
222
#define READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx) \
223
float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
224
float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
225
float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride)); \
226
float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
227
I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
228
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
229
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
230
231
#define READ_SPINOR_HALF_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
232
READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx) \
233
234
#define READ_ACCUM_SINGLE_TEX(spinor, stride) \
235
float4 accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
236
float4 accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
237
float4 accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
238
float4 accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
239
float4 accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
240
float4 accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride));
241
242
#define READ_ACCUM_HALF_TEX_(spinor, stride) \
243
float4 accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
244
float4 accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
245
float4 accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
246
float4 accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
247
float4 accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
248
float4 accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride)); \
249
float C = TEX1DFETCH(float, (spinor ## Norm), sid); \
250
accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
251
accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
252
accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
253
accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
254
accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
255
accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;
256
257
#define READ_ACCUM_HALF_TEX(spinor, stride) READ_ACCUM_HALF_TEX_(spinor, stride)
258
259
260
#define WRITE_SPINOR_DOUBLE2(stride) \
261
out[0*(stride)+sid] = make_double2(o00_re, o00_im); \
262
out[1*(stride)+sid] = make_double2(o01_re, o01_im); \
263
out[2*(stride)+sid] = make_double2(o02_re, o02_im); \
264
out[3*(stride)+sid] = make_double2(o10_re, o10_im); \
265
out[4*(stride)+sid] = make_double2(o11_re, o11_im); \
266
out[5*(stride)+sid] = make_double2(o12_re, o12_im); \
267
out[6*(stride)+sid] = make_double2(o20_re, o20_im); \
268
out[7*(stride)+sid] = make_double2(o21_re, o21_im); \
269
out[8*(stride)+sid] = make_double2(o22_re, o22_im); \
270
out[9*(stride)+sid] = make_double2(o30_re, o30_im); \
271
out[10*(stride)+sid] = make_double2(o31_re, o31_im); \
272
out[11*(stride)+sid] = make_double2(o32_re, o32_im);
273
274
#define WRITE_SPINOR_FLOAT4(stride) \
275
out[0*(stride)+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
276
out[1*(stride)+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
277
out[2*(stride)+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \
278
out[3*(stride)+sid] = make_float4(o20_re, o20_im, o21_re, o21_im); \
279
out[4*(stride)+sid] = make_float4(o22_re, o22_im, o30_re, o30_im); \
280
out[5*(stride)+sid] = make_float4(o31_re, o31_im, o32_re, o32_im);
281
282
#define WRITE_SPINOR_SHORT4(stride) \
283
float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
284
float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im)); \
285
float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im)); \
286
float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \
287
float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \
288
float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \
289
float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \
290
float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \
291
float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \
292
float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \
293
float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \
294
float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \
295
c0 = fmaxf(c0, c1); \
296
c1 = fmaxf(c2, c3); \
297
c2 = fmaxf(c4, c5); \
298
c3 = fmaxf(c6, c7); \
299
c4 = fmaxf(c8, c9); \
300
c5 = fmaxf(c10, c11); \
301
c0 = fmaxf(c0, c1); \
302
c1 = fmaxf(c2, c3); \
303
c2 = fmaxf(c4, c5); \
304
c0 = fmaxf(c0, c1); \
305
c0 = fmaxf(c0, c2); \
306
outNorm[sid] = c0; \
307
float scale = __fdividef(MAX_SHORT, c0); \
308
o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
309
o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \
310
o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \
311
o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
312
o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
313
o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
314
out[sid+0*(stride)] = make_short4((short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
315
out[sid+1*(stride)] = make_short4((short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
316
out[sid+2*(stride)] = make_short4((short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
317
out[sid+3*(stride)] = make_short4((short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
318
out[sid+4*(stride)] = make_short4((short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
319
out[sid+5*(stride)] = make_short4((short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
320
321
#if (__COMPUTE_CAPABILITY__ >= 200)
322
#define WRITE_SPINOR_DOUBLE2_STR(stride) \
323
store_streaming_double2(&out[0*sp_stride+sid], o00_re, o00_im); \
324
store_streaming_double2(&out[1*sp_stride+sid], o01_re, o01_im); \
325
store_streaming_double2(&out[2*sp_stride+sid], o02_re, o02_im); \
326
store_streaming_double2(&out[3*sp_stride+sid], o10_re, o10_im); \
327
store_streaming_double2(&out[4*sp_stride+sid], o11_re, o11_im); \
328
store_streaming_double2(&out[5*sp_stride+sid], o12_re, o12_im); \
329
store_streaming_double2(&out[6*sp_stride+sid], o20_re, o20_im); \
330
store_streaming_double2(&out[7*sp_stride+sid], o21_re, o21_im); \
331
store_streaming_double2(&out[8*sp_stride+sid], o22_re, o22_im); \
332
store_streaming_double2(&out[9*sp_stride+sid], o30_re, o30_im); \
333
store_streaming_double2(&out[10*sp_stride+sid], o31_re, o31_im); \
334
store_streaming_double2(&out[11*sp_stride+sid], o32_re, o32_im);
335
336
#define WRITE_SPINOR_FLOAT4_STR(stride) \
337
store_streaming_float4(&out[0*(stride)+sid], o00_re, o00_im, o01_re, o01_im); \
338
store_streaming_float4(&out[1*(stride)+sid], o02_re, o02_im, o10_re, o10_im); \
339
store_streaming_float4(&out[2*(stride)+sid], o11_re, o11_im, o12_re, o12_im); \
340
store_streaming_float4(&out[3*(stride)+sid], o20_re, o20_im, o21_re, o21_im); \
341
store_streaming_float4(&out[4*(stride)+sid], o22_re, o22_im, o30_re, o30_im); \
342
store_streaming_float4(&out[5*(stride)+sid], o31_re, o31_im, o32_re, o32_im);
343
344
#define WRITE_SPINOR_SHORT4_STR(stride) \
345
float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
346
float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im)); \
347
float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im)); \
348
float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \
349
float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \
350
float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \
351
float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \
352
float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \
353
float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \
354
float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \
355
float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \
356
float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \
357
c0 = fmaxf(c0, c1); \
358
c1 = fmaxf(c2, c3); \
359
c2 = fmaxf(c4, c5); \
360
c3 = fmaxf(c6, c7); \
361
c4 = fmaxf(c8, c9); \
362
c5 = fmaxf(c10, c11); \
363
c0 = fmaxf(c0, c1); \
364
c1 = fmaxf(c2, c3); \
365
c2 = fmaxf(c4, c5); \
366
c0 = fmaxf(c0, c1); \
367
c0 = fmaxf(c0, c2); \
368
outNorm[sid] = c0; \
369
float scale = __fdividef(MAX_SHORT, c0); \
370
o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
371
o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \
372
o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \
373
o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
374
o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
375
o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
376
store_streaming_short4(&out[0*(stride)+sid], (short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
377
store_streaming_short4(&out[1*(stride)+sid], (short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
378
store_streaming_short4(&out[2*(stride)+sid], (short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
379
store_streaming_short4(&out[3*(stride)+sid], (short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
380
store_streaming_short4(&out[4*(stride)+sid], (short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
381
store_streaming_short4(&out[5*(stride)+sid], (short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
382
#else
383
#define WRITE_SPINOR_DOUBLE2_STR(stride) WRITE_SPINOR_DOUBLE2(stride)
384
#define WRITE_SPINOR_FLOAT4_STR(stride) WRITE_SPINOR_FLOAT4(stride)
385
#define WRITE_SPINOR_SHORT4_STR(stride) WRITE_SPINOR_SHORT4(stride)
386
#endif
387
388
// macros used for exterior Wilson Dslash kernels and face packing
389
390
#define READ_HALF_SPINOR READ_SPINOR_UP
391
392
#define WRITE_HALF_SPINOR_DOUBLE2(stride, sid) \
393
out[0*(stride)+sid] = make_double2(a0_re, a0_im); \
394
out[1*(stride)+sid] = make_double2(a1_re, a1_im); \
395
out[2*(stride)+sid] = make_double2(a2_re, a2_im); \
396
out[3*(stride)+sid] = make_double2(b0_re, b0_im); \
397
out[4*(stride)+sid] = make_double2(b1_re, b1_im); \
398
out[5*(stride)+sid] = make_double2(b2_re, b2_im);
399
400
#define WRITE_HALF_SPINOR_FLOAT4(stride, sid) \
401
out[0*(stride)+sid] = make_float4(a0_re, a0_im, a1_re, a1_im); \
402
out[1*(stride)+sid] = make_float4(a2_re, a2_im, b0_re, b0_im); \
403
out[2*(stride)+sid] = make_float4(b1_re, b1_im, b2_re, b2_im);
404
405
#define WRITE_HALF_SPINOR_SHORT4(stride, sid) \
406
float c0 = fmaxf(fabsf(a0_re), fabsf(a0_im)); \
407
float c1 = fmaxf(fabsf(a1_re), fabsf(a1_im)); \
408
float c2 = fmaxf(fabsf(a2_re), fabsf(a2_im)); \
409
float c3 = fmaxf(fabsf(b0_re), fabsf(b0_im)); \
410
float c4 = fmaxf(fabsf(b1_re), fabsf(b1_im)); \
411
float c5 = fmaxf(fabsf(b2_re), fabsf(b2_im)); \
412
c0 = fmaxf(c0, c1); \
413
c1 = fmaxf(c2, c3); \
414
c2 = fmaxf(c4, c5); \
415
c0 = fmaxf(c0, c1); \
416
c0 = fmaxf(c0, c2); \
417
outNorm[sid] = c0; \
418
float scale = __fdividef(MAX_SHORT, c0); \
419
a0_re *= scale; a0_im *= scale; a1_re *= scale; a1_im *= scale; \
420
a2_re *= scale; a2_im *= scale; b0_re *= scale; b0_im *= scale; \
421
b1_re *= scale; b1_im *= scale; b2_re *= scale; b2_im *= scale; \
422
out[sid+0*(stride)] = make_short4((short)a0_re, (short)a0_im, (short)a1_re, (short)a1_im); \
423
out[sid+1*(stride)] = make_short4((short)a2_re, (short)a2_im, (short)b0_re, (short)b0_im); \
424
out[sid+2*(stride)] = make_short4((short)b1_re, (short)b1_im, (short)b2_re, (short)b2_im);
425
427
/******************used by non-degenerate twisted mass**********************/
428
#define WRITE_FLAVOR_SPINOR_DOUBLE2() \
429
out[0*(sp_stride)+sid] = make_double2(o1_00_re, o1_00_im); \
430
out[1*(sp_stride)+sid] = make_double2(o1_01_re, o1_01_im); \
431
out[2*(sp_stride)+sid] = make_double2(o1_02_re, o1_02_im); \
432
out[3*(sp_stride)+sid] = make_double2(o1_10_re, o1_10_im); \
433
out[4*(sp_stride)+sid] = make_double2(o1_11_re, o1_11_im); \
434
out[5*(sp_stride)+sid] = make_double2(o1_12_re, o1_12_im); \
435
out[6*(sp_stride)+sid] = make_double2(o1_20_re, o1_20_im); \
436
out[7*(sp_stride)+sid] = make_double2(o1_21_re, o1_21_im); \
437
out[8*(sp_stride)+sid] = make_double2(o1_22_re, o1_22_im); \
438
out[9*(sp_stride)+sid] = make_double2(o1_30_re, o1_30_im); \
439
out[10*(sp_stride)+sid] = make_double2(o1_31_re, o1_31_im); \
440
out[11*(sp_stride)+sid] = make_double2(o1_32_re, o1_32_im); \
441
out[0*(sp_stride)+sid+fl_stride] = make_double2(o2_00_re, o2_00_im); \
442
out[1*(sp_stride)+sid+fl_stride] = make_double2(o2_01_re, o2_01_im); \
443
out[2*(sp_stride)+sid+fl_stride] = make_double2(o2_02_re, o2_02_im); \
444
out[3*(sp_stride)+sid+fl_stride] = make_double2(o2_10_re, o2_10_im); \
445
out[4*(sp_stride)+sid+fl_stride] = make_double2(o2_11_re, o2_11_im); \
446
out[5*(sp_stride)+sid+fl_stride] = make_double2(o2_12_re, o2_12_im); \
447
out[6*(sp_stride)+sid+fl_stride] = make_double2(o2_20_re, o2_20_im); \
448
out[7*(sp_stride)+sid+fl_stride] = make_double2(o2_21_re, o2_21_im); \
449
out[8*(sp_stride)+sid+fl_stride] = make_double2(o2_22_re, o2_22_im); \
450
out[9*(sp_stride)+sid+fl_stride] = make_double2(o2_30_re, o2_30_im); \
451
out[10*(sp_stride)+sid+fl_stride] = make_double2(o2_31_re, o2_31_im); \
452
out[11*(sp_stride)+sid+fl_stride] = make_double2(o2_32_re, o2_32_im);
453
454
455
#define WRITE_FLAVOR_SPINOR_FLOAT4() \
456
out[0*(sp_stride)+sid] = make_float4(o1_00_re, o1_00_im, o1_01_re, o1_01_im); \
457
out[1*(sp_stride)+sid] = make_float4(o1_02_re, o1_02_im, o1_10_re, o1_10_im); \
458
out[2*(sp_stride)+sid] = make_float4(o1_11_re, o1_11_im, o1_12_re, o1_12_im); \
459
out[3*(sp_stride)+sid] = make_float4(o1_20_re, o1_20_im, o1_21_re, o1_21_im); \
460
out[4*(sp_stride)+sid] = make_float4(o1_22_re, o1_22_im, o1_30_re, o1_30_im); \
461
out[5*(sp_stride)+sid] = make_float4(o1_31_re, o1_31_im, o1_32_re, o1_32_im); \
462
out[0*(sp_stride)+sid+fl_stride] = make_float4(o2_00_re, o2_00_im, o2_01_re, o2_01_im); \
463
out[1*(sp_stride)+sid+fl_stride] = make_float4(o2_02_re, o2_02_im, o2_10_re, o2_10_im); \
464
out[2*(sp_stride)+sid+fl_stride] = make_float4(o2_11_re, o2_11_im, o2_12_re, o2_12_im); \
465
out[3*(sp_stride)+sid+fl_stride] = make_float4(o2_20_re, o2_20_im, o2_21_re, o2_21_im); \
466
out[4*(sp_stride)+sid+fl_stride] = make_float4(o2_22_re, o2_22_im, o2_30_re, o2_30_im); \
467
out[5*(sp_stride)+sid+fl_stride] = make_float4(o2_31_re, o2_31_im, o2_32_re, o2_32_im);
468
469
470
#define WRITE_FLAVOR_SPINOR_SHORT4() \
471
float c0 = fmaxf(fabsf(o1_00_re), fabsf(o1_00_im)); \
472
float c1 = fmaxf(fabsf(o1_01_re), fabsf(o1_02_im)); \
473
float c2 = fmaxf(fabsf(o1_02_re), fabsf(o1_01_im)); \
474
float c3 = fmaxf(fabsf(o1_10_re), fabsf(o1_10_im)); \
475
float c4 = fmaxf(fabsf(o1_11_re), fabsf(o1_11_im)); \
476
float c5 = fmaxf(fabsf(o1_12_re), fabsf(o1_12_im)); \
477
float c6 = fmaxf(fabsf(o1_20_re), fabsf(o1_20_im)); \
478
float c7 = fmaxf(fabsf(o1_21_re), fabsf(o1_21_im)); \
479
float c8 = fmaxf(fabsf(o1_22_re), fabsf(o1_22_im)); \
480
float c9 = fmaxf(fabsf(o1_30_re), fabsf(o1_30_im)); \
481
float c10 = fmaxf(fabsf(o1_31_re), fabsf(o1_31_im)); \
482
float c11 = fmaxf(fabsf(o1_32_re), fabsf(o1_32_im)); \
483
c0 = fmaxf(c0, c1); \
484
c1 = fmaxf(c2, c3); \
485
c2 = fmaxf(c4, c5); \
486
c3 = fmaxf(c6, c7); \
487
c4 = fmaxf(c8, c9); \
488
c5 = fmaxf(c10, c11); \
489
c0 = fmaxf(c0, c1); \
490
c1 = fmaxf(c2, c3); \
491
c2 = fmaxf(c4, c5); \
492
c0 = fmaxf(c0, c1); \
493
c0 = fmaxf(c0, c2); \
494
outNorm[sid] = c0; \
495
float scale = __fdividef(MAX_SHORT, c0); \
496
o1_00_re *= scale; o1_00_im *= scale; o1_01_re *= scale; o1_01_im *= scale; \
497
o1_02_re *= scale; o1_02_im *= scale; o1_10_re *= scale; o1_10_im *= scale; \
498
o1_11_re *= scale; o1_11_im *= scale; o1_12_re *= scale; o1_12_im *= scale; \
499
o1_20_re *= scale; o1_20_im *= scale; o1_21_re *= scale; o1_21_im *= scale; \
500
o1_22_re *= scale; o1_22_im *= scale; o1_30_re *= scale; o1_30_im *= scale; \
501
o1_31_re *= scale; o1_31_im *= scale; o1_32_re *= scale; o1_32_im *= scale; \
502
out[sid+0*(sp_stride)] = make_short4((short)o1_00_re, (short)o1_00_im, (short)o1_01_re, (short)o1_01_im); \
503
out[sid+1*(sp_stride)] = make_short4((short)o1_02_re, (short)o1_02_im, (short)o1_10_re, (short)o1_10_im); \
504
out[sid+2*(sp_stride)] = make_short4((short)o1_11_re, (short)o1_11_im, (short)o1_12_re, (short)o1_12_im); \
505
out[sid+3*(sp_stride)] = make_short4((short)o1_20_re, (short)o1_20_im, (short)o1_21_re, (short)o1_21_im); \
506
out[sid+4*(sp_stride)] = make_short4((short)o1_22_re, (short)o1_22_im, (short)o1_30_re, (short)o1_30_im); \
507
out[sid+5*(sp_stride)] = make_short4((short)o1_31_re, (short)o1_31_im, (short)o1_32_re, (short)o1_32_im); \
508
c0 = fmaxf(fabsf(o2_00_re), fabsf(o2_00_im)); \
509
c1 = fmaxf(fabsf(o2_01_re), fabsf(o2_02_im)); \
510
c2 = fmaxf(fabsf(o2_02_re), fabsf(o2_01_im)); \
511
c3 = fmaxf(fabsf(o2_10_re), fabsf(o2_10_im)); \
512
c4 = fmaxf(fabsf(o2_11_re), fabsf(o2_11_im)); \
513
c5 = fmaxf(fabsf(o2_12_re), fabsf(o2_12_im)); \
514
c6 = fmaxf(fabsf(o2_20_re), fabsf(o2_20_im)); \
515
c7 = fmaxf(fabsf(o2_21_re), fabsf(o2_21_im)); \
516
c8 = fmaxf(fabsf(o2_22_re), fabsf(o2_22_im)); \
517
c9 = fmaxf(fabsf(o2_30_re), fabsf(o2_30_im)); \
518
c10 = fmaxf(fabsf(o2_31_re), fabsf(o2_31_im)); \
519
c11 = fmaxf(fabsf(o2_32_re), fabsf(o2_32_im)); \
520
c0 = fmaxf(c0, c1); \
521
c1 = fmaxf(c2, c3); \
522
c2 = fmaxf(c4, c5); \
523
c3 = fmaxf(c6, c7); \
524
c4 = fmaxf(c8, c9); \
525
c5 = fmaxf(c10, c11); \
526
c0 = fmaxf(c0, c1); \
527
c1 = fmaxf(c2, c3); \
528
c2 = fmaxf(c4, c5); \
529
c0 = fmaxf(c0, c1); \
530
c0 = fmaxf(c0, c2); \
531
outNorm[sid+fl_stride] = c0; \
532
scale = __fdividef(MAX_SHORT, c0); \
533
o2_00_re *= scale; o2_00_im *= scale; o2_01_re *= scale; o2_01_im *= scale; \
534
o2_02_re *= scale; o2_02_im *= scale; o2_10_re *= scale; o2_10_im *= scale; \
535
o2_11_re *= scale; o2_11_im *= scale; o2_12_re *= scale; o2_12_im *= scale; \
536
o2_20_re *= scale; o2_20_im *= scale; o2_21_re *= scale; o2_21_im *= scale; \
537
o2_22_re *= scale; o2_22_im *= scale; o2_30_re *= scale; o2_30_im *= scale; \
538
o2_31_re *= scale; o2_31_im *= scale; o2_32_re *= scale; o2_32_im *= scale; \
539
out[sid+fl_stride+0*(sp_stride)] = make_short4((short)o2_00_re, (short)o2_00_im, (short)o2_01_re, (short)o2_01_im); \
540
out[sid+fl_stride+1*(sp_stride)] = make_short4((short)o2_02_re, (short)o2_02_im, (short)o2_10_re, (short)o2_10_im); \
541
out[sid+fl_stride+2*(sp_stride)] = make_short4((short)o2_11_re, (short)o2_11_im, (short)o2_12_re, (short)o2_12_im); \
542
out[sid+fl_stride+3*(sp_stride)] = make_short4((short)o2_20_re, (short)o2_20_im, (short)o2_21_re, (short)o2_21_im); \
543
out[sid+fl_stride+4*(sp_stride)] = make_short4((short)o2_22_re, (short)o2_22_im, (short)o2_30_re, (short)o2_30_im); \
544
out[sid+fl_stride+5*(sp_stride)] = make_short4((short)o2_31_re, (short)o2_31_im, (short)o2_32_re, (short)o2_32_im);
545
546
547
/************* the following is used by staggered *****************/
548
549
#define READ_1ST_NBR_SPINOR_DOUBLE_TEX(spinor, idx, mystride) \
550
double2 I0 = fetch_double2((spinor), idx + 0*mystride); \
551
double2 I1 = fetch_double2((spinor), idx + 1*mystride); \
552
double2 I2 = fetch_double2((spinor), idx + 2*mystride);
553
554
#define READ_3RD_NBR_SPINOR_DOUBLE_TEX(spinor, idx, mystride) \
555
double2 T0 = fetch_double2((spinor), idx + 0*mystride); \
556
double2 T1 = fetch_double2((spinor), idx + 1*mystride); \
557
double2 T2 = fetch_double2((spinor), idx + 2*mystride);
558
559
#define READ_1ST_NBR_SPINOR_SINGLE_TEX(spinor, idx, mystride) \
560
float2 I0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
561
float2 I1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
562
float2 I2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride);
563
564
#define READ_3RD_NBR_SPINOR_SINGLE_TEX(spinor, idx, mystride) \
565
float2 T0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
566
float2 T1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
567
float2 T2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride);
568
569
#define READ_1ST_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride) \
570
float2 I0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
571
float2 I1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
572
float2 I2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride); \
573
{ \
574
float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx1); \
575
I0.x *= C; I0.y *= C; \
576
I1.x *= C; I1.y *= C; \
577
I2.x *= C; I2.y *= C;}
578
579
#define READ_1ST_NBR_SPINOR_HALF_TEX(spinor, idx, mystride) \
580
READ_1ST_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride)
581
582
#define READ_3RD_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride) \
583
float2 T0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
584
float2 T1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
585
float2 T2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride); \
586
{ \
587
float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx3); \
588
T0.x *= C; T0.y *= C; \
589
T1.x *= C; T1.y *= C; \
590
T2.x *= C; T2.y *= C;}
591
592
#define READ_3RD_NBR_SPINOR_HALF_TEX(spinor, idx, mystride) \
593
READ_3RD_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride)
594
595
#define READ_1ST_NBR_SPINOR_DOUBLE(spinor, idx, mystride) \
596
double2 I0 = spinor[idx + 0*mystride]; \
597
double2 I1 = spinor[idx + 1*mystride]; \
598
double2 I2 = spinor[idx + 2*mystride];
599
600
#define READ_3RD_NBR_SPINOR_DOUBLE(spinor, idx, mystride) \
601
double2 T0 = spinor[idx + 0*mystride]; \
602
double2 T1 = spinor[idx + 1*mystride]; \
603
double2 T2 = spinor[idx + 2*mystride];
604
605
#define READ_1ST_NBR_SPINOR_SINGLE(spinor, idx, mystride) \
606
float2 I0 = spinor[idx + 0*mystride]; \
607
float2 I1 = spinor[idx + 1*mystride]; \
608
float2 I2 = spinor[idx + 2*mystride];
609
610
#define READ_3RD_NBR_SPINOR_SINGLE(spinor, idx, mystride) \
611
float2 T0 = spinor[idx + 0*mystride]; \
612
float2 T1 = spinor[idx + 1*mystride]; \
613
float2 T2 = spinor[idx + 2*mystride];
614
615
#define READ_1ST_NBR_SPINOR_HALF(spinor, idx, mystride) \
616
float2 I0, I1, I2; \
617
{ \
618
short2 S0 = in[idx + 0*mystride]; \
619
short2 S1 = in[idx + 1*mystride]; \
620
short2 S2 = in[idx + 2*mystride]; \
621
float C = inNorm[idx]; \
622
I0.x =C*short2float(S0.x); I0.y =C*short2float(S0.y); \
623
I1.x =C*short2float(S1.x); I1.y =C*short2float(S1.y); \
624
I2.x =C*short2float(S2.x); I2.y =C*short2float(S2.y); \
625
}
626
627
#define READ_3RD_NBR_SPINOR_HALF(spinor, idx, mystride) \
628
float2 T0, T1, T2; \
629
{ \
630
short2 S0 = in[idx + 0*mystride]; \
631
short2 S1 = in[idx + 1*mystride]; \
632
short2 S2 = in[idx + 2*mystride]; \
633
float C = inNorm[idx]; \
634
T0.x =C*short2float(S0.x); T0.y =C*short2float(S0.y); \
635
T1.x =C*short2float(S1.x); T1.y =C*short2float(S1.y); \
636
T2.x =C*short2float(S2.x); T2.y =C*short2float(S2.y); \
637
}
638
639
640
#define WRITE_ST_SPINOR_DOUBLE2(out) \
641
out[0*sp_stride+sid] = make_double2(o00_re, o00_im); \
642
out[1*sp_stride+sid] = make_double2(o01_re, o01_im); \
643
out[2*sp_stride+sid] = make_double2(o02_re, o02_im);
644
645
#define WRITE_ST_SPINOR_FLOAT2(out) \
646
out[0*sp_stride+sid] = make_float2(o00_re, o00_im); \
647
out[1*sp_stride+sid] = make_float2(o01_re, o01_im); \
648
out[2*sp_stride+sid] = make_float2(o02_re, o02_im);
649
650
#define WRITE_ST_SPINOR_SHORT2(out) \
651
float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
652
float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \
653
float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \
654
c0 = fmaxf(c0, c1); \
655
c0 = fmaxf(c0, c2); \
656
out ## Norm[sid] = c0; \
657
float scale = __fdividef(MAX_SHORT, c0); \
658
o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
659
o02_re *= scale; o02_im *= scale; \
660
out[sid+0*sp_stride] = make_short2((short)o00_re, (short)o00_im); \
661
out[sid+1*sp_stride] = make_short2((short)o01_re, (short)o01_im); \
662
out[sid+2*sp_stride] = make_short2((short)o02_re, (short)o02_im);
663
664
// Non-cache writes to minimize cache polution
665
#if (__COMPUTE_CAPABILITY__ >= 200)
666
667
#define WRITE_ST_SPINOR_DOUBLE2_STR(out) \
668
store_streaming_double2(&out[0*sp_stride+sid], o00_re, o00_im); \
669
store_streaming_double2(&out[1*sp_stride+sid], o01_re, o01_im); \
670
store_streaming_double2(&out[2*sp_stride+sid], o02_re, o02_im);
671
672
#define WRITE_ST_SPINOR_FLOAT2_STR(out) \
673
store_streaming_float2(&out[0*sp_stride+sid], o00_re, o00_im); \
674
store_streaming_float2(&out[1*sp_stride+sid], o01_re, o01_im); \
675
store_streaming_float2(&out[2*sp_stride+sid], o02_re, o02_im);
676
677
#define WRITE_ST_SPINOR_SHORT2_STR(out) \
678
float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
679
float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \
680
float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \
681
c0 = fmaxf(c0, c1); \
682
c0 = fmaxf(c0, c2); \
683
out ## Norm[sid] = c0; \
684
float scale = __fdividef(MAX_SHORT, c0); \
685
o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
686
o02_re *= scale; o02_im *= scale; \
687
store_streaming_short2(&g_out[0*sp_stride+sid], (short)o00_re, (short)o00_im); \
688
store_streaming_short2(&g_out[1*sp_stride+sid], (short)o01_re, (short)o01_im); \
689
store_streaming_short2(&g_out[2*sp_stride+sid], (short)o02_re, (short)o02_im);
690
#else
691
692
#define WRITE_ST_SPINOR_DOUBLE2_STR() WRITE_ST_SPINOR_DOUBLE2()
693
#define WRITE_ST_SPINOR_FLOAT4_STR() WRITE_ST_SPINOR_FLOAT4()
694
#define WRITE_ST_SPINOR_SHORT4_STR() WRITE_ST_SPINOR_SHORT4()
695
696
#endif
697
698
#define READ_AND_SUM_ST_SPINOR_DOUBLE_TEX(spinor) { \
699
double2 tmp0 = fetch_double2((spinor), sid + 0*(sp_stride)); \
700
double2 tmp1 = fetch_double2((spinor), sid + 1*(sp_stride)); \
701
double2 tmp2 = fetch_double2((spinor), sid + 2*(sp_stride)); \
702
o00_re += tmp0.x; o00_im += tmp0.y; \
703
o01_re += tmp1.x; o01_im += tmp1.y; \
704
o02_re += tmp2.x; o02_im += tmp2.y; }
705
706
#define READ_AND_SUM_ST_SPINOR_SINGLE_TEX(spinor) { \
707
float2 tmp0 = TEX1DFETCH(float2, (spinor), sid + 0*(sp_stride)); \
708
float2 tmp1 = TEX1DFETCH(float2, (spinor), sid + 1*(sp_stride)); \
709
float2 tmp2 = TEX1DFETCH(float2, (spinor), sid + 2*(sp_stride)); \
710
o00_re += tmp0.x; o00_im += tmp0.y; \
711
o01_re += tmp1.x; o01_im += tmp1.y; \
712
o02_re += tmp2.x; o02_im += tmp2.y; }
713
714
#define READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor) { \
715
float2 tmp0 = TEX1DFETCH(float2, (spinor), sid + 0*sp_stride); \
716
float2 tmp1 = TEX1DFETCH(float2, (spinor), sid + 1*sp_stride); \
717
float2 tmp2 = TEX1DFETCH(float2, (spinor), sid + 2*sp_stride); \
718
float C = TEX1DFETCH(float, (spinor##Norm), sid); \
719
o00_re += C*tmp0.x; o00_im += C*tmp0.y; \
720
o01_re += C*tmp1.x; o01_im += C*tmp1.y; \
721
o02_re += C*tmp2.x; o02_im += C*tmp2.y; }
722
723
#define READ_AND_SUM_ST_SPINOR_HALF_TEX(spinor) \
724
READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor)
725
726
#define READ_AND_SUM_ST_SPINOR(spinor) \
727
o00_re += spinor[0*sp_stride+sid].x; o00_im += spinor[0*sp_stride+sid].y; \
728
o01_re += spinor[1*sp_stride+sid].x; o01_im += spinor[1*sp_stride+sid].y; \
729
o02_re += spinor[2*sp_stride+sid].x; o02_im += spinor[2*sp_stride+sid].y; \
730
731
#define READ_AND_SUM_ST_SPINOR_HALF_(spinor) \
732
float C = spinor ## Norm[sid]; \
733
o00_re += C*short2float(spinor[0*sp_stride + sid].x); \
734
o00_im += C*short2float(spinor[0*sp_stride + sid].y); \
735
o01_re += C*short2float(spinor[1*sp_stride + sid].x); \
736
o01_im += C*short2float(spinor[1*sp_stride + sid].y); \
737
o02_re += C*short2float(spinor[2*sp_stride + sid].x); \
738
o02_im += C*short2float(spinor[2*sp_stride + sid].y);
739
740
#define READ_AND_SUM_ST_SPINOR_HALF(spinor) \
741
READ_AND_SUM_ST_SPINOR_HALF_(spinor)
742
743
#define READ_ST_ACCUM_DOUBLE_TEX(spinor) \
744
double2 accum0 = fetch_double2((spinor), sid + 0*(sp_stride)); \
745
double2 accum1 = fetch_double2((spinor), sid + 1*(sp_stride)); \
746
double2 accum2 = fetch_double2((spinor), sid + 2*(sp_stride));
747
748
#define READ_ST_ACCUM_SINGLE_TEX(spinor) \
749
float2 accum0 = TEX1DFETCH(float2, (spinor), sid + 0*sp_stride); \
750
float2 accum1 = TEX1DFETCH(float2, (spinor), sid + 1*sp_stride); \
751
float2 accum2 = TEX1DFETCH(float2, (spinor), sid + 2*sp_stride);
752
753
#define READ_ST_ACCUM_HALF_TEX_(spinor) \
754
float2 accum0 = TEX1DFETCH(float2, (spinor), sid + 0*sp_stride); \
755
float2 accum1 = TEX1DFETCH(float2, (spinor), sid + 1*sp_stride); \
756
float2 accum2 = TEX1DFETCH(float2, (spinor), sid + 2*sp_stride); \
757
float C = TEX1DFETCH(float, (spinor ## Norm), sid); \
758
accum0.x *= C; accum0.y *= C; \
759
accum1.x *= C; accum1.y *= C; \
760
accum2.x *= C; accum2.y *= C;
761
762
#define READ_ST_ACCUM_HALF_TEX(spinor) READ_ST_ACCUM_HALF_TEX_(spinor)
763
764
#define READ_ST_ACCUM_DOUBLE(spinor) \
765
double2 accum0 = spinor[sid + 0*(sp_stride)]; \
766
double2 accum1 = spinor[sid + 1*(sp_stride)]; \
767
double2 accum2 = spinor[sid + 2*(sp_stride)];
768
769
#define READ_ST_ACCUM_SINGLE(spinor) \
770
float2 accum0 = spinor[sid + 0*(sp_stride)]; \
771
float2 accum1 = spinor[sid + 1*(sp_stride)]; \
772
float2 accum2 = spinor[sid + 2*(sp_stride)];
773
774
#define READ_ST_ACCUM_HALF(spinor) \
775
float2 accum0, accum1, accum2; \
776
{ \
777
short2 S0 = x[sid + 0*sp_stride]; \
778
short2 S1 = x[sid + 1*sp_stride]; \
779
short2 S2 = x[sid + 2*sp_stride]; \
780
float C = spinor##Norm[sid]; \
781
accum0.x =C*short2float(S0.x); accum0.y =C*short2float(S0.y); \
782
accum1.x =C*short2float(S1.x); accum1.y =C*short2float(S1.y); \
783
accum2.x =C*short2float(S2.x); accum2.y =C*short2float(S2.y); \
784
}
785
786
#define WRITE_SPINOR_SHARED_REAL(tx, ty, tz, reg) \
787
extern __shared__ char s_data[]; \
788
spinorFloat *sh = (spinorFloat*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
789
((tx+blockDim.x*(ty+blockDim.y*tz))/SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
790
sh[0*SHARED_STRIDE] = reg##00_re; \
791
sh[1*SHARED_STRIDE] = reg##00_im; \
792
sh[2*SHARED_STRIDE] = reg##01_re; \
793
sh[3*SHARED_STRIDE] = reg##01_im; \
794
sh[4*SHARED_STRIDE] = reg##02_re; \
795
sh[5*SHARED_STRIDE] = reg##02_im; \
796
sh[6*SHARED_STRIDE] = reg##10_re; \
797
sh[7*SHARED_STRIDE] = reg##10_im; \
798
sh[8*SHARED_STRIDE] = reg##11_re; \
799
sh[9*SHARED_STRIDE] = reg##11_im; \
800
sh[10*SHARED_STRIDE] = reg##12_re; \
801
sh[11*SHARED_STRIDE] = reg##12_im; \
802
sh[12*SHARED_STRIDE] = reg##20_re; \
803
sh[13*SHARED_STRIDE] = reg##20_im; \
804
sh[14*SHARED_STRIDE] = reg##21_re; \
805
sh[15*SHARED_STRIDE] = reg##21_im; \
806
sh[16*SHARED_STRIDE] = reg##22_re; \
807
sh[17*SHARED_STRIDE] = reg##22_im; \
808
sh[18*SHARED_STRIDE] = reg##30_re; \
809
sh[19*SHARED_STRIDE] = reg##30_im; \
810
sh[20*SHARED_STRIDE] = reg##31_re; \
811
sh[21*SHARED_STRIDE] = reg##31_im; \
812
sh[22*SHARED_STRIDE] = reg##32_re; \
813
sh[23*SHARED_STRIDE] = reg##32_im;
814
815
#define WRITE_SPINOR_SHARED_DOUBLE2 WRITE_SPINOR_SHARED_REAL
816
817
#define READ_SPINOR_SHARED_DOUBLE2(tx, ty, tz) \
818
extern __shared__ char s_data[]; \
819
double *sh = (double*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
820
((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
821
double2 I0 = make_double2(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE]); \
822
double2 I1 = make_double2(sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \
823
double2 I2 = make_double2(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE]); \
824
double2 I3 = make_double2(sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \
825
double2 I4 = make_double2(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE]); \
826
double2 I5 = make_double2(sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \
827
double2 I6 = make_double2(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE]); \
828
double2 I7 = make_double2(sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \
829
double2 I8 = make_double2(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE]); \
830
double2 I9 = make_double2(sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \
831
double2 I10 = make_double2(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE]); \
832
double2 I11 = make_double2(sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]);
833
834
#ifndef SHARED_8_BYTE_WORD_SIZE // 4-byte shared memory access
835
836
#define WRITE_SPINOR_SHARED_FLOAT4 WRITE_SPINOR_SHARED_REAL
837
838
#define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz) \
839
extern __shared__ char s_data[]; \
840
float *sh = (float*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
841
((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
842
float4 I0 = make_float4(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE], sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \
843
float4 I1 = make_float4(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE], sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \
844
float4 I2 = make_float4(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE], sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \
845
float4 I3 = make_float4(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE], sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \
846
float4 I4 = make_float4(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE], sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \
847
float4 I5 = make_float4(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE], sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]);
848
849
#else // 8-byte shared memory words
850
851
#define WRITE_SPINOR_SHARED_FLOAT4(tx, ty, tz, reg) \
852
extern __shared__ char s_data[]; \
853
float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \
854
((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
855
sh[0*SHARED_STRIDE] = make_float2(reg##00_re, reg##00_im); \
856
sh[1*SHARED_STRIDE] = make_float2(reg##01_re, reg##01_im); \
857
sh[2*SHARED_STRIDE] = make_float2(reg##02_re, reg##02_im); \
858
sh[3*SHARED_STRIDE] = make_float2(reg##10_re, reg##10_im); \
859
sh[4*SHARED_STRIDE] = make_float2(reg##11_re, reg##11_im); \
860
sh[5*SHARED_STRIDE] = make_float2(reg##12_re, reg##12_im); \
861
sh[6*SHARED_STRIDE] = make_float2(reg##20_re, reg##20_im); \
862
sh[7*SHARED_STRIDE] = make_float2(reg##21_re, reg##21_im); \
863
sh[8*SHARED_STRIDE] = make_float2(reg##22_re, reg##22_im); \
864
sh[9*SHARED_STRIDE] = make_float2(reg##30_re, reg##30_im); \
865
sh[10*SHARED_STRIDE] = make_float2(reg##31_re, reg##31_im); \
866
sh[11*SHARED_STRIDE] = make_float2(reg##32_re, reg##32_im);
867
868
#define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz) \
869
extern __shared__ char s_data[]; \
870
float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \
871
((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
872
float2 tmp1, tmp2; \
873
tmp1 = sh[0*SHARED_STRIDE]; tmp2 = sh[1*SHARED_STRIDE]; float4 I0 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
874
tmp1 = sh[2*SHARED_STRIDE]; tmp2 = sh[3*SHARED_STRIDE]; float4 I1 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
875
tmp1 = sh[4*SHARED_STRIDE]; tmp2 = sh[5*SHARED_STRIDE]; float4 I2 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
876
tmp1 = sh[6*SHARED_STRIDE]; tmp2 = sh[7*SHARED_STRIDE]; float4 I3 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
877
tmp1 = sh[8*SHARED_STRIDE]; tmp2 = sh[9*SHARED_STRIDE]; float4 I4 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
878
tmp1 = sh[10*SHARED_STRIDE]; tmp2 = sh[11*SHARED_STRIDE]; float4 I5 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y);
879
880
#endif
881
882
Generated on Wed Mar 20 2013 12:52:17 for QUDA by
1.8.2