QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
io_spinor.h
Go to the documentation of this file.
1 #define READ_SPINOR_DOUBLE(spinor, stride, sp_idx, norm_idx) \
2  double2 I0 = spinor[sp_idx + 0*(stride)]; \
3  double2 I1 = spinor[sp_idx + 1*(stride)]; \
4  double2 I2 = spinor[sp_idx + 2*(stride)]; \
5  double2 I3 = spinor[sp_idx + 3*(stride)]; \
6  double2 I4 = spinor[sp_idx + 4*(stride)]; \
7  double2 I5 = spinor[sp_idx + 5*(stride)]; \
8  double2 I6 = spinor[sp_idx + 6*(stride)]; \
9  double2 I7 = spinor[sp_idx + 7*(stride)]; \
10  double2 I8 = spinor[sp_idx + 8*(stride)]; \
11  double2 I9 = spinor[sp_idx + 9*(stride)]; \
12  double2 I10 = spinor[sp_idx + 10*(stride)]; \
13  double2 I11 = spinor[sp_idx + 11*(stride)];
14 
15 #define READ_SPINOR_DOUBLE_UP(spinor, stride, sp_idx, norm_idx) \
16  double2 I0 = spinor[sp_idx + 0*(stride)]; \
17  double2 I1 = spinor[sp_idx + 1*(stride)]; \
18  double2 I2 = spinor[sp_idx + 2*(stride)]; \
19  double2 I3 = spinor[sp_idx + 3*(stride)]; \
20  double2 I4 = spinor[sp_idx + 4*(stride)]; \
21  double2 I5 = spinor[sp_idx + 5*(stride)];
22 
23 #define READ_SPINOR_DOUBLE_DOWN(spinor, stride, sp_idx, norm_idx) \
24  double2 I6 = spinor[sp_idx + 6*(stride)]; \
25  double2 I7 = spinor[sp_idx + 7*(stride)]; \
26  double2 I8 = spinor[sp_idx + 8*(stride)]; \
27  double2 I9 = spinor[sp_idx + 9*(stride)]; \
28  double2 I10 = spinor[sp_idx + 10*(stride)]; \
29  double2 I11 = spinor[sp_idx + 11*(stride)];
30 
31 #define READ_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
32  float4 I0 = spinor[sp_idx + 0*(stride)]; \
33  float4 I1 = spinor[sp_idx + 1*(stride)]; \
34  float4 I2 = spinor[sp_idx + 2*(stride)]; \
35  float4 I3 = spinor[sp_idx + 3*(stride)]; \
36  float4 I4 = spinor[sp_idx + 4*(stride)]; \
37  float4 I5 = spinor[sp_idx + 5*(stride)];
38 
39 #define READ_SPINOR_SINGLE_UP(spinor, stride, sp_idx, norm_idx) \
40  float4 I0 = spinor[sp_idx + 0*(stride)]; \
41  float4 I1 = spinor[sp_idx + 1*(stride)]; \
42  float4 I2 = spinor[sp_idx + 2*(stride)]; \
43 
44 #define READ_SPINOR_SINGLE_DOWN(spinor, stride, sp_idx, norm_idx) \
45  float4 I3 = spinor[sp_idx + 3*(stride)]; \
46  float4 I4 = spinor[sp_idx + 4*(stride)]; \
47  float4 I5 = spinor[sp_idx + 5*(stride)];
48 
49 #define READ_SPINOR_HALF_(spinor, stride, sp_idx, norm_idx) \
50  float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]); \
51  float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]); \
52  float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]); \
53  float4 I3 = short42float4(spinor[sp_idx + 3*(stride)]); \
54  float4 I4 = short42float4(spinor[sp_idx + 4*(stride)]); \
55  float4 I5 = short42float4(spinor[sp_idx + 5*(stride)]); \
56  float C = (spinor ## Norm)[norm_idx]; \
57  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
58  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
59  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
60  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
61  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
62  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
63 
64 #define READ_SPINOR_HALF(spinor, stride, sp_idx, norm_idx) \
65  READ_SPINOR_HALF_(spinor, stride, sp_idx, norm_idx)
66 
67 #define READ_SPINOR_HALF_UP_(spinor, stride, sp_idx, norm_idx) \
68  float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]); \
69  float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]); \
70  float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]); \
71  float C = (spinor ## Norm)[norm_idx]; \
72  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
73  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
74  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
75 
76 #define READ_SPINOR_HALF_UP(spinor, stride, sp_idx, norm_idx) \
77  READ_SPINOR_HALF_UP_(spinor, stride, sp_idx, norm_idx)
78 
79 #define READ_SPINOR_HALF_DOWN_(spinor, stride, sp_idx, norm_idx) \
80  float4 I3 = short42float4(spinor[sp_idx + 3*stride]); \
81  float4 I4 = short42float4(spinor[sp_idx + 4*stride]); \
82  float4 I5 = short42float4(spinor[sp_idx + 5*stride]); \
83  float C = (spinor ## Norm)[norm_idx]; \
84  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
85  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
86  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
87 
88 #define READ_SPINOR_HALF_DOWN(spinor, stride, sp_idx, norm_idx) \
89  READ_SPINOR_HALF_DOWN_(spinor, stride, sp_idx, norm_idx)
90 
91 #define READ_ACCUM_DOUBLE(spinor, stride) \
92  double2 accum0 = spinor[sid + 0*stride]; \
93  double2 accum1 = spinor[sid + 1*stride]; \
94  double2 accum2 = spinor[sid + 2*stride]; \
95  double2 accum3 = spinor[sid + 3*stride]; \
96  double2 accum4 = spinor[sid + 4*stride]; \
97  double2 accum5 = spinor[sid + 5*stride]; \
98  double2 accum6 = spinor[sid + 6*stride]; \
99  double2 accum7 = spinor[sid + 7*stride]; \
100  double2 accum8 = spinor[sid + 8*stride]; \
101  double2 accum9 = spinor[sid + 9*stride]; \
102  double2 accum10 = spinor[sid + 10*stride]; \
103  double2 accum11 = spinor[sid + 11*stride];
104 
105 #define READ_ACCUM_SINGLE(spinor, stride) \
106  float4 accum0 = spinor[sid + 0*(stride)]; \
107  float4 accum1 = spinor[sid + 1*(stride)]; \
108  float4 accum2 = spinor[sid + 2*(stride)]; \
109  float4 accum3 = spinor[sid + 3*(stride)]; \
110  float4 accum4 = spinor[sid + 4*(stride)]; \
111  float4 accum5 = spinor[sid + 5*(stride)];
112 
113 #define READ_ACCUM_HALF_(spinor, stride) \
114  float4 accum0 = short42float4(spinor[sid + 0*stride]); \
115  float4 accum1 = short42float4(spinor[sid + 1*stride]); \
116  float4 accum2 = short42float4(spinor[sid + 2*stride]); \
117  float4 accum3 = short42float4(spinor[sid + 3*stride]); \
118  float4 accum4 = short42float4(spinor[sid + 4*stride]); \
119  float4 accum5 = short42float4(spinor[sid + 5*stride]); \
120  float C = (spinor ## Norm)[sid]; \
121  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
122  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
123  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
124  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
125  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
126  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;
127 
128 #define READ_ACCUM_HALF(spinor, stride) READ_ACCUM_HALF_(spinor, stride)
129 
130 #define READ_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \
131  double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
132  double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
133  double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
134  double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
135  double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
136  double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride)); \
137  double2 I6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
138  double2 I7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
139  double2 I8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
140  double2 I9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
141  double2 I10 = fetch_double2((spinor), sp_idx + 10*(stride)); \
142  double2 I11 = fetch_double2((spinor), sp_idx + 11*(stride));
143 
144 #define READ_SPINOR_DOUBLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
145  double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
146  double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
147  double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
148  double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
149  double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
150  double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride));
151 
152 #define READ_SPINOR_DOUBLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
153  double2 I6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
154  double2 I7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
155  double2 I8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
156  double2 I9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
157  double2 I10 = fetch_double2((spinor), sp_idx + 10*(stride)); \
158  double2 I11 = fetch_double2((spinor), sp_idx + 11*(stride));
159 
160 #define READ_ACCUM_DOUBLE_TEX(spinor, stride) \
161  double2 accum0 = fetch_double2((spinor), sid + 0*(stride)); \
162  double2 accum1 = fetch_double2((spinor), sid + 1*(stride)); \
163  double2 accum2 = fetch_double2((spinor), sid + 2*(stride)); \
164  double2 accum3 = fetch_double2((spinor), sid + 3*(stride)); \
165  double2 accum4 = fetch_double2((spinor), sid + 4*(stride)); \
166  double2 accum5 = fetch_double2((spinor), sid + 5*(stride)); \
167  double2 accum6 = fetch_double2((spinor), sid + 6*(stride)); \
168  double2 accum7 = fetch_double2((spinor), sid + 7*(stride)); \
169  double2 accum8 = fetch_double2((spinor), sid + 8*(stride)); \
170  double2 accum9 = fetch_double2((spinor), sid + 9*(stride)); \
171  double2 accum10 = fetch_double2((spinor), sid + 10*(stride)); \
172  double2 accum11 = fetch_double2((spinor), sid + 11*(stride));
173 
174 #define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
175  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
176  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
177  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
178  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
179  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
180  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
181 
182 #define READ_SPINOR_SINGLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
183  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
184  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
185  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
186 
187 #define READ_SPINOR_SINGLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
188  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
189  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
190  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
191 
192 #define READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx) \
193  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
194  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
195  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
196  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
197  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
198  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride)); \
199  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
200  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
201  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
202  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
203  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
204  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
205  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
206 
207 #define READ_SPINOR_HALF_TEX(spinor, stride, sp_idx, norm_idx) \
208  READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx) \
209 
210 #define READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx) \
211  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
212  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
213  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
214  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
215  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
216  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
217  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
218 
219 #define READ_SPINOR_HALF_UP_TEX(spinor, stride, sp_idx, norm_idx) \
220  READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx) \
221 
222 #define READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx) \
223  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
224  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
225  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride)); \
226  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
227  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
228  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
229  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
230 
231 #define READ_SPINOR_HALF_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
232  READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx) \
233 
234 #define READ_ACCUM_SINGLE_TEX(spinor, stride) \
235  float4 accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
236  float4 accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
237  float4 accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
238  float4 accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
239  float4 accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
240  float4 accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride));
241 
242 #define READ_ACCUM_HALF_TEX_(spinor, stride) \
243  float4 accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
244  float4 accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
245  float4 accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
246  float4 accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
247  float4 accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
248  float4 accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride)); \
249  float C = TEX1DFETCH(float, (spinor ## Norm), sid); \
250  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
251  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
252  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
253  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
254  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
255  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;
256 
257 #define READ_ACCUM_HALF_TEX(spinor, stride) READ_ACCUM_HALF_TEX_(spinor, stride)
258 
259 
260 #define WRITE_SPINOR_DOUBLE2(stride) \
261  out[0*(stride)+sid] = make_double2(o00_re, o00_im); \
262  out[1*(stride)+sid] = make_double2(o01_re, o01_im); \
263  out[2*(stride)+sid] = make_double2(o02_re, o02_im); \
264  out[3*(stride)+sid] = make_double2(o10_re, o10_im); \
265  out[4*(stride)+sid] = make_double2(o11_re, o11_im); \
266  out[5*(stride)+sid] = make_double2(o12_re, o12_im); \
267  out[6*(stride)+sid] = make_double2(o20_re, o20_im); \
268  out[7*(stride)+sid] = make_double2(o21_re, o21_im); \
269  out[8*(stride)+sid] = make_double2(o22_re, o22_im); \
270  out[9*(stride)+sid] = make_double2(o30_re, o30_im); \
271  out[10*(stride)+sid] = make_double2(o31_re, o31_im); \
272  out[11*(stride)+sid] = make_double2(o32_re, o32_im);
273 
274 #define WRITE_SPINOR_FLOAT4(stride) \
275  out[0*(stride)+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
276  out[1*(stride)+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
277  out[2*(stride)+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \
278  out[3*(stride)+sid] = make_float4(o20_re, o20_im, o21_re, o21_im); \
279  out[4*(stride)+sid] = make_float4(o22_re, o22_im, o30_re, o30_im); \
280  out[5*(stride)+sid] = make_float4(o31_re, o31_im, o32_re, o32_im);
281 
282 #define WRITE_SPINOR_SHORT4(stride) \
283  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
284  float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im)); \
285  float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im)); \
286  float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \
287  float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \
288  float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \
289  float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \
290  float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \
291  float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \
292  float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \
293  float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \
294  float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \
295  c0 = fmaxf(c0, c1); \
296  c1 = fmaxf(c2, c3); \
297  c2 = fmaxf(c4, c5); \
298  c3 = fmaxf(c6, c7); \
299  c4 = fmaxf(c8, c9); \
300  c5 = fmaxf(c10, c11); \
301  c0 = fmaxf(c0, c1); \
302  c1 = fmaxf(c2, c3); \
303  c2 = fmaxf(c4, c5); \
304  c0 = fmaxf(c0, c1); \
305  c0 = fmaxf(c0, c2); \
306  outNorm[sid] = c0; \
307  float scale = __fdividef(MAX_SHORT, c0); \
308  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
309  o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \
310  o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \
311  o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
312  o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
313  o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
314  out[sid+0*(stride)] = make_short4((short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
315  out[sid+1*(stride)] = make_short4((short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
316  out[sid+2*(stride)] = make_short4((short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
317  out[sid+3*(stride)] = make_short4((short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
318  out[sid+4*(stride)] = make_short4((short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
319  out[sid+5*(stride)] = make_short4((short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
320 
321 #if (__COMPUTE_CAPABILITY__ >= 200)
322 #define WRITE_SPINOR_DOUBLE2_STR(stride) \
323  store_streaming_double2(&out[0*sp_stride+sid], o00_re, o00_im); \
324  store_streaming_double2(&out[1*sp_stride+sid], o01_re, o01_im); \
325  store_streaming_double2(&out[2*sp_stride+sid], o02_re, o02_im); \
326  store_streaming_double2(&out[3*sp_stride+sid], o10_re, o10_im); \
327  store_streaming_double2(&out[4*sp_stride+sid], o11_re, o11_im); \
328  store_streaming_double2(&out[5*sp_stride+sid], o12_re, o12_im); \
329  store_streaming_double2(&out[6*sp_stride+sid], o20_re, o20_im); \
330  store_streaming_double2(&out[7*sp_stride+sid], o21_re, o21_im); \
331  store_streaming_double2(&out[8*sp_stride+sid], o22_re, o22_im); \
332  store_streaming_double2(&out[9*sp_stride+sid], o30_re, o30_im); \
333  store_streaming_double2(&out[10*sp_stride+sid], o31_re, o31_im); \
334  store_streaming_double2(&out[11*sp_stride+sid], o32_re, o32_im);
335 
336 #define WRITE_SPINOR_FLOAT4_STR(stride) \
337  store_streaming_float4(&out[0*(stride)+sid], o00_re, o00_im, o01_re, o01_im); \
338  store_streaming_float4(&out[1*(stride)+sid], o02_re, o02_im, o10_re, o10_im); \
339  store_streaming_float4(&out[2*(stride)+sid], o11_re, o11_im, o12_re, o12_im); \
340  store_streaming_float4(&out[3*(stride)+sid], o20_re, o20_im, o21_re, o21_im); \
341  store_streaming_float4(&out[4*(stride)+sid], o22_re, o22_im, o30_re, o30_im); \
342  store_streaming_float4(&out[5*(stride)+sid], o31_re, o31_im, o32_re, o32_im);
343 
344 #define WRITE_SPINOR_SHORT4_STR(stride) \
345  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
346  float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im)); \
347  float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im)); \
348  float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \
349  float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \
350  float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \
351  float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \
352  float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \
353  float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \
354  float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \
355  float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \
356  float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \
357  c0 = fmaxf(c0, c1); \
358  c1 = fmaxf(c2, c3); \
359  c2 = fmaxf(c4, c5); \
360  c3 = fmaxf(c6, c7); \
361  c4 = fmaxf(c8, c9); \
362  c5 = fmaxf(c10, c11); \
363  c0 = fmaxf(c0, c1); \
364  c1 = fmaxf(c2, c3); \
365  c2 = fmaxf(c4, c5); \
366  c0 = fmaxf(c0, c1); \
367  c0 = fmaxf(c0, c2); \
368  outNorm[sid] = c0; \
369  float scale = __fdividef(MAX_SHORT, c0); \
370  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
371  o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \
372  o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \
373  o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
374  o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
375  o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
376  store_streaming_short4(&out[0*(stride)+sid], (short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
377  store_streaming_short4(&out[1*(stride)+sid], (short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
378  store_streaming_short4(&out[2*(stride)+sid], (short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
379  store_streaming_short4(&out[3*(stride)+sid], (short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
380  store_streaming_short4(&out[4*(stride)+sid], (short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
381  store_streaming_short4(&out[5*(stride)+sid], (short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
382 #else
383 #define WRITE_SPINOR_DOUBLE2_STR(stride) WRITE_SPINOR_DOUBLE2(stride)
384 #define WRITE_SPINOR_FLOAT4_STR(stride) WRITE_SPINOR_FLOAT4(stride)
385 #define WRITE_SPINOR_SHORT4_STR(stride) WRITE_SPINOR_SHORT4(stride)
386 #endif
387 
388 // macros used for exterior Wilson Dslash kernels and face packing
389 
390 #define READ_HALF_SPINOR READ_SPINOR_UP
391 
392 #define WRITE_HALF_SPINOR_DOUBLE2(stride, sid) \
393  out[0*(stride)+sid] = make_double2(a0_re, a0_im); \
394  out[1*(stride)+sid] = make_double2(a1_re, a1_im); \
395  out[2*(stride)+sid] = make_double2(a2_re, a2_im); \
396  out[3*(stride)+sid] = make_double2(b0_re, b0_im); \
397  out[4*(stride)+sid] = make_double2(b1_re, b1_im); \
398  out[5*(stride)+sid] = make_double2(b2_re, b2_im);
399 
400 #define WRITE_HALF_SPINOR_FLOAT4(stride, sid) \
401  out[0*(stride)+sid] = make_float4(a0_re, a0_im, a1_re, a1_im); \
402  out[1*(stride)+sid] = make_float4(a2_re, a2_im, b0_re, b0_im); \
403  out[2*(stride)+sid] = make_float4(b1_re, b1_im, b2_re, b2_im);
404 
405 #define WRITE_HALF_SPINOR_SHORT4(stride, sid) \
406  float c0 = fmaxf(fabsf(a0_re), fabsf(a0_im)); \
407  float c1 = fmaxf(fabsf(a1_re), fabsf(a1_im)); \
408  float c2 = fmaxf(fabsf(a2_re), fabsf(a2_im)); \
409  float c3 = fmaxf(fabsf(b0_re), fabsf(b0_im)); \
410  float c4 = fmaxf(fabsf(b1_re), fabsf(b1_im)); \
411  float c5 = fmaxf(fabsf(b2_re), fabsf(b2_im)); \
412  c0 = fmaxf(c0, c1); \
413  c1 = fmaxf(c2, c3); \
414  c2 = fmaxf(c4, c5); \
415  c0 = fmaxf(c0, c1); \
416  c0 = fmaxf(c0, c2); \
417  outNorm[sid] = c0; \
418  float scale = __fdividef(MAX_SHORT, c0); \
419  a0_re *= scale; a0_im *= scale; a1_re *= scale; a1_im *= scale; \
420  a2_re *= scale; a2_im *= scale; b0_re *= scale; b0_im *= scale; \
421  b1_re *= scale; b1_im *= scale; b2_re *= scale; b2_im *= scale; \
422  out[sid+0*(stride)] = make_short4((short)a0_re, (short)a0_im, (short)a1_re, (short)a1_im); \
423  out[sid+1*(stride)] = make_short4((short)a2_re, (short)a2_im, (short)b0_re, (short)b0_im); \
424  out[sid+2*(stride)] = make_short4((short)b1_re, (short)b1_im, (short)b2_re, (short)b2_im);
425 
427 /******************used by non-degenerate twisted mass**********************/
428 #define WRITE_FLAVOR_SPINOR_DOUBLE2() \
429  out[0*(sp_stride)+sid] = make_double2(o1_00_re, o1_00_im); \
430  out[1*(sp_stride)+sid] = make_double2(o1_01_re, o1_01_im); \
431  out[2*(sp_stride)+sid] = make_double2(o1_02_re, o1_02_im); \
432  out[3*(sp_stride)+sid] = make_double2(o1_10_re, o1_10_im); \
433  out[4*(sp_stride)+sid] = make_double2(o1_11_re, o1_11_im); \
434  out[5*(sp_stride)+sid] = make_double2(o1_12_re, o1_12_im); \
435  out[6*(sp_stride)+sid] = make_double2(o1_20_re, o1_20_im); \
436  out[7*(sp_stride)+sid] = make_double2(o1_21_re, o1_21_im); \
437  out[8*(sp_stride)+sid] = make_double2(o1_22_re, o1_22_im); \
438  out[9*(sp_stride)+sid] = make_double2(o1_30_re, o1_30_im); \
439  out[10*(sp_stride)+sid] = make_double2(o1_31_re, o1_31_im); \
440  out[11*(sp_stride)+sid] = make_double2(o1_32_re, o1_32_im); \
441  out[0*(sp_stride)+sid+fl_stride] = make_double2(o2_00_re, o2_00_im); \
442  out[1*(sp_stride)+sid+fl_stride] = make_double2(o2_01_re, o2_01_im); \
443  out[2*(sp_stride)+sid+fl_stride] = make_double2(o2_02_re, o2_02_im); \
444  out[3*(sp_stride)+sid+fl_stride] = make_double2(o2_10_re, o2_10_im); \
445  out[4*(sp_stride)+sid+fl_stride] = make_double2(o2_11_re, o2_11_im); \
446  out[5*(sp_stride)+sid+fl_stride] = make_double2(o2_12_re, o2_12_im); \
447  out[6*(sp_stride)+sid+fl_stride] = make_double2(o2_20_re, o2_20_im); \
448  out[7*(sp_stride)+sid+fl_stride] = make_double2(o2_21_re, o2_21_im); \
449  out[8*(sp_stride)+sid+fl_stride] = make_double2(o2_22_re, o2_22_im); \
450  out[9*(sp_stride)+sid+fl_stride] = make_double2(o2_30_re, o2_30_im); \
451  out[10*(sp_stride)+sid+fl_stride] = make_double2(o2_31_re, o2_31_im); \
452  out[11*(sp_stride)+sid+fl_stride] = make_double2(o2_32_re, o2_32_im);
453 
454 
455 #define WRITE_FLAVOR_SPINOR_FLOAT4() \
456  out[0*(sp_stride)+sid] = make_float4(o1_00_re, o1_00_im, o1_01_re, o1_01_im); \
457  out[1*(sp_stride)+sid] = make_float4(o1_02_re, o1_02_im, o1_10_re, o1_10_im); \
458  out[2*(sp_stride)+sid] = make_float4(o1_11_re, o1_11_im, o1_12_re, o1_12_im); \
459  out[3*(sp_stride)+sid] = make_float4(o1_20_re, o1_20_im, o1_21_re, o1_21_im); \
460  out[4*(sp_stride)+sid] = make_float4(o1_22_re, o1_22_im, o1_30_re, o1_30_im); \
461  out[5*(sp_stride)+sid] = make_float4(o1_31_re, o1_31_im, o1_32_re, o1_32_im); \
462  out[0*(sp_stride)+sid+fl_stride] = make_float4(o2_00_re, o2_00_im, o2_01_re, o2_01_im); \
463  out[1*(sp_stride)+sid+fl_stride] = make_float4(o2_02_re, o2_02_im, o2_10_re, o2_10_im); \
464  out[2*(sp_stride)+sid+fl_stride] = make_float4(o2_11_re, o2_11_im, o2_12_re, o2_12_im); \
465  out[3*(sp_stride)+sid+fl_stride] = make_float4(o2_20_re, o2_20_im, o2_21_re, o2_21_im); \
466  out[4*(sp_stride)+sid+fl_stride] = make_float4(o2_22_re, o2_22_im, o2_30_re, o2_30_im); \
467  out[5*(sp_stride)+sid+fl_stride] = make_float4(o2_31_re, o2_31_im, o2_32_re, o2_32_im);
468 
469 
470 #define WRITE_FLAVOR_SPINOR_SHORT4() \
471  float c0 = fmaxf(fabsf(o1_00_re), fabsf(o1_00_im)); \
472  float c1 = fmaxf(fabsf(o1_01_re), fabsf(o1_02_im)); \
473  float c2 = fmaxf(fabsf(o1_02_re), fabsf(o1_01_im)); \
474  float c3 = fmaxf(fabsf(o1_10_re), fabsf(o1_10_im)); \
475  float c4 = fmaxf(fabsf(o1_11_re), fabsf(o1_11_im)); \
476  float c5 = fmaxf(fabsf(o1_12_re), fabsf(o1_12_im)); \
477  float c6 = fmaxf(fabsf(o1_20_re), fabsf(o1_20_im)); \
478  float c7 = fmaxf(fabsf(o1_21_re), fabsf(o1_21_im)); \
479  float c8 = fmaxf(fabsf(o1_22_re), fabsf(o1_22_im)); \
480  float c9 = fmaxf(fabsf(o1_30_re), fabsf(o1_30_im)); \
481  float c10 = fmaxf(fabsf(o1_31_re), fabsf(o1_31_im)); \
482  float c11 = fmaxf(fabsf(o1_32_re), fabsf(o1_32_im)); \
483  c0 = fmaxf(c0, c1); \
484  c1 = fmaxf(c2, c3); \
485  c2 = fmaxf(c4, c5); \
486  c3 = fmaxf(c6, c7); \
487  c4 = fmaxf(c8, c9); \
488  c5 = fmaxf(c10, c11); \
489  c0 = fmaxf(c0, c1); \
490  c1 = fmaxf(c2, c3); \
491  c2 = fmaxf(c4, c5); \
492  c0 = fmaxf(c0, c1); \
493  c0 = fmaxf(c0, c2); \
494  outNorm[sid] = c0; \
495  float scale = __fdividef(MAX_SHORT, c0); \
496  o1_00_re *= scale; o1_00_im *= scale; o1_01_re *= scale; o1_01_im *= scale; \
497  o1_02_re *= scale; o1_02_im *= scale; o1_10_re *= scale; o1_10_im *= scale; \
498  o1_11_re *= scale; o1_11_im *= scale; o1_12_re *= scale; o1_12_im *= scale; \
499  o1_20_re *= scale; o1_20_im *= scale; o1_21_re *= scale; o1_21_im *= scale; \
500  o1_22_re *= scale; o1_22_im *= scale; o1_30_re *= scale; o1_30_im *= scale; \
501  o1_31_re *= scale; o1_31_im *= scale; o1_32_re *= scale; o1_32_im *= scale; \
502  out[sid+0*(sp_stride)] = make_short4((short)o1_00_re, (short)o1_00_im, (short)o1_01_re, (short)o1_01_im); \
503  out[sid+1*(sp_stride)] = make_short4((short)o1_02_re, (short)o1_02_im, (short)o1_10_re, (short)o1_10_im); \
504  out[sid+2*(sp_stride)] = make_short4((short)o1_11_re, (short)o1_11_im, (short)o1_12_re, (short)o1_12_im); \
505  out[sid+3*(sp_stride)] = make_short4((short)o1_20_re, (short)o1_20_im, (short)o1_21_re, (short)o1_21_im); \
506  out[sid+4*(sp_stride)] = make_short4((short)o1_22_re, (short)o1_22_im, (short)o1_30_re, (short)o1_30_im); \
507  out[sid+5*(sp_stride)] = make_short4((short)o1_31_re, (short)o1_31_im, (short)o1_32_re, (short)o1_32_im); \
508  c0 = fmaxf(fabsf(o2_00_re), fabsf(o2_00_im)); \
509  c1 = fmaxf(fabsf(o2_01_re), fabsf(o2_02_im)); \
510  c2 = fmaxf(fabsf(o2_02_re), fabsf(o2_01_im)); \
511  c3 = fmaxf(fabsf(o2_10_re), fabsf(o2_10_im)); \
512  c4 = fmaxf(fabsf(o2_11_re), fabsf(o2_11_im)); \
513  c5 = fmaxf(fabsf(o2_12_re), fabsf(o2_12_im)); \
514  c6 = fmaxf(fabsf(o2_20_re), fabsf(o2_20_im)); \
515  c7 = fmaxf(fabsf(o2_21_re), fabsf(o2_21_im)); \
516  c8 = fmaxf(fabsf(o2_22_re), fabsf(o2_22_im)); \
517  c9 = fmaxf(fabsf(o2_30_re), fabsf(o2_30_im)); \
518  c10 = fmaxf(fabsf(o2_31_re), fabsf(o2_31_im)); \
519  c11 = fmaxf(fabsf(o2_32_re), fabsf(o2_32_im)); \
520  c0 = fmaxf(c0, c1); \
521  c1 = fmaxf(c2, c3); \
522  c2 = fmaxf(c4, c5); \
523  c3 = fmaxf(c6, c7); \
524  c4 = fmaxf(c8, c9); \
525  c5 = fmaxf(c10, c11); \
526  c0 = fmaxf(c0, c1); \
527  c1 = fmaxf(c2, c3); \
528  c2 = fmaxf(c4, c5); \
529  c0 = fmaxf(c0, c1); \
530  c0 = fmaxf(c0, c2); \
531  outNorm[sid+fl_stride] = c0; \
532  scale = __fdividef(MAX_SHORT, c0); \
533  o2_00_re *= scale; o2_00_im *= scale; o2_01_re *= scale; o2_01_im *= scale; \
534  o2_02_re *= scale; o2_02_im *= scale; o2_10_re *= scale; o2_10_im *= scale; \
535  o2_11_re *= scale; o2_11_im *= scale; o2_12_re *= scale; o2_12_im *= scale; \
536  o2_20_re *= scale; o2_20_im *= scale; o2_21_re *= scale; o2_21_im *= scale; \
537  o2_22_re *= scale; o2_22_im *= scale; o2_30_re *= scale; o2_30_im *= scale; \
538  o2_31_re *= scale; o2_31_im *= scale; o2_32_re *= scale; o2_32_im *= scale; \
539  out[sid+fl_stride+0*(sp_stride)] = make_short4((short)o2_00_re, (short)o2_00_im, (short)o2_01_re, (short)o2_01_im); \
540  out[sid+fl_stride+1*(sp_stride)] = make_short4((short)o2_02_re, (short)o2_02_im, (short)o2_10_re, (short)o2_10_im); \
541  out[sid+fl_stride+2*(sp_stride)] = make_short4((short)o2_11_re, (short)o2_11_im, (short)o2_12_re, (short)o2_12_im); \
542  out[sid+fl_stride+3*(sp_stride)] = make_short4((short)o2_20_re, (short)o2_20_im, (short)o2_21_re, (short)o2_21_im); \
543  out[sid+fl_stride+4*(sp_stride)] = make_short4((short)o2_22_re, (short)o2_22_im, (short)o2_30_re, (short)o2_30_im); \
544  out[sid+fl_stride+5*(sp_stride)] = make_short4((short)o2_31_re, (short)o2_31_im, (short)o2_32_re, (short)o2_32_im);
545 
546 
547 /************* the following is used by staggered *****************/
548 
549 #define READ_1ST_NBR_SPINOR_DOUBLE_TEX(spinor, idx, mystride) \
550  double2 I0 = fetch_double2((spinor), idx + 0*mystride); \
551  double2 I1 = fetch_double2((spinor), idx + 1*mystride); \
552  double2 I2 = fetch_double2((spinor), idx + 2*mystride);
553 
554 #define READ_3RD_NBR_SPINOR_DOUBLE_TEX(spinor, idx, mystride) \
555  double2 T0 = fetch_double2((spinor), idx + 0*mystride); \
556  double2 T1 = fetch_double2((spinor), idx + 1*mystride); \
557  double2 T2 = fetch_double2((spinor), idx + 2*mystride);
558 
559 #define READ_1ST_NBR_SPINOR_SINGLE_TEX(spinor, idx, mystride) \
560  float2 I0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
561  float2 I1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
562  float2 I2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride);
563 
564 #define READ_3RD_NBR_SPINOR_SINGLE_TEX(spinor, idx, mystride) \
565  float2 T0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
566  float2 T1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
567  float2 T2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride);
568 
569 #define READ_1ST_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride) \
570  float2 I0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
571  float2 I1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
572  float2 I2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride); \
573  { \
574  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx1); \
575  I0.x *= C; I0.y *= C; \
576  I1.x *= C; I1.y *= C; \
577  I2.x *= C; I2.y *= C;}
578 
579 #define READ_1ST_NBR_SPINOR_HALF_TEX(spinor, idx, mystride) \
580  READ_1ST_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride)
581 
582 #define READ_3RD_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride) \
583  float2 T0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
584  float2 T1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
585  float2 T2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride); \
586  { \
587  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx3); \
588  T0.x *= C; T0.y *= C; \
589  T1.x *= C; T1.y *= C; \
590  T2.x *= C; T2.y *= C;}
591 
592 #define READ_3RD_NBR_SPINOR_HALF_TEX(spinor, idx, mystride) \
593  READ_3RD_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride)
594 
595 #define READ_1ST_NBR_SPINOR_DOUBLE(spinor, idx, mystride) \
596  double2 I0 = spinor[idx + 0*mystride]; \
597  double2 I1 = spinor[idx + 1*mystride]; \
598  double2 I2 = spinor[idx + 2*mystride];
599 
600 #define READ_3RD_NBR_SPINOR_DOUBLE(spinor, idx, mystride) \
601  double2 T0 = spinor[idx + 0*mystride]; \
602  double2 T1 = spinor[idx + 1*mystride]; \
603  double2 T2 = spinor[idx + 2*mystride];
604 
605 #define READ_1ST_NBR_SPINOR_SINGLE(spinor, idx, mystride) \
606  float2 I0 = spinor[idx + 0*mystride]; \
607  float2 I1 = spinor[idx + 1*mystride]; \
608  float2 I2 = spinor[idx + 2*mystride];
609 
610 #define READ_3RD_NBR_SPINOR_SINGLE(spinor, idx, mystride) \
611  float2 T0 = spinor[idx + 0*mystride]; \
612  float2 T1 = spinor[idx + 1*mystride]; \
613  float2 T2 = spinor[idx + 2*mystride];
614 
615 #define READ_1ST_NBR_SPINOR_HALF(spinor, idx, mystride) \
616  float2 I0, I1, I2; \
617  { \
618  short2 S0 = in[idx + 0*mystride]; \
619  short2 S1 = in[idx + 1*mystride]; \
620  short2 S2 = in[idx + 2*mystride]; \
621  float C = inNorm[idx]; \
622  I0.x =C*short2float(S0.x); I0.y =C*short2float(S0.y); \
623  I1.x =C*short2float(S1.x); I1.y =C*short2float(S1.y); \
624  I2.x =C*short2float(S2.x); I2.y =C*short2float(S2.y); \
625  }
626 
627 #define READ_3RD_NBR_SPINOR_HALF(spinor, idx, mystride) \
628  float2 T0, T1, T2; \
629  { \
630  short2 S0 = in[idx + 0*mystride]; \
631  short2 S1 = in[idx + 1*mystride]; \
632  short2 S2 = in[idx + 2*mystride]; \
633  float C = inNorm[idx]; \
634  T0.x =C*short2float(S0.x); T0.y =C*short2float(S0.y); \
635  T1.x =C*short2float(S1.x); T1.y =C*short2float(S1.y); \
636  T2.x =C*short2float(S2.x); T2.y =C*short2float(S2.y); \
637  }
638 
639 
640 #define WRITE_ST_SPINOR_DOUBLE2(out) \
641  out[0*sp_stride+sid] = make_double2(o00_re, o00_im); \
642  out[1*sp_stride+sid] = make_double2(o01_re, o01_im); \
643  out[2*sp_stride+sid] = make_double2(o02_re, o02_im);
644 
645 #define WRITE_ST_SPINOR_FLOAT2(out) \
646  out[0*sp_stride+sid] = make_float2(o00_re, o00_im); \
647  out[1*sp_stride+sid] = make_float2(o01_re, o01_im); \
648  out[2*sp_stride+sid] = make_float2(o02_re, o02_im);
649 
650 #define WRITE_ST_SPINOR_SHORT2(out) \
651  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
652  float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \
653  float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \
654  c0 = fmaxf(c0, c1); \
655  c0 = fmaxf(c0, c2); \
656  out ## Norm[sid] = c0; \
657  float scale = __fdividef(MAX_SHORT, c0); \
658  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
659  o02_re *= scale; o02_im *= scale; \
660  out[sid+0*sp_stride] = make_short2((short)o00_re, (short)o00_im); \
661  out[sid+1*sp_stride] = make_short2((short)o01_re, (short)o01_im); \
662  out[sid+2*sp_stride] = make_short2((short)o02_re, (short)o02_im);
663 
664 // Non-cache writes to minimize cache polution
665 #if (__COMPUTE_CAPABILITY__ >= 200)
666 
667 #define WRITE_ST_SPINOR_DOUBLE2_STR(out) \
668  store_streaming_double2(&out[0*sp_stride+sid], o00_re, o00_im); \
669  store_streaming_double2(&out[1*sp_stride+sid], o01_re, o01_im); \
670  store_streaming_double2(&out[2*sp_stride+sid], o02_re, o02_im);
671 
672 #define WRITE_ST_SPINOR_FLOAT2_STR(out) \
673  store_streaming_float2(&out[0*sp_stride+sid], o00_re, o00_im); \
674  store_streaming_float2(&out[1*sp_stride+sid], o01_re, o01_im); \
675  store_streaming_float2(&out[2*sp_stride+sid], o02_re, o02_im);
676 
677 #define WRITE_ST_SPINOR_SHORT2_STR(out) \
678  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
679  float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \
680  float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \
681  c0 = fmaxf(c0, c1); \
682  c0 = fmaxf(c0, c2); \
683  out ## Norm[sid] = c0; \
684  float scale = __fdividef(MAX_SHORT, c0); \
685  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
686  o02_re *= scale; o02_im *= scale; \
687  store_streaming_short2(&g_out[0*sp_stride+sid], (short)o00_re, (short)o00_im); \
688  store_streaming_short2(&g_out[1*sp_stride+sid], (short)o01_re, (short)o01_im); \
689  store_streaming_short2(&g_out[2*sp_stride+sid], (short)o02_re, (short)o02_im);
690 #else
691 
692 #define WRITE_ST_SPINOR_DOUBLE2_STR() WRITE_ST_SPINOR_DOUBLE2()
693 #define WRITE_ST_SPINOR_FLOAT4_STR() WRITE_ST_SPINOR_FLOAT4()
694 #define WRITE_ST_SPINOR_SHORT4_STR() WRITE_ST_SPINOR_SHORT4()
695 
696 #endif
697 
698 #define READ_AND_SUM_ST_SPINOR_DOUBLE_TEX(spinor) { \
699  double2 tmp0 = fetch_double2((spinor), sid + 0*(sp_stride)); \
700  double2 tmp1 = fetch_double2((spinor), sid + 1*(sp_stride)); \
701  double2 tmp2 = fetch_double2((spinor), sid + 2*(sp_stride)); \
702  o00_re += tmp0.x; o00_im += tmp0.y; \
703  o01_re += tmp1.x; o01_im += tmp1.y; \
704  o02_re += tmp2.x; o02_im += tmp2.y; }
705 
706 #define READ_AND_SUM_ST_SPINOR_SINGLE_TEX(spinor) { \
707  float2 tmp0 = TEX1DFETCH(float2, (spinor), sid + 0*(sp_stride)); \
708  float2 tmp1 = TEX1DFETCH(float2, (spinor), sid + 1*(sp_stride)); \
709  float2 tmp2 = TEX1DFETCH(float2, (spinor), sid + 2*(sp_stride)); \
710  o00_re += tmp0.x; o00_im += tmp0.y; \
711  o01_re += tmp1.x; o01_im += tmp1.y; \
712  o02_re += tmp2.x; o02_im += tmp2.y; }
713 
714 #define READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor) { \
715  float2 tmp0 = TEX1DFETCH(float2, (spinor), sid + 0*sp_stride); \
716  float2 tmp1 = TEX1DFETCH(float2, (spinor), sid + 1*sp_stride); \
717  float2 tmp2 = TEX1DFETCH(float2, (spinor), sid + 2*sp_stride); \
718  float C = TEX1DFETCH(float, (spinor##Norm), sid); \
719  o00_re += C*tmp0.x; o00_im += C*tmp0.y; \
720  o01_re += C*tmp1.x; o01_im += C*tmp1.y; \
721  o02_re += C*tmp2.x; o02_im += C*tmp2.y; }
722 
723 #define READ_AND_SUM_ST_SPINOR_HALF_TEX(spinor) \
724  READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor)
725 
726 #define READ_AND_SUM_ST_SPINOR(spinor) \
727  o00_re += spinor[0*sp_stride+sid].x; o00_im += spinor[0*sp_stride+sid].y; \
728  o01_re += spinor[1*sp_stride+sid].x; o01_im += spinor[1*sp_stride+sid].y; \
729  o02_re += spinor[2*sp_stride+sid].x; o02_im += spinor[2*sp_stride+sid].y; \
730 
731 #define READ_AND_SUM_ST_SPINOR_HALF_(spinor) \
732  float C = spinor ## Norm[sid]; \
733  o00_re += C*short2float(spinor[0*sp_stride + sid].x); \
734  o00_im += C*short2float(spinor[0*sp_stride + sid].y); \
735  o01_re += C*short2float(spinor[1*sp_stride + sid].x); \
736  o01_im += C*short2float(spinor[1*sp_stride + sid].y); \
737  o02_re += C*short2float(spinor[2*sp_stride + sid].x); \
738  o02_im += C*short2float(spinor[2*sp_stride + sid].y);
739 
740 #define READ_AND_SUM_ST_SPINOR_HALF(spinor) \
741  READ_AND_SUM_ST_SPINOR_HALF_(spinor)
742 
743 #define READ_ST_ACCUM_DOUBLE_TEX(spinor) \
744  double2 accum0 = fetch_double2((spinor), sid + 0*(sp_stride)); \
745  double2 accum1 = fetch_double2((spinor), sid + 1*(sp_stride)); \
746  double2 accum2 = fetch_double2((spinor), sid + 2*(sp_stride));
747 
748 #define READ_ST_ACCUM_SINGLE_TEX(spinor) \
749  float2 accum0 = TEX1DFETCH(float2, (spinor), sid + 0*sp_stride); \
750  float2 accum1 = TEX1DFETCH(float2, (spinor), sid + 1*sp_stride); \
751  float2 accum2 = TEX1DFETCH(float2, (spinor), sid + 2*sp_stride);
752 
753 #define READ_ST_ACCUM_HALF_TEX_(spinor) \
754  float2 accum0 = TEX1DFETCH(float2, (spinor), sid + 0*sp_stride); \
755  float2 accum1 = TEX1DFETCH(float2, (spinor), sid + 1*sp_stride); \
756  float2 accum2 = TEX1DFETCH(float2, (spinor), sid + 2*sp_stride); \
757  float C = TEX1DFETCH(float, (spinor ## Norm), sid); \
758  accum0.x *= C; accum0.y *= C; \
759  accum1.x *= C; accum1.y *= C; \
760  accum2.x *= C; accum2.y *= C;
761 
762 #define READ_ST_ACCUM_HALF_TEX(spinor) READ_ST_ACCUM_HALF_TEX_(spinor)
763 
764 #define READ_ST_ACCUM_DOUBLE(spinor) \
765  double2 accum0 = spinor[sid + 0*(sp_stride)]; \
766  double2 accum1 = spinor[sid + 1*(sp_stride)]; \
767  double2 accum2 = spinor[sid + 2*(sp_stride)];
768 
769 #define READ_ST_ACCUM_SINGLE(spinor) \
770  float2 accum0 = spinor[sid + 0*(sp_stride)]; \
771  float2 accum1 = spinor[sid + 1*(sp_stride)]; \
772  float2 accum2 = spinor[sid + 2*(sp_stride)];
773 
774 #define READ_ST_ACCUM_HALF(spinor) \
775  float2 accum0, accum1, accum2; \
776  { \
777  short2 S0 = x[sid + 0*sp_stride]; \
778  short2 S1 = x[sid + 1*sp_stride]; \
779  short2 S2 = x[sid + 2*sp_stride]; \
780  float C = spinor##Norm[sid]; \
781  accum0.x =C*short2float(S0.x); accum0.y =C*short2float(S0.y); \
782  accum1.x =C*short2float(S1.x); accum1.y =C*short2float(S1.y); \
783  accum2.x =C*short2float(S2.x); accum2.y =C*short2float(S2.y); \
784  }
785 
786 #define WRITE_SPINOR_SHARED_REAL(tx, ty, tz, reg) \
787  extern __shared__ char s_data[]; \
788  spinorFloat *sh = (spinorFloat*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
789  ((tx+blockDim.x*(ty+blockDim.y*tz))/SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
790  sh[0*SHARED_STRIDE] = reg##00_re; \
791  sh[1*SHARED_STRIDE] = reg##00_im; \
792  sh[2*SHARED_STRIDE] = reg##01_re; \
793  sh[3*SHARED_STRIDE] = reg##01_im; \
794  sh[4*SHARED_STRIDE] = reg##02_re; \
795  sh[5*SHARED_STRIDE] = reg##02_im; \
796  sh[6*SHARED_STRIDE] = reg##10_re; \
797  sh[7*SHARED_STRIDE] = reg##10_im; \
798  sh[8*SHARED_STRIDE] = reg##11_re; \
799  sh[9*SHARED_STRIDE] = reg##11_im; \
800  sh[10*SHARED_STRIDE] = reg##12_re; \
801  sh[11*SHARED_STRIDE] = reg##12_im; \
802  sh[12*SHARED_STRIDE] = reg##20_re; \
803  sh[13*SHARED_STRIDE] = reg##20_im; \
804  sh[14*SHARED_STRIDE] = reg##21_re; \
805  sh[15*SHARED_STRIDE] = reg##21_im; \
806  sh[16*SHARED_STRIDE] = reg##22_re; \
807  sh[17*SHARED_STRIDE] = reg##22_im; \
808  sh[18*SHARED_STRIDE] = reg##30_re; \
809  sh[19*SHARED_STRIDE] = reg##30_im; \
810  sh[20*SHARED_STRIDE] = reg##31_re; \
811  sh[21*SHARED_STRIDE] = reg##31_im; \
812  sh[22*SHARED_STRIDE] = reg##32_re; \
813  sh[23*SHARED_STRIDE] = reg##32_im;
814 
815 #define WRITE_SPINOR_SHARED_DOUBLE2 WRITE_SPINOR_SHARED_REAL
816 
817 #define READ_SPINOR_SHARED_DOUBLE2(tx, ty, tz) \
818  extern __shared__ char s_data[]; \
819  double *sh = (double*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
820  ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
821  double2 I0 = make_double2(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE]); \
822  double2 I1 = make_double2(sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \
823  double2 I2 = make_double2(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE]); \
824  double2 I3 = make_double2(sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \
825  double2 I4 = make_double2(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE]); \
826  double2 I5 = make_double2(sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \
827  double2 I6 = make_double2(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE]); \
828  double2 I7 = make_double2(sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \
829  double2 I8 = make_double2(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE]); \
830  double2 I9 = make_double2(sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \
831  double2 I10 = make_double2(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE]); \
832  double2 I11 = make_double2(sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]);
833 
834 #ifndef SHARED_8_BYTE_WORD_SIZE // 4-byte shared memory access
835 
836 #define WRITE_SPINOR_SHARED_FLOAT4 WRITE_SPINOR_SHARED_REAL
837 
838 #define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz) \
839  extern __shared__ char s_data[]; \
840  float *sh = (float*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
841  ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
842  float4 I0 = make_float4(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE], sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \
843  float4 I1 = make_float4(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE], sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \
844  float4 I2 = make_float4(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE], sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \
845  float4 I3 = make_float4(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE], sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \
846  float4 I4 = make_float4(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE], sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \
847  float4 I5 = make_float4(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE], sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]);
848 
849 #else // 8-byte shared memory words
850 
851 #define WRITE_SPINOR_SHARED_FLOAT4(tx, ty, tz, reg) \
852  extern __shared__ char s_data[]; \
853  float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \
854  ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
855  sh[0*SHARED_STRIDE] = make_float2(reg##00_re, reg##00_im); \
856  sh[1*SHARED_STRIDE] = make_float2(reg##01_re, reg##01_im); \
857  sh[2*SHARED_STRIDE] = make_float2(reg##02_re, reg##02_im); \
858  sh[3*SHARED_STRIDE] = make_float2(reg##10_re, reg##10_im); \
859  sh[4*SHARED_STRIDE] = make_float2(reg##11_re, reg##11_im); \
860  sh[5*SHARED_STRIDE] = make_float2(reg##12_re, reg##12_im); \
861  sh[6*SHARED_STRIDE] = make_float2(reg##20_re, reg##20_im); \
862  sh[7*SHARED_STRIDE] = make_float2(reg##21_re, reg##21_im); \
863  sh[8*SHARED_STRIDE] = make_float2(reg##22_re, reg##22_im); \
864  sh[9*SHARED_STRIDE] = make_float2(reg##30_re, reg##30_im); \
865  sh[10*SHARED_STRIDE] = make_float2(reg##31_re, reg##31_im); \
866  sh[11*SHARED_STRIDE] = make_float2(reg##32_re, reg##32_im);
867 
868 #define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz) \
869  extern __shared__ char s_data[]; \
870  float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \
871  ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
872  float2 tmp1, tmp2; \
873  tmp1 = sh[0*SHARED_STRIDE]; tmp2 = sh[1*SHARED_STRIDE]; float4 I0 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
874  tmp1 = sh[2*SHARED_STRIDE]; tmp2 = sh[3*SHARED_STRIDE]; float4 I1 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
875  tmp1 = sh[4*SHARED_STRIDE]; tmp2 = sh[5*SHARED_STRIDE]; float4 I2 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
876  tmp1 = sh[6*SHARED_STRIDE]; tmp2 = sh[7*SHARED_STRIDE]; float4 I3 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
877  tmp1 = sh[8*SHARED_STRIDE]; tmp2 = sh[9*SHARED_STRIDE]; float4 I4 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
878  tmp1 = sh[10*SHARED_STRIDE]; tmp2 = sh[11*SHARED_STRIDE]; float4 I5 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y);
879 
880 #endif
881 
882