QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
io_spinor.h
Go to the documentation of this file.
// Load a full Wilson spinor (4 spins x 3 colors = 12 complex numbers) from
// global memory into registers I0..I11, component-strided by `stride`.
// `norm_idx` is unused in double precision; it is accepted so every
// READ_SPINOR_* variant shares one call signature.  All macro arguments are
// parenthesized so compound index expressions can be passed safely.
#define READ_SPINOR_DOUBLE(spinor, stride, sp_idx, norm_idx) \
  double2 I0 = (spinor)[(sp_idx) + 0*(stride)]; \
  double2 I1 = (spinor)[(sp_idx) + 1*(stride)]; \
  double2 I2 = (spinor)[(sp_idx) + 2*(stride)]; \
  double2 I3 = (spinor)[(sp_idx) + 3*(stride)]; \
  double2 I4 = (spinor)[(sp_idx) + 4*(stride)]; \
  double2 I5 = (spinor)[(sp_idx) + 5*(stride)]; \
  double2 I6 = (spinor)[(sp_idx) + 6*(stride)]; \
  double2 I7 = (spinor)[(sp_idx) + 7*(stride)]; \
  double2 I8 = (spinor)[(sp_idx) + 8*(stride)]; \
  double2 I9 = (spinor)[(sp_idx) + 9*(stride)]; \
  double2 I10 = (spinor)[(sp_idx) + 10*(stride)]; \
  double2 I11 = (spinor)[(sp_idx) + 11*(stride)];

// Load only the upper spin components (I0..I5).
#define READ_SPINOR_DOUBLE_UP(spinor, stride, sp_idx, norm_idx) \
  double2 I0 = (spinor)[(sp_idx) + 0*(stride)]; \
  double2 I1 = (spinor)[(sp_idx) + 1*(stride)]; \
  double2 I2 = (spinor)[(sp_idx) + 2*(stride)]; \
  double2 I3 = (spinor)[(sp_idx) + 3*(stride)]; \
  double2 I4 = (spinor)[(sp_idx) + 4*(stride)]; \
  double2 I5 = (spinor)[(sp_idx) + 5*(stride)];

// Load only the lower spin components (I6..I11).
#define READ_SPINOR_DOUBLE_DOWN(spinor, stride, sp_idx, norm_idx) \
  double2 I6 = (spinor)[(sp_idx) + 6*(stride)]; \
  double2 I7 = (spinor)[(sp_idx) + 7*(stride)]; \
  double2 I8 = (spinor)[(sp_idx) + 8*(stride)]; \
  double2 I9 = (spinor)[(sp_idx) + 9*(stride)]; \
  double2 I10 = (spinor)[(sp_idx) + 10*(stride)]; \
  double2 I11 = (spinor)[(sp_idx) + 11*(stride)];
30 
// Single-precision spinor load: 24 floats packed as 6 float4 words.
#define READ_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = (spinor)[(sp_idx) + 0*(stride)]; \
  float4 I1 = (spinor)[(sp_idx) + 1*(stride)]; \
  float4 I2 = (spinor)[(sp_idx) + 2*(stride)]; \
  float4 I3 = (spinor)[(sp_idx) + 3*(stride)]; \
  float4 I4 = (spinor)[(sp_idx) + 4*(stride)]; \
  float4 I5 = (spinor)[(sp_idx) + 5*(stride)];

// Upper spin half only (I0..I2).  The original definition ended with a stray
// trailing backslash that would silently splice any text placed on the next
// line into the macro; removed.
#define READ_SPINOR_SINGLE_UP(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = (spinor)[(sp_idx) + 0*(stride)]; \
  float4 I1 = (spinor)[(sp_idx) + 1*(stride)]; \
  float4 I2 = (spinor)[(sp_idx) + 2*(stride)];

// Lower spin half only (I3..I5).
#define READ_SPINOR_SINGLE_DOWN(spinor, stride, sp_idx, norm_idx) \
  float4 I3 = (spinor)[(sp_idx) + 3*(stride)]; \
  float4 I4 = (spinor)[(sp_idx) + 4*(stride)]; \
  float4 I5 = (spinor)[(sp_idx) + 5*(stride)];
48 
// Half-precision spinor load: short4 words are converted to float4 and
// rescaled by the per-site norm.  `spinor ## Norm` token-pastes the argument
// with "Norm" to reach the companion norm array; the two-level macros exist
// so a macro-valued `spinor` argument is expanded before pasting.
// Fixes vs. the original: arguments parenthesized; stray trailing backslash
// removed from _UP_; the _DOWN_ variant used unparenthesized `3*stride` etc.,
// unlike every sibling macro.
#define READ_SPINOR_HALF_(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = short42float4((spinor)[(sp_idx) + 0*(stride)]); \
  float4 I1 = short42float4((spinor)[(sp_idx) + 1*(stride)]); \
  float4 I2 = short42float4((spinor)[(sp_idx) + 2*(stride)]); \
  float4 I3 = short42float4((spinor)[(sp_idx) + 3*(stride)]); \
  float4 I4 = short42float4((spinor)[(sp_idx) + 4*(stride)]); \
  float4 I5 = short42float4((spinor)[(sp_idx) + 5*(stride)]); \
  float C = (spinor ## Norm)[(norm_idx)]; \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

#define READ_SPINOR_HALF(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_(spinor, stride, sp_idx, norm_idx)

// Upper spin half in half precision (I0..I2 plus norm rescale).
#define READ_SPINOR_HALF_UP_(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = short42float4((spinor)[(sp_idx) + 0*(stride)]); \
  float4 I1 = short42float4((spinor)[(sp_idx) + 1*(stride)]); \
  float4 I2 = short42float4((spinor)[(sp_idx) + 2*(stride)]); \
  float C = (spinor ## Norm)[(norm_idx)]; \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;

#define READ_SPINOR_HALF_UP(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_UP_(spinor, stride, sp_idx, norm_idx)

// Lower spin half in half precision (I3..I5 plus norm rescale).
#define READ_SPINOR_HALF_DOWN_(spinor, stride, sp_idx, norm_idx) \
  float4 I3 = short42float4((spinor)[(sp_idx) + 3*(stride)]); \
  float4 I4 = short42float4((spinor)[(sp_idx) + 4*(stride)]); \
  float4 I5 = short42float4((spinor)[(sp_idx) + 5*(stride)]); \
  float C = (spinor ## Norm)[(norm_idx)]; \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

#define READ_SPINOR_HALF_DOWN(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_DOWN_(spinor, stride, sp_idx, norm_idx)
90 
// Accumulator loads: read the partial result for site `sid` (a name that
// must be in scope at the expansion point).  The `stride` argument was
// unparenthesized in the double and half variants; fixed for macro hygiene.
#define READ_ACCUM_DOUBLE(spinor, stride) \
  double2 accum0 = (spinor)[sid + 0*(stride)]; \
  double2 accum1 = (spinor)[sid + 1*(stride)]; \
  double2 accum2 = (spinor)[sid + 2*(stride)]; \
  double2 accum3 = (spinor)[sid + 3*(stride)]; \
  double2 accum4 = (spinor)[sid + 4*(stride)]; \
  double2 accum5 = (spinor)[sid + 5*(stride)]; \
  double2 accum6 = (spinor)[sid + 6*(stride)]; \
  double2 accum7 = (spinor)[sid + 7*(stride)]; \
  double2 accum8 = (spinor)[sid + 8*(stride)]; \
  double2 accum9 = (spinor)[sid + 9*(stride)]; \
  double2 accum10 = (spinor)[sid + 10*(stride)]; \
  double2 accum11 = (spinor)[sid + 11*(stride)];

#define READ_ACCUM_SINGLE(spinor, stride) \
  float4 accum0 = (spinor)[sid + 0*(stride)]; \
  float4 accum1 = (spinor)[sid + 1*(stride)]; \
  float4 accum2 = (spinor)[sid + 2*(stride)]; \
  float4 accum3 = (spinor)[sid + 3*(stride)]; \
  float4 accum4 = (spinor)[sid + 4*(stride)]; \
  float4 accum5 = (spinor)[sid + 5*(stride)];

// Half precision: convert, then rescale by the per-site norm read from the
// token-pasted companion array (spinor ## Norm).
#define READ_ACCUM_HALF_(spinor, stride) \
  float4 accum0 = short42float4((spinor)[sid + 0*(stride)]); \
  float4 accum1 = short42float4((spinor)[sid + 1*(stride)]); \
  float4 accum2 = short42float4((spinor)[sid + 2*(stride)]); \
  float4 accum3 = short42float4((spinor)[sid + 3*(stride)]); \
  float4 accum4 = short42float4((spinor)[sid + 4*(stride)]); \
  float4 accum5 = short42float4((spinor)[sid + 5*(stride)]); \
  float C = (spinor ## Norm)[sid]; \
  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;

// Wrapper so a macro-valued `spinor` argument expands before ## pasting.
#define READ_ACCUM_HALF(spinor, stride) READ_ACCUM_HALF_(spinor, stride)
129 
// Full double-precision spinor load through the texture path; fetch_double2
// is the project's double2 texture-read helper (defined elsewhere).  The
// sp_idx argument is now parenthesized for macro hygiene.
#define READ_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \
  double2 I0 = fetch_double2((spinor), (sp_idx) + 0*(stride)); \
  double2 I1 = fetch_double2((spinor), (sp_idx) + 1*(stride)); \
  double2 I2 = fetch_double2((spinor), (sp_idx) + 2*(stride)); \
  double2 I3 = fetch_double2((spinor), (sp_idx) + 3*(stride)); \
  double2 I4 = fetch_double2((spinor), (sp_idx) + 4*(stride)); \
  double2 I5 = fetch_double2((spinor), (sp_idx) + 5*(stride)); \
  double2 I6 = fetch_double2((spinor), (sp_idx) + 6*(stride)); \
  double2 I7 = fetch_double2((spinor), (sp_idx) + 7*(stride)); \
  double2 I8 = fetch_double2((spinor), (sp_idx) + 8*(stride)); \
  double2 I9 = fetch_double2((spinor), (sp_idx) + 9*(stride)); \
  double2 I10 = fetch_double2((spinor), (sp_idx) + 10*(stride)); \
  double2 I11 = fetch_double2((spinor), (sp_idx) + 11*(stride));

// Upper spin half (I0..I5).
#define READ_SPINOR_DOUBLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
  double2 I0 = fetch_double2((spinor), (sp_idx) + 0*(stride)); \
  double2 I1 = fetch_double2((spinor), (sp_idx) + 1*(stride)); \
  double2 I2 = fetch_double2((spinor), (sp_idx) + 2*(stride)); \
  double2 I3 = fetch_double2((spinor), (sp_idx) + 3*(stride)); \
  double2 I4 = fetch_double2((spinor), (sp_idx) + 4*(stride)); \
  double2 I5 = fetch_double2((spinor), (sp_idx) + 5*(stride));

// Lower spin half (I6..I11).
#define READ_SPINOR_DOUBLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
  double2 I6 = fetch_double2((spinor), (sp_idx) + 6*(stride)); \
  double2 I7 = fetch_double2((spinor), (sp_idx) + 7*(stride)); \
  double2 I8 = fetch_double2((spinor), (sp_idx) + 8*(stride)); \
  double2 I9 = fetch_double2((spinor), (sp_idx) + 9*(stride)); \
  double2 I10 = fetch_double2((spinor), (sp_idx) + 10*(stride)); \
  double2 I11 = fetch_double2((spinor), (sp_idx) + 11*(stride));

// Accumulator load; `sid` must be in scope at the expansion point.
#define READ_ACCUM_DOUBLE_TEX(spinor, stride) \
  double2 accum0 = fetch_double2((spinor), sid + 0*(stride)); \
  double2 accum1 = fetch_double2((spinor), sid + 1*(stride)); \
  double2 accum2 = fetch_double2((spinor), sid + 2*(stride)); \
  double2 accum3 = fetch_double2((spinor), sid + 3*(stride)); \
  double2 accum4 = fetch_double2((spinor), sid + 4*(stride)); \
  double2 accum5 = fetch_double2((spinor), sid + 5*(stride)); \
  double2 accum6 = fetch_double2((spinor), sid + 6*(stride)); \
  double2 accum7 = fetch_double2((spinor), sid + 7*(stride)); \
  double2 accum8 = fetch_double2((spinor), sid + 8*(stride)); \
  double2 accum9 = fetch_double2((spinor), sid + 9*(stride)); \
  double2 accum10 = fetch_double2((spinor), sid + 10*(stride)); \
  double2 accum11 = fetch_double2((spinor), sid + 11*(stride));
173 
// Single-precision texture loads.  READ_SPINOR_SINGLE_UP_TEX originally
// ended with a stray trailing backslash (line-splice hazard); removed.
// sp_idx is now parenthesized.
#define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = TEX1DFETCH(float4, (spinor), (sp_idx) + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor), (sp_idx) + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor), (sp_idx) + 2*(stride)); \
  float4 I3 = TEX1DFETCH(float4, (spinor), (sp_idx) + 3*(stride)); \
  float4 I4 = TEX1DFETCH(float4, (spinor), (sp_idx) + 4*(stride)); \
  float4 I5 = TEX1DFETCH(float4, (spinor), (sp_idx) + 5*(stride));

#define READ_SPINOR_SINGLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = TEX1DFETCH(float4, (spinor), (sp_idx) + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor), (sp_idx) + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor), (sp_idx) + 2*(stride));

#define READ_SPINOR_SINGLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
  float4 I3 = TEX1DFETCH(float4, (spinor), (sp_idx) + 3*(stride)); \
  float4 I4 = TEX1DFETCH(float4, (spinor), (sp_idx) + 4*(stride)); \
  float4 I5 = TEX1DFETCH(float4, (spinor), (sp_idx) + 5*(stride));
191 
// Half-precision texture loads with per-site norm rescale.  The two-level
// macros exist so a macro-valued `spinor` argument is expanded before being
// ## pasted with "Norm".  Several original definitions (both wrappers and
// READ_SPINOR_HALF_UP_TEX_) ended with stray trailing backslashes that would
// splice the following line into the macro if the separating blank line were
// ever removed; all removed here, and index arguments parenthesized.
#define READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = TEX1DFETCH(float4, (spinor), (sp_idx) + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor), (sp_idx) + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor), (sp_idx) + 2*(stride)); \
  float4 I3 = TEX1DFETCH(float4, (spinor), (sp_idx) + 3*(stride)); \
  float4 I4 = TEX1DFETCH(float4, (spinor), (sp_idx) + 4*(stride)); \
  float4 I5 = TEX1DFETCH(float4, (spinor), (sp_idx) + 5*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), (norm_idx)); \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

#define READ_SPINOR_HALF_TEX(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx)

#define READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = TEX1DFETCH(float4, (spinor), (sp_idx) + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor), (sp_idx) + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor), (sp_idx) + 2*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), (norm_idx)); \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;

#define READ_SPINOR_HALF_UP_TEX(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx)

#define READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx) \
  float4 I3 = TEX1DFETCH(float4, (spinor), (sp_idx) + 3*(stride)); \
  float4 I4 = TEX1DFETCH(float4, (spinor), (sp_idx) + 4*(stride)); \
  float4 I5 = TEX1DFETCH(float4, (spinor), (sp_idx) + 5*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), (norm_idx)); \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

#define READ_SPINOR_HALF_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx)
// Texture-path accumulator loads for site `sid` (a name that must be in
// scope at the expansion point).
#define READ_ACCUM_SINGLE_TEX(spinor, stride) \
  float4 accum0 = TEX1DFETCH(float4, (spinor), 0*(stride) + sid); \
  float4 accum1 = TEX1DFETCH(float4, (spinor), 1*(stride) + sid); \
  float4 accum2 = TEX1DFETCH(float4, (spinor), 2*(stride) + sid); \
  float4 accum3 = TEX1DFETCH(float4, (spinor), 3*(stride) + sid); \
  float4 accum4 = TEX1DFETCH(float4, (spinor), 4*(stride) + sid); \
  float4 accum5 = TEX1DFETCH(float4, (spinor), 5*(stride) + sid);

// Half precision: fetch, then rescale by the per-site norm read from the
// token-pasted companion array (spinor ## Norm).
#define READ_ACCUM_HALF_TEX_(spinor, stride) \
  float4 accum0 = TEX1DFETCH(float4, (spinor), 0*(stride) + sid); \
  float4 accum1 = TEX1DFETCH(float4, (spinor), 1*(stride) + sid); \
  float4 accum2 = TEX1DFETCH(float4, (spinor), 2*(stride) + sid); \
  float4 accum3 = TEX1DFETCH(float4, (spinor), 3*(stride) + sid); \
  float4 accum4 = TEX1DFETCH(float4, (spinor), 4*(stride) + sid); \
  float4 accum5 = TEX1DFETCH(float4, (spinor), 5*(stride) + sid); \
  float C = TEX1DFETCH(float, (spinor ## Norm), sid); \
  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;

// Wrapper so a macro-valued `spinor` argument expands before ## pasting.
#define READ_ACCUM_HALF_TEX(spinor, stride) READ_ACCUM_HALF_TEX_(spinor, stride)
258 
259 
// Store the 24 real outputs o{spin}{color}_{re,im} (names in scope at the
// expansion point) for site sid as 12 double2 words, strided by `stride`.
#define WRITE_SPINOR_DOUBLE2(stride) \
  out[sid + 0*(stride)] = make_double2(o00_re, o00_im); \
  out[sid + 1*(stride)] = make_double2(o01_re, o01_im); \
  out[sid + 2*(stride)] = make_double2(o02_re, o02_im); \
  out[sid + 3*(stride)] = make_double2(o10_re, o10_im); \
  out[sid + 4*(stride)] = make_double2(o11_re, o11_im); \
  out[sid + 5*(stride)] = make_double2(o12_re, o12_im); \
  out[sid + 6*(stride)] = make_double2(o20_re, o20_im); \
  out[sid + 7*(stride)] = make_double2(o21_re, o21_im); \
  out[sid + 8*(stride)] = make_double2(o22_re, o22_im); \
  out[sid + 9*(stride)] = make_double2(o30_re, o30_im); \
  out[sid + 10*(stride)] = make_double2(o31_re, o31_im); \
  out[sid + 11*(stride)] = make_double2(o32_re, o32_im);

// Single precision: two complex numbers packed per float4 store.
#define WRITE_SPINOR_FLOAT4(stride) \
  out[sid + 0*(stride)] = make_float4(o00_re, o00_im, o01_re, o01_im); \
  out[sid + 1*(stride)] = make_float4(o02_re, o02_im, o10_re, o10_im); \
  out[sid + 2*(stride)] = make_float4(o11_re, o11_im, o12_re, o12_im); \
  out[sid + 3*(stride)] = make_float4(o20_re, o20_im, o21_re, o21_im); \
  out[sid + 4*(stride)] = make_float4(o22_re, o22_im, o30_re, o30_im); \
  out[sid + 5*(stride)] = make_float4(o31_re, o31_im, o32_re, o32_im);
281 
// Half-precision store: compute the per-site maximum absolute value, write
// it to outNorm, rescale all 24 components into short range, then store as
// short4.  Fix: the c1/c2 seed terms originally paired o01_re with o02_im
// and o02_re with o01_im; because every |component| feeds one global fmaxf
// maximum the stored norm is unchanged, but the pairing is normalized so
// each ci covers exactly one complex number.
// NOTE(review): if all components are zero, c0 == 0 and __fdividef yields
// inf; behavior kept as in the original -- confirm an all-zero site cannot
// occur upstream.
#define WRITE_SPINOR_SHORT4(stride) \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \
  float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \
  float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \
  float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \
  float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \
  float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \
  float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \
  float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \
  float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \
  float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  outNorm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \
  o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \
  o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
  o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
  o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
  out[sid+0*(stride)] = make_short4((short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
  out[sid+1*(stride)] = make_short4((short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
  out[sid+2*(stride)] = make_short4((short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
  out[sid+3*(stride)] = make_short4((short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
  out[sid+4*(stride)] = make_short4((short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
  out[sid+5*(stride)] = make_short4((short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
320 
// Streaming (cache-bypassing) stores for compute capability >= 2.0; older
// devices fall back to the plain store macros above.
// Fixes vs. the original: `stride` is parenthesized in the double2 variant,
// and the c1/c2 seed terms in the short4 variant no longer mix components of
// different complex numbers (the overall maximum -- hence the stored norm
// and scale -- is unchanged).
#if (__COMPUTE_CAPABILITY__ >= 200)
#define WRITE_SPINOR_DOUBLE2_STR(stride) \
  store_streaming_double2(&out[0*(stride)+sid], o00_re, o00_im); \
  store_streaming_double2(&out[1*(stride)+sid], o01_re, o01_im); \
  store_streaming_double2(&out[2*(stride)+sid], o02_re, o02_im); \
  store_streaming_double2(&out[3*(stride)+sid], o10_re, o10_im); \
  store_streaming_double2(&out[4*(stride)+sid], o11_re, o11_im); \
  store_streaming_double2(&out[5*(stride)+sid], o12_re, o12_im); \
  store_streaming_double2(&out[6*(stride)+sid], o20_re, o20_im); \
  store_streaming_double2(&out[7*(stride)+sid], o21_re, o21_im); \
  store_streaming_double2(&out[8*(stride)+sid], o22_re, o22_im); \
  store_streaming_double2(&out[9*(stride)+sid], o30_re, o30_im); \
  store_streaming_double2(&out[10*(stride)+sid], o31_re, o31_im); \
  store_streaming_double2(&out[11*(stride)+sid], o32_re, o32_im);

#define WRITE_SPINOR_FLOAT4_STR(stride) \
  store_streaming_float4(&out[0*(stride)+sid], o00_re, o00_im, o01_re, o01_im); \
  store_streaming_float4(&out[1*(stride)+sid], o02_re, o02_im, o10_re, o10_im); \
  store_streaming_float4(&out[2*(stride)+sid], o11_re, o11_im, o12_re, o12_im); \
  store_streaming_float4(&out[3*(stride)+sid], o20_re, o20_im, o21_re, o21_im); \
  store_streaming_float4(&out[4*(stride)+sid], o22_re, o22_im, o30_re, o30_im); \
  store_streaming_float4(&out[5*(stride)+sid], o31_re, o31_im, o32_re, o32_im);

#define WRITE_SPINOR_SHORT4_STR(stride) \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \
  float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \
  float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \
  float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \
  float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \
  float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \
  float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \
  float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \
  float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \
  float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  outNorm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \
  o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \
  o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
  o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
  o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
  store_streaming_short4(&out[0*(stride)+sid], (short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
  store_streaming_short4(&out[1*(stride)+sid], (short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
  store_streaming_short4(&out[2*(stride)+sid], (short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
  store_streaming_short4(&out[3*(stride)+sid], (short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
  store_streaming_short4(&out[4*(stride)+sid], (short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
  store_streaming_short4(&out[5*(stride)+sid], (short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
#else
#define WRITE_SPINOR_DOUBLE2_STR(stride) WRITE_SPINOR_DOUBLE2(stride)
#define WRITE_SPINOR_FLOAT4_STR(stride) WRITE_SPINOR_FLOAT4(stride)
#define WRITE_SPINOR_SHORT4_STR(stride) WRITE_SPINOR_SHORT4(stride)
#endif
387 
// macros used for exterior Wilson Dslash kernels and face packing

// Alias used by the face-packing kernels.  NOTE(review): READ_SPINOR_UP is
// not defined in this file; it is expected to be #defined by the including
// translation unit before this point -- confirm against the kernel sources.
#define READ_HALF_SPINOR READ_SPINOR_UP
391 
// Store a half spinor -- two spin components x three colors (a0..a2,
// b0..b2) -- used for face packing.  Unlike WRITE_SPINOR_*, the site index
// `sid` is a macro parameter here; it was unparenthesized in the original
// and is now parenthesized for macro hygiene.
#define WRITE_HALF_SPINOR_DOUBLE2(stride, sid) \
  out[0*(stride)+(sid)] = make_double2(a0_re, a0_im); \
  out[1*(stride)+(sid)] = make_double2(a1_re, a1_im); \
  out[2*(stride)+(sid)] = make_double2(a2_re, a2_im); \
  out[3*(stride)+(sid)] = make_double2(b0_re, b0_im); \
  out[4*(stride)+(sid)] = make_double2(b1_re, b1_im); \
  out[5*(stride)+(sid)] = make_double2(b2_re, b2_im);

#define WRITE_HALF_SPINOR_FLOAT4(stride, sid) \
  out[0*(stride)+(sid)] = make_float4(a0_re, a0_im, a1_re, a1_im); \
  out[1*(stride)+(sid)] = make_float4(a2_re, a2_im, b0_re, b0_im); \
  out[2*(stride)+(sid)] = make_float4(b1_re, b1_im, b2_re, b2_im);

// Half precision: compute the max |component|, store it to outNorm, rescale
// into short range, and store as short4.
// NOTE(review): if every component is zero, c0 == 0 and __fdividef returns
// inf; kept as in the original -- confirm callers never store an all-zero
// half spinor.
#define WRITE_HALF_SPINOR_SHORT4(stride, sid) \
  float c0 = fmaxf(fabsf(a0_re), fabsf(a0_im)); \
  float c1 = fmaxf(fabsf(a1_re), fabsf(a1_im)); \
  float c2 = fmaxf(fabsf(a2_re), fabsf(a2_im)); \
  float c3 = fmaxf(fabsf(b0_re), fabsf(b0_im)); \
  float c4 = fmaxf(fabsf(b1_re), fabsf(b1_im)); \
  float c5 = fmaxf(fabsf(b2_re), fabsf(b2_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  outNorm[(sid)] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  a0_re *= scale; a0_im *= scale; a1_re *= scale; a1_im *= scale; \
  a2_re *= scale; a2_im *= scale; b0_re *= scale; b0_im *= scale; \
  b1_re *= scale; b1_im *= scale; b2_re *= scale; b2_im *= scale; \
  out[(sid)+0*(stride)] = make_short4((short)a0_re, (short)a0_im, (short)a1_re, (short)a1_im); \
  out[(sid)+1*(stride)] = make_short4((short)a2_re, (short)a2_im, (short)b0_re, (short)b0_im); \
  out[(sid)+2*(stride)] = make_short4((short)b1_re, (short)b1_im, (short)b2_re, (short)b2_im);
425 
/****************** used by non-degenerate twisted mass **********************/

// Store both flavor spinors in double precision: flavor one lives at site
// sid, flavor two at sid + param.fl_stride.  The names out, sid, param and
// o1_* / o2_* must all be in scope where this macro is expanded.
#define WRITE_FLAVOR_SPINOR_DOUBLE2() \
  out[sid + 0*(param.sp_stride)] = make_double2(o1_00_re, o1_00_im); \
  out[sid + 1*(param.sp_stride)] = make_double2(o1_01_re, o1_01_im); \
  out[sid + 2*(param.sp_stride)] = make_double2(o1_02_re, o1_02_im); \
  out[sid + 3*(param.sp_stride)] = make_double2(o1_10_re, o1_10_im); \
  out[sid + 4*(param.sp_stride)] = make_double2(o1_11_re, o1_11_im); \
  out[sid + 5*(param.sp_stride)] = make_double2(o1_12_re, o1_12_im); \
  out[sid + 6*(param.sp_stride)] = make_double2(o1_20_re, o1_20_im); \
  out[sid + 7*(param.sp_stride)] = make_double2(o1_21_re, o1_21_im); \
  out[sid + 8*(param.sp_stride)] = make_double2(o1_22_re, o1_22_im); \
  out[sid + 9*(param.sp_stride)] = make_double2(o1_30_re, o1_30_im); \
  out[sid + 10*(param.sp_stride)] = make_double2(o1_31_re, o1_31_im); \
  out[sid + 11*(param.sp_stride)] = make_double2(o1_32_re, o1_32_im); \
  out[sid + param.fl_stride + 0*(param.sp_stride)] = make_double2(o2_00_re, o2_00_im); \
  out[sid + param.fl_stride + 1*(param.sp_stride)] = make_double2(o2_01_re, o2_01_im); \
  out[sid + param.fl_stride + 2*(param.sp_stride)] = make_double2(o2_02_re, o2_02_im); \
  out[sid + param.fl_stride + 3*(param.sp_stride)] = make_double2(o2_10_re, o2_10_im); \
  out[sid + param.fl_stride + 4*(param.sp_stride)] = make_double2(o2_11_re, o2_11_im); \
  out[sid + param.fl_stride + 5*(param.sp_stride)] = make_double2(o2_12_re, o2_12_im); \
  out[sid + param.fl_stride + 6*(param.sp_stride)] = make_double2(o2_20_re, o2_20_im); \
  out[sid + param.fl_stride + 7*(param.sp_stride)] = make_double2(o2_21_re, o2_21_im); \
  out[sid + param.fl_stride + 8*(param.sp_stride)] = make_double2(o2_22_re, o2_22_im); \
  out[sid + param.fl_stride + 9*(param.sp_stride)] = make_double2(o2_30_re, o2_30_im); \
  out[sid + param.fl_stride + 10*(param.sp_stride)] = make_double2(o2_31_re, o2_31_im); \
  out[sid + param.fl_stride + 11*(param.sp_stride)] = make_double2(o2_32_re, o2_32_im);
453 
454 
// Single-precision two-flavor store: two complex numbers per float4 word;
// flavor two is offset by param.fl_stride.
#define WRITE_FLAVOR_SPINOR_FLOAT4() \
  out[sid + 0*(param.sp_stride)] = make_float4(o1_00_re, o1_00_im, o1_01_re, o1_01_im); \
  out[sid + 1*(param.sp_stride)] = make_float4(o1_02_re, o1_02_im, o1_10_re, o1_10_im); \
  out[sid + 2*(param.sp_stride)] = make_float4(o1_11_re, o1_11_im, o1_12_re, o1_12_im); \
  out[sid + 3*(param.sp_stride)] = make_float4(o1_20_re, o1_20_im, o1_21_re, o1_21_im); \
  out[sid + 4*(param.sp_stride)] = make_float4(o1_22_re, o1_22_im, o1_30_re, o1_30_im); \
  out[sid + 5*(param.sp_stride)] = make_float4(o1_31_re, o1_31_im, o1_32_re, o1_32_im); \
  out[sid + param.fl_stride + 0*(param.sp_stride)] = make_float4(o2_00_re, o2_00_im, o2_01_re, o2_01_im); \
  out[sid + param.fl_stride + 1*(param.sp_stride)] = make_float4(o2_02_re, o2_02_im, o2_10_re, o2_10_im); \
  out[sid + param.fl_stride + 2*(param.sp_stride)] = make_float4(o2_11_re, o2_11_im, o2_12_re, o2_12_im); \
  out[sid + param.fl_stride + 3*(param.sp_stride)] = make_float4(o2_20_re, o2_20_im, o2_21_re, o2_21_im); \
  out[sid + param.fl_stride + 4*(param.sp_stride)] = make_float4(o2_22_re, o2_22_im, o2_30_re, o2_30_im); \
  out[sid + param.fl_stride + 5*(param.sp_stride)] = make_float4(o2_31_re, o2_31_im, o2_32_re, o2_32_im);
468 
469 
// Half-precision store of both flavor spinors (non-degenerate twisted
// mass): flavor 1 at sid, flavor 2 offset by param.fl_stride, each with its
// own norm and rescale.  Fix: the c1/c2 seed terms mixed components of
// different complex numbers in both flavor sections (o?_01_re with o?_02_im
// and vice versa); the overall maximum -- hence the stored norms and scales
// -- is unchanged, but the pairing is normalized so each ci covers exactly
// one complex number.
#define WRITE_FLAVOR_SPINOR_SHORT4() \
  float c0 = fmaxf(fabsf(o1_00_re), fabsf(o1_00_im)); \
  float c1 = fmaxf(fabsf(o1_01_re), fabsf(o1_01_im)); \
  float c2 = fmaxf(fabsf(o1_02_re), fabsf(o1_02_im)); \
  float c3 = fmaxf(fabsf(o1_10_re), fabsf(o1_10_im)); \
  float c4 = fmaxf(fabsf(o1_11_re), fabsf(o1_11_im)); \
  float c5 = fmaxf(fabsf(o1_12_re), fabsf(o1_12_im)); \
  float c6 = fmaxf(fabsf(o1_20_re), fabsf(o1_20_im)); \
  float c7 = fmaxf(fabsf(o1_21_re), fabsf(o1_21_im)); \
  float c8 = fmaxf(fabsf(o1_22_re), fabsf(o1_22_im)); \
  float c9 = fmaxf(fabsf(o1_30_re), fabsf(o1_30_im)); \
  float c10 = fmaxf(fabsf(o1_31_re), fabsf(o1_31_im)); \
  float c11 = fmaxf(fabsf(o1_32_re), fabsf(o1_32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  outNorm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o1_00_re *= scale; o1_00_im *= scale; o1_01_re *= scale; o1_01_im *= scale; \
  o1_02_re *= scale; o1_02_im *= scale; o1_10_re *= scale; o1_10_im *= scale; \
  o1_11_re *= scale; o1_11_im *= scale; o1_12_re *= scale; o1_12_im *= scale; \
  o1_20_re *= scale; o1_20_im *= scale; o1_21_re *= scale; o1_21_im *= scale; \
  o1_22_re *= scale; o1_22_im *= scale; o1_30_re *= scale; o1_30_im *= scale; \
  o1_31_re *= scale; o1_31_im *= scale; o1_32_re *= scale; o1_32_im *= scale; \
  out[sid+0*(param.sp_stride)] = make_short4((short)o1_00_re, (short)o1_00_im, (short)o1_01_re, (short)o1_01_im); \
  out[sid+1*(param.sp_stride)] = make_short4((short)o1_02_re, (short)o1_02_im, (short)o1_10_re, (short)o1_10_im); \
  out[sid+2*(param.sp_stride)] = make_short4((short)o1_11_re, (short)o1_11_im, (short)o1_12_re, (short)o1_12_im); \
  out[sid+3*(param.sp_stride)] = make_short4((short)o1_20_re, (short)o1_20_im, (short)o1_21_re, (short)o1_21_im); \
  out[sid+4*(param.sp_stride)] = make_short4((short)o1_22_re, (short)o1_22_im, (short)o1_30_re, (short)o1_30_im); \
  out[sid+5*(param.sp_stride)] = make_short4((short)o1_31_re, (short)o1_31_im, (short)o1_32_re, (short)o1_32_im); \
  c0 = fmaxf(fabsf(o2_00_re), fabsf(o2_00_im)); \
  c1 = fmaxf(fabsf(o2_01_re), fabsf(o2_01_im)); \
  c2 = fmaxf(fabsf(o2_02_re), fabsf(o2_02_im)); \
  c3 = fmaxf(fabsf(o2_10_re), fabsf(o2_10_im)); \
  c4 = fmaxf(fabsf(o2_11_re), fabsf(o2_11_im)); \
  c5 = fmaxf(fabsf(o2_12_re), fabsf(o2_12_im)); \
  c6 = fmaxf(fabsf(o2_20_re), fabsf(o2_20_im)); \
  c7 = fmaxf(fabsf(o2_21_re), fabsf(o2_21_im)); \
  c8 = fmaxf(fabsf(o2_22_re), fabsf(o2_22_im)); \
  c9 = fmaxf(fabsf(o2_30_re), fabsf(o2_30_im)); \
  c10 = fmaxf(fabsf(o2_31_re), fabsf(o2_31_im)); \
  c11 = fmaxf(fabsf(o2_32_re), fabsf(o2_32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  outNorm[sid+param.fl_stride] = c0; \
  scale = __fdividef(MAX_SHORT, c0); \
  o2_00_re *= scale; o2_00_im *= scale; o2_01_re *= scale; o2_01_im *= scale; \
  o2_02_re *= scale; o2_02_im *= scale; o2_10_re *= scale; o2_10_im *= scale; \
  o2_11_re *= scale; o2_11_im *= scale; o2_12_re *= scale; o2_12_im *= scale; \
  o2_20_re *= scale; o2_20_im *= scale; o2_21_re *= scale; o2_21_im *= scale; \
  o2_22_re *= scale; o2_22_im *= scale; o2_30_re *= scale; o2_30_im *= scale; \
  o2_31_re *= scale; o2_31_im *= scale; o2_32_re *= scale; o2_32_im *= scale; \
  out[sid+param.fl_stride+0*(param.sp_stride)] = make_short4((short)o2_00_re, (short)o2_00_im, (short)o2_01_re, (short)o2_01_im); \
  out[sid+param.fl_stride+1*(param.sp_stride)] = make_short4((short)o2_02_re, (short)o2_02_im, (short)o2_10_re, (short)o2_10_im); \
  out[sid+param.fl_stride+2*(param.sp_stride)] = make_short4((short)o2_11_re, (short)o2_11_im, (short)o2_12_re, (short)o2_12_im); \
  out[sid+param.fl_stride+3*(param.sp_stride)] = make_short4((short)o2_20_re, (short)o2_20_im, (short)o2_21_re, (short)o2_21_im); \
  out[sid+param.fl_stride+4*(param.sp_stride)] = make_short4((short)o2_22_re, (short)o2_22_im, (short)o2_30_re, (short)o2_30_im); \
  out[sid+param.fl_stride+5*(param.sp_stride)] = make_short4((short)o2_31_re, (short)o2_31_im, (short)o2_32_re, (short)o2_32_im);
545 
546 
/************* the following is used by staggered *****************/

// Staggered spinor loads through the texture path: three complex color
// components per site.  `idx` and `mystride` were unparenthesized in the
// originals; fixed for macro hygiene.
#define READ_1ST_NBR_SPINOR_DOUBLE_TEX(spinor, idx, mystride) \
  double2 I0 = fetch_double2((spinor), (idx) + 0*(mystride)); \
  double2 I1 = fetch_double2((spinor), (idx) + 1*(mystride)); \
  double2 I2 = fetch_double2((spinor), (idx) + 2*(mystride));

// T is the destination register prefix: T##0..T##2 must already be declared
// at the expansion point.
#define READ_KS_NBR_SPINOR_DOUBLE_TEX(T, spinor, idx, mystride) \
  T##0 = fetch_double2((spinor), (idx) + 0*(mystride)); \
  T##1 = fetch_double2((spinor), (idx) + 1*(mystride)); \
  T##2 = fetch_double2((spinor), (idx) + 2*(mystride));

#define READ_1ST_NBR_SPINOR_SINGLE_TEX(spinor, idx, mystride) \
  float2 I0 = TEX1DFETCH(float2, (spinor), (idx) + 0*(mystride)); \
  float2 I1 = TEX1DFETCH(float2, (spinor), (idx) + 1*(mystride)); \
  float2 I2 = TEX1DFETCH(float2, (spinor), (idx) + 2*(mystride));

#define READ_KS_NBR_SPINOR_SINGLE_TEX(T, spinor, idx, mystride) \
  T##0 = TEX1DFETCH(float2, (spinor), (idx) + 0*(mystride)); \
  T##1 = TEX1DFETCH(float2, (spinor), (idx) + 1*(mystride)); \
  T##2 = TEX1DFETCH(float2, (spinor), (idx) + 2*(mystride));
568 
569 
570 
571 
// Half-precision texture load of the first-neighbor staggered spinor: fetches
// the three float2 color components and rescales them by the per-site norm.
// NOTE(review): relies on norm_idx1 being defined at the expansion site
// rather than taking it as a parameter — confirm against the kernels that
// expand this macro.
#define READ_1ST_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride) \
  float2 I0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
  float2 I1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
  float2 I2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride); \
  { \
  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx1); \
  I0.x *= C; I0.y *= C; \
  I1.x *= C; I1.y *= C; \
  I2.x *= C; I2.y *= C;}

// Extra expansion level so a macro argument is fully expanded before the
// ## pasting in the underscore version.
#define READ_1ST_NBR_SPINOR_HALF_TEX(spinor, idx, mystride) \
  READ_1ST_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride)

// As above, but assigns into caller-declared T0..T2 and uses norm_idx3
// (expected in scope at the expansion site) for the norm lookup.
#define READ_KS_NBR_SPINOR_HALF_TEX_(T, spinor, idx, mystride) \
  T##0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
  T##1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
  T##2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride); \
  { \
  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx3); \
  (T##0).x *= C; (T##0).y *= C; \
  (T##1).x *= C; (T##1).y *= C; \
  (T##2).x *= C; (T##2).y *= C;}

#define READ_KS_NBR_SPINOR_HALF_TEX(T, spinor, idx, mystride) \
  READ_KS_NBR_SPINOR_HALF_TEX_(T, spinor, idx, mystride)
597 
// Direct (non-texture) global-memory loads of the three color components of
// the first-neighbor staggered spinor; declares I0..I2 at the expansion site.
#define READ_1ST_NBR_SPINOR_DOUBLE(spinor, idx, mystride) \
  double2 I0 = spinor[idx + 0*mystride]; \
  double2 I1 = spinor[idx + 1*mystride]; \
  double2 I2 = spinor[idx + 2*mystride];

// Direct-load variant assigning into caller-declared T0..T2 (T is a name
// prefix pasted with ##).
#define READ_KS_NBR_SPINOR_DOUBLE(T, spinor, idx, mystride) \
  T##0 = spinor[idx + 0*mystride]; \
  T##1 = spinor[idx + 1*mystride]; \
  T##2 = spinor[idx + 2*mystride];

// Single-precision versions of the two direct-load macros above.
#define READ_1ST_NBR_SPINOR_SINGLE(spinor, idx, mystride) \
  float2 I0 = spinor[idx + 0*mystride]; \
  float2 I1 = spinor[idx + 1*mystride]; \
  float2 I2 = spinor[idx + 2*mystride];

#define READ_KS_NBR_SPINOR_SINGLE(T, spinor, idx, mystride) \
  T##0 = spinor[idx + 0*mystride]; \
  T##1 = spinor[idx + 1*mystride]; \
  T##2 = spinor[idx + 2*mystride];
617 
// Half-precision direct load of the first-neighbor staggered spinor: reads
// three short2 color components, converts them to float and rescales by the
// per-site norm.  Declares I0..I2 at the expansion site.
// Fixed: the body previously hard-coded the "in"/"inNorm" arrays and ignored
// the spinor parameter; it now uses spinor / spinor##Norm, which is
// backward compatible with existing expansions that pass "in".
#define READ_1ST_NBR_SPINOR_HALF(spinor, idx, mystride) \
  float2 I0, I1, I2; \
  { \
  short2 S0 = spinor[idx + 0*mystride]; \
  short2 S1 = spinor[idx + 1*mystride]; \
  short2 S2 = spinor[idx + 2*mystride]; \
  float C = spinor ## Norm[idx]; \
  I0.x =C*short2float(S0.x); I0.y =C*short2float(S0.y); \
  I1.x =C*short2float(S1.x); I1.y =C*short2float(S1.y); \
  I2.x =C*short2float(S2.x); I2.y =C*short2float(S2.y); \
  }
629 
// Half-precision direct load of a long-link neighbor spinor, assigning into
// caller-declared T0..T2 (T is a name prefix pasted with ##).
// Fixed: the body previously hard-coded "in"/"inNorm" and ignored the spinor
// parameter; it now uses spinor / spinor##Norm — backward compatible with
// existing expansions that pass "in".
#define READ_KS_NBR_SPINOR_HALF(T, spinor, idx, mystride) \
  { \
  short2 S0 = spinor[idx + 0*mystride]; \
  short2 S1 = spinor[idx + 1*mystride]; \
  short2 S2 = spinor[idx + 2*mystride]; \
  float C = spinor ## Norm[idx]; \
  (T##0).x =C*short2float(S0.x); (T##0).y =C*short2float(S0.y); \
  (T##1).x =C*short2float(S1.x); (T##1).y =C*short2float(S1.y); \
  (T##2).x =C*short2float(S2.x); (T##2).y =C*short2float(S2.y); \
  }
640 
641 
// Store the three complex color components of a staggered result spinor,
// taken from the o00/o01/o02 _re/_im registers at the expansion site.
#define WRITE_ST_SPINOR_DOUBLE2(out, sid, mystride) \
  out[0*mystride+sid] = make_double2(o00_re, o00_im); \
  out[1*mystride+sid] = make_double2(o01_re, o01_im); \
  out[2*mystride+sid] = make_double2(o02_re, o02_im);

// Single-precision version of the store above.
#define WRITE_ST_SPINOR_FLOAT2(out, sid, mystride) \
  out[0*mystride+sid] = make_float2(o00_re, o00_im); \
  out[1*mystride+sid] = make_float2(o01_re, o01_im); \
  out[2*mystride+sid] = make_float2(o02_re, o02_im);
651 
// Half-precision store: finds the largest magnitude among the six result
// components, writes it as the per-site norm (out##Norm[sid]), rescales all
// components so the maximum maps to MAX_SHORT, and stores them as short2.
// Declares c0/c1/c2/scale at the expansion site, so expand at most once per
// scope.
#define WRITE_ST_SPINOR_SHORT2(out, sid, mystride) \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  out ## Norm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; \
  out[sid+0*mystride] = make_short2((short)o00_re, (short)o00_im); \
  out[sid+1*mystride] = make_short2((short)o01_re, (short)o01_im); \
  out[sid+2*mystride] = make_short2((short)o02_re, (short)o02_im);
665 
// Non-cache writes to minimize cache pollution
667 #if (__COMPUTE_CAPABILITY__ >= 200)
668 
// Streaming (cache-bypassing) versions of the staggered stores, using the
// store_streaming_* helpers (defined elsewhere).
#define WRITE_ST_SPINOR_DOUBLE2_STR(out, sid, mystride) \
  store_streaming_double2(&out[0*mystride+sid], o00_re, o00_im); \
  store_streaming_double2(&out[1*mystride+sid], o01_re, o01_im); \
  store_streaming_double2(&out[2*mystride+sid], o02_re, o02_im);

#define WRITE_ST_SPINOR_FLOAT2_STR(out, sid, mystride) \
  store_streaming_float2(&out[0*mystride+sid], o00_re, o00_im); \
  store_streaming_float2(&out[1*mystride+sid], o01_re, o01_im); \
  store_streaming_float2(&out[2*mystride+sid], o02_re, o02_im);
678 
// Streaming half-precision store: same normalization as
// WRITE_ST_SPINOR_SHORT2, but writes the data with cache-bypassing stores.
// Fixed: the data stores previously went to a hard-coded "g_out" while the
// norm went to out##Norm; both now use the out parameter, keeping the
// destination consistent (backward compatible for callers passing g_out).
#define WRITE_ST_SPINOR_SHORT2_STR(out, sid, mystride) \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  out ## Norm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; \
  store_streaming_short2(&out[0*mystride+sid], (short)o00_re, (short)o00_im); \
  store_streaming_short2(&out[1*mystride+sid], (short)o01_re, (short)o01_im); \
  store_streaming_short2(&out[2*mystride+sid], (short)o02_re, (short)o02_im);
692 #else
693 
// Pre-Fermi fallback: no streaming stores, so forward to the ordinary store
// macros.  Fixed: the previous fallbacks took no parameters (so any call with
// arguments failed to compile) and the FLOAT4/SHORT4 names did not match the
// FLOAT2/SHORT2 macros actually defined for staggered spinors above.
#define WRITE_ST_SPINOR_DOUBLE2_STR(out, sid, mystride) WRITE_ST_SPINOR_DOUBLE2(out, sid, mystride)
#define WRITE_ST_SPINOR_FLOAT2_STR(out, sid, mystride) WRITE_ST_SPINOR_FLOAT2(out, sid, mystride)
#define WRITE_ST_SPINOR_SHORT2_STR(out, sid, mystride) WRITE_ST_SPINOR_SHORT2(out, sid, mystride)
697 
698 #endif
699 
// Read a staggered spinor through the texture path and accumulate it into
// the o00/o01/o02 _re/_im result registers at the expansion site.
#define READ_AND_SUM_ST_SPINOR_DOUBLE_TEX(spinor,sid) { \
  double2 tmp0 = fetch_double2((spinor), sid + 0*(param.sp_stride)); \
  double2 tmp1 = fetch_double2((spinor), sid + 1*(param.sp_stride)); \
  double2 tmp2 = fetch_double2((spinor), sid + 2*(param.sp_stride)); \
  o00_re += tmp0.x; o00_im += tmp0.y; \
  o01_re += tmp1.x; o01_im += tmp1.y; \
  o02_re += tmp2.x; o02_im += tmp2.y; }

// Single-precision version of the accumulate above.
#define READ_AND_SUM_ST_SPINOR_SINGLE_TEX(spinor,sid) { \
  float2 tmp0 = TEX1DFETCH(float2, (spinor), sid + 0*(param.sp_stride)); \
  float2 tmp1 = TEX1DFETCH(float2, (spinor), sid + 1*(param.sp_stride)); \
  float2 tmp2 = TEX1DFETCH(float2, (spinor), sid + 2*(param.sp_stride)); \
  o00_re += tmp0.x; o00_im += tmp0.y; \
  o01_re += tmp1.x; o01_im += tmp1.y; \
  o02_re += tmp2.x; o02_im += tmp2.y; }

// Half-precision version: each component is rescaled by the per-site norm
// before accumulation.
#define READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor,sid) { \
  float2 tmp0 = TEX1DFETCH(float2, (spinor), sid + 0*param.sp_stride); \
  float2 tmp1 = TEX1DFETCH(float2, (spinor), sid + 1*param.sp_stride); \
  float2 tmp2 = TEX1DFETCH(float2, (spinor), sid + 2*param.sp_stride); \
  float C = TEX1DFETCH(float, (spinor##Norm), sid); \
  o00_re += C*tmp0.x; o00_im += C*tmp0.y; \
  o01_re += C*tmp1.x; o01_im += C*tmp1.y; \
  o02_re += C*tmp2.x; o02_im += C*tmp2.y; }

// Extra expansion level so the spinor argument expands before ## pasting.
#define READ_AND_SUM_ST_SPINOR_HALF_TEX(spinor,sid) \
  READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor,sid)
727 
// Direct-load accumulate of a staggered spinor into the o0*_re/_im result
// registers.  NOTE(review): the last line ends with a stray continuation
// backslash, so the macro also absorbs the following blank line — harmless
// but fragile if code is ever added directly after it.
#define READ_AND_SUM_ST_SPINOR(spinor,sid) \
  o00_re += spinor[0*param.sp_stride+sid].x; o00_im += spinor[0*param.sp_stride+sid].y; \
  o01_re += spinor[1*param.sp_stride+sid].x; o01_im += spinor[1*param.sp_stride+sid].y; \
  o02_re += spinor[2*param.sp_stride+sid].x; o02_im += spinor[2*param.sp_stride+sid].y; \

// Half-precision direct-load accumulate: converts shorts to float and scales
// by the per-site norm.  Declares C at the expansion site.
#define READ_AND_SUM_ST_SPINOR_HALF_(spinor,sid) \
  float C = spinor ## Norm[sid]; \
  o00_re += C*short2float(spinor[0*param.sp_stride + sid].x); \
  o00_im += C*short2float(spinor[0*param.sp_stride + sid].y); \
  o01_re += C*short2float(spinor[1*param.sp_stride + sid].x); \
  o01_im += C*short2float(spinor[1*param.sp_stride + sid].y); \
  o02_re += C*short2float(spinor[2*param.sp_stride + sid].x); \
  o02_im += C*short2float(spinor[2*param.sp_stride + sid].y);

// Extra expansion level so the spinor argument expands before ## pasting.
#define READ_AND_SUM_ST_SPINOR_HALF(spinor,sid) \
  READ_AND_SUM_ST_SPINOR_HALF_(spinor,sid)
744 
// Texture loads of the staggered accumulator spinor (e.g. the x vector of an
// axpy-style update); declares accum0..accum2 at the expansion site.
#define READ_ST_ACCUM_DOUBLE_TEX(spinor,sid) \
  double2 accum0 = fetch_double2((spinor), sid + 0*(param.sp_stride)); \
  double2 accum1 = fetch_double2((spinor), sid + 1*(param.sp_stride)); \
  double2 accum2 = fetch_double2((spinor), sid + 2*(param.sp_stride));

#define READ_ST_ACCUM_SINGLE_TEX(spinor,sid) \
  float2 accum0 = TEX1DFETCH(float2, (spinor), sid + 0*param.sp_stride); \
  float2 accum1 = TEX1DFETCH(float2, (spinor), sid + 1*param.sp_stride); \
  float2 accum2 = TEX1DFETCH(float2, (spinor), sid + 2*param.sp_stride);

// Half-precision variant: components are rescaled by the per-site norm.
#define READ_ST_ACCUM_HALF_TEX_(spinor,sid) \
  float2 accum0 = TEX1DFETCH(float2, (spinor), sid + 0*param.sp_stride); \
  float2 accum1 = TEX1DFETCH(float2, (spinor), sid + 1*param.sp_stride); \
  float2 accum2 = TEX1DFETCH(float2, (spinor), sid + 2*param.sp_stride); \
  float C = TEX1DFETCH(float, (spinor ## Norm), sid); \
  accum0.x *= C; accum0.y *= C; \
  accum1.x *= C; accum1.y *= C; \
  accum2.x *= C; accum2.y *= C;

// Extra expansion level so the spinor argument expands before ## pasting.
#define READ_ST_ACCUM_HALF_TEX(spinor,sid) READ_ST_ACCUM_HALF_TEX_(spinor,sid)
765 
// Direct global-memory loads of the staggered accumulator spinor; declares
// accum0..accum2 at the expansion site.
#define READ_ST_ACCUM_DOUBLE(spinor,sid) \
  double2 accum0 = spinor[sid + 0*(param.sp_stride)]; \
  double2 accum1 = spinor[sid + 1*(param.sp_stride)]; \
  double2 accum2 = spinor[sid + 2*(param.sp_stride)];

#define READ_ST_ACCUM_SINGLE(spinor,sid) \
  float2 accum0 = spinor[sid + 0*(param.sp_stride)]; \
  float2 accum1 = spinor[sid + 1*(param.sp_stride)]; \
  float2 accum2 = spinor[sid + 2*(param.sp_stride)];
775 
// Half-precision direct load of the staggered accumulator spinor: reads
// short2 components, converts to float and rescales by the per-site norm.
// Fixed: the data loads previously went through a hard-coded "x" array while
// the norm already used spinor##Norm; both now use the spinor parameter
// (backward compatible for callers passing x).
#define READ_ST_ACCUM_HALF(spinor,sid) \
  float2 accum0, accum1, accum2; \
  { \
  short2 S0 = spinor[sid + 0*param.sp_stride]; \
  short2 S1 = spinor[sid + 1*param.sp_stride]; \
  short2 S2 = spinor[sid + 2*param.sp_stride]; \
  float C = spinor##Norm[sid]; \
  accum0.x =C*short2float(S0.x); accum0.y =C*short2float(S0.y); \
  accum1.x =C*short2float(S1.x); accum1.y =C*short2float(S1.y); \
  accum2.x =C*short2float(S2.x); accum2.y =C*short2float(S2.y); \
  }
787 
// Store all 24 real components of a Wilson spinor (reg##SC_re/_im, spin S =
// 0..3, color C = 0..2) into dynamic shared memory.  The base pointer folds
// the linear thread id into a SHARED_STRIDE-padded layout so component k of
// a thread lives at sh[k*SHARED_STRIDE].
#define WRITE_SPINOR_SHARED_REAL(tx, ty, tz, reg) \
  extern __shared__ char s_data[]; \
  spinorFloat *sh = (spinorFloat*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
  ((tx+blockDim.x*(ty+blockDim.y*tz))/SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  sh[0*SHARED_STRIDE] = reg##00_re; \
  sh[1*SHARED_STRIDE] = reg##00_im; \
  sh[2*SHARED_STRIDE] = reg##01_re; \
  sh[3*SHARED_STRIDE] = reg##01_im; \
  sh[4*SHARED_STRIDE] = reg##02_re; \
  sh[5*SHARED_STRIDE] = reg##02_im; \
  sh[6*SHARED_STRIDE] = reg##10_re; \
  sh[7*SHARED_STRIDE] = reg##10_im; \
  sh[8*SHARED_STRIDE] = reg##11_re; \
  sh[9*SHARED_STRIDE] = reg##11_im; \
  sh[10*SHARED_STRIDE] = reg##12_re; \
  sh[11*SHARED_STRIDE] = reg##12_im; \
  sh[12*SHARED_STRIDE] = reg##20_re; \
  sh[13*SHARED_STRIDE] = reg##20_im; \
  sh[14*SHARED_STRIDE] = reg##21_re; \
  sh[15*SHARED_STRIDE] = reg##21_im; \
  sh[16*SHARED_STRIDE] = reg##22_re; \
  sh[17*SHARED_STRIDE] = reg##22_im; \
  sh[18*SHARED_STRIDE] = reg##30_re; \
  sh[19*SHARED_STRIDE] = reg##30_im; \
  sh[20*SHARED_STRIDE] = reg##31_re; \
  sh[21*SHARED_STRIDE] = reg##31_im; \
  sh[22*SHARED_STRIDE] = reg##32_re; \
  sh[23*SHARED_STRIDE] = reg##32_im;

// Double precision shares the same real-by-real shared-memory layout.
#define WRITE_SPINOR_SHARED_DOUBLE2 WRITE_SPINOR_SHARED_REAL
818 
// Reassemble the 24 real components written by WRITE_SPINOR_SHARED_REAL into
// the twelve double2 registers I0..I11, using the same SHARED_STRIDE-padded
// per-thread layout.
#define READ_SPINOR_SHARED_DOUBLE2(tx, ty, tz) \
  extern __shared__ char s_data[]; \
  double *sh = (double*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
  ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  double2 I0 = make_double2(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE]); \
  double2 I1 = make_double2(sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \
  double2 I2 = make_double2(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE]); \
  double2 I3 = make_double2(sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \
  double2 I4 = make_double2(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE]); \
  double2 I5 = make_double2(sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \
  double2 I6 = make_double2(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE]); \
  double2 I7 = make_double2(sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \
  double2 I8 = make_double2(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE]); \
  double2 I9 = make_double2(sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \
  double2 I10 = make_double2(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE]); \
  double2 I11 = make_double2(sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]);
835 
836 #ifndef SHARED_8_BYTE_WORD_SIZE // 4-byte shared memory access
837 
// 4-byte shared-word branch: writes use the generic real-by-real layout.
#define WRITE_SPINOR_SHARED_FLOAT4 WRITE_SPINOR_SHARED_REAL

// Reassemble the 24 floats written above into six float4 registers I0..I5
// (two complex numbers per float4), same per-thread padded layout.
#define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz) \
  extern __shared__ char s_data[]; \
  float *sh = (float*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
  ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  float4 I0 = make_float4(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE], sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \
  float4 I1 = make_float4(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE], sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \
  float4 I2 = make_float4(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE], sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \
  float4 I3 = make_float4(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE], sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \
  float4 I4 = make_float4(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE], sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \
  float4 I5 = make_float4(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE], sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]);
850 
851 #else // 8-byte shared memory words
852 
// 8-byte shared-word branch: store the spinor as twelve float2 words (one
// complex number each) per thread, SHARED_STRIDE-padded.
#define WRITE_SPINOR_SHARED_FLOAT4(tx, ty, tz, reg) \
  extern __shared__ char s_data[]; \
  float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \
  ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  sh[0*SHARED_STRIDE] = make_float2(reg##00_re, reg##00_im); \
  sh[1*SHARED_STRIDE] = make_float2(reg##01_re, reg##01_im); \
  sh[2*SHARED_STRIDE] = make_float2(reg##02_re, reg##02_im); \
  sh[3*SHARED_STRIDE] = make_float2(reg##10_re, reg##10_im); \
  sh[4*SHARED_STRIDE] = make_float2(reg##11_re, reg##11_im); \
  sh[5*SHARED_STRIDE] = make_float2(reg##12_re, reg##12_im); \
  sh[6*SHARED_STRIDE] = make_float2(reg##20_re, reg##20_im); \
  sh[7*SHARED_STRIDE] = make_float2(reg##21_re, reg##21_im); \
  sh[8*SHARED_STRIDE] = make_float2(reg##22_re, reg##22_im); \
  sh[9*SHARED_STRIDE] = make_float2(reg##30_re, reg##30_im); \
  sh[10*SHARED_STRIDE] = make_float2(reg##31_re, reg##31_im); \
  sh[11*SHARED_STRIDE] = make_float2(reg##32_re, reg##32_im);
869 
// 8-byte shared-word branch: read back the twelve float2 words and pack
// consecutive pairs into six float4 registers I0..I5.
#define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz) \
  extern __shared__ char s_data[]; \
  float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \
  ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  float2 tmp1, tmp2; \
  tmp1 = sh[0*SHARED_STRIDE]; tmp2 = sh[1*SHARED_STRIDE]; float4 I0 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[2*SHARED_STRIDE]; tmp2 = sh[3*SHARED_STRIDE]; float4 I1 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[4*SHARED_STRIDE]; tmp2 = sh[5*SHARED_STRIDE]; float4 I2 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[6*SHARED_STRIDE]; tmp2 = sh[7*SHARED_STRIDE]; float4 I3 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[8*SHARED_STRIDE]; tmp2 = sh[9*SHARED_STRIDE]; float4 I4 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[10*SHARED_STRIDE]; tmp2 = sh[11*SHARED_STRIDE]; float4 I5 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y);
881 
882 #endif
883 
884 
// Two-flavor (twisted-mass doublet) accumulator load: twelve double2
// components per flavor, the second flavor offset by fl_stride.  Uses sid
// from the expansion scope; declares flv1_accum*/flv2_accum* locally.
#define READ_ACCUM_FLAVOR_DOUBLE(spinor, stride, fl_stride) \
  double2 flv1_accum0 = spinor[sid + 0*stride]; \
  double2 flv1_accum1 = spinor[sid + 1*stride]; \
  double2 flv1_accum2 = spinor[sid + 2*stride]; \
  double2 flv1_accum3 = spinor[sid + 3*stride]; \
  double2 flv1_accum4 = spinor[sid + 4*stride]; \
  double2 flv1_accum5 = spinor[sid + 5*stride]; \
  double2 flv1_accum6 = spinor[sid + 6*stride]; \
  double2 flv1_accum7 = spinor[sid + 7*stride]; \
  double2 flv1_accum8 = spinor[sid + 8*stride]; \
  double2 flv1_accum9 = spinor[sid + 9*stride]; \
  double2 flv1_accum10 = spinor[sid + 10*stride]; \
  double2 flv1_accum11 = spinor[sid + 11*stride]; \
  double2 flv2_accum0 = spinor[sid + fl_stride + 0*stride]; \
  double2 flv2_accum1 = spinor[sid + fl_stride + 1*stride]; \
  double2 flv2_accum2 = spinor[sid + fl_stride + 2*stride]; \
  double2 flv2_accum3 = spinor[sid + fl_stride + 3*stride]; \
  double2 flv2_accum4 = spinor[sid + fl_stride + 4*stride]; \
  double2 flv2_accum5 = spinor[sid + fl_stride + 5*stride]; \
  double2 flv2_accum6 = spinor[sid + fl_stride + 6*stride]; \
  double2 flv2_accum7 = spinor[sid + fl_stride + 7*stride]; \
  double2 flv2_accum8 = spinor[sid + fl_stride + 8*stride]; \
  double2 flv2_accum9 = spinor[sid + fl_stride + 9*stride]; \
  double2 flv2_accum10 = spinor[sid + fl_stride + 10*stride]; \
  double2 flv2_accum11 = spinor[sid + fl_stride + 11*stride];
911 
912 
// Two-flavor accumulator load, single precision: six float4 components per
// flavor (two complex numbers per float4).  Uses sid from the expansion
// scope.
#define READ_ACCUM_FLAVOR_SINGLE(spinor, stride, flv_stride) \
  float4 flv1_accum0 = spinor[sid + 0*(stride)]; \
  float4 flv1_accum1 = spinor[sid + 1*(stride)]; \
  float4 flv1_accum2 = spinor[sid + 2*(stride)]; \
  float4 flv1_accum3 = spinor[sid + 3*(stride)]; \
  float4 flv1_accum4 = spinor[sid + 4*(stride)]; \
  float4 flv1_accum5 = spinor[sid + 5*(stride)]; \
  float4 flv2_accum0 = spinor[sid + flv_stride + 0*(stride)]; \
  float4 flv2_accum1 = spinor[sid + flv_stride + 1*(stride)]; \
  float4 flv2_accum2 = spinor[sid + flv_stride + 2*(stride)]; \
  float4 flv2_accum3 = spinor[sid + flv_stride + 3*(stride)]; \
  float4 flv2_accum4 = spinor[sid + flv_stride + 4*(stride)]; \
  float4 flv2_accum5 = spinor[sid + flv_stride + 5*(stride)];
926 
927 
// Two-flavor accumulator load, half precision: six short4 components per
// flavor, converted to float4 and rescaled by that flavor's norm.  Uses sid
// from the expansion scope; declares flv1_accum*/flv2_accum* and C locally.
// Fixed: the second flavor's norm index used "fl_stride", which is not a
// parameter of this macro (the parameter is flv_stride) — it only compiled
// when the caller happened to have an identically-valued fl_stride in scope.
#define READ_ACCUM_FLAVOR_HALF_(spinor, stride, flv_stride) \
  float4 flv1_accum0 = short42float4(spinor[sid + 0*stride]); \
  float4 flv1_accum1 = short42float4(spinor[sid + 1*stride]); \
  float4 flv1_accum2 = short42float4(spinor[sid + 2*stride]); \
  float4 flv1_accum3 = short42float4(spinor[sid + 3*stride]); \
  float4 flv1_accum4 = short42float4(spinor[sid + 4*stride]); \
  float4 flv1_accum5 = short42float4(spinor[sid + 5*stride]); \
  float C = (spinor ## Norm)[sid]; \
  flv1_accum0.x *= C; flv1_accum0.y *= C; flv1_accum0.z *= C; flv1_accum0.w *= C; \
  flv1_accum1.x *= C; flv1_accum1.y *= C; flv1_accum1.z *= C; flv1_accum1.w *= C; \
  flv1_accum2.x *= C; flv1_accum2.y *= C; flv1_accum2.z *= C; flv1_accum2.w *= C; \
  flv1_accum3.x *= C; flv1_accum3.y *= C; flv1_accum3.z *= C; flv1_accum3.w *= C; \
  flv1_accum4.x *= C; flv1_accum4.y *= C; flv1_accum4.z *= C; flv1_accum4.w *= C; \
  flv1_accum5.x *= C; flv1_accum5.y *= C; flv1_accum5.z *= C; flv1_accum5.w *= C; \
  float4 flv2_accum0 = short42float4(spinor[sid + flv_stride + 0*stride]); \
  float4 flv2_accum1 = short42float4(spinor[sid + flv_stride + 1*stride]); \
  float4 flv2_accum2 = short42float4(spinor[sid + flv_stride + 2*stride]); \
  float4 flv2_accum3 = short42float4(spinor[sid + flv_stride + 3*stride]); \
  float4 flv2_accum4 = short42float4(spinor[sid + flv_stride + 4*stride]); \
  float4 flv2_accum5 = short42float4(spinor[sid + flv_stride + 5*stride]); \
  C = (spinor ## Norm)[sid + flv_stride]; \
  flv2_accum0.x *= C; flv2_accum0.y *= C; flv2_accum0.z *= C; flv2_accum0.w *= C; \
  flv2_accum1.x *= C; flv2_accum1.y *= C; flv2_accum1.z *= C; flv2_accum1.w *= C; \
  flv2_accum2.x *= C; flv2_accum2.y *= C; flv2_accum2.z *= C; flv2_accum2.w *= C; \
  flv2_accum3.x *= C; flv2_accum3.y *= C; flv2_accum3.z *= C; flv2_accum3.w *= C; \
  flv2_accum4.x *= C; flv2_accum4.y *= C; flv2_accum4.z *= C; flv2_accum4.w *= C; \
  flv2_accum5.x *= C; flv2_accum5.y *= C; flv2_accum5.z *= C; flv2_accum5.w *= C;

// Extra expansion level so the spinor argument expands before ## pasting.
#define READ_ACCUM_FLAVOR_HALF(spinor, stride, flv_stride) READ_ACCUM_FLAVOR_HALF_(spinor, stride, flv_stride)
957 
958 
// Texture version of the two-flavor double-precision accumulator load; uses
// sid from the expansion scope.
#define READ_ACCUM_FLAVOR_DOUBLE_TEX(spinor, stride, flv_stride) \
  double2 flv1_accum0 = fetch_double2((spinor), sid + 0*(stride)); \
  double2 flv1_accum1 = fetch_double2((spinor), sid + 1*(stride)); \
  double2 flv1_accum2 = fetch_double2((spinor), sid + 2*(stride)); \
  double2 flv1_accum3 = fetch_double2((spinor), sid + 3*(stride)); \
  double2 flv1_accum4 = fetch_double2((spinor), sid + 4*(stride)); \
  double2 flv1_accum5 = fetch_double2((spinor), sid + 5*(stride)); \
  double2 flv1_accum6 = fetch_double2((spinor), sid + 6*(stride)); \
  double2 flv1_accum7 = fetch_double2((spinor), sid + 7*(stride)); \
  double2 flv1_accum8 = fetch_double2((spinor), sid + 8*(stride)); \
  double2 flv1_accum9 = fetch_double2((spinor), sid + 9*(stride)); \
  double2 flv1_accum10 = fetch_double2((spinor), sid + 10*(stride)); \
  double2 flv1_accum11 = fetch_double2((spinor), sid + 11*(stride)); \
  double2 flv2_accum0 = fetch_double2((spinor), sid + flv_stride + 0*(stride)); \
  double2 flv2_accum1 = fetch_double2((spinor), sid + flv_stride + 1*(stride)); \
  double2 flv2_accum2 = fetch_double2((spinor), sid + flv_stride + 2*(stride)); \
  double2 flv2_accum3 = fetch_double2((spinor), sid + flv_stride + 3*(stride)); \
  double2 flv2_accum4 = fetch_double2((spinor), sid + flv_stride + 4*(stride)); \
  double2 flv2_accum5 = fetch_double2((spinor), sid + flv_stride + 5*(stride)); \
  double2 flv2_accum6 = fetch_double2((spinor), sid + flv_stride + 6*(stride)); \
  double2 flv2_accum7 = fetch_double2((spinor), sid + flv_stride + 7*(stride)); \
  double2 flv2_accum8 = fetch_double2((spinor), sid + flv_stride + 8*(stride)); \
  double2 flv2_accum9 = fetch_double2((spinor), sid + flv_stride + 9*(stride)); \
  double2 flv2_accum10 = fetch_double2((spinor), sid + flv_stride + 10*(stride)); \
  double2 flv2_accum11 = fetch_double2((spinor), sid + flv_stride + 11*(stride));
984 
985 
// Texture version of the two-flavor single-precision accumulator load; uses
// sid from the expansion scope.
#define READ_ACCUM_FLAVOR_SINGLE_TEX(spinor, stride, flv_stride) \
  float4 flv1_accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
  float4 flv1_accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
  float4 flv1_accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
  float4 flv1_accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
  float4 flv1_accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
  float4 flv1_accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride)); \
  float4 flv2_accum0 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 0*(stride)); \
  float4 flv2_accum1 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 1*(stride)); \
  float4 flv2_accum2 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 2*(stride)); \
  float4 flv2_accum3 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 3*(stride)); \
  float4 flv2_accum4 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 4*(stride)); \
  float4 flv2_accum5 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 5*(stride));
999 
// Texture version of the two-flavor half-precision accumulator load: each
// flavor's six float4 components are rescaled by that flavor's norm.  Uses
// sid from the expansion scope.  NOTE(review): name order here is
// HALF_FLAVOR while the public wrapper below is FLAVOR_HALF — intentional
// mapping, but easy to trip over.
#define READ_ACCUM_HALF_FLAVOR_TEX_(spinor, stride, flv_stride) \
  float4 flv1_accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
  float4 flv1_accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
  float4 flv1_accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
  float4 flv1_accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
  float4 flv1_accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
  float4 flv1_accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), sid); \
  flv1_accum0.x *= C; flv1_accum0.y *= C; flv1_accum0.z *= C; flv1_accum0.w *= C; \
  flv1_accum1.x *= C; flv1_accum1.y *= C; flv1_accum1.z *= C; flv1_accum1.w *= C; \
  flv1_accum2.x *= C; flv1_accum2.y *= C; flv1_accum2.z *= C; flv1_accum2.w *= C; \
  flv1_accum3.x *= C; flv1_accum3.y *= C; flv1_accum3.z *= C; flv1_accum3.w *= C; \
  flv1_accum4.x *= C; flv1_accum4.y *= C; flv1_accum4.z *= C; flv1_accum4.w *= C; \
  flv1_accum5.x *= C; flv1_accum5.y *= C; flv1_accum5.z *= C; flv1_accum5.w *= C; \
  float4 flv2_accum0 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 0*(stride)); \
  float4 flv2_accum1 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 1*(stride)); \
  float4 flv2_accum2 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 2*(stride)); \
  float4 flv2_accum3 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 3*(stride)); \
  float4 flv2_accum4 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 4*(stride)); \
  float4 flv2_accum5 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 5*(stride)); \
  C = TEX1DFETCH(float, (spinor ## Norm), sid + flv_stride); \
  flv2_accum0.x *= C; flv2_accum0.y *= C; flv2_accum0.z *= C; flv2_accum0.w *= C; \
  flv2_accum1.x *= C; flv2_accum1.y *= C; flv2_accum1.z *= C; flv2_accum1.w *= C; \
  flv2_accum2.x *= C; flv2_accum2.y *= C; flv2_accum2.z *= C; flv2_accum2.w *= C; \
  flv2_accum3.x *= C; flv2_accum3.y *= C; flv2_accum3.z *= C; flv2_accum3.w *= C; \
  flv2_accum4.x *= C; flv2_accum4.y *= C; flv2_accum4.z *= C; flv2_accum4.w *= C; \
  flv2_accum5.x *= C; flv2_accum5.y *= C; flv2_accum5.z *= C; flv2_accum5.w *= C;


// Extra expansion level so the spinor argument expands before ## pasting.
#define READ_ACCUM_FLAVOR_HALF_TEX(spinor, stride, flv_stride) READ_ACCUM_HALF_FLAVOR_TEX_(spinor, stride, flv_stride)
1030 
1031 //single-flavor macros:
1032 
// Single-flavor accumulator assignment: loads into pre-declared accum*
// registers (no declarations here), offset by fl_stride to pick the flavor.
// Uses sid from the expansion scope.
#define ASSN_ACCUM_DOUBLE(spinor, stride, fl_stride) \
  accum0 = spinor[sid + fl_stride + 0*stride]; \
  accum1 = spinor[sid + fl_stride + 1*stride]; \
  accum2 = spinor[sid + fl_stride + 2*stride]; \
  accum3 = spinor[sid + fl_stride + 3*stride]; \
  accum4 = spinor[sid + fl_stride + 4*stride]; \
  accum5 = spinor[sid + fl_stride + 5*stride]; \
  accum6 = spinor[sid + fl_stride + 6*stride]; \
  accum7 = spinor[sid + fl_stride + 7*stride]; \
  accum8 = spinor[sid + fl_stride + 8*stride]; \
  accum9 = spinor[sid + fl_stride + 9*stride]; \
  accum10 = spinor[sid + fl_stride + 10*stride]; \
  accum11 = spinor[sid + fl_stride + 11*stride];

// Single-precision version (six float4 components).
#define ASSN_ACCUM_SINGLE(spinor, stride, fl_stride) \
  accum0 = spinor[sid + fl_stride + 0*(stride)]; \
  accum1 = spinor[sid + fl_stride + 1*(stride)]; \
  accum2 = spinor[sid + fl_stride + 2*(stride)]; \
  accum3 = spinor[sid + fl_stride + 3*(stride)]; \
  accum4 = spinor[sid + fl_stride + 4*(stride)]; \
  accum5 = spinor[sid + fl_stride + 5*(stride)];
1054 
// Half-precision single-flavor accumulator assignment: converts short4 loads
// to float4 and rescales by the per-site norm of the selected flavor.
#define ASSN_ACCUM_HALF_(spinor, stride, fl_stride) \
  accum0 = short42float4(spinor[sid + fl_stride + 0*stride]); \
  accum1 = short42float4(spinor[sid + fl_stride + 1*stride]); \
  accum2 = short42float4(spinor[sid + fl_stride + 2*stride]); \
  accum3 = short42float4(spinor[sid + fl_stride + 3*stride]); \
  accum4 = short42float4(spinor[sid + fl_stride + 4*stride]); \
  accum5 = short42float4(spinor[sid + fl_stride + 5*stride]); \
  {\
  float C = (spinor ## Norm)[sid + fl_stride]; \
  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C; \
  }


// Extra expansion level so the spinor argument expands before ## pasting.
#define ASSN_ACCUM_HALF(spinor, stride, fl_stride) ASSN_ACCUM_HALF_(spinor, stride, fl_stride)
1074 
1075 //single-flavor macros:
1076 
// Texture versions of the single-flavor accumulator assignments; accum*
// registers are pre-declared by the caller, sid comes from the expansion
// scope.
#define ASSN_ACCUM_DOUBLE_TEX(spinor, stride, fl_stride) \
  accum0 = fetch_double2((spinor), sid + fl_stride + 0*(stride)); \
  accum1 = fetch_double2((spinor), sid + fl_stride + 1*(stride)); \
  accum2 = fetch_double2((spinor), sid + fl_stride + 2*(stride)); \
  accum3 = fetch_double2((spinor), sid + fl_stride + 3*(stride)); \
  accum4 = fetch_double2((spinor), sid + fl_stride + 4*(stride)); \
  accum5 = fetch_double2((spinor), sid + fl_stride + 5*(stride)); \
  accum6 = fetch_double2((spinor), sid + fl_stride + 6*(stride)); \
  accum7 = fetch_double2((spinor), sid + fl_stride + 7*(stride)); \
  accum8 = fetch_double2((spinor), sid + fl_stride + 8*(stride)); \
  accum9 = fetch_double2((spinor), sid + fl_stride + 9*(stride)); \
  accum10 = fetch_double2((spinor), sid + fl_stride + 10*(stride)); \
  accum11 = fetch_double2((spinor), sid + fl_stride + 11*(stride));


#define ASSN_ACCUM_SINGLE_TEX(spinor, stride, fl_stride) \
  accum0 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 0*(stride)); \
  accum1 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 1*(stride)); \
  accum2 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 2*(stride)); \
  accum3 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 3*(stride)); \
  accum4 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 4*(stride)); \
  accum5 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 5*(stride));
1099 
// Texture version of the half-precision single-flavor accumulator
// assignment: fetches float4 components and rescales by the flavor's norm.
#define ASSN_ACCUM_HALF_TEX_(spinor, stride, fl_stride) \
  accum0 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 0*(stride)); \
  accum1 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 1*(stride)); \
  accum2 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 2*(stride)); \
  accum3 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 3*(stride)); \
  accum4 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 4*(stride)); \
  accum5 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 5*(stride)); \
  {\
  float C = TEX1DFETCH(float, (spinor ## Norm), sid + fl_stride); \
  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C; \
  }

// Extra expansion level so the spinor argument expands before ## pasting.
#define ASSN_ACCUM_HALF_TEX(spinor, stride, fl_stride) ASSN_ACCUM_HALF_TEX_(spinor, stride, fl_stride)
1118 
1120 
// Apply b*(1 + i*a*gamma_5) to the twelve complex spinor components held in
// registers reg##SC_re/_im (spin S = 0..3, color C = 0..2).  The code mixes
// spin components 0<->2 and 1<->3 for each color, then scales everything by
// b (this is the _INV variant; APPLY_TWIST below omits the b factor).
// a and b are evaluated many times — pass simple variables, not expressions
// with side effects.
#define APPLY_TWIST_INV(a, b, reg)\
{\
  spinorFloat tmp_re, tmp_im;\
  tmp_re = reg##00_re - a * reg##20_im;\
  tmp_im = reg##00_im + a * reg##20_re;\
  reg##20_re -= a * reg##00_im;\
  reg##20_im += a * reg##00_re;\
  \
  reg##00_re = b * tmp_re;\
  reg##00_im = b * tmp_im;\
  reg##20_re *= b;\
  reg##20_im *= b;\
  \
  tmp_re = reg##10_re - a * reg##30_im;\
  tmp_im = reg##10_im + a * reg##30_re;\
  reg##30_re -= a * reg##10_im;\
  reg##30_im += a * reg##10_re;\
  \
  reg##10_re = b * tmp_re;\
  reg##10_im = b * tmp_im;\
  reg##30_re *= b;\
  reg##30_im *= b;\
  \
  tmp_re = reg##01_re - a * reg##21_im;\
  tmp_im = reg##01_im + a * reg##21_re;\
  reg##21_re -= a * reg##01_im;\
  reg##21_im += a * reg##01_re;\
  \
  reg##01_re = b * tmp_re;\
  reg##01_im = b * tmp_im;\
  reg##21_re *= b;\
  reg##21_im *= b;\
  \
  tmp_re = reg##11_re - a * reg##31_im;\
  tmp_im = reg##11_im + a * reg##31_re;\
  reg##31_re -= a * reg##11_im;\
  reg##31_im += a * reg##11_re;\
  \
  reg##11_re = b * tmp_re;\
  reg##11_im = b * tmp_im;\
  reg##31_re *= b;\
  reg##31_im *= b;\
  \
  tmp_re = reg##02_re - a * reg##22_im;\
  tmp_im = reg##02_im + a * reg##22_re;\
  reg##22_re -= a * reg##02_im;\
  reg##22_im += a * reg##02_re;\
  \
  reg##02_re = b * tmp_re;\
  reg##02_im = b * tmp_im;\
  reg##22_re *= b;\
  reg##22_im *= b;\
  \
  tmp_re = reg##12_re - a * reg##32_im;\
  tmp_im = reg##12_im + a * reg##32_re;\
  reg##32_re -= a * reg##12_im;\
  reg##32_im += a * reg##12_re;\
  \
  reg##12_re = b * tmp_re;\
  reg##12_im = b * tmp_im;\
  reg##32_re *= b;\
  reg##32_im *= b;\
}
1185 
1186 
//apply (1 + i*a*gamma_5) to the input spinor, in place:
//each gamma_5-coupled component pair (reg##X, reg##Y) transforms as
//  out_X = in_X + i*a*in_Y,  out_Y = in_Y + i*a*in_X
#define APPLY_TWIST(a, reg)\
{\
 spinorFloat s_re, s_im;\
 \
 /* pair (00,20): save the originals, then update both in place */\
 s_re = reg##00_re; s_im = reg##00_im;\
 reg##00_re -= a * reg##20_im;\
 reg##00_im += a * reg##20_re;\
 reg##20_re -= a * s_im;\
 reg##20_im += a * s_re;\
 \
 /* pair (10,30) */\
 s_re = reg##10_re; s_im = reg##10_im;\
 reg##10_re -= a * reg##30_im;\
 reg##10_im += a * reg##30_re;\
 reg##30_re -= a * s_im;\
 reg##30_im += a * s_re;\
 \
 /* pair (01,21) */\
 s_re = reg##01_re; s_im = reg##01_im;\
 reg##01_re -= a * reg##21_im;\
 reg##01_im += a * reg##21_re;\
 reg##21_re -= a * s_im;\
 reg##21_im += a * s_re;\
 \
 /* pair (11,31) */\
 s_re = reg##11_re; s_im = reg##11_im;\
 reg##11_re -= a * reg##31_im;\
 reg##11_im += a * reg##31_re;\
 reg##31_re -= a * s_im;\
 reg##31_im += a * s_re;\
 \
 /* pair (02,22) */\
 s_re = reg##02_re; s_im = reg##02_im;\
 reg##02_re -= a * reg##22_im;\
 reg##02_im += a * reg##22_re;\
 reg##22_re -= a * s_im;\
 reg##22_im += a * s_re;\
 \
 /* pair (12,32) */\
 s_re = reg##12_re; s_im = reg##12_im;\
 reg##12_re -= a * reg##32_im;\
 reg##12_im += a * reg##32_re;\
 reg##32_re -= a * s_im;\
 reg##32_im += a * s_re;\
}
1238 
1239 
1240