/*
 * QUDA 0.9.0 — io_spinor.h
 * Spinor load/store macros for Wilson-type Dslash CUDA kernels.
 * (Doxygen page header converted to a comment so the file remains valid source;
 *  see the generated documentation for this file for the rendered version.)
 */
// Load a full spinor (12 complex values as 12 double2) from spinor_ into local
// variables I0..I11, one field component per stride-separated slot.
// norm_idx is unused at double precision; it is kept so all READ_SPINOR_*
// variants share one call signature.
#define READ_SPINOR_DOUBLE(spinor_, stride, sp_idx, norm_idx) \
 double2 *spinor = (double2*)spinor_; \
 double2 I0 = spinor[sp_idx + 0*(stride)]; \
 double2 I1 = spinor[sp_idx + 1*(stride)]; \
 double2 I2 = spinor[sp_idx + 2*(stride)]; \
 double2 I3 = spinor[sp_idx + 3*(stride)]; \
 double2 I4 = spinor[sp_idx + 4*(stride)]; \
 double2 I5 = spinor[sp_idx + 5*(stride)]; \
 double2 I6 = spinor[sp_idx + 6*(stride)]; \
 double2 I7 = spinor[sp_idx + 7*(stride)]; \
 double2 I8 = spinor[sp_idx + 8*(stride)]; \
 double2 I9 = spinor[sp_idx + 9*(stride)]; \
 double2 I10 = spinor[sp_idx + 10*(stride)]; \
 double2 I11 = spinor[sp_idx + 11*(stride)];
15 
// Load half of a spinor (6 double2, I0..I5) from the ghost zone for
// direction `dir`; spinor_ is an array of per-direction base pointers.
// Note [] binds tighter than the cast: this reads (double2*)(spinor_[dir]).
#define READ_SPINOR_GHOST_DOUBLE(spinor_, stride, sp_idx, norm_idx, dir) \
 double2 *spinor = (double2*)spinor_[dir]; \
 double2 I0 = spinor[sp_idx + 0*(stride)]; \
 double2 I1 = spinor[sp_idx + 1*(stride)]; \
 double2 I2 = spinor[sp_idx + 2*(stride)]; \
 double2 I3 = spinor[sp_idx + 3*(stride)]; \
 double2 I4 = spinor[sp_idx + 4*(stride)]; \
 double2 I5 = spinor[sp_idx + 5*(stride)];
24 
// Load only the first half of a double-precision spinor (I0..I5).
#define READ_SPINOR_DOUBLE_UP(spinor_, stride, sp_idx, norm_idx) \
 double2 *spinor = (double2*)spinor_; \
 double2 I0 = spinor[sp_idx + 0*(stride)]; \
 double2 I1 = spinor[sp_idx + 1*(stride)]; \
 double2 I2 = spinor[sp_idx + 2*(stride)]; \
 double2 I3 = spinor[sp_idx + 3*(stride)]; \
 double2 I4 = spinor[sp_idx + 4*(stride)]; \
 double2 I5 = spinor[sp_idx + 5*(stride)];

// Load only the second half of a double-precision spinor (I6..I11).
#define READ_SPINOR_DOUBLE_DOWN(spinor_, stride, sp_idx, norm_idx) \
 double2 *spinor = (double2*)spinor_; \
 double2 I6 = spinor[sp_idx + 6*(stride)]; \
 double2 I7 = spinor[sp_idx + 7*(stride)]; \
 double2 I8 = spinor[sp_idx + 8*(stride)]; \
 double2 I9 = spinor[sp_idx + 9*(stride)]; \
 double2 I10 = spinor[sp_idx + 10*(stride)]; \
 double2 I11 = spinor[sp_idx + 11*(stride)];
42 
// Load a full single-precision spinor: the 12 complex values are packed two
// per float4, giving six vectors I0..I5.
#define READ_SPINOR_SINGLE(spinor_, stride, sp_idx, norm_idx) \
 float4 *spinor = (float4*)spinor_; \
 float4 I0 = spinor[sp_idx + 0*(stride)]; \
 float4 I1 = spinor[sp_idx + 1*(stride)]; \
 float4 I2 = spinor[sp_idx + 2*(stride)]; \
 float4 I3 = spinor[sp_idx + 3*(stride)]; \
 float4 I4 = spinor[sp_idx + 4*(stride)]; \
 float4 I5 = spinor[sp_idx + 5*(stride)];
51 
// Load three float4 vectors (I0..I2) from the ghost zone for direction `dir`;
// spinor_ is an array of per-direction base pointers.
// Fix: removed the dangling line-continuation backslash that ended this macro —
// it silently splices the following source line into the macro body if the
// blank line after it is ever removed.
#define READ_SPINOR_GHOST_SINGLE(spinor_, stride, sp_idx, norm_idx, dir) \
 float4 *spinor = (float4*)spinor_[dir]; \
 float4 I0 = spinor[sp_idx + 0*(stride)]; \
 float4 I1 = spinor[sp_idx + 1*(stride)]; \
 float4 I2 = spinor[sp_idx + 2*(stride)];

// Load only the first half of a single-precision spinor (I0..I2).
// (Same dangling-backslash fix as above.)
#define READ_SPINOR_SINGLE_UP(spinor_, stride, sp_idx, norm_idx) \
 float4 *spinor = (float4*)spinor_; \
 float4 I0 = spinor[sp_idx + 0*(stride)]; \
 float4 I1 = spinor[sp_idx + 1*(stride)]; \
 float4 I2 = spinor[sp_idx + 2*(stride)];

// Load only the second half of a single-precision spinor (I3..I5).
#define READ_SPINOR_SINGLE_DOWN(spinor_, stride, sp_idx, norm_idx) \
 float4 *spinor = (float4*)spinor_; \
 float4 I3 = spinor[sp_idx + 3*(stride)]; \
 float4 I4 = spinor[sp_idx + 4*(stride)]; \
 float4 I5 = spinor[sp_idx + 5*(stride)];
69 
// Half-precision full-spinor load: six short4 vectors are converted via
// short42float4 and rescaled by the scale factor C, read from the array whose
// name is formed by pasting "Norm" onto the spinor_ argument (e.g. in -> inNorm).
#define READ_SPINOR_HALF_(spinor_, stride, sp_idx, norm_idx) \
 short4 *spinor = (short4*)spinor_; \
 float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]); \
 float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]); \
 float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]); \
 float4 I3 = short42float4(spinor[sp_idx + 3*(stride)]); \
 float4 I4 = short42float4(spinor[sp_idx + 4*(stride)]); \
 float4 I5 = short42float4(spinor[sp_idx + 5*(stride)]); \
 float C = (spinor_ ## Norm)[norm_idx]; \
 I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
 I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
 I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
 I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
 I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
 I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

// Two-level wrapper so the argument is macro-expanded before the ## paste
// above (## suppresses expansion of adjacent arguments).
#define READ_SPINOR_HALF(spinor, stride, sp_idx, norm_idx) \
 READ_SPINOR_HALF_(spinor, stride, sp_idx, norm_idx)
88 
// Half-precision ghost-zone load: three short4 vectors converted to float4
// and rescaled by C from the token-pasted <spinor_>Norm array.
// Fix: removed the dangling line-continuation backslash that ended this macro
// (it would splice the next source line into the body).
#define READ_SPINOR_GHOST_HALF_(spinor_, stride, sp_idx, norm_idx, dir) \
 short4 *spinor = (short4*)spinor_[dir]; \
 float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]); \
 float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]); \
 float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]); \
 float C = (spinor_ ## Norm)[norm_idx]; \
 I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
 I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
 I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;

// Wrapper forcing argument expansion before the ## paste above.
#define READ_SPINOR_GHOST_HALF(spinor, stride, sp_idx, norm_idx, dir) \
 READ_SPINOR_GHOST_HALF_(spinor, stride, sp_idx, norm_idx, dir)
101 
// Half-precision load of the first half of a spinor (I0..I2), rescaled by C
// from the token-pasted <spinor_>Norm array.
// Fix: removed the dangling line-continuation backslash that ended this macro
// (it would splice the next source line into the body).
#define READ_SPINOR_HALF_UP_(spinor_, stride, sp_idx, norm_idx) \
 short4 *spinor = (short4*)spinor_; \
 float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]); \
 float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]); \
 float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]); \
 float C = (spinor_ ## Norm)[norm_idx]; \
 I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
 I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
 I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;

// Wrapper forcing argument expansion before the ## paste above.
#define READ_SPINOR_HALF_UP(spinor, stride, sp_idx, norm_idx) \
 READ_SPINOR_HALF_UP_(spinor, stride, sp_idx, norm_idx)
114 
// Half-precision load of the second half of a spinor (I3..I5), rescaled by C
// from the token-pasted <spinor_>Norm array.
// Fix: parenthesized the stride macro argument (3*(stride) etc.) — the
// unparenthesized form miscompiles if an expression such as a+b is passed,
// and every sibling macro in this file already uses (stride).
#define READ_SPINOR_HALF_DOWN_(spinor_, stride, sp_idx, norm_idx) \
 short4 *spinor = (short4*)spinor_; \
 float4 I3 = short42float4(spinor[sp_idx + 3*(stride)]); \
 float4 I4 = short42float4(spinor[sp_idx + 4*(stride)]); \
 float4 I5 = short42float4(spinor[sp_idx + 5*(stride)]); \
 float C = (spinor_ ## Norm)[norm_idx]; \
 I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
 I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
 I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

// Wrapper forcing argument expansion before the ## paste above.
#define READ_SPINOR_HALF_DOWN(spinor, stride, sp_idx, norm_idx) \
 READ_SPINOR_HALF_DOWN_(spinor, stride, sp_idx, norm_idx)
127 
// Load a full double-precision accumulator spinor (accum0..accum11) at site
// index `sid` (assumed in scope at the expansion site).
// Fix: parenthesized the stride macro argument — the unparenthesized form
// (e.g. 11*stride) evaluates incorrectly when an expression is passed;
// READ_ACCUM_SINGLE already uses (stride).
#define READ_ACCUM_DOUBLE(spinor_, stride) \
 double2 *spinor = (double2*)spinor_; \
 double2 accum0 = spinor[sid + 0*(stride)]; \
 double2 accum1 = spinor[sid + 1*(stride)]; \
 double2 accum2 = spinor[sid + 2*(stride)]; \
 double2 accum3 = spinor[sid + 3*(stride)]; \
 double2 accum4 = spinor[sid + 4*(stride)]; \
 double2 accum5 = spinor[sid + 5*(stride)]; \
 double2 accum6 = spinor[sid + 6*(stride)]; \
 double2 accum7 = spinor[sid + 7*(stride)]; \
 double2 accum8 = spinor[sid + 8*(stride)]; \
 double2 accum9 = spinor[sid + 9*(stride)]; \
 double2 accum10 = spinor[sid + 10*(stride)]; \
 double2 accum11 = spinor[sid + 11*(stride)];
142 
// Load a full single-precision accumulator spinor (accum0..accum5) at site
// index `sid` (assumed in scope at the expansion site).
#define READ_ACCUM_SINGLE(spinor_, stride) \
 float4 *spinor = (float4*)spinor_; \
 float4 accum0 = spinor[sid + 0*(stride)]; \
 float4 accum1 = spinor[sid + 1*(stride)]; \
 float4 accum2 = spinor[sid + 2*(stride)]; \
 float4 accum3 = spinor[sid + 3*(stride)]; \
 float4 accum4 = spinor[sid + 4*(stride)]; \
 float4 accum5 = spinor[sid + 5*(stride)];
151 
// Load a half-precision accumulator spinor at site `sid`: six short4 vectors
// converted to float4 and rescaled by C from the token-pasted <spinor_>Norm array.
// Fix: parenthesized the stride macro argument for correctness with expression
// arguments, matching READ_ACCUM_SINGLE.
#define READ_ACCUM_HALF_(spinor_, stride) \
 short4 *spinor = (short4*)spinor_; \
 float4 accum0 = short42float4(spinor[sid + 0*(stride)]); \
 float4 accum1 = short42float4(spinor[sid + 1*(stride)]); \
 float4 accum2 = short42float4(spinor[sid + 2*(stride)]); \
 float4 accum3 = short42float4(spinor[sid + 3*(stride)]); \
 float4 accum4 = short42float4(spinor[sid + 4*(stride)]); \
 float4 accum5 = short42float4(spinor[sid + 5*(stride)]); \
 float C = (spinor_ ## Norm)[sid]; \
 accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
 accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
 accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
 accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
 accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
 accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;

// Wrapper forcing argument expansion before the ## paste above.
#define READ_ACCUM_HALF(spinor, stride) READ_ACCUM_HALF_(spinor, stride)
169 
// Texture-path double-precision loads: same variable naming as the direct-load
// variants above, but fetched through fetch_double2 from a texture.
#define READ_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \
 double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
 double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
 double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
 double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
 double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
 double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride)); \
 double2 I6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
 double2 I7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
 double2 I8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
 double2 I9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
 double2 I10 = fetch_double2((spinor), sp_idx + 10*(stride)); \
 double2 I11 = fetch_double2((spinor), sp_idx + 11*(stride));

// Ghost-zone texture load (I0..I5): with texture objects each direction has
// its own texture ((spinor)[dir]); otherwise a single texture is indexed directly.
#ifdef USE_TEXTURE_OBJECTS
#define READ_SPINOR_GHOST_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx, dir) \
 double2 I0 = fetch_double2((spinor)[dir], sp_idx + 0*(stride)); \
 double2 I1 = fetch_double2((spinor)[dir], sp_idx + 1*(stride)); \
 double2 I2 = fetch_double2((spinor)[dir], sp_idx + 2*(stride)); \
 double2 I3 = fetch_double2((spinor)[dir], sp_idx + 3*(stride)); \
 double2 I4 = fetch_double2((spinor)[dir], sp_idx + 4*(stride)); \
 double2 I5 = fetch_double2((spinor)[dir], sp_idx + 5*(stride));
#else
#define READ_SPINOR_GHOST_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx, dir) \
 double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
 double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
 double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
 double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
 double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
 double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride));
#endif

// First half only (I0..I5), texture path.
#define READ_SPINOR_DOUBLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
 double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
 double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
 double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
 double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
 double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
 double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride));

// Second half only (I6..I11), texture path.
#define READ_SPINOR_DOUBLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
 double2 I6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
 double2 I7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
 double2 I8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
 double2 I9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
 double2 I10 = fetch_double2((spinor), sp_idx + 10*(stride)); \
 double2 I11 = fetch_double2((spinor), sp_idx + 11*(stride));

// Accumulator load at site `sid` (assumed in scope), texture path.
#define READ_ACCUM_DOUBLE_TEX(spinor, stride) \
 double2 accum0 = fetch_double2((spinor), sid + 0*(stride)); \
 double2 accum1 = fetch_double2((spinor), sid + 1*(stride)); \
 double2 accum2 = fetch_double2((spinor), sid + 2*(stride)); \
 double2 accum3 = fetch_double2((spinor), sid + 3*(stride)); \
 double2 accum4 = fetch_double2((spinor), sid + 4*(stride)); \
 double2 accum5 = fetch_double2((spinor), sid + 5*(stride)); \
 double2 accum6 = fetch_double2((spinor), sid + 6*(stride)); \
 double2 accum7 = fetch_double2((spinor), sid + 7*(stride)); \
 double2 accum8 = fetch_double2((spinor), sid + 8*(stride)); \
 double2 accum9 = fetch_double2((spinor), sid + 9*(stride)); \
 double2 accum10 = fetch_double2((spinor), sid + 10*(stride)); \
 double2 accum11 = fetch_double2((spinor), sid + 11*(stride));
231 
// Texture-path single-precision loads (TEX1DFETCH of float4 vectors).
#define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
 float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
 float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
 float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
 float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
 float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
 float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));

// Ghost-zone texture load (I0..I2): per-direction texture objects when enabled.
#ifdef USE_TEXTURE_OBJECTS
#define READ_SPINOR_GHOST_SINGLE_TEX(spinor, stride, sp_idx, norm_idx, dir) \
 float4 I0 = TEX1DFETCH(float4, (spinor)[dir], sp_idx + 0*(stride)); \
 float4 I1 = TEX1DFETCH(float4, (spinor)[dir], sp_idx + 1*(stride)); \
 float4 I2 = TEX1DFETCH(float4, (spinor)[dir], sp_idx + 2*(stride));
#else
#define READ_SPINOR_GHOST_SINGLE_TEX(spinor, stride, sp_idx, norm_idx, dir) \
 float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
 float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
 float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride));
#endif

// First half only (I0..I2), texture path.
// Fix: removed the dangling line-continuation backslash that ended this macro
// (it would splice the next source line into the body).
#define READ_SPINOR_SINGLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
 float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
 float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
 float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride));

// Second half only (I3..I5), texture path.
#define READ_SPINOR_SINGLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
 float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
 float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
 float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
261 
// Half-precision texture load of a full spinor: float4 fetches rescaled by C
// from the token-pasted <spinor>Norm texture.
#define READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx) \
 float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
 float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
 float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
 float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
 float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
 float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride)); \
 float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
 I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
 I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
 I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
 I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
 I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
 I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

// Wrapper forcing argument expansion before the ## paste above.
// Fix: removed the dangling line-continuation backslash that ended this macro.
#define READ_SPINOR_HALF_TEX(spinor, stride, sp_idx, norm_idx) \
 READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx)
279 
// Half-precision texture ghost load (I0..I2, rescaled by C).  With texture
// objects, both the spinor and its norm are per-direction arrays indexed by dir.
#ifdef USE_TEXTURE_OBJECTS
#define READ_SPINOR_GHOST_HALF_TEX_(spinor, stride, sp_idx, norm_idx, dir) \
 float4 I0 = TEX1DFETCH(float4, (spinor)[dir], sp_idx + 0*(stride)); \
 float4 I1 = TEX1DFETCH(float4, (spinor)[dir], sp_idx + 1*(stride)); \
 float4 I2 = TEX1DFETCH(float4, (spinor)[dir], sp_idx + 2*(stride)); \
 float C = TEX1DFETCH(float, (spinor ## Norm)[dir], norm_idx); \
 I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
 I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
 I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;
#else
#define READ_SPINOR_GHOST_HALF_TEX_(spinor, stride, sp_idx, norm_idx, dir) \
 float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
 float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
 float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
 float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
 I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
 I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
 I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;
#endif

// Wrapper forcing argument expansion before the ## paste above.
// Fix: removed the dangling line-continuation backslash that ended this macro.
#define READ_SPINOR_GHOST_HALF_TEX(spinor, stride, sp_idx, norm_idx, dir) \
 READ_SPINOR_GHOST_HALF_TEX_(spinor, stride, sp_idx, norm_idx, dir)
302 
// Half-precision texture load of the first half of a spinor (I0..I2),
// rescaled by C from the token-pasted <spinor>Norm texture.
// Fix: removed the dangling line-continuation backslashes that ended this
// macro and both wrappers below (they would splice the following source line
// into the macro body).
#define READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx) \
 float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
 float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
 float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
 float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
 I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
 I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
 I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;

// Wrapper forcing argument expansion before the ## paste above.
#define READ_SPINOR_HALF_UP_TEX(spinor, stride, sp_idx, norm_idx) \
 READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx)

// Half-precision texture load of the second half of a spinor (I3..I5).
#define READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx) \
 float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
 float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
 float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride)); \
 float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
 I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
 I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
 I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

// Wrapper forcing argument expansion before the ## paste above.
#define READ_SPINOR_HALF_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
 READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx)
326 
// Single-precision accumulator load at site `sid` (assumed in scope), texture path.
#define READ_ACCUM_SINGLE_TEX(spinor, stride) \
 float4 accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
 float4 accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
 float4 accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
 float4 accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
 float4 accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
 float4 accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride));

// Half-precision accumulator load at site `sid`, rescaled by C from the
// token-pasted <spinor>Norm texture.
#define READ_ACCUM_HALF_TEX_(spinor, stride) \
 float4 accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
 float4 accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
 float4 accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
 float4 accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
 float4 accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
 float4 accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride)); \
 float C = TEX1DFETCH(float, (spinor ## Norm), sid); \
 accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
 accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
 accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
 accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
 accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
 accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;

// Wrapper forcing argument expansion before the ## paste above.
#define READ_ACCUM_HALF_TEX(spinor, stride) READ_ACCUM_HALF_TEX_(spinor, stride)
351 
352 
// Store the output spinor (locals o00_re..o32_im, assumed in scope) to
// param.out in double precision, one double2 per stride-separated slot.
#define WRITE_SPINOR_DOUBLE2(stride) \
 double2 *out = (double2*)param.out; \
 out[0*(stride)+sid] = make_double2(o00_re, o00_im); \
 out[1*(stride)+sid] = make_double2(o01_re, o01_im); \
 out[2*(stride)+sid] = make_double2(o02_re, o02_im); \
 out[3*(stride)+sid] = make_double2(o10_re, o10_im); \
 out[4*(stride)+sid] = make_double2(o11_re, o11_im); \
 out[5*(stride)+sid] = make_double2(o12_re, o12_im); \
 out[6*(stride)+sid] = make_double2(o20_re, o20_im); \
 out[7*(stride)+sid] = make_double2(o21_re, o21_im); \
 out[8*(stride)+sid] = make_double2(o22_re, o22_im); \
 out[9*(stride)+sid] = make_double2(o30_re, o30_im); \
 out[10*(stride)+sid] = make_double2(o31_re, o31_im); \
 out[11*(stride)+sid] = make_double2(o32_re, o32_im);
367 
// Store the output spinor to param.out in single precision, two complex
// values packed per float4.
#define WRITE_SPINOR_FLOAT4(stride) \
 float4 *out = (float4*)param.out; \
 out[0*(stride)+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
 out[1*(stride)+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
 out[2*(stride)+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \
 out[3*(stride)+sid] = make_float4(o20_re, o20_im, o21_re, o21_im); \
 out[4*(stride)+sid] = make_float4(o22_re, o22_im, o30_re, o30_im); \
 out[5*(stride)+sid] = make_float4(o31_re, o31_im, o32_re, o32_im);
376 
// Store the output spinor in half precision: reduce the maximum absolute value
// over all 24 real components into c0, record it in param.outNorm[sid], scale
// every component so the largest maps to MAX_SHORT, then pack via f2i.
#define WRITE_SPINOR_SHORT4(stride) \
 short4 *out = (short4*)param.out; \
 float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
 float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im)); \
 float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im)); \
 float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \
 float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \
 float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \
 float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \
 float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \
 float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \
 float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \
 float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \
 float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \
 /* NOTE(review): c1/c2 pair o01_re with o02_im and o02_re with o01_im — */ \
 /* looks accidental, but harmless: all 24 components feed the single max. */ \
 c0 = fmaxf(c0, c1); \
 c1 = fmaxf(c2, c3); \
 c2 = fmaxf(c4, c5); \
 c3 = fmaxf(c6, c7); \
 c4 = fmaxf(c8, c9); \
 c5 = fmaxf(c10, c11); \
 c0 = fmaxf(c0, c1); \
 c1 = fmaxf(c2, c3); \
 c2 = fmaxf(c4, c5); \
 c0 = fmaxf(c0, c1); \
 c0 = fmaxf(c0, c2); \
 param.outNorm[sid] = c0; \
 float scale = __fdividef(MAX_SHORT, c0); \
 o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
 o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \
 o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \
 o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
 o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
 o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
 out[sid+0*(stride)] = make_short4(f2i(o00_re), f2i(o00_im), f2i(o01_re), f2i(o01_im)); \
 out[sid+1*(stride)] = make_short4(f2i(o02_re), f2i(o02_im), f2i(o10_re), f2i(o10_im)); \
 out[sid+2*(stride)] = make_short4(f2i(o11_re), f2i(o11_im), f2i(o12_re), f2i(o12_im)); \
 out[sid+3*(stride)] = make_short4(f2i(o20_re), f2i(o20_im), f2i(o21_re), f2i(o21_im)); \
 out[sid+4*(stride)] = make_short4(f2i(o22_re), f2i(o22_im), f2i(o30_re), f2i(o30_im)); \
 out[sid+5*(stride)] = make_short4(f2i(o31_re), f2i(o31_im), f2i(o32_re), f2i(o32_im));
416 
// Streaming (non-temporal) double-precision spinor store to param.out.
// Fix: parenthesized the stride macro argument — the unparenthesized form
// (e.g. 11*stride) miscompiles when an expression is passed; the FLOAT4_STR
// and SHORT4_STR siblings already use (stride).
#define WRITE_SPINOR_DOUBLE2_STR(stride) \
 double2 *out = (double2*)param.out; \
 store_streaming_double2(&out[0*(stride)+sid], o00_re, o00_im); \
 store_streaming_double2(&out[1*(stride)+sid], o01_re, o01_im); \
 store_streaming_double2(&out[2*(stride)+sid], o02_re, o02_im); \
 store_streaming_double2(&out[3*(stride)+sid], o10_re, o10_im); \
 store_streaming_double2(&out[4*(stride)+sid], o11_re, o11_im); \
 store_streaming_double2(&out[5*(stride)+sid], o12_re, o12_im); \
 store_streaming_double2(&out[6*(stride)+sid], o20_re, o20_im); \
 store_streaming_double2(&out[7*(stride)+sid], o21_re, o21_im); \
 store_streaming_double2(&out[8*(stride)+sid], o22_re, o22_im); \
 store_streaming_double2(&out[9*(stride)+sid], o30_re, o30_im); \
 store_streaming_double2(&out[10*(stride)+sid], o31_re, o31_im); \
 store_streaming_double2(&out[11*(stride)+sid], o32_re, o32_im);
431 
// Streaming (non-temporal) single-precision spinor store to param.out.
#define WRITE_SPINOR_FLOAT4_STR(stride) \
 float4 *out = (float4*)param.out; \
 store_streaming_float4(&out[0*(stride)+sid], o00_re, o00_im, o01_re, o01_im); \
 store_streaming_float4(&out[1*(stride)+sid], o02_re, o02_im, o10_re, o10_im); \
 store_streaming_float4(&out[2*(stride)+sid], o11_re, o11_im, o12_re, o12_im); \
 store_streaming_float4(&out[3*(stride)+sid], o20_re, o20_im, o21_re, o21_im); \
 store_streaming_float4(&out[4*(stride)+sid], o22_re, o22_im, o30_re, o30_im); \
 store_streaming_float4(&out[5*(stride)+sid], o31_re, o31_im, o32_re, o32_im);
440 
// Streaming variant of WRITE_SPINOR_SHORT4: identical max/normalize/quantize
// logic, but stores via store_streaming_short4.
#define WRITE_SPINOR_SHORT4_STR(stride) \
 short4 *out = (short4*)param.out; \
 float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
 float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im)); \
 float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im)); \
 float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \
 float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \
 float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \
 float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \
 float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \
 float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \
 float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \
 float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \
 float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \
 /* NOTE(review): c1/c2 cross-pair o01/o02 components — harmless, all feed one max. */ \
 c0 = fmaxf(c0, c1); \
 c1 = fmaxf(c2, c3); \
 c2 = fmaxf(c4, c5); \
 c3 = fmaxf(c6, c7); \
 c4 = fmaxf(c8, c9); \
 c5 = fmaxf(c10, c11); \
 c0 = fmaxf(c0, c1); \
 c1 = fmaxf(c2, c3); \
 c2 = fmaxf(c4, c5); \
 c0 = fmaxf(c0, c1); \
 c0 = fmaxf(c0, c2); \
 param.outNorm[sid] = c0; \
 float scale = __fdividef(MAX_SHORT, c0); \
 o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
 o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \
 o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \
 o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
 o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
 o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
 store_streaming_short4(&out[0*(stride)+sid], f2i(o00_re), f2i(o00_im), f2i(o01_re), f2i(o01_im)); \
 store_streaming_short4(&out[1*(stride)+sid], f2i(o02_re), f2i(o02_im), f2i(o10_re), f2i(o10_im)); \
 store_streaming_short4(&out[2*(stride)+sid], f2i(o11_re), f2i(o11_im), f2i(o12_re), f2i(o12_im)); \
 store_streaming_short4(&out[3*(stride)+sid], f2i(o20_re), f2i(o20_im), f2i(o21_re), f2i(o21_im)); \
 store_streaming_short4(&out[4*(stride)+sid], f2i(o22_re), f2i(o22_im), f2i(o30_re), f2i(o30_im)); \
 store_streaming_short4(&out[5*(stride)+sid], f2i(o31_re), f2i(o31_im), f2i(o32_re), f2i(o32_im));
480 
// macros used for exterior Wilson Dslash kernels and face packing

// Alias used by the face-packing code.  NOTE(review): READ_SPINOR_UP is not
// defined in this file — it is expected to be defined (to one of the *_UP
// variants above) by the including kernel before this alias is expanded.
#define READ_HALF_SPINOR READ_SPINOR_UP
484 
// Store a half spinor (locals a0..a2, b0..b2: six complex values) at site sid.
// Unlike the WRITE_SPINOR_* macros, `out` must already be declared at the
// expansion site — this macro does not derive it from param.
#define WRITE_HALF_SPINOR_DOUBLE2(stride, sid) \
 out[0*(stride)+sid] = make_double2(a0_re, a0_im); \
 out[1*(stride)+sid] = make_double2(a1_re, a1_im); \
 out[2*(stride)+sid] = make_double2(a2_re, a2_im); \
 out[3*(stride)+sid] = make_double2(b0_re, b0_im); \
 out[4*(stride)+sid] = make_double2(b1_re, b1_im); \
 out[5*(stride)+sid] = make_double2(b2_re, b2_im);

// Single-precision half-spinor store: two complex values per float4.
#define WRITE_HALF_SPINOR_FLOAT4(stride, sid) \
 out[0*(stride)+sid] = make_float4(a0_re, a0_im, a1_re, a1_im); \
 out[1*(stride)+sid] = make_float4(a2_re, a2_im, b0_re, b0_im); \
 out[2*(stride)+sid] = make_float4(b1_re, b1_im, b2_re, b2_im);
497 
// Half-precision half-spinor store: reduce the max absolute value of the 12
// real components into c0, record it in outNorm[sid] (outNorm must be in
// scope at the expansion site), rescale to MAX_SHORT, and pack via f2i.
#define WRITE_HALF_SPINOR_SHORT4(stride, sid) \
 float c0 = fmaxf(fabsf(a0_re), fabsf(a0_im)); \
 float c1 = fmaxf(fabsf(a1_re), fabsf(a1_im)); \
 float c2 = fmaxf(fabsf(a2_re), fabsf(a2_im)); \
 float c3 = fmaxf(fabsf(b0_re), fabsf(b0_im)); \
 float c4 = fmaxf(fabsf(b1_re), fabsf(b1_im)); \
 float c5 = fmaxf(fabsf(b2_re), fabsf(b2_im)); \
 c0 = fmaxf(c0, c1); \
 c1 = fmaxf(c2, c3); \
 c2 = fmaxf(c4, c5); \
 c0 = fmaxf(c0, c1); \
 c0 = fmaxf(c0, c2); \
 outNorm[sid] = c0; \
 float scale = __fdividef(MAX_SHORT, c0); \
 a0_re *= scale; a0_im *= scale; a1_re *= scale; a1_im *= scale; \
 a2_re *= scale; a2_im *= scale; b0_re *= scale; b0_im *= scale; \
 b1_re *= scale; b1_im *= scale; b2_re *= scale; b2_im *= scale; \
 out[sid+0*(stride)] = make_short4(f2i(a0_re), f2i(a0_im), f2i(a1_re), f2i(a1_im)); \
 out[sid+1*(stride)] = make_short4(f2i(a2_re), f2i(a2_im), f2i(b0_re), f2i(b0_im)); \
 out[sid+2*(stride)] = make_short4(f2i(b1_re), f2i(b1_im), f2i(b2_re), f2i(b2_im));
518 
/******************used by non-degenerate twisted mass**********************/
// Store a two-flavor spinor in double precision: flavor 1 (o1_*) at sid,
// flavor 2 (o2_*) offset by param.fl_stride.
#define WRITE_FLAVOR_SPINOR_DOUBLE2() \
 double2 *out = (double2*)param.out; \
 out[0*(param.sp_stride)+sid] = make_double2(o1_00_re, o1_00_im); \
 out[1*(param.sp_stride)+sid] = make_double2(o1_01_re, o1_01_im); \
 out[2*(param.sp_stride)+sid] = make_double2(o1_02_re, o1_02_im); \
 out[3*(param.sp_stride)+sid] = make_double2(o1_10_re, o1_10_im); \
 out[4*(param.sp_stride)+sid] = make_double2(o1_11_re, o1_11_im); \
 out[5*(param.sp_stride)+sid] = make_double2(o1_12_re, o1_12_im); \
 out[6*(param.sp_stride)+sid] = make_double2(o1_20_re, o1_20_im); \
 out[7*(param.sp_stride)+sid] = make_double2(o1_21_re, o1_21_im); \
 out[8*(param.sp_stride)+sid] = make_double2(o1_22_re, o1_22_im); \
 out[9*(param.sp_stride)+sid] = make_double2(o1_30_re, o1_30_im); \
 out[10*(param.sp_stride)+sid] = make_double2(o1_31_re, o1_31_im); \
 out[11*(param.sp_stride)+sid] = make_double2(o1_32_re, o1_32_im); \
 out[0*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_00_re, o2_00_im); \
 out[1*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_01_re, o2_01_im); \
 out[2*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_02_re, o2_02_im); \
 out[3*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_10_re, o2_10_im); \
 out[4*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_11_re, o2_11_im); \
 out[5*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_12_re, o2_12_im); \
 out[6*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_20_re, o2_20_im); \
 out[7*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_21_re, o2_21_im); \
 out[8*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_22_re, o2_22_im); \
 out[9*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_30_re, o2_30_im); \
 out[10*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_31_re, o2_31_im); \
 out[11*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_32_re, o2_32_im);
547 
548 
// Store a two-flavor spinor in single precision (two complex values per
// float4); flavor 2 offset by param.fl_stride.
#define WRITE_FLAVOR_SPINOR_FLOAT4() \
 float4 *out = (float4*)param.out; \
 out[0*(param.sp_stride)+sid] = make_float4(o1_00_re, o1_00_im, o1_01_re, o1_01_im); \
 out[1*(param.sp_stride)+sid] = make_float4(o1_02_re, o1_02_im, o1_10_re, o1_10_im); \
 out[2*(param.sp_stride)+sid] = make_float4(o1_11_re, o1_11_im, o1_12_re, o1_12_im); \
 out[3*(param.sp_stride)+sid] = make_float4(o1_20_re, o1_20_im, o1_21_re, o1_21_im); \
 out[4*(param.sp_stride)+sid] = make_float4(o1_22_re, o1_22_im, o1_30_re, o1_30_im); \
 out[5*(param.sp_stride)+sid] = make_float4(o1_31_re, o1_31_im, o1_32_re, o1_32_im); \
 out[0*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_00_re, o2_00_im, o2_01_re, o2_01_im); \
 out[1*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_02_re, o2_02_im, o2_10_re, o2_10_im); \
 out[2*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_11_re, o2_11_im, o2_12_re, o2_12_im); \
 out[3*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_20_re, o2_20_im, o2_21_re, o2_21_im); \
 out[4*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_22_re, o2_22_im, o2_30_re, o2_30_im); \
 out[5*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_31_re, o2_31_im, o2_32_re, o2_32_im);
563 
564 
/* Store a two-flavor spinor in half precision (short4 components plus a
   separate float norm per site and flavor).  For each flavor: compute the
   maximum absolute value over all 24 real components via a pairwise
   max-reduction, store it in param.outNorm, rescale every component by
   MAX_SHORT/norm, and write the results as six short4 vectors.  Flavor 2
   is stored at offset param.fl_stride.
   NOTE(fix): the c1/c2 reduction steps previously paired mismatched
   components (o?_01_re with o?_02_im and vice versa).  Harmless for the
   final maximum — all 24 values feed the same reduction — but corrected
   here so each c pairs one component's re/im parts. */
#define WRITE_FLAVOR_SPINOR_SHORT4() \
  short4 *out = (short4*)param.out; \
  float c0 = fmaxf(fabsf(o1_00_re), fabsf(o1_00_im)); \
  float c1 = fmaxf(fabsf(o1_01_re), fabsf(o1_01_im)); \
  float c2 = fmaxf(fabsf(o1_02_re), fabsf(o1_02_im)); \
  float c3 = fmaxf(fabsf(o1_10_re), fabsf(o1_10_im)); \
  float c4 = fmaxf(fabsf(o1_11_re), fabsf(o1_11_im)); \
  float c5 = fmaxf(fabsf(o1_12_re), fabsf(o1_12_im)); \
  float c6 = fmaxf(fabsf(o1_20_re), fabsf(o1_20_im)); \
  float c7 = fmaxf(fabsf(o1_21_re), fabsf(o1_21_im)); \
  float c8 = fmaxf(fabsf(o1_22_re), fabsf(o1_22_im)); \
  float c9 = fmaxf(fabsf(o1_30_re), fabsf(o1_30_im)); \
  float c10 = fmaxf(fabsf(o1_31_re), fabsf(o1_31_im)); \
  float c11 = fmaxf(fabsf(o1_32_re), fabsf(o1_32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  param.outNorm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o1_00_re *= scale; o1_00_im *= scale; o1_01_re *= scale; o1_01_im *= scale; \
  o1_02_re *= scale; o1_02_im *= scale; o1_10_re *= scale; o1_10_im *= scale; \
  o1_11_re *= scale; o1_11_im *= scale; o1_12_re *= scale; o1_12_im *= scale; \
  o1_20_re *= scale; o1_20_im *= scale; o1_21_re *= scale; o1_21_im *= scale; \
  o1_22_re *= scale; o1_22_im *= scale; o1_30_re *= scale; o1_30_im *= scale; \
  o1_31_re *= scale; o1_31_im *= scale; o1_32_re *= scale; o1_32_im *= scale; \
  out[sid+0*(param.sp_stride)] = make_short4(f2i(o1_00_re), f2i(o1_00_im), f2i(o1_01_re), f2i(o1_01_im)); \
  out[sid+1*(param.sp_stride)] = make_short4(f2i(o1_02_re), f2i(o1_02_im), f2i(o1_10_re), f2i(o1_10_im)); \
  out[sid+2*(param.sp_stride)] = make_short4(f2i(o1_11_re), f2i(o1_11_im), f2i(o1_12_re), f2i(o1_12_im)); \
  out[sid+3*(param.sp_stride)] = make_short4(f2i(o1_20_re), f2i(o1_20_im), f2i(o1_21_re), f2i(o1_21_im)); \
  out[sid+4*(param.sp_stride)] = make_short4(f2i(o1_22_re), f2i(o1_22_im), f2i(o1_30_re), f2i(o1_30_im)); \
  out[sid+5*(param.sp_stride)] = make_short4(f2i(o1_31_re), f2i(o1_31_im), f2i(o1_32_re), f2i(o1_32_im)); \
  c0 = fmaxf(fabsf(o2_00_re), fabsf(o2_00_im)); \
  c1 = fmaxf(fabsf(o2_01_re), fabsf(o2_01_im)); \
  c2 = fmaxf(fabsf(o2_02_re), fabsf(o2_02_im)); \
  c3 = fmaxf(fabsf(o2_10_re), fabsf(o2_10_im)); \
  c4 = fmaxf(fabsf(o2_11_re), fabsf(o2_11_im)); \
  c5 = fmaxf(fabsf(o2_12_re), fabsf(o2_12_im)); \
  c6 = fmaxf(fabsf(o2_20_re), fabsf(o2_20_im)); \
  c7 = fmaxf(fabsf(o2_21_re), fabsf(o2_21_im)); \
  c8 = fmaxf(fabsf(o2_22_re), fabsf(o2_22_im)); \
  c9 = fmaxf(fabsf(o2_30_re), fabsf(o2_30_im)); \
  c10 = fmaxf(fabsf(o2_31_re), fabsf(o2_31_im)); \
  c11 = fmaxf(fabsf(o2_32_re), fabsf(o2_32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  param.outNorm[sid+param.fl_stride] = c0; \
  scale = __fdividef(MAX_SHORT, c0); \
  o2_00_re *= scale; o2_00_im *= scale; o2_01_re *= scale; o2_01_im *= scale; \
  o2_02_re *= scale; o2_02_im *= scale; o2_10_re *= scale; o2_10_im *= scale; \
  o2_11_re *= scale; o2_11_im *= scale; o2_12_re *= scale; o2_12_im *= scale; \
  o2_20_re *= scale; o2_20_im *= scale; o2_21_re *= scale; o2_21_im *= scale; \
  o2_22_re *= scale; o2_22_im *= scale; o2_30_re *= scale; o2_30_im *= scale; \
  o2_31_re *= scale; o2_31_im *= scale; o2_32_re *= scale; o2_32_im *= scale; \
  out[sid+param.fl_stride+0*(param.sp_stride)] = make_short4(f2i(o2_00_re), f2i(o2_00_im), f2i(o2_01_re), f2i(o2_01_im)); \
  out[sid+param.fl_stride+1*(param.sp_stride)] = make_short4(f2i(o2_02_re), f2i(o2_02_im), f2i(o2_10_re), f2i(o2_10_im)); \
  out[sid+param.fl_stride+2*(param.sp_stride)] = make_short4(f2i(o2_11_re), f2i(o2_11_im), f2i(o2_12_re), f2i(o2_12_im)); \
  out[sid+param.fl_stride+3*(param.sp_stride)] = make_short4(f2i(o2_20_re), f2i(o2_20_im), f2i(o2_21_re), f2i(o2_21_im)); \
  out[sid+param.fl_stride+4*(param.sp_stride)] = make_short4(f2i(o2_22_re), f2i(o2_22_im), f2i(o2_30_re), f2i(o2_30_im)); \
  out[sid+param.fl_stride+5*(param.sp_stride)] = make_short4(f2i(o2_31_re), f2i(o2_31_im), f2i(o2_32_re), f2i(o2_32_im));
641 
642 
643 /************* the following is used by staggered *****************/
644 
/* Declare and load the three double2 components I0..I2 of a staggered
   first-neighbor spinor through the fetch_double2 texture helper. */
#define READ_1ST_NBR_SPINOR_DOUBLE_TEX(spinor, idx, mystride) \
  double2 I0 = fetch_double2((spinor), idx + 0*mystride); \
  double2 I1 = fetch_double2((spinor), idx + 1*mystride); \
  double2 I2 = fetch_double2((spinor), idx + 2*mystride);

#ifdef USE_TEXTURE_OBJECTS
/* Ghost-zone variant: with texture objects each direction has its own
   texture, so the spinor argument is indexed by dir. */
#define READ_1ST_NBR_SPINOR_GHOST_DOUBLE_TEX(spinor, idx, mystride, dir) \
  double2 I0 = fetch_double2((spinor[dir]), idx + 0*mystride); \
  double2 I1 = fetch_double2((spinor[dir]), idx + 1*mystride); \
  double2 I2 = fetch_double2((spinor[dir]), idx + 2*mystride);
#else
/* Without texture objects a single bound texture serves all directions,
   so dir is unused. */
#define READ_1ST_NBR_SPINOR_GHOST_DOUBLE_TEX(spinor, idx, mystride, dir) \
  double2 I0 = fetch_double2((spinor), idx + 0*mystride); \
  double2 I1 = fetch_double2((spinor), idx + 1*mystride); \
  double2 I2 = fetch_double2((spinor), idx + 2*mystride);
#endif
661 
/* Like READ_1ST_NBR_SPINOR_DOUBLE_TEX, but stores into the three
   caller-declared variables named T##0..T##2 instead of declaring I0..I2
   (used for the KS/long-link neighbor reads). */
#define READ_KS_NBR_SPINOR_DOUBLE_TEX(T, spinor, idx, mystride) \
  T##0 = fetch_double2((spinor), idx + 0*mystride); \
  T##1 = fetch_double2((spinor), idx + 1*mystride); \
  T##2 = fetch_double2((spinor), idx + 2*mystride);

#ifdef USE_TEXTURE_OBJECTS
/* Ghost-zone variant: per-direction texture objects, indexed by dir. */
#define READ_KS_NBR_SPINOR_GHOST_DOUBLE_TEX(T, spinor, idx, mystride, dir) \
  T##0 = fetch_double2((spinor)[dir], idx + 0*mystride); \
  T##1 = fetch_double2((spinor)[dir], idx + 1*mystride); \
  T##2 = fetch_double2((spinor)[dir], idx + 2*mystride);
#else
/* Single bound texture serves all directions; dir is unused. */
#define READ_KS_NBR_SPINOR_GHOST_DOUBLE_TEX(T, spinor, idx, mystride, dir) \
  T##0 = fetch_double2((spinor), idx + 0*mystride); \
  T##1 = fetch_double2((spinor), idx + 1*mystride); \
  T##2 = fetch_double2((spinor), idx + 2*mystride);
#endif
678 
/* Single-precision texture load of a first-neighbor staggered spinor:
   declares and fills float2 I0..I2. */
#define READ_1ST_NBR_SPINOR_SINGLE_TEX(spinor, idx, mystride) \
  float2 I0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
  float2 I1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
  float2 I2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride);

#ifdef USE_TEXTURE_OBJECTS
/* Ghost-zone variant: per-direction texture objects, indexed by dir. */
#define READ_1ST_NBR_SPINOR_GHOST_SINGLE_TEX(spinor, idx, mystride, dir) \
  float2 I0 = TEX1DFETCH(float2, (spinor)[dir], idx + 0*mystride); \
  float2 I1 = TEX1DFETCH(float2, (spinor)[dir], idx + 1*mystride); \
  float2 I2 = TEX1DFETCH(float2, (spinor)[dir], idx + 2*mystride);
#else
/* Single bound texture serves all directions; dir is unused. */
#define READ_1ST_NBR_SPINOR_GHOST_SINGLE_TEX(spinor, idx, mystride, dir) \
  float2 I0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
  float2 I1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
  float2 I2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride);
#endif
695 
/* Single-precision texture load into caller-declared T##0..T##2 (KS
   neighbor variant of READ_1ST_NBR_SPINOR_SINGLE_TEX). */
#define READ_KS_NBR_SPINOR_SINGLE_TEX(T, spinor, idx, mystride) \
  T##0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
  T##1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
  T##2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride);

#ifdef USE_TEXTURE_OBJECTS
/* Ghost-zone variant: per-direction texture objects, indexed by dir. */
#define READ_KS_NBR_SPINOR_GHOST_SINGLE_TEX(T, spinor, idx, mystride, dir) \
  T##0 = TEX1DFETCH(float2, (spinor)[dir], idx + 0*mystride); \
  T##1 = TEX1DFETCH(float2, (spinor)[dir], idx + 1*mystride); \
  T##2 = TEX1DFETCH(float2, (spinor)[dir], idx + 2*mystride);
#else
/* Single bound texture serves all directions; dir is unused. */
#define READ_KS_NBR_SPINOR_GHOST_SINGLE_TEX(T, spinor, idx, mystride, dir) \
  T##0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
  T##1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
  T##2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride);
#endif
712 
/* Half-precision texture load of a first-neighbor spinor: fetch the three
   float2 components, then rescale them by the per-site norm read from the
   companion spinor##Norm texture.
   NOTE: norm_idx1 is not a macro parameter — it must be in scope at the
   expansion site. */
#define READ_1ST_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride) \
  float2 I0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
  float2 I1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
  float2 I2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride); \
  { \
  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx1); \
  I0.x *= C; I0.y *= C; \
  I1.x *= C; I1.y *= C; \
  I2.x *= C; I2.y *= C;}

#ifdef USE_TEXTURE_OBJECTS
/* Ghost-zone variant: per-direction texture objects for both the spinor
   and its norm array. */
#define READ_1ST_NBR_SPINOR_GHOST_HALF_TEX_(spinor, idx, mystride, dir) \
  float2 I0 = TEX1DFETCH(float2, (spinor)[dir], idx + 0*mystride); \
  float2 I1 = TEX1DFETCH(float2, (spinor)[dir], idx + 1*mystride); \
  float2 I2 = TEX1DFETCH(float2, (spinor)[dir], idx + 2*mystride); \
  { \
  float C = TEX1DFETCH(float, (spinor ## Norm)[dir], norm_idx1); \
  I0.x *= C; I0.y *= C; \
  I1.x *= C; I1.y *= C; \
  I2.x *= C; I2.y *= C;}
#else
/* Single bound texture serves all directions; dir is unused. */
#define READ_1ST_NBR_SPINOR_GHOST_HALF_TEX_(spinor, idx, mystride, dir) \
  float2 I0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
  float2 I1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
  float2 I2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride); \
  { \
  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx1); \
  I0.x *= C; I0.y *= C; \
  I1.x *= C; I1.y *= C; \
  I2.x *= C; I2.y *= C;}
#endif

/* The extra macro level forces expansion of the spinor argument before the
   ## token paste in the worker macro. */
#define READ_1ST_NBR_SPINOR_HALF_TEX(spinor, idx, mystride) \
  READ_1ST_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride)

#define READ_1ST_NBR_SPINOR_GHOST_HALF_TEX(spinor, idx, mystride, dir) \
  READ_1ST_NBR_SPINOR_GHOST_HALF_TEX_(spinor, idx, mystride, dir)
750 
/* Half-precision KS-neighbor texture load into caller-declared T##0..T##2,
   rescaled by the per-site norm from spinor##Norm.
   NOTE: norm_idx3 is not a macro parameter — it must be in scope at the
   expansion site. */
#define READ_KS_NBR_SPINOR_HALF_TEX_(T, spinor, idx, mystride) \
  T##0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
  T##1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
  T##2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride); \
  { \
  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx3); \
  (T##0).x *= C; (T##0).y *= C; \
  (T##1).x *= C; (T##1).y *= C; \
  (T##2).x *= C; (T##2).y *= C;}

#ifdef USE_TEXTURE_OBJECTS
/* Ghost-zone variant: per-direction texture objects for both the spinor
   and its norm array. */
#define READ_KS_NBR_SPINOR_GHOST_HALF_TEX_(T, spinor, idx, mystride, dir) \
  T##0 = TEX1DFETCH(float2, (spinor)[dir], idx + 0*mystride); \
  T##1 = TEX1DFETCH(float2, (spinor)[dir], idx + 1*mystride); \
  T##2 = TEX1DFETCH(float2, (spinor)[dir], idx + 2*mystride); \
  { \
  float C = TEX1DFETCH(float, (spinor ## Norm)[dir], norm_idx3); \
  (T##0).x *= C; (T##0).y *= C; \
  (T##1).x *= C; (T##1).y *= C; \
  (T##2).x *= C; (T##2).y *= C;}
#else
/* Single bound texture serves all directions; dir is unused. */
#define READ_KS_NBR_SPINOR_GHOST_HALF_TEX_(T, spinor, idx, mystride, dir) \
  T##0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \
  T##1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \
  T##2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride); \
  { \
  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx3); \
  (T##0).x *= C; (T##0).y *= C; \
  (T##1).x *= C; (T##1).y *= C; \
  (T##2).x *= C; (T##2).y *= C;}
#endif

/* The extra macro level forces expansion of the spinor argument before the
   ## token paste in the worker macro. */
#define READ_KS_NBR_SPINOR_HALF_TEX(T, spinor, idx, mystride) \
  READ_KS_NBR_SPINOR_HALF_TEX_(T, spinor, idx, mystride)

#define READ_KS_NBR_SPINOR_GHOST_HALF_TEX(T, spinor, idx, mystride, dir) \
  READ_KS_NBR_SPINOR_GHOST_HALF_TEX_(T, spinor, idx, mystride, dir)
788 
/* Direct (non-texture) double-precision load of a first-neighbor spinor:
   declares and fills double2 I0..I2. */
#define READ_1ST_NBR_SPINOR_DOUBLE(spinor, idx, mystride) \
  double2 I0 = spinor[idx + 0*mystride]; \
  double2 I1 = spinor[idx + 1*mystride]; \
  double2 I2 = spinor[idx + 2*mystride];

/* Ghost-zone variant: spinor is an array of per-direction pointers. */
#define READ_1ST_NBR_SPINOR_GHOST_DOUBLE(spinor, idx, mystride, dir) \
  double2 I0 = spinor[dir][idx + 0*mystride]; \
  double2 I1 = spinor[dir][idx + 1*mystride]; \
  double2 I2 = spinor[dir][idx + 2*mystride];

/* KS-neighbor variant: stores into caller-declared T##0..T##2. */
#define READ_KS_NBR_SPINOR_DOUBLE(T, spinor, idx, mystride) \
  T##0 = spinor[idx + 0*mystride]; \
  T##1 = spinor[idx + 1*mystride]; \
  T##2 = spinor[idx + 2*mystride];

/* Direct single-precision load: declares and fills float2 I0..I2. */
#define READ_1ST_NBR_SPINOR_SINGLE(spinor, idx, mystride) \
  float2 I0 = spinor[idx + 0*mystride]; \
  float2 I1 = spinor[idx + 1*mystride]; \
  float2 I2 = spinor[idx + 2*mystride];

/* Ghost-zone variant: spinor is an array of per-direction pointers. */
#define READ_1ST_NBR_SPINOR_GHOST_SINGLE(spinor, idx, mystride, dir) \
  float2 I0 = spinor[dir][idx + 0*mystride]; \
  float2 I1 = spinor[dir][idx + 1*mystride]; \
  float2 I2 = spinor[dir][idx + 2*mystride];

/* KS-neighbor variant: stores into caller-declared T##0..T##2. */
#define READ_KS_NBR_SPINOR_SINGLE(T, spinor, idx, mystride) \
  T##0 = spinor[idx + 0*mystride]; \
  T##1 = spinor[idx + 1*mystride]; \
  T##2 = spinor[idx + 2*mystride];
818 
/* Half-precision direct (non-texture) load of a first-neighbor spinor:
   read three short2 components plus the per-site norm and reconstruct
   float2 I0..I2 as norm * short2float(component).
   NOTE(fix): the body previously hard-wired the names `in`/`inNorm`,
   silently ignoring the spinor argument; it now indexes the macro
   parameter and its pasted norm array.  Call sites that pass `in` (so
   spinor##Norm == inNorm) are unaffected. */
#define READ_1ST_NBR_SPINOR_HALF(spinor, idx, mystride) \
  float2 I0, I1, I2; \
  { \
  short2 S0 = (spinor)[idx + 0*mystride]; \
  short2 S1 = (spinor)[idx + 1*mystride]; \
  short2 S2 = (spinor)[idx + 2*mystride]; \
  float C = (spinor ## Norm)[idx]; \
  I0.x =C*short2float(S0.x); I0.y =C*short2float(S0.y); \
  I1.x =C*short2float(S1.x); I1.y =C*short2float(S1.y); \
  I2.x =C*short2float(S2.x); I2.y =C*short2float(S2.y); \
  }
830 
/* Half-precision direct load of a KS-neighbor spinor into caller-declared
   T##0..T##2, rescaled by the per-site norm.
   NOTE(fix): the body previously hard-wired `in`/`inNorm` instead of the
   spinor argument; it now indexes the macro parameter and its pasted norm
   array.  Call sites that pass `in` are unaffected. */
#define READ_KS_NBR_SPINOR_HALF(T, spinor, idx, mystride) \
  { \
  short2 S0 = (spinor)[idx + 0*mystride]; \
  short2 S1 = (spinor)[idx + 1*mystride]; \
  short2 S2 = (spinor)[idx + 2*mystride]; \
  float C = (spinor ## Norm)[idx]; \
  (T##0).x =C*short2float(S0.x); (T##0).y =C*short2float(S0.y); \
  (T##1).x =C*short2float(S1.x); (T##1).y =C*short2float(S1.y); \
  (T##2).x =C*short2float(S2.x); (T##2).y =C*short2float(S2.y); \
  }
841 
842 
/* Store a double-precision staggered spinor (three complex components
   o00..o02) at site sid with the given stride. */
#define WRITE_ST_SPINOR_DOUBLE2(out, sid, mystride) \
  ((double2*)out)[0*mystride+sid] = make_double2(o00_re, o00_im); \
  ((double2*)out)[1*mystride+sid] = make_double2(o01_re, o01_im); \
  ((double2*)out)[2*mystride+sid] = make_double2(o02_re, o02_im);

/* Single-precision variant of WRITE_ST_SPINOR_DOUBLE2. */
#define WRITE_ST_SPINOR_FLOAT2(out, sid, mystride) \
  ((float2*)out)[0*mystride+sid] = make_float2(o00_re, o00_im); \
  ((float2*)out)[1*mystride+sid] = make_float2(o01_re, o01_im); \
  ((float2*)out)[2*mystride+sid] = make_float2(o02_re, o02_im);

/* Half-precision store: compute the per-site max-abs value, save it in
   out##Norm, rescale the components by MAX_SHORT/norm (the scaled values
   are clobbered) and store them as short2. */
#define WRITE_ST_SPINOR_SHORT2(out, sid, mystride) \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  out ## Norm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; \
  ((short2*)out)[sid+0*mystride] = make_short2(f2i(o00_re), f2i(o00_im)); \
  ((short2*)out)[sid+1*mystride] = make_short2(f2i(o01_re), f2i(o01_im)); \
  ((short2*)out)[sid+2*mystride] = make_short2(f2i(o02_re), f2i(o02_im));
866 
// Non-cached (streaming) writes to minimize cache pollution
/* Streaming store of a double-precision staggered spinor; assumes out is
   a typed double2 pointer. */
#define WRITE_ST_SPINOR_DOUBLE2_STR(out, sid, mystride) \
  store_streaming_double2(&out[0*mystride+sid], o00_re, o00_im); \
  store_streaming_double2(&out[1*mystride+sid], o01_re, o01_im); \
  store_streaming_double2(&out[2*mystride+sid], o02_re, o02_im);

/* Streaming store of a single-precision staggered spinor. */
#define WRITE_ST_SPINOR_FLOAT2_STR(out, sid, mystride) \
  store_streaming_float2(&out[0*mystride+sid], o00_re, o00_im); \
  store_streaming_float2(&out[1*mystride+sid], o01_re, o01_im); \
  store_streaming_float2(&out[2*mystride+sid], o02_re, o02_im);
877 
/* Streaming half-precision store: compute the per-site max-abs norm,
   save it in out##Norm, rescale by MAX_SHORT/norm and store as short2
   with non-caching stores.
   NOTE(fix): the component stores previously went through the hard-coded
   pointer `g_out` while the norm store used the `out` argument; both now
   use `out` (call sites passing g_out are unaffected). */
#define WRITE_ST_SPINOR_SHORT2_STR(out, sid, mystride) \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  out ## Norm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; \
  store_streaming_short2(&((short2*)out)[0*mystride+sid], f2i(o00_re), f2i(o00_im)); \
  store_streaming_short2(&((short2*)out)[1*mystride+sid], f2i(o01_re), f2i(o01_im)); \
  store_streaming_short2(&((short2*)out)[2*mystride+sid], f2i(o02_re), f2i(o02_im));
891 
/* Fetch a double-precision staggered spinor through the texture path and
   accumulate it into the o00..o02 register accumulators. */
#define READ_AND_SUM_ST_SPINOR_DOUBLE_TEX(spinor,sid) { \
  double2 tmp0 = fetch_double2((spinor), sid + 0*(param.sp_stride)); \
  double2 tmp1 = fetch_double2((spinor), sid + 1*(param.sp_stride)); \
  double2 tmp2 = fetch_double2((spinor), sid + 2*(param.sp_stride)); \
  o00_re += tmp0.x; o00_im += tmp0.y; \
  o01_re += tmp1.x; o01_im += tmp1.y; \
  o02_re += tmp2.x; o02_im += tmp2.y; }

/* Single-precision variant of READ_AND_SUM_ST_SPINOR_DOUBLE_TEX. */
#define READ_AND_SUM_ST_SPINOR_SINGLE_TEX(spinor,sid) { \
  float2 tmp0 = TEX1DFETCH(float2, (spinor), sid + 0*(param.sp_stride)); \
  float2 tmp1 = TEX1DFETCH(float2, (spinor), sid + 1*(param.sp_stride)); \
  float2 tmp2 = TEX1DFETCH(float2, (spinor), sid + 2*(param.sp_stride)); \
  o00_re += tmp0.x; o00_im += tmp0.y; \
  o01_re += tmp1.x; o01_im += tmp1.y; \
  o02_re += tmp2.x; o02_im += tmp2.y; }

/* Half-precision variant: components are rescaled by the per-site norm
   fetched from spinor##Norm before accumulation. */
#define READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor,sid) { \
  float2 tmp0 = TEX1DFETCH(float2, (spinor), sid + 0*param.sp_stride); \
  float2 tmp1 = TEX1DFETCH(float2, (spinor), sid + 1*param.sp_stride); \
  float2 tmp2 = TEX1DFETCH(float2, (spinor), sid + 2*param.sp_stride); \
  float C = TEX1DFETCH(float, (spinor##Norm), sid); \
  o00_re += C*tmp0.x; o00_im += C*tmp0.y; \
  o01_re += C*tmp1.x; o01_im += C*tmp1.y; \
  o02_re += C*tmp2.x; o02_im += C*tmp2.y; }

/* The extra macro level forces expansion of the spinor argument before the
   ## token paste in the worker macro. */
#define READ_AND_SUM_ST_SPINOR_HALF_TEX(spinor,sid) \
  READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor,sid)
919 
/* Read a staggered spinor directly from memory and accumulate it into the
   o00..o02 register accumulators.
   NOTE(fix): a stray trailing line-continuation backslash used to extend
   this macro onto the following blank line; removed. */
#define READ_AND_SUM_ST_SPINOR(spinor,sid) \
  o00_re += spinor[0*param.sp_stride+sid].x; o00_im += spinor[0*param.sp_stride+sid].y; \
  o01_re += spinor[1*param.sp_stride+sid].x; o01_im += spinor[1*param.sp_stride+sid].y; \
  o02_re += spinor[2*param.sp_stride+sid].x; o02_im += spinor[2*param.sp_stride+sid].y;
924 
/* Half-precision direct read-and-accumulate: rescale each short component
   by the per-site norm from spinor##Norm before summing.
   NOTE: the body is not brace-wrapped, so the local C leaks into the
   enclosing scope — expand at most once per scope. */
#define READ_AND_SUM_ST_SPINOR_HALF_(spinor,sid) \
  float C = spinor ## Norm[sid]; \
  o00_re += C*short2float(spinor[0*param.sp_stride + sid].x); \
  o00_im += C*short2float(spinor[0*param.sp_stride + sid].y); \
  o01_re += C*short2float(spinor[1*param.sp_stride + sid].x); \
  o01_im += C*short2float(spinor[1*param.sp_stride + sid].y); \
  o02_re += C*short2float(spinor[2*param.sp_stride + sid].x); \
  o02_im += C*short2float(spinor[2*param.sp_stride + sid].y);

/* The extra macro level forces expansion of the spinor argument before the
   ## token paste in the worker macro. */
#define READ_AND_SUM_ST_SPINOR_HALF(spinor,sid) \
  READ_AND_SUM_ST_SPINOR_HALF_(spinor,sid)
936 
/* Load the accumulator spinor (e.g. the x field of an axpy-style update)
   through the texture path into accum0..accum2, double precision. */
#define READ_ST_ACCUM_DOUBLE_TEX(spinor,sid) \
  double2 accum0 = fetch_double2((spinor), sid + 0*(param.sp_stride)); \
  double2 accum1 = fetch_double2((spinor), sid + 1*(param.sp_stride)); \
  double2 accum2 = fetch_double2((spinor), sid + 2*(param.sp_stride));

/* Single-precision variant. */
#define READ_ST_ACCUM_SINGLE_TEX(spinor,sid) \
  float2 accum0 = TEX1DFETCH(float2, (spinor), sid + 0*param.sp_stride); \
  float2 accum1 = TEX1DFETCH(float2, (spinor), sid + 1*param.sp_stride); \
  float2 accum2 = TEX1DFETCH(float2, (spinor), sid + 2*param.sp_stride);

/* Half-precision variant: components are rescaled by the per-site norm
   fetched from spinor##Norm.  Not brace-wrapped, so C leaks into the
   enclosing scope. */
#define READ_ST_ACCUM_HALF_TEX_(spinor,sid) \
  float2 accum0 = TEX1DFETCH(float2, (spinor), sid + 0*param.sp_stride); \
  float2 accum1 = TEX1DFETCH(float2, (spinor), sid + 1*param.sp_stride); \
  float2 accum2 = TEX1DFETCH(float2, (spinor), sid + 2*param.sp_stride); \
  float C = TEX1DFETCH(float, (spinor ## Norm), sid); \
  accum0.x *= C; accum0.y *= C; \
  accum1.x *= C; accum1.y *= C; \
  accum2.x *= C; accum2.y *= C;

/* The extra macro level forces expansion of the spinor argument before the
   ## token paste in the worker macro. */
#define READ_ST_ACCUM_HALF_TEX(spinor,sid) READ_ST_ACCUM_HALF_TEX_(spinor,sid)

/* Direct-memory double-precision accumulator load. */
#define READ_ST_ACCUM_DOUBLE(spinor,sid) \
  double2 accum0 = spinor[sid + 0*(param.sp_stride)]; \
  double2 accum1 = spinor[sid + 1*(param.sp_stride)]; \
  double2 accum2 = spinor[sid + 2*(param.sp_stride)];

/* Direct-memory single-precision accumulator load. */
#define READ_ST_ACCUM_SINGLE(spinor,sid) \
  float2 accum0 = spinor[sid + 0*(param.sp_stride)]; \
  float2 accum1 = spinor[sid + 1*(param.sp_stride)]; \
  float2 accum2 = spinor[sid + 2*(param.sp_stride)];
967 
/* Half-precision direct accumulator load: read three short2 components
   plus the per-site norm from spinor##Norm and reconstruct float2
   accum0..accum2.
   NOTE(fix): the component reads previously went through the hard-coded
   name `x` while the norm read already used spinor##Norm; both now use
   the spinor argument (call sites passing `x` are unaffected). */
#define READ_ST_ACCUM_HALF(spinor,sid) \
  float2 accum0, accum1, accum2; \
  { \
  short2 S0 = (spinor)[sid + 0*param.sp_stride]; \
  short2 S1 = (spinor)[sid + 1*param.sp_stride]; \
  short2 S2 = (spinor)[sid + 2*param.sp_stride]; \
  float C = spinor##Norm[sid]; \
  accum0.x =C*short2float(S0.x); accum0.y =C*short2float(S0.y); \
  accum1.x =C*short2float(S1.x); accum1.y =C*short2float(S1.y); \
  accum2.x =C*short2float(S2.x); accum2.y =C*short2float(S2.y); \
  }
979 
/* Store the 24 real components of a spinor (variables prefixed reg##) into
   dynamic shared memory.  Threads are grouped into tiles of SHARED_STRIDE:
   each thread's base offset places successive components SHARED_STRIDE
   apart, so threads within a tile access consecutive shared-memory words. */
#define WRITE_SPINOR_SHARED_REAL(tx, ty, tz, reg) \
  extern __shared__ char s_data[]; \
  spinorFloat *sh = (spinorFloat*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
  ((tx+blockDim.x*(ty+blockDim.y*tz))/SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  sh[0*SHARED_STRIDE] = reg##00_re; \
  sh[1*SHARED_STRIDE] = reg##00_im; \
  sh[2*SHARED_STRIDE] = reg##01_re; \
  sh[3*SHARED_STRIDE] = reg##01_im; \
  sh[4*SHARED_STRIDE] = reg##02_re; \
  sh[5*SHARED_STRIDE] = reg##02_im; \
  sh[6*SHARED_STRIDE] = reg##10_re; \
  sh[7*SHARED_STRIDE] = reg##10_im; \
  sh[8*SHARED_STRIDE] = reg##11_re; \
  sh[9*SHARED_STRIDE] = reg##11_im; \
  sh[10*SHARED_STRIDE] = reg##12_re; \
  sh[11*SHARED_STRIDE] = reg##12_im; \
  sh[12*SHARED_STRIDE] = reg##20_re; \
  sh[13*SHARED_STRIDE] = reg##20_im; \
  sh[14*SHARED_STRIDE] = reg##21_re; \
  sh[15*SHARED_STRIDE] = reg##21_im; \
  sh[16*SHARED_STRIDE] = reg##22_re; \
  sh[17*SHARED_STRIDE] = reg##22_im; \
  sh[18*SHARED_STRIDE] = reg##30_re; \
  sh[19*SHARED_STRIDE] = reg##30_im; \
  sh[20*SHARED_STRIDE] = reg##31_re; \
  sh[21*SHARED_STRIDE] = reg##31_im; \
  sh[22*SHARED_STRIDE] = reg##32_re; \
  sh[23*SHARED_STRIDE] = reg##32_im;

/* Double precision shares the generic real-component layout. */
#define WRITE_SPINOR_SHARED_DOUBLE2 WRITE_SPINOR_SHARED_REAL

/* Read the 24 real components back from shared memory, repacking pairs of
   reals into double2 I0..I11 (same layout as WRITE_SPINOR_SHARED_REAL). */
#define READ_SPINOR_SHARED_DOUBLE2(tx, ty, tz) \
  extern __shared__ char s_data[]; \
  double *sh = (double*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
  ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  double2 I0 = make_double2(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE]); \
  double2 I1 = make_double2(sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \
  double2 I2 = make_double2(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE]); \
  double2 I3 = make_double2(sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \
  double2 I4 = make_double2(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE]); \
  double2 I5 = make_double2(sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \
  double2 I6 = make_double2(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE]); \
  double2 I7 = make_double2(sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \
  double2 I8 = make_double2(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE]); \
  double2 I9 = make_double2(sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \
  double2 I10 = make_double2(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE]); \
  double2 I11 = make_double2(sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]);
1027 
#ifndef SHARED_8_BYTE_WORD_SIZE // 4-byte shared memory access

/* With 4-byte shared-memory words, single precision shares the generic
   per-real-component layout. */
#define WRITE_SPINOR_SHARED_FLOAT4 WRITE_SPINOR_SHARED_REAL

/* Read 24 floats back from shared memory and repack them four at a time
   into float4 I0..I5 (same layout as WRITE_SPINOR_SHARED_REAL). */
#define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz) \
  extern __shared__ char s_data[]; \
  float *sh = (float*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
  ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  float4 I0 = make_float4(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE], sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \
  float4 I1 = make_float4(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE], sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \
  float4 I2 = make_float4(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE], sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \
  float4 I3 = make_float4(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE], sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \
  float4 I4 = make_float4(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE], sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \
  float4 I5 = make_float4(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE], sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]);

#else // 8-byte shared memory words

/* With 8-byte shared-memory words, pack pairs of reals into float2 so each
   shared access moves a full word; 12 float2 slots per thread. */
#define WRITE_SPINOR_SHARED_FLOAT4(tx, ty, tz, reg) \
  extern __shared__ char s_data[]; \
  float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \
  ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  sh[0*SHARED_STRIDE] = make_float2(reg##00_re, reg##00_im); \
  sh[1*SHARED_STRIDE] = make_float2(reg##01_re, reg##01_im); \
  sh[2*SHARED_STRIDE] = make_float2(reg##02_re, reg##02_im); \
  sh[3*SHARED_STRIDE] = make_float2(reg##10_re, reg##10_im); \
  sh[4*SHARED_STRIDE] = make_float2(reg##11_re, reg##11_im); \
  sh[5*SHARED_STRIDE] = make_float2(reg##12_re, reg##12_im); \
  sh[6*SHARED_STRIDE] = make_float2(reg##20_re, reg##20_im); \
  sh[7*SHARED_STRIDE] = make_float2(reg##21_re, reg##21_im); \
  sh[8*SHARED_STRIDE] = make_float2(reg##22_re, reg##22_im); \
  sh[9*SHARED_STRIDE] = make_float2(reg##30_re, reg##30_im); \
  sh[10*SHARED_STRIDE] = make_float2(reg##31_re, reg##31_im); \
  sh[11*SHARED_STRIDE] = make_float2(reg##32_re, reg##32_im);

/* Read the 12 float2 slots back and merge adjacent pairs into float4
   I0..I5 (inverse of the 8-byte WRITE_SPINOR_SHARED_FLOAT4 layout). */
#define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz) \
  extern __shared__ char s_data[]; \
  float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \
  ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  float2 tmp1, tmp2; \
  tmp1 = sh[0*SHARED_STRIDE]; tmp2 = sh[1*SHARED_STRIDE]; float4 I0 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[2*SHARED_STRIDE]; tmp2 = sh[3*SHARED_STRIDE]; float4 I1 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[4*SHARED_STRIDE]; tmp2 = sh[5*SHARED_STRIDE]; float4 I2 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[6*SHARED_STRIDE]; tmp2 = sh[7*SHARED_STRIDE]; float4 I3 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[8*SHARED_STRIDE]; tmp2 = sh[9*SHARED_STRIDE]; float4 I4 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[10*SHARED_STRIDE]; tmp2 = sh[11*SHARED_STRIDE]; float4 I5 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y);

#endif
1075 
1076 
/* Load the two-flavor double-precision accumulator spinor: 12 double2
   components per flavor, flavor 2 at offset fl_stride.  sid must be in
   scope at the expansion site.
   NOTE(fix): the stride and fl_stride arguments are now parenthesized in
   the expansion, matching READ_ACCUM_FLAVOR_SINGLE (macro-argument
   hygiene; identical behavior for simple arguments). */
#define READ_ACCUM_FLAVOR_DOUBLE(spinor, stride, fl_stride) \
  double2 flv1_accum0 = spinor[sid + 0*(stride)]; \
  double2 flv1_accum1 = spinor[sid + 1*(stride)]; \
  double2 flv1_accum2 = spinor[sid + 2*(stride)]; \
  double2 flv1_accum3 = spinor[sid + 3*(stride)]; \
  double2 flv1_accum4 = spinor[sid + 4*(stride)]; \
  double2 flv1_accum5 = spinor[sid + 5*(stride)]; \
  double2 flv1_accum6 = spinor[sid + 6*(stride)]; \
  double2 flv1_accum7 = spinor[sid + 7*(stride)]; \
  double2 flv1_accum8 = spinor[sid + 8*(stride)]; \
  double2 flv1_accum9 = spinor[sid + 9*(stride)]; \
  double2 flv1_accum10 = spinor[sid + 10*(stride)]; \
  double2 flv1_accum11 = spinor[sid + 11*(stride)]; \
  double2 flv2_accum0 = spinor[sid + (fl_stride) + 0*(stride)]; \
  double2 flv2_accum1 = spinor[sid + (fl_stride) + 1*(stride)]; \
  double2 flv2_accum2 = spinor[sid + (fl_stride) + 2*(stride)]; \
  double2 flv2_accum3 = spinor[sid + (fl_stride) + 3*(stride)]; \
  double2 flv2_accum4 = spinor[sid + (fl_stride) + 4*(stride)]; \
  double2 flv2_accum5 = spinor[sid + (fl_stride) + 5*(stride)]; \
  double2 flv2_accum6 = spinor[sid + (fl_stride) + 6*(stride)]; \
  double2 flv2_accum7 = spinor[sid + (fl_stride) + 7*(stride)]; \
  double2 flv2_accum8 = spinor[sid + (fl_stride) + 8*(stride)]; \
  double2 flv2_accum9 = spinor[sid + (fl_stride) + 9*(stride)]; \
  double2 flv2_accum10 = spinor[sid + (fl_stride) + 10*(stride)]; \
  double2 flv2_accum11 = spinor[sid + (fl_stride) + 11*(stride)];
1103 
1104 
/* Load the two-flavor single-precision accumulator spinor: 6 float4
   components per flavor, flavor 2 at offset flv_stride.  sid must be in
   scope at the expansion site. */
#define READ_ACCUM_FLAVOR_SINGLE(spinor, stride, flv_stride) \
  float4 flv1_accum0 = spinor[sid + 0*(stride)]; \
  float4 flv1_accum1 = spinor[sid + 1*(stride)]; \
  float4 flv1_accum2 = spinor[sid + 2*(stride)]; \
  float4 flv1_accum3 = spinor[sid + 3*(stride)]; \
  float4 flv1_accum4 = spinor[sid + 4*(stride)]; \
  float4 flv1_accum5 = spinor[sid + 5*(stride)]; \
  float4 flv2_accum0 = spinor[sid + flv_stride + 0*(stride)]; \
  float4 flv2_accum1 = spinor[sid + flv_stride + 1*(stride)]; \
  float4 flv2_accum2 = spinor[sid + flv_stride + 2*(stride)]; \
  float4 flv2_accum3 = spinor[sid + flv_stride + 3*(stride)]; \
  float4 flv2_accum4 = spinor[sid + flv_stride + 4*(stride)]; \
  float4 flv2_accum5 = spinor[sid + flv_stride + 5*(stride)];
1118 
1119 
/* Half-precision two-flavor accumulator load: convert 6 short4 components
   per flavor to float4 and rescale each flavor by its per-site norm from
   spinor##Norm; flavor 2 lives at offset flv_stride.  sid must be in scope
   at the expansion site.
   NOTE(fix): the flavor-2 norm was read at `sid + fl_stride`, but the
   macro parameter is `flv_stride` (fl_stride is not defined here); now
   reads at sid + (flv_stride).  The stride arguments are also
   parenthesized for macro-argument hygiene. */
#define READ_ACCUM_FLAVOR_HALF_(spinor, stride, flv_stride) \
  float4 flv1_accum0 = short42float4(spinor[sid + 0*(stride)]); \
  float4 flv1_accum1 = short42float4(spinor[sid + 1*(stride)]); \
  float4 flv1_accum2 = short42float4(spinor[sid + 2*(stride)]); \
  float4 flv1_accum3 = short42float4(spinor[sid + 3*(stride)]); \
  float4 flv1_accum4 = short42float4(spinor[sid + 4*(stride)]); \
  float4 flv1_accum5 = short42float4(spinor[sid + 5*(stride)]); \
  float C = (spinor ## Norm)[sid]; \
  flv1_accum0.x *= C; flv1_accum0.y *= C; flv1_accum0.z *= C; flv1_accum0.w *= C; \
  flv1_accum1.x *= C; flv1_accum1.y *= C; flv1_accum1.z *= C; flv1_accum1.w *= C; \
  flv1_accum2.x *= C; flv1_accum2.y *= C; flv1_accum2.z *= C; flv1_accum2.w *= C; \
  flv1_accum3.x *= C; flv1_accum3.y *= C; flv1_accum3.z *= C; flv1_accum3.w *= C; \
  flv1_accum4.x *= C; flv1_accum4.y *= C; flv1_accum4.z *= C; flv1_accum4.w *= C; \
  flv1_accum5.x *= C; flv1_accum5.y *= C; flv1_accum5.z *= C; flv1_accum5.w *= C; \
  float4 flv2_accum0 = short42float4(spinor[sid + (flv_stride) + 0*(stride)]); \
  float4 flv2_accum1 = short42float4(spinor[sid + (flv_stride) + 1*(stride)]); \
  float4 flv2_accum2 = short42float4(spinor[sid + (flv_stride) + 2*(stride)]); \
  float4 flv2_accum3 = short42float4(spinor[sid + (flv_stride) + 3*(stride)]); \
  float4 flv2_accum4 = short42float4(spinor[sid + (flv_stride) + 4*(stride)]); \
  float4 flv2_accum5 = short42float4(spinor[sid + (flv_stride) + 5*(stride)]); \
  C = (spinor ## Norm)[sid + (flv_stride)]; \
  flv2_accum0.x *= C; flv2_accum0.y *= C; flv2_accum0.z *= C; flv2_accum0.w *= C; \
  flv2_accum1.x *= C; flv2_accum1.y *= C; flv2_accum1.z *= C; flv2_accum1.w *= C; \
  flv2_accum2.x *= C; flv2_accum2.y *= C; flv2_accum2.z *= C; flv2_accum2.w *= C; \
  flv2_accum3.x *= C; flv2_accum3.y *= C; flv2_accum3.z *= C; flv2_accum3.w *= C; \
  flv2_accum4.x *= C; flv2_accum4.y *= C; flv2_accum4.z *= C; flv2_accum4.w *= C; \
  flv2_accum5.x *= C; flv2_accum5.y *= C; flv2_accum5.z *= C; flv2_accum5.w *= C;

/* The extra macro level forces expansion of the spinor argument before the
   ## token paste in the worker macro. */
#define READ_ACCUM_FLAVOR_HALF(spinor, stride, flv_stride) READ_ACCUM_FLAVOR_HALF_(spinor, stride, flv_stride)
1149 
1150 
// Read a two-flavor double-precision accumulator spinor from texture `spinor`
// at site index `sid` (assumed in scope at the expansion point).  Flavor 1
// occupies components sid + c*(stride) for c = 0..11; flavor 2 is offset by
// flv_stride.  Declares flv1_accum0..11 and flv2_accum0..11 (double2).
// Every argument use is parenthesized (macro hygiene): previously flv_stride
// was not, so a compound expression passed for it expanded with the wrong
// precedence.
#define READ_ACCUM_FLAVOR_DOUBLE_TEX(spinor, stride, flv_stride) \
  double2 flv1_accum0 = fetch_double2((spinor), sid + 0*(stride)); \
  double2 flv1_accum1 = fetch_double2((spinor), sid + 1*(stride)); \
  double2 flv1_accum2 = fetch_double2((spinor), sid + 2*(stride)); \
  double2 flv1_accum3 = fetch_double2((spinor), sid + 3*(stride)); \
  double2 flv1_accum4 = fetch_double2((spinor), sid + 4*(stride)); \
  double2 flv1_accum5 = fetch_double2((spinor), sid + 5*(stride)); \
  double2 flv1_accum6 = fetch_double2((spinor), sid + 6*(stride)); \
  double2 flv1_accum7 = fetch_double2((spinor), sid + 7*(stride)); \
  double2 flv1_accum8 = fetch_double2((spinor), sid + 8*(stride)); \
  double2 flv1_accum9 = fetch_double2((spinor), sid + 9*(stride)); \
  double2 flv1_accum10 = fetch_double2((spinor), sid + 10*(stride)); \
  double2 flv1_accum11 = fetch_double2((spinor), sid + 11*(stride)); \
  double2 flv2_accum0 = fetch_double2((spinor), sid + (flv_stride) + 0*(stride)); \
  double2 flv2_accum1 = fetch_double2((spinor), sid + (flv_stride) + 1*(stride)); \
  double2 flv2_accum2 = fetch_double2((spinor), sid + (flv_stride) + 2*(stride)); \
  double2 flv2_accum3 = fetch_double2((spinor), sid + (flv_stride) + 3*(stride)); \
  double2 flv2_accum4 = fetch_double2((spinor), sid + (flv_stride) + 4*(stride)); \
  double2 flv2_accum5 = fetch_double2((spinor), sid + (flv_stride) + 5*(stride)); \
  double2 flv2_accum6 = fetch_double2((spinor), sid + (flv_stride) + 6*(stride)); \
  double2 flv2_accum7 = fetch_double2((spinor), sid + (flv_stride) + 7*(stride)); \
  double2 flv2_accum8 = fetch_double2((spinor), sid + (flv_stride) + 8*(stride)); \
  double2 flv2_accum9 = fetch_double2((spinor), sid + (flv_stride) + 9*(stride)); \
  double2 flv2_accum10 = fetch_double2((spinor), sid + (flv_stride) + 10*(stride)); \
  double2 flv2_accum11 = fetch_double2((spinor), sid + (flv_stride) + 11*(stride));
1176 
1177 
// Read a two-flavor single-precision accumulator spinor from texture `spinor`
// at site `sid` (in scope at the expansion point): 6 float4 components per
// flavor, flavor 2 offset by flv_stride.  Declares flv1_accum0..5 and
// flv2_accum0..5.  Every argument use is parenthesized (macro hygiene):
// previously flv_stride was not, so a low-precedence expression passed for it
// expanded with the wrong precedence.
#define READ_ACCUM_FLAVOR_SINGLE_TEX(spinor, stride, flv_stride) \
  float4 flv1_accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
  float4 flv1_accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
  float4 flv1_accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
  float4 flv1_accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
  float4 flv1_accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
  float4 flv1_accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride)); \
  float4 flv2_accum0 = TEX1DFETCH(float4, (spinor), sid + (flv_stride) + 0*(stride)); \
  float4 flv2_accum1 = TEX1DFETCH(float4, (spinor), sid + (flv_stride) + 1*(stride)); \
  float4 flv2_accum2 = TEX1DFETCH(float4, (spinor), sid + (flv_stride) + 2*(stride)); \
  float4 flv2_accum3 = TEX1DFETCH(float4, (spinor), sid + (flv_stride) + 3*(stride)); \
  float4 flv2_accum4 = TEX1DFETCH(float4, (spinor), sid + (flv_stride) + 4*(stride)); \
  float4 flv2_accum5 = TEX1DFETCH(float4, (spinor), sid + (flv_stride) + 5*(stride));
1191 
// Read a two-flavor half-precision accumulator spinor from texture `spinor`,
// rescaling each flavor by its per-site norm read from the pasted texture
// `spinor ## Norm` (so `spinor` must be a plain identifier).  Declares
// flv1_accum0..5 and flv2_accum0..5 (float4) plus a scratch float C that is
// reused for the second flavor's norm.  Every argument use is parenthesized
// (macro hygiene): previously flv_stride was not, so a low-precedence
// expression passed for it expanded with the wrong precedence.
#define READ_ACCUM_HALF_FLAVOR_TEX_(spinor, stride, flv_stride) \
  float4 flv1_accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
  float4 flv1_accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
  float4 flv1_accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
  float4 flv1_accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
  float4 flv1_accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
  float4 flv1_accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), sid); \
  flv1_accum0.x *= C; flv1_accum0.y *= C; flv1_accum0.z *= C; flv1_accum0.w *= C; \
  flv1_accum1.x *= C; flv1_accum1.y *= C; flv1_accum1.z *= C; flv1_accum1.w *= C; \
  flv1_accum2.x *= C; flv1_accum2.y *= C; flv1_accum2.z *= C; flv1_accum2.w *= C; \
  flv1_accum3.x *= C; flv1_accum3.y *= C; flv1_accum3.z *= C; flv1_accum3.w *= C; \
  flv1_accum4.x *= C; flv1_accum4.y *= C; flv1_accum4.z *= C; flv1_accum4.w *= C; \
  flv1_accum5.x *= C; flv1_accum5.y *= C; flv1_accum5.z *= C; flv1_accum5.w *= C; \
  float4 flv2_accum0 = TEX1DFETCH(float4, (spinor), sid + (flv_stride) + 0*(stride)); \
  float4 flv2_accum1 = TEX1DFETCH(float4, (spinor), sid + (flv_stride) + 1*(stride)); \
  float4 flv2_accum2 = TEX1DFETCH(float4, (spinor), sid + (flv_stride) + 2*(stride)); \
  float4 flv2_accum3 = TEX1DFETCH(float4, (spinor), sid + (flv_stride) + 3*(stride)); \
  float4 flv2_accum4 = TEX1DFETCH(float4, (spinor), sid + (flv_stride) + 4*(stride)); \
  float4 flv2_accum5 = TEX1DFETCH(float4, (spinor), sid + (flv_stride) + 5*(stride)); \
  C = TEX1DFETCH(float, (spinor ## Norm), sid + (flv_stride)); \
  flv2_accum0.x *= C; flv2_accum0.y *= C; flv2_accum0.z *= C; flv2_accum0.w *= C; \
  flv2_accum1.x *= C; flv2_accum1.y *= C; flv2_accum1.z *= C; flv2_accum1.w *= C; \
  flv2_accum2.x *= C; flv2_accum2.y *= C; flv2_accum2.z *= C; flv2_accum2.w *= C; \
  flv2_accum3.x *= C; flv2_accum3.y *= C; flv2_accum3.z *= C; flv2_accum3.w *= C; \
  flv2_accum4.x *= C; flv2_accum4.y *= C; flv2_accum4.z *= C; flv2_accum4.w *= C; \
  flv2_accum5.x *= C; flv2_accum5.y *= C; flv2_accum5.z *= C; flv2_accum5.w *= C;
1219 
1220 
1221 #define READ_ACCUM_FLAVOR_HALF_TEX(spinor, stride, flv_stride) READ_ACCUM_HALF_FLAVOR_TEX_(spinor, stride, flv_stride)
1222 
//single-flavor accumulator macros (direct loads; fl_stride selects the flavor block):
1224 
// Assign the 12 double2 accumulator registers accum0..accum11 (declared at
// the expansion point) from `spinor` at site `sid` (in scope), with fl_stride
// selecting the flavor block.  Every argument use is parenthesized (macro
// hygiene, matching ASSN_ACCUM_SINGLE): previously `stride` was bare, so a
// compound expression such as `a + b` expanded as e.g. `11*a + b`.
#define ASSN_ACCUM_DOUBLE(spinor, stride, fl_stride) \
  accum0 = (spinor)[sid + (fl_stride) + 0*(stride)]; \
  accum1 = (spinor)[sid + (fl_stride) + 1*(stride)]; \
  accum2 = (spinor)[sid + (fl_stride) + 2*(stride)]; \
  accum3 = (spinor)[sid + (fl_stride) + 3*(stride)]; \
  accum4 = (spinor)[sid + (fl_stride) + 4*(stride)]; \
  accum5 = (spinor)[sid + (fl_stride) + 5*(stride)]; \
  accum6 = (spinor)[sid + (fl_stride) + 6*(stride)]; \
  accum7 = (spinor)[sid + (fl_stride) + 7*(stride)]; \
  accum8 = (spinor)[sid + (fl_stride) + 8*(stride)]; \
  accum9 = (spinor)[sid + (fl_stride) + 9*(stride)]; \
  accum10 = (spinor)[sid + (fl_stride) + 10*(stride)]; \
  accum11 = (spinor)[sid + (fl_stride) + 11*(stride)];
1238 
// Assign the 6 float4 accumulator registers accum0..accum5 (declared at the
// expansion point) from `spinor` at site `sid` (in scope), with fl_stride
// selecting the flavor block.  Every argument use is parenthesized (macro
// hygiene): previously fl_stride was bare, so a low-precedence expression
// (e.g. a conditional) passed for it expanded with the wrong precedence.
#define ASSN_ACCUM_SINGLE(spinor, stride, fl_stride) \
  accum0 = (spinor)[sid + (fl_stride) + 0*(stride)]; \
  accum1 = (spinor)[sid + (fl_stride) + 1*(stride)]; \
  accum2 = (spinor)[sid + (fl_stride) + 2*(stride)]; \
  accum3 = (spinor)[sid + (fl_stride) + 3*(stride)]; \
  accum4 = (spinor)[sid + (fl_stride) + 4*(stride)]; \
  accum5 = (spinor)[sid + (fl_stride) + 5*(stride)];
1246 
// Assign accum0..accum5 (float4, declared at the expansion point) from the
// half-precision spinor field `spinor`, converting via short42float4 and then
// rescaling by the per-site norm read from the pasted array `spinor ## Norm`
// (so `spinor` must be a plain identifier).  The norm scale C lives in its
// own block scope.  Every argument use is parenthesized (macro hygiene):
// previously `stride` and `fl_stride` were bare, so compound expressions
// passed for them expanded with the wrong precedence.
#define ASSN_ACCUM_HALF_(spinor, stride, fl_stride) \
  accum0 = short42float4((spinor)[sid + (fl_stride) + 0*(stride)]); \
  accum1 = short42float4((spinor)[sid + (fl_stride) + 1*(stride)]); \
  accum2 = short42float4((spinor)[sid + (fl_stride) + 2*(stride)]); \
  accum3 = short42float4((spinor)[sid + (fl_stride) + 3*(stride)]); \
  accum4 = short42float4((spinor)[sid + (fl_stride) + 4*(stride)]); \
  accum5 = short42float4((spinor)[sid + (fl_stride) + 5*(stride)]); \
  {\
  float C = (spinor ## Norm)[sid + (fl_stride)]; \
  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C; \
  }
1263 
1264 
1265 #define ASSN_ACCUM_HALF(spinor, stride, fl_stride) ASSN_ACCUM_HALF_(spinor, stride, fl_stride)
1266 
//single-flavor accumulator macros (texture-fetch variants):
1268 
// Texture variant of ASSN_ACCUM_DOUBLE: assign accum0..accum11 (double2,
// declared at the expansion point) from texture `spinor` at site `sid`, with
// fl_stride selecting the flavor block.  Every argument use is parenthesized
// (macro hygiene): previously fl_stride was bare, so a low-precedence
// expression passed for it expanded with the wrong precedence.
#define ASSN_ACCUM_DOUBLE_TEX(spinor, stride, fl_stride) \
  accum0 = fetch_double2((spinor), sid + (fl_stride) + 0*(stride)); \
  accum1 = fetch_double2((spinor), sid + (fl_stride) + 1*(stride)); \
  accum2 = fetch_double2((spinor), sid + (fl_stride) + 2*(stride)); \
  accum3 = fetch_double2((spinor), sid + (fl_stride) + 3*(stride)); \
  accum4 = fetch_double2((spinor), sid + (fl_stride) + 4*(stride)); \
  accum5 = fetch_double2((spinor), sid + (fl_stride) + 5*(stride)); \
  accum6 = fetch_double2((spinor), sid + (fl_stride) + 6*(stride)); \
  accum7 = fetch_double2((spinor), sid + (fl_stride) + 7*(stride)); \
  accum8 = fetch_double2((spinor), sid + (fl_stride) + 8*(stride)); \
  accum9 = fetch_double2((spinor), sid + (fl_stride) + 9*(stride)); \
  accum10 = fetch_double2((spinor), sid + (fl_stride) + 10*(stride)); \
  accum11 = fetch_double2((spinor), sid + (fl_stride) + 11*(stride));
1282 
1283 
// Texture variant of ASSN_ACCUM_SINGLE: assign accum0..accum5 (float4,
// declared at the expansion point) from texture `spinor` at site `sid`, with
// fl_stride selecting the flavor block.  Every argument use is parenthesized
// (macro hygiene): previously fl_stride was bare, so a low-precedence
// expression passed for it expanded with the wrong precedence.
#define ASSN_ACCUM_SINGLE_TEX(spinor, stride, fl_stride) \
  accum0 = TEX1DFETCH(float4, (spinor), sid + (fl_stride) + 0*(stride)); \
  accum1 = TEX1DFETCH(float4, (spinor), sid + (fl_stride) + 1*(stride)); \
  accum2 = TEX1DFETCH(float4, (spinor), sid + (fl_stride) + 2*(stride)); \
  accum3 = TEX1DFETCH(float4, (spinor), sid + (fl_stride) + 3*(stride)); \
  accum4 = TEX1DFETCH(float4, (spinor), sid + (fl_stride) + 4*(stride)); \
  accum5 = TEX1DFETCH(float4, (spinor), sid + (fl_stride) + 5*(stride));
1291 
// Texture variant of ASSN_ACCUM_HALF_: assign accum0..accum5 (float4,
// declared at the expansion point) from the half-precision texture `spinor`,
// then rescale by the per-site norm read from the pasted texture
// `spinor ## Norm` (so `spinor` must be a plain identifier); the scale C
// lives in its own block scope.  Every argument use is parenthesized (macro
// hygiene): previously fl_stride was bare, so a low-precedence expression
// passed for it expanded with the wrong precedence.
#define ASSN_ACCUM_HALF_TEX_(spinor, stride, fl_stride) \
  accum0 = TEX1DFETCH(float4, (spinor), sid + (fl_stride) + 0*(stride)); \
  accum1 = TEX1DFETCH(float4, (spinor), sid + (fl_stride) + 1*(stride)); \
  accum2 = TEX1DFETCH(float4, (spinor), sid + (fl_stride) + 2*(stride)); \
  accum3 = TEX1DFETCH(float4, (spinor), sid + (fl_stride) + 3*(stride)); \
  accum4 = TEX1DFETCH(float4, (spinor), sid + (fl_stride) + 4*(stride)); \
  accum5 = TEX1DFETCH(float4, (spinor), sid + (fl_stride) + 5*(stride)); \
  {\
  float C = TEX1DFETCH(float, (spinor ## Norm), sid + (fl_stride)); \
  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C; \
  }
1308 
1309 #define ASSN_ACCUM_HALF_TEX(spinor, stride, fl_stride) ASSN_ACCUM_HALF_TEX_(spinor, stride, fl_stride)
1310 
1312 
// Apply b*(1 + i*a*gamma_5) to the input spinor held in the 24 registers
// reg##SC_re / reg##SC_im (first digit S = spin 0..3, second digit C = color
// 0..2).  In this basis gamma_5 mixes spin pairs (0,2) and (1,3), so for
// each color component the code computes, per pair:
//   upper' = b*(upper + i*a*lower),   lower' = b*(lower + i*a*upper)
// tmp_re/tmp_im buffer the new upper-spin value so the in-place lower-spin
// update still reads the ORIGINAL upper value — statement order is
// significant here.  NOTE(review): callers presumably pass a = -twist and
// b = 1/(1 + twist^2) to make this the inverse twist rotation; confirm
// against call sites.
#define APPLY_TWIST_INV(a, b, reg)\
{\
  spinorFloat tmp_re, tmp_im;\
  tmp_re = reg##00_re - a * reg##20_im;\
  tmp_im = reg##00_im + a * reg##20_re;\
  reg##20_re -= a * reg##00_im;\
  reg##20_im += a * reg##00_re;\
  \
  reg##00_re = b * tmp_re;\
  reg##00_im = b * tmp_im;\
  reg##20_re *= b;\
  reg##20_im *= b;\
  \
  tmp_re = reg##10_re - a * reg##30_im;\
  tmp_im = reg##10_im + a * reg##30_re;\
  reg##30_re -= a * reg##10_im;\
  reg##30_im += a * reg##10_re;\
  \
  reg##10_re = b * tmp_re;\
  reg##10_im = b * tmp_im;\
  reg##30_re *= b;\
  reg##30_im *= b;\
  \
  tmp_re = reg##01_re - a * reg##21_im;\
  tmp_im = reg##01_im + a * reg##21_re;\
  reg##21_re -= a * reg##01_im;\
  reg##21_im += a * reg##01_re;\
  \
  reg##01_re = b * tmp_re;\
  reg##01_im = b * tmp_im;\
  reg##21_re *= b;\
  reg##21_im *= b;\
  \
  tmp_re = reg##11_re - a * reg##31_im;\
  tmp_im = reg##11_im + a * reg##31_re;\
  reg##31_re -= a * reg##11_im;\
  reg##31_im += a * reg##11_re;\
  \
  reg##11_re = b * tmp_re;\
  reg##11_im = b * tmp_im;\
  reg##31_re *= b;\
  reg##31_im *= b;\
  \
  tmp_re = reg##02_re - a * reg##22_im;\
  tmp_im = reg##02_im + a * reg##22_re;\
  reg##22_re -= a * reg##02_im;\
  reg##22_im += a * reg##02_re;\
  \
  reg##02_re = b * tmp_re;\
  reg##02_im = b * tmp_im;\
  reg##22_re *= b;\
  reg##22_im *= b;\
  \
  tmp_re = reg##12_re - a * reg##32_im;\
  tmp_im = reg##12_im + a * reg##32_re;\
  reg##32_re -= a * reg##12_im;\
  reg##32_im += a * reg##12_re;\
  \
  reg##12_re = b * tmp_re;\
  reg##12_im = b * tmp_im;\
  reg##32_re *= b;\
  reg##32_im *= b;\
}
1377 
1378 
// Apply (1 + i*a*gamma_5) to the input spinor registers reg##SC_re / _im
// (S = spin 0..3, C = color 0..2) — the same spin-pair rotation as
// APPLY_TWIST_INV but without the overall b rescale:
//   upper' = upper + i*a*lower,   lower' = lower + i*a*upper   (per color)
// tmp_re/tmp_im buffer the new upper-spin value so the in-place lower-spin
// update still reads the ORIGINAL upper value — statement order is
// significant here.
#define APPLY_TWIST(a, reg)\
{\
  spinorFloat tmp_re, tmp_im;\
  tmp_re = reg##00_re - a * reg##20_im;\
  tmp_im = reg##00_im + a * reg##20_re;\
  reg##20_re -= a * reg##00_im;\
  reg##20_im += a * reg##00_re;\
  \
  reg##00_re = tmp_re;\
  reg##00_im = tmp_im;\
  \
  tmp_re = reg##10_re - a * reg##30_im;\
  tmp_im = reg##10_im + a * reg##30_re;\
  reg##30_re -= a * reg##10_im;\
  reg##30_im += a * reg##10_re;\
  \
  reg##10_re = tmp_re;\
  reg##10_im = tmp_im;\
  \
  tmp_re = reg##01_re - a * reg##21_im;\
  tmp_im = reg##01_im + a * reg##21_re;\
  reg##21_re -= a * reg##01_im;\
  reg##21_im += a * reg##01_re;\
  \
  reg##01_re = tmp_re;\
  reg##01_im = tmp_im;\
  \
  tmp_re = reg##11_re - a * reg##31_im;\
  tmp_im = reg##11_im + a * reg##31_re;\
  reg##31_re -= a * reg##11_im;\
  reg##31_im += a * reg##11_re;\
  \
  reg##11_re = tmp_re;\
  reg##11_im = tmp_im;\
  \
  tmp_re = reg##02_re - a * reg##22_im;\
  tmp_im = reg##02_im + a * reg##22_re;\
  reg##22_re -= a * reg##02_im;\
  reg##22_im += a * reg##02_re;\
  \
  reg##02_re = tmp_re;\
  reg##02_im = tmp_im;\
  \
  tmp_re = reg##12_re - a * reg##32_im;\
  tmp_im = reg##12_im + a * reg##32_re;\
  reg##32_re -= a * reg##12_im;\
  reg##32_im += a * reg##12_re;\
  \
  reg##12_re = tmp_re;\
  reg##12_im = tmp_im;\
}
1430 
1431 
1432