QUDA  0.9.0
read_clover.h
Go to the documentation of this file.
// READ_CLOVER_DOUBLE(clover_, chi)
// Declares double2 locals C0..C17 and initializes C<k> from element
//   sid + (18*chi + k)*param.cl_stride,   k = 0..17,
// of clover_ viewed as a double2 array. A local pointer named `clover` is
// declared as well. `sid` and `param.cl_stride` must be visible at the
// expansion site. Both arguments are parenthesized in the expansion so that
// expression arguments (e.g. chi given as `a - b`) index as intended.
#define READ_CLOVER_DOUBLE(clover_, chi) \
  double2* clover = (double2*)(clover_); \
  double2 C0 = clover[sid + (18*(chi)+0)*param.cl_stride]; \
  double2 C1 = clover[sid + (18*(chi)+1)*param.cl_stride]; \
  double2 C2 = clover[sid + (18*(chi)+2)*param.cl_stride]; \
  double2 C3 = clover[sid + (18*(chi)+3)*param.cl_stride]; \
  double2 C4 = clover[sid + (18*(chi)+4)*param.cl_stride]; \
  double2 C5 = clover[sid + (18*(chi)+5)*param.cl_stride]; \
  double2 C6 = clover[sid + (18*(chi)+6)*param.cl_stride]; \
  double2 C7 = clover[sid + (18*(chi)+7)*param.cl_stride]; \
  double2 C8 = clover[sid + (18*(chi)+8)*param.cl_stride]; \
  double2 C9 = clover[sid + (18*(chi)+9)*param.cl_stride]; \
  double2 C10 = clover[sid + (18*(chi)+10)*param.cl_stride]; \
  double2 C11 = clover[sid + (18*(chi)+11)*param.cl_stride]; \
  double2 C12 = clover[sid + (18*(chi)+12)*param.cl_stride]; \
  double2 C13 = clover[sid + (18*(chi)+13)*param.cl_stride]; \
  double2 C14 = clover[sid + (18*(chi)+14)*param.cl_stride]; \
  double2 C15 = clover[sid + (18*(chi)+15)*param.cl_stride]; \
  double2 C16 = clover[sid + (18*(chi)+16)*param.cl_stride]; \
  double2 C17 = clover[sid + (18*(chi)+17)*param.cl_stride];
21 
// READ_CLOVER_DOUBLE_STR(clover_, chi)
// Declares double2 locals C0..C17 (uninitialized) plus a local pointer
// `clover` (clover_ viewed as double2*), then fills each C<k> by calling
//   load_streaming_double2(C<k>, &clover[sid + (18*chi + k)*param.cl_stride])
// for k = 0..17. `sid`, `param.cl_stride` and load_streaming_double2 must be
// visible at the expansion site. Arguments are parenthesized so that
// expression arguments expand with the intended precedence.
#define READ_CLOVER_DOUBLE_STR(clover_, chi) \
  double2 C0, C1, C2, C3, C4, C5, C6, C7, C8, C9; \
  double2 C10, C11, C12, C13, C14, C15, C16, C17; \
  double2* clover = (double2*)(clover_); \
  load_streaming_double2(C0, &clover[sid + (18*(chi)+0)*param.cl_stride]); \
  load_streaming_double2(C1, &clover[sid + (18*(chi)+1)*param.cl_stride]); \
  load_streaming_double2(C2, &clover[sid + (18*(chi)+2)*param.cl_stride]); \
  load_streaming_double2(C3, &clover[sid + (18*(chi)+3)*param.cl_stride]); \
  load_streaming_double2(C4, &clover[sid + (18*(chi)+4)*param.cl_stride]); \
  load_streaming_double2(C5, &clover[sid + (18*(chi)+5)*param.cl_stride]); \
  load_streaming_double2(C6, &clover[sid + (18*(chi)+6)*param.cl_stride]); \
  load_streaming_double2(C7, &clover[sid + (18*(chi)+7)*param.cl_stride]); \
  load_streaming_double2(C8, &clover[sid + (18*(chi)+8)*param.cl_stride]); \
  load_streaming_double2(C9, &clover[sid + (18*(chi)+9)*param.cl_stride]); \
  load_streaming_double2(C10, &clover[sid + (18*(chi)+10)*param.cl_stride]); \
  load_streaming_double2(C11, &clover[sid + (18*(chi)+11)*param.cl_stride]); \
  load_streaming_double2(C12, &clover[sid + (18*(chi)+12)*param.cl_stride]); \
  load_streaming_double2(C13, &clover[sid + (18*(chi)+13)*param.cl_stride]); \
  load_streaming_double2(C14, &clover[sid + (18*(chi)+14)*param.cl_stride]); \
  load_streaming_double2(C15, &clover[sid + (18*(chi)+15)*param.cl_stride]); \
  load_streaming_double2(C16, &clover[sid + (18*(chi)+16)*param.cl_stride]); \
  load_streaming_double2(C17, &clover[sid + (18*(chi)+17)*param.cl_stride]);
44 
// READ_CLOVER2_DOUBLE_STR(clover_, chi)
// Reduced-read variant of the streaming double loader. Declares double2
// locals C0..C17, a local pointer `clover` (clover_ viewed as double2*) and
// the doubles `diag` and `diag_inv`.
//   - C0, C1 and C3..C14 are filled with load_streaming_double2 from element
//     sid + (18*chi + k)*param.cl_stride.
//   - C2 is not loaded; it is computed from C0, C1 and diag = 0.5*(C0.x + C1.y).
//   - C15, C16, C17 are not loaded; they are the negations of C3, C4, C8.
//   - Finally param.rho is added to both components of C0, C1 and C2
//     (after C2 has been derived from the un-shifted C0/C1).
// `sid`, `param.cl_stride`, `param.rho`, load_streaming_double2 and
// make_double2 must be visible at the expansion site. Arguments are
// parenthesized so that expression arguments expand as intended.
#define READ_CLOVER2_DOUBLE_STR(clover_, chi) \
  double2 C0, C1, C2, C3, C4, C5, C6, C7, C8, C9; \
  double2 C10, C11, C12, C13, C14, C15, C16, C17; \
  double2* clover = (double2*)(clover_); \
  load_streaming_double2(C0, &clover[sid + (18*(chi)+0)*param.cl_stride]); \
  load_streaming_double2(C1, &clover[sid + (18*(chi)+1)*param.cl_stride]); \
  double diag = 0.5*(C0.x + C1.y); \
  double diag_inv = 1.0/diag; \
  C2 = make_double2(diag*(2-C0.y*diag_inv), diag*(2-C1.x*diag_inv)); \
  load_streaming_double2(C3, &clover[sid + (18*(chi)+3)*param.cl_stride]); \
  load_streaming_double2(C4, &clover[sid + (18*(chi)+4)*param.cl_stride]); \
  load_streaming_double2(C5, &clover[sid + (18*(chi)+5)*param.cl_stride]); \
  load_streaming_double2(C6, &clover[sid + (18*(chi)+6)*param.cl_stride]); \
  load_streaming_double2(C7, &clover[sid + (18*(chi)+7)*param.cl_stride]); \
  load_streaming_double2(C8, &clover[sid + (18*(chi)+8)*param.cl_stride]); \
  load_streaming_double2(C9, &clover[sid + (18*(chi)+9)*param.cl_stride]); \
  load_streaming_double2(C10, &clover[sid + (18*(chi)+10)*param.cl_stride]); \
  load_streaming_double2(C11, &clover[sid + (18*(chi)+11)*param.cl_stride]); \
  load_streaming_double2(C12, &clover[sid + (18*(chi)+12)*param.cl_stride]); \
  load_streaming_double2(C13, &clover[sid + (18*(chi)+13)*param.cl_stride]); \
  load_streaming_double2(C14, &clover[sid + (18*(chi)+14)*param.cl_stride]); \
  C15 = make_double2(-C3.x,-C3.y); \
  C16 = make_double2(-C4.x,-C4.y); \
  C17 = make_double2(-C8.x,-C8.y); \
  C0.x += param.rho; C0.y += param.rho; C1.x += param.rho; \
  C1.y += param.rho; C2.x += param.rho; C2.y += param.rho;
71 
// READ_CLOVER_SINGLE(clover_, chi)
// Declares float4 locals C0..C8 and initializes C<k> from element
//   sid + (9*chi + k)*param.cl_stride,   k = 0..8,
// of clover_ viewed as a float4 array. A local pointer named `clover` is
// declared as well. `sid` and `param.cl_stride` must be visible at the
// expansion site. Arguments are parenthesized so that expression arguments
// expand with the intended precedence.
#define READ_CLOVER_SINGLE(clover_, chi) \
  float4 *clover = (float4*)(clover_); \
  float4 C0 = clover[sid + (9*(chi)+0)*param.cl_stride]; \
  float4 C1 = clover[sid + (9*(chi)+1)*param.cl_stride]; \
  float4 C2 = clover[sid + (9*(chi)+2)*param.cl_stride]; \
  float4 C3 = clover[sid + (9*(chi)+3)*param.cl_stride]; \
  float4 C4 = clover[sid + (9*(chi)+4)*param.cl_stride]; \
  float4 C5 = clover[sid + (9*(chi)+5)*param.cl_stride]; \
  float4 C6 = clover[sid + (9*(chi)+6)*param.cl_stride]; \
  float4 C7 = clover[sid + (9*(chi)+7)*param.cl_stride]; \
  float4 C8 = clover[sid + (9*(chi)+8)*param.cl_stride];
83 
// READ_CLOVER2_SINGLE(clover_, chi)
// Reduced-read single-precision loader. Declares a local pointer `clover`
// (clover_ viewed as float4*) and float4 locals C0..C8.
//   - C0..C7 are read from element sid + (9*chi + k)*param.cl_stride.
//   - C8 is not read; it is built as (-C2.x, -C2.y, -C4.x, -C4.y).
//   - param.rho is then added to all four components of C0 and to C1.x, C1.y.
// `sid`, `param.cl_stride`, `param.rho` and make_float4 must be visible at
// the expansion site. Arguments are parenthesized so that expression
// arguments expand with the intended precedence.
#define READ_CLOVER2_SINGLE(clover_, chi) \
  float4 *clover = (float4*)(clover_); \
  float4 C0 = clover[sid + (9*(chi)+0)*param.cl_stride]; \
  float4 C1 = clover[sid + (9*(chi)+1)*param.cl_stride]; \
  float4 C2 = clover[sid + (9*(chi)+2)*param.cl_stride]; \
  float4 C3 = clover[sid + (9*(chi)+3)*param.cl_stride]; \
  float4 C4 = clover[sid + (9*(chi)+4)*param.cl_stride]; \
  float4 C5 = clover[sid + (9*(chi)+5)*param.cl_stride]; \
  float4 C6 = clover[sid + (9*(chi)+6)*param.cl_stride]; \
  float4 C7 = clover[sid + (9*(chi)+7)*param.cl_stride]; \
  float4 C8 = make_float4(-C2.x,-C2.y,-C4.x,-C4.y); \
  C0.x += param.rho; C0.y += param.rho; C0.z += param.rho; \
  C0.w += param.rho; C1.x += param.rho; C1.y += param.rho;
97 
// READ_CLOVER_HALF(clover_, chi)
// Declares a local pointer `clover` (clover_ viewed as short4*), float4
// locals C0..C8 and a float `K`.
//   - C<k> = short42float4(clover[sid + (9*chi + k)*param.cl_stride]), k = 0..8.
//   - K is read from CLOVERTEXNORM[sid + chi*param.cl_stride].
//   - Every component of C0..C8 is then multiplied by K.
// `sid`, `param.cl_stride`, short42float4 and CLOVERTEXNORM must be visible
// at the expansion site. Arguments are parenthesized so that expression
// arguments expand with the intended precedence.
#define READ_CLOVER_HALF(clover_, chi) \
  short4 *clover = (short4*)(clover_); \
  float4 C0 = short42float4(clover[sid + (9*(chi)+0)*param.cl_stride]); \
  float4 C1 = short42float4(clover[sid + (9*(chi)+1)*param.cl_stride]); \
  float4 C2 = short42float4(clover[sid + (9*(chi)+2)*param.cl_stride]); \
  float4 C3 = short42float4(clover[sid + (9*(chi)+3)*param.cl_stride]); \
  float4 C4 = short42float4(clover[sid + (9*(chi)+4)*param.cl_stride]); \
  float4 C5 = short42float4(clover[sid + (9*(chi)+5)*param.cl_stride]); \
  float4 C6 = short42float4(clover[sid + (9*(chi)+6)*param.cl_stride]); \
  float4 C7 = short42float4(clover[sid + (9*(chi)+7)*param.cl_stride]); \
  float4 C8 = short42float4(clover[sid + (9*(chi)+8)*param.cl_stride]); \
  float K = CLOVERTEXNORM[sid + (chi)*param.cl_stride]; \
  C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
  C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
  C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
  C3.x *= K; C3.y *= K; C3.z *= K; C3.w *= K; \
  C4.x *= K; C4.y *= K; C4.z *= K; C4.w *= K; \
  C5.x *= K; C5.y *= K; C5.z *= K; C5.w *= K; \
  C6.x *= K; C6.y *= K; C6.z *= K; C6.w *= K; \
  C7.x *= K; C7.y *= K; C7.z *= K; C7.w *= K; \
  C8.x *= K; C8.y *= K; C8.z *= K; C8.w *= K;
119 
// READ_CLOVER2_HALF(clover_, chi)
// Reduced-read half-precision loader. Declares a local pointer `clover`
// (clover_ viewed as short4*), float4 locals C0..C8 and a float `K`.
//   - C<k> = short42float4(clover[sid + (9*chi + k)*param.cl_stride]), k = 0..7.
//   - K is read from CLOVERTEXNORM[sid + chi*param.cl_stride] and every
//     component of C0..C7 is multiplied by it.
//   - C8 is not read; it is built from the already-scaled C2 and C4 as
//     (-C2.x, -C2.y, -C4.x, -C4.y).
//   - param.rho is then added to all four components of C0 and to C1.x, C1.y.
// Fix: the previous expansion multiplied C8 by K on the line *before* C8 was
// declared, which cannot compile; that statement is dropped, matching
// READ_CLOVER2_HALF_TEX. Arguments are also parenthesized so that expression
// arguments expand with the intended precedence.
// `sid`, `param.cl_stride`, `param.rho`, short42float4, make_float4 and
// CLOVERTEXNORM must be visible at the expansion site.
#define READ_CLOVER2_HALF(clover_, chi) \
  short4 *clover = (short4*)(clover_); \
  float4 C0 = short42float4(clover[sid + (9*(chi)+0)*param.cl_stride]); \
  float4 C1 = short42float4(clover[sid + (9*(chi)+1)*param.cl_stride]); \
  float4 C2 = short42float4(clover[sid + (9*(chi)+2)*param.cl_stride]); \
  float4 C3 = short42float4(clover[sid + (9*(chi)+3)*param.cl_stride]); \
  float4 C4 = short42float4(clover[sid + (9*(chi)+4)*param.cl_stride]); \
  float4 C5 = short42float4(clover[sid + (9*(chi)+5)*param.cl_stride]); \
  float4 C6 = short42float4(clover[sid + (9*(chi)+6)*param.cl_stride]); \
  float4 C7 = short42float4(clover[sid + (9*(chi)+7)*param.cl_stride]); \
  float K = CLOVERTEXNORM[sid + (chi)*param.cl_stride]; \
  C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
  C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
  C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
  C3.x *= K; C3.y *= K; C3.z *= K; C3.w *= K; \
  C4.x *= K; C4.y *= K; C4.z *= K; C4.w *= K; \
  C5.x *= K; C5.y *= K; C5.z *= K; C5.w *= K; \
  C6.x *= K; C6.y *= K; C6.z *= K; C6.w *= K; \
  C7.x *= K; C7.y *= K; C7.z *= K; C7.w *= K; \
  float4 C8 = make_float4(-C2.x, -C2.y, -C4.x, -C4.y); \
  C0.x += param.rho; C0.y += param.rho; C0.z += param.rho; \
  C0.w += param.rho; C1.x += param.rho; C1.y += param.rho;
143 
// READ_CLOVER_DOUBLE_TEX(clover, chi)
// Declares double2 locals C0..C17 and initializes C<k> with
//   fetch_double2((clover), sid + (18*chi + k)*param.cl_stride),  k = 0..17.
// `sid`, `param.cl_stride` and fetch_double2 must be visible at the expansion
// site. chi is parenthesized in the expansion so that an expression argument
// is multiplied as a whole.
#define READ_CLOVER_DOUBLE_TEX(clover, chi) \
  double2 C0 = fetch_double2((clover), sid + (18*(chi)+0)*param.cl_stride); \
  double2 C1 = fetch_double2((clover), sid + (18*(chi)+1)*param.cl_stride); \
  double2 C2 = fetch_double2((clover), sid + (18*(chi)+2)*param.cl_stride); \
  double2 C3 = fetch_double2((clover), sid + (18*(chi)+3)*param.cl_stride); \
  double2 C4 = fetch_double2((clover), sid + (18*(chi)+4)*param.cl_stride); \
  double2 C5 = fetch_double2((clover), sid + (18*(chi)+5)*param.cl_stride); \
  double2 C6 = fetch_double2((clover), sid + (18*(chi)+6)*param.cl_stride); \
  double2 C7 = fetch_double2((clover), sid + (18*(chi)+7)*param.cl_stride); \
  double2 C8 = fetch_double2((clover), sid + (18*(chi)+8)*param.cl_stride); \
  double2 C9 = fetch_double2((clover), sid + (18*(chi)+9)*param.cl_stride); \
  double2 C10 = fetch_double2((clover), sid + (18*(chi)+10)*param.cl_stride); \
  double2 C11 = fetch_double2((clover), sid + (18*(chi)+11)*param.cl_stride); \
  double2 C12 = fetch_double2((clover), sid + (18*(chi)+12)*param.cl_stride); \
  double2 C13 = fetch_double2((clover), sid + (18*(chi)+13)*param.cl_stride); \
  double2 C14 = fetch_double2((clover), sid + (18*(chi)+14)*param.cl_stride); \
  double2 C15 = fetch_double2((clover), sid + (18*(chi)+15)*param.cl_stride); \
  double2 C16 = fetch_double2((clover), sid + (18*(chi)+16)*param.cl_stride); \
  double2 C17 = fetch_double2((clover), sid + (18*(chi)+17)*param.cl_stride);
163 
// READ_CLOVER2_DOUBLE_TEX(clover, chi)
// Minimize the reads using the symmetry of the clover matrix.
// Declares double2 locals C0..C17 and the doubles `diag` and `diag_inv`.
//   - C0, C1 and C3..C14 come from
//     fetch_double2((clover), sid + (18*chi + k)*param.cl_stride).
//   - C2 is not fetched; it is computed from C0, C1 and
//     diag = 0.5*(C0.x + C1.y).
//   - C15, C16, C17 are not fetched; they are the negations of C3, C4, C8.
//   - Finally param.rho is added to both components of C0, C1 and C2
//     (after C2 has been derived from the un-shifted C0/C1).
// `sid`, `param.cl_stride`, `param.rho`, fetch_double2 and make_double2 must
// be visible at the expansion site. chi is parenthesized in the expansion so
// that an expression argument is multiplied as a whole.
#define READ_CLOVER2_DOUBLE_TEX(clover, chi) \
  double2 C0 = fetch_double2((clover), sid + (18*(chi)+0)*param.cl_stride); \
  double2 C1 = fetch_double2((clover), sid + (18*(chi)+1)*param.cl_stride); \
  double diag = 0.5*(C0.x + C1.y); \
  double diag_inv = 1.0/diag; \
  double2 C2 = make_double2(diag*(2-C0.y*diag_inv), diag*(2-C1.x*diag_inv)); \
  double2 C3 = fetch_double2((clover), sid + (18*(chi)+3)*param.cl_stride); \
  double2 C4 = fetch_double2((clover), sid + (18*(chi)+4)*param.cl_stride); \
  double2 C5 = fetch_double2((clover), sid + (18*(chi)+5)*param.cl_stride); \
  double2 C6 = fetch_double2((clover), sid + (18*(chi)+6)*param.cl_stride); \
  double2 C7 = fetch_double2((clover), sid + (18*(chi)+7)*param.cl_stride); \
  double2 C8 = fetch_double2((clover), sid + (18*(chi)+8)*param.cl_stride); \
  double2 C9 = fetch_double2((clover), sid + (18*(chi)+9)*param.cl_stride); \
  double2 C10 = fetch_double2((clover), sid + (18*(chi)+10)*param.cl_stride); \
  double2 C11 = fetch_double2((clover), sid + (18*(chi)+11)*param.cl_stride); \
  double2 C12 = fetch_double2((clover), sid + (18*(chi)+12)*param.cl_stride); \
  double2 C13 = fetch_double2((clover), sid + (18*(chi)+13)*param.cl_stride); \
  double2 C14 = fetch_double2((clover), sid + (18*(chi)+14)*param.cl_stride); \
  double2 C15 = make_double2(-C3.x,-C3.y); \
  double2 C16 = make_double2(-C4.x,-C4.y); \
  double2 C17 = make_double2(-C8.x,-C8.y); \
  C0.x += param.rho; C0.y += param.rho; C1.x += param.rho; \
  C1.y += param.rho; C2.x += param.rho; C2.y += param.rho;
188 
// READ_CLOVER_SINGLE_TEX(clover, chi)
// Declares float4 locals C0..C8 and initializes C<k> with
//   TEX1DFETCH(float4, (clover), sid + (9*chi + k)*param.cl_stride),  k = 0..8.
// `sid`, `param.cl_stride` and TEX1DFETCH must be visible at the expansion
// site. chi is parenthesized in the expansion so that an expression argument
// is multiplied as a whole.
#define READ_CLOVER_SINGLE_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+0)*param.cl_stride); \
  float4 C1 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+1)*param.cl_stride); \
  float4 C2 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+2)*param.cl_stride); \
  float4 C3 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+3)*param.cl_stride); \
  float4 C4 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+4)*param.cl_stride); \
  float4 C5 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+5)*param.cl_stride); \
  float4 C6 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+6)*param.cl_stride); \
  float4 C7 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+7)*param.cl_stride); \
  float4 C8 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+8)*param.cl_stride);
199 
// READ_CLOVER2_SINGLE_TEX(clover, chi)
// Reduced-read single-precision fetch. Declares float4 locals C0..C8.
//   - C0..C7 come from
//     TEX1DFETCH(float4, (clover), sid + (9*chi + k)*param.cl_stride).
//   - C8 is not fetched; it is built as (-C2.x, -C2.y, -C4.x, -C4.y).
//   - param.rho is then added to all four components of C0 and to C1.x, C1.y.
// `sid`, `param.cl_stride`, `param.rho`, TEX1DFETCH and make_float4 must be
// visible at the expansion site. chi is parenthesized in the expansion so
// that an expression argument is multiplied as a whole.
#define READ_CLOVER2_SINGLE_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+0)*param.cl_stride); \
  float4 C1 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+1)*param.cl_stride); \
  float4 C2 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+2)*param.cl_stride); \
  float4 C3 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+3)*param.cl_stride); \
  float4 C4 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+4)*param.cl_stride); \
  float4 C5 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+5)*param.cl_stride); \
  float4 C6 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+6)*param.cl_stride); \
  float4 C7 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+7)*param.cl_stride); \
  float4 C8 = make_float4(-C2.x,-C2.y,-C4.x,-C4.y); \
  C0.x += param.rho; C0.y += param.rho; C0.z += param.rho; \
  C0.w += param.rho; C1.x += param.rho; C1.y += param.rho;
212 
// READ_CLOVER_HALF_TEX(clover, chi)
// Declares float4 locals C0..C8 and a float `K`.
//   - C<k> = TEX1DFETCH(float4, (clover), sid + (9*chi + k)*param.cl_stride),
//     k = 0..8.
//   - K = TEX1DFETCH(float, (CLOVERTEXNORM), sid + chi*param.cl_stride).
//   - Every component of C0..C8 is then multiplied by K.
// `sid`, `param.cl_stride`, TEX1DFETCH and CLOVERTEXNORM must be visible at
// the expansion site. chi is parenthesized in the expansion so that an
// expression argument is multiplied as a whole.
#define READ_CLOVER_HALF_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+0)*param.cl_stride); \
  float4 C1 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+1)*param.cl_stride); \
  float4 C2 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+2)*param.cl_stride); \
  float4 C3 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+3)*param.cl_stride); \
  float4 C4 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+4)*param.cl_stride); \
  float4 C5 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+5)*param.cl_stride); \
  float4 C6 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+6)*param.cl_stride); \
  float4 C7 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+7)*param.cl_stride); \
  float4 C8 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+8)*param.cl_stride); \
  float K = TEX1DFETCH(float, (CLOVERTEXNORM), sid + (chi)*param.cl_stride); \
  C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
  C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
  C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
  C3.x *= K; C3.y *= K; C3.z *= K; C3.w *= K; \
  C4.x *= K; C4.y *= K; C4.z *= K; C4.w *= K; \
  C5.x *= K; C5.y *= K; C5.z *= K; C5.w *= K; \
  C6.x *= K; C6.y *= K; C6.z *= K; C6.w *= K; \
  C7.x *= K; C7.y *= K; C7.z *= K; C7.w *= K; \
  C8.x *= K; C8.y *= K; C8.z *= K; C8.w *= K;
233 
// READ_CLOVER2_HALF_TEX(clover, chi)
// Reduced-read half-precision fetch. Declares float4 locals C0..C8 and a
// float `K`.
//   - C<k> = TEX1DFETCH(float4, (clover), sid + (9*chi + k)*param.cl_stride),
//     k = 0..7.
//   - K = TEX1DFETCH(float, (CLOVERTEXNORM), sid + chi*param.cl_stride);
//     every component of C0..C7 is multiplied by it.
//   - C8 is not fetched; it is built from the already-scaled C2 and C4 as
//     (-C2.x, -C2.y, -C4.x, -C4.y).
//   - param.rho is then added to all four components of C0 and to C1.x, C1.y.
// `sid`, `param.cl_stride`, `param.rho`, TEX1DFETCH, make_float4 and
// CLOVERTEXNORM must be visible at the expansion site. chi is parenthesized
// in the expansion so that an expression argument is multiplied as a whole.
#define READ_CLOVER2_HALF_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+0)*param.cl_stride); \
  float4 C1 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+1)*param.cl_stride); \
  float4 C2 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+2)*param.cl_stride); \
  float4 C3 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+3)*param.cl_stride); \
  float4 C4 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+4)*param.cl_stride); \
  float4 C5 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+5)*param.cl_stride); \
  float4 C6 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+6)*param.cl_stride); \
  float4 C7 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+7)*param.cl_stride); \
  float K = TEX1DFETCH(float, (CLOVERTEXNORM), sid + (chi)*param.cl_stride); \
  C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
  C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
  C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
  C3.x *= K; C3.y *= K; C3.z *= K; C3.w *= K; \
  C4.x *= K; C4.y *= K; C4.z *= K; C4.w *= K; \
  C5.x *= K; C5.y *= K; C5.z *= K; C5.w *= K; \
  C6.x *= K; C6.y *= K; C6.z *= K; C6.w *= K; \
  C7.x *= K; C7.y *= K; C7.z *= K; C7.w *= K; \
  float4 C8 = make_float4(-C2.x, -C2.y, -C4.x, -C4.y); \
  C0.x += param.rho; C0.y += param.rho; C0.z += param.rho; \
  C0.w += param.rho; C1.x += param.rho; C1.y += param.rho;
255 
// ASSN_CLOVER_DOUBLE(clover, chi)
// Assigns (does not declare) C0..C17: C<k> receives element
//   sid + (18*chi + k)*param.cl_stride,   k = 0..17,
// of `clover`. C0..C17, `sid` and `param.cl_stride` must already be visible
// at the expansion site. Both arguments are parenthesized in the expansion so
// that expression arguments (e.g. `ptr + off`, `a - b`) expand as intended.
#define ASSN_CLOVER_DOUBLE(clover, chi) \
  C0 = (clover)[sid + (18*(chi)+0)*param.cl_stride]; \
  C1 = (clover)[sid + (18*(chi)+1)*param.cl_stride]; \
  C2 = (clover)[sid + (18*(chi)+2)*param.cl_stride]; \
  C3 = (clover)[sid + (18*(chi)+3)*param.cl_stride]; \
  C4 = (clover)[sid + (18*(chi)+4)*param.cl_stride]; \
  C5 = (clover)[sid + (18*(chi)+5)*param.cl_stride]; \
  C6 = (clover)[sid + (18*(chi)+6)*param.cl_stride]; \
  C7 = (clover)[sid + (18*(chi)+7)*param.cl_stride]; \
  C8 = (clover)[sid + (18*(chi)+8)*param.cl_stride]; \
  C9 = (clover)[sid + (18*(chi)+9)*param.cl_stride]; \
  C10 = (clover)[sid + (18*(chi)+10)*param.cl_stride]; \
  C11 = (clover)[sid + (18*(chi)+11)*param.cl_stride]; \
  C12 = (clover)[sid + (18*(chi)+12)*param.cl_stride]; \
  C13 = (clover)[sid + (18*(chi)+13)*param.cl_stride]; \
  C14 = (clover)[sid + (18*(chi)+14)*param.cl_stride]; \
  C15 = (clover)[sid + (18*(chi)+15)*param.cl_stride]; \
  C16 = (clover)[sid + (18*(chi)+16)*param.cl_stride]; \
  C17 = (clover)[sid + (18*(chi)+17)*param.cl_stride];
275 
// ASSN_CLOVER_DOUBLE_STR(clover, chi)
// Fills (does not declare) C0..C17 by calling
//   load_streaming_double2(C<k>, &clover[sid + (18*chi + k)*param.cl_stride])
// for k = 0..17. C0..C17, `sid`, `param.cl_stride` and
// load_streaming_double2 must already be visible at the expansion site. Both
// arguments are parenthesized in the expansion so that expression arguments
// expand with the intended precedence.
#define ASSN_CLOVER_DOUBLE_STR(clover, chi) \
  load_streaming_double2(C0, &(clover)[sid + (18*(chi)+0)*param.cl_stride]); \
  load_streaming_double2(C1, &(clover)[sid + (18*(chi)+1)*param.cl_stride]); \
  load_streaming_double2(C2, &(clover)[sid + (18*(chi)+2)*param.cl_stride]); \
  load_streaming_double2(C3, &(clover)[sid + (18*(chi)+3)*param.cl_stride]); \
  load_streaming_double2(C4, &(clover)[sid + (18*(chi)+4)*param.cl_stride]); \
  load_streaming_double2(C5, &(clover)[sid + (18*(chi)+5)*param.cl_stride]); \
  load_streaming_double2(C6, &(clover)[sid + (18*(chi)+6)*param.cl_stride]); \
  load_streaming_double2(C7, &(clover)[sid + (18*(chi)+7)*param.cl_stride]); \
  load_streaming_double2(C8, &(clover)[sid + (18*(chi)+8)*param.cl_stride]); \
  load_streaming_double2(C9, &(clover)[sid + (18*(chi)+9)*param.cl_stride]); \
  load_streaming_double2(C10, &(clover)[sid + (18*(chi)+10)*param.cl_stride]); \
  load_streaming_double2(C11, &(clover)[sid + (18*(chi)+11)*param.cl_stride]); \
  load_streaming_double2(C12, &(clover)[sid + (18*(chi)+12)*param.cl_stride]); \
  load_streaming_double2(C13, &(clover)[sid + (18*(chi)+13)*param.cl_stride]); \
  load_streaming_double2(C14, &(clover)[sid + (18*(chi)+14)*param.cl_stride]); \
  load_streaming_double2(C15, &(clover)[sid + (18*(chi)+15)*param.cl_stride]); \
  load_streaming_double2(C16, &(clover)[sid + (18*(chi)+16)*param.cl_stride]); \
  load_streaming_double2(C17, &(clover)[sid + (18*(chi)+17)*param.cl_stride]);
295 
// ASSN_CLOVER_SINGLE(clover, chi)
// Assigns (does not declare) C0..C8: C<k> receives element
//   sid + (9*chi + k)*param.cl_stride,   k = 0..8,
// of `clover`. C0..C8, `sid` and `param.cl_stride` must already be visible at
// the expansion site. Both arguments are parenthesized in the expansion so
// that expression arguments expand with the intended precedence.
#define ASSN_CLOVER_SINGLE(clover, chi) \
  C0 = (clover)[sid + (9*(chi)+0)*param.cl_stride]; \
  C1 = (clover)[sid + (9*(chi)+1)*param.cl_stride]; \
  C2 = (clover)[sid + (9*(chi)+2)*param.cl_stride]; \
  C3 = (clover)[sid + (9*(chi)+3)*param.cl_stride]; \
  C4 = (clover)[sid + (9*(chi)+4)*param.cl_stride]; \
  C5 = (clover)[sid + (9*(chi)+5)*param.cl_stride]; \
  C6 = (clover)[sid + (9*(chi)+6)*param.cl_stride]; \
  C7 = (clover)[sid + (9*(chi)+7)*param.cl_stride]; \
  C8 = (clover)[sid + (9*(chi)+8)*param.cl_stride];
306 
// ASSN_CLOVER_HALF(clover, chi)
// Assigns (does not declare) C0..C8 and K.
//   - C<k> = short42float4(clover[sid + (9*chi + k)*param.cl_stride]), k = 0..8.
//   - K = TMCLOVERTEXNORM[sid + chi*param.cl_stride].
//   - Every component of C0..C8 is then multiplied by K.
// C0..C8, K, `sid`, `param.cl_stride`, short42float4 and TMCLOVERTEXNORM must
// already be visible at the expansion site. Both arguments are parenthesized
// in the expansion so that expression arguments expand as intended.
#define ASSN_CLOVER_HALF(clover, chi) \
  C0 = short42float4((clover)[sid + (9*(chi)+0)*param.cl_stride]); \
  C1 = short42float4((clover)[sid + (9*(chi)+1)*param.cl_stride]); \
  C2 = short42float4((clover)[sid + (9*(chi)+2)*param.cl_stride]); \
  C3 = short42float4((clover)[sid + (9*(chi)+3)*param.cl_stride]); \
  C4 = short42float4((clover)[sid + (9*(chi)+4)*param.cl_stride]); \
  C5 = short42float4((clover)[sid + (9*(chi)+5)*param.cl_stride]); \
  C6 = short42float4((clover)[sid + (9*(chi)+6)*param.cl_stride]); \
  C7 = short42float4((clover)[sid + (9*(chi)+7)*param.cl_stride]); \
  C8 = short42float4((clover)[sid + (9*(chi)+8)*param.cl_stride]); \
  K = TMCLOVERTEXNORM[sid + (chi)*param.cl_stride]; \
  C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
  C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
  C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
  C3.x *= K; C3.y *= K; C3.z *= K; C3.w *= K; \
  C4.x *= K; C4.y *= K; C4.z *= K; C4.w *= K; \
  C5.x *= K; C5.y *= K; C5.z *= K; C5.w *= K; \
  C6.x *= K; C6.y *= K; C6.z *= K; C6.w *= K; \
  C7.x *= K; C7.y *= K; C7.z *= K; C7.w *= K; \
  C8.x *= K; C8.y *= K; C8.z *= K; C8.w *= K;
327 
// ASSN_CLOVER_DOUBLE_TEX(clover, chi)
// Assigns (does not declare) C0..C17:
//   C<k> = fetch_double2((clover), sid + (18*chi + k)*param.cl_stride),
//   k = 0..17.
// C0..C17, `sid`, `param.cl_stride` and fetch_double2 must already be visible
// at the expansion site. chi is parenthesized in the expansion so that an
// expression argument is multiplied as a whole.
#define ASSN_CLOVER_DOUBLE_TEX(clover, chi) \
  C0 = fetch_double2((clover), sid + (18*(chi)+0)*param.cl_stride); \
  C1 = fetch_double2((clover), sid + (18*(chi)+1)*param.cl_stride); \
  C2 = fetch_double2((clover), sid + (18*(chi)+2)*param.cl_stride); \
  C3 = fetch_double2((clover), sid + (18*(chi)+3)*param.cl_stride); \
  C4 = fetch_double2((clover), sid + (18*(chi)+4)*param.cl_stride); \
  C5 = fetch_double2((clover), sid + (18*(chi)+5)*param.cl_stride); \
  C6 = fetch_double2((clover), sid + (18*(chi)+6)*param.cl_stride); \
  C7 = fetch_double2((clover), sid + (18*(chi)+7)*param.cl_stride); \
  C8 = fetch_double2((clover), sid + (18*(chi)+8)*param.cl_stride); \
  C9 = fetch_double2((clover), sid + (18*(chi)+9)*param.cl_stride); \
  C10 = fetch_double2((clover), sid + (18*(chi)+10)*param.cl_stride); \
  C11 = fetch_double2((clover), sid + (18*(chi)+11)*param.cl_stride); \
  C12 = fetch_double2((clover), sid + (18*(chi)+12)*param.cl_stride); \
  C13 = fetch_double2((clover), sid + (18*(chi)+13)*param.cl_stride); \
  C14 = fetch_double2((clover), sid + (18*(chi)+14)*param.cl_stride); \
  C15 = fetch_double2((clover), sid + (18*(chi)+15)*param.cl_stride); \
  C16 = fetch_double2((clover), sid + (18*(chi)+16)*param.cl_stride); \
  C17 = fetch_double2((clover), sid + (18*(chi)+17)*param.cl_stride);
347 
// ASSN_CLOVER_SINGLE_TEX(clover, chi)
// Assigns (does not declare) C0..C8:
//   C<k> = TEX1DFETCH(float4, (clover), sid + (9*chi + k)*param.cl_stride),
//   k = 0..8.
// C0..C8, `sid`, `param.cl_stride` and TEX1DFETCH must already be visible at
// the expansion site. chi is parenthesized in the expansion so that an
// expression argument is multiplied as a whole.
#define ASSN_CLOVER_SINGLE_TEX(clover, chi) \
  C0 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+0)*param.cl_stride); \
  C1 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+1)*param.cl_stride); \
  C2 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+2)*param.cl_stride); \
  C3 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+3)*param.cl_stride); \
  C4 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+4)*param.cl_stride); \
  C5 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+5)*param.cl_stride); \
  C6 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+6)*param.cl_stride); \
  C7 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+7)*param.cl_stride); \
  C8 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+8)*param.cl_stride);
358 
// ASSN_CLOVER_HALF_TEX(clover, chi)
// Assigns (does not declare) C0..C8 and K.
//   - C<k> = TEX1DFETCH(float4, (clover), sid + (9*chi + k)*param.cl_stride),
//     k = 0..8.
//   - K = TEX1DFETCH(float, (TMCLOVERTEXNORM), sid + chi*param.cl_stride).
//   - Every component of C0..C8 is then multiplied by K.
// C0..C8, K, `sid`, `param.cl_stride`, TEX1DFETCH and TMCLOVERTEXNORM must
// already be visible at the expansion site. chi is parenthesized in the
// expansion so that an expression argument is multiplied as a whole.
#define ASSN_CLOVER_HALF_TEX(clover, chi) \
  C0 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+0)*param.cl_stride); \
  C1 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+1)*param.cl_stride); \
  C2 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+2)*param.cl_stride); \
  C3 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+3)*param.cl_stride); \
  C4 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+4)*param.cl_stride); \
  C5 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+5)*param.cl_stride); \
  C6 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+6)*param.cl_stride); \
  C7 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+7)*param.cl_stride); \
  C8 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+8)*param.cl_stride); \
  K = TEX1DFETCH(float, (TMCLOVERTEXNORM), sid + (chi)*param.cl_stride); \
  C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
  C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
  C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
  C3.x *= K; C3.y *= K; C3.z *= K; C3.w *= K; \
  C4.x *= K; C4.y *= K; C4.z *= K; C4.w *= K; \
  C5.x *= K; C5.y *= K; C5.z *= K; C5.w *= K; \
  C6.x *= K; C6.y *= K; C6.z *= K; C6.w *= K; \
  C7.x *= K; C7.y *= K; C7.z *= K; C7.w *= K; \
  C8.x *= K; C8.y *= K; C8.z *= K; C8.w *= K;
379 
// PACK_CLOVER_DOUBLE(clover, chi)
// Declares double2 locals C0..C17 and initializes C<k> from element
//   idx + (18*chi + k)*param.cl_stride,   k = 0..17,
// of `clover`. Unlike the READ_/ASSN_ macros the site index is `idx`, not
// `sid`. `idx` and `param.cl_stride` must be visible at the expansion site.
// Both arguments are parenthesized in the expansion so that expression
// arguments expand with the intended precedence.
#define PACK_CLOVER_DOUBLE(clover, chi) \
  double2 C0 = (clover)[idx + (18*(chi)+0)*param.cl_stride]; \
  double2 C1 = (clover)[idx + (18*(chi)+1)*param.cl_stride]; \
  double2 C2 = (clover)[idx + (18*(chi)+2)*param.cl_stride]; \
  double2 C3 = (clover)[idx + (18*(chi)+3)*param.cl_stride]; \
  double2 C4 = (clover)[idx + (18*(chi)+4)*param.cl_stride]; \
  double2 C5 = (clover)[idx + (18*(chi)+5)*param.cl_stride]; \
  double2 C6 = (clover)[idx + (18*(chi)+6)*param.cl_stride]; \
  double2 C7 = (clover)[idx + (18*(chi)+7)*param.cl_stride]; \
  double2 C8 = (clover)[idx + (18*(chi)+8)*param.cl_stride]; \
  double2 C9 = (clover)[idx + (18*(chi)+9)*param.cl_stride]; \
  double2 C10 = (clover)[idx + (18*(chi)+10)*param.cl_stride]; \
  double2 C11 = (clover)[idx + (18*(chi)+11)*param.cl_stride]; \
  double2 C12 = (clover)[idx + (18*(chi)+12)*param.cl_stride]; \
  double2 C13 = (clover)[idx + (18*(chi)+13)*param.cl_stride]; \
  double2 C14 = (clover)[idx + (18*(chi)+14)*param.cl_stride]; \
  double2 C15 = (clover)[idx + (18*(chi)+15)*param.cl_stride]; \
  double2 C16 = (clover)[idx + (18*(chi)+16)*param.cl_stride]; \
  double2 C17 = (clover)[idx + (18*(chi)+17)*param.cl_stride];
399 
// PACK_CLOVER_SINGLE(clover, chi)
// Declares float4 locals C0..C8 and initializes C<k> from element
//   idx + (9*chi + k)*param.cl_stride,   k = 0..8,
// of `clover`. The site index is `idx`. `idx` and `param.cl_stride` must be
// visible at the expansion site. Both arguments are parenthesized in the
// expansion so that expression arguments expand with the intended precedence.
#define PACK_CLOVER_SINGLE(clover, chi) \
  float4 C0 = (clover)[idx + (9*(chi)+0)*param.cl_stride]; \
  float4 C1 = (clover)[idx + (9*(chi)+1)*param.cl_stride]; \
  float4 C2 = (clover)[idx + (9*(chi)+2)*param.cl_stride]; \
  float4 C3 = (clover)[idx + (9*(chi)+3)*param.cl_stride]; \
  float4 C4 = (clover)[idx + (9*(chi)+4)*param.cl_stride]; \
  float4 C5 = (clover)[idx + (9*(chi)+5)*param.cl_stride]; \
  float4 C6 = (clover)[idx + (9*(chi)+6)*param.cl_stride]; \
  float4 C7 = (clover)[idx + (9*(chi)+7)*param.cl_stride]; \
  float4 C8 = (clover)[idx + (9*(chi)+8)*param.cl_stride];
410 
// PACK_CLOVER_HALF(clover, chi)
// Declares float4 locals C0..C8 and a float `K`.
//   - C<k> = short42float4(clover[idx + (9*chi + k)*param.cl_stride]), k = 0..8.
//   - K is read from cloverNorm[idx + chi*param.cl_stride].
//   - Every component of C0..C8 is then multiplied by K.
// The site index is `idx`. `idx`, `param.cl_stride`, short42float4 and
// cloverNorm must be visible at the expansion site. Both arguments are
// parenthesized in the expansion so that expression arguments expand with
// the intended precedence.
#define PACK_CLOVER_HALF(clover, chi) \
  float4 C0 = short42float4((clover)[idx + (9*(chi)+0)*param.cl_stride]); \
  float4 C1 = short42float4((clover)[idx + (9*(chi)+1)*param.cl_stride]); \
  float4 C2 = short42float4((clover)[idx + (9*(chi)+2)*param.cl_stride]); \
  float4 C3 = short42float4((clover)[idx + (9*(chi)+3)*param.cl_stride]); \
  float4 C4 = short42float4((clover)[idx + (9*(chi)+4)*param.cl_stride]); \
  float4 C5 = short42float4((clover)[idx + (9*(chi)+5)*param.cl_stride]); \
  float4 C6 = short42float4((clover)[idx + (9*(chi)+6)*param.cl_stride]); \
  float4 C7 = short42float4((clover)[idx + (9*(chi)+7)*param.cl_stride]); \
  float4 C8 = short42float4((clover)[idx + (9*(chi)+8)*param.cl_stride]); \
  float K = cloverNorm[idx + (chi)*param.cl_stride]; \
  C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
  C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
  C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
  C3.x *= K; C3.y *= K; C3.z *= K; C3.w *= K; \
  C4.x *= K; C4.y *= K; C4.z *= K; C4.w *= K; \
  C5.x *= K; C5.y *= K; C5.z *= K; C5.w *= K; \
  C6.x *= K; C6.y *= K; C6.z *= K; C6.w *= K; \
  C7.x *= K; C7.y *= K; C7.z *= K; C7.w *= K; \
  C8.x *= K; C8.y *= K; C8.z *= K; C8.w *= K;
431 
// PACK_CLOVER_DOUBLE_TEX(clover, chi)
// Declares double2 locals C0..C17 and initializes C<k> with
//   fetch_double2((clover), idx + (18*chi + k)*param.cl_stride),  k = 0..17.
// The site index is `idx`. `idx`, `param.cl_stride` and fetch_double2 must be
// visible at the expansion site. chi is parenthesized in the expansion so
// that an expression argument is multiplied as a whole.
#define PACK_CLOVER_DOUBLE_TEX(clover, chi) \
  double2 C0 = fetch_double2((clover), idx + (18*(chi)+0)*param.cl_stride); \
  double2 C1 = fetch_double2((clover), idx + (18*(chi)+1)*param.cl_stride); \
  double2 C2 = fetch_double2((clover), idx + (18*(chi)+2)*param.cl_stride); \
  double2 C3 = fetch_double2((clover), idx + (18*(chi)+3)*param.cl_stride); \
  double2 C4 = fetch_double2((clover), idx + (18*(chi)+4)*param.cl_stride); \
  double2 C5 = fetch_double2((clover), idx + (18*(chi)+5)*param.cl_stride); \
  double2 C6 = fetch_double2((clover), idx + (18*(chi)+6)*param.cl_stride); \
  double2 C7 = fetch_double2((clover), idx + (18*(chi)+7)*param.cl_stride); \
  double2 C8 = fetch_double2((clover), idx + (18*(chi)+8)*param.cl_stride); \
  double2 C9 = fetch_double2((clover), idx + (18*(chi)+9)*param.cl_stride); \
  double2 C10 = fetch_double2((clover), idx + (18*(chi)+10)*param.cl_stride); \
  double2 C11 = fetch_double2((clover), idx + (18*(chi)+11)*param.cl_stride); \
  double2 C12 = fetch_double2((clover), idx + (18*(chi)+12)*param.cl_stride); \
  double2 C13 = fetch_double2((clover), idx + (18*(chi)+13)*param.cl_stride); \
  double2 C14 = fetch_double2((clover), idx + (18*(chi)+14)*param.cl_stride); \
  double2 C15 = fetch_double2((clover), idx + (18*(chi)+15)*param.cl_stride); \
  double2 C16 = fetch_double2((clover), idx + (18*(chi)+16)*param.cl_stride); \
  double2 C17 = fetch_double2((clover), idx + (18*(chi)+17)*param.cl_stride);
451 
// PACK_CLOVER_SINGLE_TEX(clover, chi)
// Declares float4 locals C0..C8 and initializes C<k> with
//   TEX1DFETCH(float4, (clover), idx + (9*chi + k)*param.cl_stride),  k = 0..8.
// The site index is `idx`. `idx`, `param.cl_stride` and TEX1DFETCH must be
// visible at the expansion site. chi is parenthesized in the expansion so
// that an expression argument is multiplied as a whole.
#define PACK_CLOVER_SINGLE_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+0)*param.cl_stride); \
  float4 C1 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+1)*param.cl_stride); \
  float4 C2 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+2)*param.cl_stride); \
  float4 C3 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+3)*param.cl_stride); \
  float4 C4 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+4)*param.cl_stride); \
  float4 C5 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+5)*param.cl_stride); \
  float4 C6 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+6)*param.cl_stride); \
  float4 C7 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+7)*param.cl_stride); \
  float4 C8 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+8)*param.cl_stride);
462 
// PACK_CLOVER_HALF_TEX(clover, chi)
// Declares float4 locals C0..C8 and a float `K`.
//   - C<k> = TEX1DFETCH(float4, (clover), idx + (9*chi + k)*param.cl_stride),
//     k = 0..8.
//   - K = TEX1DFETCH(float, (TMCLOVERTEXNORM), idx + chi*param.cl_stride).
//   - Every component of C0..C8 is then multiplied by K.
// The site index is `idx`. `idx`, `param.cl_stride`, TEX1DFETCH and
// TMCLOVERTEXNORM must be visible at the expansion site. chi is parenthesized
// in the expansion so that an expression argument is multiplied as a whole.
#define PACK_CLOVER_HALF_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+0)*param.cl_stride); \
  float4 C1 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+1)*param.cl_stride); \
  float4 C2 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+2)*param.cl_stride); \
  float4 C3 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+3)*param.cl_stride); \
  float4 C4 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+4)*param.cl_stride); \
  float4 C5 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+5)*param.cl_stride); \
  float4 C6 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+6)*param.cl_stride); \
  float4 C7 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+7)*param.cl_stride); \
  float4 C8 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+8)*param.cl_stride); \
  float K = TEX1DFETCH(float, (TMCLOVERTEXNORM), idx + (chi)*param.cl_stride); \
  C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
  C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
  C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
  C3.x *= K; C3.y *= K; C3.z *= K; C3.w *= K; \
  C4.x *= K; C4.y *= K; C4.z *= K; C4.w *= K; \
  C5.x *= K; C5.y *= K; C5.z *= K; C5.w *= K; \
  C6.x *= K; C6.y *= K; C6.z *= K; C6.w *= K; \
  C7.x *= K; C7.y *= K; C7.z *= K; C7.w *= K; \
  C8.x *= K; C8.y *= K; C8.z *= K; C8.w *= K;
483