QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
read_clover.h
Go to the documentation of this file.
// READ_CLOVER_DOUBLE(clover, chi)
// Declares double2 locals C0..C17 in the expanding scope and initializes them
// by indexing `clover` directly: Ck = clover[sid + (18*chi + k)*param.cl_stride].
// `sid` and `param` are not macro arguments; they must be visible where the
// macro is expanded. Both arguments are parenthesized so that an expression
// argument (e.g. `a + b`) still addresses the intended element.
// NOTE(review): chi presumably selects a chirality block -- confirm with callers.
#define READ_CLOVER_DOUBLE(clover, chi) \
  double2 C0 = (clover)[sid + (18*(chi)+0)*param.cl_stride]; \
  double2 C1 = (clover)[sid + (18*(chi)+1)*param.cl_stride]; \
  double2 C2 = (clover)[sid + (18*(chi)+2)*param.cl_stride]; \
  double2 C3 = (clover)[sid + (18*(chi)+3)*param.cl_stride]; \
  double2 C4 = (clover)[sid + (18*(chi)+4)*param.cl_stride]; \
  double2 C5 = (clover)[sid + (18*(chi)+5)*param.cl_stride]; \
  double2 C6 = (clover)[sid + (18*(chi)+6)*param.cl_stride]; \
  double2 C7 = (clover)[sid + (18*(chi)+7)*param.cl_stride]; \
  double2 C8 = (clover)[sid + (18*(chi)+8)*param.cl_stride]; \
  double2 C9 = (clover)[sid + (18*(chi)+9)*param.cl_stride]; \
  double2 C10 = (clover)[sid + (18*(chi)+10)*param.cl_stride]; \
  double2 C11 = (clover)[sid + (18*(chi)+11)*param.cl_stride]; \
  double2 C12 = (clover)[sid + (18*(chi)+12)*param.cl_stride]; \
  double2 C13 = (clover)[sid + (18*(chi)+13)*param.cl_stride]; \
  double2 C14 = (clover)[sid + (18*(chi)+14)*param.cl_stride]; \
  double2 C15 = (clover)[sid + (18*(chi)+15)*param.cl_stride]; \
  double2 C16 = (clover)[sid + (18*(chi)+16)*param.cl_stride]; \
  double2 C17 = (clover)[sid + (18*(chi)+17)*param.cl_stride];
20 
// READ_CLOVER_DOUBLE_STR(clover, chi)
// Declares double2 locals C0..C17 in the expanding scope, then fills each one
// by calling load_streaming_double2(Ck, &clover[sid + (18*chi + k)*param.cl_stride]).
// load_streaming_double2 is defined outside this file; `sid` and `param` must
// be visible where the macro is expanded. Both arguments are parenthesized so
// that an expression argument (e.g. `a + b`) still addresses the intended element.
#define READ_CLOVER_DOUBLE_STR(clover, chi) \
  double2 C0, C1, C2, C3, C4, C5, C6, C7, C8, C9; \
  double2 C10, C11, C12, C13, C14, C15, C16, C17; \
  load_streaming_double2(C0, &(clover)[sid + (18*(chi)+0)*param.cl_stride]); \
  load_streaming_double2(C1, &(clover)[sid + (18*(chi)+1)*param.cl_stride]); \
  load_streaming_double2(C2, &(clover)[sid + (18*(chi)+2)*param.cl_stride]); \
  load_streaming_double2(C3, &(clover)[sid + (18*(chi)+3)*param.cl_stride]); \
  load_streaming_double2(C4, &(clover)[sid + (18*(chi)+4)*param.cl_stride]); \
  load_streaming_double2(C5, &(clover)[sid + (18*(chi)+5)*param.cl_stride]); \
  load_streaming_double2(C6, &(clover)[sid + (18*(chi)+6)*param.cl_stride]); \
  load_streaming_double2(C7, &(clover)[sid + (18*(chi)+7)*param.cl_stride]); \
  load_streaming_double2(C8, &(clover)[sid + (18*(chi)+8)*param.cl_stride]); \
  load_streaming_double2(C9, &(clover)[sid + (18*(chi)+9)*param.cl_stride]); \
  load_streaming_double2(C10, &(clover)[sid + (18*(chi)+10)*param.cl_stride]); \
  load_streaming_double2(C11, &(clover)[sid + (18*(chi)+11)*param.cl_stride]); \
  load_streaming_double2(C12, &(clover)[sid + (18*(chi)+12)*param.cl_stride]); \
  load_streaming_double2(C13, &(clover)[sid + (18*(chi)+13)*param.cl_stride]); \
  load_streaming_double2(C14, &(clover)[sid + (18*(chi)+14)*param.cl_stride]); \
  load_streaming_double2(C15, &(clover)[sid + (18*(chi)+15)*param.cl_stride]); \
  load_streaming_double2(C16, &(clover)[sid + (18*(chi)+16)*param.cl_stride]); \
  load_streaming_double2(C17, &(clover)[sid + (18*(chi)+17)*param.cl_stride]);
42 
// READ_CLOVER_SINGLE(clover, chi)
// Declares float4 locals C0..C8 in the expanding scope and initializes them by
// indexing `clover` directly: Ck = clover[sid + (9*chi + k)*param.cl_stride].
// `sid` and `param` must be visible where the macro is expanded. Both
// arguments are parenthesized so that an expression argument (e.g. `a + b`)
// still addresses the intended element.
#define READ_CLOVER_SINGLE(clover, chi) \
  float4 C0 = (clover)[sid + (9*(chi)+0)*param.cl_stride]; \
  float4 C1 = (clover)[sid + (9*(chi)+1)*param.cl_stride]; \
  float4 C2 = (clover)[sid + (9*(chi)+2)*param.cl_stride]; \
  float4 C3 = (clover)[sid + (9*(chi)+3)*param.cl_stride]; \
  float4 C4 = (clover)[sid + (9*(chi)+4)*param.cl_stride]; \
  float4 C5 = (clover)[sid + (9*(chi)+5)*param.cl_stride]; \
  float4 C6 = (clover)[sid + (9*(chi)+6)*param.cl_stride]; \
  float4 C7 = (clover)[sid + (9*(chi)+7)*param.cl_stride]; \
  float4 C8 = (clover)[sid + (9*(chi)+8)*param.cl_stride];
53 
// READ_CLOVER_HALF(clover, chi)
// Declares float4 locals C0..C8 and a float K in the expanding scope.
// Each Ck is short42float4(clover[sid + (9*chi + k)*param.cl_stride]); K is
// read from cloverNorm[sid + chi*param.cl_stride] and every component of
// C0..C8 is then multiplied by K.
// `sid`, `param` and `cloverNorm` are not macro arguments and must be visible
// where the macro is expanded; short42float4 is defined outside this file.
// Both arguments are parenthesized so that an expression argument
// (e.g. `a + b`) still addresses the intended element.
#define READ_CLOVER_HALF(clover, chi) \
  float4 C0 = short42float4((clover)[sid + (9*(chi)+0)*param.cl_stride]); \
  float4 C1 = short42float4((clover)[sid + (9*(chi)+1)*param.cl_stride]); \
  float4 C2 = short42float4((clover)[sid + (9*(chi)+2)*param.cl_stride]); \
  float4 C3 = short42float4((clover)[sid + (9*(chi)+3)*param.cl_stride]); \
  float4 C4 = short42float4((clover)[sid + (9*(chi)+4)*param.cl_stride]); \
  float4 C5 = short42float4((clover)[sid + (9*(chi)+5)*param.cl_stride]); \
  float4 C6 = short42float4((clover)[sid + (9*(chi)+6)*param.cl_stride]); \
  float4 C7 = short42float4((clover)[sid + (9*(chi)+7)*param.cl_stride]); \
  float4 C8 = short42float4((clover)[sid + (9*(chi)+8)*param.cl_stride]); \
  float K = cloverNorm[sid + (chi)*param.cl_stride]; \
  C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
  C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
  C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
  C3.x *= K; C3.y *= K; C3.z *= K; C3.w *= K; \
  C4.x *= K; C4.y *= K; C4.z *= K; C4.w *= K; \
  C5.x *= K; C5.y *= K; C5.z *= K; C5.w *= K; \
  C6.x *= K; C6.y *= K; C6.z *= K; C6.w *= K; \
  C7.x *= K; C7.y *= K; C7.z *= K; C7.w *= K; \
  C8.x *= K; C8.y *= K; C8.z *= K; C8.w *= K;
74 
// READ_CLOVER_DOUBLE_TEX(clover, chi)
// Declares double2 locals C0..C17 in the expanding scope and initializes them
// with fetch_double2(clover, sid + (18*chi + k)*param.cl_stride).
// fetch_double2 is defined outside this file; `sid` and `param` must be
// visible where the macro is expanded. `chi` is parenthesized so that an
// expression argument (e.g. `a + b`) still addresses the intended element.
#define READ_CLOVER_DOUBLE_TEX(clover, chi) \
  double2 C0 = fetch_double2((clover), sid + (18*(chi)+0)*param.cl_stride); \
  double2 C1 = fetch_double2((clover), sid + (18*(chi)+1)*param.cl_stride); \
  double2 C2 = fetch_double2((clover), sid + (18*(chi)+2)*param.cl_stride); \
  double2 C3 = fetch_double2((clover), sid + (18*(chi)+3)*param.cl_stride); \
  double2 C4 = fetch_double2((clover), sid + (18*(chi)+4)*param.cl_stride); \
  double2 C5 = fetch_double2((clover), sid + (18*(chi)+5)*param.cl_stride); \
  double2 C6 = fetch_double2((clover), sid + (18*(chi)+6)*param.cl_stride); \
  double2 C7 = fetch_double2((clover), sid + (18*(chi)+7)*param.cl_stride); \
  double2 C8 = fetch_double2((clover), sid + (18*(chi)+8)*param.cl_stride); \
  double2 C9 = fetch_double2((clover), sid + (18*(chi)+9)*param.cl_stride); \
  double2 C10 = fetch_double2((clover), sid + (18*(chi)+10)*param.cl_stride); \
  double2 C11 = fetch_double2((clover), sid + (18*(chi)+11)*param.cl_stride); \
  double2 C12 = fetch_double2((clover), sid + (18*(chi)+12)*param.cl_stride); \
  double2 C13 = fetch_double2((clover), sid + (18*(chi)+13)*param.cl_stride); \
  double2 C14 = fetch_double2((clover), sid + (18*(chi)+14)*param.cl_stride); \
  double2 C15 = fetch_double2((clover), sid + (18*(chi)+15)*param.cl_stride); \
  double2 C16 = fetch_double2((clover), sid + (18*(chi)+16)*param.cl_stride); \
  double2 C17 = fetch_double2((clover), sid + (18*(chi)+17)*param.cl_stride);
94 
95 //#endif // USE_TEXTURE_OBJECTS
96 
// READ_CLOVER_SINGLE_TEX(clover, chi)
// Declares float4 locals C0..C8 in the expanding scope and initializes them
// with TEX1DFETCH(float4, clover, sid + (9*chi + k)*param.cl_stride).
// TEX1DFETCH is defined outside this file; `sid` and `param` must be visible
// where the macro is expanded. `chi` is parenthesized so that an expression
// argument (e.g. `a + b`) still addresses the intended element.
#define READ_CLOVER_SINGLE_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+0)*param.cl_stride); \
  float4 C1 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+1)*param.cl_stride); \
  float4 C2 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+2)*param.cl_stride); \
  float4 C3 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+3)*param.cl_stride); \
  float4 C4 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+4)*param.cl_stride); \
  float4 C5 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+5)*param.cl_stride); \
  float4 C6 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+6)*param.cl_stride); \
  float4 C7 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+7)*param.cl_stride); \
  float4 C8 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+8)*param.cl_stride);
107 
// READ_CLOVER_HALF_TEX(clover, chi)
// Declares float4 locals C0..C8 and a float K in the expanding scope.
// Each Ck is TEX1DFETCH(float4, clover, sid + (9*chi + k)*param.cl_stride);
// K is TEX1DFETCH(float, CLOVERTEXNORM, sid + chi*param.cl_stride) and every
// component of C0..C8 is then multiplied by K.
// The norm source is the name CLOVERTEXNORM, not a macro argument; it, `sid`
// and `param` must be visible where the macro is expanded.
// `chi` is parenthesized so that an expression argument (e.g. `a + b`) still
// addresses the intended element.
// NOTE(review): ASSN_/PACK_CLOVER_HALF_TEX read the norm from TMCLOVERTEXNORM
// instead -- confirm the difference is intentional.
#define READ_CLOVER_HALF_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+0)*param.cl_stride); \
  float4 C1 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+1)*param.cl_stride); \
  float4 C2 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+2)*param.cl_stride); \
  float4 C3 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+3)*param.cl_stride); \
  float4 C4 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+4)*param.cl_stride); \
  float4 C5 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+5)*param.cl_stride); \
  float4 C6 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+6)*param.cl_stride); \
  float4 C7 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+7)*param.cl_stride); \
  float4 C8 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+8)*param.cl_stride); \
  float K = TEX1DFETCH(float, (CLOVERTEXNORM), sid + (chi)*param.cl_stride); \
  C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
  C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
  C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
  C3.x *= K; C3.y *= K; C3.z *= K; C3.w *= K; \
  C4.x *= K; C4.y *= K; C4.z *= K; C4.w *= K; \
  C5.x *= K; C5.y *= K; C5.z *= K; C5.w *= K; \
  C6.x *= K; C6.y *= K; C6.z *= K; C6.w *= K; \
  C7.x *= K; C7.y *= K; C7.z *= K; C7.w *= K; \
  C8.x *= K; C8.y *= K; C8.z *= K; C8.w *= K;
128 
// ASSN_CLOVER_DOUBLE(clover, chi)
// Assigns to already-declared variables C0..C17 (no declarations are emitted):
// Ck = clover[sid + (18*chi + k)*param.cl_stride].
// C0..C17, `sid` and `param` must be visible where the macro is expanded.
// Both arguments are parenthesized so that an expression argument
// (e.g. `a + b`) still addresses the intended element.
#define ASSN_CLOVER_DOUBLE(clover, chi) \
  C0 = (clover)[sid + (18*(chi)+0)*param.cl_stride]; \
  C1 = (clover)[sid + (18*(chi)+1)*param.cl_stride]; \
  C2 = (clover)[sid + (18*(chi)+2)*param.cl_stride]; \
  C3 = (clover)[sid + (18*(chi)+3)*param.cl_stride]; \
  C4 = (clover)[sid + (18*(chi)+4)*param.cl_stride]; \
  C5 = (clover)[sid + (18*(chi)+5)*param.cl_stride]; \
  C6 = (clover)[sid + (18*(chi)+6)*param.cl_stride]; \
  C7 = (clover)[sid + (18*(chi)+7)*param.cl_stride]; \
  C8 = (clover)[sid + (18*(chi)+8)*param.cl_stride]; \
  C9 = (clover)[sid + (18*(chi)+9)*param.cl_stride]; \
  C10 = (clover)[sid + (18*(chi)+10)*param.cl_stride]; \
  C11 = (clover)[sid + (18*(chi)+11)*param.cl_stride]; \
  C12 = (clover)[sid + (18*(chi)+12)*param.cl_stride]; \
  C13 = (clover)[sid + (18*(chi)+13)*param.cl_stride]; \
  C14 = (clover)[sid + (18*(chi)+14)*param.cl_stride]; \
  C15 = (clover)[sid + (18*(chi)+15)*param.cl_stride]; \
  C16 = (clover)[sid + (18*(chi)+16)*param.cl_stride]; \
  C17 = (clover)[sid + (18*(chi)+17)*param.cl_stride];
148 
// ASSN_CLOVER_DOUBLE_STR(clover, chi)
// Fills already-declared variables C0..C17 (no declarations are emitted) by
// calling load_streaming_double2(Ck, &clover[sid + (18*chi + k)*param.cl_stride]).
// load_streaming_double2 is defined outside this file; C0..C17, `sid` and
// `param` must be visible where the macro is expanded. Both arguments are
// parenthesized so that an expression argument (e.g. `a + b`) still addresses
// the intended element.
#define ASSN_CLOVER_DOUBLE_STR(clover, chi) \
  load_streaming_double2(C0, &(clover)[sid + (18*(chi)+0)*param.cl_stride]); \
  load_streaming_double2(C1, &(clover)[sid + (18*(chi)+1)*param.cl_stride]); \
  load_streaming_double2(C2, &(clover)[sid + (18*(chi)+2)*param.cl_stride]); \
  load_streaming_double2(C3, &(clover)[sid + (18*(chi)+3)*param.cl_stride]); \
  load_streaming_double2(C4, &(clover)[sid + (18*(chi)+4)*param.cl_stride]); \
  load_streaming_double2(C5, &(clover)[sid + (18*(chi)+5)*param.cl_stride]); \
  load_streaming_double2(C6, &(clover)[sid + (18*(chi)+6)*param.cl_stride]); \
  load_streaming_double2(C7, &(clover)[sid + (18*(chi)+7)*param.cl_stride]); \
  load_streaming_double2(C8, &(clover)[sid + (18*(chi)+8)*param.cl_stride]); \
  load_streaming_double2(C9, &(clover)[sid + (18*(chi)+9)*param.cl_stride]); \
  load_streaming_double2(C10, &(clover)[sid + (18*(chi)+10)*param.cl_stride]); \
  load_streaming_double2(C11, &(clover)[sid + (18*(chi)+11)*param.cl_stride]); \
  load_streaming_double2(C12, &(clover)[sid + (18*(chi)+12)*param.cl_stride]); \
  load_streaming_double2(C13, &(clover)[sid + (18*(chi)+13)*param.cl_stride]); \
  load_streaming_double2(C14, &(clover)[sid + (18*(chi)+14)*param.cl_stride]); \
  load_streaming_double2(C15, &(clover)[sid + (18*(chi)+15)*param.cl_stride]); \
  load_streaming_double2(C16, &(clover)[sid + (18*(chi)+16)*param.cl_stride]); \
  load_streaming_double2(C17, &(clover)[sid + (18*(chi)+17)*param.cl_stride]);
168 
// ASSN_CLOVER_SINGLE(clover, chi)
// Assigns to already-declared variables C0..C8 (no declarations are emitted):
// Ck = clover[sid + (9*chi + k)*param.cl_stride].
// C0..C8, `sid` and `param` must be visible where the macro is expanded.
// Both arguments are parenthesized so that an expression argument
// (e.g. `a + b`) still addresses the intended element.
#define ASSN_CLOVER_SINGLE(clover, chi) \
  C0 = (clover)[sid + (9*(chi)+0)*param.cl_stride]; \
  C1 = (clover)[sid + (9*(chi)+1)*param.cl_stride]; \
  C2 = (clover)[sid + (9*(chi)+2)*param.cl_stride]; \
  C3 = (clover)[sid + (9*(chi)+3)*param.cl_stride]; \
  C4 = (clover)[sid + (9*(chi)+4)*param.cl_stride]; \
  C5 = (clover)[sid + (9*(chi)+5)*param.cl_stride]; \
  C6 = (clover)[sid + (9*(chi)+6)*param.cl_stride]; \
  C7 = (clover)[sid + (9*(chi)+7)*param.cl_stride]; \
  C8 = (clover)[sid + (9*(chi)+8)*param.cl_stride];
179 
// ASSN_CLOVER_HALF(clover, chi)
// Assigns to already-declared variables C0..C8 and K (no declarations are
// emitted). Each Ck is short42float4(clover[sid + (9*chi + k)*param.cl_stride]);
// K is read from cloverNorm[sid + chi*param.cl_stride] and every component of
// C0..C8 is then multiplied by K.
// C0..C8, K, `sid`, `param` and `cloverNorm` must be visible where the macro
// is expanded; short42float4 is defined outside this file.
// Both arguments are parenthesized so that an expression argument
// (e.g. `a + b`) still addresses the intended element.
#define ASSN_CLOVER_HALF(clover, chi) \
  C0 = short42float4((clover)[sid + (9*(chi)+0)*param.cl_stride]); \
  C1 = short42float4((clover)[sid + (9*(chi)+1)*param.cl_stride]); \
  C2 = short42float4((clover)[sid + (9*(chi)+2)*param.cl_stride]); \
  C3 = short42float4((clover)[sid + (9*(chi)+3)*param.cl_stride]); \
  C4 = short42float4((clover)[sid + (9*(chi)+4)*param.cl_stride]); \
  C5 = short42float4((clover)[sid + (9*(chi)+5)*param.cl_stride]); \
  C6 = short42float4((clover)[sid + (9*(chi)+6)*param.cl_stride]); \
  C7 = short42float4((clover)[sid + (9*(chi)+7)*param.cl_stride]); \
  C8 = short42float4((clover)[sid + (9*(chi)+8)*param.cl_stride]); \
  K = cloverNorm[sid + (chi)*param.cl_stride]; \
  C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
  C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
  C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
  C3.x *= K; C3.y *= K; C3.z *= K; C3.w *= K; \
  C4.x *= K; C4.y *= K; C4.z *= K; C4.w *= K; \
  C5.x *= K; C5.y *= K; C5.z *= K; C5.w *= K; \
  C6.x *= K; C6.y *= K; C6.z *= K; C6.w *= K; \
  C7.x *= K; C7.y *= K; C7.z *= K; C7.w *= K; \
  C8.x *= K; C8.y *= K; C8.z *= K; C8.w *= K;
200 
// ASSN_CLOVER_DOUBLE_TEX(clover, chi)
// Assigns to already-declared variables C0..C17 (no declarations are emitted):
// Ck = fetch_double2(clover, sid + (18*chi + k)*param.cl_stride).
// fetch_double2 is defined outside this file; C0..C17, `sid` and `param` must
// be visible where the macro is expanded. `chi` is parenthesized so that an
// expression argument (e.g. `a + b`) still addresses the intended element.
#define ASSN_CLOVER_DOUBLE_TEX(clover, chi) \
  C0 = fetch_double2((clover), sid + (18*(chi)+0)*param.cl_stride); \
  C1 = fetch_double2((clover), sid + (18*(chi)+1)*param.cl_stride); \
  C2 = fetch_double2((clover), sid + (18*(chi)+2)*param.cl_stride); \
  C3 = fetch_double2((clover), sid + (18*(chi)+3)*param.cl_stride); \
  C4 = fetch_double2((clover), sid + (18*(chi)+4)*param.cl_stride); \
  C5 = fetch_double2((clover), sid + (18*(chi)+5)*param.cl_stride); \
  C6 = fetch_double2((clover), sid + (18*(chi)+6)*param.cl_stride); \
  C7 = fetch_double2((clover), sid + (18*(chi)+7)*param.cl_stride); \
  C8 = fetch_double2((clover), sid + (18*(chi)+8)*param.cl_stride); \
  C9 = fetch_double2((clover), sid + (18*(chi)+9)*param.cl_stride); \
  C10 = fetch_double2((clover), sid + (18*(chi)+10)*param.cl_stride); \
  C11 = fetch_double2((clover), sid + (18*(chi)+11)*param.cl_stride); \
  C12 = fetch_double2((clover), sid + (18*(chi)+12)*param.cl_stride); \
  C13 = fetch_double2((clover), sid + (18*(chi)+13)*param.cl_stride); \
  C14 = fetch_double2((clover), sid + (18*(chi)+14)*param.cl_stride); \
  C15 = fetch_double2((clover), sid + (18*(chi)+15)*param.cl_stride); \
  C16 = fetch_double2((clover), sid + (18*(chi)+16)*param.cl_stride); \
  C17 = fetch_double2((clover), sid + (18*(chi)+17)*param.cl_stride);
220 
221 //#endif // USE_TEXTURE_OBJECTS
222 
// ASSN_CLOVER_SINGLE_TEX(clover, chi)
// Assigns to already-declared variables C0..C8 (no declarations are emitted):
// Ck = TEX1DFETCH(float4, clover, sid + (9*chi + k)*param.cl_stride).
// TEX1DFETCH is defined outside this file; C0..C8, `sid` and `param` must be
// visible where the macro is expanded. `chi` is parenthesized so that an
// expression argument (e.g. `a + b`) still addresses the intended element.
#define ASSN_CLOVER_SINGLE_TEX(clover, chi) \
  C0 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+0)*param.cl_stride); \
  C1 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+1)*param.cl_stride); \
  C2 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+2)*param.cl_stride); \
  C3 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+3)*param.cl_stride); \
  C4 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+4)*param.cl_stride); \
  C5 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+5)*param.cl_stride); \
  C6 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+6)*param.cl_stride); \
  C7 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+7)*param.cl_stride); \
  C8 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+8)*param.cl_stride);
233 
// ASSN_CLOVER_HALF_TEX(clover, chi)
// Assigns to already-declared variables C0..C8 and K (no declarations are
// emitted). Each Ck is TEX1DFETCH(float4, clover, sid + (9*chi + k)*param.cl_stride);
// K is TEX1DFETCH(float, TMCLOVERTEXNORM, sid + chi*param.cl_stride) and every
// component of C0..C8 is then multiplied by K.
// The norm source is the name TMCLOVERTEXNORM, not a macro argument; it,
// C0..C8, K, `sid` and `param` must be visible where the macro is expanded.
// `chi` is parenthesized so that an expression argument (e.g. `a + b`) still
// addresses the intended element.
// NOTE(review): READ_CLOVER_HALF_TEX reads its norm from CLOVERTEXNORM
// instead -- confirm the difference is intentional.
#define ASSN_CLOVER_HALF_TEX(clover, chi) \
  C0 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+0)*param.cl_stride); \
  C1 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+1)*param.cl_stride); \
  C2 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+2)*param.cl_stride); \
  C3 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+3)*param.cl_stride); \
  C4 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+4)*param.cl_stride); \
  C5 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+5)*param.cl_stride); \
  C6 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+6)*param.cl_stride); \
  C7 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+7)*param.cl_stride); \
  C8 = TEX1DFETCH(float4, (clover), sid + (9*(chi)+8)*param.cl_stride); \
  K = TEX1DFETCH(float, (TMCLOVERTEXNORM), sid + (chi)*param.cl_stride); \
  C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
  C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
  C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
  C3.x *= K; C3.y *= K; C3.z *= K; C3.w *= K; \
  C4.x *= K; C4.y *= K; C4.z *= K; C4.w *= K; \
  C5.x *= K; C5.y *= K; C5.z *= K; C5.w *= K; \
  C6.x *= K; C6.y *= K; C6.z *= K; C6.w *= K; \
  C7.x *= K; C7.y *= K; C7.z *= K; C7.w *= K; \
  C8.x *= K; C8.y *= K; C8.z *= K; C8.w *= K;
254 
// PACK_CLOVER_DOUBLE(clover, chi)
// Declares double2 locals C0..C17 in the expanding scope and initializes them
// by indexing `clover` directly: Ck = clover[idx + (18*chi + k)*param.cl_stride].
// Unlike the READ_/ASSN_ macros, the base offset is `idx` rather than `sid`;
// `idx` and `param` must be visible where the macro is expanded.
// Both arguments are parenthesized so that an expression argument
// (e.g. `a + b`) still addresses the intended element.
#define PACK_CLOVER_DOUBLE(clover, chi) \
  double2 C0 = (clover)[idx + (18*(chi)+0)*param.cl_stride]; \
  double2 C1 = (clover)[idx + (18*(chi)+1)*param.cl_stride]; \
  double2 C2 = (clover)[idx + (18*(chi)+2)*param.cl_stride]; \
  double2 C3 = (clover)[idx + (18*(chi)+3)*param.cl_stride]; \
  double2 C4 = (clover)[idx + (18*(chi)+4)*param.cl_stride]; \
  double2 C5 = (clover)[idx + (18*(chi)+5)*param.cl_stride]; \
  double2 C6 = (clover)[idx + (18*(chi)+6)*param.cl_stride]; \
  double2 C7 = (clover)[idx + (18*(chi)+7)*param.cl_stride]; \
  double2 C8 = (clover)[idx + (18*(chi)+8)*param.cl_stride]; \
  double2 C9 = (clover)[idx + (18*(chi)+9)*param.cl_stride]; \
  double2 C10 = (clover)[idx + (18*(chi)+10)*param.cl_stride]; \
  double2 C11 = (clover)[idx + (18*(chi)+11)*param.cl_stride]; \
  double2 C12 = (clover)[idx + (18*(chi)+12)*param.cl_stride]; \
  double2 C13 = (clover)[idx + (18*(chi)+13)*param.cl_stride]; \
  double2 C14 = (clover)[idx + (18*(chi)+14)*param.cl_stride]; \
  double2 C15 = (clover)[idx + (18*(chi)+15)*param.cl_stride]; \
  double2 C16 = (clover)[idx + (18*(chi)+16)*param.cl_stride]; \
  double2 C17 = (clover)[idx + (18*(chi)+17)*param.cl_stride];
274 
// PACK_CLOVER_SINGLE(clover, chi)
// Declares float4 locals C0..C8 in the expanding scope and initializes them by
// indexing `clover` directly: Ck = clover[idx + (9*chi + k)*param.cl_stride].
// The base offset is `idx` (not `sid`); `idx` and `param` must be visible
// where the macro is expanded. Both arguments are parenthesized so that an
// expression argument (e.g. `a + b`) still addresses the intended element.
#define PACK_CLOVER_SINGLE(clover, chi) \
  float4 C0 = (clover)[idx + (9*(chi)+0)*param.cl_stride]; \
  float4 C1 = (clover)[idx + (9*(chi)+1)*param.cl_stride]; \
  float4 C2 = (clover)[idx + (9*(chi)+2)*param.cl_stride]; \
  float4 C3 = (clover)[idx + (9*(chi)+3)*param.cl_stride]; \
  float4 C4 = (clover)[idx + (9*(chi)+4)*param.cl_stride]; \
  float4 C5 = (clover)[idx + (9*(chi)+5)*param.cl_stride]; \
  float4 C6 = (clover)[idx + (9*(chi)+6)*param.cl_stride]; \
  float4 C7 = (clover)[idx + (9*(chi)+7)*param.cl_stride]; \
  float4 C8 = (clover)[idx + (9*(chi)+8)*param.cl_stride];
285 
// PACK_CLOVER_HALF(clover, chi)
// Declares float4 locals C0..C8 and a float K in the expanding scope.
// Each Ck is short42float4(clover[idx + (9*chi + k)*param.cl_stride]); K is
// read from cloverNorm[idx + chi*param.cl_stride] and every component of
// C0..C8 is then multiplied by K.
// The base offset is `idx` (not `sid`); `idx`, `param` and `cloverNorm` must
// be visible where the macro is expanded, and short42float4 is defined outside
// this file. Both arguments are parenthesized so that an expression argument
// (e.g. `a + b`) still addresses the intended element.
#define PACK_CLOVER_HALF(clover, chi) \
  float4 C0 = short42float4((clover)[idx + (9*(chi)+0)*param.cl_stride]); \
  float4 C1 = short42float4((clover)[idx + (9*(chi)+1)*param.cl_stride]); \
  float4 C2 = short42float4((clover)[idx + (9*(chi)+2)*param.cl_stride]); \
  float4 C3 = short42float4((clover)[idx + (9*(chi)+3)*param.cl_stride]); \
  float4 C4 = short42float4((clover)[idx + (9*(chi)+4)*param.cl_stride]); \
  float4 C5 = short42float4((clover)[idx + (9*(chi)+5)*param.cl_stride]); \
  float4 C6 = short42float4((clover)[idx + (9*(chi)+6)*param.cl_stride]); \
  float4 C7 = short42float4((clover)[idx + (9*(chi)+7)*param.cl_stride]); \
  float4 C8 = short42float4((clover)[idx + (9*(chi)+8)*param.cl_stride]); \
  float K = cloverNorm[idx + (chi)*param.cl_stride]; \
  C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
  C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
  C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
  C3.x *= K; C3.y *= K; C3.z *= K; C3.w *= K; \
  C4.x *= K; C4.y *= K; C4.z *= K; C4.w *= K; \
  C5.x *= K; C5.y *= K; C5.z *= K; C5.w *= K; \
  C6.x *= K; C6.y *= K; C6.z *= K; C6.w *= K; \
  C7.x *= K; C7.y *= K; C7.z *= K; C7.w *= K; \
  C8.x *= K; C8.y *= K; C8.z *= K; C8.w *= K;
306 
// PACK_CLOVER_DOUBLE_TEX(clover, chi)
// Declares double2 locals C0..C17 in the expanding scope and initializes them
// with fetch_double2(clover, idx + (18*chi + k)*param.cl_stride).
// The base offset is `idx` (not `sid`); fetch_double2 is defined outside this
// file, and `idx` and `param` must be visible where the macro is expanded.
// `chi` is parenthesized so that an expression argument (e.g. `a + b`) still
// addresses the intended element.
#define PACK_CLOVER_DOUBLE_TEX(clover, chi) \
  double2 C0 = fetch_double2((clover), idx + (18*(chi)+0)*param.cl_stride); \
  double2 C1 = fetch_double2((clover), idx + (18*(chi)+1)*param.cl_stride); \
  double2 C2 = fetch_double2((clover), idx + (18*(chi)+2)*param.cl_stride); \
  double2 C3 = fetch_double2((clover), idx + (18*(chi)+3)*param.cl_stride); \
  double2 C4 = fetch_double2((clover), idx + (18*(chi)+4)*param.cl_stride); \
  double2 C5 = fetch_double2((clover), idx + (18*(chi)+5)*param.cl_stride); \
  double2 C6 = fetch_double2((clover), idx + (18*(chi)+6)*param.cl_stride); \
  double2 C7 = fetch_double2((clover), idx + (18*(chi)+7)*param.cl_stride); \
  double2 C8 = fetch_double2((clover), idx + (18*(chi)+8)*param.cl_stride); \
  double2 C9 = fetch_double2((clover), idx + (18*(chi)+9)*param.cl_stride); \
  double2 C10 = fetch_double2((clover), idx + (18*(chi)+10)*param.cl_stride); \
  double2 C11 = fetch_double2((clover), idx + (18*(chi)+11)*param.cl_stride); \
  double2 C12 = fetch_double2((clover), idx + (18*(chi)+12)*param.cl_stride); \
  double2 C13 = fetch_double2((clover), idx + (18*(chi)+13)*param.cl_stride); \
  double2 C14 = fetch_double2((clover), idx + (18*(chi)+14)*param.cl_stride); \
  double2 C15 = fetch_double2((clover), idx + (18*(chi)+15)*param.cl_stride); \
  double2 C16 = fetch_double2((clover), idx + (18*(chi)+16)*param.cl_stride); \
  double2 C17 = fetch_double2((clover), idx + (18*(chi)+17)*param.cl_stride);
326 
// PACK_CLOVER_SINGLE_TEX(clover, chi)
// Declares float4 locals C0..C8 in the expanding scope and initializes them
// with TEX1DFETCH(float4, clover, idx + (9*chi + k)*param.cl_stride).
// The base offset is `idx` (not `sid`); TEX1DFETCH is defined outside this
// file, and `idx` and `param` must be visible where the macro is expanded.
// `chi` is parenthesized so that an expression argument (e.g. `a + b`) still
// addresses the intended element.
#define PACK_CLOVER_SINGLE_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+0)*param.cl_stride); \
  float4 C1 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+1)*param.cl_stride); \
  float4 C2 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+2)*param.cl_stride); \
  float4 C3 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+3)*param.cl_stride); \
  float4 C4 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+4)*param.cl_stride); \
  float4 C5 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+5)*param.cl_stride); \
  float4 C6 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+6)*param.cl_stride); \
  float4 C7 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+7)*param.cl_stride); \
  float4 C8 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+8)*param.cl_stride);
337 
// PACK_CLOVER_HALF_TEX(clover, chi)
// Declares float4 locals C0..C8 and a float K in the expanding scope.
// Each Ck is TEX1DFETCH(float4, clover, idx + (9*chi + k)*param.cl_stride);
// K is TEX1DFETCH(float, TMCLOVERTEXNORM, idx + chi*param.cl_stride) and every
// component of C0..C8 is then multiplied by K.
// The base offset is `idx` (not `sid`). The norm source is the name
// TMCLOVERTEXNORM, not a macro argument; it, `idx` and `param` must be visible
// where the macro is expanded.
// `chi` is parenthesized so that an expression argument (e.g. `a + b`) still
// addresses the intended element.
// NOTE(review): READ_CLOVER_HALF_TEX reads its norm from CLOVERTEXNORM
// instead -- confirm the difference is intentional.
#define PACK_CLOVER_HALF_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+0)*param.cl_stride); \
  float4 C1 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+1)*param.cl_stride); \
  float4 C2 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+2)*param.cl_stride); \
  float4 C3 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+3)*param.cl_stride); \
  float4 C4 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+4)*param.cl_stride); \
  float4 C5 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+5)*param.cl_stride); \
  float4 C6 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+6)*param.cl_stride); \
  float4 C7 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+7)*param.cl_stride); \
  float4 C8 = TEX1DFETCH(float4, (clover), idx + (9*(chi)+8)*param.cl_stride); \
  float K = TEX1DFETCH(float, (TMCLOVERTEXNORM), idx + (chi)*param.cl_stride); \
  C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
  C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
  C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
  C3.x *= K; C3.y *= K; C3.z *= K; C3.w *= K; \
  C4.x *= K; C4.y *= K; C4.z *= K; C4.w *= K; \
  C5.x *= K; C5.y *= K; C5.z *= K; C5.w *= K; \
  C6.x *= K; C6.y *= K; C6.z *= K; C6.w *= K; \
  C7.x *= K; C7.y *= K; C7.z *= K; C7.w *= K; \
  C8.x *= K; C8.y *= K; C8.z *= K; C8.w *= K;
358