QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
staggered_dslash_def.h
Go to the documentation of this file.
1 // staggered_dslash_def.h - staggered Dslash kernel definitions
2 //
3 // See comments in wilson_dslash_def.h
4 
5 // initialize on first iteration
6 
7 #ifndef DD_LOOP
8 #define DD_LOOP
9 
10 #define DD_AXPY 0
11 #define DD_RECON 8
12 #define DD_PREC 0
13 #endif
14 
15 // set options for current iteration
16 
17 #if (DD_IMPROVED==1)
18 #define DD_FNAME improvedStaggeredDslash
19 #else
20 #define DD_FNAME staggeredDslash
21 #endif
22 
23 #if (DD_AXPY==0) // no axpy
24 #define DD_AXPY_F
25 #else // axpy
26 #define DD_AXPY_F Axpy
27 #define DSLASH_AXPY
28 #endif
29 
30 #if (DD_PREC == 0)
31 #define DD_PARAM_AXPY const double2 *x, const float *xNorm, const double a, const DslashParam param
32 #elif (DD_PREC == 1)
33 #define DD_PARAM_AXPY const float2 *x, const float *xNorm, const float a, const DslashParam param
34 #else
35 #define DD_PARAM_AXPY const short2 *x, const float *xNorm, const float a, const DslashParam param
36 #endif
37 
38 
39 #define READ_LONG_PHASE(phase, dir, idx, stride) // May be a problem below with redefinitions
40 
41 #if (DD_RECON==8) // reconstruct from 8 reals
42 #define DD_RECON_F 8
43 
44 #if (DD_PREC==0) // DOUBLE PRECISION
45 
46 #if (DD_IMPROVED==1)
47 #if (__COMPUTE_CAPABILITY__ >= 200)
48 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1, const double2* longGauge0, const double2* longGauge1, const double* longPhase0, const double* longPhase1
49 #else
50 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1, const double2* longGauge0, const double2* longGauge1
51 #endif
52 #else
53 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1
54 #endif
55 
56 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_GAUGE_MATRIX_8_DOUBLE
57 
58 #ifdef DIRECT_ACCESS_FAT_LINK
59 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_DOUBLE2(FAT, gauge, dir, idx, stride)
60 #else
61 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_DOUBLE2_TEX(FAT, gauge, dir, idx, stride)
62 #endif // DIRECT_ACCESS_FAT_LINK
63 #ifdef DIRECT_ACCESS_LONG_LINK
64 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_8_DOUBLE2(LONG, gauge, dir, idx, stride)
65 #else
66 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_8_DOUBLE2_TEX(LONG, gauge, dir, idx, stride)
67 #endif // DIRECT_ACCESS_LONG_LINK
68 
69 #elif (DD_PREC==1) // SINGLE PRECISION
70 #if (DD_IMPROVED==1)
71 #if (__COMPUTE_CAPABILITY__ >= 200)
72 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1, const float4* longGauge0, const float4* longGauge1, const float* longPhase0, const float* longPhase1
73 #else
74 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1, const float4* longGauge0, const float4* longGauge1
75 #endif
76 #else
77 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1
78 #endif
79 
80 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_GAUGE_MATRIX_8_SINGLE
81 
82 #ifdef DIRECT_ACCESS_FAT_LINK
83 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_FLOAT2(FAT, gauge, dir, idx, stride)
84 #else
85 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_FLOAT2_TEX(FAT, gauge, dir, idx, stride)
86 #endif // DIRECT_ACCESS_FAT_LINK
87 #ifdef DIRECT_ACCESS_LONG_LINK
88 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_8_FLOAT4(LONG, gauge, dir, idx, stride)
89 #else
90 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_8_FLOAT4_TEX(LONG, gauge, dir, idx, stride)
91 #endif // DIRECT_ACCESS_LONG_LINK
92 
93 #else // HALF PRECISION
94 #if (DD_IMPROVED==1)
95 #if (__COMPUTE_CAPABILITY__ >= 200)
96 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2* fatGauge1, const short4* longGauge0, const short4* longGauge1, const short* longPhase0, const short* longPhase1
97 #else
98 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2* fatGauge1, const short4* longGauge0, const short4* longGauge1
99 #endif
100 #else
101 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2* fatGauge1
102 #endif
103 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_GAUGE_MATRIX_8_SINGLE
104 
105 /*#ifdef DIRECT_ACCESS_FAT_LINK
106 #define READ_FAT_MATRIX(gauge, dir, idx) READ_GAUGE_MATRIX_18_SHORT2(FAT, gauge, dir, idx, fat_ga_stride); RESCALE2(FAT, fat_link_max);
107 #else*/
108 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_SHORT2_TEX(FAT, gauge, dir, idx, stride); RESCALE2(FAT, fat_link_max);
109 /*#endif // DIRECT_ACCESS_FAT_LINK
110 #ifdef DIRECT_ACCESS_LONG_LINK
111 #define READ_LONG_MATRIX(gauge, dir, idx) READ_GAUGE_MATRIX_8_SHORT4(LONG, gauge, dir, idx, long_ga_stride)
112 #else*/
113 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_8_SHORT4_TEX(LONG, gauge, dir, idx, stride)
114 //#endif // DIRECT_ACCESS_LONG_LINK
115 
116 #endif // DD_PREC
117 
118 #elif (DD_RECON == 9) // reconstruct from 9 reals
119 
120 #define DD_RECON_F 9
121 
122 #if (DD_PREC==0) // DOUBLE PRECISION
123 #if (DD_IMPROVED==1)
124 #if (__COMPUTE_CAPABILITY__ >= 200)
125 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1, const double2* longGauge0, const double2* longGauge1, const double* longPhase0, const double* longPhase1
126 #else
127 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1, const double2* longGauge0, const double2* longGauge1
128 #endif
129 #else
130 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1
131 #endif
132 
133 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_GAUGE_MATRIX_9_DOUBLE
134 
135 #ifdef DIRECT_ACCESS_FAT_LINK
136 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_DOUBLE2(FAT, gauge, dir, idx, stride)
137 #else
138 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_DOUBLE2_TEX(FAT, gauge, dir, idx, stride)
139 #endif // DIRECT_ACCESS_FAT_LINK
140 #undef READ_LONG_PHASE
141 #ifdef DIRECT_ACCESS_LONG_LINK
142 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_8_DOUBLE2(LONG, gauge, dir, idx, stride)
143 #define READ_LONG_PHASE(phase, dir, idx, stride) READ_GAUGE_PHASE_DOUBLE(PHASE, phase, dir, idx, stride);
144 #else
145 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_8_DOUBLE2_TEX(LONG, gauge, dir, idx, stride)
146 #define READ_LONG_PHASE(phase, dir, idx, stride) READ_GAUGE_PHASE_DOUBLE_TEX(PHASE, phase, dir, idx, stride);
147 #endif // DIRECT_ACCESS_LONG_LINK
148 
149 #elif (DD_PREC==1) // SINGLE PRECISION
150 #if (DD_IMPROVED==1)
151 #if (__COMPUTE_CAPABILITY__ >= 200)
152 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1, const float4* longGauge0, const float4* longGauge1, const float* longPhase0, const float* longPhase1
153 #else
154 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1, const float4* longGauge0, const float4* longGauge1
155 #endif
156 #else
157 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1
158 #endif
159 
160 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_GAUGE_MATRIX_9_SINGLE
161 
162 #ifdef DIRECT_ACCESS_FAT_LINK
163 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_FLOAT2(FAT, gauge, dir, idx, stride)
164 #else
165 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_FLOAT2_TEX(FAT, gauge, dir, idx, stride)
166 #endif // DIRECT_ACCESS_FAT_LINK
167 #undef READ_LONG_PHASE
168 #ifdef DIRECT_ACCESS_LONG_LINK
169 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_8_FLOAT4(LONG, gauge, dir, idx, stride)
170 #define READ_LONG_PHASE(phase, dir, idx, stride) READ_GAUGE_PHASE_FLOAT(PHASE, phase, dir, idx, stride);
171 #else
172 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_8_FLOAT4_TEX(LONG, gauge, dir, idx, stride)
173 #define READ_LONG_PHASE(phase, dir, idx, stride) READ_GAUGE_PHASE_FLOAT_TEX(PHASE, phase, dir, idx, stride);
174 #endif // DIRECT_ACCESS_LONG_LINK
175 
176 #else // HALF PRECISION
177 #if (DD_IMPROVED==1)
178 #if (__COMPUTE_CAPABILITY__ >= 200)
179 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2* fatGauge1, const short4* longGauge0, const short4* longGauge1, const short* longPhase0, const short* longPhase1
180 #else
181 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2* fatGauge1, const short4* longGauge0, const short4* longGauge1
182 #endif
183 #else
184 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2* fatGauge1
185 #endif
186 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_GAUGE_MATRIX_9_SINGLE
187 
188 /*#ifdef DIRECT_ACCESS_FAT_LINK
189 #define READ_FAT_MATRIX(gauge, dir, idx) READ_GAUGE_MATRIX_18_SHORT2(FAT, gauge, dir, idx, fat_ga_stride); RESCALE2(FAT, fat_link_max);
190 #else*/
191 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_SHORT2_TEX(FAT, gauge, dir, idx, stride); RESCALE2(FAT, fat_link_max);
192 /*#endif // DIRECT_ACCESS_FAT_LINK
193 #ifdef DIRECT_ACCESS_LONG_LINK
194 #define READ_LONG_MATRIX(gauge, dir, idx) READ_GAUGE_MATRIX_8_SHORT4(LONG, gauge, dir, idx, long_ga_stride)
195 #else*/
196 #undef READ_LONG_PHASE
197 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_8_SHORT4_TEX(LONG, gauge, dir, idx, stride)
198 #define READ_LONG_PHASE(phase, dir, idx, stride) READ_GAUGE_PHASE_SHORT_TEX(PHASE, phase, dir, idx, stride);
199 //#endif // DIRECT_ACCESS_LONG_LINK
200 
201 #endif // DD_PREC
202 
203 #elif (DD_RECON == 12)// reconstruct from 12 reals
204 
205 #define DD_RECON_F 12
206 
207 #if (DD_PREC==0) // DOUBLE PRECISION
208 #if (DD_IMPROVED==1)
209 #if (__COMPUTE_CAPABILITY__ >= 200)
210 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1, const double2* longGauge0, const double2* longGauge1, const double* longPhase0, const double* longPhase1
211 #else
212 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1, const double2* longGauge0, const double2* longGauge1
213 #endif
214 #else
215 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1
216 #endif
217 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_GAUGE_MATRIX_12_DOUBLE
218 
219 #ifdef DIRECT_ACCESS_FAT_LINK
220 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_DOUBLE2(FAT, gauge, dir, idx, stride)
221 #else
222 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_DOUBLE2_TEX(FAT, gauge, dir, idx, stride)
223 #endif // DIRECT_ACCESS_FAT_LINK
224 #ifdef DIRECT_ACCESS_LONG_LINK
225 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_12_DOUBLE2(LONG, gauge, dir, idx, stride)
226 #else
227 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_12_DOUBLE2_TEX(LONG, gauge, dir, idx, stride)
228 #endif // DIRECT_ACCESS_LONG_LINK
229 
230 #elif (DD_PREC==1) // SINGLE PRECISION
231 #if (DD_IMPROVED==1)
232 #if (__COMPUTE_CAPABILITY__ >= 200)
233 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1, const float4* longGauge0, const float4* longGauge1, const float* longPhase0, const float* longPhase1
234 #else
235 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1, const float4* longGauge0, const float4* longGauge1
236 #endif
237 #else
238 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1
239 #endif
240 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_GAUGE_MATRIX_12_SINGLE
241 
242 #ifdef DIRECT_ACCESS_FAT_LINK
243 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_FLOAT2(FAT, gauge, dir, idx, stride)
244 #else
245 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_FLOAT2_TEX(FAT, gauge, dir, idx, stride)
246 #endif // DIRECT_ACCESS_FAT_LINK
247 #ifdef DIRECT_ACCESS_LONG_LINK
248 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_12_FLOAT4(LONG, gauge, dir, idx, stride)
249 #else
250 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_12_FLOAT4_TEX(LONG, gauge, dir, idx, stride)
251 #endif // DIRECT_ACCESS_LONG_LINK
252 
253 #else // HALF PRECISION
254 #if (DD_IMPROVED==1)
255 #if (__COMPUTE_CAPABILITY__ >= 200)
256 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2 *fatGauge1, const short4* longGauge0, const short4* longGauge1, const short* longPhase0, const short* longPhase1
257 #else
258 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2 *fatGauge1, const short4* longGauge0, const short4* longGauge1
259 #endif
260 #else
261 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2 *fatGauge1
262 #endif
263 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_GAUGE_MATRIX_12_SINGLE
264 
265 /*#ifdef DIRECT_ACCESS_FAT_LINK
266 #define READ_FAT_MATRIX(gauge, dir, idx) READ_GAUGE_MATRIX_18_SHORT2(FAT, gauge, dir, idx, fat_ga_stride); RESCALE2(FAT, fat_link_max);
267 #else*/
268 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_SHORT2_TEX(FAT, gauge, dir, idx, stride); RESCALE2(FAT, fat_link_max);
269 /*#endif // DIRECT_ACCCESS_FAT_LINK
270 #ifdef DIRECT_ACCESS_LONG_LINK
271 #define READ_LONG_MATRIX(gauge, dir, idx) READ_GAUGE_MATRIX_12_SHORT4(LONG, gauge, dir, idx, long_ga_stride)
272 #else*/
273 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_12_SHORT4_TEX(LONG, gauge, dir, idx, stride)
274  //#endif // DIRECT_ACCCESS_LONG_LINK
275 
276 #endif // DD_PREC
277 
278 #elif (DD_RECON == 13)
279 #define DD_RECON_F 13
280 
281 #if (DD_PREC==0) // DOUBLE PRECISION
282 #if (DD_IMPROVED==1)
283 #if (__COMPUTE_CAPABILITY__ >= 200)
284 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1, const double2* longGauge0, const double2* longGauge1, const double* longPhase0, const double* longPhase1
285 #else
286 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1, const double2* longGauge0, const double2* longGauge1
287 #endif
288 #else
289 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1
290 #endif
291 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_GAUGE_MATRIX_13_DOUBLE
292 
293 #ifdef DIRECT_ACCESS_FAT_LINK
294 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_DOUBLE2(FAT, gauge, dir, idx, stride)
295 #else
296 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_DOUBLE2_TEX(FAT, gauge, dir, idx, stride)
297 #endif // DIRECT_ACCESS_FAT_LINK
298 #undef READ_LONG_PHASE
299 #ifdef DIRECT_ACCESS_LONG_LINK
300 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_12_DOUBLE2(LONG, gauge, dir, idx, stride)
301 #define READ_LONG_PHASE(phase, dir, idx, stride) READ_GAUGE_PHASE_DOUBLE(PHASE, phase, dir, idx, stride);
302 #else
303 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_12_DOUBLE2_TEX(LONG, gauge, dir, idx, stride)
304 #define READ_LONG_PHASE(phase, dir, idx, stride) READ_GAUGE_PHASE_DOUBLE_TEX(PHASE, phase, dir, idx, stride);
305 #endif // DIRECT_ACCESS_LONG_LINK
306 
307 #elif (DD_PREC==1) // SINGLE PRECISION
308 #if (DD_IMPROVED==1)
309 #if (__COMPUTE_CAPABILITY__ >= 200)
310 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1, const float4* longGauge0, const float4* longGauge1, const float* longPhase0, const float* longPhase1
311 #else
312 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1, const float4* longGauge0, const float4* longGauge1
313 #endif
314 #else
315 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1
316 #endif
317 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_GAUGE_MATRIX_13_SINGLE
318 
319 #ifdef DIRECT_ACCESS_FAT_LINK
320 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_FLOAT2(FAT, gauge, dir, idx, stride)
321 #else
322 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_FLOAT2_TEX(FAT, gauge, dir, idx, stride)
323 #endif // DIRECT_ACCESS_FAT_LINK
324 #undef READ_LONG_PHASE
325 #ifdef DIRECT_ACCESS_LONG_LINK
326 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_12_FLOAT4(LONG, gauge, dir, idx, stride)
327 #define READ_LONG_PHASE(phase, dir, idx, stride) READ_GAUGE_PHASE_FLOAT(PHASE, phase, dir, idx, stride);
328 #else
329 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_12_FLOAT4_TEX(LONG, gauge, dir, idx, stride)
330 #define READ_LONG_PHASE(phase, dir, idx, stride) READ_GAUGE_PHASE_FLOAT_TEX(PHASE, phase, dir, idx, stride);
331 #endif // DIRECT_ACCESS_LONG_LINK
332 
333 #else // HALF PRECISION
334 #if (DD_IMPROVED==1)
335 #if (__COMPUTE_CAPABILITY__ >= 200)
336 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2 *fatGauge1, const short4* longGauge0, const short4* longGauge1, const short* longPhase0, const short* longPhase1
337 #else
338 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2 *fatGauge1, const short4* longGauge0, const short4* longGauge1
339 #endif
340 #else
341 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2 *fatGauge1
342 #endif
343 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_GAUGE_MATRIX_13_SINGLE
344 
345 /*#ifdef DIRECT_ACCESS_FAT_LINK
346 #define READ_FAT_MATRIX(gauge, dir, idx) READ_GAUGE_MATRIX_18_SHORT2(FAT, gauge, dir, idx, fat_ga_stride); RESCALE2(FAT, fat_link_max);
347 #else*/
348 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_SHORT2_TEX(FAT, gauge, dir, idx, stride); RESCALE2(FAT, fat_link_max);
349 /*#endif // DIRECT_ACCCESS_FAT_LINK
350 #ifdef DIRECT_ACCESS_LONG_LINK
351 #define READ_LONG_MATRIX(gauge, dir, idx) READ_GAUGE_MATRIX_12_SHORT4(LONG, gauge, dir, idx, long_ga_stride)
352 #else*/
353 #undef READ_LONG_PHASE
354 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_12_SHORT4_TEX(LONG, gauge, dir, idx, stride)
355 #define READ_LONG_PHASE(phase, dir, idx, stride) READ_GAUGE_PHASE_SHORT_TEX(PHASE, phase, dir, idx, stride);
356  //#endif // DIRECT_ACCCESS_LONG_LINK
357 
358 #endif // DD_PREC
359 
360 #else //18 reconstruct
361 #define DD_RECON_F 18
362 #define RECONSTRUCT_GAUGE_MATRIX(dir, gauge, idx, sign)
363 
364 #if (DD_PREC==0) // DOUBLE PRECISION
365 #if (DD_IMPROVED==1)
366 #if (__COMPUTE_CAPABILITY__ >= 200)
367 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1, const double2* longGauge0, const double2* longGauge1, const double* longPhase0, const double* longPhase1
368 #else
369 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1, const double2* longGauge0, const double2* longGauge1
370 #endif
371 #else
372 #define DD_PARAM_GAUGE const double2 *fatGauge0, const double2 *fatGauge1
373 #endif
374 #ifdef DIRECT_ACCESS_FAT_LINK
375 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_DOUBLE2(FAT, gauge, dir, idx, stride)
376 #else
377 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_DOUBLE2_TEX(FAT, gauge, dir, idx, stride)
378 #endif // DIRECT_ACCCESS_FAT_LINK
379 #ifdef DIRECT_ACCESS_LONG_LINK
380 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_DOUBLE2(LONG, gauge, dir, idx, stride)
381 #else
382 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_DOUBLE2_TEX(LONG, gauge, dir, idx, stride)
383 #endif // DIRECT_ACCCESS_LONG_LINK
384 
385 #elif (DD_PREC==1) // SINGLE PRECISION
386 
387 #if (DD_IMPROVED==1)
388 #if (__COMPUTE_CAPABILITY__ >= 200)
389 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1, const float4* longGauge0, const float4* longGauge1, const float* longPhase0, const float* longPhase1
390 #else
391 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1, const float4* longGauge0, const float4* longGauge1
392 #endif
393 #else
394 #define DD_PARAM_GAUGE const float2 *fatGauge0, const float2 *fatGauge1
395 #endif
396 
397 #ifdef DIRECT_ACCESS_FAT_LINK
398 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_FLOAT2(FAT, gauge, dir, idx, stride)
399 #else
400 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_FLOAT2_TEX(FAT, gauge, dir, idx, stride)
401 #endif // DIRECT_ACCCESS_FAT_LINK
402 #ifdef DIRECT_ACCESS_LONG_LINK
403 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_FLOAT2(LONG, gauge, dir, idx, stride)
404 #else
405 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_FLOAT2_TEX(LONG, gauge, dir, idx, stride)
406 #endif // DIRECT_ACCCESS_LONG_LINK
407 
408 #else // HALF PRECISION
409 
410 #if (DD_IMPROVED==1)
411 #if (__COMPUTE_CAPABILITY__ >= 200)
412 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2 *fatGauge1, const short4* longGauge0, const short4* longGauge1, const short* longPhase0, const short* longPhase1
413 #else // Tesla doesn't support reconstruct 9/13
414 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2 *fatGauge1, const short4* longGauge0, const short4* longGauge1
415 #endif
416 #else
417 #define DD_PARAM_GAUGE const short2 *fatGauge0, const short2 *fatGauge1
418 #endif
419 
420 /*#ifdef DIRECT_ACCESS_FAT_LINK
421 #define READ_FAT_MATRIX(gauge, dir, idx) READ_GAUGE_MATRIX_18_SHORT2(FAT, gauge, dir, idx, fat_ga_stride); RESCALE2(FAT, fat_link_max);
422 #else*/
423 #define READ_FAT_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_SHORT2_TEX(FAT, gauge, dir, idx, stride); RESCALE2(FAT, fat_link_max);
424  /*#endif // DIRECT_ACCESS_FAT_LINK
425 #ifdef DIRECT_ACCESS_LONG_LINK
426 #define READ_LONG_MATRIX(gauge, dir, idx) READ_GAUGE_MATRIX_18_SHORT2(LONG, gauge, dir, idx, long_ga_stride)
427 #else*/
428 #define READ_LONG_MATRIX(gauge, dir, idx, stride) READ_GAUGE_MATRIX_18_SHORT2_TEX(LONG, gauge, dir, idx, stride)
429  //#endif // DIRECT_ACCCESS_LONG_LINK
430 
431 #endif // DD_PREC
432 
433 #endif // DD_RECON
434 
435 #if (DD_PREC==0) // double-precision fields
436 
437 // gauge field
438 #define DD_PREC_F D
439 #if (defined DIRECT_ACCESS_FAT_LINK) || (defined FERMI_NO_DBLE_TEX)
440 #define FATLINK0TEX fatGauge0
441 #define FATLINK1TEX fatGauge1
442 #else
443 #ifdef USE_TEXTURE_OBJECTS
444 #define FATLINK0TEX param.gauge0Tex
445 #define FATLINK1TEX param.gauge1Tex
446 #else
447 #define FATLINK0TEX fatGauge0TexDouble
448 #define FATLINK1TEX fatGauge1TexDouble
449 #endif // USE_TEXTURE_OBJECTS
450 #endif
451 
452 #if (defined DIRECT_ACCESS_LONG_LINK) || (defined FERMI_NO_DBLE_TEX)
453 #define LONGLINK0TEX longGauge0
454 #define LONGLINK1TEX longGauge1
455 #define LONGPHASE0TEX longPhase0
456 #define LONGPHASE1TEX longPhase1
457 #else
458 #ifdef USE_TEXTURE_OBJECTS
459 #define LONGLINK0TEX param.longGauge0Tex
460 #define LONGLINK1TEX param.longGauge1Tex
461 #define LONGPHASE0TEX param.longPhase0Tex
462 #define LONGPHASE1TEX param.longPhase1Tex
463 #else
464 #define LONGLINK0TEX longGauge0TexDouble
465 #define LONGLINK1TEX longGauge1TexDouble
466 #define LONGPHASE0TEX longPhase0TexDouble
467 #define LONGPHASE1TEX longPhase1TexDouble
468 #endif // USE_TEXTURE_OBJECTS
469 #endif
470 
471 #define GAUGE_DOUBLE
472 
473 // spinor fields
474 #define DD_PARAM_OUT double2* out, float *null1
475 #define DD_PARAM_IN const double2* in, const float *null4
476 #if (defined DIRECT_ACCESS_SPINOR) || (defined FERMI_NO_DBLE_TEX)
477 #define SPINORTEX in
478 #define READ_1ST_NBR_SPINOR READ_1ST_NBR_SPINOR_DOUBLE
479 #define READ_3RD_NBR_SPINOR READ_KS_NBR_SPINOR_DOUBLE
480 #else
481 #ifdef USE_TEXTURE_OBJECTS
482 #define SPINORTEX param.inTex
483 #else
484 #define SPINORTEX spinorTexDouble
485 #endif // USE_TEXTURE_OBJECTS
486 #define READ_1ST_NBR_SPINOR READ_1ST_NBR_SPINOR_DOUBLE_TEX
487 #define READ_3RD_NBR_SPINOR READ_KS_NBR_SPINOR_DOUBLE_TEX
488 #endif
489 #if (defined DIRECT_ACCESS_INTER) || (defined FERMI_NO_DBLE_TEX)
490 #define READ_AND_SUM_SPINOR READ_AND_SUM_ST_SPINOR
491 #define INTERTEX out
492 #else
493 #define READ_AND_SUM_SPINOR READ_AND_SUM_ST_SPINOR_DOUBLE_TEX
494 #ifdef USE_TEXTURE_OBJECTS
495 #define INTERTEX param.outTex
496 #else
497 #define INTERTEX interTexDouble
498 #endif
499 #endif
500 #define WRITE_SPINOR WRITE_ST_SPINOR_DOUBLE2
501 #define SPINOR_DOUBLE
502 #if (DD_AXPY==1)
503 #if (defined DIRECT_ACCESS_ACCUM) || (defined FERMI_NO_DBLE_TEX)
504 #define ACCUMTEX x
505 #define READ_ACCUM READ_ST_ACCUM_DOUBLE
506 #else
507 #ifdef USE_TEXTURE_OBJECTS
508 #define ACCUMTEX param.xTex
509 #else
510 #define ACCUMTEX accumTexDouble
511 #endif // USE_TEXTURE_OBJECTS
512 #define READ_ACCUM READ_ST_ACCUM_DOUBLE_TEX
513 #endif
514 #endif // DD_AXPY
515 
516 
517 #elif (DD_PREC==1) // single-precision fields
518 
519 // gauge fields
520 #define DD_PREC_F S
521 
522 #ifndef DIRECT_ACCESS_FAT_LINK
523 #ifdef USE_TEXTURE_OBJECTS
524 #define FATLINK0TEX param.gauge0Tex
525 #define FATLINK1TEX param.gauge1Tex
526 #else
527 #define FATLINK0TEX fatGauge0TexSingle
528 #define FATLINK1TEX fatGauge1TexSingle
529 #endif
530 #else
531 #define FATLINK0TEX fatGauge0
532 #define FATLINK1TEX fatGauge1
533 #endif
534 
535 #ifndef DIRECT_ACCESS_LONG_LINK //longlink access
536 #ifdef USE_TEXTURE_OBJECTS
537 #define LONGLINK0TEX param.longGauge0Tex
538 #define LONGLINK1TEX param.longGauge1Tex
539 #define LONGPHASE0TEX param.longPhase0Tex
540 #define LONGPHASE1TEX param.longPhase1Tex
541 #else
542 #if (DD_RECON ==18)
543 #define LONGLINK0TEX longGauge0TexSingle_norecon
544 #define LONGLINK1TEX longGauge1TexSingle_norecon
545 #else
546 #define LONGLINK0TEX longGauge0TexSingle
547 #define LONGLINK1TEX longGauge1TexSingle
548 #define LONGPHASE0TEX longPhase0TexSingle
549 #define LONGPHASE1TEX longPhase1TexSingle
550 #endif
551 #endif // USE_TEXTURE_OBJECTS
552 #else
553 #define LONGLINK0TEX longGauge0
554 #define LONGLINK1TEX longGauge1
555 #define LONGPHASE0TEX longPhase0
556 #define LONGPHASE1TEX longPhase1
557 #endif
558 
559 // spinor fields
560 #define DD_PARAM_OUT float2* out, float *null1
561 #define DD_PARAM_IN const float2* in, const float *null4
562 #ifndef DIRECT_ACCESS_SPINOR
563 #ifdef USE_TEXTURE_OBJECTS
564 #define SPINORTEX param.inTex
565 #else
566 #define SPINORTEX spinorTexSingle2
567 #endif // USE_TEXTURE_OBJECTS
568 #define READ_1ST_NBR_SPINOR READ_1ST_NBR_SPINOR_SINGLE_TEX
569 #define READ_3RD_NBR_SPINOR READ_KS_NBR_SPINOR_SINGLE_TEX
570 #else
571 #define SPINORTEX in
572 #define READ_1ST_NBR_SPINOR READ_1ST_NBR_SPINOR_SINGLE
573 #define READ_3RD_NBR_SPINOR READ_KS_NBR_SPINOR_SINGLE
574 #endif
575 #if (defined DIRECT_ACCESS_INTER)
576 #define READ_AND_SUM_SPINOR READ_AND_SUM_ST_SPINOR
577 #define INTERTEX out
578 #else
579 #define READ_AND_SUM_SPINOR READ_AND_SUM_ST_SPINOR_SINGLE_TEX
580 #ifdef USE_TEXTURE_OBJECTS
581 #define INTERTEX param.outTex
582 #else
583 #define INTERTEX interTexSingle2
584 #endif // USE_TEXTURE_OBJECTS
585 #endif
586 #define WRITE_SPINOR WRITE_ST_SPINOR_FLOAT2
587 #if (DD_AXPY==1)
588 #if (defined DIRECT_ACCESS_ACCUM)
589 #define ACCUMTEX x
590 #define READ_ACCUM READ_ST_ACCUM_SINGLE
591 #else
592 #ifdef USE_TEXTURE_OBJECTS
593 #define ACCUMTEX param.xTex
594 #else
595 #define ACCUMTEX accumTexSingle2
596 #endif // USE_TEXTURE_OBJECTS
597 #define READ_ACCUM READ_ST_ACCUM_SINGLE_TEX
598 #endif
599 #endif // DD_AXPY
600 
601 
602 #else // half-precision fields
603 
604 // all reads done through texture cache regardless
605 
606 // gauge fields
607 #define DD_PREC_F H
608 #ifdef USE_TEXTURE_OBJECTS
609 #define FATLINK0TEX param.gauge0Tex
610 #define FATLINK1TEX param.gauge1Tex
611 #define LONGLINK0TEX param.longGauge0Tex
612 #define LONGLINK1TEX param.longGauge1Tex
613 #define LONGPHASE0TEX param.longPhase0Tex
614 #define LONGPHASE1TEX param.longPhase1Tex
615 #else
616 #define FATLINK0TEX fatGauge0TexHalf
617 #define FATLINK1TEX fatGauge1TexHalf
618 #if (DD_RECON ==18)
619 #define LONGLINK0TEX longGauge0TexHalf_norecon
620 #define LONGLINK1TEX longGauge1TexHalf_norecon
621 #else
622 #define LONGLINK0TEX longGauge0TexHalf
623 #define LONGLINK1TEX longGauge1TexHalf
624 #define LONGPHASE0TEX longPhase0TexHalf
625 #define LONGPHASE1TEX longPhase1TexHalf
626 #endif
627 #endif // USE_TEXTURE_OBJECTS
628 
629 #define READ_1ST_NBR_SPINOR READ_1ST_NBR_SPINOR_HALF_TEX
630 #define READ_3RD_NBR_SPINOR READ_KS_NBR_SPINOR_HALF_TEX
631 #ifdef USE_TEXTURE_OBJECTS
632 #define SPINORTEX param.inTex
633 #else
634 #define SPINORTEX spinorTexHalf2
635 #endif // USE_TEXTURE_OBJECTS
636 #define DD_PARAM_OUT short2* out, float *outNorm
637 #define DD_PARAM_IN const short2* in, const float *inNorm
638 #if (defined DIRECT_ACCESS_INTER)
639 #define READ_AND_SUM_SPINOR READ_AND_SUM_ST_SPINOR_HALF
640 #define INTERTEX out
641 #else
642 #define READ_AND_SUM_SPINOR READ_AND_SUM_ST_SPINOR_HALF_TEX
643 #ifdef USE_TEXTURE_OBJECTS
644 #define INTERTEX param.outTex
645 #else
646 #define INTERTEX interTexHalf2
647 #endif // USE_TEXTURE_OBJECTS
648 #endif
649 #define WRITE_SPINOR WRITE_ST_SPINOR_SHORT2
650 #if (DD_AXPY==1)
651 #ifdef USE_TEXTURE_OBJECTS
652 #define ACCUMTEX param.xTex
653 #else
654 #define ACCUMTEX accumTexHalf2
655 #endif // USE_TEXTURE_OBJECTS
656 #define READ_ACCUM READ_ST_ACCUM_HALF_TEX
657 #endif // DD_AXPY
658 
659 #endif
660 
661 #ifdef GPU_STAGGERED_DIRAC
662 
663 // only build double precision if supported
664 #if !(__COMPUTE_CAPABILITY__ < 130 && DD_PREC == 0)
665 
666 #define DD_CONCAT(n,r,x) n ## r ## x ## Kernel
667 #define DD_FUNC(n,r,x) DD_CONCAT(n,r,x)
668 
669 // define the kernel
670 
671 #if (DD_IMPROVED==1)
672 
673 template <KernelType kernel_type>
674 __global__ void DD_FUNC(DD_FNAME, DD_RECON_F, DD_AXPY_F)
676 #ifdef GPU_STAGGERED_DIRAC
677  #include "staggered_dslash_core.h"
678 #endif
679 }
680 
681 template <>
684 #ifdef GPU_STAGGERED_DIRAC
686 #endif
687 }
688 
689 #else // naive staggered kernel
690 
691 #undef READ_LONG_MATRIX
692 #define READ_LONG_MATRIX(gauge, dir, idx, stride)
693 
694 #undef READ_LONG_PHASE
695 #define READ_LONG_PHASE(phase, dir, idx, stride)
696 
697 template <KernelType kernel_type>
698 __global__ void DD_FUNC(DD_FNAME, DD_RECON_F, DD_AXPY_F)
700 #ifdef GPU_STAGGERED_DIRAC
701  #include "staggered_dslash_core.h"
702 #endif
703 }
704 
705 template <>
708 #ifdef GPU_STAGGERED_DIRAC
710 #endif
711 }
712 
713 #endif
714 
715 #endif // !(__COMPUTE_CAPABILITY__ < 130 && DD_PREC == 0)
716 
717 #endif // ! GPU_STAGGERED_DIRAC
718 
719 // clean up
720 
721 #undef DD_PREC_F
722 #undef DD_RECON_F
723 #undef DD_AXPY_F
724 #undef DD_PARAM_OUT
725 #undef DD_PARAM_GAUGE
726 #undef DD_PARAM_IN
727 #undef DD_PARAM_AXPY
728 #undef DD_FNAME
729 #undef DD_CONCAT
730 #undef DD_FUNC
731 
732 #undef DSLASH_AXPY
733 #undef READ_GAUGE_MATRIX
734 #undef RECONSTRUCT_GAUGE_MATRIX
735 #undef FATLINK0TEX
736 #undef FATLINK1TEX
737 #undef LONGLINK0TEX
738 #undef LONGLINK1TEX
739 #undef LONGPHASE0TEX
740 #undef LONGPHASE1TEX
741 #undef SPINORTEX
742 #undef WRITE_SPINOR
743 #undef READ_AND_SUM_SPINOR
744 #undef INTERTEX
745 #undef ACCUMTEX
746 #undef READ_ACCUM
747 #undef CLOVERTEX
748 #undef READ_CLOVER
749 #undef DSLASH_CLOVER
750 #undef GAUGE_DOUBLE
751 #undef SPINOR_DOUBLE
752 #undef CLOVER_DOUBLE
753 #undef READ_FAT_MATRIX
754 #undef READ_LONG_MATRIX
755 #undef READ_LONG_PHASE
756 #undef READ_1ST_NBR_SPINOR
757 #undef READ_3RD_NBR_SPINOR
758 
759 
760 // prepare next set of options, or clean up after final iteration
761 
762 #if (DD_AXPY==0)
763 #undef DD_AXPY
764 #define DD_AXPY 1
765 #else
766 #undef DD_AXPY
767 #define DD_AXPY 0
768 
769 #if (DD_RECON==8)
770 #undef DD_RECON
771 #define DD_RECON 9
772 #elif (DD_RECON==9)
773 #undef DD_RECON
774 #define DD_RECON 12
775 #elif (DD_RECON==12)
776 #undef DD_RECON
777 #define DD_RECON 13
778 #elif (DD_RECON==13)
779 #undef DD_RECON
780 #define DD_RECON 18
781 #else
782 #undef DD_RECON
783 
784 #define DD_RECON 8
785 
786 #if (DD_PREC==0)
787 #undef DD_PREC
788 #define DD_PREC 1
789 #elif (DD_PREC==1)
790 #undef DD_PREC
791 #define DD_PREC 2
792 #else
793 #undef DD_PREC
794 #define DD_PREC 0
795 
796 #undef DD_LOOP
797 #undef DD_AXPY
798 #undef DD_RECON
799 #undef DD_PREC
800 
801 #endif // DD_PREC
802 #endif // DD_RECON
803 #endif // DD_AXPY
804 
805 #ifdef DD_LOOP
806 #include "staggered_dslash_def.h"
807 #endif
#define DD_PARAM_GAUGE
#define DD_AXPY_F
#define DD_RECON_F
#define DD_PARAM_IN
#define DD_FUNC(x)
Definition: clover_def.h:141
#define DD_FNAME
#define DD_PARAM_OUT
#define DD_PARAM_AXPY